From 0287e879ac67bf41b862b0e4583ceab4e678ea2b Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Mon, 1 May 2017 17:41:00 -0800 Subject: [PATCH 01/51] Enable grappler to propagate shapes through queues. Change: 154789133 --- .../core/common_runtime/shape_refiner.cc | 228 ++++++++++++------ .../core/common_runtime/shape_refiner.h | 8 + .../core/common_runtime/shape_refiner_test.cc | 33 +++ tensorflow/core/framework/shape_inference.h | 57 ++++- tensorflow/core/grappler/costs/BUILD | 4 +- .../core/grappler/costs/graph_properties.cc | 68 ++++++ .../grappler/costs/graph_properties_test.cc | 73 ++++++ tensorflow/core/ops/data_flow_ops.cc | 12 +- 8 files changed, 402 insertions(+), 81 deletions(-) diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc index 5135355a949..daa9e5091af 100644 --- a/tensorflow/core/common_runtime/shape_refiner.cc +++ b/tensorflow/core/common_runtime/shape_refiner.cc @@ -89,9 +89,6 @@ Status ShapeRefiner::AddNode(const Node* node) { // This needs to be filled in with real data in a second pass. std::vector input_tensors(node->num_inputs()); - std::vector real_tensors(node->num_inputs()); - std::vector attempted_materialization(node->num_inputs()); - std::vector attempted_tensor_as_shape_conversion(node->num_inputs()); std::vector input_tensors_as_shapes; // Create the inference context for this node with the existing input shapes. @@ -104,78 +101,7 @@ Status ShapeRefiner::AddNode(const Node* node) { } // Run the shape inference function, and return if there was an error. - if (op_reg_data->shape_inference_fn) { - TF_RETURN_IF_ERROR(c->Run(op_reg_data->shape_inference_fn)); - } else { - TF_RETURN_IF_ERROR(c->Run(shape_inference::UnknownShape)); - } - - // We must run the shape function repeatedly, in case users write - // shape functions where they only conditionally call input_tensor() - // based on the values of another input tensor. - bool rerun_shape_fn; - do { - // If the result of running shape inference would have benefitted - // from knowing the values of input tensors, try to materialize - // the results of those tensors, and then run the shape inference - // function again using those known tensors. - rerun_shape_fn = false; - - // NOTE: It is possible to batch the extraction and - // materialization of inputs, instead of materializing one input - // at a time like we do below. If input-at-a-time computation - // becomes a bottleneck, we could separate ExtractConstantSubgraph - // into two functions: one that returns true if an input is - // derivable from constants, and another function that extracts - // the subgraph for multiple target nodes and executes the whole - // subgraph once. - - for (int i = 0; i < c->num_inputs(); ++i) { - if (!c->requested_input_tensor(i)) { - continue; - } - // Check if we have not already filled in the requested input, - // and if not, try to materialize the tensors. - if (!attempted_materialization[i]) { - attempted_materialization[i] = true; - - Tensor result; - bool evaluated = false; - TF_RETURN_IF_ERROR( - EvaluateConstantTensorForEdge(node, i, &evaluated, &result)); - if (evaluated) { - real_tensors[i] = result; - input_tensors[i] = &real_tensors[i]; - // We have more concrete information about a shape, - // so re-run shape inference. 
- rerun_shape_fn = true; - } - } - if (c->requested_input_tensor_as_partial_shape(i) && - !attempted_tensor_as_shape_conversion[i]) { - attempted_tensor_as_shape_conversion[i] = true; - if (i >= input_tensors_as_shapes.size()) { - input_tensors_as_shapes.resize(i + 1); - } - ShapeHandle s; - TF_RETURN_IF_ERROR(ConstantPartialShape(c.get(), node, i, &s)); - input_tensors_as_shapes[i] = s; - rerun_shape_fn = true; - } - } - - if (rerun_shape_fn) { - // We have more information about the shapes on this pass, - // so re-run shape inference. - c->set_input_tensors(input_tensors); - c->set_input_tensors_as_shapes(input_tensors_as_shapes); - if (op_reg_data->shape_inference_fn) { - TF_RETURN_IF_ERROR(op_reg_data->shape_inference_fn(c.get())); - } else { - TF_RETURN_IF_ERROR(shape_inference::UnknownShape(c.get())); - } - } - } while (rerun_shape_fn); + TF_RETURN_IF_ERROR(RunShapeFn(node, op_reg_data, c.get())); // Store the resulting InferenceContext object in the map. node_to_context_[node].swap(c); @@ -211,6 +137,71 @@ Status ShapeRefiner::SetShape(const Node* node, int output_port, return Status::OK(); } +Status ShapeRefiner::UpdateNode(const Node* node, bool* refined) { + auto it = node_to_context_.find(node); + if (it == node_to_context_.end()) { + *refined = true; + return AddNode(node); + } + InferenceContext* node_context = it->second.get(); + + // Check if the shapes of the nodes in the fan-in of this node have changed, + // and if they have update the node input shapes. + for (const Edge* e : node->in_edges()) { + if (e->IsControlEdge()) continue; + + Node* input = e->src(); + auto iter = node_to_context_.find(input); + if (iter == node_to_context_.end()) { + return errors::FailedPrecondition( + "Input ", e->dst_input(), " ('", input->name(), "') for '", + node->name(), "' was not previously added to ShapeRefiner."); + } + + InferenceContext* c = iter->second.get(); + DCHECK_GE(e->dst_input(), 0); + if (node_context->set_input(e->dst_input(), c->output(e->src_output()))) { + *refined = true; + } + + // Also propagate handle shape and dtype of edges which are carrying + // resource handles. + if (e->src()->output_type(e->src_output()) == DT_RESOURCE) { + if (node_context->set_input_handle_dtype( + e->dst_input(), c->output_handle_dtype(e->src_output()))) { + *refined = true; + } + if (node_context->set_input_handle_shape( + e->dst_input(), c->output_handle_shape(e->src_output()))) { + *refined = true; + } + } + } + + if (!*refined) { + // No input shape has changed, we're done + return Status::OK(); + } + + // Get and run the shape function for this node to update the shapes of the + // outputs. 
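+  // (UpdateNode() is the incremental counterpart of AddNode(). A caller is
+  // expected to drive it from the outside, roughly as in this illustrative
+  // sketch:
+  //
+  //   bool refined = false;
+  //   TF_RETURN_IF_ERROR(refiner.UpdateNode(consumer, &refined));
+  //   if (refined) { /* revisit the consumer's fan-out */ }
+  //
+  // repeating over the fan-out of every changed node until no call sets
+  // *refined, which is exactly what the queue-based propagation loop added
+  // to graph_properties.cc later in this patch does.)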
+ const OpRegistrationData* op_reg_data; + TF_RETURN_IF_ERROR(ops_registry_->LookUp(node->type_string(), &op_reg_data)); + if (op_reg_data->shape_inference_fn == nullptr && + require_shape_inference_fns_) { + return errors::InvalidArgument( + "No shape inference function exists for op '", node->type_string(), + "', did you forget to define it?"); + } + + if (!op_reg_data->shape_inference_fn) { + // There is nothing more we can infer + return Status::OK(); + } + + return RunShapeFn(node, op_reg_data, node_context); +} + Status ShapeRefiner::EvaluateConstantTensorForEdge(const Node* node, int dst_idx, bool* evaluated, Tensor* result) { @@ -463,4 +454,91 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context, return Status::OK(); } +Status ShapeRefiner::RunShapeFn(const Node* node, + const OpRegistrationData* op_reg_data, + shape_inference::InferenceContext* c) { + // This will be filled in with real data in a second pass. + std::vector input_tensors(node->num_inputs()); + std::vector real_tensors(node->num_inputs()); + std::vector attempted_materialization(node->num_inputs()); + std::vector attempted_tensor_as_shape_conversion(node->num_inputs()); + std::vector input_tensors_as_shapes; + + // Run the shape inference function, and return if there was an error. + if (op_reg_data->shape_inference_fn) { + TF_RETURN_IF_ERROR(c->Run(op_reg_data->shape_inference_fn)); + } else { + TF_RETURN_IF_ERROR(c->Run(shape_inference::UnknownShape)); + } + + // We must run the shape function repeatedly, in case users write + // shape functions where they only conditionally call input_tensor() + // based on the values of another input tensor. + bool rerun_shape_fn; + do { + // If the result of running shape inference would have benefitted + // from knowing the values of input tensors, try to materialize + // the results of those tensors, and then run the shape inference + // function again using those known tensors. + rerun_shape_fn = false; + + // NOTE: It is possible to batch the extraction and + // materialization of inputs, instead of materializing one input + // at a time like we do below. If input-at-a-time computation + // becomes a bottleneck, we could separate ExtractConstantSubgraph + // into two functions: one that returns true if an input is + // derivable from constants, and another function that extracts + // the subgraph for multiple target nodes and executes the whole + // subgraph once. + + for (int i = 0; i < c->num_inputs(); ++i) { + if (!c->requested_input_tensor(i)) { + continue; + } + // Check if we have not already filled in the requested input, + // and if not, try to materialize the tensors. + if (!attempted_materialization[i]) { + attempted_materialization[i] = true; + + Tensor result; + bool evaluated = false; + TF_RETURN_IF_ERROR( + EvaluateConstantTensorForEdge(node, i, &evaluated, &result)); + if (evaluated) { + real_tensors[i] = result; + input_tensors[i] = &real_tensors[i]; + // We have more concrete information about a shape, + // so re-run shape inference. 
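+            // (EvaluateConstantTensorForEdge() evaluates the constant
+            // subgraph feeding input i; the materialized Tensor is kept
+            // alive in real_tensors, and attempted_materialization ensures
+            // each input is evaluated at most once across passes.)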
+          rerun_shape_fn = true;
+        }
+      }
+      if (c->requested_input_tensor_as_partial_shape(i) &&
+          !attempted_tensor_as_shape_conversion[i]) {
+        attempted_tensor_as_shape_conversion[i] = true;
+        if (i >= input_tensors_as_shapes.size()) {
+          input_tensors_as_shapes.resize(i + 1);
+        }
+        ShapeHandle s;
+        TF_RETURN_IF_ERROR(ConstantPartialShape(c, node, i, &s));
+        input_tensors_as_shapes[i] = s;
+        rerun_shape_fn = true;
+      }
+    }
+
+    if (rerun_shape_fn) {
+      // We have more information about the shapes on this pass,
+      // so re-run shape inference.
+      c->set_input_tensors(input_tensors);
+      c->set_input_tensors_as_shapes(input_tensors_as_shapes);
+      if (op_reg_data->shape_inference_fn) {
+        TF_RETURN_IF_ERROR(op_reg_data->shape_inference_fn(c));
+      } else {
+        TF_RETURN_IF_ERROR(shape_inference::UnknownShape(c));
+      }
+    }
+  } while (rerun_shape_fn);
+
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h
index 2d04ea15055..9709bd03021 100644
--- a/tensorflow/core/common_runtime/shape_refiner.h
+++ b/tensorflow/core/common_runtime/shape_refiner.h
@@ -55,6 +55,11 @@ class ShapeRefiner {
   Status SetShape(const Node* node, int output_port,
                   shape_inference::ShapeHandle shape);
 
+  // Updates the input shapes of 'node' in case the shapes of its fan-in nodes
+  // have themselves been modified (for example, during incremental shape
+  // refinement). Sets 'refined' to true if any of the node's shapes changed.
+  Status UpdateNode(const Node* node, bool* refined);
+
   // Returns the InferenceContext for 'node', if present.
   shape_inference::InferenceContext* GetContext(const Node* node) const {
     auto it = node_to_context_.find(node);
@@ -108,6 +113,9 @@ class ShapeRefiner {
       const Node* node, int dst_idx, shape_inference::ShapeHandle* result);
 
+  Status RunShapeFn(const Node* node, const OpRegistrationData* op_reg_data,
+                    shape_inference::InferenceContext* c);
+
   int32 graph_def_version_;
   const OpRegistryInterface* const ops_registry_;
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index d7e7c3b5ad5..b8df6dd4f62 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -768,5 +768,38 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_ConcatInvalidDimValue) {
             m.AddNode(result).error_message());
 }
 
+TEST(ShapeRefinerTest, IncrementalUpdates) {
+  Scope root = Scope::NewRootScope();
+  Graph* g = root.graph();
+  Node* queue;
+  TF_CHECK_OK(NodeBuilder("queue", "FIFOQueueV2")
+                  .Attr("component_types", {DT_FLOAT})
+                  .Finalize(g, &queue));
+  Node* dequeue;
+  TF_CHECK_OK(NodeBuilder("dequeue", "QueueDequeueV2")
+                  .Attr("component_types", {DT_FLOAT})
+                  .Input(queue)
+                  .Finalize(g, &dequeue));
+  ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
+  TF_ASSERT_OK(m.AddNode(queue));
+  TF_ASSERT_OK(m.AddNode(dequeue));
+
+  // At this point, the shapes of the dequeued tensor are unknown.
+  shape_inference::InferenceContext* ctx = m.GetContext(dequeue);
+  EXPECT_EQ("?", ctx->DebugString(ctx->output(0)));
+
+  // Inject a shape, and incrementally propagate it to the dequeue op.
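+  // (Output 0 of FIFOQueueV2 is a DT_RESOURCE handle, so the shape is
+  // injected as the handle's shape and dtype; UpdateNode() copies them
+  // across the resource edge, and the new QueueDequeueV2 shape function
+  // turns the handle shape into the dequeue's output shape.)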
+ ctx = m.GetContext(queue); + shape_inference::ShapeHandle shp = ctx->MakeShape({3, 7}); + ctx->set_output_handle_shape(0, shp); + ctx->set_output_handle_dtype(0, DT_FLOAT); + + bool refined = false; + TF_ASSERT_OK(m.UpdateNode(dequeue, &refined)); + EXPECT_TRUE(refined); + ctx = m.GetContext(dequeue); + EXPECT_EQ("[3,7]", ctx->DebugString(ctx->output(0))); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h index e88f6dbb042..71663027b3c 100644 --- a/tensorflow/core/framework/shape_inference.h +++ b/tensorflow/core/framework/shape_inference.h @@ -191,6 +191,17 @@ class InferenceContext { return s; } + // Set the shape of the input in position idx. This requires idx to be in the + // [0, num_inputs) range. Returns true iff the stored input shape has been + // updated with a different handle. + bool set_input(int idx, ShapeHandle shape) { + if (!inputs_[idx].SameHandle(shape)) { + inputs_[idx] = shape; + return true; + } else { + return false; + } + } ShapeHandle input(int64 idx) const { return inputs_[idx]; } Status input(StringPiece input_name, std::vector* output) const; int num_inputs() const { return inputs_.size(); } @@ -430,15 +441,53 @@ class InferenceContext { // and dtypes of tensors which can be accessed via the handle. These methods // propagate that information. Output handle dtypes and shapes are ignored if // the output tensor is not of type DT_RESOURCE. + + // Set the shape corresponding to the resource in position idx. This requires + // idx to be in the [0, num_inputs) range. Returns true iff the stored shape + // has been updated with a different handle. + bool set_input_handle_shape(int idx, ShapeHandle shape) { + if (!input_handle_shape_[idx].SameHandle(shape)) { + input_handle_shape_[idx] = shape; + return true; + } + return false; + } + + // Set the type corresponding to the resource in position idx. This requires + // idx to be in the [0, num_inputs) range. Returns true iff the stored type + // has been updated. + bool set_input_handle_dtype(int idx, DataType dtype) { + if (input_handle_dtype_[idx] != dtype) { + input_handle_dtype_[idx] = dtype; + return true; + } + return false; + } ShapeHandle input_handle_shape(int idx); DataType input_handle_dtype(int idx) const { return input_handle_dtype_[idx]; } - void set_output_handle_shape(int idx, ShapeHandle shape) { - output_handle_shape_[idx] = shape; + + // Set the shape corresponding to the resource in position idx. This requires + // idx to be in the [0, num_outputs) range. + // Returns true iff the stored shape has been updated with a different handle. + bool set_output_handle_shape(int idx, ShapeHandle shape) { + if (!output_handle_shape_[idx].SameHandle(shape)) { + output_handle_shape_[idx] = shape; + return true; + } + return false; } - void set_output_handle_dtype(int idx, DataType dtype) { - output_handle_dtype_[idx] = dtype; + + // Set the type corresponding to the resource in position idx. This requires + // idx to be in the [0, num_outputs) range. Returns true iff the stored type + // has been updated. 
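+  // (As with the setters above, the boolean result is what lets
+  // ShapeRefiner::UpdateNode() detect that something actually changed and
+  // that the consumer's shape function needs to be re-run.)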
+ bool set_output_handle_dtype(int idx, DataType dtype) { + if (output_handle_dtype_[idx] != dtype) { + output_handle_dtype_[idx] = dtype; + return true; + } + return false; } ShapeHandle output_handle_shape(int idx) const { return output_handle_shape_[idx]; diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD index d078d9af09e..e784c2df443 100644 --- a/tensorflow/core/grappler/costs/BUILD +++ b/tensorflow/core/grappler/costs/BUILD @@ -50,11 +50,13 @@ cc_test( args = ["--heap_check=local"], # The GPU tracer leaks memory deps = [ ":graph_properties", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:scope", + "//tensorflow/core:framework", "//tensorflow/core:lib_proto_parsing", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler:grappler_item_builder", "//tensorflow/core/grappler/clusters:single_machine", "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder", ], diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 06e91af2c2a..ad8f4f3f7cc 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/core/grappler/costs/graph_properties.h" +#include +#include +#include #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -31,6 +34,71 @@ Status GraphProperties::InferStatically() { Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner); TF_RETURN_IF_ERROR(s); + // List the resources and the nodes using them + std::unordered_map> resources; + for (const Node* const node : graph.nodes()) { + for (int i = 0; i < node->num_inputs(); ++i) { + if (node->input_type(i) == DataType::DT_RESOURCE) { + const Node* resource; + TF_CHECK_OK(node->input_node(i, &resource)); + resources[resource].insert(node); + } + } + } + + // If we found a resource, try to propagate the shapes through it. + bool done = true; + do { + std::queue new_shapes; + for (const auto& resource_data : resources) { + const Node* qnode = resource_data.first; + StringPiece type(qnode->type_string()); + if (!type.ends_with("QueueV2")) { + continue; + } + auto qctx = shape_refiner.GetContext(qnode); + if (!qctx) { + continue; + } + shape_inference::ShapeHandle data_shp = qctx->output_handle_shape(0); + if (qctx->FullyDefined(data_shp)) { + continue; + } + + for (const auto& node : resource_data.second) { + auto ctx = shape_refiner.GetContext(node); + if (!ctx) { + continue; + } + if (node->type_string().find("Enqueue") != std::string::npos) { + if (ctx->num_inputs() == 2) { + const DataType dtype = node->input_type(1); + shape_inference::ShapeHandle shp = ctx->input(1); + shape_inference::ShapeHandle refined; + TF_RETURN_IF_ERROR(qctx->Merge(shp, data_shp, &refined)); + if (qctx->set_output_handle_shape(0, refined) || + qctx->set_output_handle_dtype(0, dtype)) { + new_shapes.push(qnode); + } + } + } + } + } + // Propagate the shapes in the transitive fan-out of the queue. 
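+    // (Breadth-first pass: any node whose context UpdateNode() refines is
+    // re-enqueued so that sharper shapes keep flowing downstream; the
+    // enclosing do/while then revisits the queues until a fixed point is
+    // reached.)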
+ done = new_shapes.empty(); + while (!new_shapes.empty()) { + const Node* n = new_shapes.front(); + new_shapes.pop(); + for (const Node* fanout : n->out_nodes()) { + bool updated = false; + TF_RETURN_IF_ERROR(shape_refiner.UpdateNode(fanout, &updated)); + if (updated) { + new_shapes.push(fanout); + } + } + } + } while (!done); + for (const Node* const node : graph.nodes()) { VLOG(1) << " " << node->name(); auto ctx = shape_refiner.GetContext(node); diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc index 32683644fbb..1eff52ba0e6 100644 --- a/tensorflow/core/grappler/costs/graph_properties_test.cc +++ b/tensorflow/core/grappler/costs/graph_properties_test.cc @@ -14,6 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/grappler/clusters/single_machine.h" #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h" @@ -129,6 +132,76 @@ TEST_F(GraphPropertiesTest, DynamicProperties) { } } +TEST_F(GraphPropertiesTest, VarHandles) { + GrapplerItem item; + TF_CHECK_OK(NodeDefBuilder("Var", "VarHandleOp") + .Attr("dtype", DT_FLOAT) + .Attr("shape", TensorShape({3, 7})) + .Finalize(item.graph.add_node())); + + TF_CHECK_OK(NodeDefBuilder("VarRead", "ReadVariableOp") + .Attr("dtype", DT_FLOAT) + .Input("Var", 0, DT_RESOURCE) + .Finalize(item.graph.add_node())); + + GraphProperties properties(item); + TF_CHECK_OK(properties.InferStatically()); + + const auto props = properties.GetOutputProperties("VarRead"); + EXPECT_EQ(1, props.size()); + const OpInfo::TensorProperties& prop = props[0]; + EXPECT_EQ(DT_FLOAT, prop.dtype()); + EXPECT_FALSE(prop.shape().unknown_rank()); + EXPECT_EQ(2, prop.shape().dim_size()); + EXPECT_EQ(3, prop.shape().dim(0).size()); + EXPECT_EQ(7, prop.shape().dim(1).size()); +} + +TEST_F(GraphPropertiesTest, Queues) { + // Create a graph with known input shapes, and propagate the shapes through a + // couple of queues. 
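+  // (Expected propagation chain: rnd [3,7] -> Square1 -> Enqueue1 ->
+  // Queue1's handle -> Dequeue1 -> Square2 -> Enqueue2 -> Queue2's handle
+  // -> Dequeue2, so both dequeue ops should report a defined [3,7] shape.)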
+ tensorflow::Scope root = tensorflow::Scope::NewRootScope(); + + auto q1 = ops::FIFOQueue(root.WithOpName("Queue1"), {DataType::DT_FLOAT}); + Output rnd = + ops::RandomNormal(root.WithOpName("rnd"), {3, 7}, DataType::DT_FLOAT); + Output square1 = ops::Square(root.WithOpName("Square1"), rnd); + auto enqueue1 = ops::QueueEnqueue(root.WithOpName("Enqueue1"), q1, {square1}); + auto dequeue1 = + ops::QueueDequeue(root.WithOpName("Dequeue1"), q1, {DataType::DT_FLOAT}); + + auto q2 = + ops::RandomShuffleQueue(root.WithOpName("Queue2"), {DataType::DT_FLOAT}); + Output square2 = ops::Square(root.WithOpName("Square2"), dequeue1[0]); + auto enqueue2 = ops::QueueEnqueue(root.WithOpName("Enqueue2"), q2, {square2}); + auto dequeue2 = + ops::QueueDequeue(root.WithOpName("Dequeue2"), q2, {DataType::DT_FLOAT}); + + GrapplerItem item; + TF_CHECK_OK(root.ToGraphDef(&item.graph)); + + GraphProperties properties(item); + TF_CHECK_OK(properties.InferStatically()); + + const auto props1 = properties.GetOutputProperties("Dequeue1"); + EXPECT_EQ(1, props1.size()); + const OpInfo::TensorProperties& prop1 = props1[0]; + EXPECT_EQ(DT_FLOAT, prop1.dtype()); + EXPECT_FALSE(prop1.shape().unknown_rank()); + EXPECT_EQ(2, prop1.shape().dim_size()); + EXPECT_EQ(3, prop1.shape().dim(0).size()); + EXPECT_EQ(7, prop1.shape().dim(1).size()); + + const auto props2 = properties.GetOutputProperties("Dequeue2"); + EXPECT_EQ(1, props2.size()); + const OpInfo::TensorProperties& prop2 = props2[0]; + EXPECT_EQ(DT_FLOAT, prop2.dtype()); + EXPECT_FALSE(prop2.shape().unknown_rank()); + EXPECT_EQ(2, prop2.shape().dim_size()); + EXPECT_EQ(3, prop2.shape().dim(0).size()); + EXPECT_EQ(7, prop2.shape().dim(1).size()); +} + } // namespace } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc index f82e9d1eb76..f35a1bb6489 100644 --- a/tensorflow/core/ops/data_flow_ops.cc +++ b/tensorflow/core/ops/data_flow_ops.cc @@ -623,7 +623,17 @@ REGISTER_OP("QueueDequeueV2") .Output("components: component_types") .Attr("component_types: list(type) >= 1") .Attr("timeout_ms: int = -1") - .SetShapeFn(shape_inference::UnknownShape) + .SetShapeFn([](InferenceContext* c) { + if (c->num_outputs() == 1) { + c->set_output(0, c->input_handle_shape(0)); + } else { + // TODO(vrv): handle the case of multiple outputs. + for (int i = 0; i < c->num_outputs(); ++i) { + c->set_output(i, c->UnknownShape()); + } + } + return Status::OK(); + }) .Doc(R"doc( Dequeues a tuple of one or more tensors from the given queue. From 9e289ce04020f01f4d8c537f2c399fddae4be019 Mon Sep 17 00:00:00 2001 From: Jianwei Xie Date: Mon, 1 May 2017 19:29:12 -0800 Subject: [PATCH 02/51] Add whitelist support in uid of RunConfig. 
Change: 154794859
---
 .../python/learn/estimators/run_config.py     | 26 ++++++++++-
 .../learn/estimators/run_config_test.py       | 45 +++++++++++++++++++
 .../learn/python/learn/learn_runner_test.py   |  3 +-
 3 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 109c8d25e12..5a63ee7fa82 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -31,6 +31,17 @@ from tensorflow.python.estimator import run_config as core_run_config
 from tensorflow.python.training import server_lib
 
 
+_DEFAULT_UID_WHITE_LIST = [
+    'tf_random_seed',
+    'save_summary_steps',
+    'save_checkpoints_steps',
+    'save_checkpoints_secs',
+    'session_config',
+    'keep_checkpoint_max',
+    'keep_checkpoint_every_n_hours',
+]
+
+
 class Environment(object):
   # For running general distributed training.
   CLOUD = 'cloud'
@@ -312,18 +323,29 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
     return new_copy
 
   @experimental
-  def uid(self):
+  def uid(self, whitelist=None):
     """Generates a 'Unique Identifier' based on all internal fields.
 
     Caller should use the uid string to check `RunConfig` instance integrity
     in one session use, but should not rely on the implementation details,
     which is subject to change.
 
+    Args:
+      whitelist: A list of the string names of the properties uid should not
+        include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which
+        includes most properties the user is allowed to change.
+
     Returns:
       A uid string.
     """
-    # TODO(b/33295821): Allows user to specify a whitelist.
+    if whitelist is None:
+      whitelist = _DEFAULT_UID_WHITE_LIST
+
     state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')}
+
+    # Pop out the keys in whitelist.
+    for k in whitelist:
+      state.pop('_' + k, None)
+
     ordered_state = collections.OrderedDict(
         sorted(state.items(), key=lambda t: t[0]))
     # For class instance without __repr__, some special cares are required.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py b/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py index 14cef7cc43d..6d39a9ad137 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py @@ -257,6 +257,51 @@ class RunConfigTest(test.TestCase): self.assertNotEqual(expected_uid, new_config.uid()) self.assertEqual(ANOTHER_TEST_DIR, new_config.model_dir) + def test_uid_for_whitelist(self): + whitelist = ["model_dir"] + config = run_config_lib.RunConfig( + tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR) + + expected_uid = config.uid(whitelist) + self.assertEqual(expected_uid, config.uid(whitelist)) + + new_config = config.replace(model_dir=ANOTHER_TEST_DIR) + self.assertEqual(TEST_DIR, config.model_dir) + self.assertEqual(expected_uid, new_config.uid(whitelist)) + self.assertEqual(ANOTHER_TEST_DIR, new_config.model_dir) + + def test_uid_for_default_whitelist(self): + config = run_config_lib.RunConfig( + tf_random_seed=11, + save_summary_steps=12, + save_checkpoints_steps=13, + save_checkpoints_secs=14, + session_config=15, + keep_checkpoint_max=16, + keep_checkpoint_every_n_hours=17) + self.assertEqual(11, config.tf_random_seed) + self.assertEqual(12, config.save_summary_steps) + self.assertEqual(13, config.save_checkpoints_steps) + self.assertEqual(14, config.save_checkpoints_secs) + self.assertEqual(15, config.session_config) + self.assertEqual(16, config.keep_checkpoint_max) + self.assertEqual(17, config.keep_checkpoint_every_n_hours) + + new_config = run_config_lib.RunConfig( + tf_random_seed=21, + save_summary_steps=22, + save_checkpoints_steps=23, + save_checkpoints_secs=24, + session_config=25, + keep_checkpoint_max=26, + keep_checkpoint_every_n_hours=27) + self.assertEqual(config.uid(), new_config.uid()) + # model_dir is not on the default whitelist. + self.assertNotEqual(config.uid(whitelist=[]), + new_config.uid(whitelist=[])) + new_config = new_config.replace(model_dir=ANOTHER_TEST_DIR) + self.assertNotEqual(config.uid(), new_config.uid()) + def test_uid_for_deepcopy(self): tf_config = { "cluster": { diff --git a/tensorflow/contrib/learn/python/learn/learn_runner_test.py b/tensorflow/contrib/learn/python/learn/learn_runner_test.py index 6c8cde453f3..77bdcaeb7ed 100644 --- a/tensorflow/contrib/learn/python/learn/learn_runner_test.py +++ b/tensorflow/contrib/learn/python/learn/learn_runner_test.py @@ -293,8 +293,7 @@ class LearnRunnerRunWithRunConfigTest(test.TestCase): def _experiment_fn(run_config, hparams): del run_config, hparams # unused. # Explicitly use a new run_config. - new_config = run_config_lib.RunConfig( - model_dir=_MODIR_DIR, save_checkpoints_steps=123) + new_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR + "/123") return TestExperiment(config=new_config) From 883e32600ef242cb44d0702bb96f71f3140b5403 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Tue, 2 May 2017 04:08:23 -0800 Subject: [PATCH 03/51] Fix a bunch of bad links and missing docs in contrib. 
Change: 154820641 --- tensorflow/contrib/distributions/__init__.py | 133 +++++++++--------- tensorflow/contrib/losses/__init__.py | 24 +++- .../contrib/losses/python/losses/__init__.py | 120 +--------------- tensorflow/contrib/seq2seq/__init__.py | 56 ++++---- .../seq2seq/python/ops/attention_wrapper.py | 1 + .../api_guides/python/contrib.graph_editor.md | 20 +-- .../api_guides/python/contrib.linalg.md | 2 +- .../api_guides/python/contrib.losses.md | 15 +- tensorflow/docs_src/get_started/tflearn.md | 2 +- tensorflow/tools/docs/generate_lib.py | 1 - 10 files changed, 132 insertions(+), 242 deletions(-) diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py index 15e33c2c6f0..cafa477f448 100644 --- a/tensorflow/contrib/distributions/__init__.py +++ b/tensorflow/contrib/distributions/__init__.py @@ -15,74 +15,6 @@ """Classes representing statistical distributions and ops for working with them. See the @{$python/contrib.distributions} guide. - -## Distribution Object -@@ReparameterizationType -@@Distribution - -## Individual Distributions -@@Binomial -@@Bernoulli -@@BernoulliWithSigmoidProbs -@@Beta -@@BetaWithSoftplusConcentration -@@Categorical -@@Chi2 -@@Chi2WithAbsDf -@@Deterministic -@@VectorDeterministic -@@Exponential -@@ExponentialWithSoftplusRate -@@Gamma -@@GammaWithSoftplusConcentrationRate -@@Geometric -@@InverseGamma -@@InverseGammaWithSoftplusConcentrationRate -@@Laplace -@@LaplaceWithSoftplusScale -@@Logistic -@@NegativeBinomial -@@Normal -@@NormalWithSoftplusScale -@@Poisson -@@StudentT -@@StudentTWithAbsDfSoftplusScale -@@Uniform - -@@MultivariateNormalDiag -@@MultivariateNormalTriL -@@MultivariateNormalDiagPlusLowRank -@@MultivariateNormalDiagWithSoftplusScale - -@@Dirichlet -@@DirichletMultinomial -@@Multinomial -@@WishartCholesky -@@WishartFull - -@@TransformedDistribution -@@QuantizedDistribution - -@@Mixture - -@@ExpRelaxedOneHotCategorical -@@OneHotCategorical -@@RelaxedBernoulli -@@RelaxedOneHotCategorical - -## Kullback-Leibler Divergence -@@kl_divergence -@@RegisterKL - -## Helper Functions -@@matrix_diag_transform -@@normal_conjugates_known_scale_posterior -@@normal_conjugates_known_scale_predictive -@@softplus_inverse - -## Functions for statistics of samples -@@percentile - """ from __future__ import absolute_import from __future__ import division @@ -140,6 +72,71 @@ _allowed_symbols = [ 'ConditionalTransformedDistribution', 'FULLY_REPARAMETERIZED', 'NOT_REPARAMETERIZED', + 'Affine', + 'AffineLinearOperator', + 'Bijector', + 'Chain', + 'CholeskyOuterProduct', + 'Exp', + 'Identity', + 'Inline', + 'Invert', + 'PowerTransform', + 'SigmoidCentered', + 'SoftmaxCentered', + 'Softplus', + 'ReparameterizationType', + 'Distribution', + 'Binomial', + 'Bernoulli', + 'BernoulliWithSigmoidProbs', + 'Beta', + 'BetaWithSoftplusConcentration', + 'Categorical', + 'Chi2', + 'Chi2WithAbsDf', + 'Deterministic', + 'VectorDeterministic', + 'Exponential', + 'ExponentialWithSoftplusRate', + 'Gamma', + 'GammaWithSoftplusConcentrationRate', + 'Geometric', + 'InverseGamma', + 'InverseGammaWithSoftplusConcentrationRate', + 'Laplace', + 'LaplaceWithSoftplusScale', + 'Logistic', + 'NegativeBinomial', + 'Normal', + 'NormalWithSoftplusScale', + 'Poisson', + 'StudentT', + 'StudentTWithAbsDfSoftplusScale', + 'Uniform', + 'MultivariateNormalDiag', + 'MultivariateNormalTriL', + 'MultivariateNormalDiagPlusLowRank', + 'MultivariateNormalDiagWithSoftplusScale', + 'Dirichlet', + 'DirichletMultinomial', + 'Multinomial', + 'WishartCholesky', + 
'WishartFull', + 'TransformedDistribution', + 'QuantizedDistribution', + 'Mixture', + 'ExpRelaxedOneHotCategorical', + 'OneHotCategorical', + 'RelaxedBernoulli', + 'RelaxedOneHotCategorical', + 'kl_divergence', + 'RegisterKL', + 'matrix_diag_transform', + 'normal_conjugates_known_scale_posterior', + 'normal_conjugates_known_scale_predictive', + 'softplus_inverse', + 'percentile' ] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/losses/__init__.py b/tensorflow/contrib/losses/__init__.py index 9861ecc1f87..790bf61367d 100644 --- a/tensorflow/contrib/losses/__init__.py +++ b/tensorflow/contrib/losses/__init__.py @@ -22,10 +22,26 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -# pylint: disable=unused-import,wildcard-import -from tensorflow.contrib.losses.python import losses +# pylint: disable=wildcard-import from tensorflow.contrib.losses.python.losses import * -# pylint: enable=unused-import,wildcard-import +# pylint: enable=wildcard-import from tensorflow.python.util.all_util import remove_undocumented -remove_undocumented(__name__, doc_string_modules=[losses]) + +_allowed_symbols = [ + 'absolute_difference', + 'add_loss', + 'hinge_loss', + 'compute_weighted_loss', + 'cosine_distance', + 'get_losses', + 'get_regularization_losses', + 'get_total_loss', + 'log_loss', + 'mean_pairwise_squared_error', + 'mean_squared_error', + 'sigmoid_cross_entropy', + 'softmax_cross_entropy', + 'sparse_softmax_cross_entropy', +] +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/losses/python/losses/__init__.py b/tensorflow/contrib/losses/python/losses/__init__.py index 1b57f0baeef..6e9d1d4a773 100644 --- a/tensorflow/contrib/losses/python/losses/__init__.py +++ b/tensorflow/contrib/losses/python/losses/__init__.py @@ -12,127 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""## Loss operations for use in neural networks. +"""Ops for building neural network losses. -Note: By default all the losses are collected into the `GraphKeys.LOSSES` -collection. - -All of the loss functions take a pair of predictions and ground truth labels, -from which the loss is computed. It is assumed that the shape of both these -tensors is of the form [batch_size, d1, ... dN] where `batch_size` is the number -of samples in the batch and `d1` ... `dN` are the remaining dimensions. - -It is common, when training with multiple loss functions, to adjust the relative -strengths of individual losses. This is performed by rescaling the losses via -a `weight` parameter passed to the loss functions. For example, if we were -training with both log_loss and sum_of_squares_loss, and we wished that the -log_loss penalty be twice as severe as the sum_of_squares_loss, we would -implement this as: - - # Explicitely set the weight. - tf.contrib.losses.log(predictions, labels, weight=2.0) - - # Uses default weight of 1.0 - tf.contrib.losses.sum_of_squares(predictions, labels) - - # All the losses are collected into the `GraphKeys.LOSSES` collection. - losses = tf.get_collection(tf.GraphKeys.LOSSES) - -While specifying a scalar loss rescales the loss over the entire batch, -we sometimes want to rescale the loss per batch sample. 
For example, if we have -certain examples that matter more to us to get correctly, we might want to have -a higher loss that other samples whose mistakes matter less. In this case, we -can provide a weight vector of length `batch_size` which results in the loss -for each sample in the batch being scaled by the corresponding weight element. -For example, consider the case of a classification problem where we want to -maximize our accuracy but we especially interested in obtaining high accuracy -for a specific class: - - inputs, labels = LoadData(batch_size=3) - logits = MyModelPredictions(inputs) - - # Ensures that the loss for examples whose ground truth class is `3` is 5x - # higher than the loss for all other examples. - weight = tf.multiply(4, tf.cast(tf.equal(labels, 3), tf.float32)) + 1 - - onehot_labels = tf.one_hot(labels, num_classes=5) - tf.contrib.losses.softmax_cross_entropy(logits, onehot_labels, weight=weight) - -Finally, in certain cases, we may want to specify a different loss for every -single measurable value. For example, if we are performing per-pixel depth -prediction, or per-pixel denoising, a single batch sample has P values where P -is the number of pixels in the image. For many losses, the number of measurable -values matches the number of elements in the predictions and labels tensors. -For others, such as softmax_cross_entropy and cosine_distance, the -loss functions reduces the dimensions of the inputs to produces a tensor of -losses for each measurable value. For example, softmax_cross_entropy takes as -input predictions and labels of dimension [batch_size, num_classes] but the -number of measurable values is [batch_size]. Consequently, when passing a weight -tensor to specify a different loss for every measurable value, the dimension of -the tensor will depend on the loss being used. - -For a concrete example, consider the case of per-pixel depth prediction where -certain ground truth depth values are missing (due to sensor noise in the -capture process). In this case, we want to assign zero weight to losses for -these predictions. - - # 'depths' that are missing have a value of 0: - images, depths = LoadData(...) - predictions = MyModelPredictions(images) - - weight = tf.cast(tf.greater(depths, 0), tf.float32) - loss = tf.contrib.losses.sum_of_squares(predictions, depths, weight) - -Note that when using weights for the losses, the final average is computed -by rescaling the losses by the weights and then dividing by the total number of -non-zero samples. For an arbitrary set of weights, this may not necessarily -produce a weighted average. Instead, it simply and transparently rescales the -per-element losses before averaging over the number of observations. For example -if the losses computed by the loss function is an array [4, 1, 2, 3] and the -weights are an array [1, 0.5, 3, 9], then the average loss is: - - (4*1 + 1*0.5 + 2*3 + 3*9) / 4 - -However, with a single loss function and an arbitrary set of weights, one can -still easily create a loss function such that the resulting loss is a -weighted average over the individual prediction errors: - - images, labels = LoadData(...) 
- predictions = MyModelPredictions(images) - - weight = MyComplicatedWeightingFunction(labels) - weight = tf.div(weight, tf.size(weight)) - loss = tf.contrib.losses.sum_of_squares(predictions, depths, weight) - -@@absolute_difference -@@add_loss -@@hinge_loss -@@compute_weighted_loss -@@cosine_distance -@@get_losses -@@get_regularization_losses -@@get_total_loss -@@log_loss -@@mean_pairwise_squared_error -@@mean_squared_error -@@sigmoid_cross_entropy -@@softmax_cross_entropy -@@sparse_softmax_cross_entropy - -The following are deprecated in favor of `mean_pairwise_squared_error` and -`mean_squared_error`. -@@sum_of_pairwise_squares -@@sum_of_squares +See @{$python/contrib.losses}. """ - from __future__ import absolute_import from __future__ import division from __future__ import print_function -# pylint: disable=unused-import,wildcard-import +# pylint: disable=wildcard-import from tensorflow.contrib.losses.python.losses.loss_ops import * -from tensorflow.python.util.all_util import make_all -# pylint: enable=unused-import,wildcard-import - -__all__ = make_all(__name__) +# pylint: enable=wildcard-import diff --git a/tensorflow/contrib/seq2seq/__init__.py b/tensorflow/contrib/seq2seq/__init__.py index dd497197e34..dc159b93a37 100644 --- a/tensorflow/contrib/seq2seq/__init__.py +++ b/tensorflow/contrib/seq2seq/__init__.py @@ -16,36 +16,6 @@ """Ops for building neural network seq2seq decoders and losses. See the @{$python/contrib.seq2seq} guide. - -@@Decoder -@@dynamic_decode - -@@BasicDecoderOutput -@@BasicDecoder - -@@BeamSearchDecoderOutput -@@BeamSearchDecoderState -@@BeamSearchDecoder -@@FinalBeamSearchDecoderOutput - -@@Helper -@@CustomHelper -@@GreedyEmbeddingHelper -@@ScheduledEmbeddingTrainingHelper -@@ScheduledOutputTrainingHelper -@@TrainingHelper - -@@BahdanauAttention -@@LuongAttention - -@@hardmax - -@@AttentionWrapperState -@@AttentionWrapper - -@@gather_tree - -@@tile_batch """ from __future__ import absolute_import @@ -63,6 +33,30 @@ from tensorflow.contrib.seq2seq.python.ops.loss import * from tensorflow.python.util.all_util import remove_undocumented # pylint: enable=unused-import,widcard-import,line-too-long -_allowed_symbols = ["sequence_loss"] +_allowed_symbols = [ + "sequence_loss", + "Decoder", + "dynamic_decode", + "BasicDecoder", + "BasicDecoderOutput", + "BeamSearchDecoder", + "BeamSearchDecoderOutput", + "BeamSearchDecoderState", + "Helper", + "CustomHelper", + "FinalBeamSearchDecoderOutput", + "gather_tree", + "GreedyEmbeddingHelper", + "ScheduledEmbeddingTrainingHelper", + "ScheduledOutputTrainingHelper", + "TrainingHelper", + "BahdanauAttention", + "LuongAttention", + "hardmax", + "AttentionWrapperState", + "AttentionWrapper", + "AttentionMechanism", + "tile_batch"] + remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py index 9fc548aabe3..d3fc8d1d0df 100644 --- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py +++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py @@ -39,6 +39,7 @@ from tensorflow.python.util import nest __all__ = [ + "AttentionMechanism", "AttentionWrapper", "AttentionWrapperState", "LuongAttention", diff --git a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md index f6116240792..de4f1265079 100644 --- a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md +++ 
b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md @@ -137,16 +137,16 @@ which to operate must always be given explicitly. This is the reason why ## Module: reroute -* @{tf.contrib.graph_editor.reroute.swap_ts} -* @{tf.contrib.graph_editor.reroute.reroute_ts} -* @{tf.contrib.graph_editor.reroute.swap_inputs} -* @{tf.contrib.graph_editor.reroute.reroute_inputs} -* @{tf.contrib.graph_editor.reroute.swap_outputs} -* @{tf.contrib.graph_editor.reroute.reroute_outputs} -* @{tf.contrib.graph_editor.reroute.swap_ios} -* @{tf.contrib.graph_editor.reroute.reroute_ios} -* @{tf.contrib.graph_editor.reroute.remove_control_inputs} -* @{tf.contrib.graph_editor.reroute.add_control_inputs} +* @{tf.contrib.graph_editor.swap_ts} +* @{tf.contrib.graph_editor.reroute_ts} +* @{tf.contrib.graph_editor.swap_inputs} +* @{tf.contrib.graph_editor.reroute_inputs} +* @{tf.contrib.graph_editor.swap_outputs} +* @{tf.contrib.graph_editor.reroute_outputs} +* @{tf.contrib.graph_editor.swap_ios} +* @{tf.contrib.graph_editor.reroute_ios} +* @{tf.contrib.graph_editor.remove_control_inputs} +* @{tf.contrib.graph_editor.add_control_inputs} ## Module: edit diff --git a/tensorflow/docs_src/api_guides/python/contrib.linalg.md b/tensorflow/docs_src/api_guides/python/contrib.linalg.md index efc2d76ef1e..b2c7fcf6bba 100644 --- a/tensorflow/docs_src/api_guides/python/contrib.linalg.md +++ b/tensorflow/docs_src/api_guides/python/contrib.linalg.md @@ -21,7 +21,7 @@ Subclasses of `LinearOperator` provide a access to common methods on a * @{tf.contrib.linalg.LinearOperatorDiag} * @{tf.contrib.linalg.LinearOperatorIdentity} * @{tf.contrib.linalg.LinearOperatorScaledIdentity} -* @{tf.contrib.linalg.LinearOperatorMatrix} +* @{tf.contrib.linalg.LinearOperatorFullMatrix} * @{tf.contrib.linalg.LinearOperatorTriL} * @{tf.contrib.linalg.LinearOperatorUDVHUpdate} diff --git a/tensorflow/docs_src/api_guides/python/contrib.losses.md b/tensorflow/docs_src/api_guides/python/contrib.losses.md index cb93f9d549a..8c289dd5563 100644 --- a/tensorflow/docs_src/api_guides/python/contrib.losses.md +++ b/tensorflow/docs_src/api_guides/python/contrib.losses.md @@ -13,8 +13,8 @@ of samples in the batch and `d1` ... `dN` are the remaining dimensions. It is common, when training with multiple loss functions, to adjust the relative strengths of individual losses. This is performed by rescaling the losses via a `weight` parameter passed to the loss functions. For example, if we were -training with both log_loss and sum_of_squares_loss, and we wished that the -log_loss penalty be twice as severe as the sum_of_squares_loss, we would +training with both log_loss and mean_square_error, and we wished that the +log_loss penalty be twice as severe as the mean_square_error, we would implement this as: ```python @@ -22,7 +22,7 @@ implement this as: tf.contrib.losses.log(predictions, labels, weight=2.0) # Uses default weight of 1.0 - tf.contrib.losses.sum_of_squares(predictions, labels) + tf.contrib.losses.mean_square_error(predictions, labels) # All the losses are collected into the `GraphKeys.LOSSES` collection. losses = tf.get_collection(tf.GraphKeys.LOSSES) @@ -74,7 +74,7 @@ these predictions. 
predictions = MyModelPredictions(images) weight = tf.cast(tf.greater(depths, 0), tf.float32) - loss = tf.contrib.losses.sum_of_squares(predictions, depths, weight) + loss = tf.contrib.losses.mean_square_error(predictions, depths, weight) ``` Note that when using weights for the losses, the final average is computed @@ -100,7 +100,7 @@ weighted average over the individual prediction errors: weight = MyComplicatedWeightingFunction(labels) weight = tf.div(weight, tf.size(weight)) - loss = tf.contrib.losses.sum_of_squares(predictions, depths, weight) + loss = tf.contrib.losses.mean_square_error(predictions, depths, weight) ``` @{tf.contrib.losses.absolute_difference} @@ -118,9 +118,4 @@ weighted average over the individual prediction errors: @{tf.contrib.losses.softmax_cross_entropy} @{tf.contrib.losses.sparse_softmax_cross_entropy} -The following are deprecated in favor of `mean_pairwise_squared_error` and -`mean_squared_error`. -@{tf.contrib.losses.sum_of_pairwise_squares} -@{tf.contrib.losses.sum_of_squares} - diff --git a/tensorflow/docs_src/get_started/tflearn.md b/tensorflow/docs_src/get_started/tflearn.md index 079349be325..ed21969b3e9 100644 --- a/tensorflow/docs_src/get_started/tflearn.md +++ b/tensorflow/docs_src/get_started/tflearn.md @@ -278,7 +278,7 @@ Then, the code creates a `DNNClassifier` model using the following arguments: The `tf.contrib.learn` API uses input functions, which create the TensorFlow operations that generate data for the model. In this case, the data is small -enough that it can be stored in @{tf.constant TensorFlow constants}. The +enough that it can be stored in @{tf.constant$TensorFlow constants}. The following code produces the simplest possible input pipeline: ```python diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py index 1518cd53a39..d974f0f1af7 100644 --- a/tensorflow/tools/docs/generate_lib.py +++ b/tensorflow/tools/docs/generate_lib.py @@ -190,7 +190,6 @@ def _get_default_do_not_descend_map(): 'tensor_forest', 'tensorboard', 'testing', - 'training', 'tfprof', ], 'contrib.bayesflow': [ From e8eafd94de1fc90a5f4724570f5882b01e1626dc Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 2 May 2017 08:06:50 -0800 Subject: [PATCH 04/51] Don't try to refine the shapes for a node if its inference context wasn't successfully built by the AddNode() method. Change: 154838211 --- tensorflow/core/common_runtime/shape_refiner.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc index daa9e5091af..828297a1abe 100644 --- a/tensorflow/core/common_runtime/shape_refiner.cc +++ b/tensorflow/core/common_runtime/shape_refiner.cc @@ -88,7 +88,7 @@ Status ShapeRefiner::AddNode(const Node* node) { } // This needs to be filled in with real data in a second pass. - std::vector input_tensors(node->num_inputs()); + std::vector input_tensors(node->num_inputs(), nullptr); std::vector input_tensors_as_shapes; // Create the inference context for this node with the existing input shapes. @@ -145,6 +145,9 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool* refined) { } InferenceContext* node_context = it->second.get(); + // Give up if the context wasn't successfully built by the AddNode() method. + TF_RETURN_IF_ERROR(node_context->construction_status()); + // Check if the shapes of the nodes in the fan-in of this node have changed, // and if they have update the node input shapes. 
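+  // (The construction_status() check above matters here: it carries any
+  // error recorded while the context was built in AddNode(), and without
+  // the early return this loop could read input shapes that were never
+  // fully initialized.)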
for (const Edge* e : node->in_edges()) { @@ -458,7 +461,7 @@ Status ShapeRefiner::RunShapeFn(const Node* node, const OpRegistrationData* op_reg_data, shape_inference::InferenceContext* c) { // This will be filled in with real data in a second pass. - std::vector input_tensors(node->num_inputs()); + std::vector input_tensors(node->num_inputs(), nullptr); std::vector real_tensors(node->num_inputs()); std::vector attempted_materialization(node->num_inputs()); std::vector attempted_tensor_as_shape_conversion(node->num_inputs()); From aaa5600245ea328057a91752dfeaed0770136676 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 08:25:23 -0800 Subject: [PATCH 05/51] Fix issue related to empty bazel.rc file. Change: 154840138 --- .gitignore | 1 - configure | 3 +-- tools/bazel.rc | 30 ++++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 tools/bazel.rc diff --git a/.gitignore b/.gitignore index 900e5a53cbc..d8ecef1e1e7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ node_modules /.tf_configure.bazelrc /bazel-* /third_party/py/numpy/numpy_include -/tools/bazel.rc /tools/python_bin_path.sh /tools/git/gen /util/python/python_include diff --git a/configure b/configure index 75d3e160f58..dce59586ab5 100755 --- a/configure +++ b/configure @@ -356,9 +356,8 @@ if [[ "$TF_NEED_VERBS" == "1" ]]; then fi # Append CC optimization flags to bazel.rc -echo >> tools/bazel.rc for opt in $CC_OPT_FLAGS; do - echo "build:opt --cxxopt=$opt --copt=$opt" >> tools/bazel.rc + write_to_bazelrc 'build:opt --cxxopt=$opt --copt=$opt' done # Run the gen_git_source to create links where bazel can track dependencies for diff --git a/tools/bazel.rc b/tools/bazel.rc new file mode 100644 index 00000000000..e67a290cf40 --- /dev/null +++ b/tools/bazel.rc @@ -0,0 +1,30 @@ +build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain +build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true + +build:cuda_clang --crosstool_top=@local_config_cuda//crosstool:toolchain +build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true + +build:win-cuda --define=using_cuda=true --define=using_cuda_nvcc=true + +build:mkl --define=using_mkl=true + +build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain +build:sycl --define=using_sycl=true + +build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain +build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address + +build --define=use_fast_cpp_protos=true +build --define=allow_oversize_protos=true + +build --spawn_strategy=standalone +test --spawn_strategy=standalone +run --spawn_strategy=standalone + +build --genrule_strategy=standalone +test --genrule_strategy=standalone +run --genrule_strategy=standalone + +build -c opt +test -c opt +run -c opt From c0d7c1eb2f0b9bcde891dc825f9a76c948225d9f Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Tue, 2 May 2017 09:47:18 -0800 Subject: [PATCH 06/51] Remove overly precise CHECK when rendering debug output for a function. An `_Arg` node can have more than three attrs, because the runtime may (and does) add system-defined attrs (viz. "_output_shapes") that do not change the meaning of the op. 
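
As an illustration, a function-graph `_Arg` node that reaches this printer
may look like the following (NodeDef text form; the exact set of
runtime-added attrs is an assumption based on the "_output_shapes" example
above):

    name: "x"
    op: "_Arg"
    attr { key: "T" value { type: DT_FLOAT } }
    attr { key: "index" value { i: 0 } }
    attr { key: "_output_shapes" value { ... } }

The printer only reads the node's name and its type attr, so any attr count
of at least two is acceptable.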
Change: 154850526 --- tensorflow/core/framework/function.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc index c7311559241..a387d49613b 100644 --- a/tensorflow/core/framework/function.cc +++ b/tensorflow/core/framework/function.cc @@ -582,7 +582,7 @@ string Print(const GraphDef& gdef) { for (size_t i = 0; i < arg.size(); ++i) { const NodeDef* n = arg[i]; if (i > 0) strings::StrAppend(&out, ", "); - CHECK_EQ(2, n->attr_size()); + CHECK_GE(n->attr_size(), 2); strings::StrAppend(&out, n->name(), ":", get_type(*n)); } strings::StrAppend(&out, ") -> ("); From dedd3fc2ca18da4676c221600388d8a8d8642190 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 10:18:06 -0800 Subject: [PATCH 07/51] Port makefile build breakage Change: 154855106 --- .../makefile/sub_makefiles/hexagon_graph_execution/Makefile.in | 1 - tensorflow/contrib/makefile/tf_op_files.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in index ccbbfa41324..2a6f66edcb7 100644 --- a/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in +++ b/tensorflow/contrib/makefile/sub_makefiles/hexagon_graph_execution/Makefile.in @@ -47,7 +47,6 @@ GRAPH_TRANSFER_SRCS := \ tensorflow/cc/framework/scope.cc \ tensorflow/cc/framework/ops.cc \ tensorflow/cc/ops/const_op.cc \ -tensorflow/core/kernels/function_ops.cc \ tensorflow/core/kernels/hexagon/graph_transfer_utils.cc \ tensorflow/core/kernels/hexagon/graph_transferer.cc \ tensorflow/core/kernels/hexagon/hexagon_control_wrapper.cc \ diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index a6d76548f46..e4cee308a30 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -103,6 +103,7 @@ tensorflow/core/kernels/identity_op.cc tensorflow/core/kernels/gather_op.cc tensorflow/core/kernels/gather_functor.cc tensorflow/core/kernels/fused_batch_norm_op.cc +tensorflow/core/kernels/function_ops.cc tensorflow/core/kernels/fill_functor.cc tensorflow/core/kernels/fifo_queue.cc tensorflow/core/kernels/fake_quant_ops.cc From 5e727b8d2690c6740d52a161fabf169f87f3d786 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 10:23:53 -0800 Subject: [PATCH 08/51] [TF:XLA] Try to incorporate Tensorflow node structure for large HLO GraphDefs. This change assumes that a TF subgraph/op does not cross the boundary of a HLO computation and always put top-level TF subgraphs/ops under HLO computations. 
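
For example (names illustrative), an HLO instruction "add.5" whose
OpMetadata records op_name "dense/BiasAdd", inside computation "cluster_0",
is now emitted under the TF graph node name

    cluster_0/dense/BiasAdd/add.5

so TensorBoard nests the HLO node inside its originating TF op's group,
which in turn stays inside the computation's group; instructions without
metadata keep the old "cluster_0/add.5" naming.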
Change: 154855884 --- .../xla/service/hlo_tfgraph_builder.cc | 14 ++++++++---- .../xla/service/hlo_tfgraph_builder_test.cc | 22 +++++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc index fdc1c0ba2d7..da07dea123a 100644 --- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc +++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc @@ -88,12 +88,18 @@ const string& HloTfGraphBuilder::GetNodeNameForInstruction( if (ContainsKey(instruction_to_node_name_, instruction)) { return instruction_to_node_name_[instruction]; } + string node_name; // If an instruction is fused, put it in the subgraph of the fusion; // otherwise, put it in the computation subgraph. - string node_name = - instruction->IsFused() - ? GetNodeNameForInstruction(instruction->fusion_instruction()) - : instruction->parent()->name(); + if (instruction->IsFused()) { + node_name = GetNodeNameForInstruction(instruction->fusion_instruction()); + } else { + node_name = instruction->parent()->name(); + if (!instruction->metadata().op_name().empty()) { + // Always make computations contain TF ops but not the other way around. + StrAppend(&node_name, "/", instruction->metadata().op_name()); + } + } string instruction_name = instruction->name(); if (instruction->opcode() == HloOpcode::kParameter) { StrAppend(&instruction_name, ".", instruction->parameter_number()); diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc index df664080228..6041debc4ae 100644 --- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc +++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc @@ -137,6 +137,28 @@ TEST_F(HloTfGraphBuilderTest, GreaterThanOrEqualTo) { EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo"); } +TEST_F(HloTfGraphBuilderTest, IncorparateTfOpsStructure) { + auto builder = HloComputation::Builder("GE"); + auto param_1 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r0f32_, "param0")); + auto param_2 = builder.AddInstruction( + HloInstruction::CreateParameter(1, r0f32_, "param1")); + auto ge = builder.AddInstruction( + HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2)); + OpMetadata metadata; + metadata.set_op_name("x/y"); + metadata.set_op_type("Y"); + ge->set_metadata(metadata); + TF_CHECK_OK(generator_.AddComputation(*builder.Build())); + GraphDef graph_def = generator_.GetGraphDef(); + EXPECT_EQ(graph_def.node_size(), 3); + EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0"); + EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1"); + EXPECT_EQ(graph_def.node(2).input_size(), 2); + EXPECT_EQ(graph_def.node(2).name(), "GE/x/y/greater-than-or-equal-to"); + EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo"); +} + TEST_F(HloTfGraphBuilderTest, EmbeddedComputationsDiamond) { // Create computations with a diamond-shaped callgraph. 
auto negate_computation = CreateNegateComputation(); From 8304066d0215521598e1592336a2cf9cb602457c Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 2 May 2017 10:28:57 -0800 Subject: [PATCH 09/51] Added a unit test to check what happens when 2 shapes with known rank but unknown dimensions are merged Change: 154856675 --- tensorflow/core/framework/shape_inference_test.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc index c82b506e4b9..78d1fc0fc5e 100644 --- a/tensorflow/core/framework/shape_inference_test.cc +++ b/tensorflow/core/framework/shape_inference_test.cc @@ -558,6 +558,11 @@ TEST_F(ShapeInferenceTest, MergeShape) { EXPECT_TRUE(SameHandle(c.Dim(s_1_u, 0), c.Dim(out, 0))); EXPECT_TRUE(SameHandle(c.Dim(s_u_2, 1), c.Dim(out, 1))); + auto s_u1 = c.UnknownShapeOfRank(1); + auto s_u2 = c.UnknownShapeOfRank(1); + TF_EXPECT_OK(c.Merge(s_u1, s_u2, &out)); + EXPECT_TRUE(SameHandle(s_u1, out)); + // Incompatible merges give errors and set out to nullptr. out = s_unknown; EXPECT_TRUE( From 0774317eedf667d74110bb639eb4dfe46ee8cc5d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 10:31:38 -0800 Subject: [PATCH 10/51] [XLA] Refactor constant folding operations into a dedicated module Refactor constant folding operations into a dedicated module, and added a new ReplaceInstruction() API to collapse { computation->ReplaceInstruction(); changed=true}. Change: 154857025 --- tensorflow/compiler/xla/service/BUILD | 21 ++ .../xla/service/algebraic_simplifier.cc | 186 ++----------- .../xla/service/algebraic_simplifier_test.cc | 133 --------- .../xla/service/hlo_constant_folding.cc | 256 +++++++++++++++--- .../xla/service/hlo_constant_folding.h | 6 +- .../xla/service/hlo_constant_folding_test.cc | 169 ++++++++++++ 6 files changed, 423 insertions(+), 348 deletions(-) create mode 100644 tensorflow/compiler/xla/service/hlo_constant_folding_test.cc diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 05fc480936f..a4e35135d73 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1418,6 +1418,27 @@ cc_library( ], ) +cc_test( + name = "hlo_constant_folding_test", + srcs = ["hlo_constant_folding_test.cc"], + deps = [ + ":cpu_plugin", + ":hlo", + ":hlo_constant_folding", + ":hlo_matchers", + ":hlo_pass", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test_main", + ], +) + cc_library( name = "device_memory_allocator", srcs = ["device_memory_allocator.cc"], diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 124a949bac6..d6dce9745b5 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -219,12 +219,6 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { HloInstruction* operand, HloInstruction* max, HloInstruction* max_operand); - // Tries to constant fold a concatenate operation, and returns true if the - // operation has been performed. An error status is returned in case of error. 
- StatusOr TryConcatenateConstantFold( - HloInstruction* concatenate, - tensorflow::gtl::ArraySlice operands); - // A Reshape or Broadcast that feeds an element-wise operation with a unique // non-scalar operand can sink to after the operation. StatusOr TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand( @@ -236,12 +230,23 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status ReplaceWithNewInstruction( HloInstruction* old_instruction, std::unique_ptr new_instruction) { - TF_CHECK_OK(computation_->ReplaceWithNewInstruction( + TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction( old_instruction, std::move(new_instruction))); changed_ = true; return Status::OK(); } + // Replaces the existing HLO instruction old_instruction, with + // new_instruction, and marks the optimizer status as changed. + // Returns the Status representing the result of the replace operation. + Status ReplaceInstruction(HloInstruction* old_instruction, + HloInstruction* new_instruction) { + TF_RETURN_IF_ERROR( + computation_->ReplaceInstruction(old_instruction, new_instruction)); + changed_ = true; + return Status::OK(); + } + // Current HloComputation instance the AlgebraicSimplifierVisitor is // traversing. HloComputation* computation_; @@ -290,8 +295,7 @@ void AlgebraicSimplifierVisitor::ReplaceWithBitcast( auto bitcast = computation_->AddInstruction( HloInstruction::CreateUnary(instruction->shape(), HloOpcode::kBitcast, instruction->mutable_operand(0))); - TF_CHECK_OK(computation_->ReplaceInstruction(instruction, bitcast)); - changed_ = true; + TF_CHECK_OK(ReplaceInstruction(instruction, bitcast)); } bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape( @@ -299,9 +303,7 @@ bool AlgebraicSimplifierVisitor::ReplaceInstructionIfSameShape( if (!SameShape(old_instruction, new_instruction)) { return false; } - TF_CHECK_OK( - computation_->ReplaceInstruction(old_instruction, new_instruction)); - changed_ = true; + TF_CHECK_OK(ReplaceInstruction(old_instruction, new_instruction)); return true; } @@ -329,63 +331,6 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy, return Status::OK(); } -StatusOr AlgebraicSimplifierVisitor::TryConcatenateConstantFold( - HloInstruction* concatenate, - tensorflow::gtl::ArraySlice operands) { - if (operands[0]->opcode() == HloOpcode::kConstant) { - // If all the operands of a concatenate are constant, fold them into a - // single constant tensor. - // The concatenate dimension is going to be the sum of all the concatenate - // dimensions. - int64 concat_dim = concatenate->dimensions()[0]; - const Shape& reference_shape = operands[0]->shape(); - if (ShapeUtil::IsTuple(reference_shape)) { - VLOG(5) << "Tuples not currently supported by the concatenate constant" - " folder"; - return false; - } - int64 rank = ShapeUtil::Rank(reference_shape); - std::vector concat_dimensions(reference_shape.dimensions().begin(), - reference_shape.dimensions().end()); - if (concat_dim < 0) { - concat_dim += rank; - } - for (int64 i = 1; i < operands.size(); ++i) { - const Shape& operand_shape = operands[i]->shape(); - if (operands[i]->opcode() != HloOpcode::kConstant || - ShapeUtil::IsTuple(operand_shape)) { - return false; - } - // Accumulate the concat dimension from all tensors taking part to the - // operation. 
- concat_dimensions[concat_dim] += - ShapeUtil::GetDimension(operand_shape, concat_dim); - } - - auto literal = LiteralUtil::CreateFromDimensions( - reference_shape.element_type(), concat_dimensions); - std::vector source_indices(rank, 0); - std::vector dest_indices(concat_dimensions.size(), 0); - for (auto operand : operands) { - const Shape& operand_shape = operand->shape(); - Status status = LiteralUtil::Copy( - operand->literal(), source_indices, literal.get(), dest_indices, - AsInt64Slice(operand_shape.dimensions())); - if (!status.ok()) { - VLOG(1) << "Error while creating concatenated literal : " << status; - return false; - } - dest_indices[concat_dim] += - ShapeUtil::GetDimension(operand_shape, concat_dim); - } - TF_CHECK_OK(computation_->ReplaceWithNewInstruction( - concatenate, HloInstruction::CreateConstant(std::move(literal)))); - changed_ = true; - return true; - } - return false; -} - Status AlgebraicSimplifierVisitor::HandleConcatenate( HloInstruction* concatenate, tensorflow::gtl::ArraySlice operands) { @@ -394,13 +339,6 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( ReplaceInstructionIfSameShape(concatenate, operands[0]); return Status::OK(); } - // If all the concatenate operands are constant, this will get folded into a - // new constant literal. - TF_ASSIGN_OR_RETURN(bool folded, - TryConcatenateConstantFold(concatenate, operands)); - if (folded) { - return Status::OK(); - } // Filter out and remove empty operands. std::vector nonempty_operands; for (HloInstruction* operand : operands) { @@ -799,65 +737,6 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) { return Status::OK(); } -template -static std::unique_ptr ConvertIfTypesMatch( - const Literal& src_literal) { - CHECK_EQ(primitive_src_type, src_literal.shape().element_type()); - - return HloInstruction::CreateConstant( - LiteralUtil::Convert::type, - typename primitive_util::PrimitiveTypeToNative< - primitive_dest_type>::type>(src_literal)); -} - -template -static std::unique_ptr ConvertIfDestTypeMatches( - const Literal& src_literal, PrimitiveType primitive_dest_type) { - switch (primitive_dest_type) { -#define CONVERT_IF_TYPES_MATCH(type) \ - case (type): \ - return ConvertIfTypesMatch(src_literal); - CONVERT_IF_TYPES_MATCH(PRED) - CONVERT_IF_TYPES_MATCH(S8) - CONVERT_IF_TYPES_MATCH(S32) - CONVERT_IF_TYPES_MATCH(S64) - CONVERT_IF_TYPES_MATCH(U8) - CONVERT_IF_TYPES_MATCH(U32) - CONVERT_IF_TYPES_MATCH(U64) - CONVERT_IF_TYPES_MATCH(F32) - CONVERT_IF_TYPES_MATCH(F64) -#undef CONVERT_IF_TYPES_MATCH - // Other types are not yet supported. - default: - LOG(FATAL) << "Unimplemented: ConvertIfDestTypeMatches for type " - << PrimitiveType_Name(src_literal.shape().element_type()); - } -} - -static std::unique_ptr ConvertIfSrcTypeMatches( - const Literal& src_literal, PrimitiveType primitive_dest_type) { - switch (src_literal.shape().element_type()) { -#define CONVERT_IF_DEST_TYPE_MATCHES(type) \ - case (type): \ - return ConvertIfDestTypeMatches<(type)>(src_literal, primitive_dest_type); - CONVERT_IF_DEST_TYPE_MATCHES(PRED) - CONVERT_IF_DEST_TYPE_MATCHES(S8) - CONVERT_IF_DEST_TYPE_MATCHES(S32) - CONVERT_IF_DEST_TYPE_MATCHES(S64) - CONVERT_IF_DEST_TYPE_MATCHES(U8) - CONVERT_IF_DEST_TYPE_MATCHES(U32) - CONVERT_IF_DEST_TYPE_MATCHES(U64) - CONVERT_IF_DEST_TYPE_MATCHES(F32) - CONVERT_IF_DEST_TYPE_MATCHES(F64) -#undef CONVERT_IF_DEST_TYPE_MATCHES - // Other types are not yet supported. 
- default: - LOG(FATAL) << "Unimplemented: ConvertIfSrcTypeMatches for type " - << PrimitiveType_Name(src_literal.shape().element_type()); - } -} - // A conversion to the same element type as the operand is a nop and can be // removed. A conversion of a constant can be simplified by making a new // constant. @@ -866,14 +745,7 @@ Status AlgebraicSimplifierVisitor::HandleConvert(HloInstruction* convert, PrimitiveType src_type = operand->shape().element_type(); PrimitiveType dest_type = convert->shape().element_type(); if (src_type == dest_type) { - changed_ = true; - return computation_->ReplaceInstruction(convert, operand); - } - if (operand->opcode() == HloOpcode::kConstant) { - const Literal& src_literal = operand->literal(); - std::unique_ptr new_constant = - ConvertIfSrcTypeMatches(src_literal, dest_type); - return ReplaceWithNewInstruction(convert, std::move(new_constant)); + return ReplaceInstruction(convert, operand); } return Status::OK(); } @@ -1080,8 +952,7 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) { // Delete no-op reshapes, i.e. where shape = operand shape. if (SameShape(reshape, operand)) { VLOG(10) << "deleting no-op reshape"; - changed_ = true; - return computation_->ReplaceInstruction(reshape, operand); + return ReplaceInstruction(reshape, operand); } // Merge reshapes. @@ -1131,8 +1002,7 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse, }; if (std::all_of(reverse->dimensions().begin(), reverse->dimensions().end(), dim_is_one)) { - changed_ = true; - return computation_->ReplaceInstruction(reverse, operand); + return ReplaceInstruction(reverse, operand); } return Status::OK(); } @@ -1143,21 +1013,6 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice, if (ReplaceInstructionIfSameShape(slice, operand)) { return Status::OK(); } - if (operand->opcode() == HloOpcode::kConstant) { - const Shape& shape = slice->shape(); - auto literal = LiteralUtil::CreateFromDimensions( - shape.element_type(), AsInt64Slice(shape.dimensions())); - std::vector dest_indices(slice->slice_starts().size(), 0); - Status status = LiteralUtil::Copy(operand->literal(), slice->slice_starts(), - literal.get(), dest_indices, - AsInt64Slice(shape.dimensions())); - if (status.ok()) { - TF_CHECK_OK(ReplaceWithNewInstruction( - slice, HloInstruction::CreateConstant(std::move(literal)))); - } else { - VLOG(1) << "Error while creating sliced literal : " << status; - } - } return Status::OK(); } @@ -1247,8 +1102,7 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) { if (std::is_sorted(transpose->dimensions().begin(), transpose->dimensions().end())) { VLOG(10) << "deleting no-op transpose"; - changed_ = true; - return computation_->ReplaceInstruction(transpose, operand); + return ReplaceInstruction(transpose, operand); } if (HloOpcode::kTranspose == operand->opcode()) { @@ -1379,9 +1233,7 @@ Status AlgebraicSimplifierVisitor::HandleConvolution( auto new_rhs = add_bitcast(new_filter_shape, rhs); auto dot = computation_->AddInstruction(HloInstruction::CreateBinary( dot_output_shape, HloOpcode::kDot, new_lhs, new_rhs)); - changed_ = true; - return computation_->ReplaceInstruction(convolution, - add_bitcast(convolution_shape, dot)); + return ReplaceInstruction(convolution, add_bitcast(convolution_shape, dot)); } bool AlgebraicSimplifierVisitor::TransformToClampIfSameShape( diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc 
index 0cce076da5b..f4b42055b78 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -466,75 +466,6 @@ TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) { EXPECT_THAT(computation->root_instruction(), input); } -TEST_F(AlgebraicSimplifierTest, ConvertF32ToS64) { - HloComputation::Builder builder(TestName()); - HloInstruction* input = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); - builder.AddInstruction( - HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input)); - - auto module = MakeUnique(TestName()); - auto computation = module->AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), op::Convert(input)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), op::Constant()); - EXPECT_EQ(LiteralUtil::GetFirstElement( - computation->root_instruction()->literal()), - 42); -} - -TEST_F(AlgebraicSimplifierTest, ConvertS64ToF32) { - HloComputation::Builder builder(TestName()); - HloInstruction* input = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR0(42))); - builder.AddInstruction( - HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input)); - - auto module = MakeUnique(TestName()); - auto computation = module->AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), op::Convert(input)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), op::Constant()); - EXPECT_EQ(LiteralUtil::GetFirstElement( - computation->root_instruction()->literal()), - 42.0f); -} - -TEST_F(AlgebraicSimplifierTest, ConvertF32ArrayToS64Array) { - HloComputation::Builder builder(TestName()); - HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR1({42.0f, 19.0f}))); - builder.AddInstruction( - HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input)); - - auto module = MakeUnique(TestName()); - auto computation = module->AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), op::Convert(input)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); - - EXPECT_THAT(computation->root_instruction(), op::Constant()); - EXPECT_EQ( - LiteralUtil::Get(computation->root_instruction()->literal(), {0}), - 42); - EXPECT_EQ( - LiteralUtil::Get(computation->root_instruction()->literal(), {1}), - 19); -} - // Test that copies are removed. 
TEST_F(AlgebraicSimplifierTest, RemoveCopy) { Shape r0f32 = ShapeUtil::MakeShape(F32, {}); @@ -1666,69 +1597,5 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) { ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); } -TEST_F(AlgebraicSimplifierTest, Concatenate) { - const struct TestConfig { - int concat_dimension; - tensorflow::gtl::ArraySlice dimensions; - tensorflow::gtl::ArraySlice concat_sizes; - } test_configs[] = { - {1, {11, 0, 7, 5, 9}, {2, 5, 7, 11}}, - {3, {1, 4, 17, 0, 8}, {1, 3, 9, 12}}, - }; - - for (auto& test_config : test_configs) { - HloComputation::Builder builder(TestName()); - std::vector dimensions(test_config.dimensions.begin(), - test_config.dimensions.end()); - int64 concat_size = 0; - std::vector operands; - for (auto csize : test_config.concat_sizes) { - dimensions[test_config.concat_dimension] = csize; - concat_size += csize; - auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions); - HloInstruction* insn = builder.AddInstruction( - HloInstruction::CreateConstant(std::move(literal))); - operands.push_back(insn); - } - dimensions[test_config.concat_dimension] = concat_size; - Shape shape = ShapeUtil::MakeShape(F32, dimensions); - builder.AddInstruction(HloInstruction::CreateConcatenate( - shape, operands, test_config.concat_dimension)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); - - HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kConstant); - EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); - } -} - -TEST_F(AlgebraicSimplifierTest, Slice) { - HloComputation::Builder builder(TestName()); - const int64 dimensions[] = {11, 8, 7, 5, 9}; - const int64 slice_start[] = {4, 2, 3, 1, 5}; - const int64 slice_limits[] = {10, 8, 6, 5, 9}; - auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions); - HloInstruction* lit_insn = builder.AddInstruction( - HloInstruction::CreateConstant(std::move(literal))); - Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4}); - builder.AddInstruction( - HloInstruction::CreateSlice(shape, lit_insn, slice_start, slice_limits)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); - - HloInstruction* root = computation->root_instruction(); - EXPECT_EQ(root->opcode(), HloOpcode::kConstant); - EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); -} - } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc index 9a5345dc13d..cb0a99d773c 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc @@ -15,16 +15,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" -#include -#include #include -#include #include #include #include #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -34,52 +32,222 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" namespace xla { +namespace { + +template +static std::unique_ptr ConvertIfTypesMatch( + const Literal& src_literal) { + CHECK_EQ(primitive_src_type, src_literal.shape().element_type()); + return LiteralUtil::Convert< + typename primitive_util::PrimitiveTypeToNative::type, + typename primitive_util::PrimitiveTypeToNative< + primitive_dest_type>::type>(src_literal); +} + +template +static std::unique_ptr ConvertIfDestTypeMatches( + const Literal& src_literal, PrimitiveType primitive_dest_type) { + switch (primitive_dest_type) { +#define CONVERT_IF_TYPES_MATCH(type) \ + case (type): \ + return ConvertIfTypesMatch(src_literal); + CONVERT_IF_TYPES_MATCH(PRED) + CONVERT_IF_TYPES_MATCH(S8) + CONVERT_IF_TYPES_MATCH(S32) + CONVERT_IF_TYPES_MATCH(S64) + CONVERT_IF_TYPES_MATCH(U8) + CONVERT_IF_TYPES_MATCH(U32) + CONVERT_IF_TYPES_MATCH(U64) + CONVERT_IF_TYPES_MATCH(F32) + CONVERT_IF_TYPES_MATCH(F64) +#undef CONVERT_IF_TYPES_MATCH + // Other types are not yet supported. + default: + LOG(FATAL) << "Unimplemented: ConvertIfDestTypeMatches for type " + << PrimitiveType_Name(src_literal.shape().element_type()); + } +} + +static std::unique_ptr ConvertIfSrcTypeMatches( + const Literal& src_literal, PrimitiveType primitive_dest_type) { + switch (src_literal.shape().element_type()) { +#define CONVERT_IF_DEST_TYPE_MATCHES(type) \ + case (type): \ + return ConvertIfDestTypeMatches<(type)>(src_literal, primitive_dest_type); + CONVERT_IF_DEST_TYPE_MATCHES(PRED) + CONVERT_IF_DEST_TYPE_MATCHES(S8) + CONVERT_IF_DEST_TYPE_MATCHES(S32) + CONVERT_IF_DEST_TYPE_MATCHES(S64) + CONVERT_IF_DEST_TYPE_MATCHES(U8) + CONVERT_IF_DEST_TYPE_MATCHES(U32) + CONVERT_IF_DEST_TYPE_MATCHES(U64) + CONVERT_IF_DEST_TYPE_MATCHES(F32) + CONVERT_IF_DEST_TYPE_MATCHES(F64) +#undef CONVERT_IF_DEST_TYPE_MATCHES + // Other types are not yet supported. + default: + LOG(FATAL) << "Unimplemented: ConvertIfSrcTypeMatches for type " + << PrimitiveType_Name(src_literal.shape().element_type()); + } +} + +} // namespace + +// ConstantFolderVisitor traverses the HLO computation and reduces certain +// constant graph sections, to literals. +class ConstantFolderVisitor : public DfsHloVisitorWithDefault { + public: + // Default visitor action is to do nothing and return OK. + Status DefaultAction(HloInstruction* /*hlo_instruction*/) override { + return Status::OK(); + } + + Status HandleConcatenate( + HloInstruction* concatenate, + tensorflow::gtl::ArraySlice operands) override; + + Status HandleConvert(HloInstruction* convert, + HloInstruction* operand) override; + + Status HandleReshape(HloInstruction* reshape) override; + + Status HandleSlice(HloInstruction* slice, HloInstruction* operand) override; + + Status HandleTranspose(HloInstruction* transpose) override; + + // Returns whether a constant folding operation has occurred. + const bool changed() const { return changed_; } + + // Runs the visitor on a computation and returns whether any changes were + // performed. 
+ static StatusOr Run(HloComputation* computation); + + private: + ConstantFolderVisitor() = default; + + // Replaces the existing HLO instruction old_instruction, with a literal, + // and marks the optimizer status as changed. + // Returns the Status representing the result of the replace operation. + Status ReplaceWithConstant(HloInstruction* old_instruction, + std::unique_ptr literal) { + TF_RETURN_IF_ERROR(old_instruction->parent()->ReplaceWithNewInstruction( + old_instruction, HloInstruction::CreateConstant(std::move(literal)))); + changed_ = true; + return Status::OK(); + } + + // Whether any constant folding operations have occurred. + bool changed_ = false; +}; + +StatusOr ConstantFolderVisitor::Run(HloComputation* computation) { + ConstantFolderVisitor visitor; + TF_RETURN_IF_ERROR(computation->Accept(&visitor)); + return visitor.changed(); +} StatusOr HloConstantFolding::Run(HloModule* module) { + XLA_VLOG_LINES(2, + "HloConstantFolding::Run(), before:\n" + module->ToString()); bool changed = false; - for (auto& computation : module->computations()) { - for (auto instruction : computation->MakeInstructionPostOrder()) { - // Skip dead code. - if (instruction->user_count() == 0 && - computation->root_instruction() != instruction) { - continue; - } - // Depending on the opcode, choose how to handle constant operands. - // - // TODO(b/35975797): Fold constant computations for more than reshapes and - // transposes. - switch (instruction->opcode()) { - case HloOpcode::kReshape: { - if (instruction->operand(0)->opcode() == HloOpcode::kConstant) { - TF_ASSIGN_OR_RETURN( - auto reshaped_literal, - LiteralUtil::Reshape( - instruction->operand(0)->literal(), - AsInt64Slice(instruction->shape().dimensions()))); - TF_CHECK_OK(computation->ReplaceWithNewInstruction( - instruction, - HloInstruction::CreateConstant(std::move(reshaped_literal)))); - changed = true; - } - break; - } - case HloOpcode::kTranspose: { - if (instruction->operand(0)->opcode() == HloOpcode::kConstant) { - auto transposed_literal = LiteralUtil::Transpose( - instruction->operand(0)->literal(), instruction->dimensions()); - TF_CHECK_OK(computation->ReplaceWithNewInstruction( - instruction, - HloInstruction::CreateConstant(std::move(transposed_literal)))); - changed = true; - } - break; - } - default: - break; - } - } + for (auto& comp : module->computations()) { + TF_ASSIGN_OR_RETURN(bool result, ConstantFolderVisitor::Run(comp.get())); + changed = changed || result; } + XLA_VLOG_LINES(2, "HloConstantFolding::Run(), after:\n" + module->ToString()); return changed; } +Status ConstantFolderVisitor::HandleReshape(HloInstruction* reshape) { + if (reshape->operand(0)->opcode() == HloOpcode::kConstant) { + TF_ASSIGN_OR_RETURN( + auto reshaped_literal, + LiteralUtil::Reshape(reshape->operand(0)->literal(), + AsInt64Slice(reshape->shape().dimensions()))); + return ReplaceWithConstant(reshape, std::move(reshaped_literal)); + } + return Status::OK(); +} + +Status ConstantFolderVisitor::HandleTranspose(HloInstruction* transpose) { + if (transpose->operand(0)->opcode() == HloOpcode::kConstant) { + auto transposed_literal = LiteralUtil::Transpose( + transpose->operand(0)->literal(), transpose->dimensions()); + return ReplaceWithConstant(transpose, std::move(transposed_literal)); + } + return Status::OK(); +} + +Status ConstantFolderVisitor::HandleConcatenate( + HloInstruction* concatenate, + tensorflow::gtl::ArraySlice operands) { + if (operands[0]->opcode() == HloOpcode::kConstant) { + // If all the operands of a concatenate are 
constant, fold them into a + // single constant tensor. + // The result concatenate dimension is going to be the sum of all the + // concatenate dimensions of the arrays taking part of the operation. + int64 concat_dim = concatenate->dimensions()[0]; + const Shape& reference_shape = operands[0]->shape(); + CHECK(!ShapeUtil::IsTuple(reference_shape)); + int64 rank = ShapeUtil::Rank(reference_shape); + std::vector concat_dimensions(reference_shape.dimensions().begin(), + reference_shape.dimensions().end()); + if (concat_dim < 0) { + concat_dim += rank; + } + for (int64 i = 1; i < operands.size(); ++i) { + const Shape& operand_shape = operands[i]->shape(); + CHECK(!ShapeUtil::IsTuple(operand_shape)); + if (operands[i]->opcode() != HloOpcode::kConstant) { + return Status::OK(); + } + // Accumulate the concat dimension from all tensors taking part to the + // operation. + concat_dimensions[concat_dim] += + ShapeUtil::GetDimension(operand_shape, concat_dim); + } + + auto literal = LiteralUtil::CreateFromDimensions( + reference_shape.element_type(), concat_dimensions); + std::vector source_indices(rank, 0); + std::vector dest_indices(concat_dimensions.size(), 0); + for (auto operand : operands) { + const Shape& operand_shape = operand->shape(); + TF_RETURN_IF_ERROR(LiteralUtil::Copy( + operand->literal(), source_indices, literal.get(), dest_indices, + AsInt64Slice(operand_shape.dimensions()))); + dest_indices[concat_dim] += + ShapeUtil::GetDimension(operand_shape, concat_dim); + } + return ReplaceWithConstant(concatenate, std::move(literal)); + } + return Status::OK(); +} + +Status ConstantFolderVisitor::HandleSlice(HloInstruction* slice, + HloInstruction* operand) { + if (operand->opcode() == HloOpcode::kConstant) { + const Shape& shape = slice->shape(); + auto literal = LiteralUtil::CreateFromDimensions( + shape.element_type(), AsInt64Slice(shape.dimensions())); + std::vector dest_indices(slice->slice_starts().size(), 0); + TF_RETURN_IF_ERROR(LiteralUtil::Copy( + operand->literal(), slice->slice_starts(), literal.get(), dest_indices, + AsInt64Slice(shape.dimensions()))); + TF_RETURN_IF_ERROR(ReplaceWithConstant(slice, std::move(literal))); + } + return Status::OK(); +} + +Status ConstantFolderVisitor::HandleConvert(HloInstruction* convert, + HloInstruction* operand) { + if (operand->opcode() == HloOpcode::kConstant) { + const Literal& src_literal = operand->literal(); + std::unique_ptr new_constant = + ConvertIfSrcTypeMatches(src_literal, convert->shape().element_type()); + return ReplaceWithConstant(convert, std::move(new_constant)); + } + return Status::OK(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.h b/tensorflow/compiler/xla/service/hlo_constant_folding.h index 514bb8164c1..f45eccf8253 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding.h +++ b/tensorflow/compiler/xla/service/hlo_constant_folding.h @@ -25,12 +25,10 @@ namespace xla { // computation on constants. class HloConstantFolding : public HloPassInterface { public: - explicit HloConstantFolding() {} - ~HloConstantFolding() override {} tensorflow::StringPiece name() const override { return "constant_folding"; } - // Run ConstantFolding on the given module. Returns whether the module was - // changed (common subexpressions were found and eliminated). + // Run constant folding operations on the given module. Returns whether the + // module was changed (constant expressions folded). 
StatusOr Run(HloModule* module) override; }; diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc new file mode 100644 index 00000000000..d20f423bd6c --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc @@ -0,0 +1,169 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_constant_folding.h" + +#include +#include + +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_pass_fix.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/types.h" + +namespace op = xla::testing::opcode_matchers; + +namespace xla { +namespace { + +using HloConstantFoldingTest = HloTestBase; + +TEST_F(HloConstantFoldingTest, ConvertF32ToS64) { + HloComputation::Builder builder(TestName()); + HloInstruction* input = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42.0f))); + builder.AddInstruction( + HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input)); + + auto module = MakeUnique(TestName()); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + + HloConstantFolding simplifier; + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Constant()); + EXPECT_EQ(LiteralUtil::GetFirstElement( + computation->root_instruction()->literal()), + 42); +} + +TEST_F(HloConstantFoldingTest, ConvertS64ToF32) { + HloComputation::Builder builder(TestName()); + HloInstruction* input = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(42))); + builder.AddInstruction( + HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input)); + + auto module = MakeUnique(TestName()); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + + HloConstantFolding simplifier; + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Constant()); + EXPECT_EQ(LiteralUtil::GetFirstElement( + computation->root_instruction()->literal()), + 42.0f); +} + +TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) { + HloComputation::Builder builder(TestName()); + HloInstruction* input = builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1({42.0f, 
19.0f}))); + builder.AddInstruction( + HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input)); + + auto module = MakeUnique(TestName()); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_THAT(computation->root_instruction(), op::Convert(input)); + + HloConstantFolding simplifier; + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + + EXPECT_THAT(computation->root_instruction(), op::Constant()); + EXPECT_EQ( + LiteralUtil::Get(computation->root_instruction()->literal(), {0}), + 42); + EXPECT_EQ( + LiteralUtil::Get(computation->root_instruction()->literal(), {1}), + 19); +} + +TEST_F(HloConstantFoldingTest, Concatenate) { + const struct TestConfig { + int concat_dimension; + tensorflow::gtl::ArraySlice dimensions; + tensorflow::gtl::ArraySlice concat_sizes; + } test_configs[] = { + {1, {11, 0, 7, 5, 9}, {2, 5, 7, 11}}, + {3, {1, 4, 17, 0, 8}, {1, 3, 9, 12}}, + }; + + for (auto& test_config : test_configs) { + HloComputation::Builder builder(TestName()); + std::vector dimensions(test_config.dimensions.begin(), + test_config.dimensions.end()); + int64 concat_size = 0; + std::vector operands; + for (auto csize : test_config.concat_sizes) { + dimensions[test_config.concat_dimension] = csize; + concat_size += csize; + auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions); + HloInstruction* insn = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(literal))); + operands.push_back(insn); + } + dimensions[test_config.concat_dimension] = concat_size; + Shape shape = ShapeUtil::MakeShape(F32, dimensions); + builder.AddInstruction(HloInstruction::CreateConcatenate( + shape, operands, test_config.concat_dimension)); + HloModule module(TestName()); + auto computation = module.AddEntryComputation(builder.Build()); + + HloConstantFolding simplifier; + ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Constant()); + EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); + } +} + +TEST_F(HloConstantFoldingTest, Slice) { + HloComputation::Builder builder(TestName()); + const int64 dimensions[] = {11, 8, 7, 5, 9}; + const int64 slice_start[] = {4, 2, 3, 1, 5}; + const int64 slice_limits[] = {10, 8, 6, 5, 9}; + auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions); + HloInstruction* lit_insn = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(literal))); + Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4}); + builder.AddInstruction( + HloInstruction::CreateSlice(shape, lit_insn, slice_start, slice_limits)); + HloModule module(TestName()); + auto computation = module.AddEntryComputation(builder.Build()); + + HloConstantFolding simplifier; + ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Constant()); + EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); +} + +} // namespace +} // namespace xla From a27319bf06a22e1fae89aa54aa62dcd5814f2641 Mon Sep 17 00:00:00 2001 From: Asim Shankar Date: Tue, 2 May 2017 10:46:53 -0800 Subject: [PATCH 11/51] Java: Docs: Update instructions for Windows. 
Inspired by http://stackoverflow.com/questions/43741775/tensorflow-in-java-running-failed Change: 154859066 --- tensorflow/docs_src/install/install_java.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md index 1abf3b69f5e..65cfe375d57 100644 --- a/tensorflow/docs_src/install/install_java.md +++ b/tensorflow/docs_src/install/install_java.md @@ -211,15 +211,20 @@ two files are available to the JVM: * the downloaded `.jar` file * the extracted JNI library -For example, the following command line executes the `HelloTF` program: +For example, the following command line executes the `HelloTF` program on Linux +and Mac OS X:
java -cp libtensorflow-1.1.0-rc2.jar:. -Djava.library.path=./jni HelloTF
+And the following command line executes the `HelloTF` program on Windows: + +
java -cp libtensorflow-1.1.0-rc2.jar;. -Djava.library.path=jni HelloTF
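The two invocations differ only in the classpath separator, which the JVM defines as `:` on Linux and Mac OS X but `;` on Windows, and in the form of the `java.library.path` entry pointing at the extracted JNI library.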
+ If the program prints Hello from version, you've successfully installed TensorFlow for Java and are ready to use the API. If the program outputs something else, check -[Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow) -for possible solutions. +[Stack Overflow](http://stackoverflow.com/questions/tagged/tensorflow) for +possible solutions. ### Advanced Example From 5e23da5ace56f577566285352b2bf246b1fcabc9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 10:50:47 -0800 Subject: [PATCH 12/51] Add more documentation for features and labels. Change: 154859649 --- tensorflow/python/estimator/estimator.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py index c04e37eccd6..c394315cfa8 100644 --- a/tensorflow/python/estimator/estimator.py +++ b/tensorflow/python/estimator/estimator.py @@ -94,13 +94,15 @@ class Estimator(object): * Args: - * `features`: single `Tensor` or `dict` of `Tensor`s - (depending on data passed to `train`), - * `labels`: `Tensor` or `dict` of `Tensor`s (for multi-head - models). If mode is `ModeKeys.PREDICT`, `labels=None` will be - passed. If the `model_fn`'s signature does not accept - `mode`, the `model_fn` must still be able to handle - `labels=None`. + * `features`: This is the first item returned from the `input_fn` + passed to `train`, 'evaluate`, and `predict`. This should be a + single `Tensor` or `dict` of same. + * `labels`: This is the second item returned from the `input_fn` + passed to `train`, 'evaluate`, and `predict`. This should be a + single `Tensor` or `dict` of same (for multi-head models). If + mode is `ModeKeys.PREDICT`, `labels=None` will be passed. If + the `model_fn`'s signature does not accept `mode`, the + `model_fn` must still be able to handle `labels=None`. * `mode`: Optional. Specifies if this training, evaluation or prediction. See `ModeKeys`. * `params`: Optional `dict` of hyperparameters. Will receive what From 4c5274e120e33b262164686ecb2b4714a5a11f23 Mon Sep 17 00:00:00 2001 From: Toby Boyd Date: Tue, 2 May 2017 10:54:46 -0800 Subject: [PATCH 13/51] Added link to high-performance models Change: 154860213 --- tensorflow/docs_src/performance/performance_guide.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md index 8a1bba883ae..07c5d3087f3 100644 --- a/tensorflow/docs_src/performance/performance_guide.md +++ b/tensorflow/docs_src/performance/performance_guide.md @@ -1,8 +1,10 @@ -# Performance +# Performance Guide This guide contains a collection of best practices for optimizing your TensorFlow code. The best practices apply to both new and experienced -Tensorflow users. +Tensorflow users. As a complement to the best practices in this document, the +@{$performance_models$High-Performance Models} document links to example code +and details for creating models that scale on a variety of hardware. ## Best Practices While optimizing implementations of different types of models can be different, @@ -73,7 +75,7 @@ Unless for a special circumstance or for example code, do not feed data into the session from Python variables, e.g. `dictionary`. ```python -# This will result in poor performance. +# Using feed_dict often results in suboptimal performance when using large inputs. 
sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys}) ``` @@ -141,3 +143,4 @@ bn = tf.contrib.layers.batch_norm( The non-fused batch norm does computations using several individual Ops. Fused batch norm combines the individual operations into a single kernel, which runs faster. + From 1e27ffdd12d2d6d8f34f7b5275d975fd3c2faac6 Mon Sep 17 00:00:00 2001 From: Toby Boyd Date: Tue, 2 May 2017 11:09:49 -0800 Subject: [PATCH 14/51] Navigation and index for new performance section documents. Change: 154862215 --- tensorflow/docs_src/performance/index.md | 12 ++++++++++-- tensorflow/docs_src/performance/leftnav_files | 5 ++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tensorflow/docs_src/performance/index.md b/tensorflow/docs_src/performance/index.md index 0ff4d2ee004..746dc0c74fe 100644 --- a/tensorflow/docs_src/performance/index.md +++ b/tensorflow/docs_src/performance/index.md @@ -2,11 +2,19 @@ Performance is often a significant issue when training a machine learning model. This section explains various ways to optimize performance. Start -your investigation with the following guide: +your investigation with the @{$performance_guide$Performance Guide} and then go +deeper with techniques detailed in @{$performance_models$High-Performance Models}: - * @{$performance_guide$Performance}, which contains a collection of best + * @{$performance_guide$Performance Guide}, which contains a collection of best practices for optimizing your TensorFlow code. + * @{$performance_models$High-Performance Models}, which contains a collection + advanced techniques to build highly scalable models targeting different + system types and network topologies. + + * @{$benchmarks$Benchmarks}, which contains a collection of benchmark + results. + XLA (Accelerated Linear Algebra) is an experimental compiler for linear algebra that optimizes TensorFlow computations. The following guides explore XLA: diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files index 0f30cc7fa5c..d2284732208 100644 --- a/tensorflow/docs_src/performance/leftnav_files +++ b/tensorflow/docs_src/performance/leftnav_files @@ -1,4 +1,8 @@ performance_guide.md +performance_models.md +benchmarks.md +quantization.md +>>> xla/index.md xla/broadcasting.md xla/developing_new_backend.md @@ -6,4 +10,3 @@ xla/jit.md xla/operation_semantics.md xla/shapes.md xla/tfcompile.md -quantization.md From 6a533a0824eb23a32b8e9c5d64ddccf08497e45d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 11:13:39 -0800 Subject: [PATCH 15/51] Fix shape mismatch between loss and weights. Change: 154862650 --- .../learn/python/learn/estimators/head.py | 11 ++++++- .../python/learn/estimators/head_test.py | 30 +++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index 452f8a901ee..15e457f932c 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -921,12 +921,21 @@ def _softmax_cross_entropy_loss(labels, logits, weights=None): if not labels.dtype.is_integer: raise ValueError("Labels dtype should be integer " "Instead got %s." % labels.dtype) - # TODO(ptucker): This will break for dynamic shapes. + # sparse_softmax_cross_entropy_with_logits requires [batch_size] labels. + is_squeezed_labels = False + # TODO(ptucker): This will break for dynamic shapes. 
if len(labels.get_shape()) == 2: labels = array_ops.squeeze(labels, squeeze_dims=(1,)) + is_squeezed_labels = True + loss = nn.sparse_softmax_cross_entropy_with_logits( labels=labels, logits=logits, name=name) + + # Restore squeezed dimension, if necessary, so loss matches weights shape. + if is_squeezed_labels: + loss = array_ops.expand_dims(loss, axis=(1,)) + return _compute_weighted_loss(loss, weights) diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py index 442530cb5ee..207a189a94d 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py @@ -791,7 +791,7 @@ class BinaryClassificationHeadTest(test.TestCase): [b"0", b"1"], predicted_classes[0]) self.assertIn("probabilities", six.iterkeys(predictions_for_serving)) - def testBinaryClassificationInferMode_withWightColumn(self): + def testBinaryClassificationInferMode_withWeightColumn(self): n_classes = 2 head = head_lib.multi_class_head(n_classes=n_classes, weight_column_name="label_weight") @@ -951,7 +951,7 @@ class MultiClassHeadTest(test.TestCase): def setUp(self): self._logits = ((1., 0., 0.),) - self._labels = (2,) + self._labels = ((2,),) def _expected_eval_metrics(self, expected_loss): return { @@ -1131,7 +1131,7 @@ class MultiClassHeadTest(test.TestCase): _assert_metrics(self, expected_loss, expected_eval_metrics, model_fn_ops) - def testMultiClassWithWeight(self): + def testMultiClassWithScalarWeight(self): n_classes = 3 head = head_lib.multi_class_head( n_classes=n_classes, @@ -1154,6 +1154,30 @@ class MultiClassHeadTest(test.TestCase): _assert_metrics(self, expected_loss * weight, self._expected_eval_metrics(expected_loss), model_fn_ops) + def testMultiClassWith2DWeight(self): + n_classes = 3 + head = head_lib.multi_class_head( + n_classes=n_classes, + weight_column_name="label_weight", + metric_class_ids=range(n_classes)) + with ops.Graph().as_default(), session.Session(): + weight = .1 + weights = ((weight,),) + # logloss: z:label, x:logit + # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) + model_fn_ops = head.create_model_fn_ops( + features={"label_weight": weights}, + labels=self._labels, + mode=model_fn.ModeKeys.TRAIN, + train_op_fn=head_lib.no_op_train_fn, + logits=self._logits) + self._assert_output_alternatives(model_fn_ops) + _assert_no_variables(self) + _assert_summary_tags(self, ["loss"]) + expected_loss = 1.5514447 + _assert_metrics(self, expected_loss * weight, + self._expected_eval_metrics(expected_loss), model_fn_ops) + def testMultiClassWithCustomLoss(self): n_classes = 3 head = head_lib.multi_class_head( From 6aae4bb997851f2de84a7d5337e980e840ba516f Mon Sep 17 00:00:00 2001 From: Cassandra Xia Date: Tue, 2 May 2017 11:13:48 -0800 Subject: [PATCH 16/51] Add examples to TensorShape documentation and ran autoformatter. Change: 154862667 --- tensorflow/python/framework/tensor_shape.py | 52 ++++++++++----------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py index 3664710caa3..73c810711f4 100644 --- a/tensorflow/python/framework/tensor_shape.py +++ b/tensorflow/python/framework/tensor_shape.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - """Helper classes for tensor shape inference.""" from __future__ import absolute_import from __future__ import division @@ -31,8 +30,8 @@ class Dimension(object): self._value = None else: self._value = int(value) - if (not isinstance(value, compat.bytes_or_text_types) - and self._value != value): + if (not isinstance(value, compat.bytes_or_text_types) and + self._value != value): raise ValueError("Ambiguous dimension: %s" % value) if self._value < 0: raise ValueError("Dimension %d must be >= 0" % self._value) @@ -89,9 +88,8 @@ class Dimension(object): True if this Dimension and `other` are compatible. """ other = as_dimension(other) - return (self._value is None - or other.value is None - or self._value == other.value) + return (self._value is None or other.value is None or + self._value == other.value) def assert_is_compatible_with(self, other): """Raises an exception if `other` is not compatible with this Dimension. @@ -104,8 +102,8 @@ class Dimension(object): is_compatible_with). """ if not self.is_compatible_with(other): - raise ValueError("Dimensions %s and %s are not compatible" - % (self, other)) + raise ValueError("Dimensions %s and %s are not compatible" % (self, + other)) def merge_with(self, other): """Returns a Dimension that combines the information in `self` and `other`. @@ -385,18 +383,17 @@ class TensorShape(object): `Tensor`. It may be one of the following: * *Fully-known shape:* has a known number of dimensions and a known size - for each dimension. + for each dimension. e.g. `TensorShape([16, 256])` * *Partially-known shape:* has a known number of dimensions, and an unknown - size for one or more dimension. + size for one or more dimension. e.g. `TensorShape([None, 256])` * *Unknown shape:* has an unknown number of dimensions, and an unknown - size in all dimensions. + size in all dimensions. e.g. `TensorShape(None)` If a tensor is produced by an operation of type `"Foo"`, its shape may be inferred if there is a registered shape function for - `"Foo"`. See @{$adding_an_op#shape-functions-in-c$`Shape functions in C++`} for - details of shape functions and how to register them. Alternatively, - the shape may be set explicitly using - @{tf.Tensor.set_shape}. + `"Foo"`. See @{$adding_an_op#shape-functions-in-c$`Shape functions in C++`} + for details of shape functions and how to register them. Alternatively, + the shape may be set explicitly using @{tf.Tensor.set_shape}. """ def __init__(self, dims): @@ -414,7 +411,7 @@ class TensorShape(object): self._dims = None elif isinstance(dims, compat.bytes_or_text_types): raise TypeError("A string has ambiguous TensorShape, please wrap in a " - "list or convert to an int: %s" % dims) + "list or convert to an int: %s" % dims) elif isinstance(dims, tensor_shape_pb2.TensorShapeProto): if dims.unknown_rank: self._dims = None @@ -422,7 +419,8 @@ class TensorShape(object): self._dims = [ # Protos store variable-size dimensions as -1 as_dimension(dim.size if dim.size != -1 else None) - for dim in dims.dim] + for dim in dims.dim + ] elif isinstance(dims, TensorShape): self._dims = dims.dims else: @@ -519,7 +517,7 @@ class TensorShape(object): # suffixes of otherwise unknown shapes. 
return unknown_shape() else: - return unknown_shape(ndims=stop-start) + return unknown_shape(ndims=stop - start) else: return Dimension(None) @@ -560,8 +558,7 @@ class TensorShape(object): new_dims.append(dim.merge_with(other[i])) return TensorShape(new_dims) except ValueError: - raise ValueError("Shapes %s and %s are not compatible" % - (self, other)) + raise ValueError("Shapes %s and %s are not compatible" % (self, other)) def concatenate(self, other): """Returns the concatenation of the dimension in `self` and `other`. @@ -599,8 +596,8 @@ class TensorShape(object): other = as_shape(other) if self.ndims is not None and other.ndims is not None: if self.ndims != other.ndims: - raise ValueError( - "Shapes %s and %s must have the same rank" % (self, other)) + raise ValueError("Shapes %s and %s must have the same rank" % (self, + other)) def assert_has_rank(self, rank): """Raises an exception if `self` is not compatible with the given `rank`. @@ -736,8 +733,8 @@ class TensorShape(object): def is_fully_defined(self): """Returns True iff `self` is fully defined in every dimension.""" - return (self._dims is not None - and all(dim.value is not None for dim in self._dims)) + return (self._dims is not None and all(dim.value is not None + for dim in self._dims)) def assert_is_fully_defined(self): """Raises an exception if `self` is not fully defined in every dimension. @@ -767,9 +764,10 @@ class TensorShape(object): return tensor_shape_pb2.TensorShapeProto(unknown_rank=True) else: return tensor_shape_pb2.TensorShapeProto(dim=[ - tensor_shape_pb2.TensorShapeProto.Dim( - size=-1 if d.value is None else d.value) - for d in self._dims]) + tensor_shape_pb2.TensorShapeProto.Dim(size=-1 + if d.value is None else d.value) + for d in self._dims + ]) def __eq__(self, other): """Returns True if `self` is equivalent to `other`.""" From 50df613b944c975ad719141ef8a012ccfd68b4b7 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 11:21:41 -0800 Subject: [PATCH 17/51] Move linking of cudnn_plugin, cublas_plugin and cufft_plugin from stream_executor to the ops that need them. 
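A note on the mechanism, as far as the diff shows: the `cublas_plugin`, `cudnn_plugin`, and `cufft_plugin` targets added to the default `build_config/BUILD` are placeholder `cc_library` rules with empty `srcs`, giving dependents a stable label to list in `deps`; the kernels that actually use a plugin (for example the FFT and convolution ops) then reference it under `if_cuda`, instead of every user of stream_executor linking all three plugins.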
Change: 154863520 --- tensorflow/compiler/xla/service/gpu/BUILD | 2 ++ tensorflow/core/kernels/BUILD | 9 +++++++-- .../core/platform/default/build_config/BUILD | 16 ++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 1fdbcfe5641..d26f415fd4b 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -264,6 +264,8 @@ cc_library( "//tensorflow/compiler/xla/service:tuple_points_to_analysis", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform/default/build_config:cublas_plugin", + "//tensorflow/core/platform/default/build_config:cudnn_plugin", "//tensorflow/core/platform/default/build_config:stream_executor_cuda", ], ) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 231e06d5f4d..29b4d63bbf8 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2042,6 +2042,7 @@ tf_kernel_library( deps = [ "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core/platform/default/build_config:cublas_plugin", "@local_config_cuda//cuda:cusolver", ], ) @@ -2322,7 +2323,9 @@ tf_kernel_library( prefix = "fft_ops", deps = MATH_DEPS + [ "//tensorflow/core:spectral_ops_op_lib", - ], + ] + if_cuda([ + "//tensorflow/core/platform/default/build_config:cufft_plugin", + ]), ) tf_kernel_library( @@ -2626,7 +2629,9 @@ tf_kernel_library( "@libxsmm_archive//:xsmm_avx", ], "//conditions:default": [], - }), + }) + if_cuda([ + "//tensorflow/core/platform/default/build_config:cudnn_plugin", + ]), ) tf_kernel_library( diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD index 8bc412c5d8f..9e3d5f354db 100644 --- a/tensorflow/core/platform/default/build_config/BUILD +++ b/tensorflow/core/platform/default/build_config/BUILD @@ -58,6 +58,22 @@ cc_library( ], ) +# Dummy stream executor cuda plugins. +cc_library( + name = "cublas_plugin", + srcs = [], +) + +cc_library( + name = "cufft_plugin", + srcs = [], +) + +cc_library( + name = "cudnn_plugin", + srcs = [], +) + # OSX framework for device driver access cc_library( name = "IOKit", From 0c372ad452e3d65aa57e7e8c3fef51846383c10a Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 2 May 2017 11:48:29 -0800 Subject: [PATCH 18/51] Properly track the persistent memory usage of lookup tables. 
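The accounting model in this patch is simple enough to sketch. A minimal stand-in for the estimate used by `HashTable::MemoryUsed()` in the diff below (hash-map bucket overhead is deliberately ignored, which is why the unit test only asserts a lower bound of `4 * sizeof(int64)`):

```cpp
#include <cstdint>
#include <unordered_map>

// Stand-in for the persistent-memory estimate, not the TensorFlow code:
// number of stored pairs times the flat key/value sizes. Container
// overhead is not counted.
template <typename K, typename V>
int64_t EstimatedTableBytes(const std::unordered_map<K, V>& table) {
  return static_cast<int64_t>(table.size()) * (sizeof(K) + sizeof(V));
}
```

Because the table is only populated by `InitializeTable`, the test below expects the bytes to be charged to the `initialize_table` node rather than to `hash_table` itself.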
Change: 154866686
---
 tensorflow/core/grappler/clusters/BUILD            |   1 +
 .../core/grappler/clusters/single_machine.cc       |  48 +++++++-
 .../core/grappler/clusters/single_machine.h        |   2 +
 .../grappler/clusters/single_machine_test.cc       | 115 ++++++++++++++++++
 tensorflow/core/kernels/lookup_table_op.h          |  13 +-
 5 files changed, 171 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index 34ad4048562..b48025b86f8 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -58,6 +58,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:ops_util",
     ],
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 09c8d55efda..abb9e4245ee 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/cc/training/queue_runner.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -111,6 +112,8 @@ Status SingleMachine::Run(const GraphDef& graph_def,
     for (auto& node : *init_metadata_.mutable_cost_graph()->mutable_node()) {
       node.clear_compute_cost();
     }
+    // Also clear the timeline to save memory.
+    init_metadata_.clear_step_stats();
   }
   for (int i = 0; i < queue_runner_defs_.size(); ++i) {
     std::unique_ptr<QueueRunner> queue_runner;
@@ -133,15 +136,17 @@ Status SingleMachine::Run(const GraphDef& graph_def,
     }
   }
 
-  TF_RETURN_IF_ERROR(RunWithTimeout(feed, fetch, metadata));
   if (metadata) {
-    // Add the costs of initialization and the queue runners.
-    metadata->MergeFrom(init_metadata_);
-    return coordinator_->ExportCostGraph(metadata->mutable_cost_graph());
+    TF_RETURN_IF_ERROR(RunWithTimeout(feed, fetch, metadata));
+    // Merge the costs of the initialization and the queue runners.
+    CostGraphDef queue_costs;
+    TF_RETURN_IF_ERROR(coordinator_->ExportCostGraph(&queue_costs));
+    MergeCosts(metadata->mutable_cost_graph(), init_metadata_.cost_graph(),
+               queue_costs);
   } else {
-    return Status::OK();
+    return RunWithTimeout(feed, fetch, nullptr);
   }
+  return Status::OK();
 }
 
 Status SingleMachine::RunWithTimeout(
@@ -249,5 +254,36 @@ Status SingleMachine::ResetSession() {
   return Status::OK();
 }
 
+void SingleMachine::MergeCosts(CostGraphDef* graph_costs,
+                               const CostGraphDef& init_costs,
+                               const CostGraphDef& queue_costs) {
+  graph_costs->mutable_node()->Reserve(graph_costs->node_size() +
+                                       init_costs.node_size() +
+                                       queue_costs.node_size());
+  std::unordered_set<string> nodes_seen;
+  for (const auto& node : graph_costs->node()) {
+    nodes_seen.insert(node.name());
+  }
+
+  // The costs obtained by running the main graph could be more stable than
+  // the ones we get from the queue runners, since the queue runners run
+  // asynchronously.
+  for (const auto& node : queue_costs.node()) {
+    if (nodes_seen.find(node.name()) != nodes_seen.end()) {
+      continue;
+    }
+    graph_costs->add_node()->MergeFrom(node);
+  }
+
+  // Don't overwrite the costs with those generated during initialization,
+  // since they are possibly outdated.
+  for (const auto& node : init_costs.node()) {
+    if (nodes_seen.find(node.name()) != nodes_seen.end()) {
+      continue;
+    }
+    graph_costs->add_node()->MergeFrom(node);
+  }
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h
index f69b11df5da..f2773376e41 100644
--- a/tensorflow/core/grappler/clusters/single_machine.h
+++ b/tensorflow/core/grappler/clusters/single_machine.h
@@ -47,6 +47,8 @@ class SingleMachine : public Cluster {
                         RunMetadata* run_metadata, int64 timeout_s);
   Status ResetSession();
   Status CloseSession(bool use_timeout);
+  void MergeCosts(CostGraphDef* graph_costs, const CostGraphDef& init_costs,
+                  const CostGraphDef& queue_costs);
 
   const int num_gpus_;
   std::unique_ptr<Session> session_;
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index 0572aa04be7..17db48817e5 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -159,6 +159,121 @@ TEST_F(SingleMachineTest, InitializationMemory) {
   EXPECT_TRUE(found);
 }
 
+namespace {
+template <typename T>
+inline void SetNodeAttr(const string& key, const T& value, NodeDef* node) {
+  AttrValue attr_value;
+  SetAttrValue(value, &attr_value);
+  auto* attr_map = node->mutable_attr();
+  (*attr_map)[key] = attr_value;
+}
+template <>
+inline void SetNodeAttr(const string& key, const Tensor& tensor,
+                        NodeDef* node) {
+  TensorProto tensor_proto;
+  tensor.AsProtoTensorContent(&tensor_proto);
+  SetNodeAttr(key, tensor_proto, node);
+}
+
+}  // namespace
+
+TEST_F(SingleMachineTest, PersistentMemory) {
+  // Build a hashtable and its initialization graph.
+  GrapplerItem item;
+  const DataType key_dtype = DT_INT64;
+  const DataType data_dtype = DT_INT64;
+
+  NodeDef* hashtable_node = item.graph.add_node();
+  hashtable_node->set_op("HashTable");
+  hashtable_node->set_name("hash_table");
+  SetNodeAttr("key_dtype", key_dtype, hashtable_node);
+  SetNodeAttr("value_dtype", data_dtype, hashtable_node);
+
+  // Initial hashtable keys and values.
+  NodeDef* keys_node = item.graph.add_node();
+  keys_node->set_op("Const");
+  keys_node->set_name("table_keys");
+  SetNodeAttr("dtype", key_dtype, keys_node);
+  Tensor keys(key_dtype, TensorShape{2});
+  keys.vec<int64>()(0) = 123;
+  keys.vec<int64>()(1) = 321;
+  SetNodeAttr("value", keys, keys_node);
+
+  NodeDef* values_node = item.graph.add_node();
+  values_node->set_op("Const");
+  values_node->set_name("table_values");
+  SetNodeAttr("dtype", data_dtype, values_node);
+  Tensor values(data_dtype, TensorShape{2});
+  values.vec<int64>()(0) = 789;
+  values.vec<int64>()(1) = 987;
+  SetNodeAttr("value", values, values_node);
+
+  // InitializeTable node.
+  NodeDef* init_table_node = item.graph.add_node();
+  init_table_node->set_op("InitializeTable");
+  init_table_node->set_name("initialize_table");
+  SetNodeAttr("Tkey", key_dtype, init_table_node);
+  SetNodeAttr("Tval", data_dtype, init_table_node);
+  *init_table_node->add_input() = "hash_table";
+  *init_table_node->add_input() = "table_keys";
+  *init_table_node->add_input() = "table_values";
+  item.init_ops.push_back(init_table_node->name());
+
+  // Key to look up.
+  NodeDef* query_node = item.graph.add_node();
+  query_node->set_op("Const");
+  query_node->set_name("query");
+  SetNodeAttr("dtype", key_dtype, query_node);
+  Tensor query(key_dtype, TensorShape({}));
+  query.flat<int64>()(0) = 0;
+  SetNodeAttr("value", query, query_node);
+
+  // Default return value of the hashtable lookup.
+  NodeDef* default_value_node = item.graph.add_node();
+  default_value_node->set_op("Const");
+  default_value_node->set_name("default_table_value");
+  SetNodeAttr("dtype", data_dtype, default_value_node);
+  Tensor dflt(data_dtype, TensorShape({}));
+  dflt.flat<int64>()(0) = 456;
+  SetNodeAttr("value", dflt, default_value_node);
+
+  // HashTable lookup node.
+  NodeDef* lookup_node = item.graph.add_node();
+  lookup_node->set_op("LookupTableFind");
+  lookup_node->set_name("table_lookup");
+  SetNodeAttr("Tin", key_dtype, lookup_node);
+  SetNodeAttr("Tout", data_dtype, lookup_node);
+  *lookup_node->add_input() = "hash_table";
+  *lookup_node->add_input() = "query";
+  *lookup_node->add_input() = "default_table_value";
+  item.fetch.push_back(lookup_node->name());
+
+  // Run the graph.
+  TF_CHECK_OK(cluster_->Initialize(item));
+  RunMetadata metadata;
+  TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
+
+  // Check the cost model.
+  bool found_table_init = false;
+  bool found_hashtable = false;
+  for (const auto& node : metadata.cost_graph().node()) {
+    if (node.name() == "hash_table") {
+      found_hashtable = true;
+      // Persistent memory usage should be 0 since it's recorded as part of
+      // the initialize_table op.
+      EXPECT_EQ(0, node.host_persistent_memory_size());
+      EXPECT_EQ(0, node.device_persistent_memory_size());
+    } else if (node.name() == "initialize_table") {
+      found_table_init = true;
+      // Persistent memory should hold 2 keys and 2 values.
+      EXPECT_LE(4 * sizeof(int64), node.host_persistent_memory_size());
+      EXPECT_EQ(0, node.device_persistent_memory_size());
+    }
+  }
+  EXPECT_TRUE(found_table_init);
+  EXPECT_TRUE(found_hashtable);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 4cd25a3cc6f..ff23a09a24f 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -64,8 +64,8 @@ class LookupTableOp : public OpKernel {
         return ctx->status();
       }
       if (ctx->track_allocations()) {
-        ctx->record_device_persistent_memory_allocation(
-            container->MemoryUsed());
+        ctx->record_host_persistent_memory_allocation(
+            container->MemoryUsed() + table_handle_.AllocatedBytes());
       }
       *ret = container;
       return Status::OK();
@@ -225,6 +225,15 @@ class HashTable : public InitializableLookupTable {
     return Status::OK();
   }
 
+  int64 MemoryUsed() const override {
+    if (table_) {
+      const int64 num_elements = table_->size();
+      return num_elements * (sizeof(K) + sizeof(V));
+    } else {
+      return 0;
+    }
+  }
+
  private:
   std::unique_ptr<std::unordered_map<K, V>> table_;
 };

From 498565a68898b4ce0a696e24fc36a52792141631 Mon Sep 17 00:00:00 2001
From: Benoit Steiner
Date: Tue, 2 May 2017 11:48:51 -0800
Subject: [PATCH 19/51] Reset the inputs to ShapeRefiner::RunShapeFn so that it
 behaves the same every time it's called.

To properly handle queues that have been populated by several enqueue ops,
merge the shapes of the inputs to all the enqueue ops before calling
InferenceContext::set_output_handle_shape(). This ensures that we detect
incorrect queue setups (where two enqueue ops might generate tensors with
incompatible shapes), and that we use all the known shape information instead
of that from just one of the enqueue ops.
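To visualize the queue setup this change addresses, here is a minimal TF 1.x sketch, assuming the standard queue API. The merged shape described below is what grappler's static shape analysis infers; the Python front end still reports an unknown shape for the dequeued tensor:

```python
import tensorflow as tf

q = tf.RandomShuffleQueue(capacity=10, min_after_dequeue=0,
                          dtypes=[tf.float32])
known = tf.placeholder(tf.float32, shape=[3, 7])       # fully defined shape
partial = tf.placeholder(tf.float32, shape=[None, 7])  # dim 0 unknown

# Two enqueue ops feed the same queue, so their input shapes are merged.
enqueue_a = q.enqueue([known])
enqueue_b = q.enqueue([partial])
dequeued = q.dequeue()
# Merging [3, 7] with [?, 7] yields [3, 7] for the dequeued tensor.
# Merging incompatible shapes (say [3, 7] and [5, 7]) should instead be
# flagged as an error, exposing the bad queue setup.
```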
Change: 154866747
---
 tensorflow/core/common_runtime/shape_refiner.cc    |  2 ++
 tensorflow/core/grappler/costs/graph_properties.cc | 21 ++++++++++------
 .../core/grappler/costs/graph_properties_test.cc   | 25 +++++++++++++++++++
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 828297a1abe..8eb383a14fe 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -468,6 +468,8 @@ Status ShapeRefiner::RunShapeFn(const Node* node,
   std::vector<ShapeHandle> input_tensors_as_shapes;
 
   // Run the shape inference function, and return if there was an error.
+  c->set_input_tensors(input_tensors);
+  c->set_input_tensors_as_shapes(input_tensors_as_shapes);
   if (op_reg_data->shape_inference_fn) {
     TF_RETURN_IF_ERROR(c->Run(op_reg_data->shape_inference_fn));
   } else {
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index ad8f4f3f7cc..31c1043ae69 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -60,8 +60,9 @@ Status GraphProperties::InferStatically() {
       if (!qctx) {
        continue;
       }
-      shape_inference::ShapeHandle data_shp = qctx->output_handle_shape(0);
-      if (qctx->FullyDefined(data_shp)) {
+      DataType queue_type = qctx->output_handle_dtype(0);
+      shape_inference::ShapeHandle queue_shp = qctx->output_handle_shape(0);
+      if (qctx->FullyDefined(queue_shp) && queue_type != DT_INVALID) {
         continue;
       }
 
@@ -73,16 +74,20 @@ Status GraphProperties::InferStatically() {
         if (node->type_string().find("Enqueue") != std::string::npos) {
           if (ctx->num_inputs() == 2) {
             const DataType dtype = node->input_type(1);
-            shape_inference::ShapeHandle shp = ctx->input(1);
-            shape_inference::ShapeHandle refined;
-            TF_RETURN_IF_ERROR(qctx->Merge(shp, data_shp, &refined));
-            if (qctx->set_output_handle_shape(0, refined) ||
-                qctx->set_output_handle_dtype(0, dtype)) {
-              new_shapes.push(qnode);
+            if (queue_type == DT_INVALID) {
+              queue_type = dtype;
+            } else {
+              CHECK_EQ(queue_type, dtype);
             }
+            shape_inference::ShapeHandle shp = ctx->input(1);
+            TF_RETURN_IF_ERROR(qctx->Merge(queue_shp, shp, &queue_shp));
           }
         }
       }
+      if (qctx->set_output_handle_dtype(0, queue_type) ||
+          qctx->set_output_handle_shape(0, queue_shp)) {
+        new_shapes.push(qnode);
+      }
     }
     // Propagate the shapes in the transitive fan-out of the queue.
     done = new_shapes.empty();
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 1eff52ba0e6..94b809dc44e 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -177,6 +177,19 @@ TEST_F(GraphPropertiesTest, Queues) {
   auto dequeue2 =
       ops::QueueDequeue(root.WithOpName("Dequeue2"), q2, {DataType::DT_FLOAT});
 
+  auto q3 =
+      ops::RandomShuffleQueue(root.WithOpName("Queue3"), {DataType::DT_FLOAT});
+  auto dequeue3 =
+      ops::QueueDequeue(root.WithOpName("Dequeue3"), q3, {DataType::DT_FLOAT});
+
+  auto q4 =
+      ops::RandomShuffleQueue(root.WithOpName("Queue4"), {DataType::DT_FLOAT});
+  auto enqueue4 = ops::QueueEnqueue(root.WithOpName("Enqueue4"), q4, {square2});
+  auto enqueue4_2 =
+      ops::QueueEnqueue(root.WithOpName("Enqueue4_2"), q4, {dequeue3[0]});
+  auto dequeue4 =
+      ops::QueueDequeue(root.WithOpName("Dequeue4"), q4, {DataType::DT_FLOAT});
+
   GrapplerItem item;
   TF_CHECK_OK(root.ToGraphDef(&item.graph));
 
@@ -200,6 +213,18 @@ TEST_F(GraphPropertiesTest, Queues) {
   EXPECT_EQ(2, prop2.shape().dim_size());
   EXPECT_EQ(3, prop2.shape().dim(0).size());
   EXPECT_EQ(7, prop2.shape().dim(1).size());
+
+  // The shape of the dequeue3 op is unknown, while the shape of square2 is
+  // known. Verify that we merge the two properly to determine the shape of
+  // the data coming out of the queue.
+  const auto props4 = properties.GetOutputProperties("Dequeue4");
+  EXPECT_EQ(1, props4.size());
+  const OpInfo::TensorProperties& prop4 = props4[0];
+  EXPECT_EQ(DT_FLOAT, prop4.dtype());
+  EXPECT_FALSE(prop4.shape().unknown_rank());
+  EXPECT_EQ(2, prop4.shape().dim_size());
+  EXPECT_EQ(3, prop4.shape().dim(0).size());
+  EXPECT_EQ(7, prop4.shape().dim(1).size());
 }
 
 }  // namespace

From fc407cbcc0ce5d5f4e8553f5aaaaaa263d318eeb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 2 May 2017 12:00:16 -0800
Subject: [PATCH 20/51] Making sure an error message will be produced by
 session_manager when a non-tensor object is passed in. Otherwise the 'name'
 property is missing.

Change: 154868022
---
 tensorflow/python/training/session_manager.py      | 24 +++++++++++++++----
 .../python/training/session_manager_test.py        | 17 +++++++++++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 6bcc6e25c36..a13b6dd976a 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -27,6 +27,23 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as saver_mod
 
 
+def _maybe_name(obj):
+  """Returns object name if it has one, or a message otherwise.
+
+  This is useful for names that appear in error messages.
+
+  Args:
+    obj: Object to get the name of.
+
+  Returns:
+    name, "None", or a "no name" message.
+  """
+  if obj is None:
+    return "None"
+  elif hasattr(obj, "name"):
+    return obj.name
+  else:
+    return "<no name for %s>" % type(obj)
+
+
 class SessionManager(object):
   """Training helper that restores from checkpoint and creates session.
 
@@ -267,8 +284,8 @@ class SessionManager(object):
       if not local_init_success:
         raise RuntimeError(
             "Init operations did not make model ready for local_init. "
-            "Init op: %s, init fn: %s, error: %s" % ("None" if init_op is None
-                                                     else init_op.name, init_fn,
+            "Init op: %s, init fn: %s, error: %s" % (_maybe_name(init_op),
+                                                     init_fn,
                                                      msg))
 
     is_ready, msg = self._model_ready(sess)
     if not is_ready:
       raise RuntimeError(
           "Init operations did not make model ready. "
           "Init op: %s, init fn: %s, local_init_op: %s, error: %s" %
-          (None if init_op is None else init_op.name, init_fn,
-           self._local_init_op, msg))
+          (_maybe_name(init_op), init_fn, self._local_init_op, msg))
     return sess
 
   def recover_session(self,
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index 246e95110a6..4dc1d5abb71 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -497,6 +497,23 @@ class SessionManagerTest(test.TestCase):
                                    "Init operations did not make model ready"):
         sm2.prepare_session("", init_op=v.initializer)
 
+  def testPrepareSessionDidNotInitLocalVariableList(self):
+    with ops.Graph().as_default():
+      v = variables.Variable(1, name="v")
+      w = variables.Variable(
+          v,
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          name="w")
+      with self.test_session():
+        self.assertEqual(False, variables.is_variable_initialized(v).eval())
+        self.assertEqual(False, variables.is_variable_initialized(w).eval())
+      sm2 = session_manager.SessionManager(
+          ready_op=variables.report_uninitialized_variables())
+      with self.assertRaisesRegexp(RuntimeError,
+                                   "Init operations did not make model ready"):
+        sm2.prepare_session("", init_op=[v.initializer])
+
   def testPrepareSessionWithReadyNotReadyForLocal(self):
     with ops.Graph().as_default():
       v = variables.Variable(1, name="v")

From 867d407d98bca274db9a574cfb1877be7baad19c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 2 May 2017 12:03:34 -0800
Subject: [PATCH 21/51] Don't needlessly synchronize the CUDA stream in
 CropAndResize.

Make the op async so we don't block an executor thread while waiting for the
result of the box bounds check to be copied back to the host.

Change: 154868460
---
 tensorflow/core/kernels/crop_and_resize_op.cc        | 543 ++++++++--------
 tensorflow/core/kernels/crop_and_resize_op.h         |   8 +-
 .../core/kernels/crop_and_resize_op_gpu.cu.cc        |   2 +-
 .../core/kernels/crop_and_resize_op_test.cc          |   6 +-
 4 files changed, 319 insertions(+), 240 deletions(-)

diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 746fe63e2a0..1c7afcf8663 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/crop_and_resize_op.h"
 
+#include <functional>
+#include <string>
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
 
@@ -37,41 +43,67 @@ namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+using Callback = std::function<void()>;
 
-static inline void ParseAndCheckBoxSizes(OpKernelContext* context,
-                                         const Tensor& boxes,
-                                         const Tensor& box_ind,
-                                         int* num_boxes) {
-  if (boxes.NumElements() == 0 && box_ind.NumElements() == 0) {
+namespace {
+
+static inline Status ParseAndCheckBoxSizes(const Tensor& boxes,
+                                           const Tensor& box_index,
+                                           int* num_boxes) {
+  if (boxes.NumElements() == 0 && box_index.NumElements() == 0) {
     *num_boxes = 0;
-    return;
+    return Status::OK();
   }
   // The shape of 'boxes' is [num_boxes, 4].
-  OP_REQUIRES(context, boxes.dims() == 2,
-              errors::InvalidArgument("boxes must be 2-D",
-                                      boxes.shape().DebugString()));
+  if (boxes.dims() != 2) {
+    return errors::InvalidArgument("boxes must be 2-D",
+                                   boxes.shape().DebugString());
+  }
   *num_boxes = boxes.dim_size(0);
-  OP_REQUIRES(context, boxes.dim_size(1) == 4,
-              errors::InvalidArgument("boxes must have 4 columns"));
-
-  // The shape of 'box_ind' is [num_boxes].
-  OP_REQUIRES(context, box_ind.dims() == 1,
-              errors::InvalidArgument("box_ind must be 1-D",
-                                      box_ind.shape().DebugString()));
-  OP_REQUIRES(context, box_ind.dim_size(0) == *num_boxes,
-              errors::InvalidArgument("box_ind has incompatible shape"));
+  if (boxes.dim_size(1) != 4) {
+    return errors::InvalidArgument("boxes must have 4 columns");
+  }
+  // The shape of 'box_index' is [num_boxes].
+  if (box_index.dims() != 1) {
+    return errors::InvalidArgument("box_index must be 1-D",
+                                   box_index.shape().DebugString());
+  }
+  if (box_index.dim_size(0) != *num_boxes) {
+    return errors::InvalidArgument("box_index has incompatible shape");
+  }
+  return Status::OK();
 }
 
-// Verifies that all values in box_ind are in [0, batch).
+// Conditionally calls the compute callback if all values in box_index are in
+// [0, batch_size), then calls done.
 template <typename Device>
-inline void CheckValidBoxInd(
-    OpKernelContext* context,
-    typename TTypes<int32, 1>::ConstTensor box_ind_data, int batch);
+inline void RunIfBoxIndexIsValid(
+    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, Callback compute, Callback done);
 
+// Specialization of RunIfBoxIndexIsValid for a CPUDevice.
+template <>
+inline void RunIfBoxIndexIsValid<CPUDevice>(
+    OpKernelContext* context, typename TTypes<int32, 1>::ConstTensor box_index,
+    int batch_size, Callback compute, Callback done) {
+  const int num_boxes = box_index.dimension(0);
+  for (int b = 0; b < num_boxes; ++b) {
+    OP_REQUIRES_ASYNC(
+        context, FastBoundsCheck(box_index(b), batch_size),
+        errors::OutOfRange("box_index has values outside [0, batch_size)"),
+        done);
+  }
+  compute();
+  done();
+}
+
+}  // namespace
 
 template <typename Device, typename T>
-class CropAndResizeOp : public OpKernel {
+class CropAndResizeOp : public AsyncOpKernel {
  public:
-  explicit CropAndResizeOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit CropAndResizeOp(OpKernelConstruction* context)
+      : AsyncOpKernel(context) {
     string method;
     OP_REQUIRES_OK(context, context->GetAttr("method", &method));
     OP_REQUIRES(context, method == "bilinear",
                 errors::InvalidArgument("method must be 'bilinear'", method));
@@ -80,69 +112,77 @@ class CropAndResizeOp : public OpKernel {
                                              &extrapolation_value_));
   }
 
-  void Compute(OpKernelContext* context) override {
-    // The shape of 'image' is [batch, image_height, image_width, channels].
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
+    // The shape of 'image' is [batch_size, image_height, image_width,
+    // channels].
     const Tensor& image = context->input(0);
-    OP_REQUIRES(context, image.dims() == 4,
-                errors::InvalidArgument("input image must be 4-D",
-                                        image.shape().DebugString()));
-
-    const int batch = image.dim_size(0);
-    const int image_height = image.dim_size(1);
-    const int image_width = image.dim_size(2);
-    const int depth = image.dim_size(3);
-    OP_REQUIRES(context, image_height > 0 && image_width > 0,
-                errors::InvalidArgument("image dimensions must be positive"));
-
     // The shape of 'boxes' is [num_boxes, 4].
     const Tensor& boxes = context->input(1);
-
-    // The shape of 'box_ind' is [num_boxes].
-    const Tensor& box_ind = context->input(2);
-
-    int num_boxes = 0;
-    ParseAndCheckBoxSizes(context, boxes, box_ind, &num_boxes);
-
+    // The shape of 'box_index' is [num_boxes].
+    const Tensor& box_index = context->input(2);
     // The shape of 'crop_size' is [2].
     const Tensor& crop_size = context->input(3);
-    OP_REQUIRES(context, crop_size.dims() == 1,
-                errors::InvalidArgument("crop_size must be 1-D",
-                                        crop_size.shape().DebugString()));
-    OP_REQUIRES(context, crop_size.dim_size(0) == 2,
-                errors::InvalidArgument("crop_size must have two elements",
-                                        crop_size.shape().DebugString()));
 
+    // Validate input dimensions.
+    OP_REQUIRES_ASYNC(context, image.dims() == 4,
+                      errors::InvalidArgument("input image must be 4-D",
+                                              image.shape().DebugString()),
+                      done);
+    const int batch_size = image.dim_size(0);
+    const int image_height = image.dim_size(1);
+    const int image_width = image.dim_size(2);
+    const int depth = image.dim_size(3);
+    OP_REQUIRES_ASYNC(
+        context, image_height > 0 && image_width > 0,
+        errors::InvalidArgument("image dimensions must be positive"), done);
+    int num_boxes = 0;
+    OP_REQUIRES_OK_ASYNC(
+        context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done);
+    OP_REQUIRES_ASYNC(context, crop_size.dims() == 1,
+                      errors::InvalidArgument("crop_size must be 1-D",
+                                              crop_size.shape().DebugString()),
+                      done);
+    OP_REQUIRES_ASYNC(
+        context, crop_size.dim_size(0) == 2,
+        errors::InvalidArgument("crop_size must have two elements",
+                                crop_size.shape().DebugString()),
+        done);
+
+    // Copy and validate crop sizes.
auto crop_size_vec = crop_size.vec(); const int crop_height = internal::SubtleMustCopy(crop_size_vec(0)); const int crop_width = internal::SubtleMustCopy(crop_size_vec(1)); - OP_REQUIRES(context, crop_height > 0 && crop_width > 0, - errors::InvalidArgument("crop dimensions must be positive")); + OP_REQUIRES_ASYNC( + context, crop_height > 0 && crop_width > 0, + errors::InvalidArgument("crop dimensions must be positive"), done); // Allocate output tensor. Tensor* output = nullptr; - OP_REQUIRES_OK( + OP_REQUIRES_OK_ASYNC( context, context->allocate_output( 0, TensorShape({num_boxes, crop_height, crop_width, depth}), - &output)); + &output), + done); - typename TTypes::ConstTensor image_data = image.tensor(); - typename TTypes::ConstTensor boxes_data = - boxes.tensor(); - typename TTypes::ConstTensor box_ind_data = - box_ind.tensor(); - typename TTypes::Tensor crops_data = output->tensor(); + auto compute_callback = [this, context, output]() { + const Tensor& image = context->input(0); + const Tensor& boxes = context->input(1); + const Tensor& box_index = context->input(2); + const bool status = functor::CropAndResize()( + context->eigen_device(), image.tensor(), + boxes.tensor(), box_index.tensor(), + extrapolation_value_, output->tensor()); + if (!status) { + context->SetStatus( + errors::Internal("Failed launch CropAndResizeKernel.")); + } + }; - CheckValidBoxInd(context, box_ind_data, batch); - - bool status = functor::CropAndResize()( - context->eigen_device(), image_data, boxes_data, box_ind_data, - extrapolation_value_, crops_data); - if (!status) { - context->SetStatus( - errors::Internal("Failed launch CropAndResizeKernel.")); - } + RunIfBoxIndexIsValid(context, box_index.tensor(), + batch_size, std::move(compute_callback), + std::move(done)); } private: @@ -155,10 +195,10 @@ template struct CropAndResize { bool operator()(const CPUDevice& d, typename TTypes::ConstTensor image, typename TTypes::ConstTensor boxes, - typename TTypes::ConstTensor box_ind, + typename TTypes::ConstTensor box_index, float extrapolation_value, typename TTypes::Tensor crops) { - const int batch = image.dimension(0); + const int batch_size = image.dimension(0); const int image_height = image.dimension(1); const int image_width = image.dimension(2); @@ -173,8 +213,8 @@ struct CropAndResize { const float y2 = boxes(b, 2); const float x2 = boxes(b, 3); - const int32 b_in = box_ind(b); - if (b_in < 0 || b_in >= batch) { + const int32 b_in = box_index(b); + if (!FastBoundsCheck(b_in, batch_size)) { continue; } @@ -235,89 +275,94 @@ struct CropAndResize { return true; } }; + } // namespace functor template -class CropAndResizeGradImageOp : public OpKernel { +class CropAndResizeGradImageOp : public AsyncOpKernel { public: explicit CropAndResizeGradImageOp(OpKernelConstruction* context) - : OpKernel(context) { + : AsyncOpKernel(context) { string method; OP_REQUIRES_OK(context, context->GetAttr("method", &method)); OP_REQUIRES(context, method == "bilinear", errors::InvalidArgument("method must be 'bilinear'", method)); } - void Compute(OpKernelContext* context) override { + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { // The shape of 'grads' is [num_boxes, crop_height, crop_width, depth]. 
const Tensor& grads = context->input(0); - - OP_REQUIRES(context, grads.dims() == 4, - errors::InvalidArgument("grads image must be 4-D", - grads.shape().DebugString())); - const int crop_height = grads.dim_size(1); - const int crop_width = grads.dim_size(2); - OP_REQUIRES(context, crop_height > 0 && crop_width > 0, - errors::InvalidArgument("grads dimensions must be positive")); - // The shape of 'boxes' is [num_boxes, 4]. const Tensor& boxes = context->input(1); - - // The shape of 'box_ind' is [num_boxes]. - const Tensor& box_ind = context->input(2); - - int num_boxes = 0; - ParseAndCheckBoxSizes(context, boxes, box_ind, &num_boxes); - - OP_REQUIRES( - context, grads.dim_size(0) == num_boxes, - errors::InvalidArgument("boxes and grads have incompatible shape")); - + // The shape of 'box_index' is [num_boxes]. + const Tensor& box_index = context->input(2); // The shape of 'image_size' is [4]. const Tensor& image_size = context->input(3); - OP_REQUIRES(context, image_size.dims() == 1, - errors::InvalidArgument("image_size must be 1-D", - image_size.shape().DebugString())); - OP_REQUIRES(context, image_size.dim_size(0) == 4, - errors::InvalidArgument("image_size must have 4 elements", - image_size.shape().DebugString())); + // Validate input shapes. + OP_REQUIRES_ASYNC(context, grads.dims() == 4, + errors::InvalidArgument("grads image must be 4-D", + grads.shape().DebugString()), + done); + const int crop_height = grads.dim_size(1); + const int crop_width = grads.dim_size(2); + OP_REQUIRES_ASYNC( + context, crop_height > 0 && crop_width > 0, + errors::InvalidArgument("grads dimensions must be positive"), done); + int num_boxes = 0; + OP_REQUIRES_OK_ASYNC( + context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done); + OP_REQUIRES_ASYNC( + context, grads.dim_size(0) == num_boxes, + errors::InvalidArgument("boxes and grads have incompatible shape"), + done); + + OP_REQUIRES_ASYNC(context, image_size.dims() == 1, + errors::InvalidArgument("image_size must be 1-D", + image_size.shape().DebugString()), + done); + OP_REQUIRES_ASYNC(context, image_size.dim_size(0) == 4, + errors::InvalidArgument("image_size must have 4 elements", + image_size.shape().DebugString()), + done); auto image_size_vec = image_size.vec(); - const int batch = internal::SubtleMustCopy(image_size_vec(0)); + const int batch_size = internal::SubtleMustCopy(image_size_vec(0)); const int image_height = internal::SubtleMustCopy(image_size_vec(1)); const int image_width = internal::SubtleMustCopy(image_size_vec(2)); const int depth = internal::SubtleMustCopy(image_size_vec(3)); - - OP_REQUIRES(context, image_height > 0 && image_width > 0, - errors::InvalidArgument("image dimensions must be positive")); - OP_REQUIRES( + OP_REQUIRES_ASYNC( + context, image_height > 0 && image_width > 0, + errors::InvalidArgument("image dimensions must be positive"), done); + OP_REQUIRES_ASYNC( context, grads.dim_size(3) == depth, - errors::InvalidArgument("image_size and grads are incompatible")); + errors::InvalidArgument("image_size and grads are incompatible"), done); // Allocate output tensor. 
Tensor* output = nullptr; - OP_REQUIRES_OK( - context, context->allocate_output( - 0, TensorShape({batch, image_height, image_width, depth}), - &output)); + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_output( + 0, TensorShape({batch_size, image_height, image_width, depth}), + &output), + done); - typename TTypes::ConstTensor grads_data = - grads.tensor(); - typename TTypes::ConstTensor boxes_data = - boxes.tensor(); - typename TTypes::ConstTensor box_ind_data = - box_ind.tensor(); - typename TTypes::Tensor output_data = output->tensor(); + auto compute_callback = [context, output]() { + const Tensor& grads = context->input(0); + const Tensor& boxes = context->input(1); + const Tensor& box_index = context->input(2); + const bool status = functor::CropAndResizeBackpropImage()( + context->eigen_device(), grads.tensor(), + boxes.tensor(), box_index.tensor(), + output->tensor()); + if (!status) { + context->SetStatus(errors::Internal( + "Failed launch CropAndResizeBackpropImage kernel.")); + } + }; - CheckValidBoxInd(context, box_ind_data, batch); - - bool status = functor::CropAndResizeBackpropImage()( - context->eigen_device(), grads_data, boxes_data, box_ind_data, - output_data); - if (!status) { - context->SetStatus( - errors::Internal("Failed launch CropAndResizeBackpropImageKernel.")); - } + RunIfBoxIndexIsValid(context, box_index.tensor(), + batch_size, std::move(compute_callback), + std::move(done)); } }; @@ -328,9 +373,9 @@ struct CropAndResizeBackpropImage { bool operator()(const CPUDevice& d, typename TTypes::ConstTensor grads, typename TTypes::ConstTensor boxes, - typename TTypes::ConstTensor box_ind, + typename TTypes::ConstTensor box_index, typename TTypes::Tensor grads_image) { - const int batch = grads_image.dimension(0); + const int batch_size = grads_image.dimension(0); const int image_height = grads_image.dimension(1); const int image_width = grads_image.dimension(2); @@ -347,8 +392,8 @@ struct CropAndResizeBackpropImage { const float y2 = boxes(b, 2); const float x2 = boxes(b, 3); - const int32 b_in = box_ind(b); - if (b_in < 0 || b_in >= batch) { + const int32 b_in = box_index(b); + if (!FastBoundsCheck(b_in, batch_size)) { continue; } @@ -399,83 +444,90 @@ struct CropAndResizeBackpropImage { return true; } }; + } // namespace functor template -class CropAndResizeGradBoxesOp : public OpKernel { +class CropAndResizeGradBoxesOp : public AsyncOpKernel { public: explicit CropAndResizeGradBoxesOp(OpKernelConstruction* context) - : OpKernel(context) { + : AsyncOpKernel(context) { string method; OP_REQUIRES_OK(context, context->GetAttr("method", &method)); OP_REQUIRES(context, method == "bilinear", errors::InvalidArgument("method must be 'bilinear'", method)); } - void Compute(OpKernelContext* context) override { + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { // The shape of 'grads' is [num_boxes, crop_height, crop_width, depth]. const Tensor& grads = context->input(0); + // The shape of 'boxes' is [num_boxes, 4]. + const Tensor& boxes = context->input(2); + // The shape of 'box_index' is [num_boxes]. + const Tensor& box_index = context->input(3); + // The shape of 'image' is [batch_size, image_height, image_width, depth]. + const Tensor& image = context->input(1); - OP_REQUIRES(context, grads.dims() == 4, - errors::InvalidArgument("grads image must be 4-D", - grads.shape().DebugString())); - + // Validate input shapes. 
+ OP_REQUIRES_ASYNC(context, grads.dims() == 4, + errors::InvalidArgument("grads image must be 4-D", + grads.shape().DebugString()), + done); const int crop_height = grads.dim_size(1); const int crop_width = grads.dim_size(2); const int depth = grads.dim_size(3); - OP_REQUIRES(context, crop_height > 0 && crop_width > 0, - errors::InvalidArgument("grads dimensions must be positive")); + OP_REQUIRES_ASYNC( + context, crop_height > 0 && crop_width > 0, + errors::InvalidArgument("grads dimensions must be positive"), done); - // The shape of 'image' is [batch, image_height, image_width, depth]. - const Tensor& image = context->input(1); - OP_REQUIRES(context, image.dims() == 4, - errors::InvalidArgument("input image must be 4-D", - image.shape().DebugString())); - - const int batch = image.dim_size(0); + OP_REQUIRES_ASYNC(context, image.dims() == 4, + errors::InvalidArgument("input image must be 4-D", + image.shape().DebugString()), + done); + const int batch_size = image.dim_size(0); const int image_height = image.dim_size(1); const int image_width = image.dim_size(2); - OP_REQUIRES(context, image_height > 0 && image_width > 0, - errors::InvalidArgument("image dimensions must be positive")); - OP_REQUIRES(context, image.dim_size(3) == depth, - errors::InvalidArgument("image, grads depth differ")); - - // The shape of 'boxes' is [num_boxes, 4]. - const Tensor& boxes = context->input(2); - - // The shape of 'box_ind' is [num_boxes]. - const Tensor& box_ind = context->input(3); + OP_REQUIRES_ASYNC( + context, image_height > 0 && image_width > 0, + errors::InvalidArgument("image dimensions must be positive"), done); + OP_REQUIRES_ASYNC(context, image.dim_size(3) == depth, + errors::InvalidArgument("image, grads depth differ"), + done); int num_boxes = 0; - ParseAndCheckBoxSizes(context, boxes, box_ind, &num_boxes); + OP_REQUIRES_OK_ASYNC( + context, ParseAndCheckBoxSizes(boxes, box_index, &num_boxes), done); - OP_REQUIRES( + OP_REQUIRES_ASYNC( context, grads.dim_size(0) == num_boxes, - errors::InvalidArgument("boxes and grads have incompatible shape")); + errors::InvalidArgument("boxes and grads have incompatible shape"), + done); // Allocate output tensor. 
Tensor* output = nullptr; - OP_REQUIRES_OK(context, context->allocate_output( - 0, TensorShape({num_boxes, 4}), &output)); + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_output(0, TensorShape({num_boxes, 4}), &output), + done); - typename TTypes::ConstTensor grads_data = - grads.tensor(); - typename TTypes::ConstTensor image_data = image.tensor(); - typename TTypes::ConstTensor boxes_data = - boxes.tensor(); - typename TTypes::ConstTensor box_ind_data = - box_ind.tensor(); - typename TTypes::Tensor output_data = output->tensor(); + auto compute_callback = [context, output]() { + const Tensor& grads = context->input(0); + const Tensor& image = context->input(1); + const Tensor& boxes = context->input(2); + const Tensor& box_index = context->input(3); + const bool status = functor::CropAndResizeBackpropBoxes()( + context->eigen_device(), grads.tensor(), + image.tensor(), boxes.tensor(), + box_index.tensor(), output->tensor()); + if (!status) { + context->SetStatus(errors::Internal( + "Failed launch CropAndResizeBackpropBoxes kernel.")); + } + }; - CheckValidBoxInd(context, box_ind_data, batch); - - bool status = functor::CropAndResizeBackpropBoxes()( - context->eigen_device(), grads_data, image_data, boxes_data, - box_ind_data, output_data); - if (!status) { - context->SetStatus( - errors::Internal("Failed launch CropAndResizeBackpropBoxesKernel.")); - } + RunIfBoxIndexIsValid(context, box_index.tensor(), + batch_size, std::move(compute_callback), + std::move(done)); } }; @@ -487,9 +539,9 @@ struct CropAndResizeBackpropBoxes { typename TTypes::ConstTensor grads, typename TTypes::ConstTensor image, typename TTypes::ConstTensor boxes, - typename TTypes::ConstTensor box_ind, + typename TTypes::ConstTensor box_index, typename TTypes::Tensor grads_boxes) { - const int batch = image.dimension(0); + const int batch_size = image.dimension(0); const int image_height = image.dimension(1); const int image_width = image.dimension(2); @@ -506,8 +558,8 @@ struct CropAndResizeBackpropBoxes { const float y2 = boxes(b, 2); const float x2 = boxes(b, 3); - const int32 b_in = box_ind(b); - if (b_in < 0 || b_in >= batch) { + const int32 b_in = box_index(b); + if (!FastBoundsCheck(b_in, batch_size)) { continue; } @@ -589,30 +641,19 @@ struct CropAndResizeBackpropBoxes { return true; } }; + } // namespace functor -// Specialization of CheckValidBoxInd for a CPUDevice. -template <> -inline void CheckValidBoxInd( - OpKernelContext* context, typename TTypes::ConstTensor box_ind, - int batch) { - const int num_boxes = box_ind.dimension(0); - for (int b = 0; b < num_boxes; ++b) { - OP_REQUIRES(context, box_ind(b) >= 0 && box_ind(b) < batch, - errors::OutOfRange("box_ind has values outside [0, batch)")); - } -} - -#define REGISTER_KERNEL(T) \ - REGISTER_KERNEL_BUILDER(Name("CropAndResize") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .HostMemory("crop_size"), \ - CropAndResizeOp); \ - \ - REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradBoxes") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T"), \ +#define REGISTER_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("CropAndResize") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .HostMemory("crop_size"), \ + CropAndResizeOp); \ + \ + REGISTER_KERNEL_BUILDER(Name("CropAndResizeGradBoxes") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ CropAndResizeGradBoxesOp); TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL); @@ -634,50 +675,86 @@ TF_CALL_double(REGISTER_KERNEL); #if GOOGLE_CUDA -// Forward declaration of the CheckValidBoxIndHelper specialization for GPU. 
+// Forward declaration of the CheckValidBoxIndexHelper specialization for GPU. namespace functor { template <> -void CheckValidBoxIndHelper::operator()( - const GPUDevice& d, typename TTypes::ConstTensor box_ind, - int batch, typename TTypes::Tensor isvalid); -extern template struct CheckValidBoxIndHelper; +void CheckValidBoxIndexHelper::operator()( + const GPUDevice& d, typename TTypes::ConstTensor box_index, + int batch_size, typename TTypes::Tensor isvalid); +extern template struct CheckValidBoxIndexHelper; } // namespace functor -// Specialization of CheckValidBoxInd for a GPUDevice. +namespace { + +// Specialization of CheckValidBoxIndex for a GPUDevice. template <> -inline void CheckValidBoxInd( - OpKernelContext* context, typename TTypes::ConstTensor box_ind, - int batch) { - const int num_boxes = box_ind.dimension(0); +inline void RunIfBoxIndexIsValid( + OpKernelContext* context, typename TTypes::ConstTensor box_index, + int batch_size, Callback compute, Callback done) { + const int num_boxes = box_index.dimension(0); if (num_boxes == 0) { + compute(); + done(); return; } - Tensor isvalid_tensor; - OP_REQUIRES_OK(context, - context->allocate_temp(DataTypeToEnum::value, - TensorShape({}), &isvalid_tensor)); - typename TTypes::Tensor isvalid = isvalid_tensor.tensor(); + Tensor isvalid_dev_tensor; + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_temp(DataTypeToEnum::value, TensorShape({}), + &isvalid_dev_tensor), + done); + typename TTypes::Tensor isvalid_dev = + isvalid_dev_tensor.tensor(); - functor::CheckValidBoxIndHelper()( - context->eigen_device(), box_ind, batch, isvalid); + // Run the actual box check on the device. + functor::CheckValidBoxIndexHelper()( + context->eigen_device(), box_index, batch_size, isvalid_dev); + // Copy the result back to the host. auto* stream = context->op_device_context()->stream(); - OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + OP_REQUIRES_ASYNC(context, stream, + errors::Internal("No GPU stream available."), done); + Tensor isvalid_host_tensor; + // Use pinned host memory on the host to avoid unnecessary + // synchronization. 
+ AllocatorAttributes alloc_attr; + alloc_attr.set_on_host(true); + alloc_attr.set_gpu_compatible(true); + OP_REQUIRES_OK_ASYNC( + context, + context->allocate_temp(DataTypeToEnum::value, TensorShape({}), + &isvalid_host_tensor, alloc_attr), + done); + typename TTypes::Tensor isvalid_host = + isvalid_host_tensor.tensor(); - bool isvalid_host = false; - perftools::gputools::DeviceMemoryBase isvalid_gpu(isvalid.data(), - sizeof(bool)); - stream->ThenMemcpy(&isvalid_host, isvalid_gpu, sizeof(bool)); - stream->BlockHostUntilDone(); + perftools::gputools::DeviceMemoryBase wrapped(isvalid_dev.data(), + sizeof(bool)); + const bool status = stream + ->ThenMemcpy(isvalid_host.data() /* destination */, + wrapped /* source */, sizeof(bool)) + .ok(); + OP_REQUIRES_ASYNC( + context, status, + errors::Internal("Failed to launch copy of isvalid from device to host."), + done); - OP_REQUIRES(context, stream->ok(), - errors::Internal("cudaMemcpy from device to host failed")); + auto wrapped_callback = [context, isvalid_host, compute, done]() { + OP_REQUIRES_ASYNC( + context, isvalid_host(), + errors::OutOfRange("box_index has values outside [0, batch_size)"), + done); + compute(); + done(); + }; - OP_REQUIRES(context, isvalid_host, - errors::OutOfRange("box_ind has values outside [0, batch)")); + context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + stream, wrapped_callback); } +} // namespace + #define REGISTER_KERNEL(T) \ REGISTER_KERNEL_BUILDER(Name("CropAndResize") \ .Device(DEVICE_GPU) \ diff --git a/tensorflow/core/kernels/crop_and_resize_op.h b/tensorflow/core/kernels/crop_and_resize_op.h index 22df1bdd56b..460dbad22b4 100644 --- a/tensorflow/core/kernels/crop_and_resize_op.h +++ b/tensorflow/core/kernels/crop_and_resize_op.h @@ -53,12 +53,12 @@ struct CropAndResizeBackpropBoxes { }; template -struct CheckValidBoxIndHelper { - // Checks if all values in box_ind are in [0, batch). +struct CheckValidBoxIndexHelper { + // Checks if all values in box_index are in [0, batch). 
   void operator()(const Device& d,
-                  typename TTypes<int32, 1>::ConstTensor box_ind, int batch,
+                  typename TTypes<int32, 1>::ConstTensor box_index, int batch,
                   typename TTypes<bool, 0>::Tensor isvalid) {
-    isvalid.device(d) = ((box_ind >= 0) && (box_ind < batch)).all();
+    isvalid.device(d) = ((box_index >= 0) && (box_index < batch)).all();
   }
 };
 
diff --git a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
index 254475db465..c1235fda892 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_gpu.cu.cc
@@ -440,7 +440,7 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 
 #undef DEFINE_GPU_SPECS
 
-template struct CheckValidBoxIndHelper<GPUDevice>;
+template struct CheckValidBoxIndexHelper<GPUDevice>;
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index 3a7f180598e..d6139dae966 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -251,7 +251,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("box_ind has incompatible shape"))
+      StringPiece(s.ToString()).contains("box_index has incompatible shape"))
       << s;
 }
 
@@ -264,8 +264,10 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("box_ind has values outside [0, batch)"))
+                  .contains("box_index has values outside [0, batch_size)"))
       << s;
 }
 
+// TODO(zhengxq, rmlarsen): Add a benchmark.
+
 }  // namespace tensorflow

From a8cf54a732d2385fa7b97495c73a91e4515e8d60 Mon Sep 17 00:00:00 2001
From: Shanqing Cai
Date: Tue, 2 May 2017 13:00:15 -0800
Subject: [PATCH 22/51] Add contribution guidelines and standards section to
 CONTRIBUTING.md

Several parts are largely based on the post by @yaroslavvb at:
#7443#issuecomment-279182613

Fixes #7443

Change: 154876045
---
 CONTRIBUTING.md | 137 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5ae5c0fbbcd..c36ef1ecd3b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -27,3 +27,140 @@ contributions, often because we probably won't get to them right now. If you
 decide to start on an issue, leave a comment so that other people know that
 you're working on it. If you want to help out, but not alone, use the issue
 comment thread to coordinate.
+
+### Contribution guidelines and standards
+
+Before sending your pull request for
+[review](https://github.com/tensorflow/tensorflow/pulls),
+make sure your changes are consistent with the guidelines and follow the
+TensorFlow coding style.
+
+#### General guidelines and philosophy for contribution
+
+* Include unit tests when you contribute new features, as they help to
+  a) prove that your code works correctly, and b) guard against future
+  breaking changes, which lowers the maintenance cost.
+* Bug fixes also generally require unit tests, because the presence of bugs
+  usually indicates insufficient test coverage.
+* Keep API compatibility in mind when you change code in core TensorFlow,
+  e.g., code in [tensorflow/core](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core) and [tensorflow/python](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/python).
+  TensorFlow has reached version 1 and hence cannot make
+  non-backward-compatible API changes without a major release. Reviewers of
+  your pull request will comment on any API compatibility issues.
+* When you contribute a new feature to TensorFlow, the maintenance burden is
+  (by default) transferred to the TensorFlow team. This means that the benefit
+  of the contribution must be weighed against the cost of maintaining the
+  feature.
+* Full new features (e.g., a new op implementing a cutting-edge algorithm)
+  typically will live in
+  [tensorflow/contrib](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib)
+  to get some airtime before a decision is made regarding whether they are to
+  be migrated to the core.
+
+#### License
+
+Include a license at the top of new files.
+
+* [C/C++ license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/op.cc#L1)
+* [Python license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/nn.py#L1)
+* [Java license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/java/src/main/java/org/tensorflow/Graph.java#L1)
+* [Go license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/go/operation.go#L1)
+* [Bash license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/ci_build/ci_sanity.sh#L2)
+* [HTML license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/dist/index.html#L2)
+* [JavaScript/TypeScript license example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tensorboard/components/tf_backend/backend.ts#L1)
+
+Bazel BUILD files also need to include a license section, e.g.,
+[BUILD example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/BUILD#L61).
+
+#### C++ coding style
+
+Changes to TensorFlow C++ code should conform to the
+[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
+Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on
+ubuntu:16.04, do:
+
+```bash
+apt-get install -y clang-tidy
+```
+
+You can check a C/C++ file for style issues by diffing it against the output
+of `clang-format`:
+
+```bash
+clang-format <my_cc_file> --style=google > /tmp/my_cc_file.cc
+diff <my_cc_file> /tmp/my_cc_file.cc
+```
+
+#### Python coding style
+
+Changes to TensorFlow Python code should conform to the
+[Google Python Style Guide](https://google.github.io/styleguide/pyguide.html).
+
+Use `pylint` to check your Python changes. To install `pylint` and
+retrieve TensorFlow's custom style definition:
+
+```bash
+pip install pylint
+wget -O /tmp/pylintrc https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/tools/ci_build/pylintrc
+```
+
+To check a file with `pylint`:
+
+```bash
+pylint --rcfile=/tmp/pylintrc myfile.py
+```
+
+#### Coding style for other languages
+
+* [Google Java Style Guide](https://google.github.io/styleguide/javaguide.html)
+* [Google JavaScript Style Guide](https://google.github.io/styleguide/jsguide.html)
+* [Google Shell Style Guide](https://google.github.io/styleguide/shell.xml)
+
+#### Running sanity check
+
+If you have Docker installed on your system, you can perform a sanity check on
+your changes by running the command:
+
+```bash
+tensorflow/tools/ci_build/ci_build.sh CPU tensorflow/tools/ci_build/ci_sanity.sh
+```
+
+This will catch most license, Python coding style, and BUILD file issues that
+may exist in your changes.
+
+#### Running unit tests
+
+There are two ways to run TensorFlow unit tests.
+
+1. Using tools and libraries installed directly on your system.
+
+   Refer to the
+   [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel) and
+   [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
+   for the required packages. Alternatively, use those
+   [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
+   `tensorflow/tensorflow:nightly-devel` and `tensorflow/tensorflow:nightly-devel-gpu`,
+   for development to avoid installing the packages directly on your system.
+
+   Once you have the packages installed, you can run a specific unit test with
+   bazel as follows.
+
+   If the tests are to be run on GPU, add CUDA paths to `LD_LIBRARY_PATH` and
+   add the `cuda` config flag:
+
+   ```bash
+   export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+
+   export flags="--config=opt --config=cuda -k"
+   ```
+
+   For example, to run all tests under tensorflow/python, do:
+
+   ```bash
+   bazel test ${flags} //tensorflow/python/...
+   ```
+
+2. Using Docker and TensorFlow's CI scripts.
+
+   See
+   [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build)
+   for details.

From 86b5d66d66d172a273929f0cda7087aa8c45697d Mon Sep 17 00:00:00 2001
From: Toby Boyd
Date: Tue, 2 May 2017 13:03:41 -0800
Subject: [PATCH 23/51] Final draft

Change: 154876563
---
 .../performance/performance_models.md | 458 ++++++++----------
 1 file changed, 199 insertions(+), 259 deletions(-)

diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md
index 71c4e6cfe06..70c415a024e 100644
--- a/tensorflow/docs_src/performance/performance_models.md
+++ b/tensorflow/docs_src/performance/performance_models.md
@@ -1,155 +1,109 @@
 # High-Performance Models
 
-TensorFlow is a powerful and flexible machine learning platform.
-It can be used to distribute model training and inference across a large number
-of machines and computation devices.
-
-Its software stack is made of a few layers:
-
-* a fast and powerful C++ core
-* low-level Python primitives that sit right above individual kernels
-* a diverse range of high-level libraries that aim to make building real models
-  easier
-
-There are many existing examples and tutorials that explain useful features in
-TensorFlow. The goal of this set of scripts is to demonstrate that we can build
-flexible and powerful high-performance models using the low-level APIs.
-In the future, many of the high-performance primitives will be incorporated into
-high-level APIs, and made available to more users transparently.
-But meanwhile, we show that it is fairly easy for advanced users to build highly
-scalable models targeting different system types, network topologies, etc.
-
-We divide our effort to build high-performance models into three categories:
-
-1. A fast input pipeline to read data from disk, preprocess it, and make it
-   ready on the GPU.
-2. A high-throughput model that trains on GPU very efficiently.
-3. Fast variable and gradients distribution mechanisms that scale well across
-   many machines and computation devices.
+This document and the accompanying
+[scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+detail how to build highly scalable models that target a variety of system
+types and network topologies. The techniques in this document use some
+low-level TensorFlow Python primitives. In the future, many of these
+techniques will be incorporated into high-level APIs.
 
 ## Input Pipeline
 
-The input pipeline is the part of a TensorFlow program that reads input data,
-shuffles it, and preprocesses it.
+The @{$performance_guide$Performance Guide} explains how to identify possible
+input pipeline issues and lists best practices. We found that using
+@{tf.FIFOQueue} and @{tf.train.queue_runner} could not saturate multiple
+current-generation GPUs when processing large inputs at high samples per
+second, such as when training ImageNet with
+[AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
+This is due to the use of Python threads as the underlying implementation;
+the overhead of Python threads is too large.
 
-Among the most important features to build a fast input pipeline:
+Another approach, which we have implemented in the
+[scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks),
+is to build an input pipeline using the native parallelism in TensorFlow. Our
+implementation is made up of 3 stages:
 
-* Avoid using feed-dictionary to feed a large amount of data for each step.
-  * Instead, use reader ops to get data into TensorFlow directly.
-* Parallelize data processing.
-* Use software pipelining to feed data, so that data is available immediately
-  when needed.
+* I/O reads: Choose and read image files from disk.
+* Image Processing: Decode image records into images, preprocess, and organize
+  into mini-batches.
+* CPU-to-GPU Data Transfer: Transfer images from CPU to GPU.
 
-One way to implement software pipelining in TensorFlow is through
-`tf.FifoQueue`, and it is possible to parallelize data processing through
-`tf.train.queue_runner`, which uses Python threads as its underlying
-implementation.
-This lays the foundation for the current Inception input pipeline.
-This design is well built for feeding older generation of GPUs,
-but the overhead of Python threads is too large to feed newer GPUs that are four
-to five times faster.
+The dominant part of each stage is executed in parallel with the other stages
+using `data_flow_ops.StagingArea`. `StagingArea` is a queue-like operator
+similar to @{tf.FIFOQueue}. The difference is that `StagingArea` offers simpler
+functionality and can be executed on both CPU and GPU in parallel with other
+stages. Breaking the input pipeline into 3 stages that operate independently in
+parallel is scalable and takes full advantage of large multi-core environments.
+The rest of this section details the stages, followed by details about using
+`data_flow_ops.StagingArea`.
 
-In this model, we explore an alternative design that uses the native
-parallelism in TensorFlow. In our example of an image model input pipeline,
-there are a few important parts:
+### Parallelize I/O Reads
 
-* Choose and read the image files from the disk.
-* Decode the image data into images, transform and add distortion so they are
-ready to be used.
-* Organize the transformed images into a minibatch.
-* Transfer the images from CPU to GPU, so they are ready for model training.
+`data_flow_ops.RecordInput` is used to parallelize reading from disk. Given a
+list of input files representing TFRecords, `RecordInput` continuously reads
+records using background threads. The records are placed into its own large
+internal pool, and when it has loaded at least half of its capacity, it
+produces output tensors.
 
-It is important to note that the dominant part of each stage can happen in
-parallel with that of other stages:
-the file IO uses DMA to transfer the data from hard disk to memory;
-image decoding, transformation and distortion are CPU-heavy;
-the data transfer from CPU to GPU uses the GPU's copy-engine unit;
-and the GPU kernels use the main SMs of the GPU.
-It is natural to cut our pipeline into those parts so they can run in parallel
-with each other.
-
-Also, as mentioned earlier, most of the current input pipeline heavily uses
-Python threads. However, the large overhead introduced by Python threads
-severely limits its scalability when the newer GPUs are a lot faster; we can
-alleviate this by making a single `session.run` call execute all parts of the
-pipeline.
-
-### Parallelize IO Reads
-
-In this new model, we use the native parallelism in TensorFlow: TensorFlow
-subscribes to an eager-execution model, which means that when nodes in the graph
-became available, TensorFlow will try to execute as many of them as possible.
-
-In order to parallelize reading from hard disk, we use `data_flow_ops.RecordInput`
-in this model.
-Given a list of input files of TFRecords, `RecordInput` continuously reads
-records using background threads, placing the records into its own large,
-internal pool of records.
-When it is has loaded at least half of its capacity, it produces output tensors.
-
-Since this op has its internal threads, and is dominated by IO time that doesn’t
-consume much CPU time, it naturally runs in parallel with the rest of the model.
+This op has its own internal threads that are dominated by I/O time and
+consume minimal CPU, which allows it to run smoothly in parallel with the rest
+of the model.
 
 ### Parallelize Image Processing
 
-After reading from “RecordInput”, the tensors are passed to the input processing
-pipeline. For example, if we need to feed 8 GPUs, each with a batch-size of 32,
-then for each step we do the following.
+After images are read from `RecordInput`, they are passed as tensors to the
+image processing pipeline. To make the image processing pipeline easier to
+explain, assume that the input pipeline is targeting 8 GPUs with a batch size
+of 256 (32 per GPU).
 
-First, read 32x8=256 records, and process them individually, in
-parallel. This starts with 256 independent RecordInput read ops in the graph.
+256 records are read and processed individually in parallel. This starts with
+256 independent `RecordInput` read ops in the graph. Each read op is followed
+by an identical set of ops for image preprocessing that are considered
+independent and executed in parallel. The image preprocessing ops include
+operations such as image decoding, distortion, and resizing.
 
-Then, follow each read with identical set of ops for processing. Each set is
-considered independent and will execute in parallel. The operations include
-image decoding, image distortion, and resizing.
-
-Finally, once the images are ready, they will be concatenated together into 8
-batch-size 32 tensors.
-Note that we can use “tf.concat” for this purpose.
-However, “tf.concat” is implemented as a single op, which waits for all
-the inputs to be ready, and then concatenates them together. Since all
Since all
-inputs are produced in parallel, there will be a long tail waiting for all
-inputs to be available; and when concatenation happens, the op becomes memory
-limited as all input tensors compete for memory bandwidth.
-So for the final concatenation, we use `tf.parallel_stack` instead. This
+Once the images are through preprocessing, they are concatenated together into
+8 tensors, each with a batch size of 32. Rather than use @{tf.concat} for this
+purpose, which is implemented as a single op that waits for all the inputs to
+be ready before concatenating them together, @{tf.parallel_stack} is used.
+@{tf.parallel_stack}
allocates an uninitialized tensor as an output, and each input tensor is written
to its designated portion of the output tensor as soon as the input is
-available. When all the input tensors are finished, the output tensor is passed
-along in the graph. This effectively hides all the memory latency with the long
-tail of producing all the input tensors.
+available.
+
+When all the input tensors are finished, the output tensor is passed along in
+the graph. This effectively hides all the memory latency with the long tail of
+producing all the input tensors.

### Parallelize CPU-to-GPU Data Transfer

-In our example, once all the input images are processed and concatenated
-together by the CPU, we have 8 tensors, each of which has a batch-size of 32.
-These tensors are then to be used by the GPU for the model training.
+Continuing with the assumption that the target is 8 GPUs with a batch size of
+256 (32 per GPU), once the input images are processed and concatenated together
+by the CPU, we have 8 tensors, each with a batch size of 32.

-In TensorFlow, users can use tensors from one device on any other device
-directly. TensorFlow inserts implicit copies to make the tensors available on
-any devices where they are used. The runtime schedules the copy between devices
-to run before the tensors are actually used. However, if the copy cannot finish
-in time, the computation that needs those tensors will stall.
+TensorFlow enables tensors from one device to be used on any other device
+directly. TensorFlow inserts implicit copies to make the tensors available on
+any devices where they are used. The runtime schedules the copy between devices
+to run before the tensors are actually used. However, if the copy cannot finish
+in time, the computation that needs those tensors will stall and result in
+decreased performance.

-For high-performance models, it is helpful to explicitly schedule the copy ahead
-of the time in parallel, so when the computation starts on GPU, all the tensors
-are already available on the right device.
+In this implementation, `data_flow_ops.StagingArea` is used to explicitly
+schedule the copy in parallel. The end result is that when computation starts on
+the GPU, all the tensors are already available.

### Software Pipelining

-With all the stages capable of being driven by different processors, we insert
-`data_flow_ops.StagingArea` in between them so they run in parallel.
-`StagingArea` is a queue-like operator similar to `tf.FifoQueue`.
-But it offers simpler functionalities and can be executed on both CPU and GPU.
+With all the stages capable of being driven by different processors,
+`data_flow_ops.StagingArea` is used between them so they run in parallel.
+`StagingArea` is a queue-like operator similar to @{tf.FIFOQueue} that offers
+simpler functionality and can be executed on both CPU and GPU.
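
To make the staging concrete, below is a minimal sketch of a `StagingArea`
placed between the image-processing stage and the GPU compute stage. The tensor
names, shapes, and the stand-in computation are illustrative assumptions, not
the code used in the benchmark scripts, and it assumes a GPU is available:

```python
import tensorflow as tf
from tensorflow.python.ops import data_flow_ops

# Hypothetical output of the image-processing stage: one batch of 32 images.
images = tf.random_uniform([32, 224, 224, 3])

with tf.device('/gpu:0'):
  # Staging buffer between the CPU and GPU stages.
  stage = data_flow_ops.StagingArea(dtypes=[tf.float32],
                                    shapes=[[32, 224, 224, 3]])
  put_op = stage.put([images])      # stages the next set of data
  gpu_images = stage.get()          # consumes the previously staged set
  step = tf.reduce_sum(gpu_images)  # stand-in for the model computation

with tf.Session() as sess:
  sess.run(put_op)  # warm up: prime the staging buffer with one set of data
  for _ in range(5):
    sess.run([step, put_op])  # each step consumes one set and stages the next
```

The warm-up step and the per-step behavior of the staging buffers are described
next.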

-Before the model starts running all the stages, we warm up the stages in order
-so the staging buffers in between all have one set of data in them.
-During each run step that follows, we will run all the stages.
-They read one set of data from the staging buffers at the beginning of each
-stage, and push one set at end end.
+Before the model starts running all the stages, the input pipeline stages are
+warmed up to prime the staging buffers in between with one set of data.
+During each run step, one set of data is read from the staging buffers at
+the beginning of each stage, and one set is pushed at the end.

-For example: if there are three stages: A, B and C.
-There are two staging areas in between: S1 and S2.
-During the warm up, we run:
+For example: if there are three stages: A, B and C. There are two staging areas
+in between: S1 and S2. During the warm up, we run:

```
Warm up:
@@ -162,123 +116,126 @@ Step 4: A3 B2 C1
Step 5: A4 B3 C2
```

-After the warm up, S1 and S2 each have one set of data in them.
-For each step of the actual execution, one set of data is consumed from each
-staging area, and one set is added to each.
+After the warm up, S1 and S2 each have one set of data in them. For each step of
+the actual execution, one set of data is consumed from each staging area, and
+one set is added to each.

-There are a few nice properties about the scheme:
+Benefits of using this scheme:

-* All the stages are non-blocking, since the staging areas always have one set
-of data after the warm up.
-* Each stage can run in parallel since they can all start immediately.
-* The staging buffers have a fixed memory overhead. They will have at most one
-  extra set of data.
-* Only a single`session.run()` call is needed to run all stages of the step,
-  which makes profiling and debugging much easier.
+* All stages are non-blocking, since the staging areas always have one set of
+  data after the warm up.
+* Each stage can run in parallel since they can all start immediately.
+* The staging buffers have a fixed memory overhead. They will have at most one
+  extra set of data.
+* Only a single `session.run()` call is needed to run all stages of the step,
+  which makes profiling and debugging much easier.

## Best Practices in Building High-Performance Models

-The computation on GPU can happen immediately since the input data have already
-been transferred onto GPU when the step starts.
-But it is still important to build the model that runs as fast as possible.
-Here are some tips for a high-performance convolutional neural network (CNN)
-model:
+Collected below are a couple of additional best practices that can improve
+performance and increase the flexibility of models.

### Build the model with both NHWC and NCHW

Most TensorFlow operations used by a CNN support both NHWC and NCHW data format.
-On GPU, NCHW is faster.
-But on CPU, NHWC is sometimes faster.
+On GPU, NCHW is faster. But on CPU, NHWC is sometimes faster.

-So it is a good idea to build the model that can work in both ways.
-Our model shows a good way to do that effectively.
-For GPU training, we should always use NCHW.
-But if the model needs inference on CPU, we could use NHWC; weights obtained
-from training with NCHW data format can be used for inference in NHWC data
-format.
+Building a model to support both data formats keeps the model flexible and
+capable of operating optimally regardless of platform. The benchmark
+script was written to support both NCHW and NHWC. NCHW should always be used
+when training with GPUs. NHWC is sometimes faster on CPU. A flexible model can
+be trained on GPUs using NCHW, with inference done on CPU using NHWC and the
+weights obtained from training.

### Use Fused Batch-Normalization

The default batch-normalization in TensorFlow is implemented as composite
-operations.
-This is very general, but often leads to suboptimal performance.
-An alternative is the fused batch-normalization, and the performance on GPU
-is often much faster.
+operations. This is very general, but often leads to suboptimal performance. An
+alternative is to use fused batch-normalization, which often has much better
+performance on GPU. Below is an example of using @{tf.contrib.layers.batch_norm}
+to implement fused batch-normalization.
+
+```python
+bn = tf.contrib.layers.batch_norm(
+    input_layer, fused=True, data_format='NCHW',
+    scope=scope)
+```

## Variable Distribution and Gradient Aggregation

During training, training variable values are updated using aggregated gradients
-and deltas. In this model, we demonstrate that with the flexible and
-general-purpose TensorFlow primitives, it is fairly easy to build a diverse
-range of high-performance distribution and aggregation schemes for different
-types of systems.
+and deltas. In the benchmark script, we demonstrate that with the flexible and
+general-purpose TensorFlow primitives, a diverse range of high-performance
+distribution and aggregation schemes can be built.

-For example:
+Three examples of variable distribution and aggregation were included in the
+script:

-* The standard parameter-server where each replica of the training model reads
-  the variables directly, and updates the variable independently. When each
-  model needs the variables, they are copied over through the standard implicit
-  copies added by the TensorFlow runtime. It is shown how to use this method
-  in either local training, distributed synchronous training, and distributed
-  asynchronous training.
-* A replicated mode for local training where each GPU has an identical
-  copy of the training parameters. The forward and backward computation can
-  start immediately as the variable data is immediately available. Gradients
-  are accumulated across all GPUs, and the aggregated total is applied to
-  each GPU's copy of the variables so that they stay in sync.
-* A distributed replicated mode of training where each GPU has an identical copy
-  of the training parameters, and a master copy of the variables is stored
-  on the parameter-servers. The forward and backward computation can
-  start immediately as the variable data is immediately available. Gradients
-  are accumulated across all GPUs on each server and then the per-server
-  aggregated gradients are applied to the master copy. After all workers do
-  this, each worker updates its copy of the variable from the master copy.
+* `parameter_server` where each replica of the training model reads the
+  variables from a parameter server and updates the variable independently.
+  When each model needs the variables, they are copied over through the
+  standard implicit copies added by the TensorFlow runtime. The example
+  [script](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+  illustrates using this method for local training, distributed synchronous
+  training, and distributed asynchronous training.
+* `replicated` places an identical copy of each training variable on each
+  GPU. 
The forward and backward computation can start immediately as the + variable data is immediately available. Gradients are accumulated across all + GPUs, and the aggregated total is applied to each GPU's copy of the + variables to keep them in sync. +* `distributed_replicated` places an identical copy of the training parameters + on each GPU along with a master copy on the parameter servers. The forward + and backward computation can start immediately as the variable data is + immediately available. Gradients are accumulated across all GPUs on each + server and then the per-server aggregated gradients are applied to the + master copy. After all workers do this, each worker updates its copy of the + variable from the master copy. -We show that most of the variable distribution and aggregation subsystem can -be implemented through TensorFlow low-level primitives with manageable -complexity at the model level. Here we discuss some more details. +Below are additional details about each approach. -### Parameter-server Variables +### Parameter Server Variables -The most common way trainable variables are managed in TensorFlow models is the +The most common way trainable variables are managed in TensorFlow models is parameter server mode. -In a distributed system, this means that each worker process runs the same -model, and parameter server processes own the master copies of the variables. -When a worker needs a variable from a parameter server, it refers to it -directly. The TensorFlow runtime adds implicit copies to the graph to make the -variable value available on the computation device that needs it. When a -gradient is computed on a worker, it is sent to the parameter server that owns -the particular variable, and the corresponding optimizer is used to update the -variable. +In a distributed system, each worker process runs the same model, and parameter +server processes own the master copies of the variables. When a worker needs a +variable from a parameter server, it refers to it directly. The TensorFlow +runtime adds implicit copies to the graph to make the variable value available +on the computation device that needs it. When a gradient is computed on a +worker, it is sent to the parameter server that owns the particular variable, +and the corresponding optimizer is used to update the variable. There are some techniques to improve throughput: -* The variables are spread among parameter servers based on their size, for load - balancing. -* When each worker has multiple GPUs, gradients are accumulated across the GPUs - and a single aggregated gradient is sent to the parameter server. This reduces - the network bandwidth and the amount of work done by the parameter servers. +* The variables are spread among parameter servers based on their size, for + load balancing. +* When each worker has multiple GPUs, gradients are accumulated across the + GPUs and a single aggregated gradient is sent to the parameter server. This + reduces the network bandwidth and the amount of work done by the parameter + servers. For coordinating between workers, a very common mode is async updates, where each worker updates the master copy of the variables without synchronizing with -other workers. In our model, we demonstrate that it is fairly easy to introduce +other workers. In our model, we demonstrate that it is fairly easy to introduce synchronization across workers so updates for all workers are finished in one step before the next step can start. 
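
As an illustration of how variables land on parameter servers while the compute
ops stay on the workers, below is a minimal sketch using
`tf.train.replica_device_setter`. The cluster addresses and the variable are
hypothetical, not taken from the benchmark script:

```python
import tensorflow as tf

# Hypothetical cluster: two parameter servers and one worker.
cluster = tf.train.ClusterSpec({
    'ps': ['ps0.example.com:2222', 'ps1.example.com:2222'],
    'worker': ['worker0.example.com:2222'],
})

# replica_device_setter pins variables to the ps tasks (round-robin by
# default) while leaving the compute ops on the worker device.
with tf.device(tf.train.replica_device_setter(cluster=cluster)):
  weights = tf.get_variable('weights', shape=[1024, 1024])  # placed on a ps
  inputs = tf.random_uniform([32, 1024])                    # stays on worker
  outputs = tf.matmul(inputs, weights)
```

A size-based placement strategy can be supplied via the `ps_strategy` argument
to balance load across the parameter servers.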

-The parameter-server method can also be used for local training, In this case,
+The parameter server method can also be used for local training. In this case,
instead of spreading the master copies of variables across parameter servers,
they are either on the CPU or spread across the available GPUs.

Due to the simple nature of this setup, this architecture has gained a lot of
popularity within the community.

-This is available in the benchmark scripts as the 'parameter_server'
-variable_update mode.
+This mode can be used in the script by passing
+`--variable_update=parameter_server`.

-![parameter_server mode in distributed
-training](../images/perf_parameter_server_mode_doc.png){
-width="900" style="max-width: inherit"}
+
<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+ parameter_server mode in distributed training +

### Replicated Variables

@@ -292,19 +249,18 @@ devices and the fully aggregated gradient is then applied to each local copy.

Gradient aggregation across the server can be done in different ways:

-* Using standard TensorFlow operations to accumulate the total on a single
-  device (CPU or GPU) and then copy it back to all GPUs.
-* Using NVIDIA NCCL, described below in the NCCL section.
+* Using standard TensorFlow operations to accumulate the total on a single
+  device (CPU or GPU) and then copy it back to all GPUs.
+* Using NVIDIA® NCCL, described below in the NCCL section.

-This is available in the benchmark scripts for local execution only, as the
-'replicated' variable_update mode.
+This mode can be used in the script by passing `--variable_update=replicated`.

### Replicated Variables in Distributed Training

-The replicated method for variables can be extended to distributed training.
-One way to do this like the replicated mode: aggregate the gradients fully
-across the cluster and apply them to each local copy of the variable. This may
-be shown in a future version of this scripts; the scripts do present a different
+The replicated method for variables can be extended to distributed training. One
+way to do this, like the replicated mode, is to aggregate the gradients fully
+across the cluster and apply them to each local copy of the variable. This may
+be shown in a future version of these scripts; the scripts do present a different
variation, described here.

In this mode, in addition to each GPU's copy of the variables, a master copy is
@@ -314,28 +270,30 @@ immediately using the local copies of the variables. As the gradients of the
weights become available, they are sent back to the parameter servers and all
local copies are updated:

-1. All the gradients from the GPU on the same worker are aggregated together.
-2. Aggregated gradients from each worker are sent to the parameter server that
-   owns the variable, where the specified optimizer is used to update the
-   master copy of the variable.
-3. Each worker updates its local copy of the variable from the master. In
-   the example model, this is done with a cross-replica barrier that waits for
-   all the workers to finish updating the variables, and fetches the new
-   variable only after the barrier has been released by all replicas. Once the
-   copy finishes for all variables, this marks the end of a training step, and a
-   new step can start.
+1. All the gradients from the GPU on the same worker are aggregated together
+   (see the sketch below).
+2. Aggregated gradients from each worker are sent to the parameter server that
+   owns the variable, where the specified optimizer is used to update the
+   master copy of the variable.
+3. Each worker updates its local copy of the variable from the master. In the
+   example model, this is done with a cross-replica barrier that waits for all
+   the workers to finish updating the variables, and fetches the new variable
+   only after the barrier has been released by all replicas. Once the copy
+   finishes for all variables, this marks the end of a training step, and a new
+   step can start.

Although this sounds similar to the standard use of parameter servers, the
-performance is often better in many cases. This is largely due to the fact the
+performance is often better. This is largely due to the fact that the
computation can happen without any delay, and much of the copy latency of early
gradients can be hidden by later computation layers.
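
To make step 1 above concrete, below is a minimal sketch of aggregating the
per-GPU gradients on a worker by averaging them. The helper name and the use of
a plain arithmetic mean are illustrative assumptions, not the exact code in the
scripts:

```python
import tensorflow as tf

def aggregate_gradients(tower_grads):
  """Averages gradients across GPU towers.

  Args:
    tower_grads: a list with one entry per GPU, each entry a list of
      (gradient, variable) pairs from Optimizer.compute_gradients().

  Returns:
    A list of (gradient, variable) pairs with the gradient averaged
    across all towers.
  """
  aggregated = []
  for grads_and_vars in zip(*tower_grads):
    grads = [g for g, _ in grads_and_vars]
    avg_grad = tf.add_n(grads) / float(len(grads))
    # Every tower holds a copy of the same variable; keep the first.
    aggregated.append((avg_grad, grads_and_vars[0][1]))
  return aggregated
```

The averaged gradients are what get sent, in step 2, to the parameter server
that owns each variable.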
-This is available in the benchmark scripts as the 'distributed_replicated' -variable_update mode. +This mode can be used in the script by passing +`--variable_update=distributed_replicated`. -![distributed_replicated mode]( -../images/perf_distributed_replicated_mode_doc.png){ -width="900" style="max-width: inherit"} + +
+ distributed_replicated mode +

#### NCCL

In order to broadcast variables and aggregate gradients across different GPUs
within the same host machine, we can use the default TensorFlow implicit copy
mechanism.

-However, we can instead use the optional NCCL support. NCCL is an NVIDIA
-library that can efficiently broadcast and aggregate data across different GPUs.
-It schedules a cooperating kernel on each GPU that knows how to best utilize the
-underlying hardware topology; this kernel uses a single SM of the GPU.
+However, we can instead use the optional NCCL (@{tf.contrib.nccl}) support. NCCL
+is an NVIDIA® library that can efficiently broadcast and aggregate data across
+different GPUs. It schedules a cooperating kernel on each GPU that knows how to
+best utilize the underlying hardware topology; this kernel uses a single SM of
+the GPU.

In our experiment, we demonstrate that although NCCL often leads to much faster
-data aggregation by itself, it doesn't necessarily lead to faster training. Our
+data aggregation by itself, it doesn't necessarily lead to faster training. Our
hypothesis is that the implicit copies are essentially free since they go to the
copy engine on GPU, as long as its latency can be hidden by the main computation
-itself. Although NCCL can transfer data faster, it takes one SM away, and adds
-more pressure to the underlying L2 cache. Our results show that for 8-GPUs,
-NCCL often leads to better performance. However, for fewer GPUs, the implicit
-copies often perform better.
+itself. Although NCCL can transfer data faster, it takes one SM away, and adds
+more pressure to the underlying L2 cache. Our results show that for 8 GPUs, NCCL
+often leads to better performance. However, for fewer GPUs, the implicit copies
+often perform better.

#### Staged Variables

We further introduce a staged-variable mode where we use staging areas for both
-the variable reads, and their updates.
-Similar to software pipelining of the input pipeline, this can hide the data
-copy latency.
-If the computation time takes longer than the copy and aggregation, the copy
-itself becomes essentially free.
+the variable reads and their updates. Similar to software pipelining of the
+input pipeline, this can hide the data copy latency. If the computation time
+takes longer than the copy and aggregation, the copy itself becomes essentially
+free.

The downside is that all the weights read are from the previous training step.
-So it is a different algorithm from SGD.
-But it is possible to improve its convergence by adjusting learning rate and
-other hyperparameters.
-
-## Conclusions
-
-In this high-performance model, we present a number of options to build
-high-performance models in TensorFlow.
-Due to the flexible design in TensorFlow, advanced features like this often
-requires no system-level changes, and can be largely achieved through
-model-level changes.
-
-We do not claim which combination works best for a particular model.
-That should be left to the engineers who build the model and the training system.
-Many of the ingredients of the high-performance model will find their ways
-to high-level primitives that become transparent to users.
-However, we have shown that advanced users can easily tune and modify the
-underlying model behavior using low-level primitives.
-This could be very useful when improving performance for particular system
-setups and model configurations.
+So it is a different algorithm from SGD. 
But it is possible to improve its +convergence by adjusting learning rate and other hyperparameters. From 978d492bf0ae708644cf6d61a5aaf1e503a6c04f Mon Sep 17 00:00:00 2001 From: Toby Boyd Date: Tue, 2 May 2017 13:04:08 -0800 Subject: [PATCH 24/51] Final draft Change: 154876646 --- tensorflow/docs_src/performance/benchmarks.md | 128 +++++++++++------- 1 file changed, 77 insertions(+), 51 deletions(-) diff --git a/tensorflow/docs_src/performance/benchmarks.md b/tensorflow/docs_src/performance/benchmarks.md index 8c0cff138de..bfb47d9f908 100644 --- a/tensorflow/docs_src/performance/benchmarks.md +++ b/tensorflow/docs_src/performance/benchmarks.md @@ -1,17 +1,17 @@ -# TensorFlow Performance Benchmarks +# Benchmarks ## Overview A selection of image classification models were tested across multiple platforms to create a point of reference for the TensorFlow community. The methodology, -links to the scripts, and commands to reproduce the results are in the -[appendix](#appendix). +links to the benchmark scripts, and commands to reproduce the results are in the +[Appendix](#appendix). ## Results for image classification models -InceptionV3 ([arXiv:1512.00567](https://arxiv.org/abs/1512.00567)), -ResNet-50 ([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), -ResNet-152 ([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), VGG16 +InceptionV3 ([arXiv:1512.00567](https://arxiv.org/abs/1512.00567)), ResNet-50 +([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), ResNet-152 +([arXiv:1512.03385](https://arxiv.org/abs/1512.03385)), VGG16 ([arXiv:1409.1556](https://arxiv.org/abs/1409.1556)), and [AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf) were tested using the [ImageNet](http://www.image-net.org/) data set. Tests were @@ -27,32 +27,32 @@ input pipeline and the underlying disk I/O are saturating the compute units. ### Training with NVIDIA® DGX-1™ (NVIDIA® Tesla® P100) -
- +
+
Details and additional results are in the [Details for NVIDIA® DGX-1™ (NVIDIA® -Tesla® P100)](#details-for-nvidia®-dgx-1™-nvidia®-tesla®-p100) section. +Tesla® P100)](#details_for_nvidia_dgx-1tm_nvidia_tesla_p100) section. ### Training with NVIDIA® Tesla® K80
- +
Details and additional results are in the [Details for Google Compute Engine -(NVIDIA® Tesla® K80)](#details-for-google-compute-engine-nvidia®-tesla®-k80) and +(NVIDIA® Tesla® K80)](#details_for_google_compute_engine_nvidia_tesla_k80) and [Details for Amazon EC2 (NVIDIA® Tesla® -K80)](#details-for-amazon-ec2-nvidia®-tesla®-k80) sections. +K80)](#details_for_amazon_ec2_nvidia_tesla_k80) sections. ### Distributed training with NVIDIA® Tesla® K80
- +
Details and additional results are in the [Details for Amazon EC2 Distributed -(NVIDIA® Tesla® K80)](#details-for-amazon-ec2-distributed-nvidia®-tesla®-k80) +(NVIDIA® Tesla® K80)](#details_for_amazon_ec2_distributed_nvidia_tesla_k80) section. ### Compare synthetic with real data training @@ -82,12 +82,15 @@ section. * **TensorFlow GitHub hash:** b1e174e * **Build Command:** `bazel build -c opt --copt=-march="haswell" --config=cuda //tensorflow/tools/pip_package:build_pip_package` -* **Disk:** local SSD +* **Disk:** Local SSD * **DataSet:** ImageNet -Batch size and optimizer used for each model. +Batch size and optimizer used for each model are listed in the table below. In +addition to the batch sizes listed in the table, InceptionV3, ResNet-50, +ResNet-152, and VGG16 were tested with a batch size of 32. Those results are in +the *other results* section. - | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 +Options | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ------------------ | ----------- | --------- | ---------- | ------- | ----- Batch size per GPU | 64 | 64 | 64 | 512 | 64 Optimizer | sgd | sgd | sgd | sgd | sgd @@ -104,10 +107,8 @@ VGG16 | replicated (with NCCL) | n/a ### Results -Batch size and optimizer used for each model are listed in the table below. -
- +
@@ -136,6 +137,28 @@ GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 Training AlexNet with real data on 8 GPUs was excluded from the graph and table above due to it maxing out the input pipeline. +### Other Results + +The results below are all with a batch size of 32. + +**Training synthetic data** + +GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16 +---- | ----------- | --------- | ---------- | ----- +1 | 128 | 210 | 85.3 | 124 +2 | 259 | 412 | 166 | 241 +4 | 520 | 827 | 330 | 470 +8 | 995 | 1623 | 643 | 738 + +**Training real data** + +GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16 +---- | ----------- | --------- | ---------- | ----- +1 | 130 | 208 | 85.0 | 124 +2 | 257 | 403 | 163 | 221 +4 | 507 | 814 | 325 | 401 +8 | 966 | 1525 | 641 | 619 + ## Details for Google Compute Engine (NVIDIA® Tesla® K80) ### Environment @@ -156,7 +179,7 @@ addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were tested with a batch size of 32. Those results are in the *other results* section. - | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 +Options | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ------------------ | ----------- | --------- | ---------- | ------- | ----- Batch size per GPU | 64 | 64 | 32 | 512 | 32 Optimizer | sgd | sgd | sgd | sgd | sgd @@ -184,10 +207,10 @@ GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ---- | ----------- | --------- | ---------- | ------- | ----- -1 | 30.5 | 56.7 | 20.7 | 639 | 30.2 -2 | 57.8 | 107 | 39 | 1136 | 55.5 -4 | 115 | 211 | 77.3 | 2067 | 106 -8 | 225 | 418 | 150 | 4056 | 213 + 1 | 30.6 | 56.7 | 20.7 | 639 | 30.2 + 2 | 58.4 | 107 | 39.0 | 1136 | 55.5 + 4 | 115 | 211 | 77.3 | 2067 | 106 + 8 | 225 | 422 | 151 | 4056 | 213 ### Other Results @@ -204,10 +227,10 @@ GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) ---- | --------------------------- | ------------------------- -1 | 29.3 | 53.6 -2 | 55 | 102 -4 | 109 | 200 -8 | 215 | 387 + 1 | 29.5 | 53.6 + 2 | 55.4 | 102 + 4 | 110 | 201 + 8 | 216 | 387 ## Details for Amazon EC2 (NVIDIA® Tesla® K80) @@ -230,7 +253,7 @@ addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were tested with a batch size of 32. Those results are in the *other results* section. - | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 +Options | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ------------------ | ----------- | --------- | ---------- | ------- | ----- Batch size per GPU | 64 | 64 | 32 | 512 | 32 Optimizer | sgd | sgd | sgd | sgd | sgd @@ -289,7 +312,7 @@ GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) ---- | --------------------------- | ------------------------- 1 | 30.0 | 53.6 -2 | 57.5 | 101 +2 | 57.5 | 102 4 | 113 | 202 8 | 212 | 379 @@ -313,7 +336,7 @@ addition to the batch sizes listed in the table, InceptionV3 and ResNet-50 were tested with a batch size of 32. Those results are in the *other results* section. - | InceptionV3 | ResNet-50 | ResNet-152 +Options | InceptionV3 | ResNet-50 | ResNet-152 ------------------ | ----------- | --------- | ---------- Batch size per GPU | 64 | 64 | 32 Optimizer | sgd | sgd | sgd @@ -337,7 +360,7 @@ used with the following exceptions: ### Results
- +

@@ -374,34 +397,37 @@ GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)

### Executing benchmark tests

-The code for the benchmarks was created to be both used for benchmarking
-TensorFlow as well as used as a tool to test hardware platforms. The benchmark
-code includes modes such as `trivial` that run a virtually empty model that is
-useful for testing the maximum possibly samples/sec for the input pipeline among
-other things. Not only does this test TensorFlow but also the throughput of the
-underlying systems. There are two ways to execute the benchmarks in
-[tf_cnn_benchmarks.py](TODO: LINK TO GITHUB):
+The [benchmark code](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
+was created to be used for benchmarking TensorFlow as well as used as a tool to
+test hardware platforms. Techniques used in the benchmark scripts are detailed
+in @{$performance_models$High-Performance Models}.

-1. Execute [tf_cnn_benchmarks.py](TODO: LINK TO GITHUB) directly
-2. Utilize the [small wrapper](TODO: LINK TO GITHUB) that helps pick the
-   correct config
+There are two ways to execute the benchmark code:
+
+1. Execute [tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py)
+   directly.
+2. Utilize the [scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/main.py)
+   that help pick the correct config for each platform and execute
+   `tf_cnn_benchmarks.py`.

The wrapper is suggested as a starting point. Then investigate the variety of
-options available in `tf_cnn_benchmarks.py`. While the wrapper extensive
-examples, below are a couple highlights.
+options available in `tf_cnn_benchmarks.py`. Below are a couple of examples of
+using the wrapper.

-Run ResNet-50 on a single instance with 8 GPUs. The `system` argument is used to
-determine the optimal configuration. The supported values are gce, aws, and
-dgx1. If `system` is not passeed, the best config for the most widely available
-hardware is used.
+**Single Server**
+This example illustrates training ResNet-50 on a single instance with 8 GPUs.
+The `system` flag is used to determine the optimal configuration. The
+supported values are gce, aws, and dgx1. If `system` is not passed, the best
+config for the most widely available hardware is used.

```bash
python main.py --model=resnet50 --num_gpus=8
python main.py --system=aws --model=resnet50 --num_gpus=8
```

-Run ResNet-50 on 2 hosts, e.g. host_0 (10.0.0.1) and host_1 (10.0.0.2), with 8
-GPUs each on aws.
+**Distributed**
+This example illustrates training ResNet-50 on 2 hosts, e.g. host_0 (10.0.0.1)
+and host_1 (10.0.0.2), with 8 GPUs each on AWS (Amazon EC2).

```bash
# Run the following commands on host_0 (10.0.0.1):

From c590b00b2cbd757a94594da55de89d1b66a8b064 Mon Sep 17 00:00:00 2001
From: Andrew Selle
Date: Tue, 2 May 2017 13:05:29 -0800
Subject: [PATCH 25/51] Fix losses documentation.

Fix documentation of get_total_loss() to be correct, and add a helpful
comment about a common pitfall.

Change: 154876822
---
 tensorflow/python/ops/losses/util.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py
index 09ad874faee..3414df475f5 100644
--- a/tensorflow/python/ops/losses/util.py
+++ b/tensorflow/python/ops/losses/util.py
@@ -57,7 +57,7 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):


 def get_regularization_losses(scope=None):
-  """Gets the regularization losses.
+  """Gets the list of regularization losses.

   Args:
     scope: An optional scope for filtering the losses to return.
@@ -88,7 +88,11 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"):
 def get_total_loss(add_regularization_losses=True, name="total_loss"):
   """Returns a tensor whose value represents the total loss.

-  Notice that the function adds the given losses to the regularization losses.
+  In particular, this adds any losses you have added with `tf.add_loss()` to
+  any regularization losses that have been added by regularization parameters
+  on layer constructors, e.g. `tf.layers`. Be very sure to use this if you
+  are constructing a loss_op manually. Otherwise, regularization arguments
+  on `tf.layers` methods will not function.

   Args:
     add_regularization_losses: A boolean indicating whether or not to use the

From 5b6d276679b62ad4992d397b1987a4a4a2bd84c6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 2 May 2017 13:10:44 -0800
Subject: [PATCH 26/51] [XLA] Second change for HLO interpreter.

Extends HloEvaluator to allow evaluation of HLO Computation or single HLO
instruction with non-constant operands, by traversing the instructions in post
order and keeping track of each instruction along the way as evaluated
literals.

Change: 154877580
---
 tensorflow/compiler/xla/service/BUILD          |   2 +
 .../compiler/xla/service/hlo_evaluator.cc      | 256 +++++++++++-------
 .../compiler/xla/service/hlo_evaluator.h       |  81 +++++-
 .../xla/service/hlo_evaluator_test.cc          |  65 ++++-
 tensorflow/core/grappler/costs/BUILD           |   4 +-
 tensorflow/compiler/xla/service/hlo_query.cc   |  10 +
 tensorflow/compiler/xla/service/hlo_query.h    |   4 +
 6 files changed, 306 insertions(+), 112 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index a4e35135d73..bdb69b6e55e 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -80,6 +80,8 @@ cc_library(
         ":hlo_query",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index ebe74280525..1b3babc2140 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -26,20 +26,21 @@ limitations under the License.
#include "tensorflow/compiler/xla/index_util.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/bitmap.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -53,9 +54,7 @@ std::unique_ptr ElementWiseUnaryOp( const Literal& operand) { DCHECK(ShapeUtil::SameDimensions(shape, operand.shape())); - auto result = MakeUnique(); - *result->mutable_shape() = shape; - LiteralUtil::Reserve(ShapeUtil::ElementsIn(shape), result.get()); + auto result = LiteralUtil::CreateFromShape(shape); std::vector multi_index(ShapeUtil::Rank(result->shape()), 0); do { @@ -74,9 +73,7 @@ std::unique_ptr ElementWiseBinaryOp( DCHECK(ShapeUtil::SameDimensions(shape, rhs.shape())); DCHECK(ShapeUtil::SameDimensions(lhs.shape(), rhs.shape())); - auto result = MakeUnique(); - *result->mutable_shape() = shape; - LiteralUtil::Reserve(ShapeUtil::ElementsIn(shape), result.get()); + auto result = LiteralUtil::CreateFromShape(shape); std::vector multi_index(ShapeUtil::Rank(result->shape()), 0); do { @@ -99,9 +96,7 @@ std::unique_ptr ElementWiseTernaryOp( DCHECK(ShapeUtil::SameDimensions(lhs.shape(), rhs.shape())); DCHECK(ShapeUtil::SameDimensions(rhs.shape(), ehs.shape())); - auto result = MakeUnique(); - *result->mutable_shape() = shape; - LiteralUtil::Reserve(ShapeUtil::ElementsIn(shape), result.get()); + auto result = LiteralUtil::CreateFromShape(shape); std::vector multi_index(ShapeUtil::Rank(result->shape()), 0); do { @@ -130,29 +125,130 @@ NativeT AbsoluteVal(NativeT value) { return std::abs(value); } -template -StatusOr> EvaluateOpForLiteralInternal( - HloInstruction* instruction) { - DCHECK(hlo_query::AllOperandsAreConstants(*instruction)); +} // namespace +Status HloEvaluator::DefaultAction(HloInstruction* hlo) { + VLOG(2) << "Handle instruction: " << hlo->ToString(); + Shape shape = hlo->shape(); + TF_CHECK_OK(ShapeUtil::ValidateShape(shape)); + + TF_ASSIGN_OR_RETURN(evaluated_[hlo], EvaluateBasedOnType(hlo)); + return Status::OK(); +} + +Status HloEvaluator::HandleParameter(HloInstruction* parameter) { + VLOG(2) << "HandleParameter: " << parameter->ToString(); + const Literal* input_literal = arg_literals_[parameter->parameter_number()]; + VLOG(2) << "Parameter evaluated to: " + << LiteralUtil::ToString(*input_literal); + CHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape())); + + evaluated_[parameter] = MakeUnique(*input_literal); + return Status::OK(); +} + +Status HloEvaluator::HandleConstant(HloInstruction* constant, + const Literal& literal) { + VLOG(2) << "HandleConstant: " << constant->ToString(); + CHECK(ShapeUtil::Equal(constant->shape(), literal.shape())); + + 
evaluated_[constant] = MakeUnique(literal); + return Status::OK(); +} + +StatusOr> HloEvaluator::Evaluate( + HloComputation* computation, + tensorflow::gtl::ArraySlice args) { + arg_literals_ = args; + TF_RETURN_IF_ERROR(computation->Accept(this)); + return std::move(FindOrDie(evaluated_, computation->root_instruction())); +} + +StatusOr> HloEvaluator::Evaluate( + HloInstruction* instruction, + tensorflow::gtl::ArraySlice args) { + DCHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction)); + Shape shape = instruction->shape(); + TF_CHECK_OK(ShapeUtil::ValidateShape(shape)); + + arg_literals_ = args; + + // Evaluate operands of Parameter type against the input literals which caches + // the evaluated literal results. + for (const auto operand : instruction->operands()) { + if (operand->opcode() == HloOpcode::kParameter) { + TF_CHECK_OK(HandleParameter(operand)); + } else if (operand->opcode() == HloOpcode::kConstant) { + evaluated_[operand] = MakeUnique(operand->literal()); + } + } + + TF_RETURN_IF_ERROR(instruction->Visit(this)); + return std::move(FindOrDie(evaluated_, instruction)); +} + +StatusOr> HloEvaluator::EvaluateBasedOnType( + HloInstruction* instruction) { + Shape shape = instruction->shape(); + TF_CHECK_OK(ShapeUtil::ValidateShape(shape)); + + switch (shape.element_type()) { + case PRED: + return EvaluateSameTypedElementwise(instruction); + case U8: + return EvaluateSameTypedElementwise(instruction); + case U16: + return Unimplemented("unhandled primitive type: %s.", + PrimitiveType_Name(U16).c_str()); + case U32: + return EvaluateSameTypedElementwise(instruction); + case U64: + return EvaluateSameTypedElementwise(instruction); + case S8: + return EvaluateSameTypedElementwise(instruction); + case S16: + return Unimplemented("unhandled primitive type: %s.", + PrimitiveType_Name(S16).c_str()); + case S32: + return EvaluateSameTypedElementwise(instruction); + case S64: + return EvaluateSameTypedElementwise(instruction); + case F16: + return Unimplemented("unhandled primitive type: %s.", + PrimitiveType_Name(F16).c_str()); + case F32: + return EvaluateSameTypedElementwise(instruction); + case F64: + return EvaluateSameTypedElementwise(instruction); + default: + return Unimplemented("unhandled primitive type: %s.", + PrimitiveType_Name(shape.element_type()).c_str()); + } +} + +template +StatusOr> HloEvaluator::EvaluateSameTypedElementwise( + HloInstruction* instruction) { const std::vector& operands = instruction->operands(); HloOpcode opcode = instruction->opcode(); const Shape& shape = instruction->shape(); switch (opcode) { // TODO(b/35950897): many of the stl function used here are not overloaded - // for all XLA primitive types. + // for every XLA primitive types. + // Unary element-wise ops. + // case HloOpcode::kAbs: CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return AbsoluteVal(operand); }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kCeil: CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return std::ceil(operand); }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kConvert: CHECK_EQ(operands.size(), 1); // TODO(b/35950897): implement Convert. 
@@ -162,37 +258,37 @@ StatusOr> EvaluateOpForLiteralInternal( CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return operand; }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kExp: CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return std::exp(operand); }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kFloor: CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return std::floor(operand); }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kIsFinite: CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return std::isfinite(operand); }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kLog: CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return std::log(operand); }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kLogicalNot: CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return !operand; }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kNegate: CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return -operand; }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kSign: CHECK_EQ(operands.size(), 1); CHECK(primitive_util::IsIntegralType(shape.element_type())); @@ -201,95 +297,113 @@ StatusOr> EvaluateOpForLiteralInternal( return (NativeT(0) < operand) - (operand < NativeT(0)); }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); case HloOpcode::kTanh: CHECK_EQ(operands.size(), 1); return ElementWiseUnaryOp( shape, [](NativeT operand) { return std::tanh(operand); }, - operands[0]->literal()); + GetEvaluatedLiteralFor(operands[0])); // Binary element-wise ops. 
+ // case HloOpcode::kAdd: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs + rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kDivide: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs / rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kMultiply: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs * rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kSubtract: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs - rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kEq: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs == rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kGe: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs >= rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kGt: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs > rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kLe: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs <= rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kLt: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs < rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kNe: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs != rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kMaximum: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return std::max(lhs, rhs); }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kMinimum: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return std::min(lhs, rhs); }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kPower: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return std::pow(lhs, rhs); }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case 
HloOpcode::kRemainder: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return std::remainder(lhs, rhs); }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kLogicalAnd: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs && rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); case HloOpcode::kLogicalOr: CHECK_EQ(operands.size(), 2); return ElementWiseBinaryOp( shape, [](NativeT lhs, NativeT rhs) { return lhs || rhs; }, - operands[0]->literal(), operands[1]->literal()); + GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1])); // Ternary element-wise ops. + // case HloOpcode::kClamp: { CHECK_EQ(operands.size(), 3); std::function clamp_op = @@ -297,8 +411,9 @@ StatusOr> EvaluateOpForLiteralInternal( return std::max(low, std::min(value, high)); }; return ElementWiseTernaryOp( - shape, std::move(clamp_op), operands[0]->literal(), - operands[1]->literal(), operands[2]->literal()); + shape, std::move(clamp_op), GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1]), + GetEvaluatedLiteralFor(operands[2])); } break; case HloOpcode::kSelect: { CHECK_EQ(operands.size(), 3); @@ -311,8 +426,9 @@ StatusOr> EvaluateOpForLiteralInternal( return on_false; }; return ElementWiseTernaryOp( - shape, std::move(select_op), operands[0]->literal(), - operands[1]->literal(), operands[2]->literal()); + shape, std::move(select_op), GetEvaluatedLiteralFor(operands[0]), + GetEvaluatedLiteralFor(operands[1]), + GetEvaluatedLiteralFor(operands[2])); } break; default: return Unimplemented("unhandled HLO ops for HloEvaluator: %s.", @@ -320,48 +436,4 @@ StatusOr> EvaluateOpForLiteralInternal( } } -} // namespace - -/* static */ StatusOr> -HloEvaluator::EvaluateOpForLiteral(HloInstruction* instruction) { - DCHECK(hlo_query::AllOperandsAreConstants(*instruction)); - - Shape shape = instruction->shape(); - TF_CHECK_OK(ShapeUtil::ValidateShape(shape)); - - // REVIEW QUESTION: other than a few operations, do we need to handle the - // general case of operands being of different types in the context of the - // evaluator? 
- - switch (shape.element_type()) { - case PRED: - return EvaluateOpForLiteralInternal(instruction); - case U8: - return EvaluateOpForLiteralInternal(instruction); - case U16: - LOG(FATAL) << "U16/uint16 is unimplemented."; - case U32: - return EvaluateOpForLiteralInternal(instruction); - case U64: - return EvaluateOpForLiteralInternal(instruction); - case S8: - return EvaluateOpForLiteralInternal(instruction); - case S16: - LOG(FATAL) << "S16/int16 is unimplemented."; - case S32: - return EvaluateOpForLiteralInternal(instruction); - case S64: - return EvaluateOpForLiteralInternal(instruction); - case F16: - LOG(FATAL) << "F16 is unimplemented."; - case F32: - return EvaluateOpForLiteralInternal(instruction); - case F64: - return EvaluateOpForLiteralInternal(instruction); - default: - return Unimplemented("unhandled primitive type: %s.", - PrimitiveType_Name(shape.element_type()).c_str()); - } -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index c6ec650d674..6372a6c2690 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -18,22 +18,89 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/macros.h" namespace xla { -// Responsible for evaluating a HLO instruction with constant operands. -class HloEvaluator { +// Responsible for evaluating HLO and obtain literal as the evaluation results. +// +// This class is not thread-safe. +class HloEvaluator : public DfsHloVisitorWithDefault { public: - // Evaluates a single HLO instruction for constants and return the result as a - // Literal. - // Precondition: all operands of the instruction are constants, instruction is - // valid with corresponding number of operands for the given operator. + HloEvaluator() {} + ~HloEvaluator() override {} + + // Evaluates a HLO computation and an array of pointers to literals. + // Return the evaluated result as literal if successful. + // Precondition: argument literals are in post-order corresponding to the + // input instruction's parameters. + StatusOr> Evaluate( + HloComputation* computation, + tensorflow::gtl::ArraySlice arg_literals); + + // Evaluates a single HLO instruction and an array of pointers to literals. + // Return the evaluated result as literal if successful. + // Precondition: + // 1. argument literals are in post-order corresponding to the input + // instruction's parameters. + // 2. the instruction's operands must be of either Parameter or Constant type. // TODO(b/35950897): implement more ops other than element-wise ops. - static StatusOr> EvaluateOpForLiteral( + // TODO(b/35950897): handle broadcasts. + StatusOr> Evaluate( + HloInstruction* instruction, + tensorflow::gtl::ArraySlice arg_literals); + + protected: + // The following methods implement the DfsHloVisitor interface. + // + // DefaultAction here handles all non-specificialized (i.e., instruction + // without corresponding Handle* method) instructions. 
+ // TODO(b/35950897): it's likely better to refactor the switches here and push + // up the switch to templated methods instead, likely at DfsHloVisitor level. + Status DefaultAction(HloInstruction* hlo_instruction) override; + + Status HandleParameter(HloInstruction* parameter) override; + Status HandleConstant(HloInstruction* constant, + const Literal& literal) override; + + private: + // Evaluates a single HLO instruction return the result as a Literal if + // successful. A Status will be returned on error. + StatusOr> EvaluateBasedOnType( HloInstruction* instruction); + + // Evaluates an element-wise HLO instruction that has the same output literal + // type as the operands' types. + template + StatusOr> EvaluateSameTypedElementwise( + HloInstruction* instruction); + + // Returns the already-evaluated literal result for the instruction. + // Crash with log if the given instruction has not been evaluated previously. + const Literal& GetEvaluatedLiteralFor(const HloInstruction* hlo) { + auto it = evaluated_.find(hlo); + CHECK(it != evaluated_.end()) + << "could not find evaluated value for: " << hlo->ToString(); + return *(it->second); + } + + // Tracks the HLO instruciton and its evaluated literal result. + tensorflow::gtl::FlatMap> + evaluated_; + // Stores input literals, assuming they are in post-order. Literals are not + // owned by this class, and they must outlive the lifetime of the instance of + // this class. + tensorflow::gtl::ArraySlice arg_literals_; + + TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index 585fe65def3..443e5ad4f42 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -14,10 +14,13 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include #include #include +#include #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -29,9 +32,16 @@ limitations under the License. namespace xla { namespace { +class HloEvaluatorTest : public ::testing::Test { + protected: + HloEvaluatorTest() { evaluator_ = MakeUnique(); } + + std::unique_ptr evaluator_; +}; + // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp // with 3 operands. -TEST(HloEvaluatorTest, DoesClamp) { +TEST_F(HloEvaluatorTest, DoesClamp) { auto low = LiteralUtil::CreateR2({{0.f, 2.f}, {2.f, 4.f}}); auto high = LiteralUtil::CreateR2({{2.f, 4.f}, {4.f, 4.f}}); auto value = LiteralUtil::CreateR2({{0.f, 5.f}, {0.f, 4.f}}); @@ -44,7 +54,7 @@ TEST(HloEvaluatorTest, DoesClamp) { shape, HloOpcode::kClamp, c1.get(), c2.get(), c3.get()); std::unique_ptr result = - HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie(); + evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie(); auto expected = LiteralUtil::CreateR2({{0, 4}, {2, 4}}); @@ -53,7 +63,7 @@ TEST(HloEvaluatorTest, DoesClamp) { // Verifies that HloEvaluator evaluates a HLO instruction that performs select // with 3 operands. 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 585fe65def3..443e5ad4f42 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -14,10 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 
+#include <memory>
 #include <string>
 #include <vector>
+#include <utility>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -29,9 +32,16 @@ limitations under the License.
 namespace xla {
 namespace {
 
+class HloEvaluatorTest : public ::testing::Test {
+ protected:
+  HloEvaluatorTest() { evaluator_ = MakeUnique<HloEvaluator>(); }
+
+  std::unique_ptr<HloEvaluator> evaluator_;
+};
+
 // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
 // with 3 operands.
-TEST(HloEvaluatorTest, DoesClamp) {
+TEST_F(HloEvaluatorTest, DoesClamp) {
   auto low = LiteralUtil::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
   auto high = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto value = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
@@ -44,7 +54,7 @@ TEST(HloEvaluatorTest, DoesClamp) {
       shape, HloOpcode::kClamp, c1.get(), c2.get(), c3.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<float>({{0, 4}, {2, 4}});
 
@@ -53,7 +63,7 @@
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
 // with 3 operands.
-TEST(HloEvaluatorTest, DoesSelect) {
+TEST_F(HloEvaluatorTest, DoesSelect) {
   auto pred = LiteralUtil::CreateR2<bool>({{true, false}, {false, true}});
   auto on_true = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto on_false = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
@@ -66,7 +76,7 @@ TEST(HloEvaluatorTest, DoesSelect) {
       shape, HloOpcode::kSelect, c1.get(), c2.get(), c3.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<float>({{2, 5}, {0, 4}});
 
@@ -75,7 +85,7 @@
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise addition with 2 operands.
-TEST(HloEvaluatorTest, DoesAdd) {
+TEST_F(HloEvaluatorTest, DoesAdd) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
 
@@ -86,7 +96,7 @@ TEST(HloEvaluatorTest, DoesAdd) {
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, c1.get(), c2.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-96, 8}});
 
@@ -95,7 +105,7 @@
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise divide with 2 operands.
-TEST(HloEvaluatorTest, DoesDivide) {
+TEST_F(HloEvaluatorTest, DoesDivide) {
   auto lhs_s64 = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs_s64 = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
 
@@ -106,7 +116,7 @@ TEST(HloEvaluatorTest, DoesDivide) {
                                              c1_s64.get(), c2_s64.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {-25, 1}});
 
@@ -121,8 +131,7 @@ TEST(HloEvaluatorTest, DoesDivide) {
   instruction = HloInstruction::CreateBinary(shape_f64, HloOpcode::kDivide,
                                              c1_f64.get(), c2_f64.get());
 
-  result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+  result = evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   expected = LiteralUtil::CreateR2<double>({{0.45454545454545453, 0}, {-25, 1}});
 
@@ -132,21 +141,51 @@
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise abs op with 1 operand.
-TEST(HloEvaluatorTest, DoesAbs) {
+TEST_F(HloEvaluatorTest, DoesAbs) {
   auto operand = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
-
   Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
   auto c1 = HloInstruction::CreateConstant(std::move(operand));
   auto instruction =
       HloInstruction::CreateUnary(shape, HloOpcode::kAbs, c1.get());
 
   std::unique_ptr<Literal> result =
-      HloEvaluator::EvaluateOpForLiteral(instruction.get()).ConsumeValueOrDie();
+      evaluator_->Evaluate(instruction.get(), {}).ConsumeValueOrDie();
 
   auto expected = LiteralUtil::CreateR2<int64>({{1, 20}, {100, 4}});
 
   EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
 }
 
+// Verifies that HloEvaluator evaluates an HLO computation whose operands are
+// not limited to parameters and constants.
+TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
+  HloComputation::Builder builder(
+      ::testing::UnitTest::GetInstance()->current_test_info()->name());
+
+  auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto rhs2 = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
+  std::vector<const Literal*> args = {lhs.get(), rhs.get(), rhs2.get()};
+
+  Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
+
+  auto param_lhs = HloInstruction::CreateParameter(0, shape, "lhs");
+  auto param_rhs = HloInstruction::CreateParameter(1, shape, "rhs");
+  auto lhs_instruction = HloInstruction::CreateBinary(
+      shape, HloOpcode::kAdd, param_lhs.get(), param_rhs.get());
+
+  auto param_rhs2 = HloInstruction::CreateParameter(2, shape, "rhs2");
+  auto root_instruction = HloInstruction::CreateBinary(
+      shape, HloOpcode::kAdd, lhs_instruction.get(), param_rhs2.get());
+
+  builder.AddInstruction(std::move(root_instruction));
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(builder.Build().get(), args).ConsumeValueOrDie();
+
+  auto expected = LiteralUtil::CreateR2<int64>({{4, -16}, {-196, 12}});
+
+  EXPECT_TRUE(LiteralUtil::Equal(*result, *expected));
+}
+
 }  // namespace
 }  // namespace xla
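The hlo_query change that follows adds AllOperandsAreParametersOrConstants as a plain loop over operands. The same predicate can be phrased with std::all_of; the sketch below uses hypothetical Opcode and Instruction stand-ins, not the real HLO classes.

    #include <algorithm>
    #include <vector>

    // Hypothetical stand-ins for HloOpcode and HloInstruction.
    enum class Opcode { kParameter, kConstant, kAdd };

    struct Instruction {
      Opcode opcode;
      std::vector<const Instruction*> operands;
    };

    // std::all_of phrasing of the loop added in hlo_query.cc below.
    bool AllOperandsAreParametersOrConstants(const Instruction& instruction) {
      return std::all_of(
          instruction.operands.begin(), instruction.operands.end(),
          [](const Instruction* operand) {
            return operand->opcode == Opcode::kParameter ||
                   operand->opcode == Opcode::kConstant;
          });
    }

    int main() {
      Instruction p{Opcode::kParameter, {}};
      Instruction c{Opcode::kConstant, {}};
      Instruction add{Opcode::kAdd, {&p, &c}};
      return AllOperandsAreParametersOrConstants(add) ? 0 : 1;  // exits 0
    }

The loop form in the actual patch was presumably kept for symmetry with the neighboring AllOperandsAreParameters helper.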
diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc
index d6997378642..a153d73dbd8 100644
--- a/tensorflow/compiler/xla/service/hlo_query.cc
+++ b/tensorflow/compiler/xla/service/hlo_query.cc
@@ -32,6 +32,16 @@ bool IsConstantR0F32(HloInstruction* instruction, float* out) {
   return false;
 }
 
+bool AllOperandsAreParametersOrConstants(const HloInstruction& instruction) {
+  for (const auto& operand : instruction.operands()) {
+    if (operand->opcode() != HloOpcode::kParameter &&
+        operand->opcode() != HloOpcode::kConstant) {
+      return false;
+    }
+  }
+  return true;
+}
+
 bool AllOperandsAreParameters(const HloInstruction& instruction) {
   for (const auto& operand : instruction.operands()) {
     if (operand->opcode() != HloOpcode::kParameter) {
diff --git a/tensorflow/compiler/xla/service/hlo_query.h b/tensorflow/compiler/xla/service/hlo_query.h
index 56f3cfd863c..c79347bbf9d 100644
--- a/tensorflow/compiler/xla/service/hlo_query.h
+++ b/tensorflow/compiler/xla/service/hlo_query.h
@@ -28,6 +28,10 @@ namespace hlo_query {
 // Precondition: out != nullptr
 bool IsConstantR0F32(HloInstruction* instruction, float* out);
 
+// Returns whether all of an instruction's operands are either constants or
+// parameters.
+bool AllOperandsAreParametersOrConstants(const HloInstruction& instruction);
+
 // Returns whether all of an instruction's operands are parameters.
 bool AllOperandsAreParameters(const HloInstruction& instruction);
 

From a16a92f104fea20da6c0a62ebe2d91ab03429e29 Mon Sep 17 00:00:00 2001
From: Eugene Brevdo
Date: Tue, 2 May 2017 13:15:17 -0800
Subject: [PATCH 27/51] [tf distributions] Move the remaining whitelisted distributions to core.
Change: 154878206
---
 tensorflow/contrib/distributions/BUILD        | 174 ------------------
 tensorflow/contrib/distributions/__init__.py  |  22 +--
 .../bijectors/cholesky_outer_product_test.py  |   2 +-
 .../kernel_tests/bijectors/invert_test.py     |   2 +-
 .../ops/bijectors/bijector_test_util.py       |   2 +-
 .../contrib/distributions/python/ops/chi2.py  |   2 +-
 .../distributions/python/ops/mixture.py       |   2 +-
 .../python/ops/vector_student_t.py            |   2 +-
 .../contrib/seq2seq/python/ops/helper.py      |   4 +-
 .../python/kernel_tests/distributions/BUILD   | 174 ++++++++++++++++++
 .../distributions}/bernoulli_test.py          |  30 ++-
 .../kernel_tests/distributions}/beta_test.py  |  43 ++++-
 .../distributions}/categorical_test.py        |   2 +-
 .../dirichlet_multinomial_test.py             |   5 +-
 .../distributions}/dirichlet_test.py          |  35 +++-
 .../distributions}/exponential_test.py        |  47 ++++-
 .../kernel_tests/distributions}/gamma_test.py |  92 ++++++---
 .../distributions}/laplace_test.py            |  80 ++++++--
 .../distributions}/multinomial_test.py        |  56 +++---
 .../distributions}/student_t_test.py          | 131 ++++++++-----
 .../distributions}/uniform_test.py            |  29 ++-
 tensorflow/python/ops/distributions/BUILD     |   1 +
 .../ops/distributions}/bernoulli.py           |   0
 .../ops => python/ops/distributions}/beta.py  |   0
 .../ops/distributions}/categorical.py         |   0
 .../distributions/conditional_distribution.py |   2 +-
 .../ops/distributions}/dirichlet.py           |   0
 .../distributions}/dirichlet_multinomial.py   |   0
 .../ops/distributions}/exponential.py         |   2 +-
 .../ops => python/ops/distributions}/gamma.py |   0
 .../ops/distributions}/laplace.py             |   0
 .../ops/distributions}/multinomial.py         |   0
 tensorflow/python/ops/distributions/normal.py |   6 +-
 .../ops/distributions}/student_t.py           |  12 +-
 .../ops/distributions}/uniform.py             |   0
 35 files changed, 608 insertions(+), 351 deletions(-)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/bernoulli_test.py (94%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/beta_test.py (94%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/categorical_test.py (99%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/dirichlet_multinomial_test.py (99%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/dirichlet_test.py (94%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/exponential_test.py (88%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/gamma_test.py (93%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/laplace_test.py (92%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/multinomial_test.py (87%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/student_t_test.py (83%)
 rename tensorflow/{contrib/distributions/python/kernel_tests => python/kernel_tests/distributions}/uniform_test.py (93%)
 rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/bernoulli.py (100%)
 rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/beta.py (100%)
 rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/categorical.py (100%)
 rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/dirichlet.py (100%)
 rename 
tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/dirichlet_multinomial.py (100%) rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/exponential.py (98%) rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/gamma.py (100%) rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/laplace.py (100%) rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/multinomial.py (100%) rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/student_t.py (97%) rename tensorflow/{contrib/distributions/python/ops => python/ops/distributions}/uniform.py (100%) diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index 1b9bd6ad91c..9f675c66135 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -193,38 +193,6 @@ cuda_py_test( tags = ["notap"], # http://b/30441813 ) -cuda_py_test( - name = "bernoulli_test", - size = "small", - srcs = ["python/kernel_tests/bernoulli_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - ], -) - -cuda_py_test( - name = "beta_test", - size = "small", - srcs = ["python/kernel_tests/beta_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:client", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn_ops", - "//tensorflow/python:platform_test", - ], -) - cuda_py_test( name = "binomial_test", size = "small", @@ -238,24 +206,6 @@ cuda_py_test( ], ) -cuda_py_test( - name = "categorical_test", - size = "small", - srcs = ["python/kernel_tests/categorical_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:random_ops", - ], -) - cuda_py_test( name = "chi2_test", srcs = ["python/kernel_tests/chi2_test.py"], @@ -287,66 +237,6 @@ cuda_py_test( ], ) -cuda_py_test( - name = "dirichlet_test", - size = "small", - srcs = ["python/kernel_tests/dirichlet_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - ], -) - -cuda_py_test( - name = "dirichlet_multinomial_test", - size = "medium", - srcs = ["python/kernel_tests/dirichlet_multinomial_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - ], -) - -cuda_py_test( - name = "exponential_test", - srcs = ["python/kernel_tests/exponential_test.py"], - additional_deps = 
[ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:client", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:nn_ops", - "//tensorflow/python:platform_test", - ], -) - -cuda_py_test( - name = "gamma_test", - srcs = ["python/kernel_tests/gamma_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:client", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:nn_ops", - "//tensorflow/python:platform_test", - ], -) - cuda_py_test( name = "geometric_test", size = "small", @@ -378,36 +268,6 @@ cuda_py_test( ], ) -cuda_py_test( - name = "laplace_test", - srcs = ["python/kernel_tests/laplace_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:client", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:nn_ops", - "//tensorflow/python:platform_test", - ], -) - -cuda_py_test( - name = "multinomial_test", - srcs = ["python/kernel_tests/multinomial_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - ], -) - cuda_py_test( name = "mvn_diag_test", size = "small", @@ -528,24 +388,6 @@ cuda_py_test( tags = ["nomsan"], # disable to avoid false positives from scipy. ) -cuda_py_test( - name = "student_t_test", - size = "small", - srcs = ["python/kernel_tests/student_t_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn_ops", - "//tensorflow/python:platform_test", - ], - tags = ["nomsan"], # disable to avoid false positives from scipy. 
-) - cuda_py_test( name = "vector_student_t_test", size = "medium", @@ -562,22 +404,6 @@ cuda_py_test( ], ) -cuda_py_test( - name = "uniform_test", - size = "small", - srcs = ["python/kernel_tests/uniform_test.py"], - additional_deps = [ - ":distributions_py", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:errors", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:math_ops", - ], -) - cuda_py_test( name = "wishart_test", size = "small", diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py index cafa477f448..6ea74fab0e4 100644 --- a/tensorflow/contrib/distributions/__init__.py +++ b/tensorflow/contrib/distributions/__init__.py @@ -23,25 +23,16 @@ from __future__ import print_function # pylint: disable=unused-import,wildcard-import,line-too-long,g-importing-member from tensorflow.contrib.distributions.python.ops import bijectors -from tensorflow.contrib.distributions.python.ops.bernoulli import * -from tensorflow.contrib.distributions.python.ops.beta import * from tensorflow.contrib.distributions.python.ops.binomial import * -from tensorflow.contrib.distributions.python.ops.categorical import * from tensorflow.contrib.distributions.python.ops.chi2 import * from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import * from tensorflow.contrib.distributions.python.ops.deterministic import * -from tensorflow.contrib.distributions.python.ops.dirichlet import * -from tensorflow.contrib.distributions.python.ops.dirichlet_multinomial import * from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse -from tensorflow.contrib.distributions.python.ops.exponential import * -from tensorflow.contrib.distributions.python.ops.gamma import * from tensorflow.contrib.distributions.python.ops.geometric import * from tensorflow.contrib.distributions.python.ops.inverse_gamma import * -from tensorflow.contrib.distributions.python.ops.laplace import * from tensorflow.contrib.distributions.python.ops.logistic import * from tensorflow.contrib.distributions.python.ops.mixture import * -from tensorflow.contrib.distributions.python.ops.multinomial import * from tensorflow.contrib.distributions.python.ops.mvn_diag import * from tensorflow.contrib.distributions.python.ops.mvn_diag_plus_low_rank import * from tensorflow.contrib.distributions.python.ops.mvn_tril import * @@ -53,14 +44,23 @@ from tensorflow.contrib.distributions.python.ops.quantized_distribution import * from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import * from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import * from tensorflow.contrib.distributions.python.ops.sample_stats import * -from tensorflow.contrib.distributions.python.ops.student_t import * from tensorflow.contrib.distributions.python.ops.transformed_distribution import * -from tensorflow.contrib.distributions.python.ops.uniform import * from tensorflow.contrib.distributions.python.ops.wishart import * +from tensorflow.python.ops.distributions.bernoulli import * +from tensorflow.python.ops.distributions.beta import * +from tensorflow.python.ops.distributions.categorical import * from tensorflow.python.ops.distributions.conditional_distribution import * +from 
tensorflow.python.ops.distributions.dirichlet import * +from tensorflow.python.ops.distributions.dirichlet_multinomial import * from tensorflow.python.ops.distributions.distribution import * +from tensorflow.python.ops.distributions.exponential import * +from tensorflow.python.ops.distributions.gamma import * from tensorflow.python.ops.distributions.kullback_leibler import * +from tensorflow.python.ops.distributions.laplace import * +from tensorflow.python.ops.distributions.multinomial import * from tensorflow.python.ops.distributions.normal import * +from tensorflow.python.ops.distributions.student_t import * +from tensorflow.python.ops.distributions.uniform import * # pylint: enable=unused-import,wildcard-import,line-too-long,g-importing-member diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py index 267e4ad3509..a4688829f1f 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/cholesky_outer_product_test.py @@ -19,11 +19,11 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.distributions.python.ops import bijectors -from tensorflow.contrib.distributions.python.ops import gamma as gamma_lib from tensorflow.contrib.distributions.python.ops import transformed_distribution as transformed_distribution_lib from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops +from tensorflow.python.ops.distributions import gamma as gamma_lib from tensorflow.python.platform import test diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py index 267e4ad3509..a4688829f1f 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/invert_test.py @@ -19,11 +19,11 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.distributions.python.ops import bijectors -from tensorflow.contrib.distributions.python.ops import gamma as gamma_lib from tensorflow.contrib.distributions.python.ops import transformed_distribution as transformed_distribution_lib from tensorflow.contrib.distributions.python.ops.bijectors.bijector_test_util import assert_scalar_congruency from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops +from tensorflow.python.ops.distributions import gamma as gamma_lib from tensorflow.python.platform import test diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py b/tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py index a0834423329..ff3535c6264 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/bijector_test_util.py @@ -20,9 +20,9 @@ from __future__ import print_function import numpy as np -from tensorflow.contrib.distributions.python.ops import uniform as uniform_lib from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops +from 
tensorflow.python.ops.distributions import uniform as uniform_lib def assert_finite(array): diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py index 45d3accdd6c..bdd5571c966 100644 --- a/tensorflow/contrib/distributions/python/ops/chi2.py +++ b/tensorflow/contrib/distributions/python/ops/chi2.py @@ -18,11 +18,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.distributions.python.ops import gamma from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import gamma __all__ = [ diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py index 6d318014adc..f3b09f60f3e 100644 --- a/tensorflow/contrib/distributions/python/ops/mixture.py +++ b/tensorflow/contrib/distributions/python/ops/mixture.py @@ -20,7 +20,6 @@ from __future__ import print_function import numpy as np -from tensorflow.contrib.distributions.python.ops import categorical from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util @@ -29,6 +28,7 @@ from tensorflow.python.ops import check_ops from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops +from tensorflow.python.ops.distributions import categorical from tensorflow.python.ops.distributions import distribution from tensorflow.python.ops.distributions import util as distribution_util diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py index d7115f6f0bc..299ff36962e 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py +++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py @@ -19,13 +19,13 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.distributions.python.ops import bijectors -from tensorflow.contrib.distributions.python.ops import student_t from tensorflow.contrib.distributions.python.ops import transformed_distribution from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops +from tensorflow.python.ops.distributions import student_t from tensorflow.python.ops.distributions import util as distribution_util diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py index d6c0527ad27..bdd7d7ca73e 100644 --- a/tensorflow/contrib/seq2seq/python/ops/helper.py +++ b/tensorflow/contrib/seq2seq/python/ops/helper.py @@ -23,8 +23,6 @@ import abc import six -from tensorflow.contrib.distributions.python.ops import bernoulli -from tensorflow.contrib.distributions.python.ops import categorical from tensorflow.contrib.seq2seq.python.ops import decoder from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -35,6 +33,8 @@ from tensorflow.python.ops import embedding_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import 
tensor_array_ops +from tensorflow.python.ops.distributions import bernoulli +from tensorflow.python.ops.distributions import categorical from tensorflow.python.util import nest __all__ = [ diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD index 3c1a4d5125c..3630adc9549 100644 --- a/tensorflow/python/kernel_tests/distributions/BUILD +++ b/tensorflow/python/kernel_tests/distributions/BUILD @@ -41,6 +41,180 @@ cuda_py_test( ], ) +cuda_py_test( + name = "beta_test", + size = "small", + srcs = ["beta_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:client", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:nn_ops", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_test( + name = "bernoulli_test", + size = "small", + srcs = ["bernoulli_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_test( + name = "categorical_test", + size = "small", + srcs = ["categorical_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + "//tensorflow/python:random_ops", + ], +) + +cuda_py_test( + name = "dirichlet_test", + size = "small", + srcs = ["dirichlet_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_test( + name = "dirichlet_multinomial_test", + size = "medium", + srcs = ["dirichlet_multinomial_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_test( + name = "exponential_test", + srcs = ["exponential_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:client", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:nn_ops", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_test( + name = "gamma_test", + srcs = ["gamma_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:client", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:nn_ops", + "//tensorflow/python:platform_test", + ], 
+) + +cuda_py_test( + name = "laplace_test", + srcs = ["laplace_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:client", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:nn_ops", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_test( + name = "multinomial_test", + srcs = ["multinomial_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + ], +) + +cuda_py_test( + name = "student_t_test", + size = "small", + srcs = ["student_t_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:math_ops", + "//tensorflow/python:nn_ops", + "//tensorflow/python:platform_test", + ], + tags = ["nomsan"], # disable to avoid false positives from scipy. +) + +cuda_py_test( + name = "uniform_test", + size = "small", + srcs = ["uniform_test.py"], + additional_deps = [ + "//tensorflow/python/ops/distributions", + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:errors", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:math_ops", + ], +) + cuda_py_test( name = "normal_test", size = "medium", diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py similarity index 94% rename from tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py rename to tensorflow/python/kernel_tests/distributions/bernoulli_test.py index e8b0eb4eb86..ef93c4dab08 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bernoulli_test.py +++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py @@ -18,15 +18,30 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import importlib + import numpy as np -import scipy.special -from tensorflow.contrib.distributions.python.ops import bernoulli + from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import bernoulli from tensorflow.python.ops.distributions import kullback_leibler from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +special = try_import("scipy.special") def make_bernoulli(batch_shape, dtype=dtypes.int32): @@ -54,13 +69,16 @@ class BernoulliTest(test.TestCase): with self.test_session(): self.assertAllClose(logits, dist.logits.eval()) + if not special: + return + with 
self.test_session(): - self.assertAllClose(scipy.special.expit(logits), dist.probs.eval()) + self.assertAllClose(special.expit(logits), dist.probs.eval()) p = [0.01, 0.99, 0.42] dist = bernoulli.Bernoulli(probs=p) with self.test_session(): - self.assertAllClose(scipy.special.logit(p), dist.logits.eval()) + self.assertAllClose(special.logit(p), dist.logits.eval()) def testInvalidP(self): invalid_ps = [1.01, 2.] @@ -160,7 +178,9 @@ class BernoulliTest(test.TestCase): def testPmfWithP(self): p = [[0.2, 0.4], [0.3, 0.6]] self._testPmf(probs=p) - self._testPmf(logits=scipy.special.logit(p)) + if not special: + return + self._testPmf(logits=special.logit(p)) def testBroadcasting(self): with self.test_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py b/tensorflow/python/kernel_tests/distributions/beta_test.py similarity index 94% rename from tensorflow/contrib/distributions/python/kernel_tests/beta_test.py rename to tensorflow/python/kernel_tests/distributions/beta_test.py index ec16a85991d..91a451f033f 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/beta_test.py +++ b/tensorflow/python/kernel_tests/distributions/beta_test.py @@ -16,18 +16,33 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import importlib + import numpy as np -from scipy import special -from scipy import stats -from tensorflow.contrib.distributions.python.ops import beta as beta_lib + from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.framework import random_seed from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops +from tensorflow.python.ops.distributions import beta as beta_lib from tensorflow.python.ops.distributions import kullback_leibler from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +special = try_import("scipy.special") +stats = try_import("scipy.stats") class BetaTest(test.TestCase): @@ -167,18 +182,22 @@ class BetaTest(test.TestCase): with session.Session(): a = [1., 2, 3] b = [2., 4, 1.2] - expected_mean = stats.beta.mean(a, b) dist = beta_lib.Beta(a, b) self.assertEqual(dist.mean().get_shape(), (3,)) + if not stats: + return + expected_mean = stats.beta.mean(a, b) self.assertAllClose(expected_mean, dist.mean().eval()) def testBetaVariance(self): with session.Session(): a = [1., 2, 3] b = [2., 4, 1.2] - expected_variance = stats.beta.var(a, b) dist = beta_lib.Beta(a, b) self.assertEqual(dist.variance().get_shape(), (3,)) + if not stats: + return + expected_variance = stats.beta.var(a, b) self.assertAllClose(expected_variance, dist.variance().eval()) def testBetaMode(self): @@ -228,9 +247,11 @@ class BetaTest(test.TestCase): with session.Session(): a = [1., 2, 3] b = [2., 4, 1.2] - expected_entropy = stats.beta.entropy(a, b) dist = beta_lib.Beta(a, b) self.assertEqual(dist.entropy().get_shape(), (3,)) + if not stats: + return + expected_entropy = stats.beta.entropy(a, b) self.assertAllClose(expected_entropy, dist.entropy().eval()) def testBetaSample(self): @@ -243,6 +264,8 @@ class BetaTest(test.TestCase): sample_values = samples.eval() 
self.assertEqual(sample_values.shape, (100000,)) self.assertFalse(np.any(sample_values < 0.0)) + if not stats: + return self.assertLess( stats.kstest( # Beta is a univariate distribution. @@ -286,6 +309,8 @@ class BetaTest(test.TestCase): sample_values = samples.eval() self.assertEqual(sample_values.shape, (100000, 3, 2, 2)) self.assertFalse(np.any(sample_values < 0.0)) + if not stats: + return self.assertAllClose( sample_values[:, 1, :].mean(axis=0), stats.beta.mean(a, b)[1, :], @@ -301,6 +326,8 @@ class BetaTest(test.TestCase): actual = beta_lib.Beta(a, b).cdf(x).eval() self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x) self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x) + if not stats: + return self.assertAllClose(stats.beta.cdf(x, a, b), actual, rtol=1e-4, atol=0) def testBetaLogCdf(self): @@ -313,6 +340,8 @@ class BetaTest(test.TestCase): actual = math_ops.exp(beta_lib.Beta(a, b).log_cdf(x)).eval() self.assertAllEqual(np.ones(shape, dtype=np.bool), 0. <= x) self.assertAllEqual(np.ones(shape, dtype=np.bool), 1. >= x) + if not stats: + return self.assertAllClose(stats.beta.cdf(x, a, b), actual, rtol=1e-4, atol=0) def testBetaWithSoftplusConcentration(self): @@ -342,6 +371,8 @@ class BetaTest(test.TestCase): d2_sp = beta_lib.BetaWithSoftplusConcentration(concentration1=a2_sp, concentration0=b2_sp) + if not special: + return kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1) + (a1 - a2) * special.digamma(a1) + (b1 - b2) * special.digamma(b1) + diff --git a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py similarity index 99% rename from tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py rename to tensorflow/python/kernel_tests/distributions/categorical_test.py index 269c02ede3a..bfdb5fa9fe7 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/categorical_test.py +++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py @@ -20,7 +20,6 @@ from __future__ import print_function import numpy as np -from tensorflow.contrib.distributions.python.ops import categorical from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_util @@ -29,6 +28,7 @@ from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops.distributions import categorical from tensorflow.python.ops.distributions import kullback_leibler from tensorflow.python.platform import test diff --git a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py similarity index 99% rename from tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py rename to tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py index bc25366cfa4..d009f4e9319 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_multinomial_test.py +++ b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py @@ -17,14 +17,15 @@ from __future__ import division from __future__ import print_function import numpy as np -from tensorflow.contrib import distributions from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import 
array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import dirichlet_multinomial from tensorflow.python.platform import test -ds = distributions + +ds = dirichlet_multinomial class DirichletMultinomialTest(test.TestCase): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py similarity index 94% rename from tensorflow/contrib/distributions/python/kernel_tests/dirichlet_test.py rename to tensorflow/python/kernel_tests/distributions/dirichlet_test.py index cd634da09dd..a2f1de5aaf3 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/dirichlet_test.py +++ b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py @@ -16,14 +16,29 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import importlib + import numpy as np -from scipy import stats -from tensorflow.contrib.distributions.python.ops import dirichlet as dirichlet_lib + from tensorflow.python.framework import constant_op from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import dirichlet as dirichlet_lib from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +stats = try_import("scipy.stats") class DirichletTest(test.TestCase): @@ -132,9 +147,11 @@ class DirichletTest(test.TestCase): def testMean(self): with self.test_session(): alpha = [1., 2, 3] - expected_mean = stats.dirichlet.mean(alpha) dirichlet = dirichlet_lib.Dirichlet(concentration=alpha) self.assertEqual(dirichlet.mean().get_shape(), [3]) + if not stats: + return + expected_mean = stats.dirichlet.mean(alpha) self.assertAllClose(dirichlet.mean().eval(), expected_mean) def testCovarianceFromSampling(self): @@ -177,11 +194,13 @@ class DirichletTest(test.TestCase): with self.test_session(): alpha = [1., 2, 3] denominator = np.sum(alpha)**2 * (np.sum(alpha) + 1) + dirichlet = dirichlet_lib.Dirichlet(concentration=alpha) + self.assertEqual(dirichlet.covariance().get_shape(), (3, 3)) + if not stats: + return expected_covariance = np.diag(stats.dirichlet.var(alpha)) expected_covariance += [[0., -2, -3], [-2, 0, -6], [-3, -6, 0]] / denominator - dirichlet = dirichlet_lib.Dirichlet(concentration=alpha) - self.assertEqual(dirichlet.covariance().get_shape(), (3, 3)) self.assertAllClose(dirichlet.covariance().eval(), expected_covariance) def testMode(self): @@ -213,9 +232,11 @@ class DirichletTest(test.TestCase): def testEntropy(self): with self.test_session(): alpha = [1., 2, 3] - expected_entropy = stats.dirichlet.entropy(alpha) dirichlet = dirichlet_lib.Dirichlet(concentration=alpha) self.assertEqual(dirichlet.entropy().get_shape(), ()) + if not stats: + return + expected_entropy = stats.dirichlet.entropy(alpha) self.assertAllClose(dirichlet.entropy().eval(), expected_entropy) def testSample(self): @@ -227,6 +248,8 @@ class DirichletTest(test.TestCase): sample_values = samples.eval() self.assertEqual(sample_values.shape, (100000, 2)) self.assertTrue(np.all(sample_values > 0.0)) + if not stats: + return self.assertLess( stats.kstest( # Beta is a univariate distribution. 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py b/tensorflow/python/kernel_tests/distributions/exponential_test.py similarity index 88% rename from tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py rename to tensorflow/python/kernel_tests/distributions/exponential_test.py index 61712024138..7afdf0f9476 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/exponential_test.py +++ b/tensorflow/python/kernel_tests/distributions/exponential_test.py @@ -18,13 +18,28 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import importlib + import numpy as np -from scipy import stats -from tensorflow.contrib.distributions.python.ops import exponential as exponential_lib + from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.ops import nn_ops +from tensorflow.python.ops.distributions import exponential as exponential_lib from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +stats = try_import("scipy.stats") class ExponentialTest(test.TestCase): @@ -36,14 +51,17 @@ class ExponentialTest(test.TestCase): lam_v = 2.0 x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32) exponential = exponential_lib.Exponential(rate=lam) - expected_log_pdf = stats.expon.logpdf(x, scale=1 / lam_v) log_pdf = exponential.log_prob(x) self.assertEqual(log_pdf.get_shape(), (6,)) - self.assertAllClose(log_pdf.eval(), expected_log_pdf) pdf = exponential.prob(x) self.assertEqual(pdf.get_shape(), (6,)) + + if not stats: + return + expected_log_pdf = stats.expon.logpdf(x, scale=1 / lam_v) + self.assertAllClose(log_pdf.eval(), expected_log_pdf) self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf)) def testExponentialCDF(self): @@ -54,34 +72,43 @@ class ExponentialTest(test.TestCase): x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32) exponential = exponential_lib.Exponential(rate=lam) - expected_cdf = stats.expon.cdf(x, scale=1 / lam_v) cdf = exponential.cdf(x) self.assertEqual(cdf.get_shape(), (6,)) + + if not stats: + return + expected_cdf = stats.expon.cdf(x, scale=1 / lam_v) self.assertAllClose(cdf.eval(), expected_cdf) def testExponentialMean(self): with session.Session(): lam_v = np.array([1.0, 4.0, 2.5]) - expected_mean = stats.expon.mean(scale=1 / lam_v) exponential = exponential_lib.Exponential(rate=lam_v) self.assertEqual(exponential.mean().get_shape(), (3,)) + if not stats: + return + expected_mean = stats.expon.mean(scale=1 / lam_v) self.assertAllClose(exponential.mean().eval(), expected_mean) def testExponentialVariance(self): with session.Session(): lam_v = np.array([1.0, 4.0, 2.5]) - expected_variance = stats.expon.var(scale=1 / lam_v) exponential = exponential_lib.Exponential(rate=lam_v) self.assertEqual(exponential.variance().get_shape(), (3,)) + if not stats: + return + expected_variance = stats.expon.var(scale=1 / lam_v) self.assertAllClose(exponential.variance().eval(), expected_variance) def testExponentialEntropy(self): with session.Session(): lam_v = np.array([1.0, 4.0, 2.5]) - expected_entropy = stats.expon.entropy(scale=1 / lam_v) exponential = exponential_lib.Exponential(rate=lam_v) 
self.assertEqual(exponential.entropy().get_shape(), (3,)) + if not stats: + return + expected_entropy = stats.expon.entropy(scale=1 / lam_v) self.assertAllClose(exponential.entropy().eval(), expected_entropy) def testExponentialSample(self): @@ -95,6 +122,8 @@ class ExponentialTest(test.TestCase): sample_values = samples.eval() self.assertEqual(sample_values.shape, (100000, 2)) self.assertFalse(np.any(sample_values < 0.0)) + if not stats: + return for i in range(2): self.assertLess( stats.kstest( @@ -116,6 +145,8 @@ class ExponentialTest(test.TestCase): sample_values = samples.eval() self.assertFalse(np.any(sample_values < 0.0)) + if not stats: + return for i in range(2): self.assertLess( stats.kstest( diff --git a/tensorflow/contrib/distributions/python/kernel_tests/gamma_test.py b/tensorflow/python/kernel_tests/distributions/gamma_test.py similarity index 93% rename from tensorflow/contrib/distributions/python/kernel_tests/gamma_test.py rename to tensorflow/python/kernel_tests/distributions/gamma_test.py index 5ccf2308a51..5e4813ac076 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/gamma_test.py +++ b/tensorflow/python/kernel_tests/distributions/gamma_test.py @@ -17,18 +17,32 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np -from scipy import special -from scipy import stats +import importlib + +import numpy as np -from tensorflow.contrib.distributions.python.ops import gamma as gamma_lib from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops +from tensorflow.python.ops.distributions import gamma as gamma_lib from tensorflow.python.ops.distributions import kullback_leibler from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +special = try_import("scipy.special") +stats = try_import("scipy.stats") class GammaTest(test.TestCase): @@ -53,13 +67,14 @@ class GammaTest(test.TestCase): beta_v = 3.0 x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32) gamma = gamma_lib.Gamma(concentration=alpha, rate=beta) - expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v) log_pdf = gamma.log_prob(x) self.assertEqual(log_pdf.get_shape(), (6,)) - self.assertAllClose(log_pdf.eval(), expected_log_pdf) - pdf = gamma.prob(x) self.assertEqual(pdf.get_shape(), (6,)) + if not stats: + return + expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v) + self.assertAllClose(log_pdf.eval(), expected_log_pdf) self.assertAllClose(pdf.eval(), np.exp(expected_log_pdf)) def testGammaLogPDFMultidimensional(self): @@ -71,15 +86,16 @@ class GammaTest(test.TestCase): beta_v = np.array([3.0, 4.0]) x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T gamma = gamma_lib.Gamma(concentration=alpha, rate=beta) - expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v) log_pdf = gamma.log_prob(x) log_pdf_values = log_pdf.eval() self.assertEqual(log_pdf.get_shape(), (6, 2)) - self.assertAllClose(log_pdf_values, expected_log_pdf) - pdf = gamma.prob(x) pdf_values = pdf.eval() self.assertEqual(pdf.get_shape(), (6, 2)) + if not stats: 
+ return + expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v) + self.assertAllClose(log_pdf_values, expected_log_pdf) self.assertAllClose(pdf_values, np.exp(expected_log_pdf)) def testGammaLogPDFMultidimensionalBroadcasting(self): @@ -91,15 +107,17 @@ class GammaTest(test.TestCase): beta_v = 3.0 x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T gamma = gamma_lib.Gamma(concentration=alpha, rate=beta) - expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v) log_pdf = gamma.log_prob(x) log_pdf_values = log_pdf.eval() self.assertEqual(log_pdf.get_shape(), (6, 2)) - self.assertAllClose(log_pdf_values, expected_log_pdf) - pdf = gamma.prob(x) pdf_values = pdf.eval() self.assertEqual(pdf.get_shape(), (6, 2)) + + if not stats: + return + expected_log_pdf = stats.gamma.logpdf(x, alpha_v, scale=1 / beta_v) + self.assertAllClose(log_pdf_values, expected_log_pdf) self.assertAllClose(pdf_values, np.exp(expected_log_pdf)) def testGammaCDF(self): @@ -112,10 +130,11 @@ class GammaTest(test.TestCase): x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32) gamma = gamma_lib.Gamma(concentration=alpha, rate=beta) - expected_cdf = stats.gamma.cdf(x, alpha_v, scale=1 / beta_v) - cdf = gamma.cdf(x) self.assertEqual(cdf.get_shape(), (6,)) + if not stats: + return + expected_cdf = stats.gamma.cdf(x, alpha_v, scale=1 / beta_v) self.assertAllClose(cdf.eval(), expected_cdf) def testGammaMean(self): @@ -123,8 +142,10 @@ class GammaTest(test.TestCase): alpha_v = np.array([1.0, 3.0, 2.5]) beta_v = np.array([1.0, 4.0, 5.0]) gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v) - expected_means = stats.gamma.mean(alpha_v, scale=1 / beta_v) self.assertEqual(gamma.mean().get_shape(), (3,)) + if not stats: + return + expected_means = stats.gamma.mean(alpha_v, scale=1 / beta_v) self.assertAllClose(gamma.mean().eval(), expected_means) def testGammaModeAllowNanStatsIsFalseWorksWhenAllBatchMembersAreDefined(self): @@ -165,8 +186,10 @@ class GammaTest(test.TestCase): alpha_v = np.array([1.0, 3.0, 2.5]) beta_v = np.array([1.0, 4.0, 5.0]) gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v) - expected_variances = stats.gamma.var(alpha_v, scale=1 / beta_v) self.assertEqual(gamma.variance().get_shape(), (3,)) + if not stats: + return + expected_variances = stats.gamma.var(alpha_v, scale=1 / beta_v) self.assertAllClose(gamma.variance().eval(), expected_variances) def testGammaStd(self): @@ -174,17 +197,21 @@ class GammaTest(test.TestCase): alpha_v = np.array([1.0, 3.0, 2.5]) beta_v = np.array([1.0, 4.0, 5.0]) gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v) - expected_stddev = stats.gamma.std(alpha_v, scale=1. / beta_v) self.assertEqual(gamma.stddev().get_shape(), (3,)) + if not stats: + return + expected_stddev = stats.gamma.std(alpha_v, scale=1. 
/ beta_v) self.assertAllClose(gamma.stddev().eval(), expected_stddev) def testGammaEntropy(self): with self.test_session(): alpha_v = np.array([1.0, 3.0, 2.5]) beta_v = np.array([1.0, 4.0, 5.0]) - expected_entropy = stats.gamma.entropy(alpha_v, scale=1 / beta_v) gamma = gamma_lib.Gamma(concentration=alpha_v, rate=beta_v) self.assertEqual(gamma.entropy().get_shape(), (3,)) + if not stats: + return + expected_entropy = stats.gamma.entropy(alpha_v, scale=1 / beta_v) self.assertAllClose(gamma.entropy().eval(), expected_entropy) def testGammaSampleSmallAlpha(self): @@ -199,6 +226,9 @@ class GammaTest(test.TestCase): sample_values = samples.eval() self.assertEqual(samples.get_shape(), (n,)) self.assertEqual(sample_values.shape, (n,)) + self.assertTrue(self._kstest(alpha_v, beta_v, sample_values)) + if not stats: + return self.assertAllClose( sample_values.mean(), stats.gamma.mean( @@ -208,7 +238,6 @@ class GammaTest(test.TestCase): sample_values.var(), stats.gamma.var(alpha_v, scale=1 / beta_v), atol=.15) - self.assertTrue(self._kstest(alpha_v, beta_v, sample_values)) def testGammaSample(self): with session.Session(): @@ -222,6 +251,9 @@ class GammaTest(test.TestCase): sample_values = samples.eval() self.assertEqual(samples.get_shape(), (n,)) self.assertEqual(sample_values.shape, (n,)) + self.assertTrue(self._kstest(alpha_v, beta_v, sample_values)) + if not stats: + return self.assertAllClose( sample_values.mean(), stats.gamma.mean( @@ -231,7 +263,6 @@ class GammaTest(test.TestCase): sample_values.var(), stats.gamma.var(alpha_v, scale=1 / beta_v), atol=.15) - self.assertTrue(self._kstest(alpha_v, beta_v, sample_values)) def testGammaSampleMultiDimensional(self): with session.Session(): @@ -246,6 +277,8 @@ class GammaTest(test.TestCase): zeros = np.zeros_like(alpha_v + beta_v) # 10 x 100 alpha_bc = alpha_v + zeros beta_bc = beta_v + zeros + if not stats: + return self.assertAllClose( sample_values.mean(axis=0), stats.gamma.mean( @@ -266,6 +299,8 @@ class GammaTest(test.TestCase): def _kstest(self, alpha, beta, samples): # Uses the Kolmogorov-Smirnov test for goodness of fit. + if not stats: + return True # If we can't test, return that the test passes. ks, _ = stats.kstest(samples, stats.gamma(alpha, scale=1 / beta).cdf) # Return True when the test passes. return ks < 0.02 @@ -279,6 +314,12 @@ class GammaTest(test.TestCase): sample_vals, pdf_vals = sess.run([samples, pdfs]) self.assertEqual(samples.get_shape(), (num, 2, 2)) self.assertEqual(pdfs.get_shape(), (num, 2, 2)) + self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02) + self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02) + self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02) + self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02) + if not stats: + return self.assertAllClose( stats.gamma.mean( [[7., 11.], [7., 11.]], scale=1 / np.array([[5., 5.], [6., 6.]])), @@ -289,10 +330,6 @@ class GammaTest(test.TestCase): scale=1 / np.array([[5., 5.], [6., 6.]])), sample_vals.var(axis=0), atol=.1) - self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02) - self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02) - self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02) - self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02) def _assertIntegral(self, sample_vals, pdf_vals, err=1e-3): s_p = zip(sample_vals, pdf_vals) @@ -350,6 +387,10 @@ class GammaTest(test.TestCase): # Execute graph. 
[kl_sample_, kl_actual_] = sess.run([kl_sample, kl_actual]) + self.assertEqual(beta0.shape, kl_actual.get_shape()) + + if not special: + return kl_expected = ((alpha0 - alpha1) * special.digamma(alpha0) + special.gammaln(alpha1) - special.gammaln(alpha0) @@ -357,7 +398,6 @@ class GammaTest(test.TestCase): - alpha1 * np.log(beta1) + alpha0 * (beta1 / beta0 - 1.)) - self.assertEqual(beta0.shape, kl_actual.get_shape()) self.assertAllClose(kl_expected, kl_actual_, atol=0., rtol=1e-6) self.assertAllClose(kl_sample_, kl_actual_, atol=0., rtol=1e-2) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/laplace_test.py b/tensorflow/python/kernel_tests/distributions/laplace_test.py similarity index 92% rename from tensorflow/contrib/distributions/python/kernel_tests/laplace_test.py rename to tensorflow/python/kernel_tests/distributions/laplace_test.py index 1f58d495f02..55577386c45 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/laplace_test.py +++ b/tensorflow/python/kernel_tests/distributions/laplace_test.py @@ -17,15 +17,31 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import importlib + import numpy as np -from scipy import stats -from tensorflow.contrib.distributions.python.ops import laplace as laplace_lib + from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import nn_ops +from tensorflow.python.ops.distributions import laplace as laplace_lib from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +stats = try_import("scipy.stats") + class LaplaceTest(test.TestCase): @@ -49,9 +65,11 @@ class LaplaceTest(test.TestCase): scale_v = 3.0 x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32) laplace = laplace_lib.Laplace(loc=loc, scale=scale) - expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v) log_pdf = laplace.log_prob(x) self.assertEqual(log_pdf.get_shape(), (6,)) + if not stats: + return + expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v) self.assertAllClose(log_pdf.eval(), expected_log_pdf) pdf = laplace.prob(x) @@ -67,15 +85,17 @@ class LaplaceTest(test.TestCase): scale_v = np.array([3.0, 4.0]) x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T laplace = laplace_lib.Laplace(loc=loc, scale=scale) - expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v) log_pdf = laplace.log_prob(x) log_pdf_values = log_pdf.eval() self.assertEqual(log_pdf.get_shape(), (6, 2)) - self.assertAllClose(log_pdf_values, expected_log_pdf) pdf = laplace.prob(x) pdf_values = pdf.eval() self.assertEqual(pdf.get_shape(), (6, 2)) + if not stats: + return + expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v) + self.assertAllClose(log_pdf_values, expected_log_pdf) self.assertAllClose(pdf_values, np.exp(expected_log_pdf)) def testLaplaceLogPDFMultidimensionalBroadcasting(self): @@ -87,15 +107,17 @@ class LaplaceTest(test.TestCase): scale_v = 3.0 x = np.array([[2.5, 2.5, 4.0, 0.1, 1.0, 2.0]], dtype=np.float32).T laplace = laplace_lib.Laplace(loc=loc, scale=scale) - expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v) log_pdf = laplace.log_prob(x) 
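[Note: the laplace test above, like the gamma test before it, swaps the hard "from scipy import stats" for the try_import helper, which logs a warning and returns None when the module is missing. A reduced sketch of the same pattern, with the standard-library logging module standing in for tf_logging; that substitution is an assumption of this sketch, not part of the patch:

    import importlib
    import logging

    def try_import(name):
        module = None
        try:
            module = importlib.import_module(name)
        except ImportError as e:
            logging.warning("Could not import %s: %s", name, e)
        return module

    # Tests then gate their value checks on the result being truthy:
    stats = try_import("scipy.stats")
]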
log_pdf_values = log_pdf.eval() self.assertEqual(log_pdf.get_shape(), (6, 2)) - self.assertAllClose(log_pdf_values, expected_log_pdf) pdf = laplace.prob(x) pdf_values = pdf.eval() self.assertEqual(pdf.get_shape(), (6, 2)) + if not stats: + return + expected_log_pdf = stats.laplace.logpdf(x, loc_v, scale=scale_v) + self.assertAllClose(log_pdf_values, expected_log_pdf) self.assertAllClose(pdf_values, np.exp(expected_log_pdf)) def testLaplaceCDF(self): @@ -108,10 +130,12 @@ class LaplaceTest(test.TestCase): x = np.array([2.5, 2.5, 4.0, 0.1, 1.0, 2.0], dtype=np.float32) laplace = laplace_lib.Laplace(loc=loc, scale=scale) - expected_cdf = stats.laplace.cdf(x, loc_v, scale=scale_v) cdf = laplace.cdf(x) self.assertEqual(cdf.get_shape(), (6,)) + if not stats: + return + expected_cdf = stats.laplace.cdf(x, loc_v, scale=scale_v) self.assertAllClose(cdf.eval(), expected_cdf) def testLaplaceLogCDF(self): @@ -124,10 +148,12 @@ class LaplaceTest(test.TestCase): x = np.array([-2.5, 2.5, -4.0, 0.1, 1.0, 2.0], dtype=np.float32) laplace = laplace_lib.Laplace(loc=loc, scale=scale) - expected_cdf = stats.laplace.logcdf(x, loc_v, scale=scale_v) cdf = laplace.log_cdf(x) self.assertEqual(cdf.get_shape(), (6,)) + if not stats: + return + expected_cdf = stats.laplace.logcdf(x, loc_v, scale=scale_v) self.assertAllClose(cdf.eval(), expected_cdf) def testLaplaceLogSurvivalFunction(self): @@ -140,10 +166,12 @@ class LaplaceTest(test.TestCase): x = np.array([-2.5, 2.5, -4.0, 0.1, 1.0, 2.0], dtype=np.float32) laplace = laplace_lib.Laplace(loc=loc, scale=scale) - expected_sf = stats.laplace.logsf(x, loc_v, scale=scale_v) sf = laplace.log_survival_function(x) self.assertEqual(sf.get_shape(), (6,)) + if not stats: + return + expected_sf = stats.laplace.logsf(x, loc_v, scale=scale_v) self.assertAllClose(sf.eval(), expected_sf) def testLaplaceMean(self): @@ -151,8 +179,10 @@ class LaplaceTest(test.TestCase): loc_v = np.array([1.0, 3.0, 2.5]) scale_v = np.array([1.0, 4.0, 5.0]) laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v) - expected_means = stats.laplace.mean(loc_v, scale=scale_v) self.assertEqual(laplace.mean().get_shape(), (3,)) + if not stats: + return + expected_means = stats.laplace.mean(loc_v, scale=scale_v) self.assertAllClose(laplace.mean().eval(), expected_means) def testLaplaceMode(self): @@ -168,8 +198,10 @@ class LaplaceTest(test.TestCase): loc_v = np.array([1.0, 3.0, 2.5]) scale_v = np.array([1.0, 4.0, 5.0]) laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v) - expected_variances = stats.laplace.var(loc_v, scale=scale_v) self.assertEqual(laplace.variance().get_shape(), (3,)) + if not stats: + return + expected_variances = stats.laplace.var(loc_v, scale=scale_v) self.assertAllClose(laplace.variance().eval(), expected_variances) def testLaplaceStd(self): @@ -177,17 +209,21 @@ class LaplaceTest(test.TestCase): loc_v = np.array([1.0, 3.0, 2.5]) scale_v = np.array([1.0, 4.0, 5.0]) laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v) - expected_stddev = stats.laplace.std(loc_v, scale=scale_v) self.assertEqual(laplace.stddev().get_shape(), (3,)) + if not stats: + return + expected_stddev = stats.laplace.std(loc_v, scale=scale_v) self.assertAllClose(laplace.stddev().eval(), expected_stddev) def testLaplaceEntropy(self): with self.test_session(): loc_v = np.array([1.0, 3.0, 2.5]) scale_v = np.array([1.0, 4.0, 5.0]) - expected_entropy = stats.laplace.entropy(loc_v, scale=scale_v) laplace = laplace_lib.Laplace(loc=loc_v, scale=scale_v) self.assertEqual(laplace.entropy().get_shape(), (3,)) + if not stats: + 
return + expected_entropy = stats.laplace.entropy(loc_v, scale=scale_v) self.assertAllClose(laplace.entropy().eval(), expected_entropy) def testLaplaceSample(self): @@ -202,6 +238,8 @@ class LaplaceTest(test.TestCase): sample_values = samples.eval() self.assertEqual(samples.get_shape(), (n,)) self.assertEqual(sample_values.shape, (n,)) + if not stats: + return self.assertAllClose( sample_values.mean(), stats.laplace.mean( @@ -228,6 +266,8 @@ class LaplaceTest(test.TestCase): zeros = np.zeros_like(loc_v + scale_v) # 10 x 100 loc_bc = loc_v + zeros scale_bc = scale_v + zeros + if not stats: + return self.assertAllClose( sample_values.mean(axis=0), stats.laplace.mean( @@ -250,6 +290,8 @@ class LaplaceTest(test.TestCase): def _kstest(self, loc, scale, samples): # Uses the Kolmogorov-Smirnov test for goodness of fit. + if not stats: + return True # If scipy isn't available, return "True" for passing ks, _ = stats.kstest(samples, stats.laplace(loc, scale=scale).cdf) # Return True when the test passes. return ks < 0.02 @@ -263,6 +305,12 @@ class LaplaceTest(test.TestCase): sample_vals, pdf_vals = sess.run([samples, pdfs]) self.assertEqual(samples.get_shape(), (num, 2, 2)) self.assertEqual(pdfs.get_shape(), (num, 2, 2)) + self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02) + self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02) + self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02) + self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02) + if not stats: + return self.assertAllClose( stats.laplace.mean( [[7., 11.], [7., 11.]], scale=np.array([[5., 5.], [6., 6.]])), @@ -275,10 +323,6 @@ class LaplaceTest(test.TestCase): sample_vals.var(axis=0), rtol=0.05, atol=0.) - self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02) - self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02) - self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02) - self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02) def _assertIntegral(self, sample_vals, pdf_vals, err=1e-3): s_p = zip(sample_vals, pdf_vals) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py similarity index 87% rename from tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py rename to tensorflow/python/kernel_tests/distributions/multinomial_test.py index b1c0c9f7a9d..80caf10391d 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/multinomial_test.py +++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py @@ -17,15 +17,14 @@ from __future__ import division from __future__ import print_function import numpy as np -from tensorflow.contrib import distributions + from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import multinomial from tensorflow.python.platform import test -ds = distributions - class MultinomialTest(test.TestCase): @@ -35,7 +34,7 @@ class MultinomialTest(test.TestCase): def testSimpleShapes(self): with self.test_session(): p = [.1, .3, .6] - dist = ds.Multinomial(total_count=1., probs=p) + dist = multinomial.Multinomial(total_count=1., probs=p) self.assertEqual(3, dist.event_shape_tensor().eval()) self.assertAllEqual([], dist.batch_shape_tensor().eval()) 
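[Note: the _kstest helpers in the gamma and laplace files above get the same treatment: when scipy is unavailable they return True, so the sampling tests degrade to shape-only checks instead of failing. A sketch of that contract, with the 0.02 threshold copied from the tests (it presumes the large sample sizes used there):

    def kstest_passes(loc, scale, samples, stats_module):
        # Without scipy, report a vacuous pass rather than an error.
        if not stats_module:
            return True
        ks, _ = stats_module.kstest(
            samples, stats_module.laplace(loc, scale=scale).cdf)
        return ks < 0.02
]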
self.assertEqual(tensor_shape.TensorShape([3]), dist.event_shape) @@ -45,7 +44,7 @@ class MultinomialTest(test.TestCase): with self.test_session(): p = 0.5 * np.ones([3, 2, 2], dtype=np.float32) n = [[3., 2], [4, 5], [6, 7]] - dist = ds.Multinomial(total_count=n, probs=p) + dist = multinomial.Multinomial(total_count=n, probs=p) self.assertEqual(2, dist.event_shape_tensor().eval()) self.assertAllEqual([3, 2], dist.batch_shape_tensor().eval()) self.assertEqual(tensor_shape.TensorShape([2]), dist.event_shape) @@ -55,14 +54,14 @@ class MultinomialTest(test.TestCase): p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]] n = [[3.], [4]] with self.test_session(): - dist = ds.Multinomial(total_count=n, probs=p) + dist = multinomial.Multinomial(total_count=n, probs=p) self.assertEqual((2, 1), dist.total_count.get_shape()) self.assertAllClose(n, dist.total_count.eval()) def testP(self): p = [[0.1, 0.2, 0.7]] with self.test_session(): - dist = ds.Multinomial(total_count=3., probs=p) + dist = multinomial.Multinomial(total_count=3., probs=p) self.assertEqual((1, 3), dist.probs.get_shape()) self.assertEqual((1, 3), dist.logits.get_shape()) self.assertAllClose(p, dist.probs.eval()) @@ -71,7 +70,7 @@ class MultinomialTest(test.TestCase): p = np.array([[0.1, 0.2, 0.7]], dtype=np.float32) logits = np.log(p) - 50. with self.test_session(): - multinom = ds.Multinomial(total_count=3., logits=logits) + multinom = multinomial.Multinomial(total_count=3., logits=logits) self.assertEqual((1, 3), multinom.probs.get_shape()) self.assertEqual((1, 3), multinom.logits.get_shape()) self.assertAllClose(p, multinom.probs.eval()) @@ -81,7 +80,7 @@ class MultinomialTest(test.TestCase): p = [[0.1, 0.2, 0.7]] n = [[5.]] with self.test_session(): - dist = ds.Multinomial(total_count=n, probs=p, validate_args=True) + dist = multinomial.Multinomial(total_count=n, probs=p, validate_args=True) dist.prob([2., 3, 0]).eval() dist.prob([3., 0, 2]).eval() with self.assertRaisesOpError("must be non-negative"): @@ -94,7 +93,8 @@ class MultinomialTest(test.TestCase): n = [[5.]] with self.test_session(): # No errors with integer n. - multinom = ds.Multinomial(total_count=n, probs=p, validate_args=True) + multinom = multinomial.Multinomial( + total_count=n, probs=p, validate_args=True) multinom.prob([2., 1, 2]).eval() multinom.prob([3., 0, 2]).eval() # Counts don't sum to n. @@ -106,7 +106,8 @@ class MultinomialTest(test.TestCase): "cannot contain fractional components."): multinom.prob(x).eval(feed_dict={x: [1.0, 2.5, 1.5]}) - multinom = ds.Multinomial(total_count=n, probs=p, validate_args=False) + multinom = multinomial.Multinomial( + total_count=n, probs=p, validate_args=False) multinom.prob([1., 2., 2.]).eval() # Non-integer arguments work. multinom.prob([1.0, 2.5, 1.5]).eval() @@ -116,7 +117,7 @@ class MultinomialTest(test.TestCase): # Both zero-batches. No broadcast p = [0.5, 0.5] counts = [1., 0] - pmf = ds.Multinomial(total_count=1., probs=p).prob(counts) + pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts) self.assertAllClose(0.5, pmf.eval()) self.assertEqual((), pmf.get_shape()) @@ -125,7 +126,7 @@ class MultinomialTest(test.TestCase): # Both zero-batches. No broadcast p = [0.1, 0.9] counts = [3., 2] - dist = ds.Multinomial(total_count=5., probs=p) + dist = multinomial.Multinomial(total_count=5., probs=p) pmf = dist.prob(counts) # 5 choose 3 = 5 choose 2 = 10. 10 * (.9)^2 * (.1)^3 = 81/10000. self.assertAllClose(81. 
/ 10000, pmf.eval()) @@ -135,7 +136,7 @@ class MultinomialTest(test.TestCase): with self.test_session(): p = [[0.1, 0.9]] counts = [[1., 0], [0, 1]] - pmf = ds.Multinomial(total_count=1., probs=p).prob(counts) + pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts) self.assertAllClose([0.1, 0.9], pmf.eval()) self.assertEqual((2), pmf.get_shape()) @@ -143,7 +144,7 @@ class MultinomialTest(test.TestCase): with self.test_session(): p = [0.1, 0.9] counts = [[1., 0], [0, 1]] - pmf = ds.Multinomial(total_count=1., probs=p).prob(counts) + pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts) self.assertAllClose([0.1, 0.9], pmf.eval()) self.assertEqual((2), pmf.get_shape()) @@ -151,7 +152,7 @@ class MultinomialTest(test.TestCase): with self.test_session(): p = [[0.1, 0.9], [0.7, 0.3]] counts = [[1., 0]] - pmf = ds.Multinomial(total_count=1., probs=p).prob(counts) + pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts) self.assertAllClose(pmf.eval(), [0.1, 0.7]) self.assertEqual((2), pmf.get_shape()) @@ -159,7 +160,7 @@ class MultinomialTest(test.TestCase): with self.test_session(): p = [[0.1, 0.9], [0.7, 0.3]] counts = [1., 0] - pmf = ds.Multinomial(total_count=1., probs=p).prob(counts) + pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts) self.assertAllClose(pmf.eval(), [0.1, 0.7]) self.assertEqual(pmf.get_shape(), (2)) @@ -171,7 +172,7 @@ class MultinomialTest(test.TestCase): n = [[3., 3], [3, 3]] # [2] counts = [2., 1] - pmf = ds.Multinomial(total_count=n, probs=p).prob(counts) + pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts) pmf.eval() self.assertEqual(pmf.get_shape(), (2, 2)) @@ -180,7 +181,7 @@ class MultinomialTest(test.TestCase): p = [0.1, 0.9] counts = [3., 2] n = np.full([4, 3], 5., dtype=np.float32) - pmf = ds.Multinomial(total_count=n, probs=p).prob(counts) + pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts) pmf.eval() self.assertEqual((4, 3), pmf.get_shape()) @@ -188,7 +189,7 @@ class MultinomialTest(test.TestCase): with self.test_session(): n = 5. p = [0.1, 0.2, 0.7] - dist = ds.Multinomial(total_count=n, probs=p) + dist = multinomial.Multinomial(total_count=n, probs=p) expected_means = 5 * np.array(p, dtype=np.float32) self.assertEqual((3,), dist.mean().get_shape()) self.assertAllClose(expected_means, dist.mean().eval()) @@ -197,7 +198,7 @@ class MultinomialTest(test.TestCase): with self.test_session(): n = 5. p = [0.1, 0.2, 0.7] - dist = ds.Multinomial(total_count=n, probs=p) + dist = multinomial.Multinomial(total_count=n, probs=p) expected_covariances = [[9. / 20, -1 / 10, -7 / 20], [-1 / 10, 4 / 5, -7 / 10], [-7 / 20, -7 / 10, 21 / 20]] @@ -210,7 +211,7 @@ class MultinomialTest(test.TestCase): n = [5.] * 2 # Shape [4, 1, 2] p = [[[0.1, 0.9]], [[0.1, 0.9]]] * 2 - dist = ds.Multinomial(total_count=n, probs=p) + dist = multinomial.Multinomial(total_count=n, probs=p) # Shape [2, 2] inner_var = [[9. / 20, -9 / 20], [-9 / 20, 9 / 20]] # Shape [4, 2, 2, 2] @@ -228,8 +229,8 @@ class MultinomialTest(test.TestCase): ns2 = np.random.randint(low=1, high=11, size=[6, 1]).astype(np.float32) with self.test_session(): - dist = ds.Multinomial(ns, p) - dist2 = ds.Multinomial(ns2, p2) + dist = multinomial.Multinomial(ns, p) + dist2 = multinomial.Multinomial(ns2, p2) covariance = dist.covariance() covariance2 = dist2.covariance() @@ -246,7 +247,8 @@ class MultinomialTest(test.TestCase): # doesn't support different total counts. 
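[Note: the hard-coded expected_covariances in the hunks above follow the multinomial closed form Cov(X) = n * (diag(p) - p p^T), and the testSampleUnbiased* cases check the sampler against the same moments empirically. A numpy verification of both; the sample count and tolerances below are illustrative:

    import numpy as np

    n, p = 5.0, np.array([0.1, 0.2, 0.7])
    cov = n * (np.diag(p) - np.outer(p, p))
    assert np.allclose(cov, [[9. / 20, -1. / 10, -7. / 20],
                             [-1. / 10, 4. / 5, -7. / 10],
                             [-7. / 20, -7. / 10, 21. / 20]])

    # Empirical check in the spirit of testSampleUnbiasedScalarBatch.
    rng = np.random.RandomState(0)
    x = rng.multinomial(5, p, size=250000).astype(np.float64)
    assert np.allclose(x.mean(axis=0), n * p, rtol=0.02)
    assert np.allclose(np.cov(x, rowvar=False), cov, atol=0.05)
]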
n = np.float32(5) with self.test_session() as sess: - dist = ds.Multinomial(n, theta) # batch_shape=[2], event_shape=[3] + # batch_shape=[2], event_shape=[3] + dist = multinomial.Multinomial(n, theta) x = dist.sample(int(250e3), seed=1) sample_mean = math_ops.reduce_mean(x, 0) x_centered = x - sample_mean[array_ops.newaxis, ...] @@ -281,7 +283,7 @@ class MultinomialTest(test.TestCase): def testSampleUnbiasedNonScalarBatch(self): with self.test_session() as sess: - dist = ds.Multinomial( + dist = multinomial.Multinomial( total_count=5., logits=math_ops.log(2. * self._rng.rand(4, 3, 2).astype(np.float32))) n = int(3e3) @@ -310,7 +312,7 @@ class MultinomialTest(test.TestCase): def testSampleUnbiasedScalarBatch(self): with self.test_session() as sess: - dist = ds.Multinomial( + dist = multinomial.Multinomial( total_count=5., logits=math_ops.log(2. * self._rng.rand(4).astype(np.float32))) n = int(5e3) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/student_t_test.py b/tensorflow/python/kernel_tests/distributions/student_t_test.py similarity index 83% rename from tensorflow/contrib/distributions/python/kernel_tests/student_t_test.py rename to tensorflow/python/kernel_tests/distributions/student_t_test.py index 209ef696caa..f1150de58e0 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/student_t_test.py +++ b/tensorflow/python/kernel_tests/distributions/student_t_test.py @@ -18,19 +18,30 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import importlib import math import numpy as np -from scipy import stats -from tensorflow.contrib import distributions -from tensorflow.contrib.distributions.python.ops import student_t + from tensorflow.python.framework import constant_op from tensorflow.python.framework import random_seed from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops +from tensorflow.python.ops.distributions import student_t from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging -ds = distributions + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +stats = try_import("scipy.stats") class StudentTTest(test.TestCase): @@ -45,7 +56,7 @@ class StudentTTest(test.TestCase): mu_v = 7. sigma_v = 8. 
t = np.array([-2.5, 2.5, 8., 0., -1., 2.], dtype=np.float32) - student = ds.StudentT(df, loc=mu, scale=-sigma) + student = student_t.StudentT(df, loc=mu, scale=-sigma) log_pdf = student.log_prob(t) self.assertEquals(log_pdf.get_shape(), (6,)) @@ -54,6 +65,9 @@ class StudentTTest(test.TestCase): self.assertEquals(pdf.get_shape(), (6,)) pdf_values = pdf.eval() + if not stats: + return + expected_log_pdf = stats.t.logpdf(t, df_v, loc=mu_v, scale=sigma_v) expected_pdf = stats.t.pdf(t, df_v, loc=mu_v, scale=sigma_v) self.assertAllClose(expected_log_pdf, log_pdf_values) @@ -72,13 +86,16 @@ class StudentTTest(test.TestCase): mu_v = np.array([3., -3.]) sigma_v = np.array([np.sqrt(10.), np.sqrt(15.)]) t = np.array([[-2.5, 2.5, 4., 0., -1., 2.]], dtype=np.float32).T - student = ds.StudentT(df, loc=mu, scale=sigma) + student = student_t.StudentT(df, loc=mu, scale=sigma) log_pdf = student.log_prob(t) log_pdf_values = log_pdf.eval() self.assertEqual(log_pdf.get_shape(), (6, 2)) pdf = student.prob(t) pdf_values = pdf.eval() self.assertEqual(pdf.get_shape(), (6, 2)) + + if not stats: + return expected_log_pdf = stats.t.logpdf(t, df_v, loc=mu_v, scale=sigma_v) expected_pdf = stats.t.pdf(t, df_v, loc=mu_v, scale=sigma_v) self.assertAllClose(expected_log_pdf, log_pdf_values) @@ -105,6 +122,8 @@ class StudentTTest(test.TestCase): self.assertEquals(cdf.get_shape(), (6,)) cdf_values = cdf.eval() + if not stats: + return expected_log_cdf = stats.t.logcdf(t, df_v, loc=mu_v, scale=sigma_v) expected_cdf = stats.t.cdf(t, df_v, loc=mu_v, scale=sigma_v) self.assertAllClose(expected_log_cdf, log_cdf_values, atol=0., rtol=1e-5) @@ -119,7 +138,7 @@ class StudentTTest(test.TestCase): mu_v = np.array([[1., -1, 0]]) # 1x3 sigma_v = np.array([[1., -2., 3.]]).T # transposed => 3x1 with self.test_session(): - student = ds.StudentT(df=df_v, loc=mu_v, scale=sigma_v) + student = student_t.StudentT(df=df_v, loc=mu_v, scale=sigma_v) ent = student.entropy() ent_values = ent.eval() @@ -128,6 +147,8 @@ class StudentTTest(test.TestCase): sigma_bc = np.abs(sigma_v) * ones mu_bc = ones.T * mu_v df_bc = ones.T * df_v + if not stats: + return expected_entropy = stats.t.entropy( np.reshape(df_bc, [-1]), loc=np.reshape(mu_bc, [-1]), @@ -144,7 +165,7 @@ class StudentTTest(test.TestCase): mu_v = 3. sigma_v = np.sqrt(10.) n = constant_op.constant(200000) - student = ds.StudentT(df=df, loc=mu, scale=sigma) + student = student_t.StudentT(df=df, loc=mu, scale=sigma) samples = student.sample(n, seed=123456) sample_values = samples.eval() n_val = 200000 @@ -166,11 +187,13 @@ class StudentTTest(test.TestCase): n = constant_op.constant(100) random_seed.set_random_seed(654321) - student = ds.StudentT(df=df, loc=mu, scale=sigma, name="student_t1") + student = student_t.StudentT( + df=df, loc=mu, scale=sigma, name="student_t1") samples1 = student.sample(n, seed=123456).eval() random_seed.set_random_seed(654321) - student2 = ds.StudentT(df=df, loc=mu, scale=sigma, name="student_t2") + student2 = student_t.StudentT( + df=df, loc=mu, scale=sigma, name="student_t2") samples2 = student2.sample(n, seed=123456).eval() self.assertAllClose(samples1, samples2) @@ -180,7 +203,7 @@ class StudentTTest(test.TestCase): df_v = [1e-1, 1e-5, 1e-10, 1e-20] df = constant_op.constant(df_v) n = constant_op.constant(200000) - student = ds.StudentT(df=df, loc=1., scale=1.) + student = student_t.StudentT(df=df, loc=1., scale=1.) 
samples = student.sample(n, seed=123456) sample_values = samples.eval() n_val = 200000 @@ -198,7 +221,7 @@ class StudentTTest(test.TestCase): mu_v = [3., -3.] sigma_v = [np.sqrt(10.), np.sqrt(15.)] n = constant_op.constant(200000) - student = ds.StudentT(df=df, loc=mu, scale=sigma) + student = student_t.StudentT(df=df, loc=mu, scale=sigma) samples = student.sample(n, seed=123456) sample_values = samples.eval() self.assertEqual(samples.get_shape(), (200000, batch_size, 2)) @@ -222,6 +245,8 @@ class StudentTTest(test.TestCase): def _checkKLApprox(self, df, mu, sigma, samples): n = samples.size np.random.seed(137) + if not stats: + return sample_scipy = stats.t.rvs(df, loc=mu, scale=sigma, size=n) covg = 0.99 r = stats.t.interval(covg, df, loc=mu, scale=sigma) @@ -247,9 +272,9 @@ class StudentTTest(test.TestCase): self.assertEqual(student.prob(2.).get_shape(), (3,)) self.assertEqual(student.sample(37, seed=123456).get_shape(), (37, 3,)) - _check(ds.StudentT(df=[2., 3., 4.,], loc=2., scale=1.)) - _check(ds.StudentT(df=7., loc=[2., 3., 4.,], scale=1.)) - _check(ds.StudentT(df=7., loc=3., scale=[2., 3., 4.,])) + _check(student_t.StudentT(df=[2., 3., 4.,], loc=2., scale=1.)) + _check(student_t.StudentT(df=7., loc=[2., 3., 4.,], scale=1.)) + _check(student_t.StudentT(df=7., loc=3., scale=[2., 3., 4.,])) def testBroadcastingPdfArgs(self): @@ -266,9 +291,9 @@ class StudentTTest(test.TestCase): xs = xs.T _assert_shape(student, xs, (3, 3)) - _check(ds.StudentT(df=[2., 3., 4.,], loc=2., scale=1.)) - _check(ds.StudentT(df=7., loc=[2., 3., 4.,], scale=1.)) - _check(ds.StudentT(df=7., loc=3., scale=[2., 3., 4.,])) + _check(student_t.StudentT(df=[2., 3., 4.,], loc=2., scale=1.)) + _check(student_t.StudentT(df=7., loc=[2., 3., 4.,], scale=1.)) + _check(student_t.StudentT(df=7., loc=3., scale=[2., 3., 4.,])) def _check2d(student): _assert_shape(student, 2., (1, 3)) @@ -279,9 +304,9 @@ class StudentTTest(test.TestCase): xs = xs.T _assert_shape(student, xs, (3, 3)) - _check2d(ds.StudentT(df=[[2., 3., 4.,]], loc=2., scale=1.)) - _check2d(ds.StudentT(df=7., loc=[[2., 3., 4.,]], scale=1.)) - _check2d(ds.StudentT(df=7., loc=3., scale=[[2., 3., 4.,]])) + _check2d(student_t.StudentT(df=[[2., 3., 4.,]], loc=2., scale=1.)) + _check2d(student_t.StudentT(df=7., loc=[[2., 3., 4.,]], scale=1.)) + _check2d(student_t.StudentT(df=7., loc=3., scale=[[2., 3., 4.,]])) def _check2d_rows(student): _assert_shape(student, 2., (3, 1)) @@ -292,22 +317,23 @@ class StudentTTest(test.TestCase): xs = xs.T # (3,1) _assert_shape(student, xs, (3, 1)) - _check2d_rows(ds.StudentT(df=[[2.], [3.], [4.]], loc=2., scale=1.)) - _check2d_rows(ds.StudentT(df=7., loc=[[2.], [3.], [4.]], scale=1.)) - _check2d_rows(ds.StudentT(df=7., loc=3., scale=[[2.], [3.], [4.]])) + _check2d_rows(student_t.StudentT(df=[[2.], [3.], [4.]], loc=2., scale=1.)) + _check2d_rows(student_t.StudentT(df=7., loc=[[2.], [3.], [4.]], scale=1.)) + _check2d_rows(student_t.StudentT(df=7., loc=3., scale=[[2.], [3.], [4.]])) def testMeanAllowNanStatsIsFalseWorksWhenAllBatchMembersAreDefined(self): with self.test_session(): mu = [1., 3.3, 4.4] - student = ds.StudentT(df=[3., 5., 7.], loc=mu, scale=[3., 2., 1.]) + student = student_t.StudentT(df=[3., 5., 7.], loc=mu, scale=[3., 2., 1.]) mean = student.mean().eval() self.assertAllClose([1., 3.3, 4.4], mean) def testMeanAllowNanStatsIsFalseRaisesWhenBatchMemberIsUndefined(self): with self.test_session(): mu = [1., 3.3, 4.4] - student = ds.StudentT(df=[0.5, 5., 7.], loc=mu, scale=[3., 2., 1.], - allow_nan_stats=False) + student = 
student_t.StudentT( + df=[0.5, 5., 7.], loc=mu, scale=[3., 2., 1.], + allow_nan_stats=False) with self.assertRaisesOpError("x < y"): student.mean().eval() @@ -315,8 +341,9 @@ class StudentTTest(test.TestCase): with self.test_session(): mu = [-2, 0., 1., 3.3, 4.4] sigma = [5., 4., 3., 2., 1.] - student = ds.StudentT(df=[0.5, 1., 3., 5., 7.], loc=mu, scale=sigma, - allow_nan_stats=True) + student = student_t.StudentT( + df=[0.5, 1., 3., 5., 7.], loc=mu, scale=sigma, + allow_nan_stats=True) mean = student.mean().eval() self.assertAllClose([np.nan, np.nan, 1., 3.3, 4.4], mean) @@ -327,7 +354,8 @@ class StudentTTest(test.TestCase): df = [0.5, 1.5, 3., 5., 7.] mu = [-2, 0., 1., 3.3, 4.4] sigma = [5., 4., 3., 2., 1.] - student = ds.StudentT(df=df, loc=mu, scale=sigma, allow_nan_stats=True) + student = student_t.StudentT( + df=df, loc=mu, scale=sigma, allow_nan_stats=True) var = student.variance().eval() ## scipy uses inf for variance when the mean is undefined. When mean is # undefined we say variance is undefined as well. So test the first @@ -336,6 +364,8 @@ class StudentTTest(test.TestCase): self.assertTrue(np.isnan(var[0])) var[0] = np.inf + if not stats: + return expected_var = [ stats.t.var(d, loc=m, scale=s) for (d, m, s) in zip(df, mu, sigma) ] @@ -348,9 +378,11 @@ class StudentTTest(test.TestCase): df = [1.5, 3., 5., 7.] mu = [0., 1., 3.3, 4.4] sigma = [4., 3., 2., 1.] - student = ds.StudentT(df=df, loc=mu, scale=sigma) + student = student_t.StudentT(df=df, loc=mu, scale=sigma) var = student.variance().eval() + if not stats: + return expected_var = [ stats.t.var(d, loc=m, scale=s) for (d, m, s) in zip(df, mu, sigma) ] @@ -359,13 +391,15 @@ class StudentTTest(test.TestCase): def testVarianceAllowNanStatsFalseRaisesForUndefinedBatchMembers(self): with self.test_session(): # df <= 1 ==> variance not defined - student = ds.StudentT(df=1., loc=0., scale=1., allow_nan_stats=False) + student = student_t.StudentT( + df=1., loc=0., scale=1., allow_nan_stats=False) with self.assertRaisesOpError("x < y"): student.variance().eval() with self.test_session(): # df <= 1 ==> variance not defined - student = ds.StudentT(df=0.5, loc=0., scale=1., allow_nan_stats=False) + student = student_t.StudentT( + df=0.5, loc=0., scale=1., allow_nan_stats=False) with self.assertRaisesOpError("x < y"): student.variance().eval() @@ -375,11 +409,13 @@ class StudentTTest(test.TestCase): df = [3.5, 5., 3., 5., 7.] mu = [-2.2] sigma = [5., 4., 3., 2., 1.] - student = ds.StudentT(df=df, loc=mu, scale=sigma) + student = student_t.StudentT(df=df, loc=mu, scale=sigma) # Test broadcast of mu across shape of df/sigma stddev = student.stddev().eval() mu *= len(df) + if not stats: + return expected_stddev = [ stats.t.std(d, loc=m, scale=s) for (d, m, s) in zip(df, mu, sigma) ] @@ -390,14 +426,14 @@ class StudentTTest(test.TestCase): df = [0.5, 1., 3] mu = [-1, 0., 1] sigma = [5., 4., 3.] - student = ds.StudentT(df=df, loc=mu, scale=sigma) + student = student_t.StudentT(df=df, loc=mu, scale=sigma) # Test broadcast of mu across shape of df/sigma mode = student.mode().eval() self.assertAllClose([-1., 0, 1], mode) def testPdfOfSample(self): with self.test_session() as sess: - student = ds.StudentT(df=3., loc=np.pi, scale=1.) + student = student_t.StudentT(df=3., loc=np.pi, scale=1.) 
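[Note: the variance test above leans on a property of Student's t worth spelling out: the variance is scale^2 * df / (df - 2) for df > 2, infinite for 1 < df <= 2, and undefined for df <= 1; since scipy reports the undefined case as non-finite, the test overwrites its own NaN with inf before comparing. A small reference implementation of that case split; t_variance is an illustrative name:

    import numpy as np

    def t_variance(df, scale=1.0):
        if df > 2.0:
            return scale ** 2 * df / (df - 2.0)
        if df > 1.0:
            return np.inf
        return np.nan  # Undefined; the test maps this to inf for comparison.

    assert np.isnan(t_variance(0.5))
    assert np.isinf(t_variance(1.5))
    assert np.isclose(t_variance(7.0, scale=2.0), 4.0 * 7.0 / 5.0)
]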
num = 20000 samples = student.sample(num, seed=123456) pdfs = student.prob(samples) @@ -410,13 +446,15 @@ class StudentTTest(test.TestCase): self.assertEqual(mean.get_shape(), ()) self.assertNear(np.pi, np.mean(sample_vals), err=0.02) self.assertNear(np.pi, mean_val, err=1e-6) - self.assertNear(stats.t.pdf(np.pi, 3., loc=np.pi), mean_pdf_val, err=1e-6) # Verify integral over sample*pdf ~= 1. self._assertIntegral(sample_vals, pdf_vals, err=2e-3) + if not stats: + return + self.assertNear(stats.t.pdf(np.pi, 3., loc=np.pi), mean_pdf_val, err=1e-6) def testPdfOfSampleMultiDims(self): with self.test_session() as sess: - student = ds.StudentT(df=[7., 11.], loc=[[5.], [6.]], scale=3.) + student = student_t.StudentT(df=[7., 11.], loc=[[5.], [6.]], scale=3.) self.assertAllEqual([], student.event_shape) self.assertAllEqual([], student.event_shape_tensor().eval()) self.assertAllEqual([2, 2], student.batch_shape) @@ -429,6 +467,12 @@ class StudentTTest(test.TestCase): self.assertEqual(pdfs.get_shape(), (num, 2, 2)) self.assertNear(5., np.mean(sample_vals[:, 0, :]), err=.03) self.assertNear(6., np.mean(sample_vals[:, 1, :]), err=.03) + self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02) + self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02) + self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02) + self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02) + if not stats: + return self.assertNear( stats.t.var(7., loc=0., scale=3.), # loc d.n. effect var np.var(sample_vals[:, :, 0]), @@ -437,10 +481,6 @@ class StudentTTest(test.TestCase): stats.t.var(11., loc=0., scale=3.), # loc d.n. effect var np.var(sample_vals[:, :, 1]), err=.4) - self._assertIntegral(sample_vals[:, 0, 0], pdf_vals[:, 0, 0], err=0.02) - self._assertIntegral(sample_vals[:, 0, 1], pdf_vals[:, 0, 1], err=0.02) - self._assertIntegral(sample_vals[:, 1, 0], pdf_vals[:, 1, 0], err=0.02) - self._assertIntegral(sample_vals[:, 1, 1], pdf_vals[:, 1, 1], err=0.02) def _assertIntegral(self, sample_vals, pdf_vals, err=1.5e-3): s_p = zip(sample_vals, pdf_vals) @@ -454,8 +494,8 @@ class StudentTTest(test.TestCase): def testNegativeDofFails(self): with self.test_session(): - student = ds.StudentT(df=[2, -5.], loc=0., scale=1., - validate_args=True, name="S") + student = student_t.StudentT(df=[2, -5.], loc=0., scale=1., + validate_args=True, name="S") with self.assertRaisesOpError(r"Condition x > 0 did not hold"): student.mean().eval() @@ -464,7 +504,8 @@ class StudentTTest(test.TestCase): df = constant_op.constant([-3.2, -4.6]) mu = constant_op.constant([-4.2, 3.4]) sigma = constant_op.constant([-6.4, -8.8]) - student = ds.StudentTWithAbsDfSoftplusScale(df=df, loc=mu, scale=sigma) + student = student_t.StudentTWithAbsDfSoftplusScale( + df=df, loc=mu, scale=sigma) self.assertAllClose( math_ops.floor(math_ops.abs(df)).eval(), student.df.eval()) self.assertAllClose(mu.eval(), student.loc.eval()) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/uniform_test.py b/tensorflow/python/kernel_tests/distributions/uniform_test.py similarity index 93% rename from tensorflow/contrib/distributions/python/kernel_tests/uniform_test.py rename to tensorflow/python/kernel_tests/distributions/uniform_test.py index c3c97b98f0d..df99a0ed257 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/uniform_test.py +++ b/tensorflow/python/kernel_tests/distributions/uniform_test.py @@ -18,15 +18,30 @@ from __future__ import absolute_import from __future__ import division from 
__future__ import print_function +import importlib + import numpy as np -from scipy import stats -from tensorflow.contrib.distributions.python.ops import uniform as uniform_lib + from tensorflow.python.framework import constant_op -from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import errors from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import uniform as uniform_lib from tensorflow.python.platform import test +from tensorflow.python.platform import tf_logging + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + + +stats = try_import("scipy.stats") class UniformTest(test.TestCase): @@ -126,7 +141,7 @@ class UniformTest(test.TestCase): b_v = np.array([1.0, 2.0, 3.0], dtype=np.float32) uniform = uniform_lib.Uniform(low=a_v, high=b_v, validate_args=True) - with self.assertRaisesWithPredicateMatch(errors_impl.InvalidArgumentError, + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, "x < y"): uniform.low.eval() @@ -187,6 +202,8 @@ class UniformTest(test.TestCase): a = 10.0 b = 100.0 uniform = uniform_lib.Uniform(low=a, high=b) + if not stats: + return s_uniform = stats.uniform(loc=a, scale=b - a) self.assertAllClose(uniform.mean().eval(), s_uniform.mean()) @@ -195,6 +212,8 @@ class UniformTest(test.TestCase): a = 10.0 b = 100.0 uniform = uniform_lib.Uniform(low=a, high=b) + if not stats: + return s_uniform = stats.uniform(loc=a, scale=b - a) self.assertAllClose(uniform.variance().eval(), s_uniform.var()) @@ -203,6 +222,8 @@ class UniformTest(test.TestCase): a = 10.0 b = 100.0 uniform = uniform_lib.Uniform(low=a, high=b) + if not stats: + return s_uniform = stats.uniform(loc=a, scale=b - a) self.assertAllClose(uniform.stddev().eval(), s_uniform.std()) diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD index 90d3f04c72f..833239eb5fa 100644 --- a/tensorflow/python/ops/distributions/BUILD +++ b/tensorflow/python/ops/distributions/BUILD @@ -24,6 +24,7 @@ py_library( "//tensorflow/python:nn", "//tensorflow/python:nn_ops", "//tensorflow/python:platform", + "//tensorflow/python:special_math_ops", "//tensorflow/python:util", ], ) diff --git a/tensorflow/contrib/distributions/python/ops/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py similarity index 100% rename from tensorflow/contrib/distributions/python/ops/bernoulli.py rename to tensorflow/python/ops/distributions/bernoulli.py diff --git a/tensorflow/contrib/distributions/python/ops/beta.py b/tensorflow/python/ops/distributions/beta.py similarity index 100% rename from tensorflow/contrib/distributions/python/ops/beta.py rename to tensorflow/python/ops/distributions/beta.py diff --git a/tensorflow/contrib/distributions/python/ops/categorical.py b/tensorflow/python/ops/distributions/categorical.py similarity index 100% rename from tensorflow/contrib/distributions/python/ops/categorical.py rename to tensorflow/python/ops/distributions/categorical.py diff --git a/tensorflow/python/ops/distributions/conditional_distribution.py b/tensorflow/python/ops/distributions/conditional_distribution.py index a04373afbf2..ef25d4aedd6 100644 --- a/tensorflow/python/ops/distributions/conditional_distribution.py +++ 
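[Note: the uniform hunks above also encode the loc/scale translation scipy uses: stats.uniform(loc=a, scale=b - a) is the distribution on [a, b]. A quick check of the moments the tests compare against; this snippet assumes scipy is installed:

    import numpy as np
    from scipy import stats

    a, b = 10.0, 100.0
    s_uniform = stats.uniform(loc=a, scale=b - a)
    assert np.isclose(s_uniform.mean(), (a + b) / 2.0)
    assert np.isclose(s_uniform.var(), (b - a) ** 2 / 12.0)
    assert np.isclose(s_uniform.std(), (b - a) / np.sqrt(12.0))
]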
b/tensorflow/python/ops/distributions/conditional_distribution.py @@ -18,8 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.contrib.distributions.python.ops import distribution_util from tensorflow.python.ops.distributions import distribution +from tensorflow.python.ops.distributions import util as distribution_util class ConditionalDistribution(distribution.Distribution): diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py similarity index 100% rename from tensorflow/contrib/distributions/python/ops/dirichlet.py rename to tensorflow/python/ops/distributions/dirichlet.py diff --git a/tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py similarity index 100% rename from tensorflow/contrib/distributions/python/ops/dirichlet_multinomial.py rename to tensorflow/python/ops/distributions/dirichlet_multinomial.py diff --git a/tensorflow/contrib/distributions/python/ops/exponential.py b/tensorflow/python/ops/distributions/exponential.py similarity index 98% rename from tensorflow/contrib/distributions/python/ops/exponential.py rename to tensorflow/python/ops/distributions/exponential.py index a293d1e0dc2..281641b9156 100644 --- a/tensorflow/contrib/distributions/python/ops/exponential.py +++ b/tensorflow/python/ops/distributions/exponential.py @@ -20,13 +20,13 @@ from __future__ import print_function import numpy as np -from tensorflow.contrib.distributions.python.ops import gamma from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import random_ops +from tensorflow.python.ops.distributions import gamma __all__ = [ diff --git a/tensorflow/contrib/distributions/python/ops/gamma.py b/tensorflow/python/ops/distributions/gamma.py similarity index 100% rename from tensorflow/contrib/distributions/python/ops/gamma.py rename to tensorflow/python/ops/distributions/gamma.py diff --git a/tensorflow/contrib/distributions/python/ops/laplace.py b/tensorflow/python/ops/distributions/laplace.py similarity index 100% rename from tensorflow/contrib/distributions/python/ops/laplace.py rename to tensorflow/python/ops/distributions/laplace.py diff --git a/tensorflow/contrib/distributions/python/ops/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py similarity index 100% rename from tensorflow/contrib/distributions/python/ops/multinomial.py rename to tensorflow/python/ops/distributions/multinomial.py diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py index 4c531b03788..0ef1c91df8c 100644 --- a/tensorflow/python/ops/distributions/normal.py +++ b/tensorflow/python/ops/distributions/normal.py @@ -70,14 +70,14 @@ class Normal(distribution.Distribution): ```python # Define a single scalar Normal distribution. - dist = tf.contrib.distributions.Normal(loc=0., scale=3.) + dist = tf.distributions.Normal(loc=0., scale=3.) # Evaluate the cdf at 1, returning a scalar. dist.cdf(1.) # Define a batch of two scalar valued Normals. # The first has mean 1 and standard deviation 11, the second 2 and 22. 
- dist = tf.contrib.distributions.Normal(loc=[1, 2.], scale=[11, 22.]) + dist = tf.distributions.Normal(loc=[1, 2.], scale=[11, 22.]) # Evaluate the pdf of the first distribution on 0, and the second on 1.5, # returning a length two tensor. @@ -92,7 +92,7 @@ class Normal(distribution.Distribution): ```python # Define a batch of two scalar valued Normals. # Both have mean 1, but different standard deviations. - dist = tf.contrib.distributions.Normal(loc=1., scale=[11, 22.]) + dist = tf.distributions.Normal(loc=1., scale=[11, 22.]) # Evaluate the pdf of both distributions on the same point, 3.0, # returning a length 2 tensor. diff --git a/tensorflow/contrib/distributions/python/ops/student_t.py b/tensorflow/python/ops/distributions/student_t.py similarity index 97% rename from tensorflow/contrib/distributions/python/ops/student_t.py rename to tensorflow/python/ops/distributions/student_t.py index 7872569a2b9..073ac4286be 100644 --- a/tensorflow/contrib/distributions/python/ops/student_t.py +++ b/tensorflow/python/ops/distributions/student_t.py @@ -42,8 +42,10 @@ __all__ = [ class StudentT(distribution.Distribution): - # pylint: disable=line-too-long - """Student's t-distribution with degree of freedom `df`, location `loc`, and `scale` parameters. + """Student's t-distribution. + + This distribution has parameters: degree of freedom `df`, location `loc`, + and `scale`. #### Mathematical details @@ -82,7 +84,7 @@ class StudentT(distribution.Distribution): ```python # Define a single scalar Student t distribution. - single_dist = tf.contrib.distributions.StudentT(df=3) + single_dist = tf.distributions.StudentT(df=3) # Evaluate the pdf at 1, returning a scalar Tensor. single_dist.prob(1.) @@ -90,7 +92,7 @@ class StudentT(distribution.Distribution): # Define a batch of two scalar valued Student t's. # The first has degrees of freedom 2, mean 1, and scale 11. # The second 3, 2 and 22. - multi_dist = tf.contrib.distributions.StudentT(df=[2, 3], + multi_dist = tf.distributions.StudentT(df=[2, 3], loc=[1, 2.], scale=[11, 22.]) @@ -107,7 +109,7 @@ class StudentT(distribution.Distribution): ```python # Define a batch of two Student's t distributions. # Both have df 2 and mean 1, but different scales. - dist = tf.contrib.distributions.StudentT(df=2, loc=1, scale=[11, 22.]) + dist = tf.distributions.StudentT(df=2, loc=1, scale=[11, 22.]) # Evaluate the pdf of both distributions on the same point, 3.0, # returning a length 2 tensor. diff --git a/tensorflow/contrib/distributions/python/ops/uniform.py b/tensorflow/python/ops/distributions/uniform.py similarity index 100% rename from tensorflow/contrib/distributions/python/ops/uniform.py rename to tensorflow/python/ops/distributions/uniform.py From a5749019e065b25f49531de8b9f29627fb12fc5f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 13:30:00 -0800 Subject: [PATCH 28/51] Add shape to error message. Change: 154880260 --- tensorflow/python/ops/weights_broadcast_ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/weights_broadcast_ops.py b/tensorflow/python/ops/weights_broadcast_ops.py index 257b9f1faa4..35e93249c31 100644 --- a/tensorflow/python/ops/weights_broadcast_ops.py +++ b/tensorflow/python/ops/weights_broadcast_ops.py @@ -97,9 +97,10 @@ def assert_broadcastable(weights, values): return control_flow_ops.no_op(name="static_scalar_check_success") if weights_rank_static != values_rank_static: raise ValueError( - "%s values.rank=%s. weights.rank=%s." % ( + "%s values.rank=%s. 
weights.rank=%s." + " values.shape=%s. weights.shape=%s." % ( _ASSERT_BROADCASTABLE_ERROR_PREFIX, values_rank_static, - weights_rank_static)) + weights_rank_static, values.shape, weights.shape)) weights_shape_static = tensor_util.constant_value(weights_shape) values_shape_static = tensor_util.constant_value(values_shape) if weights_shape_static is not None and values_shape_static is not None: From 58196d4bf923d6fa2500e84d9d22ed8227ba305c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 16:27:12 -0800 Subject: [PATCH 29/51] [TF:XLA] Added unittest for transpose constant folding Transpose constant folding was missing a unittest. Change: 154903586 --- tensorflow/compiler/xla/literal_util.h | 46 +++++++++++ tensorflow/compiler/xla/service/BUILD | 1 + .../xla/service/hlo_constant_folding_test.cc | 80 ++++++++++++++----- .../compiler/xla/tests/literal_test_util.h | 65 +++++++++++++++ 4 files changed, 174 insertions(+), 18 deletions(-) diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index ae3d43e56c7..3a6d21979e7 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -339,6 +340,14 @@ class LiteralUtil { const Layout& layout, Literal* literal); + // Populates literal values by calling the generator function for every cell + // in the literal object. + template + static Status Populate( + Literal* literal, + const std::function indexes)>& + generator); + // Creates a Literal of the given dimensions with all elements set to the // given value. 
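[Note: the weights_broadcast_ops change above appends the static shapes to the rank-mismatch message, making the error self-diagnosing. A re-creation of the resulting format; the prefix string is illustrative, and only the values.rank/weights.rank/values.shape/weights.shape fields come from the patch:

    prefix = "weights can not be broadcast to values."  # illustrative prefix
    values_rank, weights_rank = 3, 2
    values_shape, weights_shape = (2, 3, 4), (3, 4)
    msg = ("%s values.rank=%s. weights.rank=%s."
           " values.shape=%s. weights.shape=%s." % (
               prefix, values_rank, weights_rank,
               values_shape, weights_shape))
    assert "values.shape=(2, 3, 4)" in msg
]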
template @@ -992,6 +1001,43 @@ template literal); } +template +/* static */ Status LiteralUtil::Populate( + Literal* literal, + const std::function indexes)>& + generator) { + const Shape& shape = literal->shape(); + int64 rank = ShapeUtil::Rank(shape); + TF_RET_CHECK(shape.element_type() == + primitive_util::NativeToPrimitiveType()); + tensorflow::protobuf::RepeatedField* data = + GetMutableRepeatedField(literal); + if (rank > 0) { + std::vector base(rank, 0); + std::vector step(rank, 1); + std::vector minor_scan_indexes(rank, 0); + int64 minor_dimension = shape.layout().minor_to_major()[0]; + int64 minor_dimension_size = + ShapeUtil::GetDimension(shape, minor_dimension); + + step[minor_dimension] = minor_dimension_size; + auto init_function = [&](const std::vector& indexes) { + int64 index = LinearIndex(*literal, indexes); + std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin()); + for (int64 i = 0; i < minor_dimension_size; ++i) { + minor_scan_indexes[minor_dimension] = i; + data->Set(index + i, generator(minor_scan_indexes)); + } + return true; + }; + ShapeUtil::ForEachIndex(shape, base, AsInt64Slice(shape.dimensions()), step, + init_function); + } else { + data->Set(0, generator({})); + } + return Status::OK(); +} + template /* static */ void LiteralUtil::PopulateWithValue( NativeT value, tensorflow::gtl::ArraySlice dimensions, diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index bdb69b6e55e..750e1ee3f2c 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1436,6 +1436,7 @@ cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:lib", "//tensorflow/core:test_main", ], diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc index d20f423bd6c..21d93a1f27f 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc @@ -28,6 +28,7 @@ limitations under the License. 
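[Note: LiteralUtil::Populate above walks every cell of the literal, calling the generator with the cell's multidimensional index; for speed it steps ForEachIndex by the minor-most dimension's size and fills that whole run per callback. A numpy sketch of the observable behavior, without the minor-dimension batching; names are illustrative:

    import itertools
    import numpy as np

    def populate(shape, generator):
        out = np.empty(shape, dtype=np.float32)
        if out.ndim == 0:
            out[()] = generator(())
            return out
        for idx in itertools.product(*(range(d) for d in shape)):
            out[idx] = generator(idx)  # generator sees the full index tuple
        return out

    ramp = populate((2, 3), lambda idx: float(sum(idx)))
    assert ramp[1, 2] == 3.0
]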
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/types.h" namespace op = xla::testing::opcode_matchers; @@ -49,8 +50,9 @@ TEST_F(HloConstantFoldingTest, ConvertF32ToS64) { EXPECT_THAT(computation->root_instruction(), op::Convert(input)); - HloConstantFolding simplifier; - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); EXPECT_THAT(computation->root_instruction(), op::Constant()); EXPECT_EQ(LiteralUtil::GetFirstElement( @@ -70,8 +72,9 @@ TEST_F(HloConstantFoldingTest, ConvertS64ToF32) { EXPECT_THAT(computation->root_instruction(), op::Convert(input)); - HloConstantFolding simplifier; - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); EXPECT_THAT(computation->root_instruction(), op::Constant()); EXPECT_EQ(LiteralUtil::GetFirstElement( @@ -91,8 +94,9 @@ TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) { EXPECT_THAT(computation->root_instruction(), op::Convert(input)); - HloConstantFolding simplifier; - ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); EXPECT_THAT(computation->root_instruction(), op::Constant()); EXPECT_EQ( @@ -131,11 +135,12 @@ TEST_F(HloConstantFoldingTest, Concatenate) { Shape shape = ShapeUtil::MakeShape(F32, dimensions); builder.AddInstruction(HloInstruction::CreateConcatenate( shape, operands, test_config.concat_dimension)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = MakeUnique(TestName()); + auto computation = module->AddEntryComputation(builder.Build()); - HloConstantFolding simplifier; - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); @@ -148,22 +153,61 @@ TEST_F(HloConstantFoldingTest, Slice) { const int64 dimensions[] = {11, 8, 7, 5, 9}; const int64 slice_start[] = {4, 2, 3, 1, 5}; const int64 slice_limits[] = {10, 8, 6, 5, 9}; - auto literal = LiteralUtil::CreateFromDimensions(F32, dimensions); - HloInstruction* lit_insn = builder.AddInstruction( + TF_ASSIGN_OR_ASSERT_OK(auto literal, + LiteralTestUtil::CreateRandomLiteral( + ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); + HloInstruction* literal_instruction = builder.AddInstruction( HloInstruction::CreateConstant(std::move(literal))); Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4}); - builder.AddInstruction( - HloInstruction::CreateSlice(shape, lit_insn, slice_start, slice_limits)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + builder.AddInstruction(HloInstruction::CreateSlice( + shape, literal_instruction, slice_start, slice_limits)); + auto module = MakeUnique(TestName()); + auto computation = module->AddEntryComputation(builder.Build()); - HloConstantFolding simplifier; - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + HloConstantFolding const_folder; 
+ TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); HloInstruction* root = computation->root_instruction(); EXPECT_THAT(root, op::Constant()); EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); } +TEST_F(HloConstantFoldingTest, TransposeConstantFold) { + HloComputation::Builder builder(TestName()); + const int64 dimensions[] = {11, 8, 7, 5, 9}; + TF_ASSIGN_OR_ASSERT_OK(auto literal, + LiteralTestUtil::CreateRandomLiteral( + ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); + auto literal_clone = LiteralUtil::CloneToUnique(*literal); + HloInstruction* literal_instruction = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(literal))); + Shape shape = ShapeUtil::MakeShape(F32, {8, 7, 11, 9, 5}); + const int64 permutation[] = {1, 2, 0, 4, 3}; + builder.AddInstruction( + HloInstruction::CreateTranspose(shape, literal_instruction, permutation)); + auto module = MakeUnique(TestName()); + auto computation = module->AddEntryComputation(builder.Build()); + + HloConstantFolding const_folder; + TF_ASSIGN_OR_ASSERT_OK(bool result, const_folder.Run(module.get())); + EXPECT_TRUE(result); + + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Constant()); + EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape)); + + using NativeT = typename primitive_util::PrimitiveTypeToNative::type; + bool matched = true; + LiteralUtil::EachCell( + root->literal(), + [&](tensorflow::gtl::ArraySlice indices, NativeT value) { + std::vector rindexes = Permute(permutation, indices); + matched = matched && (value == LiteralUtil::Get(*literal_clone, + rindexes)); + }); + EXPECT_TRUE(matched); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h index aeadc023cc0..4f980830333 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.h +++ b/tensorflow/compiler/xla/tests/literal_test_util.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include #include "tensorflow/compiler/xla/array2d.h" @@ -171,6 +172,36 @@ class LiteralTestUtil { tensorflow::gtl::ArraySlice minor_to_major, const Literal& literal); + // Creates a literal with the supplied shape, and uses the provided value + // generator to populate the literal's values. + // Returns the new literal object, or an error Status if failed. + template < + PrimitiveType type, + typename T = typename primitive_util::PrimitiveTypeToNative::type> + static StatusOr> CreateRandomLiteral( + const Shape& shape, + const std::function)>& generator); + + // Creates a literal with the supplied shape, and initializes the literal + // values using a normal distribution with given mean and stddev standard + // deviation, and using the engine as entropy generator. + // Returns the new literal object, or an error Status if failed. + template < + PrimitiveType type, typename E, + typename T = typename primitive_util::PrimitiveTypeToNative::type> + static StatusOr> CreateRandomLiteral( + const Shape& shape, E* engine, T mean, T stddev); + + // Creates a literal with the supplied shape, and initializes the literal + // values using a normal distribution with given mean and stddev standard + // deviation. + // Returns the new literal object, or an error Status if failed. 
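[Note: the new TransposeConstantFold test reads back each cell of the folded constant through the permutation and compares it with the random source literal. The same invariant in numpy terms, with RandomState standing in for the std::minstd_rand0 engine behind CreateRandomLiteral; that pairing is an assumption of this sketch:

    import numpy as np

    rng = np.random.RandomState(0)
    original = rng.normal(0.0, 1.0, (11, 8, 7, 5, 9)).astype(np.float32)
    perm = (1, 2, 0, 4, 3)
    folded = np.transpose(original, perm)
    assert folded.shape == (8, 7, 11, 9, 5)
    for out_idx in [(0, 0, 0, 0, 0), (3, 6, 10, 8, 4)]:
        # Map an output index back to the source index it was read from.
        src_idx = tuple(out_idx[perm.index(d)] for d in range(5))
        assert folded[out_idx] == original[src_idx]
]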
+ template < + PrimitiveType type, + typename T = typename primitive_util::PrimitiveTypeToNative::type> + static StatusOr> CreateRandomLiteral( + const Shape& shape, T mean, T stddev); + private: TF_DISALLOW_COPY_AND_ASSIGN(LiteralTestUtil); }; @@ -270,6 +301,40 @@ template ExpectNear(*LiteralUtil::CreateR4FromArray4D(expected), actual, error); } +template +/* static */ StatusOr> +LiteralTestUtil::CreateRandomLiteral( + const Shape& shape, + const std::function)>& generator) { + using NativeT = typename primitive_util::PrimitiveTypeToNative::type; + TF_RET_CHECK(shape.element_type() == type); + std::unique_ptr literal = LiteralUtil::CreateFromShape(shape); + TF_RETURN_IF_ERROR(LiteralUtil::Populate( + literal.get(), [&](tensorflow::gtl::ArraySlice indexes) { + return generator(indexes); + })); + return std::move(literal); +} + +template +/* static */ StatusOr> +LiteralTestUtil::CreateRandomLiteral(const Shape& shape, E* engine, T mean, + T stddev) { + using NativeT = typename primitive_util::PrimitiveTypeToNative::type; + std::normal_distribution generator(mean, stddev); + return CreateRandomLiteral( + shape, [&](tensorflow::gtl::ArraySlice /*indexes*/) { + return generator(*engine); + }); +} + +template +/* static */ StatusOr> +LiteralTestUtil::CreateRandomLiteral(const Shape& shape, T mean, T stddev) { + std::minstd_rand0 engine; + return CreateRandomLiteral(shape, &engine, mean, stddev); +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_TESTS_LITERAL_TEST_UTIL_H_ From 5ad12420e78d0aa756fd2a41945468e826e267c2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 May 2017 17:21:15 -0800 Subject: [PATCH 30/51] [XLA:HLO] Run HeapSimulator on whole-module if all computations are sequential. Previously the HeapSimulator was only run on a per-computation basis. This meant that if you had many sub-computations in your module (e.g. many While loops), the space for all of the temporary buffers inside the conditions and bodies of the loops were in distinct memory ranges. This is overly pessimistic if all computations in the module are sequential. This CL changes the HeapSimulator to also run whole-module simulation, calling Alloc and Free on sub-computation buffers at the appropriate nested spot, right next to the calling instruction. The BufferAssigner is updated to take advantage of this when possible, as is MinimumMemoryForSequence. 
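[Note: the memory win described in this commit message can be made concrete with a toy alloc/free trace: per-computation simulation keeps the two loop bodies' temporaries in distinct ranges, while whole-module simulation nests each Alloc/Free at its call site so the bytes are reused. Sizes and event lists below are illustrative only:

    def peak_memory(events):
        live = peak = 0
        for op, size in events:  # op is "alloc" or "free"
            live += size if op == "alloc" else -size
            peak = max(peak, live)
        return peak

    # Two sequential while-loop bodies, each needing a 100-byte temporary.
    per_computation = [("alloc", 100), ("alloc", 100),
                       ("free", 100), ("free", 100)]
    whole_module = [("alloc", 100), ("free", 100),
                    ("alloc", 100), ("free", 100)]
    assert peak_memory(per_computation) == 200
    assert peak_memory(whole_module) == 100
]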
Change: 154908856 --- tensorflow/compiler/xla/service/BUILD | 31 ++--- .../compiler/xla/service/buffer_assignment.cc | 126 +++++++++++++----- .../compiler/xla/service/buffer_assignment.h | 29 +++- .../compiler/xla/service/heap_simulator.cc | 92 ++++++++++--- .../compiler/xla/service/heap_simulator.h | 32 ++++- .../xla/service/heap_simulator_test.cc | 126 +++++++++++++++++- .../compiler/xla/service/hlo_ordering.cc | 62 +++++---- .../compiler/xla/service/hlo_ordering_test.cc | 59 ++++++++ 8 files changed, 442 insertions(+), 115 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 750e1ee3f2c..2452158efa2 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -666,8 +666,8 @@ cc_library( ], deps = [ ":buffer_liveness", - ":heap_simulator", ":hlo", + ":hlo_ordering", ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", @@ -707,51 +707,38 @@ cc_test( ], ) -cc_library( - name = "heap_simulator", - srcs = [ - "heap_simulator.cc", - ], - hdrs = [ - "heap_simulator.h", - ], - deps = [ - ":hlo", - ":liveness_util", - ":logical_buffer", - ":tuple_points_to_analysis", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", - "//tensorflow/core:lib", - ], -) - cc_test( name = "heap_simulator_test", srcs = ["heap_simulator_test.cc"], deps = [ - ":heap_simulator", ":hlo", + ":hlo_ordering", ":logical_buffer", ":tuple_points_to_analysis", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/core:lib", "//tensorflow/core:test_main", ], ) +# The hlo_ordering library contains both hlo_ordering and heap_simulator because +# they are mutually dependent. cc_library( name = "hlo_ordering", srcs = [ + "heap_simulator.cc", "hlo_ordering.cc", ], hdrs = [ + "heap_simulator.h", "hlo_ordering.h", ], deps = [ ":call_graph", - ":heap_simulator", ":hlo", + ":liveness_util", ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index 3cdbf892f7a..a79468f939f 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -548,6 +548,8 @@ Status BufferAssigner::AssignBuffersForComputation( const FlatSet* hlos_to_allocate, const FlatSet& colocated_buffers, const FlatSet& colocated_allocations, + FlatMap>* + buffers_to_assign_sequentially, BufferAssignment* assignment) { // Buffers are sorted and assigned to BufferAllocations in decreasing order of // size. @@ -578,9 +580,16 @@ Status BufferAssigner::AssignBuffersForComputation( // If there is a sequential instruction ordering, we'll delay assignment of // temp buffers until after the main assignment loop. const BufferLiveness& liveness = assignment->liveness(); - const std::vector* sequential_order = - liveness.hlo_ordering().SequentialOrder(*computation); - FlatSet unassigned_temp_buffers; + const bool has_sequential_order = + liveness.hlo_ordering().SequentialOrder(*computation) != nullptr; + if (has_sequential_order && buffers_to_assign_sequentially != nullptr) { + // Every sequential computation must get an entry in the + // buffers_to_assign_sequentially map, even if we end up with an empty set + // of buffers. This ensures we can correctly determine whether to run + // whole-module heap simulation. 
+ buffers_to_assign_sequentially->emplace(computation, + FlatSet()); + } // Sort the LogicalBuffers first by size. We assign the larger LogicalBuffers // first for simplicity. This means any previously created BufferAllocation is @@ -599,7 +608,7 @@ Status BufferAssigner::AssignBuffersForComputation( // important reuse case where an elementwise instruction reuses one of its // operand's buffer. This improves locality. std::sort(sorted_buffers.begin(), sorted_buffers.end(), - [this, sequential_order, &liveness, &post_order_position]( + [this, has_sequential_order, &liveness, &post_order_position]( const LogicalBuffer* a, const LogicalBuffer* b) { // Primary sort is by decreasing buffer size. const int64 a_size = buffer_size_(*a); @@ -609,7 +618,7 @@ Status BufferAssigner::AssignBuffersForComputation( } // Otherwise live out buffers come before others, if the // instructions are sequentially ordered. - if (sequential_order != nullptr) { + if (has_sequential_order) { const bool a_live_out = liveness.MaybeLiveOut(*a); const bool b_live_out = liveness.MaybeLiveOut(*b); if (a_live_out != b_live_out) { @@ -746,7 +755,7 @@ Status BufferAssigner::AssignBuffersForComputation( } } - if (!assignment->HasAllocation(*buffer) && sequential_order != nullptr && + if (!assignment->HasAllocation(*buffer) && has_sequential_order && !liveness.MaybeLiveOut(*buffer)) { // There is a sequential instruction ordering, so we delay assignment of // temp buffers until after the loop. We do this right before we decide to @@ -758,7 +767,7 @@ Status BufferAssigner::AssignBuffersForComputation( // for the definition of temp buffers. CHECK(!is_entry_parameter) << *buffer; CHECK(!is_thread_local) << *buffer; - unassigned_temp_buffers.insert(buffer); + (*buffers_to_assign_sequentially)[computation].insert(buffer); VLOG(3) << "Delaying assignment of temp buffer: " << *buffer; continue; } @@ -772,27 +781,68 @@ Status BufferAssigner::AssignBuffersForComputation( } } - if (!unassigned_temp_buffers.empty()) { - TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering( - *sequential_order, unassigned_temp_buffers, *computation, assignment)); - } return Status::OK(); } Status BufferAssigner::AssignBuffersWithSequentialOrdering( - const std::vector& sequence, - const FlatSet& buffers_to_assign, - const HloComputation& computation, BufferAssignment* assignment) { + const FlatMap>& + buffers_to_assign_sequentially, + bool run_whole_module_heap_simulation, BufferAssignment* assignment) { // Run the sequence of instructions through the heap simulator. The heuristic // that seems to give the best results is lazy-best-fit, with all runs of // alloc / free calls sorted in decreasing size order. - TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique( - MakeUnique(alignment_)), - sequence, computation, - assignment->points_to_analysis(), buffer_size_, - &buffers_to_assign)); + const HloOrdering& hlo_ordering = assignment->liveness().hlo_ordering(); + if (run_whole_module_heap_simulation) { + // Run the heap simulation over the whole module. This reduces memory usage, + // since buffers for kCall and kWhile sub-computations are only live for the + // duration of their calling instructions. 
+ VLOG(1) << "Running whole-module heap simulation"; + SequentialHloOrdering::HloModuleSequence module_sequence; + FlatSet all_buffers_to_assign; + for (const auto& pair : buffers_to_assign_sequentially) { + const HloComputation* computation = pair.first; + const FlatSet& buffers_to_assign = pair.second; + const std::vector* instruction_sequence = + hlo_ordering.SequentialOrder(*computation); + CHECK(instruction_sequence != nullptr) << computation->name(); + module_sequence[computation] = *instruction_sequence; + all_buffers_to_assign.insert(buffers_to_assign.begin(), + buffers_to_assign.end()); + } + TF_ASSIGN_OR_RETURN( + const HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique( + MakeUnique(alignment_)), + assignment->module(), module_sequence, + assignment->points_to_analysis(), buffer_size_, + &all_buffers_to_assign)); + AssignBuffersFromHeapSimulator(result, assignment); + } else { + // Run the heap-simulation on a per-computation basis. Buffers for + // sub-computations are assigned disjoint BufferAllocations, assuming the + // worst-case that they may all be live concurrently. + VLOG(1) << "Running per-computation heap simulation"; + for (const auto& pair : buffers_to_assign_sequentially) { + const HloComputation* computation = pair.first; + const FlatSet& buffers_to_assign = pair.second; + const std::vector* instruction_sequence = + hlo_ordering.SequentialOrder(*computation); + CHECK(instruction_sequence != nullptr) << computation->name(); + TF_ASSIGN_OR_RETURN( + const HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique( + MakeUnique(alignment_)), + *computation, *instruction_sequence, + assignment->points_to_analysis(), buffer_size_, + &buffers_to_assign)); + AssignBuffersFromHeapSimulator(result, assignment); + } + } + return Status::OK(); +} + +void BufferAssigner::AssignBuffersFromHeapSimulator( + const HeapSimulator::Result& result, BufferAssignment* assignment) { if (assignment->stats_.preallocated_temp_fragmentation_bytes == -1) { assignment->stats_.preallocated_temp_fragmentation_bytes = result.fragmentation_size; @@ -801,8 +851,6 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( result.fragmentation_size; } - // Use the results of the heap simulator to create one allocation per - // computation, with LogicalBuffers packed to specific offsets. 
BufferAllocation* allocation = assignment->NewEmptyAllocation(
 result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true);
 for (const auto& buffer_chunk : result.chunk_map) {
@@ -810,7 +858,6 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
 const HeapSimulator::Chunk& chunk = buffer_chunk.second;
 assignment->AddAssignment(allocation, buffer, chunk.offset, chunk.size);
 }
- return Status::OK();
}
 // Adds the 'colocated_set' of buffers to 'colocated_buffer_sets', maintaining
@@ -1108,8 +1155,6 @@ StatusOr> BufferAssigner::CreateAssignment(
 TF_ASSIGN_OR_RETURN(std::unique_ptr liveness,
 BufferLiveness::Run(module, std::move(hlo_ordering)));
- std::vector thread_local_computations;
- std::vector global_computations;
 VLOG(1) << "Assigning buffers to module " << module->name();
 if (hlos_to_allocate != nullptr) {
 VLOG(3) << "LogicalBuffer assignment restricted to hlos: ";
@@ -1121,9 +1166,6 @@ StatusOr> BufferAssigner::CreateAssignment(
 XLA_VLOG_LINES(3, liveness->ToString());
 XLA_VLOG_LINES(3, liveness->points_to_analysis().ToString());
- TF_RETURN_IF_ERROR(GatherComputationsByAllocationType(
- module, &thread_local_computations, &global_computations));
-
 // Set of HLO's to allocate if hlos_to_allocate is given. Passed as a set to
 // AssignBuffersForComputation for fast membership testing.
 std::unique_ptr> hlo_set;
@@ -1148,16 +1190,38 @@ StatusOr> BufferAssigner::CreateAssignment(
 AssignColocatedBufferSets(colocated_buffer_sets, assignment.get(),
 &colocated_buffers, &colocated_allocations);
+ std::vector thread_local_computations;
+ std::vector global_computations;
+ TF_RETURN_IF_ERROR(GatherComputationsByAllocationType(
+ module, &thread_local_computations, &global_computations));
+
+ // First assign buffers for global computations. Temporary buffers for
+ // sequential computations are collected in 'buffers_to_assign_sequentially'.
+ FlatMap>
+ buffers_to_assign_sequentially;
 for (auto* computation : global_computations) {
 TF_RETURN_IF_ERROR(AssignBuffersForComputation(
 computation, /*is_thread_local=*/false, hlo_set.get(),
- colocated_buffers, colocated_allocations, assignment.get()));
+ colocated_buffers, colocated_allocations,
+ &buffers_to_assign_sequentially, assignment.get()));
 }
+ // Assign buffers with sequential ordering, if any. If all global computations
+ // are sequential, we can run heap simulation on the whole module, which
+ // reduces memory usage.
+ const bool run_whole_module_heap_simulation =
+ buffers_to_assign_sequentially.size() == global_computations.size();
+ TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
+ buffers_to_assign_sequentially, run_whole_module_heap_simulation,
+ assignment.get()));
+
+ // Now assign buffers for thread-local computations. All LogicalBuffers get
+ // their own BufferAllocation. 
for (auto* computation : thread_local_computations) { TF_RET_CHECK(computation != module->entry_computation()); TF_RETURN_IF_ERROR(AssignBuffersForComputation( computation, /*is_thread_local=*/true, hlo_set.get(), colocated_buffers, - colocated_allocations, assignment.get())); + colocated_allocations, /*buffers_to_assign_sequentially=*/nullptr, + assignment.get())); } // Mark all buffers which may be live out of the entry computation as diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 34667c435d5..7e96caf0f4e 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -23,6 +23,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/buffer_liveness.h" +#include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -354,6 +355,9 @@ class BufferAssignment { void AddAssignment(BufferAllocation* allocation, const LogicalBuffer& buffer, int64 offset, int64 size); + // Returns the HloModule used to construct this assignment. + const HloModule& module() { return *module_; } + // Returns the BufferLiveness object used to construct this assignment. const BufferLiveness& liveness() { return *liveness_; } @@ -427,14 +431,27 @@ class BufferAssigner { const tensorflow::gtl::FlatSet& colocated_buffers, const tensorflow::gtl::FlatSet& colocated_allocations, + tensorflow::gtl::FlatMap>* + buffers_to_assign_sequentially, BufferAssignment* assignment); - // Assigns 'buffers_to_assign' assuming the HLO instructions will be executed - // in the given 'sequential_order'. + // Assigns 'buffers_to_assign_sequentially' using heap simulation, assuming + // the HLO instructions will be executed in the sequential order given by + // assignment->liveness().hlo_ordering().SequentialOrder. If + // 'run_whole_module_heap_simulation' is true, the heap simulation will be run + // assuming all global computations are sequentially ordered. Status AssignBuffersWithSequentialOrdering( - const std::vector& sequential_order, - const tensorflow::gtl::FlatSet& buffers_to_assign, - const HloComputation& computation, BufferAssignment* assignment); + const tensorflow::gtl::FlatMap< + const HloComputation*, + tensorflow::gtl::FlatSet>& + buffers_to_assign_sequentially, + bool run_whole_module_heap_simulation, BufferAssignment* assignment); + + // Uses the results of the heap simulator to create a single allocation, with + // LogicalBuffers packed to specific offsets. + void AssignBuffersFromHeapSimulator(const HeapSimulator::Result& result, + BufferAssignment* assignment); // Tries to assign the given instruction to the given buffer. Returns if the // assignment was successful. @@ -477,8 +494,6 @@ class BufferAssigner { const HloComputation& computation, const BufferLiveness& buffer_liveness, std::vector* colocated_buffer_sets); - const HloModule* module_; - // Function which returns the buffer size for a given logical buffer (shape). 
LogicalBuffer::SizeFunction buffer_size_; diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 9c4899a67de..d7aa5664df4 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -53,12 +53,44 @@ std::vector UniqueOperandSourceBuffers( /*static*/ StatusOr HeapSimulator::Run( - std::unique_ptr algorithm, - const std::vector& instruction_sequence, - const HloComputation& computation, + std::unique_ptr algorithm, const HloModule& module, + const SequentialHloOrdering::HloModuleSequence& module_sequence, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_fn, const FlatSet* buffers_to_assign) { + HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign); + const HloComputation* entry_computation = module.entry_computation(); + const std::vector& instruction_sequence = + FindOrDie(module_sequence, entry_computation); + TF_RETURN_IF_ERROR(heap.RunComputation(*entry_computation, + instruction_sequence, + points_to_analysis, &module_sequence)); + return heap.Finish(); +} + +/*static*/ +StatusOr HeapSimulator::Run( + std::unique_ptr algorithm, const HloComputation& computation, + const std::vector& instruction_sequence, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_fn, + const FlatSet* buffers_to_assign) { + HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign); + TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence, + points_to_analysis, + /*module_sequence=*/nullptr)); + return heap.Finish(); +} + +// Runs a heap simulation for the given 'computation', assuming the given +// 'instruction_sequence'. If 'module_sequence' is non-null, it is used to find +// kCall and kWhile sub-computations, and the heap simulation for those +// sub-computations will be run recursively. +Status HeapSimulator::RunComputation( + const HloComputation& computation, + const std::vector& instruction_sequence, + const TuplePointsToAnalysis& points_to_analysis, + const SequentialHloOrdering::HloModuleSequence* module_sequence) { // The goal here is to minimize memory usage, assuming the given sequential // ordering of instructions. The strategy is to walk through the instruction // sequence, calling Alloc and Free on the underlying heap algorithm. The @@ -67,7 +99,6 @@ StatusOr HeapSimulator::Run( // 'live_buffers' tracks the liveness of each buffer that we assign, by // associating it with a set of HloInstructions that need to be visited. When // the set becomes empty, the buffer is no longer used, and can be freed. - HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign); FlatMap> live_buffers; const HloInstruction* root = computation.root_instruction(); @@ -90,7 +121,7 @@ StatusOr HeapSimulator::Run( // lifetime of buffers that aren't already connected by a data dependency. 
std::vector dead_buffers_to_free; for (const LogicalBuffer* buffer : buffers_defined_by_instruction) { - if (heap.IgnoreBuffer(buffer)) { + if (IgnoreBuffer(buffer)) { continue; } for (const BufferAlias& alias : @@ -127,7 +158,7 @@ StatusOr HeapSimulator::Run( std::vector operand_buffers_to_free; for (const LogicalBuffer* operand_buffer : UniqueOperandSourceBuffers(instruction, points_to_analysis)) { - if (heap.IgnoreBuffer(operand_buffer)) { + if (IgnoreBuffer(operand_buffer)) { continue; } live_buffers[operand_buffer].erase(instruction); @@ -142,10 +173,10 @@ StatusOr HeapSimulator::Run( // happen before dead or operand buffers are freed; the instruction reads // the operand buffers to produce its output. // - // INVARIANT: Either heap.Alloc or heap.ShareBuffer will be called for each - // buffer that we should assign. + // INVARIANT: Either Alloc or ShareBuffer will be called for each buffer + // that we should assign. for (const LogicalBuffer* buffer : buffers_defined_by_instruction) { - if (heap.IgnoreBuffer(buffer)) { + if (IgnoreBuffer(buffer)) { continue; } @@ -159,24 +190,50 @@ StatusOr HeapSimulator::Run( CanShareOperandBufferWithUser( operand_buffer->instruction(), operand_buffer->index(), buffer->instruction(), buffer->index(), points_to_analysis)) { - heap.ShareBuffer(buffer, operand_buffer); + ShareBuffer(buffer, operand_buffer); shared = true; break; } } if (!shared) { - heap.Alloc(buffer); + Alloc(buffer); } } + // If the whole module is sequential, we can save memory by running the + // heap-simulation for sub-computations inline. E.g. the buffers for the + // condition and body of a kWhile instruction are only live for the duration + // of the instruction itself. + // + // The order that the sub-computations are simulated does not affect + // correctness; since the whole module is sequential, we know that the + // sub-computations will never be run concurrently. + if (module_sequence != nullptr) { + if (instruction->opcode() == HloOpcode::kCall || + instruction->opcode() == HloOpcode::kWhile) { + for (const HloComputation* called_computation : + instruction->called_computations()) { + const std::vector& called_sequence = + FindOrDie(*module_sequence, called_computation); + TF_RETURN_IF_ERROR(RunComputation(*called_computation, + called_sequence, points_to_analysis, + module_sequence)); + } + } + + // Other sub-computations (e.g. Map, Reduce, ...) are skipped; they are + // assigned "thread-local" allocations, meaning their buffers are not + // allocated up-front at the beginning of the computation. + } + // Free buffers that are no longer live. This is the earliest point that we // can de-allocate; right after the last use of the buffer. for (const LogicalBuffer* buffer : dead_buffers_to_free) { - heap.Free(buffer); + Free(buffer); } for (const LogicalBuffer* buffer : operand_buffers_to_free) { - heap.Free(buffer); + Free(buffer); } } @@ -187,10 +244,10 @@ StatusOr HeapSimulator::Run( const FlatSet& pending = buffer_pending.second; CHECK_EQ(pending.size(), 1) << *buffer; CHECK(*pending.begin() == nullptr) << *buffer; - heap.Free(buffer); + Free(buffer); } - return heap.Finish(); + return Status::OK(); } HeapSimulator::HeapSimulator( @@ -309,6 +366,11 @@ HeapSimulator::Result HeapSimulator::Finish() { result.chunk_map.emplace(buffer, chunk); } } + // If we were told to assign specific buffers, make sure we've assigned + // exactly that many buffers. 
+ if (buffers_to_assign_ != nullptr) { + CHECK_EQ(buffers_to_assign_->size(), result.chunk_map.size()); + } } // Fragmentation is the difference between the actual and ideal sizes. diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index 0ce29067678..3d980462619 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_ordering.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" @@ -63,17 +64,32 @@ class HeapSimulator { }; // Run the heap simulation with the given algorithm, assuming the given - // sequential ordering of instructions. The 'instruction_sequence' must - // contain a topologically-consistent total ordering of all instructions in - // the computation. The result is invalid if instructions are not run in - // exactly this sequence. + // module_sequence, which must contain a topologically-consistent total + // ordering of all instructions within each computation. The result is invalid + // if instructions are not run in exactly this sequence. + // + // Running heap simulation on the whole module tends to save memory, compared + // to running on a per-computation basis, since we can re-use buffer space for + // called sub-computations. // // If 'buffers_to_assign' is provided, only those buffers are assigned // offsets, otherwise all buffers defined by the instructions are assigned. + static StatusOr Run( + std::unique_ptr algorithm, const HloModule& module, + const SequentialHloOrdering::HloModuleSequence& module_sequence, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_fn, + const tensorflow::gtl::FlatSet* buffers_to_assign = + nullptr); + + // Same as above, but runs on a single computation. The 'instruction_sequence' + // must contain a topologically-consistent total ordering of all instructions + // in the computation. The result is invalid if instructions are not run in + // exactly this sequence. static StatusOr Run( std::unique_ptr algorithm, - const std::vector& instruction_sequence, const HloComputation& computation, + const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_fn, const tensorflow::gtl::FlatSet* buffers_to_assign = @@ -86,6 +102,12 @@ class HeapSimulator { const tensorflow::gtl::FlatSet* buffers_to_assign); ~HeapSimulator(); + Status RunComputation( + const HloComputation& computation, + const std::vector& instruction_sequence, + const TuplePointsToAnalysis& points_to_analysis, + const SequentialHloOrdering::HloModuleSequence* module_sequence); + bool IgnoreBuffer(const LogicalBuffer* buffer) const; void Alloc(const LogicalBuffer* buffer); void Free(const LogicalBuffer* buffer); diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index 874bd5f1060..0a6900f7330 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -19,13 +19,16 @@ limitations under the License. 
#include #include +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_ordering.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/gtl/flatmap.h" namespace xla { namespace { @@ -69,6 +72,7 @@ class HeapCallRecorder : public HeapAlgorithm { // sequence against an expected sequence. class HeapSimulatorTracker { public: + // Constructor for testing a single entry computation. HeapSimulatorTracker( const string& name, std::unique_ptr computation, const std::vector& instruction_sequence) { @@ -83,12 +87,48 @@ class HeapSimulatorTracker { auto zero_size = [](const LogicalBuffer& buffer) { return 0; }; auto algorithm = MakeUnique( MakeUnique(&actual_calls_)); - result_ = HeapSimulator::Run(std::move(algorithm), instruction_sequence, - *module_->entry_computation(), - *points_to_analysis_, zero_size) + result_ = HeapSimulator::Run( + std::move(algorithm), *module_->entry_computation(), + instruction_sequence, *points_to_analysis_, zero_size) .ConsumeValueOrDie(); } + explicit HeapSimulatorTracker(const string& name) { + module_ = MakeUnique(name); + } + + // Similar to the single entry computation constructor above, but runs the + // simulation over the entire module. + void RunWholeModule( + const std::vector& full_module_sequence) { + points_to_analysis_ = + TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); + + // Construct the module sequence grouped by computation. + SequentialHloOrdering::HloModuleSequence module_sequence; + tensorflow::gtl::FlatMap reverse_position; + for (int i = 0; i < full_module_sequence.size(); ++i) { + const HloInstruction* instruction = full_module_sequence[i]; + module_sequence[instruction->parent()].push_back(instruction); + reverse_position[instruction] = full_module_sequence.size() - i; + } + + // Hack the size_fn so that it returns a decreasing value as we step through + // the sequence. This lets us ensure the Alloc calls are in the sequence + // order. The Free calls are sorted by LogicalBuffer.id, which is at least + // deterministic. + auto size_fn = [&reverse_position](const LogicalBuffer& buffer) { + return reverse_position[buffer.instruction()]; + }; + auto algorithm = MakeUnique( + MakeUnique(&actual_calls_)); + result_ = HeapSimulator::Run(std::move(algorithm), *module_, + module_sequence, *points_to_analysis_, size_fn) + .ConsumeValueOrDie(); + } + + HloModule* module() { return module_.get(); } + // Returns the buffer defined at the given instruction and index. 
const LogicalBuffer* BufferAt(const HloInstruction* instruction, const ShapeIndex& index) const { @@ -358,6 +398,86 @@ TEST_F(HeapSimulatorTest, MultiplyDotDotTuple) { }); } +TEST_F(HeapSimulatorTest, WholeModule) { + HeapSimulatorTracker tracker(TestName()); + + const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); + const Shape tuple_shape = + ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape}); + + auto cond_builder = HloComputation::Builder("WhileCond"); + HloInstruction* cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "cond_param")); + HloInstruction* cond_iter = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0)); + HloInstruction* cond_data = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1)); + HloInstruction* cond_lt = cond_builder.AddInstruction( + HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}), + HloOpcode::kLt, cond_iter, cond_data)); + HloComputation* cond_computation = + tracker.module()->AddEmbeddedComputation(cond_builder.Build()); + + auto body_builder = HloComputation::Builder("WhileBody"); + HloInstruction* body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "body_param")); + HloComputation* body_computation = + tracker.module()->AddEmbeddedComputation(body_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "param")); + HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile( + tuple_shape, cond_computation, body_computation, param)); + tracker.module()->AddEntryComputation(builder.Build()); + + tracker.RunWholeModule( + {param, while_op, body_param, cond_param, cond_iter, cond_data, cond_lt}); + tracker.ExpectCallSequence({ + // The entry computation param and while_op are allocated first. + {kAlloc, tracker.BufferAt(param, {})}, + {kAlloc, tracker.BufferAt(param, {0})}, + {kAlloc, tracker.BufferAt(param, {1})}, + {kAlloc, tracker.BufferAt(while_op, {})}, + {kAlloc, tracker.BufferAt(while_op, {0})}, + {kAlloc, tracker.BufferAt(while_op, {1})}, + + // Now the while body param is allocated and freed. + {kAlloc, tracker.BufferAt(body_param, {})}, + {kAlloc, tracker.BufferAt(body_param, {0})}, + {kAlloc, tracker.BufferAt(body_param, {1})}, + {kFree, tracker.BufferAt(body_param, {})}, + {kFree, tracker.BufferAt(body_param, {0})}, + {kFree, tracker.BufferAt(body_param, {1})}, + + // Now the while cond param is allocated. The GTE instructions just alias + // the param elements, so the param tuple can immediately be freed. + {kAlloc, tracker.BufferAt(cond_param, {})}, + {kAlloc, tracker.BufferAt(cond_param, {0})}, + {kAlloc, tracker.BufferAt(cond_param, {1})}, + {kFree, tracker.BufferAt(cond_param, {})}, + + // Now the final cond less-than buffer is allocated. + {kAlloc, tracker.BufferAt(cond_lt, {})}, + + // The order of the remaining Free calls is based on the LogicalBuffer.id, + // which is deterministic, but not obvious. 
+ {kFree, tracker.BufferAt(param, {})}, + {kFree, tracker.BufferAt(param, {0})}, + {kFree, tracker.BufferAt(param, {1})}, + + {kFree, tracker.BufferAt(while_op, {})}, + {kFree, tracker.BufferAt(while_op, {0})}, + {kFree, tracker.BufferAt(while_op, {1})}, + + {kFree, tracker.BufferAt(cond_param, {0})}, + {kFree, tracker.BufferAt(cond_param, {1})}, + {kFree, tracker.BufferAt(cond_lt, {})}, + + {kFinish, nullptr}, + }); +} + // Base class for heap algorithm tests. class HeapAlgorithmTestBase : public ::testing::Test { protected: diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index 7476b72f029..725ce17d664 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -221,23 +221,6 @@ string SequentialHloOrdering::ToString() const { return tensorflow::str_util::Join(pieces, "\n"); } -namespace { -StatusOr MinimumMemoryForSequence( - const HloComputation& computation, - const std::vector& sequence, - const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function) { - // The absolute minimum memory required for a given sequence of instructions - // is determined by the sequence of Alloc and Free calls on a simulated heap, - // ignoring fragmentation. - TF_ASSIGN_OR_RETURN( - HeapSimulator::Result result, - HeapSimulator::Run(MakeUnique(), sequence, - computation, points_to_analysis, size_function)); - return result.heap_size; -} -} // namespace - StatusOr MinimumMemoryForSequence( const SequentialHloOrdering::HloModuleSequence& module_sequence, const LogicalBuffer::SizeFunction& size_function) { @@ -249,17 +232,16 @@ StatusOr MinimumMemoryForSequence( TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, TuplePointsToAnalysis::Run(module)); - int64 total_memory = 0; - for (const auto& pair : module_sequence) { - const HloComputation* computation = pair.first; - const std::vector& sequence = pair.second; - TF_ASSIGN_OR_RETURN( - const int64 memory, - MinimumMemoryForSequence(*computation, sequence, *points_to_analysis, - size_function)); - total_memory += memory; - } - return total_memory; + // The absolute minimum memory required for a given sequence of instructions + // is determined by the sequence of Alloc and Free calls on a simulated heap, + // ignoring fragmentation. We run the heap simulation on the whole module, + // rather than summing each computation, since it gives us a better lower + // bound, by minimizing the liveness of sub-computations. 
+ TF_ASSIGN_OR_RETURN( + HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique(), *module, + module_sequence, *points_to_analysis, size_function)); + return result.heap_size; } namespace { @@ -516,6 +498,18 @@ StatusOr> RunDFSMemoryScheduler( return sequence; } +StatusOr MinimumMemoryForComputation( + const HloComputation& computation, + const std::vector& sequence, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function) { + TF_ASSIGN_OR_RETURN( + HeapSimulator::Result result, + HeapSimulator::Run(MakeUnique(), computation, + sequence, points_to_analysis, size_function)); + return result.heap_size; +} + StatusOr> CreateMemoryMinimizingSequence( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, @@ -523,13 +517,17 @@ StatusOr> CreateMemoryMinimizingSequence( // We try both a list-scheduler based ordering and a DFS based ordering, and // choose whichever returns a lower min-memory, not accounting for // fragmentation. + // + // Note that this is just a heuristic. One obvious inaccuracy is that the + // memory required for sub-computations might be different when considered + // within the caller's context. But it's good enough for now. TF_ASSIGN_OR_RETURN( std::vector list_sequence, ListScheduler::Run(computation, points_to_analysis, size_function)); TF_ASSIGN_OR_RETURN( const int64 list_memory, - MinimumMemoryForSequence(computation, list_sequence, points_to_analysis, - size_function)); + MinimumMemoryForComputation(computation, list_sequence, + points_to_analysis, size_function)); VLOG(2) << "Min-memory list sequence: " << list_memory << " bytes"; TF_ASSIGN_OR_RETURN( @@ -537,8 +535,8 @@ StatusOr> CreateMemoryMinimizingSequence( RunDFSMemoryScheduler(computation, points_to_analysis, size_function)); TF_ASSIGN_OR_RETURN( const int64 dfs_memory, - MinimumMemoryForSequence(computation, dfs_sequence, points_to_analysis, - size_function)); + MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis, + size_function)); VLOG(2) << "Min-memory dfs sequence: " << dfs_memory << " bytes"; if (list_memory <= dfs_memory) { diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc index 01b5fd93644..c387fbb89b1 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc @@ -155,6 +155,65 @@ TEST_F(HloOrderingTest, InstructionsInDifferentComputations) { EXPECT_FALSE(ordering.ExecutesBefore(y, c)); } +class MinimumMemoryForSequenceTest : public HloTestBase {}; + +TEST_F(MinimumMemoryForSequenceTest, MultiComputation) { + HloModule module(TestName()); + const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); + const Shape tuple_shape = + ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape}); + + auto cond_builder = HloComputation::Builder("WhileCond"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "cond_param")); + HloInstruction* cond_iter = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 0)); + HloInstruction* cond_data = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1)); + // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte) + HloInstruction* cond_lt = cond_builder.AddInstruction( + 
HloInstruction::CreateBinary(ShapeUtil::MakeShape(PRED, {}), + HloOpcode::kLt, cond_iter, cond_data)); + HloComputation* cond_computation = + module.AddEmbeddedComputation(cond_builder.Build()); + + auto body_builder = HloComputation::Builder("WhileBody"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "body_param")); + HloComputation* body_computation = + module.AddEmbeddedComputation(body_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + // Entry params: 8 bytes (4 bytes per param), TOTAL=8 + HloInstruction* iter = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "param_iter")); + HloInstruction* data = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "param_data")); + // Tuple: 16 bytes (8 bytes per pointer), TOTAL=24 + HloInstruction* tuple = + builder.AddInstruction(HloInstruction::CreateTuple({iter, data})); + // While: 8 bytes (4 bytes per element), TOTAL=32 + // Both cond and body use a max of 24 bytes, TOTAL=56 + HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile( + tuple_shape, cond_computation, body_computation, tuple)); + HloComputation* entry_computation = + module.AddEntryComputation(builder.Build()); + + auto size_fn = [](const LogicalBuffer& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); + }; + + SequentialHloOrdering::HloModuleSequence module_sequence; + module_sequence[cond_computation] = {cond_param, cond_iter, cond_data, + cond_lt}; + module_sequence[body_computation] = {body_param}; + module_sequence[entry_computation] = {iter, data, tuple, while_op}; + EXPECT_EQ(56, + MinimumMemoryForSequence(module_sequence, size_fn).ValueOrDie()); +} + } // namespace } // namespace xla From a8d720c2b7e4260bee7020822168bfba852274ac Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Tue, 2 May 2017 18:02:26 -0800 Subject: [PATCH 31/51] Calling InferenceContext::UnknownShapes twice produces 2 shape handles for which ShapeHandle::SameHandle returns false. 
Therefore we need to merge the shape handles in
InferenceContext::set_input, InferenceContext::set_input_handle_shape, and
InferenceContext::set_output_handle_shape.
Change: 154911808
---
 .../core/common_runtime/shape_refiner.cc      |   4 +-
 tensorflow/core/framework/shape_inference.h   |  62 +++--
 .../core/grappler/costs/graph_properties.cc   |   2 +-
 .../grappler/costs/graph_properties_test.cc   | 227 ++++++++++++++++++
 4 files changed, 268 insertions(+), 27 deletions(-)

diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 8eb383a14fe..876f34b9911 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -163,7 +163,7 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool* refined) {
 InferenceContext* c = iter->second.get();
 DCHECK_GE(e->dst_input(), 0);
- if (node_context->set_input(e->dst_input(), c->output(e->src_output()))) {
+ if (node_context->MergeInput(e->dst_input(), c->output(e->src_output()))) {
 *refined = true;
 }
@@ -174,7 +174,7 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool* refined) {
 e->dst_input(), c->output_handle_dtype(e->src_output()))) {
 *refined = true;
 }
- if (node_context->set_input_handle_shape(
+ if (node_context->MergeInputHandleShape(
 e->dst_input(), c->output_handle_shape(e->src_output()))) {
 *refined = true;
 }
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index 71663027b3c..cebadcc5b45 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -191,16 +191,18 @@ class InferenceContext {
 return s;
 }
- // Set the shape of the input in position idx. This requires idx to be in the
- // [0, num_inputs) range. Returns true iff the stored input shape has been
- // updated with a different handle.
- bool set_input(int idx, ShapeHandle shape) {
- if (!inputs_[idx].SameHandle(shape)) {
- inputs_[idx] = shape;
- return true;
- } else {
+ // Merge the stored shape of the input in position idx with the specified
+ // shape. This requires idx to be in the [0, num_inputs) range. If the merge
+ // is successful and the new shape differs from the old one, store the new
+ // shape and return true. Return false otherwise.
+ bool MergeInput(int idx, ShapeHandle shape) {
+ ShapeHandle new_shape;
+ if (!Merge(inputs_[idx], shape, &new_shape).ok() ||
+ inputs_[idx].SameHandle(new_shape)) {
 return false;
 }
+ inputs_[idx] = new_shape;
+ return true;
 }
 ShapeHandle input(int64 idx) const { return inputs_[idx]; }
 Status input(StringPiece input_name, std::vector* output) const;
@@ -442,15 +444,18 @@ class InferenceContext {
 // propagate that information. Output handle dtypes and shapes are ignored if
 // the output tensor is not of type DT_RESOURCE.
- // Set the shape corresponding to the resource in position idx. This requires
- // idx to be in the [0, num_inputs) range. Returns true iff the stored shape
- // has been updated with a different handle.
- bool set_input_handle_shape(int idx, ShapeHandle shape) {
- if (!input_handle_shape_[idx].SameHandle(shape)) {
- input_handle_shape_[idx] = shape;
- return true;
+ // Merge the stored shape corresponding to the input handle in position idx
+ // with the specified shape. This requires idx to be in the [0, num_inputs)
+ // range. If the merge is successful and the new shape differs from the old
+ // one, store the new shape and return true. Return false otherwise. 
+ bool MergeInputHandleShape(int idx, ShapeHandle shape) { + ShapeHandle new_shape; + if (!Merge(input_handle_shape_[idx], shape, &new_shape).ok() || + input_handle_shape_[idx].SameHandle(new_shape)) { + return false; } - return false; + input_handle_shape_[idx] = shape; + return true; } // Set the type corresponding to the resource in position idx. This requires @@ -468,15 +473,24 @@ class InferenceContext { return input_handle_dtype_[idx]; } - // Set the shape corresponding to the resource in position idx. This requires - // idx to be in the [0, num_outputs) range. - // Returns true iff the stored shape has been updated with a different handle. - bool set_output_handle_shape(int idx, ShapeHandle shape) { - if (!output_handle_shape_[idx].SameHandle(shape)) { - output_handle_shape_[idx] = shape; - return true; + // Merge the stored shape corresponding to the output handle in position idx + // with the specified shape. This requires idx to be in the [0, num_outputs) + // range. If the merge is successful and the new shape differs from the old + // one, store the new shape and return true. Return false otherwise. + + bool MergeOutputHandleShape(int idx, ShapeHandle shape) { + ShapeHandle new_shape; + if (!Merge(output_handle_shape_[idx], shape, &new_shape).ok() || + output_handle_shape_[idx].SameHandle(new_shape)) { + return false; } - return false; + output_handle_shape_[idx] = shape; + return true; + } + // Overwrite the shape corresponding to the output handle in position idx with + // the specified shape. + void set_output_handle_shape(int idx, ShapeHandle shape) { + output_handle_shape_[idx] = shape; } // Set the type corresponding to the resource in position idx. This requires diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 31c1043ae69..75f2f16c5bb 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -85,7 +85,7 @@ Status GraphProperties::InferStatically() { } } if (qctx->set_output_handle_dtype(0, queue_type) || - qctx->set_output_handle_shape(0, queue_shp)) { + qctx->MergeOutputHandleShape(0, queue_shp)) { new_shapes.push(qnode); } } diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc index 94b809dc44e..be5ae3c3646 100644 --- a/tensorflow/core/grappler/costs/graph_properties_test.cc +++ b/tensorflow/core/grappler/costs/graph_properties_test.cc @@ -177,10 +177,14 @@ TEST_F(GraphPropertiesTest, Queues) { auto dequeue2 = ops::QueueDequeue(root.WithOpName("Dequeue2"), q2, {DataType::DT_FLOAT}); + // Create a queue that feeds itself. 
auto q3 = ops::RandomShuffleQueue(root.WithOpName("Queue3"), {DataType::DT_FLOAT}); auto dequeue3 = ops::QueueDequeue(root.WithOpName("Dequeue3"), q3, {DataType::DT_FLOAT}); + auto merge3 = ops::Merge(root.WithOpName("Merge3"), {dequeue3[0], square2}); + auto enqueue3 = + ops::QueueEnqueue(root.WithOpName("Enqueue3"), q3, {merge3.output}); auto q4 = ops::RandomShuffleQueue(root.WithOpName("Queue4"), {DataType::DT_FLOAT}); @@ -227,6 +231,229 @@ TEST_F(GraphPropertiesTest, Queues) { EXPECT_EQ(7, prop4.shape().dim(1).size()); } +TEST_F(GraphPropertiesTest, Loops) { + // Test graph produced in python using: + /* + with tf.Graph().as_default(): + i = tf.constant(0) + c = lambda i: tf.less(i, 10) + b = lambda i: tf.add(i, 1) + r = tf.while_loop(c, b, [i]) + with open('/tmp/graph.txt', 'w') as f: + f.write(str(tf.get_default_graph().as_graph_def())) + */ + const string gdef_ascii = R"EOF( +node { + name: "Const" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 0 + } + } + } +} +node { + name: "while/Enter" + op: "Enter" + input: "Const" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "frame_name" + value { + s: "while/while/" + } + } + attr { + key: "is_constant" + value { + b: false + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "while/Merge" + op: "Merge" + input: "while/Enter" + input: "while/NextIteration" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "while/Less/y" + op: "Const" + input: "^while/Merge" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 10 + } + } + } +} +node { + name: "while/Less" + op: "Less" + input: "while/Merge" + input: "while/Less/y" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "while/LoopCond" + op: "LoopCond" + input: "while/Less" +} +node { + name: "while/Switch" + op: "Switch" + input: "while/Merge" + input: "while/LoopCond" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_class" + value { + list { + s: "loc:@while/Merge" + } + } + } +} +node { + name: "while/Identity" + op: "Identity" + input: "while/Switch:1" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "while/Add/y" + op: "Const" + input: "^while/Identity" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "while/Add" + op: "Add" + input: "while/Identity" + input: "while/Add/y" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "while/NextIteration" + op: "NextIteration" + input: "while/Add" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "while/Exit" + op: "Exit" + input: "while/Switch" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +versions { + producer: 11 +} + )EOF"; + + GrapplerItem item; + CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph)); + GraphProperties properties(item); + TF_CHECK_OK(properties.InferStatically()); + + const auto props = properties.GetOutputProperties("while/Exit"); + EXPECT_EQ(1, props.size()); + const OpInfo::TensorProperties& prop = props[0]; + EXPECT_EQ(DT_INT32, prop.dtype()); + EXPECT_TRUE(prop.shape().unknown_rank()); +} + } // 
namespace } // namespace grappler } // namespace tensorflow From 3af03be757b63ea6fbd28cc351d5d2323c526354 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Tue, 2 May 2017 18:56:32 -0800 Subject: [PATCH 32/51] tfdbg: internal-only changes Change: 154914490 --- tensorflow/python/debug/wrappers/framework.py | 6 ------ tensorflow/python/debug/wrappers/framework_test.py | 12 ------------ tensorflow/tools/dist_test/server/BUILD | 2 +- .../tools/dist_test/server/grpc_tensorflow_server.py | 12 +++++++++++- 4 files changed, 12 insertions(+), 20 deletions(-) mode change 100755 => 100644 tensorflow/tools/dist_test/server/grpc_tensorflow_server.py diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py index 50645c1c874..0d8616a69fb 100644 --- a/tensorflow/python/debug/wrappers/framework.py +++ b/tensorflow/python/debug/wrappers/framework.py @@ -348,12 +348,6 @@ class BaseDebugWrapperSession(session.SessionInterface): _check_type(sess, session.BaseSession) - # TODO(cais): Remove this check once tfdbg is integrated with GrpcSession. - if sess.sess_str: - raise NotImplementedError( - "Non-DirectSession support is not available from TensorFlow " - "Debugger yet (sess_str=%s)" % sess.sess_str) - # The session being wrapped. self._sess = sess self._thread_name_filter_pattern = (re.compile(thread_name_filter) diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py index 1d69c7769a2..fd0efcd925f 100644 --- a/tensorflow/python/debug/wrappers/framework_test.py +++ b/tensorflow/python/debug/wrappers/framework_test.py @@ -384,18 +384,6 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase): ["a_init", "b_init"], [datum.node_name for datum in dump.dumped_tensor_data]) - def testUsingNonDirectSessionRaisesNotImplementedError(self): - # TODO(cais): Remove this test once tfdbg is integrated with GrpcSession. 
- fake_non_direct_session = session.Session() - fake_non_direct_session._target = "foo" - - with self.assertRaisesRegexp( - NotImplementedError, - r"Non-DirectSession support is not available from TensorFlow Debugger " - r"yet \(sess_str=foo\)"): - TestDebugWrapperSession( - fake_non_direct_session, self._dump_root, self._observer) - if __name__ == "__main__": googletest.main() diff --git a/tensorflow/tools/dist_test/server/BUILD b/tensorflow/tools/dist_test/server/BUILD index 9d008ec9ce5..865af8dd7b2 100644 --- a/tensorflow/tools/dist_test/server/BUILD +++ b/tensorflow/tools/dist_test/server/BUILD @@ -9,7 +9,7 @@ exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "py_test") -py_library( +py_binary( name = "grpc_tensorflow_server", srcs = [ "grpc_tensorflow_server.py", diff --git a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py old mode 100755 new mode 100644 index 2d774577b6d..bd6700a0b1f --- a/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py +++ b/tensorflow/tools/dist_test/server/grpc_tensorflow_server.py @@ -36,6 +36,7 @@ from __future__ import print_function import argparse import sys +from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import tensorflow_server_pb2 from tensorflow.python.platform import app from tensorflow.python.training import server_lib @@ -103,8 +104,11 @@ def main(unused_args): raise ValueError("Invalid task_id: %d" % FLAGS.task_id) server_def.task_index = FLAGS.task_id + config = config_pb2.ConfigProto(gpu_options=config_pb2.GPUOptions( + per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)) + # Create GRPC Server instance - server = server_lib.Server(server_def) + server = server_lib.Server(server_def, config=config) # join() is blocking, unlike start() server.join() @@ -137,6 +141,11 @@ if __name__ == "__main__": default=0, help="Task index, e.g., 0" ) + parser.add_argument( + "--gpu_memory_fraction", + type=float, + default=1.0, + help="Fraction of GPU memory allocated",) parser.add_argument( "--verbose", type="bool", @@ -145,5 +154,6 @@ if __name__ == "__main__": default=False, help="Verbose mode" ) + FLAGS, unparsed = parser.parse_known_args() app.run(main=main, argv=[sys.argv[0]] + unparsed) From 485a24eda09965b83af1b2218bc12c529cc35c91 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Tue, 2 May 2017 19:24:23 -0800 Subject: [PATCH 33/51] [tf layers] Delay marking a layer as built until the end of its first apply(). This allows the layer's call() method to call add_variable, making it much easier to create variables while building the layer's logic. 
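The effect is easiest to see in a stripped-down sketch (toy Python; `Layer`,
`apply`, and `add_variable` below are invented stand-ins, not the real
tf.layers classes):

    class Layer(object):
        def __init__(self):
            self.built = False
            self._vars = {}

        def add_variable(self, name):
            # Mirrors the real constraint: no brand-new variables once built.
            if self.built and name not in self._vars:
                raise ValueError('cannot create %r after layer is built' % name)
            return self._vars.setdefault(name, 0.0)

        def call(self, x):
            # Only works on the first apply() because built is still False.
            return x + self.add_variable('my_var')

        def apply(self, x):
            outputs = self.call(x)
            # Previously built was set *before* call(); now it is set after,
            # so call() may create variables on the first application.
            self.built = True
            return outputs

    layer = Layer()
    layer.apply(1.0)  # first call: creates my_var
    layer.apply(1.0)  # later calls: reuse my_var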
Change: 154916035 --- .../contrib/keras/python/keras/layers/core.py | 1 + .../keras/python/keras/layers/merge.py | 3 ++ .../keras/python/keras/layers/wrappers.py | 1 + tensorflow/python/layers/base.py | 4 +-- tensorflow/python/layers/base_test.py | 30 +++++++++++++++++++ tensorflow/python/layers/convolutional.py | 4 ++- tensorflow/python/layers/core.py | 1 + tensorflow/python/layers/normalization.py | 1 + tensorflow/python/layers/pooling.py | 3 ++ 9 files changed, 45 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/keras/python/keras/layers/core.py b/tensorflow/contrib/keras/python/keras/layers/core.py index 7a9e0d1736f..0b6cdc65a4f 100644 --- a/tensorflow/contrib/keras/python/keras/layers/core.py +++ b/tensorflow/contrib/keras/python/keras/layers/core.py @@ -741,6 +741,7 @@ class Dense(tf_core_layers.Dense, Layer): self.constraints[self.kernel] = self.kernel_constraint if self.use_bias and self.bias_constraint: self.constraints[self.bias] = self.bias_constraint + self.built = True def get_config(self): config = { diff --git a/tensorflow/contrib/keras/python/keras/layers/merge.py b/tensorflow/contrib/keras/python/keras/layers/merge.py index 25921979bdd..b4bb9935fde 100644 --- a/tensorflow/contrib/keras/python/keras/layers/merge.py +++ b/tensorflow/contrib/keras/python/keras/layers/merge.py @@ -111,6 +111,7 @@ class _Merge(Layer): self._reshape_required = False else: self._reshape_required = True + self.built = True def call(self, inputs): if self._reshape_required: @@ -302,6 +303,7 @@ class Concatenate(_Merge): 'inputs with matching shapes ' 'except for the concat axis. ' 'Got inputs shapes: %s' % (input_shape)) + self.built = True def call(self, inputs): if not isinstance(inputs, list): @@ -414,6 +416,7 @@ class Dot(_Merge): raise ValueError('Dimension incompatibility ' '%s != %s. ' % (shape1[axes[0]], shape2[axes[1]]) + 'Layer shapes: %s, %s' % (shape1, shape2)) + self.built = True def call(self, inputs): x1 = inputs[0] diff --git a/tensorflow/contrib/keras/python/keras/layers/wrappers.py b/tensorflow/contrib/keras/python/keras/layers/wrappers.py index ce6458fd0c8..092501cb114 100644 --- a/tensorflow/contrib/keras/python/keras/layers/wrappers.py +++ b/tensorflow/contrib/keras/python/keras/layers/wrappers.py @@ -166,6 +166,7 @@ class TimeDistributed(Wrapper): self.layer.build(child_input_shape) self.layer.built = True super(TimeDistributed, self).build() + self.built = True def _compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py index f6b816333ea..cfcd844800c 100644 --- a/tensorflow/python/layers/base.py +++ b/tensorflow/python/layers/base.py @@ -335,7 +335,7 @@ class Layer(object): def add_variable(self, name, shape, dtype=None, initializer=None, regularizer=None, trainable=True): - """Adds a new variable to the layer. + """Adds a new variable to the layer, or gets an existing one; returns it. Arguments: name: variable name. @@ -424,7 +424,6 @@ class Layer(object): self.build(input_shapes[0]) else: self.build(input_shapes) - self.built = True if 'scope' in tf_inspect.getargspec(self.call).args: kwargs['scope'] = scope outputs = self.call(inputs, *args, **kwargs) @@ -443,6 +442,7 @@ class Layer(object): # Update global default collections. 
_add_elements_to_collection(self.updates, ops.GraphKeys.UPDATE_OPS) + self.built = True return outputs @property diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py index 9acf1c05e2a..9e2457a4891 100644 --- a/tensorflow/python/layers/base_test.py +++ b/tensorflow/python/layers/base_test.py @@ -153,6 +153,36 @@ class BaseLayerTest(test.TestCase): self.assertEqual(layer.built, True) self.assertEqual(outputs.op.name, 'my_layer/Square') + def testFirstCallCanCreateVariablesButSecondCanNotWhenBuildEmpty(self): + + class MyLayer(base_layers.Layer): + + def build(self, _): + # Do not mark the layer as built. + pass + + def call(self, inputs): + self.my_var = self.add_variable('my_var', [2, 2]) + if self.built: + # Skip creating on the first call; try to create after it's + # built. This is expected to fail. + self.add_variable('this_will_break_on_second_call', [2, 2]) + return inputs + math_ops.square(self.my_var) + + layer = MyLayer(name='my_layer') + inputs = random_ops.random_uniform((2,), seed=1) + outputs = layer.apply(inputs) + self.assertEqual(layer.built, True) + self.assertEqual(outputs.op.name, 'my_layer/add') + self.assertListEqual( + [v.name for v in layer.variables], ['my_layer/my_var:0']) + with self.assertRaisesRegexp(ValueError, + 'my_layer/this_will_break_on_second_call'): + layer.apply(inputs) + # The list of variables hasn't changed. + self.assertListEqual( + [v.name for v in layer.variables], ['my_layer/my_var:0']) + def testDeepCopy(self): class MyLayer(base_layers.Layer): diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py index 50709bb51da..b2fe9feb442 100644 --- a/tensorflow/python/layers/convolutional.py +++ b/tensorflow/python/layers/convolutional.py @@ -145,6 +145,7 @@ class _Conv(base.Layer): dtype=self.dtype) else: self.bias = None + self.built = True def call(self, inputs): outputs = nn.convolution( @@ -837,6 +838,7 @@ class SeparableConv2D(Conv2D): dtype=self.dtype) else: self.bias = None + self.built = True def call(self, inputs): if self.data_format == 'channels_first': @@ -1070,6 +1072,7 @@ class Conv2DTranspose(Conv2D): dtype=self.dtype) else: self.bias = None + self.built = True def call(self, inputs): inputs_shape = array_ops.shape(inputs) @@ -1224,4 +1227,3 @@ convolution2d = conv2d convolution3d = conv3d separable_convolution2d = separable_conv2d convolution2d_transpose = deconvolution2d = deconv2d = conv2d_transpose - diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py index 49f6499ca47..1ec4e51e5ea 100644 --- a/tensorflow/python/layers/core.py +++ b/tensorflow/python/layers/core.py @@ -130,6 +130,7 @@ class Dense(base.Layer): trainable=True) else: self.bias = None + self.built = True def call(self, inputs): inputs = ops.convert_to_tensor(inputs, dtype=self.dtype) diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py index 2970ddb8ce1..871f840c529 100644 --- a/tensorflow/python/layers/normalization.py +++ b/tensorflow/python/layers/normalization.py @@ -201,6 +201,7 @@ class BatchNormalization(base.Layer): 'renorm_stddev_weight', ()) finally: self._scope.set_partitioner(partitioner) + self.built = True def _renorm_correction_and_moments(self, mean, variance, training): """Returns the correction and update values for renorm.""" diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py index b8193729239..a1dfab09de3 100644 --- a/tensorflow/python/layers/pooling.py +++ 
b/tensorflow/python/layers/pooling.py @@ -71,6 +71,7 @@ class _Pooling1D(base.Layer): if len(input_shape) != 3: raise ValueError('Inputs should have rank 3. ' 'Received input shape:', str(input_shape)) + self.built = True def call(self, inputs): # There is no TF op for 1D pooling, hence we make the inputs 4D. @@ -261,6 +262,7 @@ class _Pooling2D(base.Layer): if len(input_shape) != 4: raise ValueError('Inputs should have rank 4. ' 'Received input shape:', str(input_shape)) + self.built = True def call(self, inputs): if self.data_format == 'channels_last': @@ -448,6 +450,7 @@ class _Pooling3D(base.Layer): if len(input_shape) != 5: raise ValueError('Inputs should have rank 5. ' 'Received input shape:', str(input_shape)) + self.built = True def call(self, inputs): pool_shape = (1,) + self.pool_size + (1,) From a0a56e977e0a495df085bd6e5fa05664ebf05789 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 3 May 2017 00:35:09 -0800 Subject: [PATCH 34/51] Allow importers of scoped metagraphs to specify collections to restore with a predicate. Predicates offer a flexible generalization of blacklists & whitelists. Change: 154931947 --- tensorflow/python/framework/meta_graph.py | 13 +++- .../python/framework/meta_graph_test.py | 60 +++++++++++++++++++ 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py index 26344d38528..7b13fcfbdf2 100644 --- a/tensorflow/python/framework/meta_graph.py +++ b/tensorflow/python/framework/meta_graph.py @@ -422,14 +422,15 @@ def import_scoped_meta_graph(meta_graph_or_file, graph=None, import_scope=None, input_map=None, - unbound_inputs_col_name="unbound_inputs"): - """Recreates a`Graph` saved in a `MetaGraphDef` proto. + unbound_inputs_col_name="unbound_inputs", + restore_collections_predicate=(lambda key: True)): + """Recreates a `Graph` saved in a `MetaGraphDef` proto. This function takes a `MetaGraphDef` protocol buffer as input. If the argument is a file containing a `MetaGraphDef` protocol buffer , it constructs a protocol buffer from the file content. The function then adds all the nodes from the `graph_def` field to the - current graph, recreates all the collections, and returns a saver + current graph, recreates the desired collections, and returns a saver constructed from the `saver_def` field. In combination with `export_scoped_meta_graph()`, this function can be used to @@ -453,6 +454,10 @@ def import_scoped_meta_graph(meta_graph_or_file, `Tensor` objects. The values of the named input tensors in the imported graph will be re-mapped to the respective `Tensor` values. unbound_inputs_col_name: Collection name for looking up unbound inputs. + restore_collections_predicate: a predicate on collection names. A collection + named c (i.e whose key is c) will be restored iff + 1) `restore_collections_predicate(c)` is True, and + 2) `c != unbound_inputs_col_name`. Returns: A dictionary of all the `Variables` imported into the name scope. @@ -503,6 +508,8 @@ def import_scoped_meta_graph(meta_graph_or_file, # Don't add unbound_inputs to the new graph. 
if key == unbound_inputs_col_name: continue + if not restore_collections_predicate(key): + continue kind = col_def.WhichOneof("kind") if kind is None: diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py index f8056ade3e4..49d59977846 100644 --- a/tensorflow/python/framework/meta_graph_test.py +++ b/tensorflow/python/framework/meta_graph_test.py @@ -335,6 +335,66 @@ class ScopedMetaGraphTest(test.TestCase): for a, b in zip(orig_meta_graphs, new_meta_graphs): test_util.assert_meta_graph_protos_equal(self, a, b) + def testScopedImportWithSelectedCollections(self): + meta_graph_filename = os.path.join( + _TestDir("selected_collections_import"), "meta_graph.pb") + + graph = ops.Graph() + # Add a variable to populate two collections. The functionality tested is + # not specific to variables, but using variables in the test is convenient. + with graph.as_default(): + variables.Variable(initial_value=1.0, trainable=True) + self.assertTrue( + all([ + graph.get_collection(key) + for key in + [ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES] + ])) + meta_graph.export_scoped_meta_graph( + filename=meta_graph_filename, graph=graph) + + def _test_import(include_collection_keys, omit_collection_keys): + assert set(include_collection_keys).isdisjoint(omit_collection_keys) + newgraph = ops.Graph() + import_scope = "some_scope_name" + + def _restore_collections_predicate(collection_key): + return (collection_key in include_collection_keys and + collection_key not in omit_collection_keys) + + meta_graph.import_scoped_meta_graph( + meta_graph_filename, + graph=newgraph, + import_scope=import_scope, + restore_collections_predicate=_restore_collections_predicate) + collection_values = [ + newgraph.get_collection(name=key, scope=import_scope) + for key in include_collection_keys + ] + self.assertTrue(all(collection_values)) + collection_values = [ + newgraph.get_collection(name=key, scope=import_scope) + for key in omit_collection_keys + ] + self.assertFalse(any(collection_values)) + + _test_import( + include_collection_keys=[ + ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES + ], + omit_collection_keys=[]) + _test_import( + include_collection_keys=[ops.GraphKeys.GLOBAL_VARIABLES], + omit_collection_keys=[ops.GraphKeys.TRAINABLE_VARIABLES]) + _test_import( + include_collection_keys=[ops.GraphKeys.TRAINABLE_VARIABLES], + omit_collection_keys=[ops.GraphKeys.GLOBAL_VARIABLES]) + _test_import( + include_collection_keys=[], + omit_collection_keys=[ + ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES + ]) + def _testScopedExportWithQueue(self, test_dir, exported_filename): graph = ops.Graph() with graph.as_default(): From 562136cf7fb887b5dba755319263230062d512e0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 3 May 2017 06:16:09 -0800 Subject: [PATCH 35/51] [TF:XLA] Set metadata of all added HLO instructions when lowering computations. 
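The change funnels every created instruction through a single helper so that no OpRequest case can forget to stamp the metadata. The patch implements this as a C++ lambda (`add_instruction`) inside `ComputationLowerer::Visit`; the following Python sketch of the same wrapper pattern is only illustrative:

```python
def make_add_instruction(builder, metadata):
  """Returns a helper that adds an instruction and stamps its metadata."""
  def add_instruction(instruction):
    hlo = builder.add_instruction(instruction)  # build the HLO as before
    hlo.set_metadata(metadata)  # set in exactly one place for all op cases
    return hlo
  return add_instruction
```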
Change: 154952289 --- .../compiler/xla/service/user_computation.cc | 164 ++++++++---------- .../xla/service/user_computation_test.cc | 5 + 2 files changed, 79 insertions(+), 90 deletions(-) diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc index 34e8ee8acad..e9fcc9fa666 100644 --- a/tensorflow/compiler/xla/service/user_computation.cc +++ b/tensorflow/compiler/xla/service/user_computation.cc @@ -1928,6 +1928,12 @@ HloInstruction* ComputationLowerer::Visit( const OperationRequest& request = session_computation_.requests().at(handle.handle()); + auto add_instruction = [&](std::unique_ptr instruction) { + HloInstruction* hlo_instruction = + hlo_builder_.AddInstruction(std::move(instruction)); + hlo_instruction->set_metadata(request.request().metadata()); + return hlo_instruction; + }; HloInstruction* hlo_instruction; switch (request.request().op_case()) { case OpRequest::kRngRequest: { @@ -1936,7 +1942,7 @@ HloInstruction* ComputationLowerer::Visit( for (const ComputationDataHandle& param : rng_request.parameter()) { parameters.push_back(Visit(param, visited)); } - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateRng( + hlo_instruction = add_instruction(HloInstruction::CreateRng( request.output_shape(), rng_request.distribution(), parameters)); break; } @@ -1944,9 +1950,8 @@ HloInstruction* ComputationLowerer::Visit( case OpRequest::kConstantRequest: { const ConstantRequest& constant_request = request.request().constant_request(); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CloneToUnique(constant_request.literal()))); + hlo_instruction = add_instruction(HloInstruction::CreateConstant( + LiteralUtil::CloneToUnique(constant_request.literal()))); break; } @@ -1955,17 +1960,15 @@ HloInstruction* ComputationLowerer::Visit( request.request().get_tuple_element_request(); HloInstruction* operand = Visit(get_tuple_element_request.operand(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateGetTupleElement( - request.output_shape(), operand, - get_tuple_element_request.index())); + hlo_instruction = add_instruction(HloInstruction::CreateGetTupleElement( + request.output_shape(), operand, get_tuple_element_request.index())); break; } case OpRequest::kSliceRequest: { const SliceRequest& slice_request = request.request().slice_request(); HloInstruction* operand = Visit(slice_request.operand(), visited); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateSlice( + hlo_instruction = add_instruction(HloInstruction::CreateSlice( request.output_shape(), operand, AsInt64Slice(slice_request.start_indices()), AsInt64Slice(slice_request.limit_indices()))); @@ -1979,10 +1982,9 @@ HloInstruction* ComputationLowerer::Visit( HloInstruction* start_indices = Visit(dynamic_slice_request.start_indices(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateDynamicSlice( - request.output_shape(), operand, start_indices, - AsInt64Slice(dynamic_slice_request.slice_sizes()))); + hlo_instruction = add_instruction(HloInstruction::CreateDynamicSlice( + request.output_shape(), operand, start_indices, + AsInt64Slice(dynamic_slice_request.slice_sizes()))); break; } @@ -1996,7 +1998,7 @@ HloInstruction* ComputationLowerer::Visit( HloInstruction* start_indices = Visit(dynamic_update_slice_request.start_indices(), visited); hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + 
add_instruction(HloInstruction::CreateDynamicUpdateSlice( request.output_shape(), operand, update, start_indices)); break; } @@ -2010,9 +2012,8 @@ HloInstruction* ComputationLowerer::Visit( HloInstruction* operand = Visit(handle, visited); operands.push_back(operand); } - hlo_instruction = hlo_builder_.AddInstruction( - HloInstruction::CreateConcatenate(request.output_shape(), operands, - concatenate_request.dimension())); + hlo_instruction = add_instruction(HloInstruction::CreateConcatenate( + request.output_shape(), operands, concatenate_request.dimension())); break; } @@ -2021,10 +2022,9 @@ HloInstruction* ComputationLowerer::Visit( request.request().convolve_request(); HloInstruction* lhs = Visit(convolve_request.lhs(), visited); HloInstruction* rhs = Visit(convolve_request.rhs(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateConvolve( - request.output_shape(), lhs, rhs, convolve_request.window(), - convolve_request.dimension_numbers())); + hlo_instruction = add_instruction(HloInstruction::CreateConvolve( + request.output_shape(), lhs, rhs, convolve_request.window(), + convolve_request.dimension_numbers())); break; } @@ -2033,17 +2033,15 @@ HloInstruction* ComputationLowerer::Visit( request.request().cross_replica_sum_request(); HloInstruction* operand = Visit(cross_replica_sum_request.operand(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateCrossReplicaSum( - request.output_shape(), operand)); + hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum( + request.output_shape(), operand)); break; } case OpRequest::kInfeedRequest: { const InfeedRequest& infeed_request = request.request().infeed_request(); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateInfeed( - request.output_shape(), infeed_request.config())); + hlo_instruction = add_instruction(HloInstruction::CreateInfeed( + request.output_shape(), infeed_request.config())); break; } @@ -2051,9 +2049,8 @@ HloInstruction* ComputationLowerer::Visit( const OutfeedRequest& outfeed_request = request.request().outfeed_request(); HloInstruction* operand = Visit(outfeed_request.operand(), visited); - hlo_instruction = hlo_builder_.AddInstruction( - HloInstruction::CreateOutfeed(outfeed_request.shape(), operand, - outfeed_request.outfeed_config())); + hlo_instruction = add_instruction(HloInstruction::CreateOutfeed( + outfeed_request.shape(), operand, outfeed_request.outfeed_config())); break; } @@ -2069,7 +2066,7 @@ HloInstruction* ComputationLowerer::Visit( request.embedded_computation_versions(0); HloComputation* map_computation = ResolveComputation(map_request.to_apply(), map_version); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateMap( + hlo_instruction = add_instruction(HloInstruction::CreateMap( request.output_shape(), operands, map_computation)); break; } @@ -2083,10 +2080,9 @@ HloInstruction* ComputationLowerer::Visit( request.embedded_computation_versions(0); HloComputation* reduce_computation = ResolveComputation(reduce_request.to_apply(), reduce_version); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateReduce( - request.output_shape(), operand, init_value, - AsInt64Slice(reduce_request.dimensions()), reduce_computation)); + hlo_instruction = add_instruction(HloInstruction::CreateReduce( + request.output_shape(), operand, init_value, + AsInt64Slice(reduce_request.dimensions()), reduce_computation)); break; } @@ -2101,10 +2097,9 @@ HloInstruction* ComputationLowerer::Visit( 
request.embedded_computation_versions(0); HloComputation* reduce_window_computation = ResolveComputation( reduce_window_request.to_apply(), reduce_window_version); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateReduceWindow( - request.output_shape(), operand, init_value, - reduce_window_request.window(), reduce_window_computation)); + hlo_instruction = add_instruction(HloInstruction::CreateReduceWindow( + request.output_shape(), operand, init_value, + reduce_window_request.window(), reduce_window_computation)); break; } @@ -2126,11 +2121,10 @@ HloInstruction* ComputationLowerer::Visit( select_and_scatter_request.select(), select_version); HloComputation* scatter_computation = ResolveComputation( select_and_scatter_request.scatter(), scatter_version); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateSelectAndScatter( - request.output_shape(), operand, select_computation, - select_and_scatter_request.window(), source, init_value, - scatter_computation)); + hlo_instruction = add_instruction(HloInstruction::CreateSelectAndScatter( + request.output_shape(), operand, select_computation, + select_and_scatter_request.window(), source, init_value, + scatter_computation)); break; } @@ -2151,9 +2145,8 @@ HloInstruction* ComputationLowerer::Visit( ShapeUtil::Rank(request.output_shape()) - ShapeUtil::Rank(operand->shape())); } - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast( - request.output_shape(), operand, broadcast_dimensions)); + hlo_instruction = add_instruction(HloInstruction::CreateBroadcast( + request.output_shape(), operand, broadcast_dimensions)); break; } @@ -2165,14 +2158,13 @@ HloInstruction* ComputationLowerer::Visit( if (IsIdentityPermutation(AsInt64Slice(reshape_request.dimensions()))) { transposed = operand; } else { - transposed = - hlo_builder_.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::PermuteDimensions(InversePermutation(AsInt64Slice( - reshape_request.dimensions())), - operand->shape()), - operand, AsInt64Slice(reshape_request.dimensions()))); + transposed = add_instruction(HloInstruction::CreateTranspose( + ShapeUtil::PermuteDimensions( + InversePermutation(AsInt64Slice(reshape_request.dimensions())), + operand->shape()), + operand, AsInt64Slice(reshape_request.dimensions()))); } - hlo_instruction = hlo_builder_.AddInstruction( + hlo_instruction = add_instruction( HloInstruction::CreateReshape(request.output_shape(), transposed)); break; } @@ -2181,12 +2173,11 @@ HloInstruction* ComputationLowerer::Visit( const TransposeRequest& transpose_request = request.request().transpose_request(); HloInstruction* operand = Visit(transpose_request.operand(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::PermuteDimensions(InversePermutation(AsInt64Slice( - transpose_request.dimensions())), - operand->shape()), - operand, AsInt64Slice(transpose_request.dimensions()))); + hlo_instruction = add_instruction(HloInstruction::CreateTranspose( + ShapeUtil::PermuteDimensions( + InversePermutation(AsInt64Slice(transpose_request.dimensions())), + operand->shape()), + operand, AsInt64Slice(transpose_request.dimensions()))); break; } @@ -2194,10 +2185,9 @@ HloInstruction* ComputationLowerer::Visit( const ReverseRequest& reverse_request = request.request().reverse_request(); HloInstruction* operand = Visit(reverse_request.operand(), visited); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateReverse( - request.output_shape(), 
operand, - AsInt64Slice(reverse_request.dimensions()))); + hlo_instruction = add_instruction(HloInstruction::CreateReverse( + request.output_shape(), operand, + AsInt64Slice(reverse_request.dimensions()))); break; } @@ -2206,7 +2196,7 @@ HloInstruction* ComputationLowerer::Visit( HloInstruction* operand = Visit(pad_request.operand(), visited); HloInstruction* padding_value = Visit(pad_request.padding_value(), visited); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreatePad( + hlo_instruction = add_instruction(HloInstruction::CreatePad( request.output_shape(), operand, padding_value, pad_request.padding_config())); break; @@ -2214,7 +2204,7 @@ HloInstruction* ComputationLowerer::Visit( case OpRequest::kRecvRequest: { const RecvRequest& recv_request = request.request().recv_request(); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateRecv( + hlo_instruction = add_instruction(HloInstruction::CreateRecv( request.output_shape(), recv_request.channel_handle().handle())); break; } @@ -2222,10 +2212,9 @@ HloInstruction* ComputationLowerer::Visit( case OpRequest::kParameterRequest: { const ParameterRequest& parameter_request = request.request().parameter_request(); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateParameter( - parameter_request.parameter(), request.output_shape(), - parameter_request.name())); + hlo_instruction = add_instruction(HloInstruction::CreateParameter( + parameter_request.parameter(), request.output_shape(), + parameter_request.name())); break; } @@ -2233,7 +2222,7 @@ HloInstruction* ComputationLowerer::Visit( const ConvertRequest& convert_request = request.request().convert_request(); HloInstruction* operand = Visit(convert_request.operand(), visited); - hlo_instruction = hlo_builder_.AddInstruction( + hlo_instruction = add_instruction( HloInstruction::CreateConvert(request.output_shape(), operand)); break; } @@ -2250,7 +2239,7 @@ HloInstruction* ComputationLowerer::Visit( HloComputation* body = ResolveComputation(while_request.body(), body_version); HloInstruction* init = Visit(while_request.init(), visited); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateWhile( + hlo_instruction = add_instruction(HloInstruction::CreateWhile( request.output_shape(), condition, body, init)); break; } @@ -2262,9 +2251,8 @@ HloInstruction* ComputationLowerer::Visit( HloInstruction* rhs = Visit(ternary_op_request.rhs(), visited); HloInstruction* ehs = Visit(ternary_op_request.ehs(), visited); auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop()); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateTernary( - request.output_shape(), hlo_opcode, lhs, rhs, ehs)); + hlo_instruction = add_instruction(HloInstruction::CreateTernary( + request.output_shape(), hlo_opcode, lhs, rhs, ehs)); break; } @@ -2279,9 +2267,8 @@ HloInstruction* ComputationLowerer::Visit( } auto hlo_opcode = VariadicOperationToHloOpcode(variadic_op_request.varop()); - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateVariadic( - request.output_shape(), hlo_opcode, operands)); + hlo_instruction = add_instruction(HloInstruction::CreateVariadic( + request.output_shape(), hlo_opcode, operands)); break; } @@ -2296,7 +2283,7 @@ HloInstruction* ComputationLowerer::Visit( request.embedded_computation_versions(0); HloComputation* call_computation = ResolveComputation(call_request.to_apply(), call_version); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateCall( + 
hlo_instruction = add_instruction(HloInstruction::CreateCall( request.output_shape(), operands, call_computation)); break; } @@ -2308,9 +2295,8 @@ HloInstruction* ComputationLowerer::Visit( for (const ComputationDataHandle& operand : cc_request.operands()) { operands.push_back(Visit(operand, visited)); } - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateCustomCall( - cc_request.shape(), operands, cc_request.call_target_name())); + hlo_instruction = add_instruction(HloInstruction::CreateCustomCall( + cc_request.shape(), operands, cc_request.call_target_name())); break; } @@ -2319,7 +2305,7 @@ HloInstruction* ComputationLowerer::Visit( request.request().unary_op_request(); HloInstruction* operand = Visit(unary_op_request.operand(), visited); auto hlo_opcode = UnaryOperationToHloOpcode(unary_op_request.unop()); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateUnary( + hlo_instruction = add_instruction(HloInstruction::CreateUnary( request.output_shape(), hlo_opcode, operand)); break; } @@ -2347,23 +2333,22 @@ HloInstruction* ComputationLowerer::Visit( // identical to the HLO broadcast semantics so the broadcast_dimensions // field can just be passed to the instruction builder. HloInstruction* broadcasted_operand = - hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast( + add_instruction(HloInstruction::CreateBroadcast( broadcast_shape, operand_to_broadcast, AsInt64Slice(binary_op_request.broadcast_dimensions()))); lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs; rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs; } - hlo_instruction = - hlo_builder_.AddInstruction(HloInstruction::CreateBinary( - request.output_shape(), hlo_opcode, lhs, rhs)); + hlo_instruction = add_instruction(HloInstruction::CreateBinary( + request.output_shape(), hlo_opcode, lhs, rhs)); break; } case OpRequest::kTraceRequest: { const TraceRequest& trace_request = request.request().trace_request(); HloInstruction* operand = Visit(trace_request.operand(), visited); - hlo_instruction = hlo_builder_.AddInstruction( + hlo_instruction = add_instruction( HloInstruction::CreateTrace(trace_request.tag(), operand)); operand->set_tracing(hlo_instruction); break; @@ -2372,7 +2357,7 @@ HloInstruction* ComputationLowerer::Visit( case OpRequest::kSendRequest: { const SendRequest& send_request = request.request().send_request(); HloInstruction* operand = Visit(send_request.operand(), visited); - hlo_instruction = hlo_builder_.AddInstruction(HloInstruction::CreateSend( + hlo_instruction = add_instruction(HloInstruction::CreateSend( operand, send_request.channel_handle().handle())); break; } @@ -2383,7 +2368,6 @@ HloInstruction* ComputationLowerer::Visit( default: LOG(FATAL) << "Unexpected request type: " << request.request().op_case(); } - hlo_instruction->set_metadata(request.request().metadata()); (*visited)[handle.handle()] = hlo_instruction; return hlo_instruction; } diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc index 032b5cfac60..cf04cfde500 100644 --- a/tensorflow/compiler/xla/service/user_computation_test.cc +++ b/tensorflow/compiler/xla/service/user_computation_test.cc @@ -59,6 +59,9 @@ TEST_F(UserComputationTest, SimpleComputation) { param_request.set_name("param0"); TF_ASSIGN_OR_ASSERT_OK(ComputationDataHandle param_handle, computation.AddParameterInstruction(param_request)); + OpMetadata metadata; + metadata.set_op_name("meta"); + 
TF_ASSERT_OK(computation.SetOpMetadata(param_handle, metadata)); OutfeedRequest outfeed_request; *outfeed_request.mutable_operand() = constant_handle; @@ -135,6 +138,8 @@ TEST_F(UserComputationTest, SimpleComputation) { // The root of the instruction should be the parameter instruction (not the // outfeed). EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter()); + EXPECT_EQ(hlo_computation->root_instruction()->metadata().op_name(), + "meta"); } } From 27aaf4a653aa36e1c742fd859426f046ff6cf7af Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 May 2017 07:52:23 -0800 Subject: [PATCH 36/51] Made sure that both queue shapes and types are updated in one single pass instead of two. Change: 154960772 --- tensorflow/core/grappler/costs/graph_properties.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc index 75f2f16c5bb..035483ec179 100644 --- a/tensorflow/core/grappler/costs/graph_properties.cc +++ b/tensorflow/core/grappler/costs/graph_properties.cc @@ -84,7 +84,7 @@ Status GraphProperties::InferStatically() { } } } - if (qctx->set_output_handle_dtype(0, queue_type) || + if (qctx->set_output_handle_dtype(0, queue_type) | qctx->MergeOutputHandleShape(0, queue_shp)) { new_shapes.push(qnode); } From b93dd62e8ae9faed909c677781edc278632704f0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 3 May 2017 08:20:17 -0800 Subject: [PATCH 37/51] Move bucketized_column to core. Change: 154963963 --- .../python/feature_column/feature_column.py | 128 +++++++- .../feature_column/feature_column_test.py | 280 +++++++++++++++++- 2 files changed, 401 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 7d8a42080d7..e408506cb06 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -200,13 +200,13 @@ def make_linear_model(features, builder = _LazyBuilder(features) for column in sorted(feature_columns, key=lambda x: x.name): with variable_scope.variable_scope(None, default_name=column.name): - if isinstance(column, _DenseColumn): - weigthed_sums.append(_create_dense_column_weighted_sum( - column, builder, units, weight_collections, trainable)) - else: + if isinstance(column, _CategoricalColumn): weigthed_sums.append(_create_categorical_column_weighted_sum( column, builder, units, sparse_combiner, weight_collections, trainable)) + else: + weigthed_sums.append(_create_dense_column_weighted_sum( + column, builder, units, weight_collections, trainable)) predictions_no_bias = math_ops.add_n( weigthed_sums, name='weighted_sum_no_bias') bias = variable_scope.get_variable( @@ -237,7 +237,7 @@ def numeric_column(key, # or bucketized_price = bucketized_column(price, boundaries=[...]) all_feature_columns = [bucketized_price, ...] - linear_prediction, _, _ = make_linear_model(features, all_feature_columns) + linear_prediction = make_linear_model(features, all_feature_columns) ``` @@ -291,6 +291,55 @@ def numeric_column(key, normalizer_fn=normalizer_fn) +def bucketized_column(source_column, boundaries): + """Represents discretized dense input. + + Buckets include the left boundary, and exclude the right boundary. Namely, + `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`, + `[1., 2.)`, and `[2., +inf)`. 
+ + An example: + ```python + price = numeric_column('price') + bucketized_price = bucketized_column(price, boundaries=[...]) + all_feature_columns = [bucketized_price, ...] + linear_prediction = make_linear_model(features, all_feature_columns) + + # or + all_feature_columns = [bucketized_price, ...] + dense_tensor = make_input_layer(features, all_feature_columns) + ``` + + Args: + source_column: A one-dimensional dense column which is generated with + `numeric_column`. + boundaries: A sorted list or tuple of floats specifying the boundaries. + + Returns: + A `_BucketizedColumn`. + + Raises: + ValueError: If `source_column` is not a numeric column, or if it is not + one-dimensional. + ValueError: If `boundaries` is not a sorted list or tuple. + """ + if not isinstance(source_column, _NumericColumn): + raise ValueError( + 'source_column must be a column generated with numeric_column(). ' + 'Given: {}'.format(source_column)) + if len(source_column.shape) > 1: + raise ValueError( + 'source_column must be one-dimensional column. ' + 'Given: {}'.format(source_column)) + if (not boundaries or + not (isinstance(boundaries, list) or isinstance(boundaries, tuple))): + raise ValueError('boundaries must be a sorted list.') + for i in range(len(boundaries) - 1): + if boundaries[i] >= boundaries[i + 1]: + raise ValueError('boundaries must be a sorted list.') + return _BucketizedColumn(source_column, tuple(boundaries)) + + def categorical_column_with_hash_bucket(key, hash_bucket_size, dtype=dtypes.string): @@ -303,8 +352,8 @@ def categorical_column_with_hash_bucket(key, An example: ```python keywords = categorical_column_with_hash_bucket("keywords", 10K) - linear_prediction, _, _ = make_linear_model(features, all_feature_columns) all_feature_columns = [keywords, ...] + linear_prediction = make_linear_model(features, all_feature_columns) # or keywords_embedded = embedding_column(keywords, 16) @@ -668,6 +717,73 @@ class _NumericColumn(_DenseColumn, return inputs.get(self) +class _BucketizedColumn(_DenseColumn, _CategoricalColumn, + collections.namedtuple('_BucketizedColumn', [ + 'source_column', 'boundaries'])): + """See `bucketized_column`.""" + + @property + def name(self): + return '{}_bucketized'.format(self.source_column.name) + + @property + def _parse_example_config(self): + return self.source_column._parse_example_config # pylint: disable=protected-access + + def _transform_feature(self, inputs): + source_tensor = inputs.get(self.source_column) + return math_ops._bucketize( # pylint: disable=protected-access + source_tensor, + boundaries=self.boundaries) + + @property + def _variable_shape(self): + return tuple(self.source_column.shape) + (len(self.boundaries) + 1,) + + def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): + del weight_collections + del trainable + input_tensor = inputs.get(self) + return array_ops.one_hot( + indices=math_ops.to_int64(input_tensor), + depth=len(self.boundaries) + 1, + on_value=1., + off_value=0.) + + @property + def _num_buckets(self): + # By construction, source_column is always one-dimensional. + return (len(self.boundaries) + 1) * self.source_column.shape[0] + + def _get_sparse_tensors(self, inputs, weight_collections=None, + trainable=None): + input_tensor = inputs.get(self) + batch_size = array_ops.shape(input_tensor)[0] + # By construction, source_column is always one-dimensional. 
+ source_dimension = self.source_column.shape[0] + + i1 = array_ops.reshape( + array_ops.tile( + array_ops.expand_dims(math_ops.range(0, batch_size), 1), + [1, source_dimension]), + (-1,)) + i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size]) + # Flatten the bucket indices and unique them across dimensions + # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets + bucket_indices = ( + array_ops.reshape(input_tensor, (-1,)) + + (len(self.boundaries) + 1) * i2) + + indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2)))) + dense_shape = math_ops.to_int64(array_ops.stack( + [batch_size, source_dimension])) + sparse_tensor = sparse_tensor_lib.SparseTensor( + indices=indices, + values=bucket_indices, + dense_shape=dense_shape) + return _CategoricalColumn.IdWeightPair(sparse_tensor, None) + + def _create_tuple(shape, value): """Returns a tuple with given shape and filled with value.""" if shape: diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index eefe3b02978..32d6a4e8f0a 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -151,7 +151,7 @@ class LazyColumnTest(test.TestCase): builder.get(NotAFeatureColumn()) -class NumericalColumnTest(test.TestCase): +class NumericColumnTest(test.TestCase): def test_defaults(self): a = fc.numeric_column('aaa') @@ -327,6 +327,231 @@ class NumericalColumnTest(test.TestCase): self.assertAllClose([[10.], [50.]], predictions.eval()) +class BucketizedColumnTest(test.TestCase): + + def test_invalid_source_column_type(self): + a = fc.categorical_column_with_hash_bucket('aaa', hash_bucket_size=10) + with self.assertRaisesRegexp( + ValueError, + 'source_column must be a column generated with numeric_column'): + fc.bucketized_column(a, boundaries=[0, 1]) + + def test_invalid_source_column_shape(self): + a = fc.numeric_column('aaa', shape=[2, 3]) + with self.assertRaisesRegexp( + ValueError, 'source_column must be one-dimensional column'): + fc.bucketized_column(a, boundaries=[0, 1]) + + def test_invalid_boundaries(self): + a = fc.numeric_column('aaa') + with self.assertRaisesRegexp( + ValueError, 'boundaries must be a sorted list'): + fc.bucketized_column(a, boundaries=None) + with self.assertRaisesRegexp( + ValueError, 'boundaries must be a sorted list'): + fc.bucketized_column(a, boundaries=1.) + with self.assertRaisesRegexp( + ValueError, 'boundaries must be a sorted list'): + fc.bucketized_column(a, boundaries=[1, 0]) + with self.assertRaisesRegexp( + ValueError, 'boundaries must be a sorted list'): + fc.bucketized_column(a, boundaries=[1, 1]) + + def test_name(self): + a = fc.numeric_column('aaa', dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + self.assertEqual('aaa_bucketized', b.name) + + def test_parse_config(self): + a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + self.assertEqual({ + 'aaa': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32) + }, b._parse_example_config) + + def test_variable_shape(self): + a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + # Column 'aaa` has shape [2] times three buckets -> variable_shape=[2, 3]. 
+ self.assertAllEqual((2, 3), b._variable_shape) + + def test_num_buckets(self): + a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32) + b = fc.bucketized_column(a, boundaries=[0, 1]) + # Column 'aaa` has shape [2] times three buckets -> num_buckets=6. + self.assertEqual(6, b._num_buckets) + + def test_parse_example(self): + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 50]) + data = example_pb2.Example(features=feature_pb2.Features( + feature={ + 'price': + feature_pb2.Feature(float_list=feature_pb2.FloatList( + value=[20., 110.])) + })) + features = parsing_ops.parse_example( + serialized=[data.SerializeToString()], + features=bucketized_price._parse_example_config) + self.assertIn('price', features) + with self.test_session(): + self.assertAllEqual([[20., 110.]], features['price'].eval()) + + def test_transform_feature(self): + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + builder = fc._LazyBuilder({ + 'price': constant_op.constant([[-1., 1.], [5., 6.]]) + }) + transformed_tensor = builder.get(bucketized_price) + with _initialized_session(): + self.assertAllEqual([[0, 1], [3, 4]], transformed_tensor.eval()) + + def test_get_dense_tensor_one_input_value(self): + """Tests _get_dense_tensor() for input with shape=[1].""" + price = fc.numeric_column('price', shape=[1]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + builder = fc._LazyBuilder({ + 'price': constant_op.constant([[-1.], [1.], [5.], [6.]]) + }) + with _initialized_session(): + bucketized_price_tensor = bucketized_price._get_dense_tensor(builder) + self.assertAllClose( + # One-hot tensor. + [[[1., 0., 0., 0., 0.]], + [[0., 1., 0., 0., 0.]], + [[0., 0., 0., 1., 0.]], + [[0., 0., 0., 0., 1.]]], + bucketized_price_tensor.eval()) + + def test_get_dense_tensor_two_input_values(self): + """Tests _get_dense_tensor() for input with shape=[2].""" + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + builder = fc._LazyBuilder({ + 'price': constant_op.constant([[-1., 1.], [5., 6.]]) + }) + with _initialized_session(): + bucketized_price_tensor = bucketized_price._get_dense_tensor(builder) + self.assertAllClose( + # One-hot tensor. 
+ [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]], + [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]], + bucketized_price_tensor.eval()) + + def test_get_sparse_tensors_one_input_value(self): + """Tests _get_sparse_tensors() for input with shape=[1].""" + price = fc.numeric_column('price', shape=[1]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + builder = fc._LazyBuilder({ + 'price': constant_op.constant([[-1.], [1.], [5.], [6.]]) + }) + with _initialized_session() as sess: + id_weight_pair = bucketized_price._get_sparse_tensors(builder) + self.assertIsNone(id_weight_pair.weight_tensor) + id_tensor_value = sess.run(id_weight_pair.id_tensor) + self.assertAllEqual( + [[0, 0], [1, 0], [2, 0], [3, 0]], id_tensor_value.indices) + self.assertAllEqual([0, 1, 3, 4], id_tensor_value.values) + self.assertAllEqual([4, 1], id_tensor_value.dense_shape) + + def test_get_sparse_tensors_two_input_values(self): + """Tests _get_sparse_tensors() for input with shape=[2].""" + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + builder = fc._LazyBuilder({ + 'price': constant_op.constant([[-1., 1.], [5., 6.]]) + }) + with _initialized_session() as sess: + id_weight_pair = bucketized_price._get_sparse_tensors(builder) + self.assertIsNone(id_weight_pair.weight_tensor) + id_tensor_value = sess.run(id_weight_pair.id_tensor) + self.assertAllEqual( + [[0, 0], [0, 1], [1, 0], [1, 1]], id_tensor_value.indices) + # Values 0-4 correspond to the first column of the input price. + # Values 5-9 correspond to the second column of the input price. + self.assertAllEqual([0, 6, 3, 9], id_tensor_value.values) + self.assertAllEqual([2, 2], id_tensor_value.dense_shape) + + def test_sparse_tensor_input_not_supported(self): + price = fc.numeric_column('price') + bucketized_price = fc.bucketized_column(price, boundaries=[0, 1]) + builder = fc._LazyBuilder({ + 'price': + sparse_tensor.SparseTensor( + indices=[[0, 0]], values=[0.3], dense_shape=[1, 1]) + }) + with self.assertRaisesRegexp(ValueError, 'must be a Tensor'): + bucketized_price._transform_feature(builder) + + def test_deep_copy(self): + a = fc.numeric_column('aaa', shape=[2]) + a_bucketized = fc.bucketized_column(a, boundaries=[0, 1]) + a_bucketized_copy = copy.deepcopy(a_bucketized) + self.assertEqual(a_bucketized_copy.name, 'aaa_bucketized') + self.assertAllEqual(a_bucketized_copy._variable_shape, (2, 3)) + self.assertEqual(a_bucketized_copy.boundaries, (0, 1)) + + def test_make_linear_model_one_input_value(self): + """Tests make_linear_model() for input with shape=[1].""" + price = fc.numeric_column('price', shape=[1]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + features = {'price': constant_op.constant([[-1.], [1.], [5.], [6.]])} + predictions = fc.make_linear_model(features, [bucketized_price]) + bias = get_linear_model_bias() + bucketized_price_var = get_linear_model_column_var(bucketized_price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + # One weight variable per bucket, all initialized to zero. + self.assertAllClose( + [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval()) + self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval()) + sess.run(bucketized_price_var.assign( + [[10.], [20.], [30.], [40.], [50.]])) + # price -1. is in the 0th bucket, whose weight is 10. + # price 1. 
is in the 1st bucket, whose weight is 20. + # price 5. is in the 3rd bucket, whose weight is 40. + # price 6. is in the 4th bucket, whose weight is 50. + self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval()) + sess.run(bias.assign([1.])) + self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval()) + + def test_make_linear_model_two_input_values(self): + """Tests make_linear_model() for input with shape=[2].""" + price = fc.numeric_column('price', shape=[2]) + bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) + with ops.Graph().as_default(): + features = {'price': constant_op.constant([[-1., 1.], [5., 6.]])} + predictions = fc.make_linear_model(features, [bucketized_price]) + bias = get_linear_model_bias() + bucketized_price_var = get_linear_model_column_var(bucketized_price) + with _initialized_session() as sess: + self.assertAllClose([0.], bias.eval()) + # One weight per bucket per input column, all initialized to zero. + self.assertAllClose( + [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]], + bucketized_price_var.eval()) + self.assertAllClose([[0.], [0.]], predictions.eval()) + sess.run(bucketized_price_var.assign( + [[10.], [20.], [30.], [40.], [50.], + [60.], [70.], [80.], [90.], [100.]])) + # 1st example: + # price -1. is in the 0th bucket, whose weight is 10. + # price 1. is in the 6th bucket, whose weight is 70. + # 2nd example: + # price 5. is in the 3rd bucket, whose weight is 40. + # price 6. is in the 9th bucket, whose weight is 100. + self.assertAllClose([[80.], [140.]], predictions.eval()) + sess.run(bias.assign([1.])) + self.assertAllClose([[81.], [141.]], predictions.eval()) + + class SparseColumnHashedTest(test.TestCase): def test_defaults(self): @@ -567,6 +792,59 @@ class MakeLinearModelTest(test.TestCase): sess.run(price_var.assign([[10.]])) self.assertAllClose([[1015.], [10065.]], predictions.eval()) + def test_dense_and_sparse_column(self): + """When the column is both dense and sparse, uses sparse tensors.""" + + class _DenseAndSparseColumn(fc._DenseColumn, fc._CategoricalColumn): + + @property + def name(self): + return 'dense_and_sparse_column' + + @property + def _parse_example_config(self): + return {self.name: parsing_ops.VarLenFeature(self.dtype)} + + def _transform_feature(self, inputs): + return inputs.get(self.name) + + @property + def _variable_shape(self): + raise ValueError('Should not use this method.') + + def _get_dense_tensor(self, inputs, weight_collections=None, + trainable=None): + raise ValueError('Should not use this method.') + + @property + def _num_buckets(self): + return 4 + + def _get_sparse_tensors(self, inputs, weight_collections=None, + trainable=None): + sp_tensor = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 0], [1, 1]], + values=[2, 0, 3], + dense_shape=[2, 2]) + return fc._CategoricalColumn.IdWeightPair(sp_tensor, None) + + dense_and_sparse_column = _DenseAndSparseColumn() + with ops.Graph().as_default(): + sp_tensor = sparse_tensor.SparseTensor( + values=['omar', 'stringer', 'marlo'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + features = {dense_and_sparse_column.name: sp_tensor} + predictions = fc.make_linear_model(features, [dense_and_sparse_column]) + bias = get_linear_model_bias() + dense_and_sparse_column_var = get_linear_model_column_var( + dense_and_sparse_column) + with _initialized_session() as sess: + sess.run(dense_and_sparse_column_var.assign( + [[10.], [100.], [1000.], [10000.]])) + sess.run(bias.assign([5.])) + 
self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
   def test_dense_multi_output(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():

From 00bfa9069e2f8456e26e09d6a7a1d0dff807e5bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Wed, 3 May 2017 08:52:26 -0800
Subject: [PATCH 38/51] Fixed a bug where the show_type and show_summary flags
 overrode the value of show_time

Change: 154967526
---
 tensorflow/tools/benchmark/benchmark_model.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index c2e41e49187..8c480f8d9db 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -334,8 +334,8 @@ int Main(int argc, char** argv) {
       Flag("show_memory", &show_memory, "whether to list stats by memory used"),
       Flag("memory_limit", &memory_limit,
            "how many items to show by memory used"),
-      Flag("show_type", &show_time, "whether to list stats by op type"),
-      Flag("show_summary", &show_time,
+      Flag("show_type", &show_type, "whether to list stats by op type"),
+      Flag("show_summary", &show_summary,
            "whether to show a summary of the stats"),
       Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
       Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),

From 834a2e27a9c31b43c8dad631c555676555f63f62 Mon Sep 17 00:00:00 2001
From: Toby Boyd
Date: Wed, 3 May 2017 08:59:36 -0800
Subject: [PATCH 39/51] Adding tf_cnn_benchmarks.py examples

Change: 154968353
---
 tensorflow/docs_src/performance/benchmarks.md | 146 ++++++------------
 .../performance/performance_models.md         |  93 ++++++++++-
 2 files changed, 141 insertions(+), 98 deletions(-)

diff --git a/tensorflow/docs_src/performance/benchmarks.md b/tensorflow/docs_src/performance/benchmarks.md
index bfb47d9f908..19d37794ab8 100644
--- a/tensorflow/docs_src/performance/benchmarks.md
+++ b/tensorflow/docs_src/performance/benchmarks.md
@@ -3,9 +3,9 @@
 ## Overview
 
 A selection of image classification models were tested across multiple platforms
-to create a point of reference for the TensorFlow community. The methodology,
-links to the benchmark scripts, and commands to reproduce the results are in the
-[Appendix](#appendix).
+to create a point of reference for the TensorFlow community. The
+[Methodology](#methodology) section details how the tests were executed and has
+links to the scripts used.
## Results for image classification models @@ -120,19 +120,19 @@ VGG16 | replicated (with NCCL) | n/a GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ---- | ----------- | --------- | ---------- | ------- | ----- -1 | 142 | 238 | 95.6 | 2987 | 132 -2 | 284 | 479 | 187 | 5658 | 259 -4 | 569 | 948 | 374 | 10509 | 511 -8 | 1131 | 1886 | 744 | 17822 | 959 + 1 | 142 | 238 | 95.6 | 2987 | 154 + 2 | 284 | 479 | 187 | 5658 | 295 + 4 | 569 | 948 | 374 | 10509 | 584 + 8 | 1131 | 1886 | 744 | 17822 | 1081 **Training real data** GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ---- | ----------- | --------- | ---------- | ------- | ----- -1 | 142 | 239 | 95.5 | 2890 | 132 -2 | 278 | 468 | 187 | 4448 | 245 -4 | 551 | 938 | 373 | 7105 | 466 -8 | 1079 | 1802 | 721 | N/A | 794 + 1 | 142 | 239 | 95.5 | 2890 | 154 + 2 | 278 | 468 | 187 | 4448 | 284 + 4 | 551 | 938 | 373 | 7105 | 534 + 8 | 1079 | 1802 | 721 | N/A | 898 Training AlexNet with real data on 8 GPUs was excluded from the graph and table above due to it maxing out the input pipeline. @@ -145,19 +145,19 @@ The results below are all with a batch size of 32. GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16 ---- | ----------- | --------- | ---------- | ----- -1 | 128 | 210 | 85.3 | 124 -2 | 259 | 412 | 166 | 241 -4 | 520 | 827 | 330 | 470 -8 | 995 | 1623 | 643 | 738 + 1 | 128 | 210 | 85.3 | 144 + 2 | 259 | 412 | 166 | 281 + 4 | 520 | 827 | 330 | 549 + 8 | 995 | 1623 | 643 | 820 **Training real data** GPUs | InceptionV3 | ResNet-50 | ResNet-152 | VGG16 ---- | ----------- | --------- | ---------- | ----- -1 | 130 | 208 | 85.0 | 124 -2 | 257 | 403 | 163 | 221 -4 | 507 | 814 | 325 | 401 -8 | 966 | 1525 | 641 | 619 + 1 | 130 | 208 | 85.0 | 144 + 2 | 257 | 403 | 163 | 253 + 4 | 507 | 814 | 325 | 457 + 8 | 966 | 1525 | 641 | 690 ## Details for Google Compute Engine (NVIDIA® Tesla® K80) @@ -198,19 +198,19 @@ The configuration used for each model was `variable_update` equal to GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ---- | ----------- | --------- | ---------- | ------- | ----- -1 | 30.5 | 56.8 | 20.8 | 656 | 30.3 -2 | 57.8 | 107 | 39.1 | 1210 | 56.2 -4 | 116 | 212 | 77.2 | 2330 | 106 -8 | 227 | 419 | 151 | 4640 | 222 + 1 | 30.5 | 56.8 | 20.8 | 656 | 35.4 + 2 | 57.8 | 107 | 39.1 | 1209 | 64.8 + 4 | 116 | 212 | 77.2 | 2328 | 120 + 8 | 227 | 419 | 151 | 4640 | 234 **Training real data** GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ---- | ----------- | --------- | ---------- | ------- | ----- - 1 | 30.6 | 56.7 | 20.7 | 639 | 30.2 - 2 | 58.4 | 107 | 39.0 | 1136 | 55.5 - 4 | 115 | 211 | 77.3 | 2067 | 106 - 8 | 225 | 422 | 151 | 4056 | 213 + 1 | 30.6 | 56.7 | 20.7 | 639 | 34.2 + 2 | 58.4 | 107 | 39.0 | 1136 | 62.9 + 4 | 115 | 211 | 77.3 | 2067 | 118 + 8 | 225 | 422 | 151 | 4056 | 230 ### Other Results @@ -227,10 +227,10 @@ GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) ---- | --------------------------- | ------------------------- - 1 | 29.5 | 53.6 - 2 | 55.4 | 102 - 4 | 110 | 201 - 8 | 216 | 387 + 1 | 29.5 | 53.6 + 2 | 55.4 | 102 + 4 | 110 | 201 + 8 | 216 | 387 ## Details for Amazon EC2 (NVIDIA® Tesla® K80) @@ -279,19 +279,19 @@ VGG16 | parameter_server | gpu GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ---- | ----------- | --------- | ---------- | ------- | ----- -1 | 30.8 | 56.3 | 20.9 | 684 | 32.4 -2 | 58.7 | 108 | 39.3 | 1244 | 61.5 -4 | 117 | 217 | 79.1 | 2479 | 123 -8 | 230 | 419 | 156 | 4853 | 234 + 
1 | 30.8 | 56.3 | 20.9 | 684 | 36.3 + 2 | 58.7 | 108 | 39.3 | 1244 | 69.4 + 4 | 117 | 217 | 79.1 | 2479 | 141 + 8 | 230 | 419 | 156 | 4853 | 260 **Training real data** GPUs | InceptionV3 | ResNet-50 | ResNet-152 | Alexnet | VGG16 ---- | ----------- | --------- | ---------- | ------- | ----- -1 | 30.5 | 56.0 | 20.6 | 674 | 32.0 -2 | 58.7 | 107 | 39.0 | 1227 | 61.0 -4 | 118 | 205 | 77.9 | 2201 | 120 -8 | 228 | 405 | 152 | N/A | 191 + 1 | 30.5 | 56.0 | 20.6 | 674 | 36.3 + 2 | 59.0 | 107 | 39.0 | 1227 | 67.5 + 4 | 118 | 205 | 77.9 | 2201 | 136 + 8 | 228 | 405 | 152 | N/A | 242 Training AlexNet with real data on 8 GPUs was excluded from the graph and table above due to our EFS setup not providing enough throughput. @@ -393,63 +393,17 @@ GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32) 32 | 820 | 1265 64 | 1608 | 2623 -## Appendix -### Executing benchmark tests +## Methodology -The [benchmark code](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks) -was created to be used for benchmarking TensorFlow as well as used as a tool to -test hardware platforms. Techniques used in the benchmark scripts are detailed -in @{$performance_models$High-Performance Models}. +This [script](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks) +was run on the various platforms to generate the above results. +@{$performance_models$High-Performance Models} details techniques in the script +along with examples of how to execute the script. -There are two ways to execute the benchmark code: - -1. Execute [tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py) - directly. -2. Utilize the [scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/main.py) - that helps pick the correct config for each platform executes - `tf_cnn_benchmarks.py`. - -The wrapper is suggested as a starting point. Then investigate the variety of -options available in `tf_cnn_benchmarks.py`. Below are a couple examples of -using the wrapper. - -**Single Server** -This example illustrates training ResNet-50 on a single instance with 8 GPUs. -The `system` flag is used to determine the optimal configuration. The -supported values are gce, aws, and dgx1. If `system` is not passed, the best -config for the most widely available hardware is used. - -```bash -python main.py --model=resnet50 --num_gpus=8 -python main.py --system=aws --model=resnet50 --num_gpus=8 -``` - -**Distributed** -This example illustrates training ResNet-50 on 2 hosts, e.g. host_0 (10.0.0.1) -and host_1 (10.0.0.2), with 8 GPUs each on AWS (Amazon EC2). - -```bash -# Run the following commands on host_0 (10.0.0.1): - $ python main.py --system=aws --model=resnet50 --job_name=worker - --hosts=10.0.0.1,10.0.0.2 --task_index=0 - - $ python main.py --system=aws --model=resnet50 --job_name=ps - --hosts=10.0.0.1,10.0.0.2 --task_index=0 - -# Run the following commands on host_1 (10.0.0.2): - $ python main.py --system=aws --model=resnet50 --job_name=worker - --hosts=10.0.0.1,10.0.0.2 --task_index=1 - - $ python main.py --system=aws --model=resnet50 --job_name=ps - --hosts=10.0.0.1,10.0.0.2 --task_index=1 -``` - -### Methodology - -Unless otherwise stated, each test is run 5 times and then the times are -averaged together. GPUs are run in their default state on the given platform. 
-For NVIDIA® Tesla® K80 this means leaving on [GPU
-Boost](https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/)
-unless it has been turned off by the provider. For a given test, 10 warmup steps
-are done and then the next 100 steps are averaged.
+In order to create results that are as repeatable as possible, each test was run
+5 times and then the times were averaged together. GPUs are run in their default
+state on the given platform. For NVIDIA® Tesla® K80 this means leaving on [GPU
+Boost](https://devblogs.nvidia.com/parallelforall/increase-performance-gpu-boost-k80-autoboost/).
+For each test, 10 warmup steps are done and then the next 100 steps are
+averaged.

diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md
index 70c415a024e..027ecb195ed 100644
--- a/tensorflow/docs_src/performance/performance_models.md
+++ b/tensorflow/docs_src/performance/performance_models.md
@@ -14,8 +14,8 @@ input pipeline issues and best practices.
 
 We found that using @{tf.FIFOQueue} and @{tf.train.queue_runner} could not
 saturate multiple current generation GPUs when using large inputs and processing
 with higher samples per second, such as training ImageNet with [AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
-This is due to the the use of Python threads as its underlying implementation.
-The overhead of Python threads is too large.
+This is due to the use of Python threads as its underlying implementation. The
+overhead of Python threads is too large.
 
 Another approach, which we have implemented in the
 [scripts](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks),
@@ -327,3 +327,92 @@ free. The downside is that all the weights read are from the previous training
 step. So it is a different algorithm from SGD. But it is possible to improve its
 convergence by adjusting learning rate and other hyperparameters.
+
+### Executing the script
+
+This section lists the core command line arguments and a few basic examples for
+executing the main script
+([tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py)).
+
+#### Base command line arguments
+
+* **`model`**: Model to use, e.g. `resnet50`, `inception3`, `vgg16`, and
+  `alexnet`.
+* **`num_gpus`**: Number of GPUs to use.
+* **`data_dir`**: Path to data to process. If not set, synthetic data is used.
+  To use ImageNet data, use these
+  [instructions](https://github.com/tensorflow/models/tree/master/inception#getting-started)
+  as a starting point.
+* **`batch_size`**: Batch size for each GPU.
+* **`variable_update`**: The method for managing variables: `parameter_server`,
+  `replicated`, `distributed_replicated`, `independent`.
+* **`local_parameter_device`**: Device to use as parameter server: `cpu` or
+  `gpu`.
+
+#### Single instance examples
+
+```bash
+# VGG16 training ImageNet with 8 GPUs using arguments that optimize for
+# Google Compute Engine.
+python tf_cnn_benchmarks.py --local_parameter_device=cpu --num_gpus=8 \
+--batch_size=32 --model=vgg16 --data_dir=/home/ubuntu/imagenet/train \
+--variable_update=parameter_server --nodistortions
+
+# VGG16 training synthetic ImageNet data with 8 GPUs using arguments that
+# optimize for the NVIDIA DGX-1.
+python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ +--batch_size=64 --model=vgg16 --variable_update=replicated --use_nccl=True + +# VGG16 training ImageNet data with 8 GPUs using arguments that optimize for +# Amazon EC2. +python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ +--batch_size=64 --model=vgg16 --variable_update=parameter_server + +# ResNet-50 training ImageNet data with 8 GPUs using arguments that optimize for +# Amazon EC2. +python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ +--batch_size=64 --model=resnet50 --variable_update=replicated --use_nccl=False + +``` + +#### Distributed command line arguments + +* **`ps_hosts`**: Comma separated list of hosts to use as parameter servers + in the format of ```:port```, e.g. ```10.0.0.2:50000```. +* **`worker_hosts`**: Comma separated list of hosts to use as workers in the + format of ```:port```, e.g. ```10.0.0.2:50001```. +* **`task_index`**: Index of the host in the list of `ps_hosts` or + `worker_hosts` being started. +* **`job_name`**: Type of job, e.g `ps` or `worker` + +#### Distributed examples + +Below is an example of training ResNet-50 on 2 hosts: host_0 (10.0.0.1) and +host_1 (10.0.0.2). The example uses synthetic data. To use real data pass the +`--data_dir` argument. + +```bash +# Run the following commands on host_0 (10.0.0.1): +python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ +--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \ +--job_name=worker --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \ +--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=0 + +python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ +--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \ +--job_name=ps --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \ +--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=0 + + +# Run the following commands on host_1 (10.0.0.2): +python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ +--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \ +--job_name=worker --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \ +--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=1 + +python tf_cnn_benchmarks.py --local_parameter_device=gpu --num_gpus=8 \ +--batch_size=64 --model=resnet50 --variable_update=distributed_replicated \ +--job_name=ps --ps_hosts=10.0.0.1:50000,10.0.0.2:50000 \ +--worker_hosts=10.0.0.1:50001,10.0.0.2:50001 --task_index=1 + +``` From 7bb4cd35457fd7c3fdcb53a92495073aa9adcd5e Mon Sep 17 00:00:00 2001 From: Ian Langmore Date: Wed, 3 May 2017 09:05:41 -0800 Subject: [PATCH 40/51] 1. Default implementation of determinant, log_determinant added. Now the only methods lacking a default are the assert_* (this will be discussed at design review), and the abstractmethods shape/apply/... 2. is_square hint added to all operators and used. 3. errant reference to non-existent "log_determinant" method changed to "log_abs_determinant". 
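The default in (1) uses the identity that for a symmetric positive definite
`A = L L^T`, `det(A) = prod(diag(L))**2`, so only the diagonal of the Cholesky
factor is needed. A minimal numpy sketch of that identity (illustrative only,
not the operator code):

```python
import numpy as np

def log_abs_det_spd(matrix):
  # log|det(A)| for symmetric positive definite A, via the Cholesky
  # factor L: det(A) = det(L)**2 = prod(diag(L))**2.
  chol = np.linalg.cholesky(matrix)
  return 2.0 * np.sum(np.log(np.diag(chol)))

a = np.array([[4.0, 1.0],
              [1.0, 3.0]])  # symmetric positive definite
assert np.isclose(log_abs_det_spd(a), np.log(abs(np.linalg.det(a))))
```

When the Cholesky path does not apply, the new default falls back to a dense
determinant.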
Change: 154969228 --- .../linear_operator_composition_test.py | 5 +- .../linear_operator_full_matrix_test.py | 19 ++-- .../kernel_tests/linear_operator_test.py | 3 + .../linalg/python/ops/linear_operator.py | 95 +++++++++++++------ .../python/ops/linear_operator_composition.py | 7 +- .../linalg/python/ops/linear_operator_diag.py | 11 ++- .../python/ops/linear_operator_full_matrix.py | 34 +------ .../python/ops/linear_operator_identity.py | 19 +++- .../linalg/python/ops/linear_operator_tril.py | 13 ++- 9 files changed, 130 insertions(+), 76 deletions(-) diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py index 998073e28bd..0585a0ba5a9 100644 --- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py +++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_composition_test.py @@ -65,12 +65,15 @@ class SquareLinearOperatorCompositionTest( # feed_dict. matrices = sess.run(matrices) operator = linalg.LinearOperatorComposition( - [linalg.LinearOperatorFullMatrix(m_ph) for m_ph in matrices_ph]) + [linalg.LinearOperatorFullMatrix(m_ph) for m_ph in matrices_ph], + is_square=True) feed_dict = {m_ph: m for (m_ph, m) in zip(matrices_ph, matrices)} else: operator = linalg.LinearOperatorComposition( [linalg.LinearOperatorFullMatrix(m) for m in matrices]) feed_dict = None + # Should be auto-set. + self.assertTrue(operator.is_square) # Convert back to Tensor. Needed if use_placeholder, since then we have # already evaluated each matrix to a numpy array. diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py index d4a9e97ce7a..12c299683aa 100644 --- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py +++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_full_matrix_test.py @@ -45,9 +45,10 @@ class SquareLinearOperatorFullMatrixTest( # values are random and we want the same value used for both mat and # feed_dict. matrix = matrix.eval() - operator = linalg.LinearOperatorFullMatrix(matrix_ph) + operator = linalg.LinearOperatorFullMatrix(matrix_ph, is_square=True) feed_dict = {matrix_ph: matrix} else: + # is_square should be auto-detected here. operator = linalg.LinearOperatorFullMatrix(matrix) feed_dict = None @@ -68,6 +69,8 @@ class SquareLinearOperatorFullMatrixTest( self.assertTrue(operator.is_positive_definite) self.assertTrue(operator.is_non_singular) self.assertFalse(operator.is_self_adjoint) + # Auto-detected. + self.assertTrue(operator.is_square) class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest( @@ -104,6 +107,7 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest( # values are random and we want the same value used for both mat and # feed_dict. matrix = matrix.eval() + # is_square is auto-set because of self_adjoint/pd. 
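+      # Per the hint rules in LinearOperator.__init__: is_positive_definite
+      # implies is_non_singular, and a non-singular or self-adjoint hint in
+      # turn implies is_square, so is_square need not be passed explicitly.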
operator = linalg.LinearOperatorFullMatrix( matrix_ph, is_self_adjoint=True, is_positive_definite=True) feed_dict = {matrix_ph: matrix} @@ -129,7 +133,8 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest( # Should be auto-set self.assertTrue(operator.is_non_singular) - self.assertTrue(operator._is_spd) + self.assertTrue(operator._can_use_cholesky) + self.assertTrue(operator.is_square) class NonSquareLinearOperatorFullMatrixTest( @@ -157,16 +162,14 @@ class NonSquareLinearOperatorFullMatrixTest( return operator, mat, feed_dict def test_is_x_flags(self): - # Matrix with two positive eigenvalues. - matrix = [[3., 0.], [1., 1.]] + matrix = [[3., 2., 1.], [1., 1., 1.]] operator = linalg.LinearOperatorFullMatrix( matrix, - is_positive_definite=True, - is_non_singular=True, is_self_adjoint=False) - self.assertTrue(operator.is_positive_definite) - self.assertTrue(operator.is_non_singular) + self.assertEqual(operator.is_positive_definite, None) + self.assertEqual(operator.is_non_singular, None) self.assertFalse(operator.is_self_adjoint) + self.assertFalse(operator.is_square) def test_matrix_must_have_at_least_two_dims_or_raises(self): with self.assertRaisesRegexp(ValueError, "at least 2 dimensions"): diff --git a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py index c5bfc6e1fd5..d24388fce32 100644 --- a/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py +++ b/tensorflow/contrib/linalg/python/kernel_tests/linear_operator_test.py @@ -54,6 +54,9 @@ class LinearOperatorShape(linalg.LinearOperator): def _shape_tensor(self): return constant_op.constant(self._stored_shape, dtype=dtypes.int32) + def _apply(self): + raise NotImplementedError("Not needed for this test.") + class LinearOperatorApplyOnly(linalg.LinearOperator): """LinearOperator that simply wraps a [batch] matrix and implements apply.""" diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator.py b/tensorflow/contrib/linalg/python/ops/linear_operator.py index 454411d93cf..8d0a1d7de20 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator.py +++ b/tensorflow/contrib/linalg/python/ops/linear_operator.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import abc import contextlib from tensorflow.contrib import framework as contrib_framework @@ -25,6 +26,7 @@ from tensorflow.contrib.linalg.python.ops import linear_operator_util from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops __all__ = ["LinearOperator"] @@ -50,11 +52,9 @@ class LinearOperator(object): #### Performance contract - Subclasses should implement a method only if it can be done with a reasonable - performance increase over generic dense operations, either in time, parallel - scalability, or memory usage. For example, if the determinant can only be - computed using `tf.matrix_determinant(self.to_dense())`, then determinants - should not be implemented. + Subclasses should only implement the assert methods + (e.g. `assert_non_singular`) if they can be done in less than `O(N^3)` + time. Class docstrings should contain an explanation of computational complexity. 
Since this is a high-performance library, attention should be paid to detail, @@ -100,7 +100,7 @@ class LinearOperator(object): operator.shape() ==> [2, 4, 4] - operator.log_determinant() + operator.log_abs_determinant() ==> Shape [2] Tensor x = ... Shape [2, 4, 5] Tensor @@ -131,6 +131,7 @@ class LinearOperator(object): * If `is_X == None` (the default), callers should have no expectation either way. """ + __metaclass__ = abc.ABCMeta def __init__(self, dtype, @@ -167,17 +168,23 @@ class LinearOperator(object): ValueError: If hints are set incorrectly. """ # Check and auto-set flags. - if is_square is False: - if is_non_singular or is_positive_definite: - raise ValueError( - "A non-singular or positive definite operator is always square.") - self._is_square_set_by_user = is_square - if is_positive_definite: if is_non_singular is False: raise ValueError("A positive definite matrix is always non-singular.") is_non_singular = True + if is_non_singular: + if is_square is False: + raise ValueError("A non-singular matrix is always square.") + is_square = True + + if is_self_adjoint: + if is_square is False: + raise ValueError("A self-adjoint matrix is always square.") + is_square = True + + self._is_square_set_or_implied_by_hints = is_square + graph_parents = [] if graph_parents is None else graph_parents for i, t in enumerate(graph_parents): if t is None or not contrib_framework.is_tensor(t): @@ -239,15 +246,16 @@ class LinearOperator(object): """Return `True/False` depending on if this operator is square.""" # Static checks done after __init__. Why? Because domain/range dimension # sometimes requires lots of work done in the derived class after init. - static_square_check = self.domain_dimension == self.range_dimension - if self._is_square_set_by_user is False and static_square_check: + auto_square_check = self.domain_dimension == self.range_dimension + if self._is_square_set_or_implied_by_hints is False and auto_square_check: raise ValueError( "User set is_square hint to False, but the operator was square.") - if self._is_square_set_by_user is None: - return static_square_check + if self._is_square_set_or_implied_by_hints is None: + return auto_square_check - return self._is_square_set_by_user + return self._is_square_set_or_implied_by_hints + @abc.abstractmethod def _shape(self): # Write this in derived class to enable all static shape methods. raise NotImplementedError("_shape is not implemented.") @@ -265,6 +273,7 @@ class LinearOperator(object): """ return self._shape() + @abc.abstractmethod def _shape_tensor(self): raise NotImplementedError("_shape_tensor is not implemented.") @@ -367,8 +376,7 @@ class LinearOperator(object): self._cached_tensor_rank_tensor = ops.convert_to_tensor( self.tensor_rank) else: - self._cached_tensor_rank_tensor = array_ops.size( - self.shape_tensor()) + self._cached_tensor_rank_tensor = array_ops.size(self.shape_tensor()) return self._cached_tensor_rank_tensor @property @@ -486,9 +494,10 @@ class LinearOperator(object): """Check that arg.dtype == self.dtype.""" if arg.dtype != self.dtype: raise TypeError( - "Expected argument to have dtype %s. Found: %s in tensor %s" - % (self.dtype, arg.dtype, arg)) + "Expected argument to have dtype %s. 
Found: %s in tensor %s" % + (self.dtype, arg.dtype, arg)) + @abc.abstractmethod def _apply(self, x, adjoint=False, adjoint_arg=False): raise NotImplementedError("_apply is not implemented.") @@ -517,7 +526,9 @@ class LinearOperator(object): return self._apply(x, adjoint=adjoint, adjoint_arg=adjoint_arg) def _determinant(self): - raise NotImplementedError("_det is not implemented.") + if self._can_use_cholesky(): + return math_ops.exp(self.log_abs_determinant()) + return linalg_ops.matrix_determinant(self._matrix) def determinant(self, name="det"): """Determinant for every batch member. @@ -539,7 +550,11 @@ class LinearOperator(object): return self._determinant() def _log_abs_determinant(self): - raise NotImplementedError("_log_abs_det is not implemented.") + if self._can_use_cholesky(): + diag = array_ops.matrix_diag_part(self._get_cached_chol()) + return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1]) + abs_det = math_ops.abs(self.determinant()) + return math_ops.log(abs_det) def log_abs_determinant(self, name="log_abs_det"): """Log absolute value of determinant for every batch member. @@ -561,13 +576,20 @@ class LinearOperator(object): return self._log_abs_determinant() def _solve(self, rhs, adjoint=False, adjoint_arg=False): - # Since this is an exact solve method for all rhs, this will only be - # available for non-singular (batch) operators, in particular the operator - # must be square. - raise NotImplementedError("_solve is not implemented.") + if self.is_square is False: + raise NotImplementedError( + "Solve is not yet implemented for non-square operators.") + rhs = linear_operator_util.matrix_adjoint(rhs) if adjoint_arg else rhs + if self._can_use_cholesky(): + return linalg_ops.cholesky_solve(self._get_cached_chol(), rhs) + return linalg_ops.matrix_solve( + self._get_cached_dense_matrix(), rhs, adjoint=adjoint) def solve(self, rhs, adjoint=False, adjoint_arg=False, name="solve"): - """Solve `R` (batch) systems of equations exactly: `A X = rhs`. + """Solve `R` (batch) systems of equations with best effort: `A X = rhs`. + + The solution may not be exact, and in this case it will be close in some + sense (see class docstring for details). Examples: @@ -689,3 +711,20 @@ class LinearOperator(object): x = ops.convert_to_tensor(x, name="x") self._check_input_dtype(x) return self._add_to_tensor(x) + + def _can_use_cholesky(self): + # TODO(langmore) Add complex types when tf.cholesky can use them. + return (not self.dtype.is_complex and self.is_self_adjoint and + self.is_positive_definite) + + def _get_cached_dense_matrix(self): + if not hasattr(self, "_cached_dense_matrix"): + self._cached_dense_matrix = self.to_dense() + return self._cached_dense_matrix + + def _get_cached_chol(self): + if not self._can_use_cholesky(): + return None + if not hasattr(self, "_cached_chol"): + self._cached_chol = linalg_ops.cholesky(self._get_cached_dense_matrix()) + return self._cached_chol diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py index b1557769b22..550c630497c 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py +++ b/tensorflow/contrib/linalg/python/ops/linear_operator_composition.py @@ -63,7 +63,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator): operator.shape ==> [2, 2] - operator.log_determinant() + operator.log_abs_determinant() ==> scalar Tensor x = ... 
Shape [2, 4] Tensor @@ -96,7 +96,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator): #### Matrix property hints This `LinearOperator` is initialized with boolean flags of the form `is_X`, - for `X = non_singular, self_adjoint, positive_definite`. + for `X = non_singular, self_adjoint, positive_definite, square`. These have the following meaning * If `is_X == True`, callers should expect the operator to have the property `X`. This is a promise that should be fulfilled, but is *not* a @@ -112,6 +112,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator): is_non_singular=None, is_self_adjoint=None, is_positive_definite=None, + is_square=None, name=None): r"""Initialize a `LinearOperatorComposition`. @@ -132,6 +133,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator): self-adjoint to be positive-definite. See: https://en.wikipedia.org/wiki/Positive-definite_matrix\ #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. name: A name for this `LinearOperator`. Default is the individual operators names joined with `_o_`. @@ -177,6 +179,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator): is_non_singular=is_non_singular, is_self_adjoint=is_self_adjoint, is_positive_definite=is_positive_definite, + is_square=is_square, name=name) @property diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py index 97e52d08a43..d81dea65143 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py +++ b/tensorflow/contrib/linalg/python/ops/linear_operator_diag.py @@ -52,7 +52,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator): operator.shape ==> [2, 2] - operator.log_determinant() + operator.log_abs_determinant() ==> scalar Tensor x = ... Shape [2, 4] Tensor @@ -97,7 +97,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator): #### Matrix property hints This `LinearOperator` is initialized with boolean flags of the form `is_X`, - for `X = non_singular, self_adjoint, positive_definite`. + for `X = non_singular, self_adjoint, positive_definite, square`. These have the following meaning * If `is_X == True`, callers should expect the operator to have the property `X`. This is a promise that should be fulfilled, but is *not* a @@ -113,6 +113,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator): is_non_singular=None, is_self_adjoint=None, is_positive_definite=None, + is_square=None, name="LinearOperatorDiag"): r"""Initialize a `LinearOperatorDiag`. @@ -129,6 +130,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator): self-adjoint to be positive-definite. See: https://en.wikipedia.org/wiki/Positive-definite_matrix\ #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. name: A name for this `LinearOperator`. 
Raises: @@ -147,12 +149,17 @@ class LinearOperatorDiag(linear_operator.LinearOperator): else: is_self_adjoint = True + if is_square is False: + raise ValueError("Only square diagonal operators currently supported.") + is_square = True + super(LinearOperatorDiag, self).__init__( dtype=self._diag.dtype, graph_parents=[self._diag], is_non_singular=is_non_singular, is_self_adjoint=is_self_adjoint, is_positive_definite=is_positive_definite, + is_square=is_square, name=name) def _check_diag(self, diag): diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py b/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py index 64ab5614577..0f245e609b0 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py +++ b/tensorflow/contrib/linalg/python/ops/linear_operator_full_matrix.py @@ -19,11 +19,9 @@ from __future__ import division from __future__ import print_function from tensorflow.contrib.linalg.python.ops import linear_operator -from tensorflow.contrib.linalg.python.ops import linear_operator_util from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops -from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops __all__ = ["LinearOperatorFullMatrix"] @@ -49,7 +47,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): operator.shape ==> [2, 2] - operator.log_determinant() + operator.log_abs_determinant() ==> scalar Tensor x = ... Shape [2, 4] Tensor @@ -93,7 +91,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): #### Matrix property hints This `LinearOperator` is initialized with boolean flags of the form `is_X`, - for `X = non_singular, self_adjoint, positive_definite`. + for `X = non_singular, self_adjoint, positive_definite, square`. These have the following meaning * If `is_X == True`, callers should expect the operator to have the property `X`. This is a promise that should be fulfilled, but is *not* a @@ -109,6 +107,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): is_non_singular=None, is_self_adjoint=None, is_positive_definite=None, + is_square=None, name="LinearOperatorFullMatrix"): r"""Initialize a `LinearOperatorFullMatrix`. @@ -124,6 +123,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): self-adjoint to be positive-definite. See: https://en.wikipedia.org/wiki/Positive-definite_matrix\ #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. name: A name for this `LinearOperator`. Raises: @@ -134,19 +134,13 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): self._matrix = ops.convert_to_tensor(matrix, name="matrix") self._check_matrix(self._matrix) - # Special treatment for (real) Symmetric Positive Definite. 
- self._is_spd = ( - (not self._matrix.dtype.is_complex) - and is_self_adjoint and is_positive_definite) - if self._is_spd: - self._chol = linalg_ops.cholesky(self._matrix) - super(LinearOperatorFullMatrix, self).__init__( dtype=self._matrix.dtype, graph_parents=[self._matrix], is_non_singular=is_non_singular, is_self_adjoint=is_self_adjoint, is_positive_definite=is_positive_definite, + is_square=is_square, name=name) def _check_matrix(self, matrix): @@ -177,23 +171,5 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator): return math_ops.matmul( self._matrix, x, adjoint_a=adjoint, adjoint_b=adjoint_arg) - def _determinant(self): - if self._is_spd: - return math_ops.exp(self.log_abs_determinant()) - return linalg_ops.matrix_determinant(self._matrix) - - def _log_abs_determinant(self): - if self._is_spd: - diag = array_ops.matrix_diag_part(self._chol) - return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1]) - abs_det = math_ops.abs(self.determinant()) - return math_ops.log(abs_det) - - def _solve(self, rhs, adjoint=False, adjoint_arg=False): - rhs = linear_operator_util.matrix_adjoint(rhs) if adjoint_arg else rhs - if self._is_spd: - return linalg_ops.cholesky_solve(self._chol, rhs) - return linalg_ops.matrix_solve(self._matrix, rhs, adjoint=adjoint) - def _to_dense(self): return self._matrix diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py index 845bf25192e..d595442c70b 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py +++ b/tensorflow/contrib/linalg/python/ops/linear_operator_identity.py @@ -112,7 +112,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity): operator.shape ==> [2, 2] - operator.log_determinant() + operator.log_abs_determinant() ==> 0. x = ... Shape [2, 4] Tensor @@ -180,7 +180,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity): #### Matrix property hints This `LinearOperator` is initialized with boolean flags of the form `is_X`, - for `X = non_singular, self_adjoint, positive_definite`. + for `X = non_singular, self_adjoint, positive_definite, square`. These have the following meaning * If `is_X == True`, callers should expect the operator to have the property `X`. This is a promise that should be fulfilled, but is *not* a @@ -198,6 +198,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity): is_non_singular=True, is_self_adjoint=True, is_positive_definite=True, + is_square=True, assert_proper_shapes=False, name="LinearOperatorIdentity"): r"""Initialize a `LinearOperatorIdentity`. @@ -224,6 +225,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity): self-adjoint to be positive-definite. See: https://en.wikipedia.org/wiki/Positive-definite_matrix\ #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. assert_proper_shapes: Python `bool`. If `False`, only perform static checks that initialization and method arguments have proper shape. If `True`, and static checks are inconclusive, add asserts to the graph. 
@@ -248,12 +250,15 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity): raise ValueError("An identity operator is always non-singular.") if not is_positive_definite: raise ValueError("An identity operator is always positive-definite.") + if not is_square: + raise ValueError("An identity operator is always square.") super(LinearOperatorIdentity, self).__init__( dtype=dtype, is_non_singular=is_non_singular, is_self_adjoint=is_self_adjoint, is_positive_definite=is_positive_definite, + is_square=is_square, name=name) self._num_rows = linear_operator_util.shape_tensor( @@ -459,7 +464,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): operator.shape ==> [2, 2] - operator.log_determinant() + operator.log_abs_determinant() ==> 2 * Log[3] x = ... Shape [2, 4] Tensor @@ -510,7 +515,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): #### Matrix property hints This `LinearOperator` is initialized with boolean flags of the form `is_X`, - for `X = non_singular, self_adjoint, positive_definite`. + for `X = non_singular, self_adjoint, positive_definite, square`. These have the following meaning * If `is_X == True`, callers should expect the operator to have the property `X`. This is a promise that should be fulfilled, but is *not* a @@ -527,6 +532,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): is_non_singular=None, is_self_adjoint=None, is_positive_definite=None, + is_square=True, assert_proper_shapes=False, name="LinearOperatorScaledIdentity"): r"""Initialize a `LinearOperatorScaledIdentity`. @@ -550,6 +556,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): self-adjoint to be positive-definite. See: https://en.wikipedia.org/wiki/Positive-definite_matrix\ #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. assert_proper_shapes: Python `bool`. If `False`, only perform static checks that initialization and method arguments have proper shape. If `True`, and static checks are inconclusive, add asserts to the graph. @@ -561,6 +568,9 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): """ self._assert_proper_shapes = assert_proper_shapes + if not is_square: + raise ValueError("A ScaledIdentity operator is always square.") + with ops.name_scope(name, values=[multiplier, num_rows]): self._multiplier = ops.convert_to_tensor(multiplier, name="multiplier") @@ -569,6 +579,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): is_non_singular=is_non_singular, is_self_adjoint=is_self_adjoint, is_positive_definite=is_positive_definite, + is_square=is_square, name=name) # Shape [B1,...Bb, 1, 1] diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py index 756e26cc130..6d4033c2a30 100644 --- a/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py +++ b/tensorflow/contrib/linalg/python/ops/linear_operator_tril.py @@ -53,7 +53,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator): operator.shape ==> [2, 2] - operator.log_determinant() + operator.log_abs_determinant() ==> scalar Tensor x = ... Shape [2, 4] Tensor @@ -90,7 +90,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator): #### Matrix property hints This `LinearOperator` is initialized with boolean flags of the form `is_X`, - for `X = non_singular, self_adjoint, positive_definite`. + for `X = non_singular, self_adjoint, positive_definite, square`. 
These have the following meaning * If `is_X == True`, callers should expect the operator to have the property `X`. This is a promise that should be fulfilled, but is *not* a @@ -106,6 +106,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator): is_non_singular=None, is_self_adjoint=None, is_positive_definite=None, + is_square=None, name="LinearOperatorTriL"): r"""Initialize a `LinearOperatorTriL`. @@ -126,12 +127,19 @@ class LinearOperatorTriL(linear_operator.LinearOperator): self-adjoint to be positive-definite. See: https://en.wikipedia.org/wiki/Positive-definite_matrix\ #Extension_for_non_symmetric_matrices + is_square: Expect that this operator acts like square [batch] matrices. name: A name for this `LinearOperator`. Raises: TypeError: If `diag.dtype` is not an allowed type. + ValueError: If `is_square` is `False`. """ + if is_square is False: + raise ValueError( + "Only square lower triangular operators supported at this time.") + is_square = True + with ops.name_scope(name, values=[tril]): self._tril = ops.convert_to_tensor(tril, name="tril") self._check_tril(self._tril) @@ -144,6 +152,7 @@ class LinearOperatorTriL(linear_operator.LinearOperator): is_non_singular=is_non_singular, is_self_adjoint=is_self_adjoint, is_positive_definite=is_positive_definite, + is_square=is_square, name=name) def _check_tril(self, tril): From c800d2e36954edddcb83aa1df7f623f2780c7780 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 3 May 2017 09:26:28 -0800 Subject: [PATCH 41/51] Add support for python expression as SavedModel CLI inputs with '--input_exprs' option. Now python expression can be passed to --input_exprs option for saved_model_cli run command. For example --input_exprs "x=np.ones((50,50))" Also change inputs for both '--inputs' and '--input_exprs' options to be semicolon separated instead of comma. 
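For example (the paths and keys here are placeholders), file inputs and python
expression inputs can be mixed in one invocation, with --input_exprs taking
precedence when a key appears in both:

```bash
saved_model_cli run --dir /tmp/saved_model --tag_set serve \
  --signature_def serving_default \
  --inputs 'x=/tmp/124.npz[a];y=/tmp/123.npy' \
  --input_exprs 'z=np.ones((6,2));w=[[1],[2]]'
```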
Change: 154971814 --- tensorflow/python/tools/saved_model_cli.py | 156 +++++++++++------- .../python/tools/saved_model_cli_test.py | 90 ++++++---- 2 files changed, 161 insertions(+), 85 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 9dfafb77e7d..2fea29d961e 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -56,20 +56,22 @@ Example output: To show all available information in the SavedModel: $saved_model_cli show --dir /tmp/saved_model --all -'run' command usage: saved_model_cli run [-h] --dir DIR --tag_set TAG_SET - --signature_def SIGNATURE_DEF_KEY --inputs INPUTS - [--outdir OUTDIR] [--overwrite] +usage: saved_model_cli run [-h] --dir DIR --tag_set TAG_SET --signature_def + SIGNATURE_DEF_KEY [--inputs INPUTS] + [--input_exprs INPUT_EXPRS] [--outdir OUTDIR] + [--overwrite] [--tf_debug] + Examples: To run input tensors from files through a MetaGraphDef and save the output tensors to files: $saved_model_cli run --dir /tmp/saved_model --tag_set serve - --signature_def serving_default --inputs x:0=/tmp/124.npz,x2=/tmp/123.npy - --outdir /tmp/out + --signature_def serving_default --inputs x=/tmp/124.npz + --input_exprs 'x2=np.ones((6,2))' --outdir /tmp/out To observe the intermediate Tensor values in the runtime graph, use the --tf_debug flag, e.g.: $saved_model_cli run --dir /tmp/saved_model --tag_set serve - --signature_def serving_default --inputs x:0=/tmp/124.npz,x2=/tmp/123.npy + --signature_def serving_default --inputs 'x=/tmp/124.npz;x2=/tmp/123.npy' --outdir /tmp/out --tf_debug To build this tool from source, run: @@ -367,7 +369,7 @@ def run_saved_model_with_feed_dict(saved_model_dir, tag_set, signature_def_key, output_full_path)) -def preprocess_input_arg_string(inputs_str): +def preprocess_inputs_arg_string(inputs_str): """Parses input arg into dictionary that maps input to file/variable tuple. Parses input string in the format of, for example, @@ -375,74 +377,94 @@ def preprocess_input_arg_string(inputs_str): dictionary looks like {'input_key1': (filename1, variable_name1), 'input_key2': (file2, None)} - , which maps input keys to a tuple of file name and varaible name(None if + , which maps input keys to a tuple of file name and variable name(None if empty). Args: - inputs_str: A string that specified where to load inputs. Each input is - separated by comma. - * If the command line arg for inputs is quoted and contains - whitespace(s), all whitespaces will be ignored. + inputs_str: A string that specified where to load inputs. Inputs are + separated by semicolons. * For each input key: - 'input=filename<[variable_name]>' - * The "[variable_name]" key is optional. Will be set to None if not - specified. + '=' or + '=[]' + * The optional 'variable_name' key will be set to None if not specified. Returns: - A dictionary that maps input keys to a tuple of file name and varaible name. + A dictionary that maps input keys to a tuple of file name and variable name. Raises: - RuntimeError: An error when the given input is in a bad format. + RuntimeError: An error when the given input string is in a bad format. 
""" input_dict = {} - inputs_raw = inputs_str.split(',') + inputs_raw = inputs_str.split(';') for input_raw in filter(bool, inputs_raw): # skip empty strings - # Remove quotes and whitespaces - input_raw = input_raw.replace('"', '').replace('\'', '').replace(' ', '') - # Format of input=filename[variable_name]' - match = re.match(r'^([\w\-]+)=([\w\-.\/]+)\[([\w\-]+)\]$', input_raw) + match = re.match(r'([^=]+)=([^\[\]]+)\[([^\[\]]+)\]$', input_raw) + if match: - input_dict[match.group(1)] = (match.group(2), match.group(3)) + input_dict[match.group(1)] = match.group(2), match.group(3) else: # Format of input=filename' - match = re.match(r'^([\w\-]+)=([\w\-.\/]+)$', input_raw) + match = re.match(r'([^=]+)=([^\[\]]+)$', input_raw) if match: - input_dict[match.group(1)] = (match.group(2), None) + input_dict[match.group(1)] = match.group(2), None else: raise RuntimeError( - 'Input \"%s\" format is incorrect. Please follow \"--inputs ' - 'input_key=file_name[variable_name]\" or input_key=file_name' % - input_raw) + '--inputs "%s" format is incorrect. Please follow' + '"=", or' + '"=[]"' % input_raw) return input_dict -def load_inputs_from_input_arg_string(inputs_str): - """Parses input arg string and load inputs into a dictionary. +def preprocess_input_exprs_arg_string(input_exprs_str): + """Parses input arg into dictionary that maps input key to python expression. - Parses input string in the format of, for example, - "input1=filename1[variable_name1],input2=filename2" into a - dictionary looks like - {'input1:0': ndarray_saved_as_variable_name1_in_filename1 , - 'input2:0': ndarray_saved_in_filename2} - , which maps input keys to a numpy ndarray loaded from file. See Args section - for more details on inputs format. + Parses input string in the format of 'input_key=' into a + dictionary that maps each input_key to its python expression. + + Args: + input_exprs_str: A string that specifies python expression for input keys. + Each input is separated by semicolon. For each input key: + 'input_key=' + + Returns: + A dictionary that maps input keys to python expressions. + + Raises: + RuntimeError: An error when the given input string is in a bad format. + """ + input_dict = {} + + for input_raw in filter(bool, input_exprs_str.split(';')): + if '=' not in input_exprs_str: + raise RuntimeError('--input_exprs "%s" format is incorrect. Please follow' + '"="' % input_exprs_str) + input_key, expr = input_raw.split('=') + input_dict[input_key] = expr + + return input_dict + + +def load_inputs_from_input_arg_string(inputs_str, input_exprs_str): + """Parses input arg strings and create inputs feed_dict. + + Parses '--inputs' string for inputs to be loaded from file, and parses + '--input_exprs' string for inputs to be evaluated from python expression. Args: inputs_str: A string that specified where to load inputs. Each input is - separated by comma. - * If the command line arg for inputs is quoted and contains - whitespace(s), all whitespaces will be ignored. + separated by semicolon. * For each input key: - 'input=filename[variable_name]' + '=' or + '=[]' + * The optional 'variable_name' key will be set to None if not specified. * File specified by 'filename' will be loaded using numpy.load. Inputs can be loaded from only .npy, .npz or pickle files. * The "[variable_name]" key is optional depending on the input file type as descripted in more details below. When loading from a npy file, which always contains a numpy ndarray, the content will be directly assigned to the specified input tensor. 
If a - varaible_name is specified, it will be ignored and a warning will be + variable_name is specified, it will be ignored and a warning will be issued. When loading from a npz zip file, user can specify which variable within the zip file to load for the input tensor inside the square brackets. If @@ -453,10 +475,12 @@ def load_inputs_from_input_arg_string(inputs_str): to the specified input tensor, else SavedModel CLI will assume a dictionary is stored in the pickle file and the value corresponding to the variable_name will be used. + input_exprs_str: A string that specified python expressions for inputs. + * In the format of: '='. + * numpy module is available as np. Returns: - A dictionary that maps input tensor keys to a numpy ndarray loaded from - file. + A dictionary that maps input tensor keys to numpy ndarrays. Raises: RuntimeError: An error when a key is specified, but the input file contains @@ -466,13 +490,14 @@ def load_inputs_from_input_arg_string(inputs_str): """ tensor_key_feed_dict = {} - for input_tensor_key, ( - filename, - variable_name) in preprocess_input_arg_string(inputs_str).items(): + inputs = preprocess_inputs_arg_string(inputs_str) + input_exprs = preprocess_input_exprs_arg_string(input_exprs_str) + + for input_tensor_key, (filename, variable_name) in inputs.items(): + data = np.load(filename) + # When a variable_name key is specified for the input file if variable_name: - data = np.load(filename) - # if file contains a single ndarray, ignore the input name if isinstance(data, np.ndarray): warnings.warn( @@ -488,7 +513,6 @@ def load_inputs_from_input_arg_string(inputs_str): (filename, variable_name)) # When no key is specified for the input file. else: - data = np.load(filename) # Check if npz file only contains a single numpy ndarray. if isinstance(data, np.lib.npyio.NpzFile): variable_name_list = data.files @@ -500,6 +524,16 @@ def load_inputs_from_input_arg_string(inputs_str): else: tensor_key_feed_dict[input_tensor_key] = data + # When input is a python expression: + for input_tensor_key, py_expr in input_exprs.items(): + if input_tensor_key in tensor_key_feed_dict: + warnings.warn( + 'input_key %s has been specified with both --inputs and --input_exprs' + ' options. Value in --input_exprs will be used.' % input_tensor_key) + + # ast.literal_eval does not work with numpy expressions + tensor_key_feed_dict[input_tensor_key] = eval(py_expr) # pylint: disable=eval-used + return tensor_key_feed_dict @@ -531,7 +565,8 @@ def run(args): Args: args: A namespace parsed from command line. 
""" - tensor_key_feed_dict = load_inputs_from_input_arg_string(args.inputs) + tensor_key_feed_dict = load_inputs_from_input_arg_string( + args.inputs, args.input_exprs) run_saved_model_with_feed_dict(args.dir, args.tag_set, args.signature_def, tensor_key_feed_dict, args.outdir, args.overwrite, tf_debug=args.tf_debug) @@ -559,7 +594,7 @@ def create_parser(): 'MetaGraphDef specified by its tag-set:\n' '$saved_model_cli show --dir /tmp/saved_model --tag_set serve\n' 'For a MetaGraphDef with multiple tags in the tag-set, all tags must be ' - 'passed in, separated by \',\':\n' + 'passed in, separated by \';\':\n' '$saved_model_cli show --dir /tmp/saved_model --tag_set serve,gpu\n\n' 'To show all inputs and outputs TensorInfo for a specific' ' SignatureDef specified by the SignatureDef key in a' @@ -601,7 +636,7 @@ def create_parser(): '$saved_model_cli show --dir /tmp/saved_model --tag_set serve' '--signature_def serving_default ' '--inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy' - '--outdir=/out\n\n' + '--input_exprs \'input3_key=np.ones(2)\' --outdir=/out\n\n' 'For more information about input file format, please see:\n' 'https://www.tensorflow.org/programmers_guide/saved_model_cli\n') parser_run = subparsers.add_parser( @@ -622,10 +657,15 @@ def create_parser(): required=True, metavar='SIGNATURE_DEF_KEY', help='key of SignatureDef to run') - msg = ('inputs in the format of \'input_key=filename[variable_name]\', ' - 'separated by \',\'. Inputs can only be loaded from .npy, .npz or ' - 'pickle files. Please use input keys instead of input names.') - parser_run.add_argument('--inputs', type=str, required=True, help=msg) + msg = ('Loading inputs from files, in the format of \'=,' + ' or \'=[]\', separated by \';\'.' + ' The file format can only be from .npy, .npz or pickle.') + parser_run.add_argument('--inputs', type=str, default='', help=msg) + msg = ('Specifying inputs by python expressions, in the format of' + ' "=\'\'", separated by \';\'. ' + 'numpy module is available as \'np\'. 
' + 'Will override duplicate input_keys from --inputs option.') + parser_run.add_argument('--input_exprs', type=str, default='', help=msg) parser_run.add_argument( '--outdir', type=str, @@ -649,6 +689,8 @@ def create_parser(): def main(): parser = create_parser() args = parser.parse_args() + if not args.inputs and not args.input_exprs: + args.error('At least one of --inputs and --input_exprs is required') args.func(args) diff --git a/tensorflow/python/tools/saved_model_cli_test.py b/tensorflow/python/tools/saved_model_cli_test.py index a321ada2dd5..1c7a44b3ebd 100644 --- a/tensorflow/python/tools/saved_model_cli_test.py +++ b/tensorflow/python/tools/saved_model_cli_test.py @@ -201,28 +201,37 @@ Method name is: tensorflow/serving/predict""" self.assertEqual(err.getvalue().strip(), '') def testInputPreProcessFormats(self): - input_str = 'input1=/path/file.txt[ab3], input2=file2,,' - input_dict = saved_model_cli.preprocess_input_arg_string(input_str) + input_str = 'input1=/path/file.txt[ab3];input2=file2' + input_expr_str = 'input3=np.zeros([2,2]);input4=[4,5]' + input_dict = saved_model_cli.preprocess_inputs_arg_string(input_str) + input_expr_dict = saved_model_cli.preprocess_input_exprs_arg_string( + input_expr_str) self.assertTrue(input_dict['input1'] == ('/path/file.txt', 'ab3')) self.assertTrue(input_dict['input2'] == ('file2', None)) - - def testInputPreProcessQuoteAndWhitespace(self): - input_str = '\' input1 = file[v_1]\', input2=file ["sd"] ' - input_dict = saved_model_cli.preprocess_input_arg_string(input_str) - self.assertTrue(input_dict['input1'] == ('file', 'v_1')) - self.assertTrue(input_dict['input2'] == ('file', 'sd')) + self.assertTrue(input_expr_dict['input3'] == 'np.zeros([2,2])') + self.assertTrue(input_expr_dict['input4'] == '[4,5]') self.assertTrue(len(input_dict) == 2) + self.assertTrue(len(input_expr_dict) == 2) + + def testInputPreProcessFileNames(self): + input_str = (r'inputx=C:\Program Files\data.npz[v:0];' + r'input:0=c:\PROGRA~1\data.npy') + input_dict = saved_model_cli.preprocess_inputs_arg_string(input_str) + print(input_dict) + self.assertTrue(input_dict['inputx'] == (r'C:\Program Files\data.npz', + 'v:0')) + self.assertTrue(input_dict['input:0'] == (r'c:\PROGRA~1\data.npy', None)) def testInputPreProcessErrorBadFormat(self): input_str = 'inputx=file[[v1]v2' with self.assertRaises(RuntimeError): - saved_model_cli.preprocess_input_arg_string(input_str) + saved_model_cli.preprocess_inputs_arg_string(input_str) input_str = 'inputx:file' with self.assertRaises(RuntimeError): - saved_model_cli.preprocess_input_arg_string(input_str) - input_str = 'inputx=file(v_1)' + saved_model_cli.preprocess_inputs_arg_string(input_str) + input_str = 'inputx:np.zeros((5))' with self.assertRaises(RuntimeError): - saved_model_cli.preprocess_input_arg_string(input_str) + saved_model_cli.preprocess_input_exprs_arg_string(input_str) def testInputParserNPY(self): x0 = np.array([[1], [2]]) @@ -231,8 +240,8 @@ Method name is: tensorflow/serving/predict""" input1_path = os.path.join(test.get_temp_dir(), 'input1.npy') np.save(input0_path, x0) np.save(input1_path, x1) - input_str = 'x0=' + input0_path + '[x0],x1=' + input1_path - feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str) + input_str = 'x0=' + input0_path + '[x0];x1=' + input1_path + feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '') self.assertTrue(np.all(feed_dict['x0'] == x0)) self.assertTrue(np.all(feed_dict['x1'] == x1)) @@ -240,8 +249,8 @@ Method name is: 
tensorflow/serving/predict""" x0 = np.array([[1], [2]]) input_path = os.path.join(test.get_temp_dir(), 'input.npz') np.savez(input_path, a=x0) - input_str = 'x=' + input_path + '[a],y=' + input_path - feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str) + input_str = 'x=' + input_path + '[a];y=' + input_path + feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '') self.assertTrue(np.all(feed_dict['x'] == x0)) self.assertTrue(np.all(feed_dict['y'] == x0)) @@ -258,25 +267,50 @@ Method name is: tensorflow/serving/predict""" pickle.dump(pkl1, f) with open(input_path2, 'wb') as f: pickle.dump(pkl2, f) - input_str = 'x=' + input_path0 + '[b],y=' + input_path1 + '[c],' + input_str = 'x=' + input_path0 + '[b];y=' + input_path1 + '[c];' input_str += 'z=' + input_path2 - feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str) + feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str, '') self.assertTrue(np.all(feed_dict['x'] == pkl0['b'])) self.assertTrue(np.all(feed_dict['y'] == pkl1)) self.assertTrue(np.all(feed_dict['z'] == pkl2)) - def testInputParserQuoteAndWhitespace(self): + def testInputParserPythonExpression(self): + x1 = np.ones([2, 10]) + x2 = np.array([[1], [2], [3]]) + x3 = np.mgrid[0:5, 0:5] + x4 = [[3], [4]] + input_expr_str = ('x1=np.ones([2,10]);x2=np.array([[1],[2],[3]]);' + 'x3=np.mgrid[0:5,0:5];x4=[[3],[4]]') + feed_dict = saved_model_cli.load_inputs_from_input_arg_string( + '', input_expr_str) + self.assertTrue(np.all(feed_dict['x1'] == x1)) + self.assertTrue(np.all(feed_dict['x2'] == x2)) + self.assertTrue(np.all(feed_dict['x3'] == x3)) + self.assertTrue(np.all(feed_dict['x4'] == x4)) + + def testInputParserBoth(self): x0 = np.array([[1], [2]]) - x1 = np.array(range(6)).reshape(2, 3) - input0_path = os.path.join(test.get_temp_dir(), 'input0.npy') - input1_path = os.path.join(test.get_temp_dir(), 'input1.npy') - np.save(input0_path, x0) - np.save(input1_path, x1) - input_str = '"x0=' + input0_path + '[x0] , x1 = ' + input1_path + '"' - feed_dict = saved_model_cli.load_inputs_from_input_arg_string(input_str) + input_path = os.path.join(test.get_temp_dir(), 'input.npz') + np.savez(input_path, a=x0) + x1 = np.ones([2, 10]) + input_str = 'x0=' + input_path + '[a]' + input_expr_str = 'x1=np.ones([2,10])' + feed_dict = saved_model_cli.load_inputs_from_input_arg_string( + input_str, input_expr_str) self.assertTrue(np.all(feed_dict['x0'] == x0)) self.assertTrue(np.all(feed_dict['x1'] == x1)) + def testInputParserBothDuplicate(self): + x0 = np.array([[1], [2]]) + input_path = os.path.join(test.get_temp_dir(), 'input.npz') + np.savez(input_path, a=x0) + x1 = np.ones([2, 10]) + input_str = 'x0=' + input_path + '[a]' + input_expr_str = 'x0=np.ones([2,10])' + feed_dict = saved_model_cli.load_inputs_from_input_arg_string( + input_str, input_expr_str) + self.assertTrue(np.all(feed_dict['x0'] == x1)) + def testInputParserErrorNoName(self): x0 = np.array([[1], [2]]) x1 = np.array(range(5)) @@ -284,7 +318,7 @@ Method name is: tensorflow/serving/predict""" np.savez(input_path, a=x0, b=x1) input_str = 'x=' + input_path with self.assertRaises(RuntimeError): - saved_model_cli.load_inputs_from_input_arg_string(input_str) + saved_model_cli.load_inputs_from_input_arg_string(input_str, '') def testInputParserErrorWrongName(self): x0 = np.array([[1], [2]]) @@ -293,7 +327,7 @@ Method name is: tensorflow/serving/predict""" np.savez(input_path, a=x0, b=x1) input_str = 'x=' + input_path + '[c]' with self.assertRaises(RuntimeError): - 
saved_model_cli.load_inputs_from_input_arg_string(input_str) + saved_model_cli.load_inputs_from_input_arg_string(input_str, '') def testRunCommandExistingOutdir(self): self.parser = saved_model_cli.create_parser() From 622e2f22041e69656531d224c7a87b9bd2299c25 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 3 May 2017 09:31:24 -0800 Subject: [PATCH 42/51] Fix some pylint errors. Change: 154972424 --- tensorflow/python/ops/data_flow_ops.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py index 6395451e2ae..95e803e2aa0 100644 --- a/tensorflow/python/ops/data_flow_ops.py +++ b/tensorflow/python/ops/data_flow_ops.py @@ -21,7 +21,6 @@ from __future__ import print_function import collections import hashlib -import re import threading import six @@ -56,6 +55,7 @@ def _as_type_list(dtypes): def _as_shape_list(shapes, dtypes, unknown_dim_allowed=False, unknown_rank_allowed=False): """Convert shapes to a list of tuples of int (or None).""" + del dtypes if unknown_dim_allowed: if (not isinstance(shapes, collections.Sequence) or not shapes @@ -925,16 +925,18 @@ class Barrier(object): If barrier has no completed elements, this operation will block until there are 'num_elements' elements to take. + TODO(b/25743580): the semantics of `allow_small_batch` are experimental + and may be extended to other cases in the future. + + TODO(ebrevdo): If a take_many(allow_small_batch=True) is blocking + already when the barrier is closed, it will block for ever. Fix this + by using asynchronous operations. + Args: num_elements: The number of elements to take. allow_small_batch: If the barrier is closed, don't block if there are less completed elements than requested, but instead return all available completed elements. - TODO(b/25743580): the semantics of `allow_small_batch` are experimental - and may be extended to other cases in the future. - TODO(ebrevdo): If a take_many(allow_small_batch=True) is blocking - already when the barrier is closed, it will block for ever. Fix this - by using asynchronous operations. timeout: This specifies the number of milliseconds to block before returning with DEADLINE_EXCEEDED. (This option is not supported yet.) From 44446379790a211df6c77cf7dff8ad819cd4ef57 Mon Sep 17 00:00:00 2001 From: Jianwei Xie Date: Wed, 3 May 2017 09:31:39 -0800 Subject: [PATCH 43/51] Adds model_dir in TF_CONFIG, which is read by RunConfig. Change: 154972457 --- .../python/learn/estimators/run_config.py | 29 ++++++++++++++++--- .../learn/estimators/run_config_test.py | 21 ++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py index 5a63ee7fa82..3b7d618b60f 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py +++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py @@ -28,6 +28,7 @@ import six from tensorflow.contrib.framework.python.framework import experimental from tensorflow.core.protobuf import config_pb2 from tensorflow.python.estimator import run_config as core_run_config +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import server_lib @@ -260,10 +261,12 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig): the feature. evaluation_master: the master on which to perform evaluation. 
model_dir: directory where model parameters, graph etc are saved. If - `None`, see `Estimator` about where the model will be saved. + `None`, will use `model_dir` property in `TF_CONFIG` environment + variable. If both are set, must have same value. If both are `None`, see + `Estimator` about where the model will be saved. session_config: a ConfigProto used to set session parameters, or None. - Note - using this argument, it is easy to provide settings which break - otherwise perfectly good models. Use with care. + Note - using this argument, it is easy to provide settings which break + otherwise perfectly good models. Use with care. """ super(RunConfig, self).__init__( master=master, evaluation_master=evaluation_master) @@ -291,7 +294,7 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig): # create Scaffold and Saver in their model_fn to set these. self._keep_checkpoint_max = keep_checkpoint_max self._keep_checkpoint_every_n_hours = keep_checkpoint_every_n_hours - self._model_dir = model_dir + self._model_dir = _get_model_dir(model_dir) def replace(self, **kwargs): """Returns a new instance of `RunConfig` replacing specified properties. @@ -434,3 +437,21 @@ def _get_master(cluster_spec, task_type, task_id): # For backwards compatibility, we return empty string if task_type was # not set (task_type did not previously exist). return '' + + +def _get_model_dir(model_dir): + """Returns `model_dir` based user provided `model_dir` or `TF_CONFIG`.""" + + model_dir_in_tf_config = json.loads( + os.environ.get('TF_CONFIG') or '{}').get('model_dir', None) + if model_dir_in_tf_config is not None: + if model_dir is not None and model_dir_in_tf_config != model_dir: + raise ValueError( + '`model_dir` provided in RunConfig construct, if set, ' + 'must have the same value as the model_dir in TF_CONFIG. 
' + 'model_dir: {}\nTF_CONFIG["model_dir"]: {}.\n'.format( + model_dir, model_dir_in_tf_config)) + + logging.info('Using model_dir in TF_CONFIG: %s', model_dir_in_tf_config) + + return model_dir or model_dir_in_tf_config diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py b/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py index 6d39a9ad137..9102e42bfbb 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/run_config_test.py @@ -223,6 +223,27 @@ class RunConfigTest(test.TestCase): config = run_config_lib.RunConfig(model_dir=TEST_DIR) self.assertEqual(TEST_DIR, config.model_dir) + def test_model_dir_in_tf_config(self): + tf_config = {"model_dir": TEST_DIR} + with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}): + run_config = run_config_lib.RunConfig() + self.assertEqual(TEST_DIR, run_config.model_dir) + + def test_model_dir_both_in_tf_config_and_constructor(self): + tf_config = {"model_dir": TEST_DIR} + with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}): + run_config = run_config_lib.RunConfig(model_dir=TEST_DIR) + self.assertEqual(TEST_DIR, run_config.model_dir) + + def test_model_dir_fail_if_constructor_value_mismatch_tf_config(self): + tf_config = {"model_dir": TEST_DIR} + with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}): + with self.assertRaisesRegexp( + ValueError, + "`model_dir` provided in RunConfig .* must have " + "the same value .* in TF_CONFIG"): + run_config_lib.RunConfig(model_dir=TEST_DIR + "/sub_dir") + def test_replace(self): config = run_config_lib.RunConfig( tf_random_seed=RANDOM_SEED, model_dir=TEST_DIR) From 427081c11b55d494b92fa836f67ad07439096115 Mon Sep 17 00:00:00 2001 From: Mark Heffernan Date: Wed, 3 May 2017 10:13:00 -0800 Subject: [PATCH 44/51] [XLA] Account for computation output size correctly in HLO rematerialization. Previously in HLO rematerialization, the tuple elements of the output were not counted against memory use. This change fixes this. Also, move the accounting for the parameter to the caller of HLO rematerialization because the arguments to the computation are existing allocation prior to execution and are not part of the memory allocated specifically for the computation. Change: 154978404 --- .../xla/service/hlo_rematerialization.cc | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 44293f582e6..b1ee2e46b0f 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -1160,28 +1160,25 @@ StatusOr HloRematerialization::Run( TuplePointsToAnalysis::Run( module, /*include_loop_fusion_instructions=*/true)); - // Adjust memory limit to account for the parameter and output of the entry + // Adjust memory limit to account for the output of the entry // computation. This is necessary because the per-computation accounting in - // MemoryUsageTracker do not include parameters and output as these are - // typically allocated by the caller. With this adjustment the memory limit - // accounts for the size of all HLO instructions (parameters, output - // instructions, etc). 
-  auto total_size = [this](const HloInstruction* instruction) {
-    int64 total_size = 0;
-    for (const LogicalBuffer* logical_buffer :
-         points_to_analysis_->GetBuffersDefinedByInstruction(instruction)) {
-      total_size += size_function_(logical_buffer->shape());
-    }
-    return total_size;
-  };
-  const HloComputation* entry_computation = module->entry_computation();
-  memory_limit_bytes -= total_size(entry_computation->root_instruction());
-  for (const HloInstruction* param :
-       entry_computation->parameter_instructions()) {
-    memory_limit_bytes -= total_size(param);
-  }
-  VLOG(1) << "Adjusted memory limit accounting for parameters and output: "
-          << HumanReadableNumBytes(memory_limit_bytes);
+  // MemoryUsageTracker does not include the output, as it is typically
+  // allocated by the caller.
+  int64 module_output_size = 0;
+  ShapeUtil::ForEachSubshape(
+      module->entry_computation()->root_instruction()->shape(),
+      [&module_output_size, this](const Shape& subshape,
+                                  const ShapeIndex& /*index*/) {
+        module_output_size += size_function_(subshape);
+        return Status::OK();
+      })
+      .IgnoreError();
+
+  const int64 adjusted_memory_limit_bytes =
+      memory_limit_bytes - module_output_size;
+  VLOG(1) << "Adjusted memory limit accounting for output ("
+          << HumanReadableNumBytes(module_output_size)
+          << "): " << HumanReadableNumBytes(adjusted_memory_limit_bytes);

   XLA_VLOG_LINES(3, "Before HloRematerialization:\n" + module->ToString());
   // Create initial sequence of HLO instructions.
@@ -1204,8 +1201,13 @@ StatusOr HloRematerialization::Run(
         return Status::OK();
       }));

+  // The peak memory usage of the module equals the peak memory use of the entry
+  // computation plus the output size of the computation. This is because the
+  // peak memory for a computation does not include the output as this is
+  // typically accounted for in the caller.
   const int64 before_peak_memory =
-      computation_peak_memory_.at(module->entry_computation());
+      computation_peak_memory_.at(module->entry_computation()) +
+      module_output_size;
   VLOG(1) << "Peak memory usage of module (before): "
           << HumanReadableNumBytes(before_peak_memory);

@@ -1216,9 +1218,9 @@ StatusOr HloRematerialization::Run(
   // Subcomputations called by the entry computation will also be
   // rematerialized.
-  TF_ASSIGN_OR_RETURN(bool changed,
-                      RematerializeComputation(module->entry_computation(),
-                                               sequence, memory_limit_bytes));
+  TF_ASSIGN_OR_RETURN(bool changed, RematerializeComputation(
+                                        module->entry_computation(), sequence,
+                                        adjusted_memory_limit_bytes));

   // Rematerialization can introduce dead code. This occurs if all uses of an
   // instruction are replaced with rematerializations of the instruction.
@@ -1257,7 +1259,8 @@ StatusOr HloRematerialization::Run(
           << " instructions in module " << module->name() << "; "
           << net_instructions_added_ << " net instructions added";
   const int64 current_peak_memory =
-      computation_peak_memory_.at(module->entry_computation());
+      computation_peak_memory_.at(module->entry_computation()) +
+      module_output_size;
   VLOG(1) << "Peak memory usage of module now "
           << HumanReadableNumBytes(current_peak_memory) << " ("
           << current_peak_memory << " bytes), was

From 87ffdd2d500674c28b95abec7cb9b2f7eb0bba91 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Unique TensorFlower" Date: Wed, 3 May 2017 10:14:21 -0800 Subject: [PATCH 45/51] Internal change Change: 154978617 --- third_party/ortools.BUILD | 13 +++++ tools/tf_env_collect.sh | 108 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 third_party/ortools.BUILD create mode 100644 tools/tf_env_collect.sh diff --git a/third_party/ortools.BUILD b/third_party/ortools.BUILD new file mode 100644 index 00000000000..61191e3d271 --- /dev/null +++ b/third_party/ortools.BUILD @@ -0,0 +1,13 @@ +# Google's software suite for combinatorial optimization + +licenses(["notice"]) # Apache2 license + +exports_files(["LICENSE-2.0.txt"]) + +native.cc_library( + name = "linear_solver_glop", + deps = [ + "@ortools_archive//linear_solver:linear_solver_glop", + ], + visibility = ["//visibility:public"], +) diff --git a/tools/tf_env_collect.sh b/tools/tf_env_collect.sh new file mode 100644 index 00000000000..71b17f4b7b8 --- /dev/null +++ b/tools/tf_env_collect.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -u # Check for undefined variables + +echo "Collecting system information..." 
+
+OUTPUT_FILE=tf_env.txt
+
+echo >> $OUTPUT_FILE
+echo "== cat /etc/issue ===============================================" >> $OUTPUT_FILE
+uname -a >> $OUTPUT_FILE
+uname=`uname -s`
+if [ "$(uname)" == "Darwin" ]; then
+  echo Mac OS X `sw_vers -productVersion` >> $OUTPUT_FILE
+elif [ "$(uname)" == "Linux" ]; then
+  cat /etc/*release | grep VERSION >> $OUTPUT_FILE
+fi
+
+
+echo >> $OUTPUT_FILE
+echo '== are we in docker =============================================' >> $OUTPUT_FILE
+num=`cat /proc/1/cgroup | grep docker | wc -l`;
+if [ $num -ge 1 ]; then
+  echo "Yes" >> $OUTPUT_FILE
+else
+  echo "No" >> $OUTPUT_FILE
+fi
+
+echo >> $OUTPUT_FILE
+echo '== compiler =====================================================' >> $OUTPUT_FILE
+c++ --version &>> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== uname -a =====================================================' >> $OUTPUT_FILE
+uname -a >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== check pips ===================================================' >> $OUTPUT_FILE
+pip list 2>&1 | grep "proto\|numpy\|tensorflow" &>> $OUTPUT_FILE
+
+
+echo >> $OUTPUT_FILE
+echo '== check for virtualenv =========================================' >> $OUTPUT_FILE
+python -c "import sys;print(hasattr(sys, \"real_prefix\"))" >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== tensorflow import ============================================' >> $OUTPUT_FILE
+cat << EOF > /tmp/check_tf.py
+import tensorflow as tf;
+print("tf.VERSION = %s" % tf.VERSION)
+print("tf.GIT_VERSION = %s" % tf.GIT_VERSION)
+print("tf.COMPILER_VERSION = %s" % tf.COMPILER_VERSION)
+with tf.Session() as sess:
+  print("Sanity check: %r" % sess.run(tf.constant([1,2,3])[:1]))
+EOF
+python /tmp/check_tf.py &>> ${OUTPUT_FILE}
+
+LD_DEBUG=libs python -c "import tensorflow" 2>>${OUTPUT_FILE} > /tmp/loadedlibs
+grep libcudnn.so /tmp/loadedlibs >> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+echo '== env ==========================================================' >> $OUTPUT_FILE
+if [ -z ${LD_LIBRARY_PATH+x} ]; then
+  echo "LD_LIBRARY_PATH is unset" >> $OUTPUT_FILE;
+else
+  echo LD_LIBRARY_PATH ${LD_LIBRARY_PATH} >> $OUTPUT_FILE;
+fi
+if [ -z ${DYLD_LIBRARY_PATH+x} ]; then
+  echo "DYLD_LIBRARY_PATH is unset" >> $OUTPUT_FILE;
+else
+  echo DYLD_LIBRARY_PATH ${DYLD_LIBRARY_PATH} >> $OUTPUT_FILE;
+fi
+
+
+echo >> $OUTPUT_FILE
+echo '== nvidia-smi ===================================================' >> $OUTPUT_FILE
+nvidia-smi &>> $OUTPUT_FILE
+
+echo >> $OUTPUT_FILE
+
+echo '== cuda libs ===================================================' >> $OUTPUT_FILE
+find /usr/local -type f -name 'libcudart*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
+find /usr/local -type f -name 'libcudnn*' 2>/dev/null | grep cuda | grep -v "\\.cache" >> ${OUTPUT_FILE}
+
+# Remove any words with google.
+mv $OUTPUT_FILE old-$OUTPUT_FILE
+grep -v -i google old-${OUTPUT_FILE} > $OUTPUT_FILE
+
+echo "Wrote environment to ${OUTPUT_FILE}. You can review the contents of that file,"
+echo "and use it to populate the fields in the GitHub issue template."
+echo
+echo "cat ${OUTPUT_FILE}"
+echo
+

From 7828637e07b0081a37dfdc66ff912dd1d6ff3228 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Wed, 3 May 2017 11:04:22 -0800
Subject: [PATCH 46/51] In the CUDA path of depthwise_conv2d, use compile-time
 constants if the filter size and depth_multiplier match the xception model.

Harden the depthwise_conv2d forward test by using non-uniform filter values.
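A minimal sketch of the specialization pattern this change applies, in
isolation (Args and RunKernel are illustrative stand-ins, not names from the
patch): a template parameter of -1 means "not known at compile time", and a
small runtime dispatcher picks the fully specialized instantiation when the
arguments match the common 3x3, depth_multiplier=1 case.

    // Sketch only; Args and RunKernel stand in for the real kernel machinery.
    struct Args { int filter_rows; int filter_cols; int depth_multiplier; };

    template <int kKnownFilterWidth, int kKnownFilterHeight,
              int kKnownDepthMultiplier>
    void RunKernel(const Args& args) {
      // Each value falls back to the runtime argument unless the template
      // parameter pins it, letting the compiler unroll and constant-fold.
      const int filter_rows =
          kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
      const int filter_cols =
          kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
      const int depth_multiplier = kKnownDepthMultiplier < 0
                                       ? args.depth_multiplier
                                       : kKnownDepthMultiplier;
      // ... kernel body would use filter_rows, filter_cols, depth_multiplier.
      (void)filter_rows; (void)filter_cols; (void)depth_multiplier;
    }

    void Run(const Args& args) {
      if (args.filter_rows == 3 && args.filter_cols == 3 &&
          args.depth_multiplier == 1) {
        RunKernel<3, 3, 1>(args);     // fully specialized fast path
      } else {
        RunKernel<-1, -1, -1>(args);  // generic fallback
      }
    }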
Change: 154985456 --- .../core/kernels/depthwise_conv_op_gpu.cu.cc | 238 +++++++++++------- .../kernel_tests/depthwise_conv_op_test.py | 10 +- 2 files changed, 157 insertions(+), 91 deletions(-) diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc index 5377d09ec69..b16adf6102b 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc @@ -24,28 +24,32 @@ limitations under the License. #if !defined(_MSC_VER) #define UNROLL _Pragma("unroll") +#define NOUNROLL _Pragma("nounroll") #else #define UNROLL +#define NOUNROLL #endif namespace tensorflow { -namespace { - -typedef Eigen::GpuDevice GPUDevice; +using Eigen::GpuDevice; // A Cuda kernel to compute the depthwise convolution forward pass // in NHWC format. -template +template __global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input, const T* filter, T* output, int num_outputs) { const int in_rows = args.in_rows; const int in_cols = args.in_cols; const int in_depth = args.in_depth; - const int filter_rows = args.filter_rows; - const int filter_cols = args.filter_cols; - const int depth_multiplier = args.depth_multiplier; + const int filter_rows = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_cols = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; const int stride = args.stride; const int pad_rows = args.pad_rows; const int pad_cols = args.pad_cols; @@ -114,16 +118,20 @@ __global__ void DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. -template +template __global__ void DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input, const T* filter, T* output, int num_outputs) { const int in_rows = args.in_rows; const int in_cols = args.in_cols; const int in_depth = args.in_depth; - const int filter_rows = args.filter_rows; - const int filter_cols = args.filter_cols; - const int depth_multiplier = args.depth_multiplier; + const int filter_rows = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_cols = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; const int stride = args.stride; const int pad_rows = args.pad_rows; const int pad_cols = args.pad_cols; @@ -235,29 +243,41 @@ __global__ void DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, } } -} // namespace +template +void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args, + const T* input, const T* filter, T* output, + TensorFormat data_format) { + const int num_outputs = + args.batch * args.out_rows * args.out_cols * args.out_depth; + CudaLaunchConfig config = GetCudaLaunchConfig(num_outputs, d); + if (data_format == FORMAT_NHWC) { + DepthwiseConv2dGPUKernelNHWC + <<>>( + args, input, filter, output, num_outputs); + } else if (data_format == FORMAT_NCHW) { + DepthwiseConv2dGPUKernelNCHW + <<>>( + args, input, filter, output, num_outputs); + } else { + assert(false); + } +} // A simple launch pad to launch the Cuda kernel for depthwise convolution. 
template struct DepthwiseConv2dGPULaunch { - static void Run(const GPUDevice& d, const DepthwiseArgs args, const T* input, + static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* input, const T* filter, T* output, TensorFormat data_format) { - // In this kernel, each thread is computing the gradients from one element - // in the out_backprop. Note that one element in the out_backprop can map - // to multiple filter elements. - const int num_outputs = - args.batch * args.out_rows * args.out_cols * args.out_depth; - CudaLaunchConfig config = GetCudaLaunchConfig(num_outputs, d); - if (data_format == FORMAT_NHWC) { - DepthwiseConv2dGPUKernelNHWC - <<>>( - args, input, filter, output, num_outputs); - } else if (data_format == FORMAT_NCHW) { - DepthwiseConv2dGPUKernelNCHW - <<>>( - args, input, filter, output, num_outputs); + if (args.filter_rows == 3 && args.filter_cols == 3 && + args.depth_multiplier == 1) { + LaunchDepthwiseConv2dGPU(d, args, input, filter, output, + data_format); } else { - assert(false); + LaunchDepthwiseConv2dGPU(d, args, input, filter, output, + data_format); } } }; @@ -266,18 +286,20 @@ template struct DepthwiseConv2dGPULaunch; template struct DepthwiseConv2dGPULaunch; // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input. -template +template __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC( const DepthwiseArgs args, const T* out_backprop, const T* filter, T* in_backprop, int num_in_backprop) { const int in_rows = args.in_rows; const int in_cols = args.in_cols; const int in_depth = args.in_depth; - const int filter_rows = args.filter_rows; - const int filter_cols = args.filter_cols; - const int depth_multiplier = KNOWN_DEPTH_MULTIPLIER == -1 - ? args.depth_multiplier - : KNOWN_DEPTH_MULTIPLIER; + const int filter_rows = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_cols = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? 
args.depth_multiplier : kKnownDepthMultiplier; const int stride = args.stride; const int pad_rows = args.pad_rows; const int pad_cols = args.pad_cols; @@ -301,14 +323,12 @@ __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC( tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride); const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride); -#pragma nounroll - for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) { + NOUNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) { const int f_r = in_r + pad_rows - out_r * stride; const int temp_out_backprop_offset = out_depth * out_cols * (out_r + out_rows * b); const int temp_filter_offset = filter_cols * f_r; -#pragma nounroll - for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) { + NOUNROLL for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) { const int f_c = in_c + pad_cols - out_c * stride; int filter_offset = depth_multiplier * (in_d + in_depth * (f_c + temp_filter_offset)); @@ -328,7 +348,8 @@ __global__ void DepthwiseConv2dBackpropInputGPUKernelNHWC( } } -template +template __global__ void __launch_bounds__(1024) DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args, const T* out_backprop, @@ -337,9 +358,12 @@ __global__ void __launch_bounds__(1024) const int in_rows = args.in_rows; const int in_cols = args.in_cols; const int in_depth = args.in_depth; - const int filter_rows = args.filter_rows; - const int filter_cols = args.filter_cols; - const int depth_multiplier = args.depth_multiplier; + const int filter_rows = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_cols = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; const int stride = args.stride; const int pad_rows = args.pad_rows; const int pad_cols = args.pad_cols; @@ -395,34 +419,52 @@ __global__ void __launch_bounds__(1024) } } +template +void LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d, + const DepthwiseArgs args, + const T* out_backprop, + const T* filter, T* in_backprop, + TensorFormat data_format) { + const int num_in_backprop = + args.batch * args.in_rows * args.in_cols * args.in_depth; + CudaLaunchConfig config = GetCudaLaunchConfig(num_in_backprop, d); + // Increase block count for when there are more warps/SM than threads/SM. + // TODO(csigg): this is pretty arbitraty and should be generalized using + // cudaOccupancyMaxPotentialBlockSize(). + config.block_count *= 4; + if (data_format == FORMAT_NHWC) { + DepthwiseConv2dBackpropInputGPUKernelNHWC< + T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier> + <<>>( + args, out_backprop, filter, in_backprop, num_in_backprop); + } else if (data_format == FORMAT_NCHW) { + DepthwiseConv2dBackpropInputGPUKernelNCHW< + T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier> + <<>>( + args, out_backprop, filter, in_backprop, num_in_backprop); + } else { + assert(false); + } +} + // A simple launch pad to launch the Cuda kernel for depthwise convolution. 
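// A sketch of what the TODO above suggests, using the
// cudaOccupancyMaxPotentialBlockSize() call it names (the call is a real CUDA
// runtime API; the wiring around it is illustrative only):
//
//   int min_grid_size = 0;
//   int block_size = 0;
//   cudaOccupancyMaxPotentialBlockSize(
//       &min_grid_size, &block_size,
//       DepthwiseConv2dBackpropInputGPUKernelNHWC<T, -1, -1, -1>);
//   // min_grid_size is the smallest grid that still saturates the device,
//   // and block_size maximizes occupancy for this kernel, replacing the
//   // hard-coded "config.block_count *= 4" heuristic.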
template struct DepthwiseConv2dBackpropInputGPULaunch { - static void Run(const GPUDevice& d, const DepthwiseArgs args, + static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* out_backprop, const T* filter, T* in_backprop, TensorFormat data_format) { - const int num_in_backprop = - args.batch * args.in_rows * args.in_cols * args.in_depth; - - CudaLaunchConfig config = GetCudaLaunchConfig(num_in_backprop, d); - // Increase block count for when there are more warps/SM than threads/SM. - config.block_count *= 4; - if (data_format == FORMAT_NHWC) { - if (args.depth_multiplier == 1) { - DepthwiseConv2dBackpropInputGPUKernelNHWC - <<>>( - args, out_backprop, filter, in_backprop, num_in_backprop); + if (args.depth_multiplier == 1) { + if (args.filter_rows == 3 && args.filter_cols == 3) { + LaunchDepthwiseConv2dBackpropInputGPU( + d, args, out_backprop, filter, in_backprop, data_format); } else { - DepthwiseConv2dBackpropInputGPUKernelNHWC - <<>>( - args, out_backprop, filter, in_backprop, num_in_backprop); + LaunchDepthwiseConv2dBackpropInputGPU( + d, args, out_backprop, filter, in_backprop, data_format); } - } else if (data_format == FORMAT_NCHW) { - DepthwiseConv2dBackpropInputGPUKernelNCHW - <<>>( - args, out_backprop, filter, in_backprop, num_in_backprop); } else { - assert(false); + LaunchDepthwiseConv2dBackpropInputGPU( + d, args, out_backprop, filter, in_backprop, data_format); } } }; @@ -431,16 +473,20 @@ template struct DepthwiseConv2dBackpropInputGPULaunch; template struct DepthwiseConv2dBackpropInputGPULaunch; // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. -template +template __global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC( const DepthwiseArgs args, const T* out_backprop, const T* input, T* filter_backprop, int num_out_backprop) { const int in_rows = args.in_rows; const int in_cols = args.in_cols; const int in_depth = args.in_depth; - const int filter_rows = args.filter_rows; - const int filter_cols = args.filter_cols; - const int depth_multiplier = args.depth_multiplier; + const int filter_rows = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_cols = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; const int stride = args.stride; const int pad_rows = args.pad_rows; const int pad_cols = args.pad_cols; @@ -518,16 +564,20 @@ __global__ void DepthwiseConv2dBackpropFilterGPUKernelNHWC( } // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. -template +template __global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW( const DepthwiseArgs args, const T* out_backprop, const T* input, T* filter_backprop, int num_out_backprop) { const int in_rows = args.in_rows; const int in_cols = args.in_cols; const int in_depth = args.in_depth; - const int filter_rows = args.filter_rows; - const int filter_cols = args.filter_cols; - const int depth_multiplier = args.depth_multiplier; + const int filter_rows = + kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; + const int filter_cols = + kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; + const int depth_multiplier = + kKnownDepthMultiplier < 0 ? 
args.depth_multiplier : kKnownDepthMultiplier; const int stride = args.stride; const int pad_rows = args.pad_rows; const int pad_cols = args.pad_cols; @@ -610,28 +660,44 @@ __global__ void DepthwiseConv2dBackpropFilterGPUKernelNCHW( } } +template +void LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d, + const DepthwiseArgs args, + const T* out_backprop, + const T* input, T* filter_backprop, + TensorFormat data_format) { + const int num_out_backprop = + args.batch * args.out_rows * args.out_cols * args.out_depth; + CudaLaunchConfig config = GetCudaLaunchConfig(num_out_backprop, d); + if (data_format == FORMAT_NHWC) { + DepthwiseConv2dBackpropFilterGPUKernelNHWC< + T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier> + <<>>( + args, out_backprop, input, filter_backprop, num_out_backprop); + } else if (data_format == FORMAT_NCHW) { + DepthwiseConv2dBackpropFilterGPUKernelNCHW< + T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier> + <<>>( + args, out_backprop, input, filter_backprop, num_out_backprop); + } else { + assert(false); + } +} + // A simple launch pad to launch the Cuda kernel for depthwise convolution. template struct DepthwiseConv2dBackpropFilterGPULaunch { - static void Run(const GPUDevice& d, const DepthwiseArgs args, + static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* out_backprop, const T* input, T* filter_backprop, TensorFormat data_format) { - // In this kernel, each thread is computing the gradients for one element in - // the out_backprop. - const int num_out_backprop = - args.batch * args.out_rows * args.out_cols * args.out_depth; - CudaLaunchConfig config = GetCudaLaunchConfig(num_out_backprop, d); - - if (data_format == FORMAT_NHWC) { - DepthwiseConv2dBackpropFilterGPUKernelNHWC - <<>>( - args, out_backprop, input, filter_backprop, num_out_backprop); - } else if (data_format == FORMAT_NCHW) { - DepthwiseConv2dBackpropFilterGPUKernelNCHW - <<>>( - args, out_backprop, input, filter_backprop, num_out_backprop); + if (args.filter_rows == 3 && args.filter_cols == 3 && + args.depth_multiplier == 1) { + LaunchDepthwiseConv2dBackpropFilterGPU( + d, args, out_backprop, input, filter_backprop, data_format); } else { - assert(false); + LaunchDepthwiseConv2dBackpropFilterGPU( + d, args, out_backprop, input, filter_backprop, data_format); } } }; diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py index a881ed0dc9a..2fc34bd4d17 100644 --- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py +++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py @@ -113,10 +113,9 @@ class DepthwiseConv2DTest(test.TestCase): total_size_1 *= s for s in filter_in_sizes: total_size_2 *= s - # Initializes the input tensor with array containing incrementing - # numbers from 1. + # Initializes the input and filter tensor with numbers incrementing from 1. 
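# Why non-uniform filter values harden this test (context for the x2 change
# just below): with a constant filter, permuting the filter taps cannot change
# the convolution output, so tap-indexing bugs in the specialized kernels go
# unnoticed. A standalone illustration:
#
#   taps = [1., 2., 3.]
#   sum(t * w for t, w in zip(taps, [1., 1., 1.]))             # 6.0, any tap order
#   sum(t * w for t, w in zip(taps, [1., 2., 3.]))             # 14.0
#   sum(t * w for t, w in zip(reversed(taps), [1., 2., 3.]))   # 10.0: bug now visible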
x1 = [f * 1.0 for f in range(1, total_size_1 + 1)] - x2 = [1.0 for f in range(1, total_size_2 + 1)] + x2 = [f * 1.0 for f in range(1, total_size_2 + 1)] with self.test_session(use_gpu=use_gpu) as sess: t1 = constant_op.constant(x1, shape=tensor_in_sizes) t1.set_shape(tensor_in_sizes) @@ -147,8 +146,9 @@ class DepthwiseConv2DTest(test.TestCase): native_result = sess.run(conv_native) interface_result = sess.run(conv_interface) - print("diff matrix:", - np.amax(np.ravel(native_result) - np.ravel(interface_result))) + print("depthwise conv_2d: ", tensor_in_sizes, "*", filter_in_sizes, + ", stride:", stride, ", padding: ", padding, ", max diff: ", + np.amax(np.absolute(native_result - interface_result))) self.assertArrayNear( np.ravel(native_result), np.ravel(interface_result), 1e-5) self.assertShapeEqual(native_result, conv_native) From 965d620104d375c5fd2b18881f353eb41d9a63a2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 3 May 2017 14:03:00 -0800 Subject: [PATCH 47/51] Internal change. Change: 155009390 --- tensorflow/cc/BUILD | 10 + tensorflow/compiler/xla/reference_util.h | 2 +- .../xla/service/cpu/cpu_instruction_fusion.cc | 5 + .../xla/service/gpu/instruction_fusion.cc | 5 + .../compiler/xla/service/hlo_instruction.cc | 7 +- .../compiler/xla/service/hlo_instruction.h | 3 +- .../learn/python/learn/estimators/head.py | 7 +- tensorflow/core/BUILD | 3 + tensorflow/core/common_runtime/function.cc | 26 +- .../core/common_runtime/function_test.cc | 1018 ++++++++++------- tensorflow/core/framework/attr_value_util.cc | 21 +- tensorflow/core/graph/algorithm.cc | 20 +- tensorflow/core/graph/algorithm.h | 16 +- tensorflow/core/grappler/costs/BUILD | 51 + .../costs/measuring_cost_estimator.cc | 133 +++ .../grappler/costs/measuring_cost_estimator.h | 76 ++ .../core/grappler/costs/robust_stats.cc | 151 +++ tensorflow/core/grappler/costs/robust_stats.h | 42 + .../core/grappler/costs/robust_stats_test.cc | 63 + .../core/grappler/costs/virtual_scheduler.cc | 215 ++++ .../core/grappler/costs/virtual_scheduler.h | 116 ++ tensorflow/core/kernels/BUILD | 7 +- .../performance/performance_models.md | 6 +- .../python/feature_column/feature_column.py | 147 ++- .../feature_column/feature_column_test.py | 144 ++- .../kernel_tests/variable_scope_test.py | 5 + tensorflow/python/ops/variable_scope.py | 11 + tensorflow/python/training/supervisor.py | 2 +- .../stream_executor/cuda/cuda_driver.cc | 45 +- 29 files changed, 1865 insertions(+), 492 deletions(-) create mode 100644 tensorflow/core/grappler/costs/measuring_cost_estimator.cc create mode 100644 tensorflow/core/grappler/costs/measuring_cost_estimator.h create mode 100644 tensorflow/core/grappler/costs/robust_stats.cc create mode 100644 tensorflow/core/grappler/costs/robust_stats.h create mode 100644 tensorflow/core/grappler/costs/robust_stats_test.cc create mode 100644 tensorflow/core/grappler/costs/virtual_scheduler.cc create mode 100644 tensorflow/core/grappler/costs/virtual_scheduler.h diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 42fa139282a..8810b8731ae 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -388,6 +388,16 @@ tf_gen_op_wrappers_cc( visibility = ["//tensorflow:internal"], ) +tf_gen_op_wrappers_cc( + name = "functional_ops", + include_internal_ops = 1, + op_lib_names = [ + "functional_ops", + ], + pkg = "//tensorflow/core", + visibility = ["//tensorflow:internal"], +) + tf_gen_op_wrappers_cc( name = "resource_variable_ops", include_internal_ops = 1, diff --git a/tensorflow/compiler/xla/reference_util.h 
b/tensorflow/compiler/xla/reference_util.h index 03276121294..f58f0bdc9f5 100644 --- a/tensorflow/compiler/xla/reference_util.h +++ b/tensorflow/compiler/xla/reference_util.h @@ -422,7 +422,7 @@ class ReferenceUtil { static std::unique_ptr> ApplyElementwise2D( F&& f, const Array2D& array1, const Array2D&... arrays) { AssertSameSize2D(array1, arrays...); - auto result = MakeUnique>(array1.n1(), array1.n1()); + auto result = MakeUnique>(array1.n1(), array1.n2()); for (int64 i = 0; i < array1.n1(); ++i) { for (int64 j = 0; j < array1.n2(); ++j) { (*result)(i, j) = f(array1(i, j), arrays(i, j)...); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 240da35ef19..dc002846e9e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -24,6 +24,11 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { HloInstruction* producer = consumer->mutable_operand(operand_index); + // Output fusion is not currently supported on CPUs. + if (producer->opcode() == HloOpcode::kFusion) { + return false; + } + // Condition for consumer: must be elementwise or a fusion op // (which necessarily only contains elementwise operations) if (!(consumer->opcode() == HloOpcode::kFusion || diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index 34a44ad4054..a36dcbbd2fa 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -46,6 +46,11 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { HloInstruction* producer = consumer->mutable_operand(operand_index); + // Output fusion is not currently supported on GPUs. + if (producer->opcode() == HloOpcode::kFusion) { + return false; + } + // RNG operations are not currently parallel-friendly on GPU. if (producer->opcode() == HloOpcode::kRng) { return false; diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 179e1832654..66fb0599752 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -1570,7 +1570,9 @@ string HloInstruction::ToCategory() const { return "non-elementwise fusion"; } case FusionKind::kInput: - return "reduce fusion"; + return "input fusion"; + case FusionKind::kOutput: + return "output fusion"; case FusionKind::kTransposeDot: return "dot fusion"; case FusionKind::kConvBackwardFilter: @@ -1618,7 +1620,6 @@ bool HloInstruction::IsFusable() const { // Some kinds of instructions don't make sense to fuse. 
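// Read together with the ShouldFuse changes earlier in this patch: kFusion is
// dropped from the non-fusable list below, making fusion instructions fusable
// in principle (which the new kOutput fusion kind requires), while the CPU and
// GPU backends explicitly keep rejecting producers that are themselves fusions
// until they support output fusion.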
switch (opcode_) { - case HloOpcode::kFusion: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kParameter: @@ -2186,6 +2187,8 @@ string ToString(HloInstruction::FusionKind kind) { return "kLoop"; case HloInstruction::FusionKind::kInput: return "kInput"; + case HloInstruction::FusionKind::kOutput: + return "kOutput"; case HloInstruction::FusionKind::kTransposeDot: return "kTransposeDot"; case HloInstruction::FusionKind::kConvBackwardFilter: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 5ec17c80048..43935690dff 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -54,7 +54,8 @@ class HloInstruction { public: enum class FusionKind { kLoop, // Fused into a loop. - kInput, // Fused into a reduction kernel. + kInput, // Op's input is fused into the op itself. + kOutput, // Op's output is fused into the op itself. kTransposeDot, // Fused into a dot with transposed operands. kConvBackwardFilter, // Fused into a backward filter convolution. kConvBackwardInput, // Fused into a backward input convolution. diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index 15e457f932c..25f2922bf8e 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -379,7 +379,12 @@ def multi_label_head(n_classes, loss_fn=None): """Creates a Head for multi label classification. - The Head uses sigmoid cross entropy loss. + Multi-label classification handles the case where each example may have zero + or more associated labels, from a discrete set. This is distinct from + `multi_class_head` which has exactly one label from a discrete set. + + This head by default uses sigmoid cross entropy loss, which expects as input + a multi-hot tensor of shape `(batch_size, num_classes)`. Args: n_classes: Integer, number of classes, must be >= 2 diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 1617addba05..119bc0f8997 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2417,6 +2417,9 @@ tf_cc_test( ":test_main", ":testlib", "//tensorflow/cc:cc_ops", + "//tensorflow/cc:cc_ops_internal", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:functional_ops", "//tensorflow/core/kernels:cast_op", "//tensorflow/core/kernels:cwise_op", "//tensorflow/core/kernels:function_ops", diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 13e20568fff..3644279b920 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -1001,25 +1001,19 @@ string NewName(const Node* n, bool pretty) { void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) { // We visit nodes in forward topological sort order, which is a // possible execution order of the graph. 
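// Why the rewrite below can start from the sinks (a sketch of the idea, not of
// the exact ReverseDFSFrom contract): walking in-edges backwards from every
// node with no out-edges, and emitting a node only in the post-visit ("leave")
// callback, guarantees that all of a node's inputs are emitted before the node
// itself. That is the same forward topological order the removed hand-rolled
// pending/ready queue produced.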
- std::vector pending(g->num_node_ids()); - std::deque ready; - for (const Node* n : g->nodes()) { - pending[n->id()] = n->in_edges().size(); - if (pending[n->id()] == 0) ready.push_back(n); - } gtl::InlinedVector inputs; gdef->Clear(); gdef->mutable_versions()->CopyFrom(g->versions()); - while (!ready.empty()) { - const Node* n = ready.front(); - ready.pop_front(); - for (const Edge* e : n->out_edges()) { - const Node* next = e->dst(); - if (--pending[next->id()] == 0) { - ready.push_back(next); - } + + std::vector start_nodes; + for (Node* n : g->nodes()) { + if (n->out_edges().empty()) { + start_nodes.push_back(n); } - if (!n->IsOp()) continue; + } + + ReverseDFSFrom(*g, start_nodes, nullptr, [gdef, pretty, &inputs](Node* n) { + if (!n->IsOp()) return; NodeDef* ndef = gdef->add_node(); ndef->set_name(NewName(n, pretty)); ndef->set_op(n->type_string()); @@ -1054,7 +1048,7 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) { ndef->add_input(strings::StrCat(srcname, ":", e->src_output())); } } - } + }); } string DebugString(const Graph* g) { diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc index 8f70ab8783c..af1ff6aec03 100644 --- a/tensorflow/core/common_runtime/function_test.cc +++ b/tensorflow/core/common_runtime/function_test.cc @@ -17,6 +17,10 @@ limitations under the License. #include +#include "tensorflow/cc/ops/array_ops_internal.h" +#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/functional_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/executor.h" @@ -28,10 +32,12 @@ limitations under the License. 
#include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/equal_graph_def.h" namespace tensorflow { @@ -58,13 +64,8 @@ class FunctionTest : public ::testing::Test { : device_(DeviceFactory::NewDevice("CPU", {}, "/job:localhost/replica:0/task:0")) {} - ~FunctionTest() override { - delete exec_; - delete device_; - } - void Create(const FunctionDef& fdef, InstantiateAttrValueSlice attrs) { - delete exec_; + exec_ = nullptr; InstantiationResult result; TF_CHECK_OK(InstantiateFunction(fdef, attrs, GetOpSig, &result)); @@ -79,15 +80,18 @@ class FunctionTest : public ::testing::Test { const int version = g->versions().producer(); LocalExecutorParams params; - params.device = device_; + params.device = device_.get(); params.create_kernel = [this, version](const NodeDef& ndef, OpKernel** kernel) { - return CreateNonCachedKernel(device_, nullptr, ndef, version, kernel); + return CreateNonCachedKernel(device_.get(), nullptr, ndef, version, + kernel); }; params.delete_kernel = [](OpKernel* kernel) { DeleteNonCachedKernel(kernel); }; - TF_CHECK_OK(NewLocalExecutor(params, g, &exec_)); + Executor* exec; + TF_CHECK_OK(NewLocalExecutor(params, g, &exec)); + exec_.reset(exec); } void Run(const std::vector& args, std::vector rets) { @@ -105,8 +109,8 @@ class FunctionTest : public ::testing::Test { } } - Device* device_ = nullptr; - Executor* exec_ = nullptr; + std::unique_ptr device_; + std::unique_ptr exec_; DataTypeVector arg_types_; DataTypeVector ret_types_; }; @@ -136,21 +140,15 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { : device_(DeviceFactory::NewDevice("CPU", {}, "/job:localhost/replica:0/task:0")) {} - ~FunctionLibraryRuntimeTest() override { - delete lib_; - delete lib_def_; - delete device_; - } - void Init(const std::vector& flib) { FunctionDefLibrary proto; for (const auto& fdef : flib) *(proto.add_function()) = fdef; - delete lib_def_; - lib_def_ = new FunctionLibraryDefinition(OpRegistry::Global(), proto); - delete lib_; + lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto)); OptimizerOptions opts; - lib_ = NewFunctionLibraryRuntime(nullptr, Env::Default(), device_, - TF_GRAPH_DEF_VERSION, lib_def_, opts); + lib_.reset(NewFunctionLibraryRuntime(nullptr, Env::Default(), device_.get(), + TF_GRAPH_DEF_VERSION, lib_def_.get(), + opts)); + fdef_lib_ = lib_def_->ToProto(); } Status Run(const string& name, InstantiateAttrValueSlice attrs, @@ -190,7 +188,8 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { return Status::OK(); } - Graph* GetFuncBody(const string& name, InstantiateAttrValueSlice attrs) { + std::unique_ptr GetFuncBody(const string& name, + InstantiateAttrValueSlice attrs) { FunctionLibraryRuntime::Handle handle; Status status = lib_->Instantiate(name, attrs, &handle); if (!status.ok()) { @@ -199,12 +198,13 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { } const FunctionBody* fbody = lib_->GetFunctionBody(handle); CHECK_NOTNULL(fbody); - Graph* ret = new Graph(lib_def_); - CopyGraph(*fbody->graph, ret); + std::unique_ptr ret(new Graph(lib_def_.get())); + CopyGraph(*fbody->graph, ret.get()); return ret; } - Graph* GetGradBody(const string& func, 
InstantiateAttrValueSlice attrs) { + std::unique_ptr GetGradBody(const string& func, + InstantiateAttrValueSlice attrs) { FunctionLibraryRuntime::Handle handle; Status status = lib_->Instantiate(func, attrs, &handle); if (!status.ok()) { @@ -213,17 +213,17 @@ class FunctionLibraryRuntimeTest : public ::testing::Test { } const FunctionBody* fbody = lib_->GetFunctionBody(handle); CHECK_NOTNULL(fbody); - FunctionBody* gbody = SymbolicGradient(*fbody); + std::unique_ptr gbody(SymbolicGradient(*fbody)); CHECK_NOTNULL(gbody); - Graph* ret = new Graph(lib_def_); - CopyGraph(*gbody->graph, ret); - delete gbody; + std::unique_ptr ret(new Graph(lib_def_.get())); + CopyGraph(*gbody->graph, ret.get()); return ret; } - Device* device_ = nullptr; - FunctionLibraryDefinition* lib_def_ = nullptr; - FunctionLibraryRuntime* lib_ = nullptr; + std::unique_ptr device_; + std::unique_ptr lib_def_; + std::unique_ptr lib_; + FunctionDefLibrary fdef_lib_; }; TEST_F(FunctionLibraryRuntimeTest, IsStateful) { @@ -254,113 +254,174 @@ TEST_F(FunctionLibraryRuntimeTest, XTimesN) { test::ExpectTensorEqual(y, test::AsTensor({16, 32, 48, 64})); } +// Adds a function call to 'scope. +// TODO(phawkins): replace with C++ API for calling functions, when that exists. +Output Call(Scope* scope, const string& op_name, const string& fn_name, + gtl::ArraySlice inputs) { + NodeDef def; + NodeDefBuilder builder(op_name, fn_name, scope->graph()->op_registry()); + for (const Input& input : inputs) { + builder.Input(input.node()->name(), input.index(), + input.node()->output_type(input.index())); + } + TF_CHECK_OK(builder.Finalize(&def)); + Status status; + Node* n = scope->graph()->AddNode(def, &status); + TF_CHECK_OK(status); + for (int i = 0; i < inputs.size(); ++i) { + scope->graph()->AddEdge(inputs[i].node(), inputs[i].index(), n, i); + } + return Output(n); +} + TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) { Init({test::function::XTimesTwo(), test::function::XTimesFour(), test::function::XTimes16()}); - Graph* g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}}); + std::unique_ptr g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}}); ASSERT_TRUE(g != nullptr); - const char* e0 = R"P( -(n2:float) -> (n4:float) { - n3 = XTimesFour[T=float](n2) - n4 = XTimesFour[T=float](n3) -} -)P"; - EXPECT_EQ(e0, DebugString(g)); - ExpandInlineFunctions(lib_, g); - const char* e1 = R"P( -(n2:float) -> (n17:float) { - n10 = Identity[T=float](n2) - n7 = XTimesTwo[T=float](n10) - n8 = XTimesTwo[T=float](n7) - n11 = Identity[T=float](n8) - n16 = Identity[T=float](n11) - n13 = XTimesTwo[T=float](n16) - n14 = XTimesTwo[T=float](n13) - n17 = Identity[T=float](n14) -} -)P"; - EXPECT_EQ(e1, DebugString(g)); + { + Scope s = Scope::NewRootScope(); + TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_)); + auto arg = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto a = Call(&s, "x4", "XTimesFour", {arg}); + auto b = Call(&s, "y", "XTimesFour", {a}); + auto ret = ops::_Retval(s.WithOpName("y_RetVal"), b, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); - ExpandInlineFunctions(lib_, g); - const char* e2 = R"P( -(n2:float) -> (n17:float) { - n18 = Const[dtype=int64, value=Tensor]() - n25 = Const[dtype=int64, value=Tensor]() - n32 = Const[dtype=int64, value=Tensor]() - n39 = Const[dtype=int64, value=Tensor]() - n19 = Cast[DstT=float, SrcT=int64](n18) - n26 = Cast[DstT=float, SrcT=int64](n25) - n33 = Cast[DstT=float, SrcT=int64](n32) - n40 = Cast[DstT=float, SrcT=int64](n39) - n10 = Identity[T=float](n2) - n23 = Identity[T=float](n10) - n21 = 
Mul[T=float](n23, n19) - n24 = Identity[T=float](n21) - n30 = Identity[T=float](n24) - n28 = Mul[T=float](n30, n26) - n31 = Identity[T=float](n28) - n11 = Identity[T=float](n31) - n16 = Identity[T=float](n11) - n37 = Identity[T=float](n16) - n35 = Mul[T=float](n37, n33) - n38 = Identity[T=float](n35) - n44 = Identity[T=float](n38) - n42 = Mul[T=float](n44, n40) - n45 = Identity[T=float](n42) - n17 = Identity[T=float](n45) -} -)P"; - EXPECT_EQ(e2, DebugString(g)); + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } + + ExpandInlineFunctions(lib_.get(), g.get()); + { + Scope s = Scope::NewRootScope(); + TF_ASSERT_OK(s.graph()->AddFunctionLibrary(fdef_lib_)); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto func0 = ops::Identity(s.WithOpName("Func/_0"), x); + auto x4_x2 = Call(&s, "x4/x2", "XTimesTwo", {func0}); + auto x4_y = Call(&s, "x4/y", "XTimesTwo", {x4_x2}); + auto func1 = ops::Identity(s.WithOpName("Func/_1"), x4_y); + auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1); + auto y_x2 = Call(&s, "y/x2", "XTimesTwo", {func2}); + auto y_y = Call(&s, "y/y", "XTimesTwo", {y_x2}); + auto func3 = ops::Identity(s.WithOpName("Func/_3"), y_y); + auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } + + ExpandInlineFunctions(lib_.get(), g.get()); + GraphDef e2; + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto x4_x2_two = ops::Const(s.WithOpName("x4/x2/two"), 2LL); + auto x4_y_two = ops::Const(s.WithOpName("x4/y/two"), 2LL); + auto y_x2_two = ops::Const(s.WithOpName("y/x2/two"), 2LL); + auto y_y_two = ops::Const(s.WithOpName("y/y/two"), 2LL); + auto x4_x2_scale = + ops::Cast(s.WithOpName("x4/x2/scale"), x4_x2_two, DT_FLOAT); + auto x4_y_scale = ops::Cast(s.WithOpName("x4/y/scale"), x4_y_two, DT_FLOAT); + auto y_x2_scale = ops::Cast(s.WithOpName("y/x2/scale"), y_x2_two, DT_FLOAT); + auto y_y_scale = ops::Cast(s.WithOpName("y/y/scale"), y_y_two, DT_FLOAT); + auto func0 = ops::Identity(s.WithOpName("Func/_0"), x); + auto func4 = ops::Identity(s.WithOpName("Func/_4"), func0); + auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), func4, x4_x2_scale); + auto func5 = ops::Identity(s.WithOpName("Func/_5"), x4_x2_y); + auto func6 = ops::Identity(s.WithOpName("Func/_6"), func5); + auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), func6, x4_y_scale); + auto func7 = ops::Identity(s.WithOpName("Func/_7"), x4_y_y); + auto func1 = ops::Identity(s.WithOpName("Func/_1"), func7); + auto func2 = ops::Identity(s.WithOpName("Func/_2"), func1); + auto func8 = ops::Identity(s.WithOpName("Func/_8"), func2); + auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), func8, y_x2_scale); + auto func9 = ops::Identity(s.WithOpName("Func/_9"), y_x2_y); + auto func10 = ops::Identity(s.WithOpName("Func/_10"), func9); + auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), func10, y_y_scale); + auto func11 = ops::Identity(s.WithOpName("Func/_11"), y_y_y); + auto func3 = ops::Identity(s.WithOpName("Func/_3"), func11); + auto ret = ops::_Retval(s.WithOpName("y_RetVal"), func3, 0); + TF_ASSERT_OK(s.ToGraphDef(&e2)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(e2, actual); + } // No further inlining. 
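// The pattern used throughout this rewritten test, sketched once (every call
// here appears in the surrounding diff): build the expected graph with the
// C++ ops API in a fresh Scope, serialize both graphs, and compare them
// structurally:
//
//   Scope s = Scope::NewRootScope();
//   auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
//   auto y = ops::Mul(s.WithOpName("y"), x, x);
//   GraphDef expected;
//   TF_ASSERT_OK(s.ToGraphDef(&expected));
//   GraphDef actual;
//   g->ToGraphDef(&actual);
//   TF_EXPECT_GRAPH_EQ(expected, actual);
//
// which replaces the old brittle string comparisons against DebugString()
// output.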
- ExpandInlineFunctions(lib_, g); - EXPECT_EQ(e2, DebugString(g)); + ExpandInlineFunctions(lib_.get(), g.get()); + { + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(e2, actual); + } // Get rid of redundant Identity nodes. - RemoveIdentityNodes(g); - const char* e3 = R"P( -(n2:float) -> (n42:float) { - n18 = Const[dtype=int64, value=Tensor]() - n25 = Const[dtype=int64, value=Tensor]() - n32 = Const[dtype=int64, value=Tensor]() - n39 = Const[dtype=int64, value=Tensor]() - n19 = Cast[DstT=float, SrcT=int64](n18) - n26 = Cast[DstT=float, SrcT=int64](n25) - n33 = Cast[DstT=float, SrcT=int64](n32) - n40 = Cast[DstT=float, SrcT=int64](n39) - n21 = Mul[T=float](n2, n19) - n28 = Mul[T=float](n21, n26) - n35 = Mul[T=float](n28, n33) - n42 = Mul[T=float](n35, n40) -} -)P"; - EXPECT_EQ(e3, DebugString(g)); - delete g; + RemoveIdentityNodes(g.get()); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto x4_x2_two = ops::Const(s.WithOpName("x4/x2/two"), 2LL); + auto x4_y_two = ops::Const(s.WithOpName("x4/y/two"), 2LL); + auto y_x2_two = ops::Const(s.WithOpName("y/x2/two"), 2LL); + auto y_y_two = ops::Const(s.WithOpName("y/y/two"), 2LL); + auto x4_x2_scale = + ops::Cast(s.WithOpName("x4/x2/scale"), x4_x2_two, DT_FLOAT); + auto x4_y_scale = ops::Cast(s.WithOpName("x4/y/scale"), x4_y_two, DT_FLOAT); + auto y_x2_scale = ops::Cast(s.WithOpName("y/x2/scale"), y_x2_two, DT_FLOAT); + auto y_y_scale = ops::Cast(s.WithOpName("y/y/scale"), y_y_two, DT_FLOAT); + auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale); + auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), x4_x2_y, x4_y_scale); + auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), x4_y_y, y_x2_scale); + auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), y_x2_y, y_y_scale); + auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y_y_y, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } } TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) { Init({test::function::XTimesTwo(), test::function::XTimesFour(), test::function::XTimes16()}); - std::unique_ptr g(GetFuncBody("XTimes16", {{"T", DT_FLOAT}})); + std::unique_ptr g = GetFuncBody("XTimes16", {{"T", DT_FLOAT}}); ASSERT_TRUE(g != nullptr); - ExpandInlineFunctions(lib_, g.get()); - OptimizeGraph(lib_, &g); - const char* e0 = R"P( -(n2:float) -> (n7:float) { - n8 = Const[dtype=float, value=Tensor]() - n4 = Mul[T=float](n2, n8) - n5 = Mul[T=float](n4, n8) - n6 = Mul[T=float](n5, n8) - n7 = Mul[T=float](n6, n8) -} -)P"; - EXPECT_EQ(e0, DebugString(g.get())); + ExpandInlineFunctions(lib_.get(), g.get()); + OptimizeGraph(lib_.get(), &g); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto x4_x2_scale = ops::Const( + s.WithOpName("x4/x2/scale/_12__cf__2") + .WithDevice("/job:localhost/replica:0/task:0/cpu:0"), + 2.0f); + auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale); + auto x4_y_y = ops::Mul(s.WithOpName("x4/y/y"), x4_x2_y, x4_x2_scale); + auto y_x2_y = ops::Mul(s.WithOpName("y/x2/y"), x4_y_y, x4_x2_scale); + auto y_y_y = ops::Mul(s.WithOpName("y/y/y"), y_x2_y, x4_x2_scale); + auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y_y_y, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } } TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) { auto func = FDH::Create( // Creates a 
FunctionDef using NodeDefs - // Name + // Name "ManySwapsNodeDef", // Input {"x: float", "y: float"}, @@ -379,9 +440,9 @@ TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) { // Return {{"o", "g:output"}}); Init({test::function::Swap(), func}); - std::unique_ptr g(GetFuncBody("ManySwapsNodeDef", {})); + std::unique_ptr g = GetFuncBody("ManySwapsNodeDef", {}); ASSERT_TRUE(g != nullptr); - OptimizeGraph(lib_, &g); + OptimizeGraph(lib_.get(), &g); const char* e0 = R"P( (n3:float, n2:float) -> (n3:float) { } @@ -412,24 +473,35 @@ TEST_F(FunctionLibraryRuntimeTest, ControlDeps) { {{"o"}, "Add", {"x2:z:0", "y2:z:0"}, {{"T", DT_FLOAT}}}}, {{"o", "o:z:0"}}); Init({test::function::Swap(), func}); - std::unique_ptr g(GetFuncBody("ManySwapsFirst", {})); + std::unique_ptr g = GetFuncBody("ManySwapsFirst", {}); ASSERT_TRUE(g != nullptr); - OptimizeGraph(lib_, &g); + OptimizeGraph(lib_.get(), &g); - // NOTE: We can remove n8, n9, n10, n11 with a control edge n8->n5. + // NOTE: We can remove func0, func1, func2, func9 with a control edge n8->n5. // But we don't have a pass doing that. - const char* e0 = R"P( -(n3:float, n2:float) -> (n6:float) { - n4 = Mul[T=float](n3, n3) - n8 = NoOp() @ n4 - n9 = Identity[T=float](n3) @ n8 - n10 = Identity[T=float](n2) @ n8 - n11 = NoOp() @ n9, n10 - n5 = Mul[T=float](n2, n2) @ n11 - n6 = Add[T=float](n4, n5) -} -)P"; - EXPECT_EQ(e0, DebugString(g.get())); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1); + auto x2 = ops::Mul(s.WithOpName("x2"), x, x); + auto func0 = ops::NoOp(s.WithOpName("Func/_0").WithControlDependencies(x2)); + auto func1 = ops::Identity( + s.WithOpName("Func/_1").WithControlDependencies({func0}), x); + auto func2 = ops::Identity( + s.WithOpName("Func/_2").WithControlDependencies({func0}), y); + auto func9 = ops::NoOp(s.WithOpName("Func/_9").WithControlDependencies( + {func1.output.op(), func2.output.op()})); + auto y2 = + ops::Mul(s.WithOpName("y2").WithControlDependencies({func9}), y, y); + auto o = ops::Add(s.WithOpName("o"), x2, y2); + auto ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } } TEST_F(FunctionLibraryRuntimeTest, Error_NotFound) { @@ -476,84 +548,136 @@ TEST_F(FunctionLibraryRuntimeTest, Error_InstantiaionError) { TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) { Init({test::function::XTimesTwo(), test::function::XTimesFour(), test::function::XTimes16()}); - auto f = GetFuncBody("XTimesTwo", {{"T", DT_FLOAT}}); - const char* e0 = R"P( -(n4:float) -> (n5:float) { - n2 = Const[dtype=int64, value=Tensor]() - n3 = Cast[DstT=float, SrcT=int64](n2) - n5 = Mul[T=float](n4, n3) -} -)P"; - EXPECT_EQ(e0, DebugString(f)); - delete f; - std::unique_ptr g(GetGradBody("XTimesTwo", {{"T", DT_FLOAT}})); - const char* e1 = R"P( -(n4:float, n6:float) -> (n7:float) { - n2 = Const[dtype=int64, value=Tensor]() - n3 = Cast[DstT=float, SrcT=int64](n2) - n5 = Mul[T=float](n4, n3) - n7 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Mul[T=float]](n4, n3, n6) -} -)P"; - EXPECT_EQ(e1, DebugString(g.get())); + std::unique_ptr f = GetFuncBody("XTimesTwo", {{"T", DT_FLOAT}}); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto two = ops::Const(s.WithOpName("two"), 2LL); + auto scale = ops::Cast(s.WithOpName("scale"), two, DT_FLOAT); + 
auto y = ops::Mul(s.WithOpName("y"), x, scale); + auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); - OptimizeGraph(lib_, &g); - const char* e2 = R"P( -(n2:float, n3:float) -> (n9:float) { - n10 = Const[dtype=float, value=Tensor]() - n11 = Const[dtype=int32, value=Tensor]() - n6 = Shape[T=float, out_type=int32](n2) - n5 = Mul[T=float](n3, n10) - n7 = BroadcastGradientArgs[T=int32](n6, n11) - n8 = Sum[T=float, Tidx=int32, keep_dims=false](n5, n7) - n9 = Reshape[T=float, Tshape=int32](n8, n6) -} -)P"; - EXPECT_EQ(e2, DebugString(g.get())); + GraphDef actual; + f->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } + + std::unique_ptr g = GetGradBody("XTimesTwo", {{"T", DT_FLOAT}}); + + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1); + auto two = ops::Const(s.WithOpName("two"), 2LL); + auto scale = ops::Cast(s.WithOpName("scale"), two, DT_FLOAT); + auto y = ops::Mul(s.WithOpName("y"), x, scale); + NameAttrList fn; + fn.set_name("Mul"); + (*fn.mutable_attr())["T"].set_type(DT_FLOAT); + auto func1 = ops::SymbolicGradient( + s.WithOpName("Func/_1"), std::initializer_list{x, scale, func0}, + {DT_FLOAT, DT_FLOAT}, fn); + auto func2 = ops::_Retval(s.WithOpName("Func/_2"), func1[0], 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } + + OptimizeGraph(lib_.get(), &g); + + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1); + auto scale = + ops::Const(s.WithOpName("scale/_5__cf__6") + .WithDevice("/job:localhost/replica:0/task:0/cpu:0"), + 2.0f); + auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale); + auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x); + auto const0 = + ops::Const(s.WithOpName("Func/_1/sy/_6__cf__7") + .WithDevice("/job:localhost/replica:0/task:0/cpu:0"), + 0, {0}); + auto func1_rx = ops::internal::BroadcastGradientArgs( + s.WithOpName("Func/_1/rx"), func1_sx, const0); + auto func1_sum_gx = + ops::Sum(s.WithOpName("Func/_1/sum_gx"), func1_gx, func1_rx.r0); + auto func1_dx = + ops::Reshape(s.WithOpName("Func/_1/dx"), func1_sum_gx, func1_sx); + auto func2 = ops::_Retval(s.WithOpName("Func/_2"), func1_dx, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } } TEST_F(FunctionLibraryRuntimeTest, Gradient_Add) { Init({}); auto T = DT_FLOAT; - auto g = GetFuncBody("SymbolicGradient", - {{"f", FDH::FunctionRef("Add", {{"T", T}})}}); - const char* e0 = R"P( -(n7:float, n5:float, n2:float) -> (n14:float, n11:float) { - n3 = Identity[T=float](n2) - n4 = Identity[T=float](n2) - n6 = Shape[T=float, out_type=int32](n5) - n8 = Shape[T=float, out_type=int32](n7) - n9 = BroadcastGradientArgs[T=int32](n8, n6) - n10 = Sum[T=float, Tidx=int32, keep_dims=false](n3, n9:1) - n13 = Sum[T=float, Tidx=int32, keep_dims=false](n4, n9) - n11 = Reshape[T=float, Tshape=int32](n10, n6) - n14 = Reshape[T=float, Tshape=int32](n13, n8) -} -)P"; - EXPECT_EQ(e0, DebugString(g)); - delete g; + std::unique_ptr g = GetFuncBody( + "SymbolicGradient", {{"f", FDH::FunctionRef("Add", {{"T", T}})}}); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto y = 
ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1); + auto dz = ops::_Arg(s.WithOpName("dz"), DT_FLOAT, 2); + auto gx = ops::Identity(s.WithOpName("gx"), dz); + auto gy = ops::Identity(s.WithOpName("gy"), dz); + auto sx = ops::Shape(s.WithOpName("sx"), x); + auto sy = ops::Shape(s.WithOpName("sy"), y); + auto rx = ops::internal::BroadcastGradientArgs(s.WithOpName("rx"), sx, sy); + auto sum_gx = ops::Sum(s.WithOpName("sum_gx"), gx, rx.r0); + auto sum_gy = ops::Sum(s.WithOpName("sum_gy"), gy, rx.r1); + auto dx = ops::Reshape(s.WithOpName("dx"), sum_gx, sx); + auto dy = ops::Reshape(s.WithOpName("dy"), sum_gy, sy); + auto dx_ret = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0); + auto dy_ret = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } } TEST_F(FunctionLibraryRuntimeTest, Gradient_Mul) { Init({}); auto T = DT_FLOAT; - auto g = GetFuncBody("SymbolicGradient", - {{"f", FDH::FunctionRef("Mul", {{"T", T}})}}); - const char* e0 = R"P( -(n6:float, n3:float, n2:float) -> (n14:float, n11:float) { - n4 = Mul[T=float](n2, n3) - n5 = Shape[T=float, out_type=int32](n3) - n7 = Mul[T=float](n6, n2) - n8 = Shape[T=float, out_type=int32](n6) - n9 = BroadcastGradientArgs[T=int32](n8, n5) - n10 = Sum[T=float, Tidx=int32, keep_dims=false](n7, n9:1) - n13 = Sum[T=float, Tidx=int32, keep_dims=false](n4, n9) - n11 = Reshape[T=float, Tshape=int32](n10, n5) - n14 = Reshape[T=float, Tshape=int32](n13, n8) -} -)P"; - EXPECT_EQ(e0, DebugString(g)); - delete g; + std::unique_ptr g = GetFuncBody( + "SymbolicGradient", {{"f", FDH::FunctionRef("Mul", {{"T", T}})}}); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1); + auto dz = ops::_Arg(s.WithOpName("dz"), DT_FLOAT, 2); + auto gx = ops::Mul(s.WithOpName("gx"), dz, y); + auto sx = ops::Shape(s.WithOpName("sx"), x); + auto gy = ops::Mul(s.WithOpName("gy"), x, dz); + auto sy = ops::Shape(s.WithOpName("sy"), y); + auto rx = ops::internal::BroadcastGradientArgs(s.WithOpName("rx"), sx, sy); + auto sum_gx = ops::Sum(s.WithOpName("sum_gx"), gx, rx.r0); + auto sum_gy = ops::Sum(s.WithOpName("sum_gy"), gy, rx.r1); + auto dx = ops::Reshape(s.WithOpName("dx"), sum_gx, sx); + auto dy = ops::Reshape(s.WithOpName("dy"), sum_gy, sy); + auto dx_ret = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0); + auto dy_ret = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } } TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) { @@ -570,108 +694,170 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) { }); // TestGrad = Test'(x, y) - auto grad = - FDH::Define("TestGrad", {"x:float", "y:float"}, {"dx:float", "dy:float"}, - {}, {FDH::Const("dz", 1), - {{"grad0", "grad1"}, - "SymbolicGradient", - {"x", "y", "dz"}, - { - {"f", FDH::FunctionRef("Test")}, - {"Tin", DataTypeSlice{T, T, T}}, - {"Tout", DataTypeSlice{T, T}}, - }}, - {{"dx"}, "Identity", {"grad0"}, {{"T", DT_FLOAT}}}, - {{"dy"}, "Identity", {"grad1"}, {{"T", DT_FLOAT}}}}); + auto grad = FDH::Define("TestGrad", {"x:float", "y:float"}, + {"dx:float", "dy:float"}, {}, + {FDH::Const("dz", 1), + {{"grad0", "grad1"}, + "SymbolicGradient", + {"x", "y", "dz"}, + { + {"f", FDH::FunctionRef("Test")}, + {"Tin", DataTypeSlice{T, T, T}}, + {"Tout", DataTypeSlice{T, 
T}}, + }}, + {{"dx"}, "Identity", {"grad0"}, {{"T", DT_FLOAT}}}, + {{"dy"}, "Identity", {"grad1"}, {{"T", DT_FLOAT}}}}); Init({test, grad}); - std::unique_ptr g(GetFuncBody("TestGrad", {})); + std::unique_ptr g = GetFuncBody("TestGrad", {}); ASSERT_TRUE(g != nullptr); - const char* e0 = R"P( -(n4:float, n3:float) -> (n8:float, n6:float) { - n2 = Const[dtype=float, value=Tensor]() - n5 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Test](n4, n3, n2) - n6 = Identity[T=float](n5:1) - n8 = Identity[T=float](n5) -} -)P"; - EXPECT_EQ(e0, DebugString(g.get())); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1); + auto dz = ops::Const(s.WithOpName("dz"), 1.0f); + NameAttrList fn; + fn.set_name("Test"); + auto grad0 = ops::SymbolicGradient(s.WithOpName("grad0"), + std::initializer_list{x, y, dz}, + {DT_FLOAT, DT_FLOAT}, fn); + auto dx = ops::Identity(s.WithOpName("dx"), grad0[0]); + auto dy = ops::Identity(s.WithOpName("dy"), grad0[1]); + auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0); + auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); - ExpandInlineFunctions(lib_, g.get()); - const char* e1 = R"P( -(n4:float, n3:float) -> (n8:float, n6:float) { - n10 = Const[dtype=int32, value=Tensor]() - n11 = Const[dtype=int32, value=Tensor]() - n2 = Const[dtype=float, value=Tensor]() - n26 = Identity[T=float](n2) - n25 = Identity[T=float](n3) - n24 = Identity[T=float](n4) - n14 = Add[T=float](n24, n25) - n15 = Rank[T=float](n14) - n16 = Range[Tidx=int32](n11, n15, n10) - n20 = ZerosLike[T=int32](n15) - n17 = Sum[T=float, Tidx=int32, keep_dims=false](n14, n16) - n19 = SymbolicGradient[Tin={float, int32, float}, Tout={float, int32}, f=Sum[T=float, Tidx=int32, keep_dims=false]](n14, n16, n26) - n21 = SymbolicGradient[Tin={float, float, float}, Tout={float, float}, f=Add[T=float]](n24, n25, n19) - n27 = Identity[T=float](n21) - n28 = Identity[T=float](n21:1) - n8 = Identity[T=float](n27) - n6 = Identity[T=float](n28) -} -)P"; - EXPECT_EQ(e1, DebugString(g.get())); + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } - OptimizeGraph(lib_, &g); - const char* e2 = R"P( -(n4:float, n3:float) -> (n25:float, n23:float) { - n2 = Const[dtype=float, value=Tensor]() - n7 = Const[dtype=int32, value=Tensor]() - n8 = Const[dtype=int32, value=Tensor]() - n19 = Shape[T=float, out_type=int32](n3) - n9 = Add[T=float](n4, n3) - n20 = Shape[T=float, out_type=int32](n4) - n10 = Rank[T=float](n9) - n14 = Shape[T=float, out_type=int32](n9) - n21 = BroadcastGradientArgs[T=int32](n20, n19) - n11 = Range[Tidx=int32](n8, n10, n7) - n12 = Shape[T=int32, out_type=int32](n11) - n13 = Fill[T=int32](n12, n7) - n15 = DynamicStitch[N=2, T=int32](n11, n11, n14, n13) - n16 = Reshape[T=float, Tshape=int32](n2, n15) - n17 = Div[T=int32](n14, n15) - n18 = Tile[T=float, Tmultiples=int32](n16, n17) - n22 = Sum[T=float, Tidx=int32, keep_dims=false](n18, n21:1) - n24 = Sum[T=float, Tidx=int32, keep_dims=false](n18, n21) - n23 = Reshape[T=float, Tshape=int32](n22, n19) - n25 = Reshape[T=float, Tshape=int32](n24, n20) -} -)P"; - EXPECT_EQ(e2, DebugString(g.get())); + ExpandInlineFunctions(lib_.get(), g.get()); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1); + auto dz = ops::Const(s.WithOpName("dz"), 1.0f); + auto 
grad0_zero = ops::Const(s.WithOpName("grad0/zero"), 0); + auto grad0_one = ops::Const(s.WithOpName("grad0/one"), 1); + auto func0 = ops::Identity(s.WithOpName("Func/_0"), x); + auto func1 = ops::Identity(s.WithOpName("Func/_1"), y); + auto func2 = ops::Identity(s.WithOpName("Func/_2"), dz); + auto grad0_z = ops::Add(s.WithOpName("grad0/z"), func0, func1); + auto grad0_r = ops::Rank(s.WithOpName("grad0/r"), grad0_z); + auto grad0_indices = ops::Range(s.WithOpName("grad0/indices"), grad0_zero, + grad0_r, grad0_one); + auto grad0_l = ops::Sum(s.WithOpName("grad0/l"), grad0_z, grad0_indices); + + NameAttrList sum; + sum.set_name("Sum"); + (*sum.mutable_attr())["T"].set_type(DT_FLOAT); + (*sum.mutable_attr())["Tidx"].set_type(DT_INT32); + (*sum.mutable_attr())["keep_dims"].set_b(false); + auto grad0_func1 = ops::SymbolicGradient( + s.WithOpName("grad0/Func/_1"), + std::initializer_list{grad0_z, grad0_indices, func2}, + {DT_FLOAT, DT_INT32}, sum); + + auto grad0_func2 = ops::ZerosLike(s.WithOpName("grad0/Func/_2"), grad0_r); + + NameAttrList add; + add.set_name("Add"); + (*add.mutable_attr())["T"].set_type(DT_FLOAT); + auto grad0_func3 = ops::SymbolicGradient( + s.WithOpName("grad0/Func/_3"), + std::initializer_list{func0, func1, grad0_func1[0]}, + {DT_FLOAT, DT_FLOAT}, add); + + auto func3 = ops::Identity(s.WithOpName("Func/_3"), grad0_func3[0]); + auto func4 = ops::Identity(s.WithOpName("Func/_4"), grad0_func3[1]); + auto dx = ops::Identity(s.WithOpName("dx"), func3); + auto dy = ops::Identity(s.WithOpName("dy"), func4); + auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0); + auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1); + + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } + + OptimizeGraph(lib_.get(), &g); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0); + auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1); + auto dz = ops::Const(s.WithOpName("dz"), 1.0f); + auto grad0_zero = ops::Const(s.WithOpName("grad0/zero"), 0); + auto grad0_one = ops::Const(s.WithOpName("grad0/one"), 1); + auto grad0_z = ops::Add(s.WithOpName("grad0/z"), x, y); + auto grad0_r = ops::Rank(s.WithOpName("grad0/r"), grad0_z); + auto grad0_indices = ops::Range(s.WithOpName("grad0/indices"), grad0_zero, + grad0_r, grad0_one); + auto i_shape = + ops::Shape(s.WithOpName("grad0/Func/_1/i_shape"), grad0_indices); + auto stitch_val = ops::Fill(s.WithOpName("grad0/Func/_1/stitch_val1"), + i_shape, grad0_one); + auto x_shape = ops::Shape(s.WithOpName("grad0/Func/_1/x_shape"), grad0_z); + auto y_shape = ops::DynamicStitch( + s.WithOpName("grad0/Func/_1/y_shape"), + std::initializer_list{grad0_indices, grad0_indices}, + std::initializer_list{x_shape, stitch_val}); + auto dy_reshaped = + ops::Reshape(s.WithOpName("grad0/Func/_1/dy_reshaped"), dz, y_shape); + auto tile_scaling = + ops::Div(s.WithOpName("grad0/Func/_1/tile_scaling"), x_shape, y_shape); + auto func1_dx = + ops::Tile(s.WithOpName("grad0/Func/_1/dx"), dy_reshaped, tile_scaling); + + auto sx = ops::Shape(s.WithOpName("grad0/Func/_3/sx"), x); + auto sy = ops::Shape(s.WithOpName("grad0/Func/_3/sy"), y); + auto rx = ops::internal::BroadcastGradientArgs( + s.WithOpName("grad0/Func/_3/rx"), sx, sy); + auto sum_gx = + ops::Sum(s.WithOpName("grad0/Func/_3/sum_gx"), func1_dx, rx.r0); + auto sum_gy = + ops::Sum(s.WithOpName("grad0/Func/_3/sum_gy"), func1_dx, rx.r1); + auto dx = 
ops::Reshape(s.WithOpName("grad0/Func/_3/dx"), sum_gx, sx); + auto dy = ops::Reshape(s.WithOpName("grad0/Func/_3/dy"), sum_gy, sy); + + auto dx_retval = ops::_Retval(s.WithOpName("dx_RetVal"), dx, 0); + auto dy_retval = ops::_Retval(s.WithOpName("dy_RetVal"), dy, 1); + + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + + GraphDef actual; + g->ToGraphDef(&actual); + TF_EXPECT_GRAPH_EQ(expected, actual); + } } namespace { bool DoNothing(Graph* g) { return false; } -string Optimize(const std::function& pass, - const FunctionDef& fdef) { +GraphDef Optimize(const std::function& pass, + const FunctionDef& fdef) { InstantiationResult result; InstantiateAttrValueMap empty; TF_CHECK_OK(InstantiateFunction(fdef, empty, GetOpSig, &result)); - Graph* g = new Graph(OpRegistry::Global()); + std::unique_ptr g(new Graph(OpRegistry::Global())); GraphConstructorOptions opts; opts.allow_internal_ops = true; opts.expect_device_spec = false; - TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g)); - pass(g); - Graph* g1 = new Graph(OpRegistry::Global()); - CopyGraph(*g, g1); - delete g; + TF_CHECK_OK(ConvertGraphDefToGraph(opts, result.gdef, g.get())); + pass(g.get()); + std::unique_ptr g1(new Graph(OpRegistry::Global())); + CopyGraph(*g, g1.get()); + g = nullptr; GraphDef gdef; g1->ToGraphDef(&gdef); - delete g1; - return DebugString(gdef); + return gdef; } } // end namespace @@ -700,21 +886,25 @@ TEST(OptimizationTest, RemoveDeadNodes) { {{"keep_me"}, "RandomUniform", {"o"}, {{"T", T}, {"dtype", DT_FLOAT}}}, // y = Add(a, o) {{"y"}, "Add", {"a", "o"}, {{"T", T}}}}); - const char* e0 = R"S( -(x:int32) -> (y:int32) { - o = Const[dtype=int32, value=Tensor]() - keep_me = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](o) - x1 = Add[T=int32](o, o) - a = Square[T=int32](x) - y = Add[T=int32](a, o) - x2 = Mul[T=int32](a, x1) - x3 = Mul[T=int32](x1, x2) -} -)S"; - EXPECT_EQ(Optimize(DoNothing, func), e0); + + GraphDef expected; + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0); + auto o = ops::Const(s.WithOpName("o"), 1); + auto keep_me = ops::RandomUniform(s.WithOpName("keep_me"), {o}, DT_FLOAT); + auto x1 = ops::Add(s.WithOpName("x1"), o, o); + auto a = ops::Square(s.WithOpName("a"), x); + auto y = ops::Add(s.WithOpName("y"), a, o); + auto x2 = ops::Mul(s.WithOpName("x2"), a, x1); + auto x3 = ops::Mul(s.WithOpName("x3"), x1, x2); + auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0); + TF_ASSERT_OK(s.ToGraphDef(&expected)); + } + TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func)); // TODO(zhifengc): Comes up another test case. 
- EXPECT_EQ(Optimize(::tensorflow::RemoveDeadNodes, func), e0); + TF_EXPECT_GRAPH_EQ(expected, Optimize(::tensorflow::RemoveDeadNodes, func)); } TEST(OptimizationTest, RemoveIdentityNodes_Ref) { @@ -735,23 +925,19 @@ TEST(OptimizationTest, RemoveIdentityNodes_Ref) { {{"v_read"}, "Identity", {"v"}, {{"T", T}}}, // returns v + v {{"ret"}, "Add", {"v_read", "v_read"}, {{"T", T}}}}); - const char* e0 = R"S( -() -> (ret:float) { - v = VariableV2[container="", dtype=float, shape=[], shared_name=""]() - v_read = Identity[T=float](v) - ret = Add[T=float](v_read, v_read) -} -)S"; - EXPECT_EQ(Optimize(DoNothing, func), e0); - const char* e1 = R"S( -() -> (ret:float) { - v = VariableV2[container="", dtype=float, shape=[], shared_name=""]() - v_read = Identity[T=float](v) - ret = Add[T=float](v_read, v_read) -} -)S"; - EXPECT_EQ(Optimize(::tensorflow::RemoveIdentityNodes, func), e1); + GraphDef expected; + { + Scope s = Scope::NewRootScope(); + auto v = ops::Variable(s.WithOpName("v"), PartialTensorShape({}), DT_FLOAT); + auto v_read = ops::Identity(s.WithOpName("v_read"), v); + auto ret = ops::Add(s.WithOpName("ret"), v_read, v_read); + auto ret_retval = ops::_Retval(s.WithOpName("ret_RetVal"), ret, 0); + TF_ASSERT_OK(s.ToGraphDef(&expected)); + } + TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func)); + TF_EXPECT_GRAPH_EQ(expected, + Optimize(::tensorflow::RemoveIdentityNodes, func)); } TEST(OptimizationTest, RemoveIdentityNodes) { @@ -782,28 +968,38 @@ TEST(OptimizationTest, RemoveIdentityNodes) { {"x3"}}, // y = Add(a, o) {{"y"}, "Add", {"a", "o"}, {{"T", T}}}}); - const char* e0 = R"S( -(x:int32) -> (y:int32) { - o = Const[dtype=int32, value=Tensor]() - a = Square[T=int32](x) - y = Add[T=int32](a, o) - x1 = Identity[T=int32](a) - x2 = Identity[T=int32](x1) - x3 = Identity[T=int32](x2) - keep_me = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](o) @ x3 -} -)S"; - EXPECT_EQ(Optimize(DoNothing, func), e0); - const char* e1 = R"S( -(x:int32) -> (y:int32) { - o = Const[dtype=int32, value=Tensor]() - a = Square[T=int32](x) - y = Add[T=int32](a, o) - keep_me = RandomUniform[T=int32, dtype=float, seed2=0, seed=0](o) @ a -} -)S"; - EXPECT_EQ(Optimize(::tensorflow::RemoveIdentityNodes, func), e1); + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0); + auto o = ops::Const(s.WithOpName("o"), 1); + auto a = ops::Square(s.WithOpName("a"), x); + auto y = ops::Add(s.WithOpName("y"), a, o); + auto x1 = ops::Identity(s.WithOpName("x1"), a); + auto x2 = ops::Identity(s.WithOpName("x2"), x1); + auto x3 = ops::Identity(s.WithOpName("x3"), x2); + auto keep_me = ops::RandomUniform( + s.WithOpName("keep_me").WithControlDependencies(x3), {o}, DT_FLOAT); + auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func)); + } + + { + Scope s = Scope::NewRootScope(); + auto x = ops::_Arg(s.WithOpName("x"), DT_INT32, 0); + auto o = ops::Const(s.WithOpName("o"), 1); + auto a = ops::Square(s.WithOpName("a"), x); + auto y = ops::Add(s.WithOpName("y"), a, o); + auto keep_me = ops::RandomUniform( + s.WithOpName("keep_me").WithControlDependencies(a), {o}, DT_FLOAT); + auto ret = ops::_Retval(s.WithOpName("y_RetVal"), y, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, + Optimize(::tensorflow::RemoveIdentityNodes, func)); + } } TEST(OptimizationTest, RemoveListArrayConverter) { @@ -840,49 +1036,63 @@ TEST(OptimizationTest, 
RemoveListArrayConverter) { // Return values {{"o", "o:sum"}}); - const char* e0 = R"P( -(i:float) -> (o:float) { - zero = Const[dtype=int32, value=Tensor]() - s = Split[T=float, num_split=4](zero, i) - a = _ArrayToList[N=4, T=float, out_types={float, float, float, float}](s, s:1, s:2, s:3) - r = Mul[T=float](a:2, a:3) - l = Mul[T=float](a, a:1) - x = _ListToArray[N=2, T=float, Tin={float, float}](l, r) - o = AddN[N=2, T=float](x, x:1) -} -)P"; - EXPECT_EQ(Optimize(DoNothing, func), e0); + { + Scope scope = Scope::NewRootScope(); + auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0); + auto zero = ops::Const(scope.WithOpName("zero"), 0); + auto s = ops::Split(scope.WithOpName("s"), zero, i, 4); + auto a = ops::_ArrayToList(scope.WithOpName("a"), s.output, + {DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT}); + auto r = ops::Mul(scope.WithOpName("r"), a[2], a[3]); + auto l = ops::Mul(scope.WithOpName("l"), a[0], a[1]); + auto x = ops::_ListToArray(scope.WithOpName("x"), + std::initializer_list{l, r}, DT_FLOAT, 2); + auto o = ops::AddN(scope.WithOpName("o"), x.output); + auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0); + GraphDef expected; + TF_ASSERT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func)); + } - const char* e1 = R"P( -(i:float) -> (o:float) { - zero = Const[dtype=int32, value=Tensor]() - s = Split[T=float, num_split=4](zero, i) - r = Mul[T=float](Func/_2, Func/_3) - l = Mul[T=float](Func/_0, Func/_1) - o = AddN[N=2, T=float](Func/_4, Func/_5) - Func/_0 = Identity[T=float](s) - Func/_1 = Identity[T=float](s:1) - Func/_2 = Identity[T=float](s:2) - Func/_3 = Identity[T=float](s:3) - Func/_4 = Identity[T=float](l) - Func/_5 = Identity[T=float](r) -} -)P"; - EXPECT_EQ(Optimize(RemoveListArrayConverter, func), e1); + { + Scope scope = Scope::NewRootScope(); + auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0); + auto zero = ops::Const(scope.WithOpName("zero"), 0); + auto s = ops::Split(scope.WithOpName("s"), zero, i, 4); + auto func_0 = ops::Identity(scope.WithOpName("Func/_0"), s[0]); + auto func_1 = ops::Identity(scope.WithOpName("Func/_1"), s[1]); + auto func_2 = ops::Identity(scope.WithOpName("Func/_2"), s[2]); + auto func_3 = ops::Identity(scope.WithOpName("Func/_3"), s[3]); + auto r = ops::Mul(scope.WithOpName("r"), func_2, func_3); + auto l = ops::Mul(scope.WithOpName("l"), func_0, func_1); + auto func_4 = ops::Identity(scope.WithOpName("Func/_4"), l); + auto func_5 = ops::Identity(scope.WithOpName("Func/_5"), r); + auto o = ops::AddN(scope.WithOpName("o"), + std::initializer_list{func_4, func_5}); + auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0); + GraphDef expected; + TF_ASSERT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, Optimize(RemoveListArrayConverter, func)); + } - const char* e2 = R"P( -(i:float) -> (o:float) { - zero = Const[dtype=int32, value=Tensor]() - s = Split[T=float, num_split=4](zero, i) - r = Mul[T=float](s:2, s:3) - l = Mul[T=float](s, s:1) - o = AddN[N=2, T=float](l, r) -} -)P"; - auto remove_listarray_and_identity = [](Graph* g) { - return RemoveListArrayConverter(g) && RemoveIdentityNodes(g); - }; - EXPECT_EQ(Optimize(remove_listarray_and_identity, func), e2); + { + Scope scope = Scope::NewRootScope(); + auto i = ops::_Arg(scope.WithOpName("i"), DT_FLOAT, 0); + auto zero = ops::Const(scope.WithOpName("zero"), 0); + auto s = ops::Split(scope.WithOpName("s"), zero, i, 4); + auto r = ops::Mul(scope.WithOpName("r"), s[2], s[3]); + auto l = ops::Mul(scope.WithOpName("l"), 
s[0], s[1]); + auto o = + ops::AddN(scope.WithOpName("o"), std::initializer_list{l, r}); + auto o_ret = ops::_Retval(scope.WithOpName("o_RetVal"), o, 0); + GraphDef expected; + TF_ASSERT_OK(scope.ToGraphDef(&expected)); + + auto remove_listarray_and_identity = [](Graph* g) { + return RemoveListArrayConverter(g) && RemoveIdentityNodes(g); + }; + TF_EXPECT_GRAPH_EQ(expected, Optimize(remove_listarray_and_identity, func)); + } } TEST(OptimizationTest, RemoveListArrayConverter_WithContolDeps) { @@ -911,33 +1121,47 @@ TEST(OptimizationTest, RemoveListArrayConverter_WithContolDeps) { {"x"}}}, {{"o", "o:sum"}}); - const char* e0 = R"P( -(i:float) -> (o:float) { - dummy = Const[dtype=int32, value=Tensor]() - x = _ListToArray[N=2, T=float, Tin={float, float}](i, i) @ dummy - o = AddN[N=2, T=float](x, x:1) @ x -} -)P"; - EXPECT_EQ(Optimize(DoNothing, func), e0); + { + Scope s = Scope::NewRootScope(); + auto i = ops::_Arg(s.WithOpName("i"), DT_FLOAT, 0); + auto dummy = ops::Const(s.WithOpName("dummy"), 0); + auto x = ops::_ListToArray(s.WithOpName("x").WithControlDependencies(dummy), + std::initializer_list{i, i}, DT_FLOAT, 2); + auto o = + ops::AddN(s.WithOpName("o").WithControlDependencies({x.output[0].op()}), + x.output); + auto o_ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0); + GraphDef expected; + TF_ASSERT_OK(s.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, Optimize(DoNothing, func)); + } - const char* e1 = R"P( -(i:float) -> (o:float) { - dummy = Const[dtype=int32, value=Tensor]() - o = AddN[N=2, T=float](Func/_0, Func/_1) @ Func/_3 - Func/_0 = Identity[T=float](i) @ Func/_2 - Func/_1 = Identity[T=float](i) @ Func/_2 - Func/_2 = NoOp() @ dummy - Func/_3 = NoOp() @ Func/_0, Func/_1 -} -)P"; - EXPECT_EQ(Optimize(RemoveListArrayConverter, func), e1); + GraphDef expected; + { + Scope s = Scope::NewRootScope(); + auto i = ops::_Arg(s.WithOpName("i"), DT_FLOAT, 0); + auto dummy = ops::Const(s.WithOpName("dummy"), 0); + auto func_2 = + ops::NoOp(s.WithOpName("Func/_2").WithControlDependencies(dummy)); + auto func_0 = ops::Identity( + s.WithOpName("Func/_0").WithControlDependencies({func_2}), i); + auto func_1 = ops::Identity( + s.WithOpName("Func/_1").WithControlDependencies({func_2}), i); + auto func_3 = ops::NoOp(s.WithOpName("Func/_3").WithControlDependencies( + {func_0.output.op(), func_1.output.op()})); + auto o = ops::AddN(s.WithOpName("o").WithControlDependencies({func_3}), + std::initializer_list{func_0, func_1}); + auto o_ret = ops::_Retval(s.WithOpName("o_RetVal"), o, 0); + TF_ASSERT_OK(s.ToGraphDef(&expected)); + } + TF_EXPECT_GRAPH_EQ(expected, Optimize(RemoveListArrayConverter, func)); auto remove_listarray_and_identity = [](Graph* g) { return RemoveListArrayConverter(g) && RemoveIdentityNodes(g); }; // NOTE: We are not removing Identity nodes with any control // dependencies yet. - EXPECT_EQ(Optimize(remove_listarray_and_identity, func), e1); + TF_EXPECT_GRAPH_EQ(expected, Optimize(remove_listarray_and_identity, func)); } } // end namespace tensorflow diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc index 452cfdda9e6..3573cc6ec21 100644 --- a/tensorflow/core/framework/attr_value_util.cc +++ b/tensorflow/core/framework/attr_value_util.cc @@ -400,16 +400,33 @@ void SetAttrValue(gtl::ArraySlice value, AttrValue* out) { } } +// Wrapper around protocol buffer serialization that requests deterministic +// serialization, in particular for Map fields, which serialize in a random +// order by default. Returns true on success. 
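+// For example, two AttrValue protos holding equal map entries that were
+// inserted in different orders may serialize to different byte strings by
+// default; deterministic serialization makes the byte-wise comparison in
+// AreAttrValuesEqual below meaningful.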
+template <class T>
+static bool DeterministicSerialization(const T& t, string* result) {
+  const int size = t.ByteSize();
+  *result = string(size, '\0');
+  ::tensorflow::protobuf::io::ArrayOutputStream array_stream(&(*result)[0],
+                                                             size);
+  ::tensorflow::protobuf::io::CodedOutputStream output_stream(&array_stream);
+  output_stream.SetSerializationDeterministic(true);
+  t.SerializeWithCachedSizes(&output_stream);
+  return !output_stream.HadError() && size == output_stream.ByteCount();
+}
+
 bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b) {
   string a_str, b_str;
-  a.SerializeToString(&a_str);
-  b.SerializeToString(&b_str);
+  DeterministicSerialization(a, &a_str);
+  DeterministicSerialization(b, &b_str);
   // Note: it should be safe to compare proto serializations of the attr
   // values since at most one field should be set in each (indeed, it
   // must be the same field if they are to compare equal).
   // Exception: there are multiple equivalent representations of
   // TensorProtos. So a return value of true implies a == b, but not the
   // converse.
+  // TODO(phawkins): this is incorrect for NameAttrList attributes that may
+  // contain nested AttrValue maps.
   return a_str == b_str;
 }
 
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 38f011ecaf1..3bfba3fc4ee 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -23,8 +23,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-void DFS(const Graph& g, std::function<void(Node*)> enter,
-         std::function<void(Node*)> leave) {
+void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+         const std::function<void(Node*)>& leave) {
   // Stack of work to do.
   struct Work {
     Node* node;
@@ -61,15 +61,23 @@ void DFS(const Graph& g, std::function<void(Node*)> enter,
   }
 }
 
-void ReverseDFS(const Graph& g, std::function<void(Node*)> enter,
-                std::function<void(Node*)> leave) {
+void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
+                const std::function<void(Node*)>& leave) {
+  ReverseDFSFrom(g, {g.sink_node()}, enter, leave);
+}
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                    const std::function<void(Node*)>& enter,
+                    const std::function<void(Node*)>& leave) {
   // Stack of work to do.
   struct Work {
     Node* node;
     bool leave;  // Are we entering or leaving n?
   };
-  std::vector<Work> stack;
-  stack.push_back(Work{g.sink_node(), false});
+  std::vector<Work> stack(start.size());
+  for (int i = 0; i < start.size(); ++i) {
+    stack[i] = Work{start[i], false};
+  }
 
   std::vector<bool> visited(g.num_node_ids(), false);
   while (!stack.empty()) {
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 74aace80722..01d36e0a124 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -21,20 +21,28 @@ limitations under the License.
 #include <functional>
 
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
 
 // Perform a depth-first-search on g starting at the source node.
 // If enter is not empty, calls enter(n) before visiting any children of n.
 // If leave is not empty, calls leave(n) after visiting all children of n.
-extern void DFS(const Graph& g, std::function<void(Node*)> enter,
-                std::function<void(Node*)> leave);
+extern void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+                const std::function<void(Node*)>& leave);
 
 // Perform a reverse depth-first-search on g starting at the sink node.
 // If enter is not empty, calls enter(n) before visiting any parents of n.
 // If leave is not empty, calls leave(n) after visiting all parents of n.
-extern void ReverseDFS(const Graph& g, std::function<void(Node*)> enter,
-                       std::function<void(Node*)> leave);
+extern void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
+                       const std::function<void(Node*)>& leave);
+
+// Perform a reverse depth-first-search on g starting at the 'start' nodes.
+// If enter is not empty, calls enter(n) before visiting any parents of n.
+// If leave is not empty, calls leave(n) after visiting all parents of n.
+extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                           const std::function<void(Node*)>& enter,
+                           const std::function<void(Node*)>& leave);
 
 // Stores in *order the post-order numbering of all nodes
 // in graph found via a depth first search starting at the source node.
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index e784c2df443..22f4708d032 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -90,6 +90,23 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "robust_stats",
+    srcs = ["robust_stats.cc"],
+    hdrs = ["robust_stats.h"],
+    visibility = ["//visibility:public"],
+)
+
+cc_test(
+    name = "robust_stats_test",
+    srcs = ["robust_stats_test.cc"],
+    deps = [
+        ":robust_stats",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "utils",
     srcs = ["utils.cc"],
@@ -116,3 +133,37 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
+
+cc_library(
+    name = "virtual_scheduler",
+    srcs = ["virtual_scheduler.cc"],
+    hdrs = ["virtual_scheduler.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+    ],
+)
+
+cc_library(
+    name = "measuring_cost_estimator",
+    srcs = ["measuring_cost_estimator.cc"],
+    hdrs = ["measuring_cost_estimator.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":robust_stats",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/costs:cost_estimator",
+        "//tensorflow/core/kernels:ops_util",
+    ],
+)
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
new file mode 100644
index 00000000000..6179dc05c1e
--- /dev/null
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -0,0 +1,133 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/measuring_cost_estimator.h"
+
+#include <vector>
+
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace grappler {
+
+MeasuringCostEstimator::MeasuringCostEstimator(Cluster* cluster,
+                                               int measurement_steps,
+                                               int measurement_threads)
+    : measurement_steps_(measurement_steps),
+      measurement_threads_(measurement_threads) {
+  CHECK_GE(measurement_steps, 1);
+  if (measurement_threads > 0) {
+    thread_pool_.reset(new thread::ThreadPool(
+        Env::Default(), SanitizeThreadSuffix("measurements"),
+        measurement_threads));
+  }
+  cluster_ = cluster;
+}
+
+Status MeasuringCostEstimator::Initialize(const GrapplerItem& item) {
+  feed_ = item.feed;
+  fetch_ = item.fetch;
+  return cluster_->Initialize(item);
+}
+
+Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
+                                            CostGraphDef* cost_graph,
+                                            Costs* costs) const {
+  std::vector<double> times(measurement_steps_);
+  BlockingCounter barrier(measurement_steps_);
+
+  mutex status_mu;
+  Status status;
+
+  auto measurement_fn = [&](const int step) {
+    const Costs::MicroSeconds start = Env::Default()->NowMicros();
+
+    RunMetadata metadata;
+    const Status local_status =
+        cluster_->Run(optimized_graph, feed_, fetch_, &metadata);
+    {
+      mutex_lock lock(status_mu);
+      status.Update(local_status);
+    }
+    if (step < 0) {
+      // Discard the first iteration as it triggers the warmup, and therefore
+      // takes much longer than a normal step.
+      return;
+    }
+    if (!local_status.ok()) {
+      // Discard the data if the run wasn't successful.
+      barrier.DecrementCount();
+      return;
+    }
+
+    const Costs::MicroSeconds finish = Env::Default()->NowMicros();
+    const double time = (finish - start).count() * 1e3;
+    times[step] = time;
+
+    if (cost_graph && (step + 1 == measurement_steps_)) {
+      metadata.mutable_cost_graph()->Swap(cost_graph);
+    }
+
+    barrier.DecrementCount();
+  };
+
+  // Initialize the computation and warm up TensorFlow.
+  measurement_fn(-1);
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to run the warmup measurement: "
+               << status.error_message();
+    costs->execution_time = Costs::Duration::max();
+    return status;
+  }
+
+  // Run "measurement_steps_" measurements and record each step's time.
+  if (measurement_threads_ > 0) {
+    for (int i = 0; i < measurement_steps_; ++i) {
+      thread_pool_->Schedule([i, &measurement_fn]() { measurement_fn(i); });
+    }
+    barrier.Wait();
+  } else {
+    for (int i = 0; i < measurement_steps_ && status.ok(); ++i) {
+      measurement_fn(i);
+    }
+  }
+
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed to measure graph performance: "
+               << status.error_message();
+    costs->execution_time = Costs::Duration::max();
+    costs->max_execution_time = Costs::Duration::max();
+    costs->min_execution_time = 0;
+    return status;
+  }
+
+  // Compute the average time of the measurement steps. Use Huber statistics
+  // to filter out outliers.
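+  // Illustrative (hypothetical) numbers: for step times of {10.0, 10.1, 9.9,
+  // 80.0} ms, where 80.0 is a straggler step, a plain average would be
+  // 27.5 ms, while the Huber/MAD-based mean below stays close to 10 ms.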
+  RobustStats stats(times);
+  costs->execution_time = Costs::Duration(stats.mean());
+  costs->max_execution_time = Costs::Duration(stats.hi());
+  costs->min_execution_time = Costs::Duration(stats.lo());
+
+  return Status::OK();
+}
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.h b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
new file mode 100644
index 00000000000..a84853f6c71
--- /dev/null
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+#define TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+class CostGraphDef;
+class GraphDef;
+}  // namespace tensorflow
+
+namespace tensorflow {
+namespace grappler {
+
+class Cluster;
+struct GrapplerItem;
+
+// Estimate the cost of running a Grappler item by actually running the
+// corresponding TensorFlow graph on the specified cluster and measuring the
+// runtimes.
+class MeasuringCostEstimator : public CostEstimator {
+ public:
+  // Run the model for measurement_steps to measure its average cost.
+  // When measurement_threads is greater than 0, use a thread pool with that
+  // many threads to run the measurements; otherwise, run them serially. Does
+  // not take ownership of cluster.
+  explicit MeasuringCostEstimator(Cluster* cluster, int measurement_steps,
+                                  int measurement_threads);
+  ~MeasuringCostEstimator() override {}
+
+  // Initializes the estimator for the specified grappler item.
+  // Returns the status of the underlying cluster's initialization.
+  Status Initialize(const GrapplerItem& item) override;
+
+  // Runs the optimized version of the graph on the cluster, measures
+  // the runtime of each operation, and annotates the CostGraphDef
+  // with the corresponding measurements.
+  // Returns the average latency for the whole graph.
+  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
+                      Costs* overall_cost) const override;
+
+ private:
+  Cluster* cluster_;  // Not owned.
+  int measurement_steps_;
+  int measurement_threads_;
+  std::vector<std::pair<string, Tensor>> feed_;
+  std::vector<string> fetch_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_COSTS_MEASURING_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/robust_stats.cc b/tensorflow/core/grappler/costs/robust_stats.cc
new file mode 100644
index 00000000000..87cda1c0d2e
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats.cc
@@ -0,0 +1,151 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include <algorithm>
+
+namespace tensorflow {
+namespace grappler {
+
+// Given a sorted vector of values, calculate the median.
+// Returns 0 for an empty vector. Does not verify sortedness.
+static double SortedMedian(const std::vector<double> &values) {
+  const int n = values.size();
+  if (n == 0) return 0.0;
+  if (n & 1) {
+    return values[n / 2];
+  } else {
+    return (values[n / 2] + values[n / 2 - 1]) / 2.0;
+  }
+}
+
+// Given a vector of values (sorted or not), calculate the median.
+static double Median(std::vector<double> &&values) {
+  const size_t n = values.size();
+  if (n == 0) return 0;
+  const auto middle = values.begin() + (n / 2);
+  // Put the middle value in its place.
+  std::nth_element(values.begin(), middle, values.end());
+  if (n & 1) {
+    return *middle;
+  }
+  // Return the average of the two middle elements. As a postcondition of
+  // nth_element, the largest element below *middle lies between begin and
+  // middle, so locate it with max_element.
+  const auto lower_middle = std::max_element(values.begin(), middle);
+  // Preventing overflow. We know that '*lower_middle <= *middle'.
+  // If both are on opposite sides of zero, the sum won't overflow, otherwise
+  // the difference won't overflow.
+  if (*lower_middle <= 0 && *middle >= 0) {
+    return (*lower_middle + *middle) / 2;
+  }
+  return *lower_middle + (*middle - *lower_middle) / 2;
+}
+
+// Given a set of values, calculates the scaled Median Absolute Deviation (a
+// robust approximation to the standard deviation). This is calculated as the
+// median of the absolute deviations from the median, scaled by 1.4826. Its
+// advantage over the standard deviation is that it is not (as) affected by
+// outlier values. Returns a (median, scaled MAD) pair.
+static std::pair<double, double> ScaledMedianAbsoluteDeviation(
+    const std::vector<double> &sorted_values) {
+  double median = SortedMedian(sorted_values);
+
+  // Next, we calculate the absolute deviations from the median,
+  // find the median of the resulting data, and scale by 1.4826.
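+  // (The factor 1.4826 is approximately 1 / Phi^-1(3/4); it makes the MAD a
+  // consistent estimator of the standard deviation for normally distributed
+  // data. Worked example with hypothetical values {1, 2, 2, 3, 9}: the median
+  // is 2, the absolute deviations are {1, 0, 0, 1, 7}, their median is 1, so
+  // the scaled MAD is 1.4826.)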
+  std::vector<double> deviations;
+  deviations.reserve(sorted_values.size());
+  for (double d : sorted_values) {
+    deviations.push_back(std::abs(d - median));
+  }
+  double mad = Median(std::move(deviations)) * 1.4826;
+  return std::pair<double, double>(median, mad);
+}
+
+RobustStats::RobustStats(const std::vector<double> &values)
+    : RobustStats(std::vector<double>(values)) {}
+
+RobustStats::RobustStats(std::vector<double> &&values) {
+  std::sort(values.begin(), values.end());
+  lo_ = values[0];
+  hi_ = values.back();
+  HuberMAD(values);
+}
+
+// Computes an updated mean using Huber's weighting function (values beyond
+// the margin are weighted by margin / abs(value - mean)).
+double UpdateHuberMean(const std::vector<double> &sorted_values, double mean,
+                       double margin) {
+  int num_within = 0;
+  double sum = 0.0;
+
+  for (double d : sorted_values) {
+    if (d < mean - margin) {
+      sum -= margin;
+    } else if (d > mean + margin) {
+      sum += margin;
+    } else {
+      sum += d;
+      ++num_within;
+    }
+  }
+
+  // It is possible, for a set with an interquartile distance of 0, i.e., with
+  // more than half of the values at the median, to encounter the case where
+  // the Huber mean drifts slightly off the median and there are no values
+  // within the margin. In that case, just return the old mean, and the caller
+  // will quit.
+  if (num_within > 0) {
+    return sum / num_within;
+  } else {
+    return mean;
+  }
+}
+
+// Given a list of values, this approximates the stddev using the MAD and then
+// uses it to compute a Huber robust mean (sandwich mean). A margin of
+// c*stddev is defined around the current mean, and values are weighted by
+// margin / abs(value - mean) if outside the margin, or 1 if inside. This
+// computes the mean iteratively, because each time it changes the margin
+// shifts a bit. It typically settles very quickly, but it's possible for it
+// to be unstable. We limit it to 10 iterations.
+//
+void RobustStats::HuberMAD(const std::vector<double> &sorted_values) {
+  const std::pair<double, double> median_mad =
+      ScaledMedianAbsoluteDeviation(sorted_values);
+  mean_ = median_mad.first;
+  stddev_ = median_mad.second;
+
+  // c = 1.345 is the commonly used cutoff with 95% efficiency at the normal.
+  // We're using c = 1.5 to be a little more conservative, and because that's
+  // the default in S-plus.
+  // TODO(dehnert): Specialize Stats for integral types so we don't implement
+  // methods that don't make sense.
+  const double c = 1.5;
+  const double margin = c * stddev_;
+
+  // Iterate 10 times, or until the Huber mean stabilizes.
+  // If the margin is zero, we don't want mean to drift from the median.
+  if (margin > 0.0) {
+    for (int k = 0; k < 10; ++k) {
+      double old_mean = mean_;
+      mean_ = UpdateHuberMean(sorted_values, mean_, margin);
+      if (mean_ == old_mean) break;
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/robust_stats.h b/tensorflow/core/grappler/costs/robust_stats.h
new file mode 100644
index 00000000000..9d8f5bc970a
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
+#define TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
+
+#include <vector>
+namespace tensorflow {
+namespace grappler {
+class RobustStats {
+ public:
+  RobustStats(const std::vector<double>& values);
+  RobustStats(std::vector<double>&& values);
+
+  double lo() const { return lo_; }
+  double hi() const { return hi_; }
+  double mean() const { return mean_; }
+
+ private:
+  void HuberMAD(const std::vector<double>& values);
+
+  double lo_;
+  double hi_;
+  double mean_;
+  double stddev_;
+};
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_COSTS_ROBUST_STATS_H_
diff --git a/tensorflow/core/grappler/costs/robust_stats_test.cc b/tensorflow/core/grappler/costs/robust_stats_test.cc
new file mode 100644
index 00000000000..924097b126d
--- /dev/null
+++ b/tensorflow/core/grappler/costs/robust_stats_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/robust_stats.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class RobustStatsTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    for (double d = 1.0; d <= 5.0; d += 1.0) {
+      values1_.push_back(5.0 - d);
+      values1_.push_back(5.0 + d);
+      values2_.push_back(25.0 - 2 * d);
+      values2_.push_back(25.0 + 2 * d);
+      values3_.push_back(-3.0 - d);
+      values3_.push_back(-3.0 + d);
+    }
+    values1_.push_back(5.0);  // Odd # elements, mean is 5.0
+    values3_.push_back(197.0);
+    values3_.push_back(-203.0);  // Even # elements, mean is -3.0
+  }
+
+  std::vector<double> values1_;
+  std::vector<double> values2_;
+  std::vector<double> values3_;
+};
+
+TEST_F(RobustStatsTest, Simple) {
+  RobustStats s1(values1_);
+  EXPECT_EQ(5.0, s1.mean());
+  EXPECT_EQ(0.0, s1.lo());
+  EXPECT_EQ(10.0, s1.hi());
+
+  RobustStats s2(values2_);
+  EXPECT_EQ(25.0, s2.mean());
+  EXPECT_EQ(15.0, s2.lo());
+  EXPECT_EQ(35.0, s2.hi());
+
+  RobustStats s3(values3_);
+  EXPECT_EQ(-3.0, s3.mean());
+  EXPECT_EQ(-203.0, s3.lo());
+  EXPECT_EQ(197.0, s3.hi());
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
new file mode 100644
index 00000000000..8f77d7677ac
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -0,0 +1,215 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/costs/virtual_scheduler.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+Costs CombineCosts(const Costs& left, const Costs& right) {
+  CHECK_NE(left.max_memory, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
+  CHECK_NE(left.max_per_op_streaming, kMemoryUnknown);
+
+  Costs result = left;
+  result.execution_time += right.execution_time;
+  if (right.max_memory != kMemoryUnknown) {
+    result.max_memory += right.max_memory;
+  }
+  if (right.max_per_op_buffers != kMemoryUnknown) {
+    result.max_per_op_buffers =
+        std::max(left.max_per_op_buffers, right.max_per_op_buffers);
+  }
+  if (right.max_per_op_streaming != kMemoryUnknown) {
+    result.max_per_op_streaming =
+        std::max(left.max_per_op_streaming, right.max_per_op_streaming);
+  }
+  VLOG(2) << "costs execution_time=" << result.execution_time.count()
+          << " max_memory=" << result.max_memory
+          << " max_per_op_buffers=" << result.max_per_op_buffers
+          << " max_per_op_streaming=" << result.max_per_op_streaming;
+  return result;
+}
+}  // namespace
+
+VirtualScheduler::VirtualScheduler(const GraphDef& graph,
+                                   const std::vector<string>& fetch_nodes)
+    : graph_costs_(Costs::ZeroCosts()),
+      // TODO(dyoon): Use a better way than FIFO.
+      ready_nodes_(new FIFOManager()) {
+  // First, get the nodes that would run to output fetch_nodes.
+  std::vector<const NodeDef*> nodes =
+      ComputeTransitiveFanin(graph, fetch_nodes);
+
+  // TODO(dyoon): this is a bit inefficient as name_to_node is already built in
+  // ComputeTransitiveFanin().
+  std::unordered_map<string, const NodeDef*> name_to_node;
+  for (const auto& node : graph.node()) {
+    name_to_node[node.name()] = &node;
+  }
+
+  // Build node_map.
+  for (const auto* node : nodes) {
+    auto& node_state = GetNodeStateOrCreateIt(node);
+    // TODO(dyoon): add SendRecv considering devices and control dependency.
+    for (const string& input : node->input()) {
+      const NodeDef* in = name_to_node[NodeName(input)];
+      CHECK(in);
+      node_state.inputs.push_back(in);
+      auto& input_node_state = GetNodeStateOrCreateIt(in);
+      input_node_state.outputs.push_back(node);
+    }
+    if (node->input().empty()) {
+      node_state.time_ready =
+          Costs::Duration();  // Node without input: ready at time 0.
+      ready_nodes_->AddNode(node);
+    }
+  }
+}
+
+const NodeDef* VirtualScheduler::GetCurrNode() const {
+  return ready_nodes_->GetCurrNode();
+}
+
+NodeState& VirtualScheduler::GetNodeStateOrCreateIt(const NodeDef* node) {
+  auto it = node_map_.find(node);
+  if (it == node_map_.end()) {
+    it = node_map_.emplace(node, NodeState()).first;
+  }
+  return it->second;
+}
+
+bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
+  // Update graph_costs_ and per-op costs.
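+  // CombineCosts() above sums execution times and takes the max of the
+  // per-op buffer/streaming memory, so graph_costs_ is a running total over
+  // all nodes retired so far.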
+  graph_costs_ = CombineCosts(graph_costs_, node_costs);
+  const auto* node = GetCurrNode();
+  const auto& op_name = node->op();
+
+  auto it = op_to_cost_.find(op_name);
+  if (it == op_to_cost_.end()) {
+    it = op_to_cost_.emplace(op_name, Costs::ZeroCosts()).first;
+  }
+  auto& op_cost = it->second;
+  op_cost = CombineCosts(op_cost, node_costs);
+
+  // Update node and device states.
+  auto& node_state = node_map_[node];
+  auto& device = device_[node->device()];
+  device.nodes_executed.push_back(node);
+  // Node is scheduled when the device is available AND all the inputs are
+  // ready; hence, time_scheduled is time_ready if time_ready > device curr
+  // time.
+  node_state.time_scheduled =
+      std::max(device.GetCurrTime(), node_state.time_ready);
+  // Override device curr time with the time_scheduled.
+  device.device_costs.execution_time = node_state.time_scheduled;
+  device.device_costs = CombineCosts(device.device_costs, node_costs);
+  auto curr_time = device.GetCurrTime();
+  node_state.time_finished = curr_time;
+
+  // Update device's per-op cost.
+  {
+    auto it = device.op_to_cost.find(op_name);
+    if (it == device.op_to_cost.end()) {
+      it = device.op_to_cost.emplace(op_name, Costs::ZeroCosts()).first;
+    }
+    auto& op_cost = it->second;
+    op_cost = CombineCosts(op_cost, node_costs);
+
+    VLOG(2) << "Op scheduled -- name: " << node->name()
+            << ", op: " << node->op() << ", device: " << node->device()
+            << ", ready: " << node_state.time_ready.count()
+            << ", scheduled: " << node_state.time_scheduled.count()
+            << ", finished: " << node_state.time_finished.count();
+
+    // Increment num_inputs_ready of the output nodes.
+    for (auto* output : node_state.outputs) {
+      auto& output_state = node_map_[output];
+      output_state.num_inputs_ready++;
+      if (output_state.num_inputs_ready == output_state.inputs.size()) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output);
+      }
+    }
+
+    // Increment num_outputs_executed of the input nodes.
+    for (auto* input : node_state.inputs) {
+      auto& input_state = node_map_[input];
+      input_state.num_outputs_executed++;
+      if (input_state.num_outputs_executed == input_state.outputs.size()) {
+        // All the outputs are executed; there are no more references to this
+        // input node.
+        input_state.time_no_reference = curr_time;
+        // TODO(dyoon): collect device memory usage; note that this input node
+        // uses device memory between time_scheduled and time_no_reference.
+      }
+    }
+  }
+
+  // Remove the current node; assume FIFO.
+  ready_nodes_->RemoveCurrNode();
+  return !ready_nodes_->Empty();  // True if not empty.
+}
+
+Costs VirtualScheduler::Summary() const {
+  // Print out basic execution summary.
+  VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
+  VLOG(1) << "Expected max memory: " << graph_costs_.max_memory;
+  VLOG(1) << "Expected max per-op buffers: " << graph_costs_.max_per_op_buffers;
+  VLOG(1) << "Expected max per-op streaming buffers: "
+          << graph_costs_.max_per_op_streaming;
+
+  VLOG(1) << "Per-op execution time:";
+  for (const auto& op_cost_pair : op_to_cost_) {
+    const auto& op = op_cost_pair.first;
+    const auto& cost = op_cost_pair.second.execution_time.count();
+    if (cost) {  // Skip printing out zero-cost ops.
+      VLOG(1) << " + " << op << " : " << cost;
+    }
+  }
+
+  // Print per-device summary.
+  VLOG(1) << "Devices:";
+  Costs critical_path_costs = Costs::ZeroCosts();
+
+  for (const auto& device : device_) {
+    const auto& name = device.first;
+    const auto& state = device.second;
+    VLOG(1) << "Device = " << name
+            << ", num_nodes = " << state.nodes_executed.size()
+            << ", execution_time = " << state.GetCurrTime().count();
+    VLOG(1) << "Per-op execution time:";
+    for (const auto& op_cost_pair : state.op_to_cost) {
+      const auto& op = op_cost_pair.first;
+      const auto& cost = op_cost_pair.second.execution_time.count();
+      if (cost) {  // Skip printing out zero-cost ops.
+        VLOG(1) << " + " << op << " : " << cost;
+      }
+    }
+    if (critical_path_costs.execution_time <= state.GetCurrTime()) {
+      critical_path_costs = state.device_costs;
+    }
+  }
+
+  VLOG(1) << "Critical path execution time: "
+          << critical_path_costs.execution_time.count();
+  return critical_path_costs;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
new file mode 100644
index 00000000000..b7785c94e04
--- /dev/null
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -0,0 +1,116 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
+
+#include <list>
+#include <memory>
+#include <unordered_map>
+
+#include "tensorflow/core/grappler/costs/cost_estimator.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+struct NodeState {
+  std::vector<const NodeDef*> inputs;
+  std::vector<const NodeDef*> outputs;
+  int num_inputs_ready;
+  int num_outputs_executed;
+  Costs::Duration time_ready;
+  Costs::Duration time_scheduled;
+  Costs::Duration time_finished;
+  Costs::Duration time_no_reference;
+
+  // Node will be ready to be executed at time_ready, scheduled at
+  // time_scheduled, and finishes execution at time_finished.
+  // Between time_scheduled and time_no_reference, the node's output tensor
+  // needs to be on the device, using up device memory.
+
+  NodeState() {
+    num_inputs_ready = 0;
+    num_outputs_executed = 0;
+    time_ready = Costs::Duration::max();
+    time_scheduled = Costs::Duration::max();
+    time_finished = Costs::Duration::max();
+    time_no_reference = Costs::Duration::max();
+  }
+};
+
+struct DeviceState {
+  std::vector<const NodeDef*> nodes_executed;
+  Costs device_costs;
+  std::map<string, Costs> op_to_cost;  // Per-op cost.
+
+  DeviceState() { device_costs = Costs::ZeroCosts(); }
+
+  Costs::Duration GetCurrTime() const { return device_costs.execution_time; }
+};
+
+// ReadyNodeManager (abstract class):
+// Keeps ready nodes and picks the best one to be scheduled.
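+// For instance, the FIFOManager below simply returns nodes in insertion
+// order; a (hypothetical) smarter manager could instead pick the ready node
+// with the earliest time_ready or the smallest memory footprint.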
+class ReadyNodeManager {
+ public:
+  ReadyNodeManager() {}
+  virtual ~ReadyNodeManager() {}
+  virtual void AddNode(const NodeDef* node) = 0;
+  virtual const NodeDef* GetCurrNode() const = 0;
+  virtual void RemoveCurrNode() = 0;
+  virtual bool Empty() const = 0;
+};
+
+class FIFOManager : public ReadyNodeManager {
+ public:
+  FIFOManager() : ReadyNodeManager() {}
+  ~FIFOManager() override {}
+  void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
+  const NodeDef* GetCurrNode() const override { return nodes_.front(); }
+  void RemoveCurrNode() override { nodes_.pop_front(); }
+  bool Empty() const override { return nodes_.empty(); }
+
+ private:
+  std::list<const NodeDef*> nodes_;
+};
+}  // namespace
+
+// The virtual scheduler emulates execution of nodes in a graph, considering
+// dependencies, device, etc.
+class VirtualScheduler {
+ public:
+  VirtualScheduler(const GraphDef& graph,
+                   const std::vector<string>& fetch_nodes);
+
+  const NodeDef* GetCurrNode() const;
+  bool MarkCurrNodeExecuted(const Costs& node_costs);
+
+  Costs Summary() const;
+
+ private:
+  NodeState& GetNodeStateOrCreateIt(const NodeDef* node);
+
+  Costs graph_costs_;                   // Graph cost.
+  std::map<string, Costs> op_to_cost_;  // Per-op cost.
+  std::unique_ptr<ReadyNodeManager> ready_nodes_;
+  std::unordered_map<const NodeDef*, NodeState> node_map_;
+  std::unordered_map<string, DeviceState> device_;
+};
+
+}  // namespace grappler
+}  // end namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_SCHEDULER_H_
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 29b4d63bbf8..0847d1279b8 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2109,7 +2109,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "matrix_triangular_solve_op",
     prefix = "matrix_triangular_solve_op",
-    deps = LINALG_DEPS,
+    deps = LINALG_DEPS + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
+    ]),
 )
 
 tf_kernel_library(
@@ -2350,6 +2352,8 @@ tf_kernel_library(
         "//conditions:default": [],
     }) + if_mkl([
         "//third_party/mkl:intel_binary_blob",
+    ]) + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
     ]),
 )
 
@@ -2630,6 +2634,7 @@ tf_kernel_library(
         ],
         "//conditions:default": [],
     }) + if_cuda([
+        "//tensorflow/core/platform/default/build_config:cublas_plugin",
         "//tensorflow/core/platform/default/build_config:cudnn_plugin",
     ]),
 )
diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md
index 027ecb195ed..d415c29aa13 100644
--- a/tensorflow/docs_src/performance/performance_models.md
+++ b/tensorflow/docs_src/performance/performance_models.md
@@ -328,12 +328,16 @@ The downside is that all the weights read are from the previous training step.
 So it is a different algorithm from SGD. But it is possible to improve its
 convergence by adjusting learning rate and other hyperparameters.
 
-### Executing the script
+## Executing the script
 
 This section lists the core command line arguments and a few basic examples for
 executing the main script
 ([tf_cnn_benchmarks.py](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py)).
 
+> Note: `tf_cnn_benchmarks.py` uses the config `force_gpu_compatible`,
+> which was introduced after TensorFlow 1.1. Until TensorFlow 1.2 is released,
+> building from source is advised.
+
 #### Base command line arguments
 
 * **`model`**: Model to use, e.g.
`resnet50`, `inception3`, `vgg16`, and
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index e408506cb06..a96052a3ae5 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -139,6 +139,82 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
+def make_input_layer(features,
+                     feature_columns,
+                     weight_collections=None,
+                     trainable=True):
+  """Returns a dense `Tensor` as input layer based on given `feature_columns`.
+
+  Generally a single example in training data is described with FeatureColumns.
+  At the first layer of the model, this column-oriented data should be
+  converted to a single `Tensor`.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  keywords_embedded = embedding_column(
+      categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
+  all_feature_columns = [price, keywords_embedded, ...]
+  dense_tensor = make_input_layer(features, all_feature_columns)
+  for units in [128, 64, 32]:
+    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
+  prediction = tf.layers.dense(dense_tensor, 1)
+  ```
+
+  Args:
+    features: A mapping from key to tensors. `FeatureColumn`s look up via these
+      keys. For example `numeric_column('price')` will look at the 'price' key
+      in this dict. Values can be a `SparseTensor` or a `Tensor` depending on
+      the corresponding `FeatureColumn`.
+    feature_columns: An iterable containing all the `FeatureColumn`s. All items
+      should be instances of classes derived from `_DenseColumn` such as
+      `numeric_column`, `embedding_column`, `bucketized_column`,
+      `indicator_column`. If you have categorical features, you can wrap them
+      with an `embedding_column` or `indicator_column`.
+    weight_collections: A list of collection names to which the Variable will
+      be added. Note that variables will also be added to collections
+      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+    trainable: If `True` also add the variable to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+
+  Returns:
+    A `Tensor` which represents the input layer of a model. Its shape
+    is (batch_size, first_layer_dimension) and its dtype is `float32`.
+    first_layer_dimension is determined based on the given `feature_columns`.
+
+  Raises:
+    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
+  """
+  _check_feature_columns(feature_columns)
+  for column in feature_columns:
+    if not isinstance(column, _DenseColumn):
+      raise ValueError(
+          'Items of feature_columns must be a _DenseColumn. '
+          'You can wrap a categorical column with an '
+          'embedding_column or indicator_column. 
 def make_linear_model(features,
                       feature_columns,
                       units=1,
@@ -156,10 +232,21 @@ def make_linear_model(features,
   while `make_input_layer` explicitly requires wrapping each of them with an
   `embedding_column` or an `indicator_column`.
 
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
+  keywords = categorical_column_with_hash_bucket("keywords", 10000)
+  all_feature_columns = [price_buckets, keywords, ...]
+  prediction = make_linear_model(features, all_feature_columns)
+  ```
+
   Args:
-    features: A mapping from key to tensors. 'string' key means a base feature.
-      It can have `_FeatureColumn` as a key too. That means that FeatureColumn
-      is already transformed by the input pipeline.
+    features: A mapping from key to tensors. `FeatureColumn`s look up via these
+      keys. For example `numeric_column('price')` will look at 'price' key in
+      this dict. Values are `Tensor` or `SparseTensor` depending on
+      corresponding `FeatureColumn`.
     feature_columns: An iterable containing all the FeatureColumns. All items
       should be instances of classes derived from FeatureColumn.
     units: An integer, dimensionality of the output space. Default
@@ -191,9 +278,10 @@ def make_linear_model(features,
     raise ValueError('Items of feature_columns must be either a _DenseColumn '
                      'or _CategoricalColumn. Given: {}'.format(column))
   weight_collections = list(weight_collections or [])
-  weight_collections += [
-      ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
-  ]
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
   with variable_scope.variable_scope(
       None, default_name='make_linear_model', values=features.values()):
     weigthed_sums = []
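For a dense column the weighted sum reduces to `x * w + b`, with the bias shared across columns. A small sketch of what a caller observes, assuming the behavior asserted by `MakeLinearModelTest` further down (weights and bias initialize to zero, so untrained predictions are zero):

```python
import tensorflow as tf
from tensorflow.python.feature_column import feature_column as fc

features = {'price': [[1.], [5.]]}
predictions = fc.make_linear_model(features, [fc.numeric_column('price')])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(predictions))  # [[0.], [0.]] until the weights are trained
```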
@@ -228,7 +316,8 @@ def numeric_column(key,
                    normalizer_fn=None):
   """Represents real valued or numerical features.
 
-  An example:
+  Example:
+
   ```python
   price = numeric_column('price')
   all_feature_columns = [price, ...]
@@ -298,7 +387,8 @@ def bucketized_column(source_column, boundaries):
   `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
   `[1., 2.)`, and `[2., +inf)`.
 
-  An example:
+  Example:
+
   ```python
   price = numeric_column('price')
   bucketized_price = bucketized_column(price, boundaries=[...])
@@ -349,7 +439,8 @@ def categorical_column_with_hash_bucket(key,
   want to distribute your inputs into a finite number of buckets by hashing.
   output_id = Hash(input_feature_string) % bucket_size
 
-  An example:
+  Example:
+
   ```python
   keywords = categorical_column_with_hash_bucket("keywords", 10000)
   all_feature_columns = [keywords, ...]
@@ -471,7 +562,7 @@ class _DenseColumn(_FeatureColumn):
 
   @abc.abstractproperty
   def _variable_shape(self):
-    """Returns shape of variable which is compatible with _get_dense_tensor."""
+    """Returns a `TensorShape` of variable compatible with _get_dense_tensor."""
     pass
 
   @abc.abstractmethod
@@ -480,6 +571,7 @@ class _DenseColumn(_FeatureColumn):
 
     The output of this function will be used by model-builder functions. For
     example, the pseudo code of `make_input_layer` will be like:
+
     ```python
     def make_input_layer(features, feature_columns, ...):
       outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
@@ -503,7 +595,7 @@ def _create_dense_column_weighted_sum(
         builder,
         weight_collections=weight_collections,
         trainable=trainable)
-    num_elements = tensor_shape.TensorShape(column._variable_shape).num_elements()  # pylint: disable=protected-access
+    num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
     batch_size = array_ops.shape(tensor)[0]
     tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
     weight = variable_scope.get_variable(
@@ -615,12 +707,15 @@ class _LazyBuilder(object):
     """Creates a `_LazyBuilder`.
 
     Args:
-      features: A mapping from feature column to tensors. A `string` key
+      features: A mapping from feature column to objects that are `Tensor` or
+        `SparseTensor`, or can be converted to them via
+        `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
         signifies a base feature (not-transformed). A `FeatureColumn` key
         means that this `Tensor` is the output of an existing `FeatureColumn`
         which can be reused.
     """
-    self._columns_to_tensors = features.copy()
+    self._features = features.copy()
+    self._feature_tensors = {}
 
   def get(self, key):
     """Returns a `Tensor` for the given key.
@@ -640,9 +735,16 @@ class _LazyBuilder(object):
       ValueError: if key is not found or a transformed `Tensor` cannot be
         computed.
     """
-    if key in self._columns_to_tensors:
-      # Feature_column is already transformed or it's a raw feature.
-      return self._columns_to_tensors[key]
+    if key in self._feature_tensors:
+      # FeatureColumn is already transformed or converted.
+      return self._feature_tensors[key]
+
+    if key in self._features:
+      # Key points to a raw feature; convert its value once and cache it.
+      feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
+          self._features[key])
+      self._feature_tensors[key] = feature_tensor
+      return feature_tensor
 
     if not isinstance(key, (str, _FeatureColumn)):
       raise TypeError('"key" must be either a "str" or "_FeatureColumn". 
' @@ -653,11 +755,13 @@ class _LazyBuilder(object): column = key logging.debug('Transforming feature_column %s.', column) - transformed = column._transform_feature(self) # pylint: disable=protected-access + # pylint: disable=protected-access + transformed = column._transform_feature(self) + # pylint: enable=protected-access if transformed is None: raise ValueError('Column {} is not supported.'.format(column.name)) - self._columns_to_tensors[column] = transformed - return self._columns_to_tensors[column] + self._feature_tensors[column] = transformed + return transformed def _check_feature_columns(feature_columns): @@ -709,7 +813,7 @@ class _NumericColumn(_DenseColumn, @property def _variable_shape(self): - return self.shape + return tensor_shape.TensorShape(self.shape) def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): del weight_collections @@ -738,7 +842,8 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn, @property def _variable_shape(self): - return tuple(self.source_column.shape) + (len(self.boundaries) + 1,) + return tensor_shape.TensorShape( + tuple(self.source_column.shape) + (len(self.boundaries) + 1,)) def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): del weight_collections diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index 32d6a4e8f0a..bc626533104 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -65,7 +65,7 @@ class LazyColumnTest(test.TestCase): def _parse_example_config(self): pass - builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])}) + builder = fc._LazyBuilder(features={'a': [[2], [3.]]}) column = TransformCounter() self.assertEqual(0, column.num_transform) builder.get(column) @@ -88,7 +88,7 @@ class LazyColumnTest(test.TestCase): def _parse_example_config(self): pass - builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])}) + builder = fc._LazyBuilder(features={'a': [[2], [3.]]}) column = Transformer() self.assertEqual('Output', builder.get(column)) self.assertEqual('Output', builder.get(column)) @@ -108,13 +108,13 @@ class LazyColumnTest(test.TestCase): def _parse_example_config(self): pass - features = {'a': constant_op.constant([[2], [3.]])} + features = {'a': [[2], [3.]]} builder = fc._LazyBuilder(features=features) builder.get(Transformer()) self.assertEqual(['a'], list(features.keys())) def test_error_if_feature_is_not_found(self): - builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])}) + builder = fc._LazyBuilder(features={'a': [[2], [3.]]}) with self.assertRaisesRegexp(ValueError, 'bbb is not in features dictionary'): builder.get('bbb') @@ -135,7 +135,7 @@ class LazyColumnTest(test.TestCase): def _parse_example_config(self): pass - builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])}) + builder = fc._LazyBuilder(features={'a': [[2], [3.]]}) with self.assertRaisesRegexp(ValueError, 'NotAProperColumn is not supported'): builder.get(NotAProperColumn()) @@ -145,7 +145,7 @@ class LazyColumnTest(test.TestCase): class NotAFeatureColumn(object): pass - builder = fc._LazyBuilder(features={'a': constant_op.constant([[2], [3.]])}) + builder = fc._LazyBuilder(features={'a': [[2], [3.]]}) with self.assertRaisesRegexp( TypeError, '"key" must be either a "str" or "_FeatureColumn".'): builder.get(NotAFeatureColumn()) @@ -273,7 +273,7 @@ class 
NumericColumnTest(test.TestCase): price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two) builder = fc._LazyBuilder({ - 'price': constant_op.constant([[1., 2.], [5., 6.]]) + 'price': [[1., 2.], [5., 6.]] }) output = builder.get(price) with self.test_session(): @@ -286,7 +286,7 @@ class NumericColumnTest(test.TestCase): price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two) builder = fc._LazyBuilder({ - 'price': constant_op.constant([[1., 2.], [5., 6.]]) + 'price': [[1., 2.], [5., 6.]] }) self.assertEqual(builder.get(price), price._get_dense_tensor(builder)) @@ -315,7 +315,7 @@ class NumericColumnTest(test.TestCase): def test_make_linear_model(self): price = fc.numeric_column('price') with ops.Graph().as_default(): - features = {'price': constant_op.constant([[1.], [5.]])} + features = {'price': [[1.], [5.]]} predictions = fc.make_linear_model(features, [price]) bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) @@ -402,7 +402,7 @@ class BucketizedColumnTest(test.TestCase): bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) with ops.Graph().as_default(): builder = fc._LazyBuilder({ - 'price': constant_op.constant([[-1., 1.], [5., 6.]]) + 'price': [[-1., 1.], [5., 6.]] }) transformed_tensor = builder.get(bucketized_price) with _initialized_session(): @@ -414,7 +414,7 @@ class BucketizedColumnTest(test.TestCase): bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) with ops.Graph().as_default(): builder = fc._LazyBuilder({ - 'price': constant_op.constant([[-1.], [1.], [5.], [6.]]) + 'price': [[-1.], [1.], [5.], [6.]] }) with _initialized_session(): bucketized_price_tensor = bucketized_price._get_dense_tensor(builder) @@ -432,7 +432,7 @@ class BucketizedColumnTest(test.TestCase): bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) with ops.Graph().as_default(): builder = fc._LazyBuilder({ - 'price': constant_op.constant([[-1., 1.], [5., 6.]]) + 'price': [[-1., 1.], [5., 6.]] }) with _initialized_session(): bucketized_price_tensor = bucketized_price._get_dense_tensor(builder) @@ -448,7 +448,7 @@ class BucketizedColumnTest(test.TestCase): bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) with ops.Graph().as_default(): builder = fc._LazyBuilder({ - 'price': constant_op.constant([[-1.], [1.], [5.], [6.]]) + 'price': [[-1.], [1.], [5.], [6.]] }) with _initialized_session() as sess: id_weight_pair = bucketized_price._get_sparse_tensors(builder) @@ -465,7 +465,7 @@ class BucketizedColumnTest(test.TestCase): bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) with ops.Graph().as_default(): builder = fc._LazyBuilder({ - 'price': constant_op.constant([[-1., 1.], [5., 6.]]) + 'price': [[-1., 1.], [5., 6.]] }) with _initialized_session() as sess: id_weight_pair = bucketized_price._get_sparse_tensors(builder) @@ -502,7 +502,7 @@ class BucketizedColumnTest(test.TestCase): price = fc.numeric_column('price', shape=[1]) bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) with ops.Graph().as_default(): - features = {'price': constant_op.constant([[-1.], [1.], [5.], [6.]])} + features = {'price': [[-1.], [1.], [5.], [6.]]} predictions = fc.make_linear_model(features, [bucketized_price]) bias = get_linear_model_bias() bucketized_price_var = get_linear_model_column_var(bucketized_price) @@ -527,7 +527,7 @@ class BucketizedColumnTest(test.TestCase): price = fc.numeric_column('price', shape=[2]) bucketized_price = 
fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) with ops.Graph().as_default(): - features = {'price': constant_op.constant([[-1., 1.], [5., 6.]])} + features = {'price': [[-1., 1.], [5., 6.]]} predictions = fc.make_linear_model(features, [bucketized_price]) bias = get_linear_model_bias() bucketized_price_var = get_linear_model_column_var(bucketized_price) @@ -621,15 +621,15 @@ class SparseColumnHashedTest(test.TestCase): float_fc = fc.categorical_column_with_hash_bucket( 'a_float', 10, dtype=dtypes.string) int_tensor = sparse_tensor.SparseTensor( - values=constant_op.constant([101]), + values=[101], indices=[[0, 0]], dense_shape=[1, 1]) string_tensor = sparse_tensor.SparseTensor( - values=constant_op.constant(['101']), + values=['101'], indices=[[0, 0]], dense_shape=[1, 1]) float_tensor = sparse_tensor.SparseTensor( - values=constant_op.constant([101.]), + values=[101.], indices=[[0, 0]], dense_shape=[1, 1]) builder = fc._LazyBuilder({ @@ -745,7 +745,7 @@ class MakeLinearModelTest(test.TestCase): def test_dense_bias(self): price = fc.numeric_column('price') with ops.Graph().as_default(): - features = {'price': constant_op.constant([[1.], [5.]])} + features = {'price': [[1.], [5.]]} predictions = fc.make_linear_model(features, [price]) bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) @@ -848,7 +848,7 @@ class MakeLinearModelTest(test.TestCase): def test_dense_multi_output(self): price = fc.numeric_column('price') with ops.Graph().as_default(): - features = {'price': constant_op.constant([[1.], [5.]])} + features = {'price': [[1.], [5.]]} predictions = fc.make_linear_model(features, [price], units=3) bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) @@ -885,7 +885,7 @@ class MakeLinearModelTest(test.TestCase): def test_dense_multi_dimension(self): price = fc.numeric_column('price', shape=2) with ops.Graph().as_default(): - features = {'price': constant_op.constant([[1., 2.], [5., 6.]])} + features = {'price': [[1., 2.], [5., 6.]]} predictions = fc.make_linear_model(features, [price]) price_var = get_linear_model_column_var(price) with _initialized_session() as sess: @@ -913,7 +913,7 @@ class MakeLinearModelTest(test.TestCase): def test_dense_multi_dimension_multi_output(self): price = fc.numeric_column('price', shape=2) with ops.Graph().as_default(): - features = {'price': constant_op.constant([[1., 2.], [5., 6.]])} + features = {'price': [[1., 2.], [5., 6.]]} predictions = fc.make_linear_model(features, [price], units=3) bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) @@ -928,7 +928,7 @@ class MakeLinearModelTest(test.TestCase): def test_raises_if_shape_mismatch(self): price = fc.numeric_column('price', shape=2) with ops.Graph().as_default(): - features = {'price': constant_op.constant([[1.], [5.]])} + features = {'price': [[1.], [5.]]} predictions = fc.make_linear_model(features, [price]) with _initialized_session(): with self.assertRaisesRegexp(Exception, 'requested shape has 4'): @@ -937,7 +937,7 @@ class MakeLinearModelTest(test.TestCase): def test_dense_reshaping(self): price = fc.numeric_column('price', shape=[1, 2]) with ops.Graph().as_default(): - features = {'price': constant_op.constant([[[1., 2.]], [[5., 6.]]])} + features = {'price': [[[1., 2.]], [[5., 6.]]]} predictions = fc.make_linear_model(features, [price]) bias = get_linear_model_bias() price_var = get_linear_model_column_var(price) @@ -953,8 +953,8 @@ class MakeLinearModelTest(test.TestCase): price2 = fc.numeric_column('price2') 
    with ops.Graph().as_default():
      features = {
-          'price1': constant_op.constant([[1., 2.], [5., 6.]]),
-          'price2': constant_op.constant([[3.], [4.]])
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
      }
      predictions = fc.make_linear_model(features, [price1, price2])
      bias = get_linear_model_bias()
@@ -973,7 +973,7 @@
   def test_dense_collection(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       fc.make_linear_model(features, [price], weight_collections=['my-vars'])
       my_vars = g.get_collection('my-vars')
       bias = get_linear_model_bias()
@@ -998,7 +998,7 @@
   def test_dense_trainable_default(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       fc.make_linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
@@ -1022,7 +1022,7 @@
   def test_dense_trainable_false(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default() as g:
-      features = {'price': constant_op.constant([[1.], [5.]])}
+      features = {'price': [[1.], [5.]]}
       fc.make_linear_model(features, [price], trainable=False)
       trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       self.assertEqual([], trainable_vars)
@@ -1074,5 +1074,89 @@ class MakeLinearModelTest(test.TestCase):
     self.assertIn('wire_cast', my_vars[2].name)
 
 
+class MakeInputLayerTest(test.TestCase):
+
+  def test_should_be_dense_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _DenseColumn'):
+      fc.make_input_layer(
+          features={'a': [[0]]},
+          feature_columns=[
+              fc.categorical_column_with_hash_bucket('wire_cast', 4)
+          ])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.make_input_layer(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.make_input_layer(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_one_column(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc.make_input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1.], [5.]], net.eval())
+
+  def test_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      net = fc.make_input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc.make_input_layer(features, [price])
+      with _initialized_session():
+        with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+          net.eval()
+
+  def test_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      net = fc.make_input_layer(features, [price])
+      with _initialized_session():
+        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+  def test_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3.], [4.]]
+      }
+      net = fc.make_input_layer(features, [price1, price2])
+      with _initialized_session():
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    with ops.Graph().as_default():
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+      }
+      net1 = fc.make_input_layer(features, [price_a, price_b])
+      net2 = fc.make_input_layer(features, [price_b, price_a])
+      with _initialized_session():
+        self.assertAllClose([[1., 3.]], net1.eval())
+        self.assertAllClose([[1., 3.]], net2.eval())
+
+
 if __name__ == '__main__':
   test.main()
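The last two assertions hold because `make_input_layer` sorts columns by name before concatenating, so the order in which `feature_columns` are passed never changes the layout of the output; only the column names do. A toy illustration of that sorting rule (plain Python, no TensorFlow needed):

```python
class Col(object):
    def __init__(self, name):
        self.name = name

# Regardless of input order, concatenation order is alphabetical by name.
for cols in ([Col('price_a'), Col('price_b')],
             [Col('price_b'), Col('price_a')]):
    print([c.name for c in sorted(cols, key=lambda x: x.name)])
# ['price_a', 'price_b'] both times
```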
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 69d1a6f60e1..245dcc96db7 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -774,6 +774,11 @@ class VariableScopeTest(test.TestCase):
     self.assertEqual([v.name for v in scope.global_variables()], ["foo/b:0"])
 
+  def testGetVariableWithRefDtype(self):
+    v = variable_scope.get_variable("v", shape=[3, 4], dtype=dtypes.float32)
+    # Ensure it is possible to do get_variable with a _ref dtype passed in.
+    _ = variable_scope.get_variable("w", shape=[5, 6], dtype=v.dtype)
+
 def axis0_into1_partitioner(shape=None, **unused_kwargs):
   part = [1] * len(shape)
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 76719f35b80..43addbe5a52 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -280,6 +280,17 @@ class _VariableStore(object):
       raise ValueError(
           "Passed a custom_getter which is not callable: %s" % custom_getter)
 
+    # If a *_ref type is passed in, an error would be triggered further down
+    # the stack. We prevent this by using base_dtype to get a non-ref version
+    # of the type before doing anything else. When _ref types are removed in
+    # favour of resources, this line can be removed.
+    try:
+      dtype = dtype.base_dtype
+    except AttributeError:
+      # .base_dtype not existing means that we will try to use the raw dtype
+      # which was passed in - this might be a NumPy type, which is valid.
+      pass
+
     # This is the main logic of get_variable. However, custom_getter
     # may override this logic. So we save it as a callable and pass
     # it to custom_getter. 
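The try/except above normalizes reference dtypes before any comparison against an existing variable's dtype. A sketch of the situation it guards against, using public TF 1.x APIs (in graph mode a variable reports a `_ref` dtype, and `base_dtype` strips the ref-ness; a raw NumPy dtype has no `base_dtype`, which is what the `AttributeError` branch covers):

```python
import tensorflow as tf

v = tf.Variable(tf.zeros([3, 4]))   # graph mode: v.dtype is 'float32_ref'
dtype = v.dtype
try:
    dtype = dtype.base_dtype        # back to plain 'float32'
except AttributeError:
    pass                            # e.g. a NumPy dtype was passed instead
w = tf.get_variable('w', shape=[5, 6], dtype=dtype)
print(w.dtype.base_dtype)           # <dtype: 'float32'>
```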
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py index 93e64b4ab0b..277c11386dd 100644 --- a/tensorflow/python/training/supervisor.py +++ b/tensorflow/python/training/supervisor.py @@ -994,7 +994,7 @@ class SVSummaryThread(coordinator.LooperThread): summary_strs = self._sess.run(self._sv.summary_op) global_step = None if self._sv.summary_writer: - logging.info("Recording summary at step %d.", global_step) + logging.info("Recording summary at step %s.", global_step) self._sv.summary_writer.add_summary(summary_strs, global_step) diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index e441321fc86..9b8e23babd6 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -227,7 +227,7 @@ string ToString(CUresult result) { // created by StreamExecutor (to ensure that the CUDA runtime didn't create a // context behind our backs). CUcontext CurrentContext() { - CUcontext current = CUDADriver::CurrentContextOrDie(); + CUcontext current = CUDADriver::CurrentContextOrDie(); if (current != nullptr && !CreatedContexts::Has(current)) { LOG(FATAL) << "current context was not created by the StreamExecutor " "cuda_driver API: " @@ -480,27 +480,56 @@ bool DeviceOptionsToContextFlags(DeviceOptions device_options, int *flags) { CUdevice device, DeviceOptions device_options, CudaContext** context) { *context = nullptr; - CUcontext former_context = CurrentContext(); - if (former_context != nullptr) { - LOG(WARNING) << "creating context when one is currently active; existing: " - << former_context; - } - int flags = 0; if (!DeviceOptionsToContextFlags(device_options, &flags)) { LOG(WARNING) << "could not convert all device options into context flags"; } CUresult res; + CUcontext former_context; CUcontext new_context; { // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their // context creation: see http://b/13248943 #if CUDA_VERSION >= 7000 - res = cuDevicePrimaryCtxSetFlags(device, flags); + { + unsigned int former_primary_context_flags; + int former_primary_context_is_active; + CHECK_EQ(CUDA_SUCCESS, + cuDevicePrimaryCtxGetState(device, &former_primary_context_flags, + &former_primary_context_is_active)); + if (former_primary_context_flags != flags) { + if (former_primary_context_is_active) { + LOG(ERROR) + << "The primary context is active and has a different flag set (" + << former_primary_context_flags << ") than the desired flag set (" + << flags << ")."; + } else { + CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags)); + } + } + } + + former_context = CUDADriver::CurrentContextOrDie(); res = cuDevicePrimaryCtxRetain(&new_context, device); + if (former_context != nullptr) { + if (former_context == new_context) { + VLOG(2) << "The primary context " << former_context + << " exists before initializing the StreamExecutor."; + } else { + LOG(WARNING) << "A non-primary context " << former_context + << " exists before initializing the StreamExecutor. 
We " + "haven't verified StreamExecutor works with that."; + } + } #else + former_context = CurrentContext(); + if (former_context != nullptr) { + LOG(WARNING) + << "creating context when one is currently active; existing: " + << former_context; + } res = cuCtxCreate(&new_context, flags, device); #endif } From 1457d7ffdcee6a18619a74bdc465ffa60c0fd1ff Mon Sep 17 00:00:00 2001 From: Ian Langmore Date: Wed, 3 May 2017 14:31:45 -0800 Subject: [PATCH 48/51] sparse_ops: Preserving static shape info in sparse_reshape, sparse_reorder, sparse_add, sparse_reset_shape in the cases where all input shapes are known and do not contain implicit "-1" dimensions. Exceptions are raises when appropriate, preventing a dishonest static shape from being set. Change: 155013345 --- .../python/kernel_tests/sparse_add_op_test.py | 1 + .../python/kernel_tests/sparse_ops_test.py | 19 +++++- .../kernel_tests/sparse_reorder_op_test.py | 7 ++ .../kernel_tests/sparse_reshape_op_test.py | 13 ++++ tensorflow/python/ops/sparse_ops.py | 64 ++++++++++++++++--- 5 files changed, 92 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/kernel_tests/sparse_add_op_test.py b/tensorflow/python/kernel_tests/sparse_add_op_test.py index 874dcbabf10..555c16194e1 100644 --- a/tensorflow/python/kernel_tests/sparse_add_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_add_op_test.py @@ -88,6 +88,7 @@ class SparseAddTest(test.TestCase): for sp_a in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()): for sp_b in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()): sp_sum = sparse_ops.sparse_add(sp_a, sp_b) + self.assertAllEqual((3, 3), sp_sum.get_shape()) sum_out = sess.run(sp_sum) diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py index 06d5cbaf2d0..bad11a29df0 100644 --- a/tensorflow/python/kernel_tests/sparse_ops_test.py +++ b/tensorflow/python/kernel_tests/sparse_ops_test.py @@ -328,6 +328,12 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase): return sparse_tensor.SparseTensorValue(self._IND_2_5_6, self._VAL_2_5_6, self._SHP_2_5_6) + def testStaticShapeInfoPreservedWhenNewShapeIsProvidedAndStatic(self): + sp_input = self._SparseTensor_2x5x6() + new_shape = np.array([3, 6, 7], dtype=np.int64) + sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape) + self.assertAllEqual([3, 6, 7], sp_output.get_shape()) + def testBasic(self): with self.test_session(use_gpu=False) as sess: sp_input = self._SparseTensor_2x5x6() @@ -397,14 +403,21 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase): with self.assertRaisesOpError("x == y did not hold element-wise"): sess.run(out, feed_dict={new_shape: np.array([3, 7], dtype=np.int64)}) - def testInvalidDimensionSize(self): + def testInvalidDimensionSizeStatic(self): + sp_input = self._SparseTensor_2x5x6() + new_shape = np.array([3, 7, 5], dtype=np.int64) + + with self.assertRaisesRegexp(ValueError, "should have dimension sizes"): + sparse_ops.sparse_reset_shape(sp_input, new_shape) + + def testInvalidDimensionSizeDynamic(self): with self.test_session(use_gpu=False) as sess: sp_input = self._SparseTensor_2x5x6() - new_shape = np.array([3, 7, 5], dtype=np.int64) + new_shape = array_ops.placeholder(dtype=dtypes.int32) out = sparse_ops.sparse_reset_shape(sp_input, new_shape) with self.assertRaisesOpError("x <= y did not hold element-wise"): - sess.run(out) + sess.run(out, feed_dict={new_shape: [3, 7, 5]}) def testInvalidDimensionSizeInputUnavailableInGraphConstruction(self): sp_input = 
diff --git a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
index 5136cdadead..18335d665af 100644
--- a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
@@ -48,6 +48,13 @@ class SparseReorderTest(test.TestCase):
     shape = np.array([5, 6]).astype(np.int64)
     return sparse_tensor.SparseTensorValue(ind, val, shape)
 
+  def testStaticShapeInfoPreserved(self):
+    sp_input = sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_5x6(np.arange(6)))
+    self.assertAllEqual((5, 6), sp_input.get_shape())
+    sp_output = sparse_ops.sparse_reorder(sp_input)
+    self.assertAllEqual((5, 6), sp_output.get_shape())
+
   def testAlreadyInOrder(self):
     with self.test_session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6(np.arange(6))
diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
index 1bb05aa3b2a..42874ea9b7a 100644
--- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
@@ -50,6 +50,13 @@ class SparseReshapeTest(test.TestCase):
     shape = np.array([2, 3, 4])
     return sparse_tensor.SparseTensorValue(ind, val, shape)
 
+  def testStaticShapeInfoPreserved(self):
+    sp_input = sparse_tensor.SparseTensor.from_value(
+        self._SparseTensorValue_5x6())
+    self.assertAllEqual((5, 6), sp_input.get_shape())
+    sp_output = sparse_ops.sparse_reshape(sp_input, shape=(1, 5, 2, 3))
+    self.assertAllEqual((1, 5, 2, 3), sp_output.get_shape())
+
   def testSameShape(self):
     with self.test_session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6()
@@ -180,6 +187,12 @@ class SparseReshapeTest(test.TestCase):
       with self.assertRaisesOpError("only one output shape size may be -1"):
         sess.run(sp_output, {sp_input: input_val})
 
+  def testProvideStaticallyMismatchedSizes(self):
+    input_val = self._SparseTensorValue_5x6()
+    sp_input = sparse_tensor.SparseTensor.from_value(input_val)
+    with self.assertRaisesRegexp(ValueError, "Cannot reshape"):
+      sparse_ops.sparse_reshape(sp_input, [4, 7])
+
   def testFeedMismatchedSizes(self):
     with self.test_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index f8eb34aa5eb..0140a27aaa7 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -51,6 +51,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -288,12 +289,21 @@ def sparse_add(a, b, thresh=0):
 
   if all(isinstance(inp, sparse_classes) for inp in [a, b]):
     a = _convert_to_sparse_tensor(a)
+    b = _convert_to_sparse_tensor(b)
     thresh = ops.convert_to_tensor(
         thresh, dtype=a.values.dtype.real_dtype, name="thresh")
     output_ind, output_val, output_shape = (gen_sparse_ops._sparse_add(
         a.indices, a.values, a.dense_shape, b.indices, b.values, b.dense_shape,
         thresh))
+
+    # Attempt to get output_shape statically. 
+ a.get_shape().assert_is_compatible_with(b.get_shape()) + static_shape = array_ops.broadcast_static_shape( + a.get_shape(), b.get_shape()) + if static_shape.is_fully_defined(): + output_shape = static_shape.as_list() + return sparse_tensor.SparseTensor(output_ind, output_val, output_shape) else: # swap to make `a` the SparseTensor. @@ -368,8 +378,12 @@ def sparse_reorder(sp_input, name=None): reordered_ind, reordered_val = (gen_sparse_ops._sparse_reorder( sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)) - return sparse_tensor.SparseTensor(reordered_ind, reordered_val, - array_ops.identity(sp_input.dense_shape)) + if sp_input.get_shape().is_fully_defined(): + dense_shape = sp_input.get_shape().as_list() + else: + dense_shape = array_ops.identity(sp_input.dense_shape) + + return sparse_tensor.SparseTensor(reordered_ind, reordered_val, dense_shape) def sparse_reshape(sp_input, shape, name=None): @@ -416,13 +430,30 @@ def sparse_reshape(sp_input, shape, name=None): Raises: TypeError: If `sp_input` is not a `SparseTensor`. + ValueError: If argument `shape` requests a `SparseTensor` with a different + number of elements than `sp_input`. """ sp_input = _convert_to_sparse_tensor(sp_input) + shape = ops.convert_to_tensor(shape, dtype=dtypes.int64) with ops.name_scope(name, "SparseReshape", [sp_input]) as name: reshaped_ind, reshaped_shape = gen_sparse_ops._sparse_reshape( sp_input.indices, sp_input.dense_shape, shape, name=name) + reshaped_shape_const = tensor_util.constant_value(shape) + if (reshaped_shape_const is not None + and sp_input.get_shape().is_fully_defined()): + # Don't deal with inferred dimensions. That would add significant code. + if all(n >= 0 for n in reshaped_shape_const): + reshaped_size = np.prod(reshaped_shape_const) + in_shape_size = np.prod(sp_input.get_shape().as_list()) + if reshaped_size != in_shape_size: + raise ValueError( + "Cannot reshape a tensor with %d elements to shape %s " + "(%d elements)." + % (in_shape_size, reshaped_shape_const, reshaped_size)) + reshaped_shape = reshaped_shape_const + return sparse_tensor.SparseTensor( reshaped_ind, array_ops.identity(sp_input.values), reshaped_shape) @@ -986,6 +1017,8 @@ def sparse_reset_shape(sp_input, new_shape=None): TypeError: If `sp_input` is not a `SparseTensor`. ValueError: If `new_shape` represents a tensor with a different rank from that of `sp_input` (if shapes are known when graph is constructed). + ValueError: If `new_shape` is determined during graph build to have + dimension sizes that are too small. OpError: - If `new_shape` has dimension sizes that are too small. - If shapes are not known during graph construction time, and during run @@ -1009,14 +1042,27 @@ def sparse_reset_shape(sp_input, new_shape=None): # error before the sparse_tensor.SparseTensor catches it. output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0]) - # For cases where shape is not known during graph construction. 
- output_shape_tensor = control_flow_ops.with_dependencies( - [check_ops.assert_equal( - array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))], - output_shape_tensor) - output_shape_tensor = control_flow_ops.with_dependencies( - [check_ops.assert_less_equal(in_shape, output_shape_tensor)], + output_shape_tensor_const = tensor_util.constant_value( output_shape_tensor) + # For cases where all shapes are known during graph construction + if (output_shape_tensor_const is not None + and sp_input.get_shape().is_fully_defined()): + in_shape_const = np.array(sp_input.get_shape().as_list()) + if not np.all(in_shape_const <= output_shape_tensor_const): + raise ValueError( + "Requested new_shape should have dimension sizes >= sp_input.shape." + " Found new_shape (%s), sp_input.shape (%s)." + % (in_shape_const, output_shape_tensor_const)) + output_shape_tensor = output_shape_tensor_const + else: + # For cases where shape is not known during graph construction. + output_shape_tensor = control_flow_ops.with_dependencies( + [check_ops.assert_equal( + array_ops.shape(in_shape), array_ops.shape(output_shape_tensor))], + output_shape_tensor) + output_shape_tensor = control_flow_ops.with_dependencies( + [check_ops.assert_less_equal(in_shape, output_shape_tensor)], + output_shape_tensor) return sparse_tensor.SparseTensor(in_indices, in_values, output_shape_tensor) From 6b493f72c82593cb1a642af2d091e93b15b56ddc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 3 May 2017 14:58:36 -0800 Subject: [PATCH 49/51] Change contrib estimator to save relative paths in checkpoint. Change: 155016674 --- .../python/learn/estimators/estimator.py | 3 +- .../python/learn/estimators/estimator_test.py | 36 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 123db50d325..74a6da20d4e 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -966,7 +966,8 @@ class BaseEstimator( saver.Saver( sharded=True, max_to_keep=self._config.keep_checkpoint_max, - defer_build=True)) + defer_build=True, + save_relative_paths=True)) chief_hooks = [] if (self._config.save_checkpoints_secs or diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py index 8c61ffad553..c95df75356b 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py @@ -28,6 +28,8 @@ import numpy as np import six from six.moves import xrange # pylint: disable=redefined-builtin +from google.protobuf import text_format + from tensorflow.contrib import learn from tensorflow.contrib import lookup from tensorflow.contrib.framework.python.ops import variables @@ -50,6 +52,7 @@ from tensorflow.python.client import session as session_lib from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.lib.io import file_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops @@ -61,6 +64,7 @@ from tensorflow.python.platform import test from tensorflow.python.saved_model import loader from tensorflow.python.saved_model import tag_constants from 
tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import checkpoint_state_pb2 from tensorflow.python.training import input as input_lib from tensorflow.python.training import monitored_session from tensorflow.python.training import saver as saver_lib @@ -674,6 +678,38 @@ class EstimatorTest(test.TestCase): metrics={'MSE': metric_ops.streaming_mean_squared_error}) self.assertLess(scores3['MSE'], scores['MSE']) + def test_checkpoint_contains_relative_paths(self): + tmpdir = tempfile.mkdtemp() + est = estimator.Estimator( + model_dir=tmpdir, + model_fn=linear_model_fn_with_model_fn_ops) + est.fit(input_fn=boston_input_fn, steps=5) + + checkpoint_file_content = file_io.read_file_to_string( + os.path.join(tmpdir, 'checkpoint')) + ckpt = checkpoint_state_pb2.CheckpointState() + text_format.Merge(checkpoint_file_content, ckpt) + self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5') + self.assertAllEqual( + ['model.ckpt-1', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths) + + def test_train_save_copy_reload(self): + tmpdir = tempfile.mkdtemp() + model_dir1 = os.path.join(tmpdir, 'model_dir1') + est1 = estimator.Estimator( + model_dir=model_dir1, + model_fn=linear_model_fn_with_model_fn_ops) + est1.fit(input_fn=boston_input_fn, steps=5) + + model_dir2 = os.path.join(tmpdir, 'model_dir2') + os.renames(model_dir1, model_dir2) + est2 = estimator.Estimator( + model_dir=model_dir2, + model_fn=linear_model_fn_with_model_fn_ops) + self.assertEqual(5, est2.get_variable_value('global_step')) + est2.fit(input_fn=boston_input_fn, steps=5) + self.assertEqual(10, est2.get_variable_value('global_step')) + def testEstimatorParams(self): boston = base.load_boston() est = estimator.SKCompat( From 4a4a1ebc485d854196f0c2231842f70eebeb8981 Mon Sep 17 00:00:00 2001 From: Neal Wu Date: Wed, 3 May 2017 16:46:57 -0800 Subject: [PATCH 50/51] TensorFlow documentation fixes Change: 155029319 --- tensorflow/contrib/layers/python/layers/layers.py | 6 +++--- tensorflow/docs_src/performance/index.md | 2 +- tensorflow/docs_src/programmers_guide/variable_scope.md | 8 ++++---- tensorflow/docs_src/tutorials/deep_cnn.md | 2 +- tensorflow/python/layers/normalization.py | 4 +++- tensorflow/python/ops/variable_scope.py | 2 +- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index c920764803d..32ca0c38d91 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -844,7 +844,7 @@ def convolution(inputs, variable would be created and added the activations. Finally, if `activation_fn` is not `None`, it is applied to the activations as well. - Performs a'trous convolution with input stride/dilation rate equal to `rate` + Performs atrous convolution with input stride/dilation rate equal to `rate` if a value > 1 for any dimension of `rate` is specified. In this case `stride` values != 1 are not supported. @@ -870,7 +870,7 @@ def convolution(inputs, "NCW". For N=2, the valid values are "NHWC" (default) and "NCHW". For N=3, the valid values are "NDHWC" (default) and "NCDHW". rate: A sequence of N positive integers specifying the dilation rate to use - for a'trous convolution. Can be a single integer to specify the same + for atrous convolution. Can be a single integer to specify the same value for all spatial dimensions. Specifying any `rate` value != 1 is incompatible with specifying any `stride` value != 1. 
activation_fn: Activation function. The default value is a ReLU function. @@ -1865,7 +1865,7 @@ def separable_convolution2d( depthwise convolution stride. Can be an int if both strides are the same. padding: One of 'VALID' or 'SAME'. rate: A list of length 2: [rate_height, rate_width], specifying the dilation - rates for a'trous convolution. Can be an int if both rates are the same. + rates for atrous convolution. Can be an int if both rates are the same. If any value is larger than one, then both stride values need to be one. activation_fn: Activation function. The default value is a ReLU function. Explicitly set it to None to skip it and maintain a linear activation. diff --git a/tensorflow/docs_src/performance/index.md b/tensorflow/docs_src/performance/index.md index 746dc0c74fe..7c1cd152d37 100644 --- a/tensorflow/docs_src/performance/index.md +++ b/tensorflow/docs_src/performance/index.md @@ -9,7 +9,7 @@ deeper with techniques detailed in @{$performance_models$High-Performance Models practices for optimizing your TensorFlow code. * @{$performance_models$High-Performance Models}, which contains a collection - advanced techniques to build highly scalable models targeting different + of advanced techniques to build highly scalable models targeting different system types and network topologies. * @{$benchmarks$Benchmarks}, which contains a collection of benchmark diff --git a/tensorflow/docs_src/programmers_guide/variable_scope.md b/tensorflow/docs_src/programmers_guide/variable_scope.md index 5084acbab97..f4d2b3f37b8 100644 --- a/tensorflow/docs_src/programmers_guide/variable_scope.md +++ b/tensorflow/docs_src/programmers_guide/variable_scope.md @@ -5,7 +5,7 @@ in the way described in the @{$variables$Variables HowTo}. But when building complex models you often need to share large sets of variables and you might want to initialize all of them in one place. This tutorial shows how this can be done using `tf.variable_scope()` and -the `tf.get_variable()`. +`tf.get_variable()`. ## The Problem @@ -368,6 +368,6 @@ sequence-to-sequence models. File | What's in it? --- | --- -`models/tutorials/image/cifar10/cifar10.py` | Model for detecting objects in images. -`models/tutorials/rnn/rnn_cell.py` | Cell functions for recurrent neural networks. -`models/tutorials/rnn/seq2seq.py` | Functions for building sequence-to-sequence models. +`tutorials/image/cifar10/cifar10.py` | Model for detecting objects in images. +`tutorials/rnn/rnn_cell.py` | Cell functions for recurrent neural networks. +`tutorials/rnn/seq2seq.py` | Functions for building sequence-to-sequence models. diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md index d6a136fee47..f60c8fd7701 100644 --- a/tensorflow/docs_src/tutorials/deep_cnn.md +++ b/tensorflow/docs_src/tutorials/deep_cnn.md @@ -83,7 +83,7 @@ for details. It consists of 1,068,298 learnable parameters and requires about ## Code Organization The code for this tutorial resides in -[`tensorflow_models/tutorials/image/cifar10/`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/). +[`models/tutorials/image/cifar10/`](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10/). 
File | Purpose --- | --- diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py index 871f840c529..f92ea9b05f5 100644 --- a/tensorflow/python/layers/normalization.py +++ b/tensorflow/python/layers/normalization.py @@ -400,7 +400,9 @@ def batch_normalization(inputs, training: Either a Python boolean, or a TensorFlow boolean scalar tensor (e.g. a placeholder). Whether to return the output in training mode (normalized with statistics of the current batch) or in inference mode - (normalized with moving statistics). + (normalized with moving statistics). **NOTE**: make sure to set this + parameter correctly, or else your training/inference will not work + properly. trainable: Boolean, if `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). name: String, the name of the layer. diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index 43addbe5a52..a29ddfa9f2f 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -1292,7 +1292,7 @@ def _pure_variable_scope(name_or_scope, well-defined semantics. Defaults to False (will later change to True). Yields: - A scope that can be to captured and reused. + A scope that can be captured and reused. Raises: ValueError: when trying to reuse within a create scope, or create within From ddf30ca616de8ceebdf414c9800b857aa004e281 Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Wed, 3 May 2017 16:57:29 -0800 Subject: [PATCH 51/51] Added missing header file to fix the MacOS builds Change: 155030326 --- tensorflow/core/grappler/costs/robust_stats.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/grappler/costs/robust_stats.cc b/tensorflow/core/grappler/costs/robust_stats.cc index 87cda1c0d2e..dba6efae0fd 100644 --- a/tensorflow/core/grappler/costs/robust_stats.cc +++ b/tensorflow/core/grappler/costs/robust_stats.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/grappler/costs/robust_stats.h" #include +#include namespace tensorflow { namespace grappler {