[TF:XLA] Limit autoclustering of ShapeConsumingOps on the first pass.
Additionally prevent partial_declustering of these ops.

PiperOrigin-RevId: 248091621
parent c095504e2e
commit c1de6bc140
@@ -203,6 +203,7 @@ tf_cc_test(
    deps = [
        ":ops",
        ":scope",
        "//tensorflow/cc:cc_ops",
        "//tensorflow/core:framework",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
@@ -531,4 +531,23 @@ Scope NewInternalScope(Graph* graph, Status* status, ShapeRefiner* refiner) {
  return InternalScope::NewScope(graph, status, refiner);
}

Status CreateOutputWithScope(string op_name,
                             absl::Span<const ::tensorflow::Input> inputs,
                             const Scope& scope, Output* output) {
  TF_RETURN_IF_ERROR(scope.status());
  const auto unique_name = scope.GetUniqueNameForOp(op_name);
  auto builder = ::tensorflow::NodeBuilder(unique_name, op_name);
  for (auto input : inputs) {
    TF_RETURN_IF_ERROR(scope.status());
    builder = builder.Input(input.node());
  }
  ::tensorflow::Node* ret;
  scope.UpdateBuilder(&builder);
  TF_RETURN_IF_ERROR(scope.status());
  scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
  TF_RETURN_IF_ERROR(scope.status());
  *output = Output(ret, 0);
  return Status::OK();
}

}  // namespace tensorflow
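For orientation, here is a minimal sketch of how the new CreateOutputWithScope helper might be called. It mirrors the ScopeTest.CreateOutput test added later in this commit; the function name BuildAdd and the choice of the "Add" op are illustrative only, not part of the change.

    // Sketch: build an "Add" node through the generic helper rather than a
    // generated op wrapper. Assumes the cc/framework and array_ops headers.
    #include "tensorflow/cc/framework/scope.h"
    #include "tensorflow/cc/ops/array_ops.h"

    tensorflow::Status BuildAdd(tensorflow::Output* out) {
      using tensorflow::Output;
      using tensorflow::Scope;
      Scope root = Scope::NewRootScope();
      Output a = tensorflow::ops::Placeholder(root.WithOpName("a"),
                                              tensorflow::DT_FLOAT);
      // The resulting node is named "add" via the scope and has type "Add".
      return tensorflow::CreateOutputWithScope("Add", {a, a},
                                               root.WithOpName("add"), out);
    }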
@@ -255,6 +255,12 @@ struct CompositeOpScopes {
  Scope last;
};

// Creates a node of the given operation, with the given inputs, and assigns
// the result to `output`. This helper does not support adding additional
// attributes.
Status CreateOutputWithScope(string op_name,
                             absl::Span<const ::tensorflow::Input> inputs,
                             const Scope& scope, Output* output);
/// @}

}  // namespace tensorflow
@@ -14,6 +14,8 @@ limitations under the License.
==============================================================================*/

#include "tensorflow/cc/framework/scope.h"

#include "tensorflow/cc/ops/array_ops.h"
#include "tensorflow/core/platform/test.h"

namespace tensorflow {
@@ -145,4 +147,14 @@ TEST(ScopeTest, ControlDeps) {
  EXPECT_EQ(c_c.control_deps().size(), 3);
}

TEST(ScopeTest, CreateOutput) {
  Scope root = Scope::NewRootScope();
  Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT);
  Output add;
  ASSERT_TRUE(
      CreateOutputWithScope("Add", {a, a}, root.WithOpName("add"), &add).ok());
  EXPECT_EQ(add.node()->name(), "add");
  EXPECT_EQ(add.node()->type_string(), "Add");
}

}  // namespace tensorflow
@@ -410,6 +410,16 @@ absl::Span<const int32> GraphCycles::Predecessors(int32 node) const {
  return rep_->nodes_[node]->in.GetSequence();
}

std::vector<int32> GraphCycles::SuccessorsCopy(int32 node) const {
  absl::Span<const int32> successors = Successors(node);
  return std::vector<int32>(successors.begin(), successors.end());
}

std::vector<int32> GraphCycles::PredecessorsCopy(int32 node) const {
  absl::Span<const int32> predecessors = Predecessors(node);
  return std::vector<int32>(predecessors.begin(), predecessors.end());
}

namespace {
void SortInPostOrder(absl::Span<Node* const> nodes,
                     std::vector<int32>* to_sort) {
@@ -118,9 +118,18 @@ class GraphCycles {
  // Expensive: should only be called from graphcycles_test.cc.
  bool CheckInvariants() const;

  // Warning: Do not use these if iterating over the span and modifying the
  // GraphCycles at the same time. Instead use SuccessorsCopy/PredecessorsCopy.
  absl::Span<const int32> Successors(int32 node) const;
  absl::Span<const int32> Predecessors(int32 node) const;

  // Returns a copy of the successors set. This is needed for code using the
  // collection while modifying the GraphCycles.
  std::vector<int32> SuccessorsCopy(int32 node) const;
  // Returns a copy of the predecessors set. This is needed for code using the
  // collection while modifying the GraphCycles.
  std::vector<int32> PredecessorsCopy(int32 node) const;

  // Returns all nodes in post order.
  //
  // If there is a path from X to Y then X appears after Y in the
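The warning above is the motivation for the copying accessors: Successors()/Predecessors() return spans that alias GraphCycles' internal storage and must not be used while the graph is being mutated. Below is a minimal sketch of the intended pattern, assuming the graphcycles header path used by this pass; MaybeContractEdge is a hypothetical mutation, standing in for something like TryToContractEdge.

    #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"

    // Hypothetical helper that may add or remove edges in `cycles`.
    void MaybeContractEdge(tensorflow::GraphCycles* cycles,
                           tensorflow::int32 from, tensorflow::int32 to);

    void VisitSuccessorsWhileMutating(tensorflow::GraphCycles* cycles,
                                      tensorflow::int32 node) {
      // SuccessorsCopy returns an owned vector, so the loop stays valid even
      // if MaybeContractEdge modifies the graph; iterating Successors(node)
      // directly here could read an invalidated span.
      for (tensorflow::int32 succ : cycles->SuccessorsCopy(node)) {
        MaybeContractEdge(cycles, node, succ);
      }
    }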
@@ -233,6 +233,29 @@ class MarkForCompilationPassImpl {
  // Returns true if any new clusters were created.
  StatusOr<bool> RunEdgeContractionLoopInPostOrderOnce();

  // Runs through all the nodes in `cycles_graph_` and tries to contract high
  // priority edges for clusters. Returns true if any new clusters were created.
  //
  // There are potentially many maximal clustering results, but they will not
  // all be equally performant. Some clustering decisions are likely to improve
  // performance much more than others, and we cannot order contractions by
  // this cost function, nor can we look at global information while deciding
  // on individual edges to contract. Instead, we make decisions on these
  // important edges first and then on all other edges, maximizing the chance
  // that the most important edges are contracted.
  //
  // An example of where this might occur is with a digraph:
  // {A -> B, B -> C, A -> X, X -> C} where B is a Size operation and X is
  // not-compilable. In this case, the valid clusterings are {A,B} or {B,C}. B
  // should be clustered with A because it will prevent a potentially large
  // tensor from A being computed and copied.
  //
  // This pass ensures that such contractions happen, which cannot be enforced
  // in a single pass with the current algorithm: a single pass cannot look
  // ahead in the graph and prevent B->C from being clustered in anticipation
  // of a later A->B cluster.
  StatusOr<bool> ContractPreferredEdges();

  // Contracts as many edges as possible to create XLA clusters. After this
  // finishes the clustering decisions made are implicitly stored in
  // `clusters_`.
@@ -314,6 +337,13 @@ class MarkForCompilationPassImpl {
  //
  // Returns nullptr if `node_id` is not a compilation candidate.
  Cluster* GetClusterForCyclesGraphNode(int node_id) {
    // We have to check `graph_->FindNodeId(node) == nullptr` because we add all
    // nodes in [0, graph_->num_node_ids()) to the cycle detection graph but the
    // TF graph may be missing some node ids.
    if (node_id >= graph_->num_node_ids() ||
        graph_->FindNodeId(node_id) == nullptr) {
      return nullptr;
    }
    Cluster* cluster = cluster_for_node_[node_id].Get();
    if (cluster) {
      DCHECK_EQ(cluster->cycles_graph_node_id(), node_id);
@@ -581,6 +611,50 @@ Status MarkForCompilationPassImpl::Initialize() {
  return BuildInitialClusterSet();
}

StatusOr<bool> MarkForCompilationPassImpl::ContractPreferredEdges() {
  bool changed = false;
  for (int32 node : cycles_graph_.AllNodesInPostOrder()) {
    Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
    if (!cluster_from) {
      continue;
    }

    // Make a copy of the set of successors because we may modify the graph in
    // TryToContractEdge.
    std::vector<int32> successors_copy =
        cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id());

    for (int to : successors_copy) {
      iteration_count_++;

      Cluster* cluster_to = GetClusterForCyclesGraphNode(to);
      if (!cluster_to) {
        continue;
      }

      if (cluster_to->cluster_size() == 1) {
        Node* n = graph_->FindNodeId(cluster_to->GetIdOfOnlyNode());

        // Shape consuming operations are desirable to cluster with their
        // operands because they return a small set of scalar values after
        // consuming a large amount of data. For example, given a graph
        // X -> Y -> Size -> Z, where the possible clusterings are
        // [{X, Y, Size}, {Z}] or [{X, Y}, {Size, Z}], the better clustering is
        // Size with Y because the output of Size will be a small tensor while
        // Y is a potentially large tensor that must be computed and possibly
        // transposed/copied before the second cluster executes.
        if (IsShapeConsumerOp(*n)) {
          TF_ASSIGN_OR_RETURN(bool contracted_edge,
                              TryToContractEdge(cluster_from, cluster_to));
          changed |= contracted_edge;
        }
      }
    }
  }

  return changed;
}

StatusOr<bool>
MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() {
  bool changed = false;
@@ -596,15 +670,8 @@ MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() {
  // digraph { X->Y; Y->Z; } then collapsing X->Y does not make it possible
  // to contract Y->Z if Y->Z was not contractible originally.
  for (int32 node : cycles_graph_.AllNodesInPostOrder()) {
    // We have to check `graph_->FindNodeId(node) == nullptr` because we add all
    // nodes in [0, graph_->num_node_ids()) to the cycle detection graph but the
    // TF graph may be missing some node ids.
    if (node >= graph_->num_node_ids() || graph_->FindNodeId(node) == nullptr) {
      continue;
    }

    Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
    if (cluster_from == nullptr) {
    if (!cluster_from) {
      continue;
    }
@@ -623,7 +690,13 @@ Status MarkForCompilationPassImpl::RunEdgeContractionLoop() {
  // TODO(hpucha): Handle the case where kXlaClusterAttr is already set (for
  // example, from the Grappler fusion pass).

  TF_ASSIGN_OR_RETURN(bool changed, RunEdgeContractionLoopInPostOrderOnce());
  // Run twice: first target only the contraction of very beneficial edges,
  // then run without restrictions. This helps to minimize data output from
  // clusters (and possible transpose operations before outputs) that might
  // occur if a ShapeConsumingOp sits on the boundary of two clusters due to
  // cycle considerations.
  TF_ASSIGN_OR_RETURN(bool changed, ContractPreferredEdges());

  TF_ASSIGN_OR_RETURN(changed, RunEdgeContractionLoopInPostOrderOnce());

  // Check that RunEdgeContractionLoopInPostOrderOnce is idempotent. Once the
  // linear time post-order scheme has been battle tested we can move this to
@@ -711,10 +784,6 @@ MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency(
  // where a cluster is producing data for multiple devices.
  for (const auto& in_id :
       cycles_graph_.Predecessors(cluster_to.cycles_graph_node_id())) {
    if (in_id >= graph_->num_node_ids()) {
      continue;
    }

    const Cluster* cluster_in = GetClusterForCyclesGraphNode(in_id);
    if (cluster_in) {
      TF_ASSIGN_OR_RETURN(bool devices_compatible,
@@ -1070,19 +1139,11 @@ StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdgesFrom(

  // Make a copy of the set of successors because we may modify the graph in
  // TryToContractEdge.
  std::vector<int32> successors_copy = [&] {
    absl::Span<const int32> successors =
        cycles_graph_.Successors(cluster_from->cycles_graph_node_id());
    return std::vector<int32>(successors.begin(), successors.end());
  }();
  std::vector<int32> successors_copy =
      cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id());

  for (int to : successors_copy) {
    iteration_count_++;
    if (to >= graph_->num_node_ids()) {
      // Node is a fictitious node that is present only in the cycle detection
      // graph. No clustering is possible.
      continue;
    }

    Cluster* cluster_to = GetClusterForCyclesGraphNode(to);
    if (!cluster_to) {
@@ -1548,5 +1548,59 @@ TEST(XlaCompilationTest, DontClusterNodesWithForwardFromAttr) {
  EXPECT_EQ(clusters["test/z"], "");
}

// Note: this relies on other implementation details to test the specific
// heuristic we care about here, so other changes might be at fault if this
// test breaks. What we care about is that if a ShapeConsumingOp can be
// connected with a producer or a consumer and cannot be clustered with both,
// it should be clustered with the producer.
TEST(XlaCompilationTest, ClusterShapeConsumerWithProducer) {
  Scope root = Scope::NewRootScope().ExitOnError();
  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);

  Output x = ops::MatMul(root.WithOpName("test/x"), a, b);
  Output y = ops::Size(root.WithOpName("test/y"), x);
  Output z = ops::Add(root.WithOpName("test/z"), y, y);

  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
  TF_ASSERT_OK(root.ToGraph(graph.get()));

  // Ensure that the "Size" op can only be clustered with either the producer
  // or the consumer by putting them on different devices.
  FindNodeByName(graph.get(), "test/x")->set_assigned_device_name(kGPU0);
  FindNodeByName(graph.get(), "test/y")->set_assigned_device_name(kCPU0);
  FindNodeByName(graph.get(), "test/z")->set_assigned_device_name(kGPU1);

  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));

  std::unordered_map<string, string> clusters = GetClusters(*graph);

  EXPECT_NE(clusters["test/y"], "");
  EXPECT_EQ(clusters["test/x"], clusters["test/y"]);
  EXPECT_NE(clusters["test/z"], clusters["test/y"]);
}

// Test that ShapeConsuming ops are still fully clustered whenever possible.
TEST(XlaCompilationTest, ClusterShapeConsumerWithProducerAndConsumer) {
  Scope root = Scope::NewRootScope().ExitOnError();
  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);

  Output x = ops::MatMul(root.WithOpName("test/x"), a, b);
  Output y = ops::Size(root.WithOpName("test/y"), x);
  Output z = ops::Add(root.WithOpName("test/z"), y, y);

  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
  TF_ASSERT_OK(root.ToGraph(graph.get()));

  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));

  std::unordered_map<string, string> clusters = GetClusters(*graph);

  EXPECT_NE(clusters["test/y"], "");
  EXPECT_EQ(clusters["test/y"], clusters["test/x"]);
  EXPECT_EQ(clusters["test/y"], clusters["test/z"]);
}

}  // namespace
}  // namespace tensorflow
@@ -51,6 +51,15 @@ Status FindNodesToDecluster(const Graph& graph,
      continue;
    }

    // Assume the benefit of not outputting a larger tensor outweighs the
    // benefit of this check.
    // TODO(tpopp): Only apply this if the value being consumed is not output
    // from the cluster to another consumer.
    // TODO(tpopp): See if XlaRun can be modified to avoid this issue
    // completely.
    if (IsShapeConsumerOp(*n)) {
      continue;
    }
    // We assume the only XLA-auto-clusterable operations with side effects are
    // resource variable updates. We can't execute these twice.
    if (HasResourceInputOrOutput(*n)) {
@@ -344,12 +353,6 @@ Status PartiallyDeclusterGraph(Graph* graph,
}  // namespace reduce_recompilation

namespace decluster_root_shape_consumers {
// Returns true if `node` is an operator that consumes only the shape of its
// input, not the data itself.
bool IsShapeConsumerOp(const Node& node) {
  return node.type_string() == "Shape" || node.type_string() == "Rank" ||
         node.type_string() == "Size";
}

Status PartiallyDeclusterGraph(Graph* graph) {
  std::vector<Node*> reverse_post_order;
@@ -40,20 +40,20 @@ limitations under the License.

namespace tensorflow {
namespace {
REGISTER_OP("FakeNullary").Output("out: float");
REGISTER_OP("FakeNullary").Output("out: int32");

REGISTER_OP("FakeBinary")
    .Input("host_in: float")
    .Input("device_in: float")
    .Output("host_out: float")
    .Output("device_out: float");
    .Input("host_in: int32")
    .Input("device_in: int32")
    .Output("host_out: int32")
    .Output("device_out: int32");

REGISTER_OP("FakeResourceVar").Output("out: resource");

REGISTER_OP("FakeResourceUpdate")
    .Input("in: resource")
    .Output("out: resource")
    .Output("something_else: float");
    .Output("something_else: int32");

class FakeBinaryOp : public OpKernel {
 public:
@@ -499,5 +499,29 @@ TEST(PartiallyDeclusterPassTest, MetadataOpsDontStartClusters) {
  EXPECT_EQ(GetXlaClusterForNode(*n_e), absl::nullopt);
}

TEST(PartiallyDeclusterPassTest, MetaConsumersArentDeclustered) {
  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
  tensorflow::Scope in_cluster_and = root.WithXlaCluster("cluster_0");
  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
  Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT);
  Output b = ops::Add(in_cluster_and.WithOpName("b"), a, a);
  Output c = ops::Rank(in_cluster_and.WithOpName("c"), b);

  Output e;
  TF_ASSERT_OK(
      CreateOutputWithScope("FakeBinary", {c, c}, root.WithOpName("e"), &e));

  TF_ASSERT_OK(root.ToGraph(graph.get()));
  TF_ASSERT_OK(PartiallyDecluster(&graph));

  Node* n_b = FindNodeByName(*graph, "b");
  ASSERT_NE(n_b, nullptr);
  EXPECT_EQ(GetXlaClusterForNode(*n_b), "cluster_0");

  Node* n_c = FindNodeByName(*graph, "c");
  ASSERT_NE(n_c, nullptr);
  EXPECT_EQ(GetXlaClusterForNode(*n_c), "cluster_0");
}

}  // namespace
}  // namespace tensorflow
@@ -314,4 +314,8 @@ bool MayCallFunction(const Node& n, const FunctionLibraryDefinition* flib_def) {
    return name_attr_pair.second.has_func();
  });
}
bool IsShapeConsumerOp(const Node& node) {
  return node.type_string() == "Shape" || node.type_string() == "Rank" ||
         node.type_string() == "Size";
}
}  // namespace tensorflow
@@ -83,6 +83,10 @@ bool IsSingleGpuGraph(const Graph& g);
// Returns true if it is possible (but not guaranteed) that `n` calls a
// function.
bool MayCallFunction(const Node& n, const FunctionLibraryDefinition* flib_def);

// Returns true if `node` is an operator that consumes only the shape of its
// input, not the data itself.
bool IsShapeConsumerOp(const Node& node);
}  // namespace tensorflow

#endif  // TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
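As a closing illustration, here is a small sketch of how other JIT code might use the now-shared IsShapeConsumerOp helper to collect shape-consuming nodes; the ShapeConsumers function name is illustrative only and not part of this change.

    #include <vector>

    #include "tensorflow/compiler/jit/xla_cluster_util.h"
    #include "tensorflow/core/graph/graph.h"

    // Collects the Shape/Rank/Size nodes of a graph using IsShapeConsumerOp.
    std::vector<tensorflow::Node*> ShapeConsumers(const tensorflow::Graph& graph) {
      std::vector<tensorflow::Node*> result;
      for (tensorflow::Node* n : graph.op_nodes()) {  // skips source/sink nodes
        if (tensorflow::IsShapeConsumerOp(*n)) {
          result.push_back(n);
        }
      }
      return result;
    }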