[TF:XLA] Limit autoclustering of ShapeConsumingOps on the first pass.
Additionally prevent partial_declustering of these ops.

PiperOrigin-RevId: 248091621
parent c095504e2e
commit c1de6bc140
@@ -203,6 +203,7 @@ tf_cc_test(
    deps = [
        ":ops",
        ":scope",
        "//tensorflow/cc:cc_ops",
        "//tensorflow/core:framework",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
@@ -531,4 +531,23 @@ Scope NewInternalScope(Graph* graph, Status* status, ShapeRefiner* refiner) {
  return InternalScope::NewScope(graph, status, refiner);
}

Status CreateOutputWithScope(string op_name,
                             absl::Span<const ::tensorflow::Input> inputs,
                             const Scope& scope, Output* output) {
  TF_RETURN_IF_ERROR(scope.status());
  const auto unique_name = scope.GetUniqueNameForOp(op_name);
  auto builder = ::tensorflow::NodeBuilder(unique_name, op_name);
  for (auto input : inputs) {
    TF_RETURN_IF_ERROR(scope.status());
    builder = builder.Input(input.node());
  }
  ::tensorflow::Node* ret;
  scope.UpdateBuilder(&builder);
  TF_RETURN_IF_ERROR(scope.status());
  scope.UpdateStatus(builder.Finalize(scope.graph(), &ret));
  TF_RETURN_IF_ERROR(scope.status());
  *output = Output(ret, 0);
  return Status::OK();
}

}  // namespace tensorflow
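For orientation, here is a minimal sketch of how the new CreateOutputWithScope helper might be called. It mirrors the ScopeTest.CreateOutput test added later in this commit; the function name BuildAdd and the choice of the "Add" op are illustrative only, not part of the change.

    // Sketch: build an "Add" node through the generic helper rather than a
    // generated op wrapper. Assumes the cc/framework and array_ops headers.
    #include "tensorflow/cc/framework/scope.h"
    #include "tensorflow/cc/ops/array_ops.h"

    tensorflow::Status BuildAdd(tensorflow::Output* out) {
      using tensorflow::Output;
      using tensorflow::Scope;
      Scope root = Scope::NewRootScope();
      Output a = tensorflow::ops::Placeholder(root.WithOpName("a"),
                                              tensorflow::DT_FLOAT);
      // The resulting node is named "add" via the scope and has type "Add".
      return tensorflow::CreateOutputWithScope("Add", {a, a},
                                               root.WithOpName("add"), out);
    }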
@@ -255,6 +255,12 @@ struct CompositeOpScopes {
  Scope last;
};

// Creates a node of the given operation, with the given inputs, and assigns
// the result to `output`. This helper does not support adding additional
// attributes.
Status CreateOutputWithScope(string op_name,
                             absl::Span<const ::tensorflow::Input> inputs,
                             const Scope& scope, Output* output);
/// @}

}  // namespace tensorflow
@@ -14,6 +14,8 @@ limitations under the License.
==============================================================================*/

#include "tensorflow/cc/framework/scope.h"

#include "tensorflow/cc/ops/array_ops.h"
#include "tensorflow/core/platform/test.h"

namespace tensorflow {
@@ -145,4 +147,14 @@ TEST(ScopeTest, ControlDeps) {
  EXPECT_EQ(c_c.control_deps().size(), 3);
}

TEST(ScopeTest, CreateOutput) {
  Scope root = Scope::NewRootScope();
  Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT);
  Output add;
  ASSERT_TRUE(
      CreateOutputWithScope("Add", {a, a}, root.WithOpName("add"), &add).ok());
  EXPECT_EQ(add.node()->name(), "add");
  EXPECT_EQ(add.node()->type_string(), "Add");
}

}  // namespace tensorflow
@@ -410,6 +410,16 @@ absl::Span<const int32> GraphCycles::Predecessors(int32 node) const {
  return rep_->nodes_[node]->in.GetSequence();
}

std::vector<int32> GraphCycles::SuccessorsCopy(int32 node) const {
  absl::Span<const int32> successors = Successors(node);
  return std::vector<int32>(successors.begin(), successors.end());
}

std::vector<int32> GraphCycles::PredecessorsCopy(int32 node) const {
  absl::Span<const int32> predecessors = Predecessors(node);
  return std::vector<int32>(predecessors.begin(), predecessors.end());
}

namespace {
void SortInPostOrder(absl::Span<Node* const> nodes,
                     std::vector<int32>* to_sort) {
@@ -118,9 +118,18 @@ class GraphCycles {
  // Expensive: should only be called from graphcycles_test.cc.
  bool CheckInvariants() const;

  // Warning: Do not use these if iterating over the span and modifying the
  // GraphCycles at the same time. Instead use SuccessorsCopy/PredecessorsCopy.
  absl::Span<const int32> Successors(int32 node) const;
  absl::Span<const int32> Predecessors(int32 node) const;

  // Returns a copy of the successors set. This is needed for code using the
  // collection while modifying the GraphCycles.
  std::vector<int32> SuccessorsCopy(int32 node) const;
  // Returns a copy of the predecessors set. This is needed for code using the
  // collection while modifying the GraphCycles.
  std::vector<int32> PredecessorsCopy(int32 node) const;

  // Returns all nodes in post order.
  //
  // If there is a path from X to Y then X appears after Y in the
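The warning above is the motivation for the copying accessors: Successors()/Predecessors() return spans that alias GraphCycles' internal storage and must not be used while the graph is being mutated. Below is a minimal sketch of the intended pattern, assuming the graphcycles header path used by this pass; MaybeContractEdge is a hypothetical mutation, standing in for something like TryToContractEdge.

    #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"

    // Hypothetical helper that may add or remove edges in `cycles`.
    void MaybeContractEdge(tensorflow::GraphCycles* cycles,
                           tensorflow::int32 from, tensorflow::int32 to);

    void VisitSuccessorsWhileMutating(tensorflow::GraphCycles* cycles,
                                      tensorflow::int32 node) {
      // SuccessorsCopy returns an owned vector, so the loop stays valid even
      // if MaybeContractEdge modifies the graph; iterating Successors(node)
      // directly here could read an invalidated span.
      for (tensorflow::int32 succ : cycles->SuccessorsCopy(node)) {
        MaybeContractEdge(cycles, node, succ);
      }
    }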
@@ -233,6 +233,29 @@ class MarkForCompilationPassImpl {
  // Returns true if any new clusters were created.
  StatusOr<bool> RunEdgeContractionLoopInPostOrderOnce();

  // Runs through all the nodes in `cycles_graph_` and tries to contract high
  // priority edges for clusters. Returns true if any new clusters were created.
  //
  // There are potentially many maximal clustering results, but they will not
  // all be equally performant. Some clustering decisions are likely to improve
  // performance much more than others, and we cannot order contractions by
  // this cost function, nor can we look at global information while deciding
  // on individual edges to contract. Instead, we make decisions on these
  // important edges first and then on all other edges, maximizing the chance
  // that the most important edges are contracted.
  //
  // An example of where this might occur is with a digraph:
  // {A -> B, B -> C, A -> X, X -> C} where B is a Size operation and X is
  // not-compilable. In this case, the valid clusterings are {A,B} or {B,C}. B
  // should be clustered with A because it will prevent a potentially large
  // tensor from A being computed and copied.
  //
  // This pass ensures that such contractions happen, which cannot be enforced
  // in a single pass with the current algorithm: a single pass cannot look
  // ahead in the graph and prevent B->C from being clustered in anticipation
  // of a later A->B cluster.
  StatusOr<bool> ContractPreferredEdges();

  // Contracts as many edges as possible to create XLA clusters. After this
  // finishes the clustering decisions made are implicitly stored in
  // `clusters_`.
@@ -314,6 +337,13 @@ class MarkForCompilationPassImpl {
  //
  // Returns nullptr if `node_id` is not a compilation candidate.
  Cluster* GetClusterForCyclesGraphNode(int node_id) {
    // We have to check `graph_->FindNodeId(node) == nullptr` because we add all
    // nodes in [0, graph_->num_node_ids()) to the cycle detection graph but the
    // TF graph may be missing some node ids.
    if (node_id >= graph_->num_node_ids() ||
        graph_->FindNodeId(node_id) == nullptr) {
      return nullptr;
    }
    Cluster* cluster = cluster_for_node_[node_id].Get();
    if (cluster) {
      DCHECK_EQ(cluster->cycles_graph_node_id(), node_id);
@@ -581,6 +611,50 @@ Status MarkForCompilationPassImpl::Initialize() {
  return BuildInitialClusterSet();
}

StatusOr<bool> MarkForCompilationPassImpl::ContractPreferredEdges() {
  bool changed = false;
  for (int32 node : cycles_graph_.AllNodesInPostOrder()) {
    Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
    if (!cluster_from) {
      continue;
    }

    // Make a copy of the set of successors because we may modify the graph in
    // TryToContractEdge.
    std::vector<int32> successors_copy =
        cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id());

    for (int to : successors_copy) {
      iteration_count_++;

      Cluster* cluster_to = GetClusterForCyclesGraphNode(to);
      if (!cluster_to) {
        continue;
      }

      if (cluster_to->cluster_size() == 1) {
        Node* n = graph_->FindNodeId(cluster_to->GetIdOfOnlyNode());

        // Shape consuming operations are desirable to cluster with their
        // operands because they return a small set of scalar values after
        // consuming a large amount of data. For example, given a graph
        // X -> Y -> Size -> Z, where the possible clusterings are
        // [{X, Y, Size}, {Z}] or [{X, Y}, {Size, Z}], the better clustering is
        // Size with Y because the output of Size will be a small tensor while
        // Y is a potentially large tensor that must be computed and possibly
        // transposed/copied before the second cluster executes.
        if (IsShapeConsumerOp(*n)) {
          TF_ASSIGN_OR_RETURN(bool contracted_edge,
                              TryToContractEdge(cluster_from, cluster_to));
          changed |= contracted_edge;
        }
      }
    }
  }

  return changed;
}

StatusOr<bool>
MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() {
  bool changed = false;
@@ -596,15 +670,8 @@ MarkForCompilationPassImpl::RunEdgeContractionLoopInPostOrderOnce() {
  // digraph { X->Y; Y->Z; } then collapsing X->Y does not make it possible
  // to contract Y->Z if Y->Z was not contractible originally.
  for (int32 node : cycles_graph_.AllNodesInPostOrder()) {
    // We have to check `graph_->FindNodeId(node) == nullptr` because we add all
    // nodes in [0, graph_->num_node_ids()) to the cycle detection graph but the
    // TF graph may be missing some node ids.
    if (node >= graph_->num_node_ids() || graph_->FindNodeId(node) == nullptr) {
      continue;
    }

    Cluster* cluster_from = GetClusterForCyclesGraphNode(node);
    if (cluster_from == nullptr) {
    if (!cluster_from) {
      continue;
    }
@@ -623,7 +690,13 @@ Status MarkForCompilationPassImpl::RunEdgeContractionLoop() {
  // TODO(hpucha): Handle the case where kXlaClusterAttr is already set (for
  // example, from the Grappler fusion pass).

  TF_ASSIGN_OR_RETURN(bool changed, RunEdgeContractionLoopInPostOrderOnce());
  // Run twice: first target only the contraction of very beneficial edges,
  // then run without restrictions. This helps to minimize data output from
  // clusters (and possible transpose operations before outputs) that might
  // occur if a ShapeConsumingOp sits on the boundary of two clusters due to
  // cycle considerations.
  TF_ASSIGN_OR_RETURN(bool changed, ContractPreferredEdges());

  TF_ASSIGN_OR_RETURN(changed, RunEdgeContractionLoopInPostOrderOnce());

  // Check that RunEdgeContractionLoopInPostOrderOnce is idempotent. Once the
  // linear time post-order scheme has been battle tested we can move this to
@@ -711,10 +784,6 @@ MarkForCompilationPassImpl::ClusteringWillIntroduceInterDeviceDependency(
  // where a cluster is producing data for multiple devices.
  for (const auto& in_id :
       cycles_graph_.Predecessors(cluster_to.cycles_graph_node_id())) {
    if (in_id >= graph_->num_node_ids()) {
      continue;
    }

    const Cluster* cluster_in = GetClusterForCyclesGraphNode(in_id);
    if (cluster_in) {
      TF_ASSIGN_OR_RETURN(bool devices_compatible,
@@ -1070,19 +1139,11 @@ StatusOr<bool> MarkForCompilationPassImpl::TryToContractEdgesFrom(

  // Make a copy of the set of successors because we may modify the graph in
  // TryToContractEdge.
  std::vector<int32> successors_copy = [&] {
    absl::Span<const int32> successors =
        cycles_graph_.Successors(cluster_from->cycles_graph_node_id());
    return std::vector<int32>(successors.begin(), successors.end());
  }();
  std::vector<int32> successors_copy =
      cycles_graph_.SuccessorsCopy(cluster_from->cycles_graph_node_id());

  for (int to : successors_copy) {
    iteration_count_++;
    if (to >= graph_->num_node_ids()) {
      // Node is a fictitious node that is present only in the cycle detection
      // graph. No clustering is possible.
      continue;
    }

    Cluster* cluster_to = GetClusterForCyclesGraphNode(to);
    if (!cluster_to) {
@@ -1548,5 +1548,59 @@ TEST(XlaCompilationTest, DontClusterNodesWithForwardFromAttr) {
  EXPECT_EQ(clusters["test/z"], "");
}

// Note: this relies on other implementation details to test the specific
// heuristic we care about here, so other changes might be at fault if this
// test breaks. What we care about is that if a ShapeConsumingOp can be
// connected with a producer or a consumer and cannot be clustered with both,
// it should be clustered with the producer.
TEST(XlaCompilationTest, ClusterShapeConsumerWithProducer) {
  Scope root = Scope::NewRootScope().ExitOnError();
  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);

  Output x = ops::MatMul(root.WithOpName("test/x"), a, b);
  Output y = ops::Size(root.WithOpName("test/y"), x);
  Output z = ops::Add(root.WithOpName("test/z"), y, y);

  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
  TF_ASSERT_OK(root.ToGraph(graph.get()));

  // Ensure that the "Size" op can only be clustered with either the producer
  // or the consumer by putting them on different devices.
  FindNodeByName(graph.get(), "test/x")->set_assigned_device_name(kGPU0);
  FindNodeByName(graph.get(), "test/y")->set_assigned_device_name(kCPU0);
  FindNodeByName(graph.get(), "test/z")->set_assigned_device_name(kGPU1);

  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));

  std::unordered_map<string, string> clusters = GetClusters(*graph);

  EXPECT_NE(clusters["test/y"], "");
  EXPECT_EQ(clusters["test/x"], clusters["test/y"]);
  EXPECT_NE(clusters["test/z"], clusters["test/y"]);
}

// Test that ShapeConsuming ops are still fully clustered whenever possible.
TEST(XlaCompilationTest, ClusterShapeConsumerWithProducerAndConsumer) {
  Scope root = Scope::NewRootScope().ExitOnError();
  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);

  Output x = ops::MatMul(root.WithOpName("test/x"), a, b);
  Output y = ops::Size(root.WithOpName("test/y"), x);
  Output z = ops::Add(root.WithOpName("test/z"), y, y);

  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
  TF_ASSERT_OK(root.ToGraph(graph.get()));

  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));

  std::unordered_map<string, string> clusters = GetClusters(*graph);

  EXPECT_NE(clusters["test/y"], "");
  EXPECT_EQ(clusters["test/y"], clusters["test/x"]);
  EXPECT_EQ(clusters["test/y"], clusters["test/z"]);
}

}  // namespace
}  // namespace tensorflow
@@ -51,6 +51,15 @@ Status FindNodesToDecluster(const Graph& graph,
      continue;
    }

    // Assume the benefit of not outputting a larger tensor outweighs the
    // benefit of this check.
    // TODO(tpopp): Only apply this if the value being consumed is not output
    // from the cluster to another consumer.
    // TODO(tpopp): See if XlaRun can be modified to avoid this issue
    // completely.
    if (IsShapeConsumerOp(*n)) {
      continue;
    }
    // We assume the only XLA-auto-clusterable operations with side effects are
    // resource variable updates. We can't execute these twice.
    if (HasResourceInputOrOutput(*n)) {
@@ -344,12 +353,6 @@ Status PartiallyDeclusterGraph(Graph* graph,
}  // namespace reduce_recompilation

namespace decluster_root_shape_consumers {
// Returns true if `node` is an operator that consumes only the shape of its
// input, not the data itself.
bool IsShapeConsumerOp(const Node& node) {
  return node.type_string() == "Shape" || node.type_string() == "Rank" ||
         node.type_string() == "Size";
}

Status PartiallyDeclusterGraph(Graph* graph) {
  std::vector<Node*> reverse_post_order;
@@ -40,20 +40,20 @@ limitations under the License.

namespace tensorflow {
namespace {
REGISTER_OP("FakeNullary").Output("out: float");
REGISTER_OP("FakeNullary").Output("out: int32");

REGISTER_OP("FakeBinary")
    .Input("host_in: float")
    .Input("device_in: float")
    .Output("host_out: float")
    .Output("device_out: float");
    .Input("host_in: int32")
    .Input("device_in: int32")
    .Output("host_out: int32")
    .Output("device_out: int32");

REGISTER_OP("FakeResourceVar").Output("out: resource");

REGISTER_OP("FakeResourceUpdate")
    .Input("in: resource")
    .Output("out: resource")
    .Output("something_else: float");
    .Output("something_else: int32");

class FakeBinaryOp : public OpKernel {
 public:
@@ -499,5 +499,29 @@ TEST(PartiallyDeclusterPassTest, MetadataOpsDontStartClusters) {
  EXPECT_EQ(GetXlaClusterForNode(*n_e), absl::nullopt);
}

TEST(PartiallyDeclusterPassTest, MetaConsumersArentDeclustered) {
  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
  tensorflow::Scope in_cluster_and = root.WithXlaCluster("cluster_0");
  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
  Output a = ops::Placeholder(root.WithOpName("a"), DT_FLOAT);
  Output b = ops::Add(in_cluster_and.WithOpName("b"), a, a);
  Output c = ops::Rank(in_cluster_and.WithOpName("c"), b);

  Output e;
  TF_ASSERT_OK(
      CreateOutputWithScope("FakeBinary", {c, c}, root.WithOpName("e"), &e));

  TF_ASSERT_OK(root.ToGraph(graph.get()));
  TF_ASSERT_OK(PartiallyDecluster(&graph));

  Node* n_b = FindNodeByName(*graph, "b");
  ASSERT_NE(n_b, nullptr);
  EXPECT_EQ(GetXlaClusterForNode(*n_b), "cluster_0");

  Node* n_c = FindNodeByName(*graph, "c");
  ASSERT_NE(n_c, nullptr);
  EXPECT_EQ(GetXlaClusterForNode(*n_c), "cluster_0");
}

}  // namespace
}  // namespace tensorflow
@@ -314,4 +314,8 @@ bool MayCallFunction(const Node& n, const FunctionLibraryDefinition* flib_def) {
    return name_attr_pair.second.has_func();
  });
}
bool IsShapeConsumerOp(const Node& node) {
  return node.type_string() == "Shape" || node.type_string() == "Rank" ||
         node.type_string() == "Size";
}
}  // namespace tensorflow
@@ -83,6 +83,10 @@ bool IsSingleGpuGraph(const Graph& g);
// Returns true if it is possible (but not guaranteed) that `n` calls a
// function.
bool MayCallFunction(const Node& n, const FunctionLibraryDefinition* flib_def);

// Returns true if `node` is an operator that consumes only the shape of its
// input, not the data itself.
bool IsShapeConsumerOp(const Node& node);
}  // namespace tensorflow

#endif  // TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
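As a closing illustration, here is a small sketch of how other JIT code might use the now-shared IsShapeConsumerOp helper to collect shape-consuming nodes; the ShapeConsumers function name is illustrative only and not part of this change.

    #include <vector>

    #include "tensorflow/compiler/jit/xla_cluster_util.h"
    #include "tensorflow/core/graph/graph.h"

    // Collects the Shape/Rank/Size nodes of a graph using IsShapeConsumerOp.
    std::vector<tensorflow::Node*> ShapeConsumers(const tensorflow::Graph& graph) {
      std::vector<tensorflow::Node*> result;
      for (tensorflow::Node* n : graph.op_nodes()) {  // skips source/sink nodes
        if (tensorflow::IsShapeConsumerOp(*n)) {
          result.push_back(n);
        }
      }
      return result;
    }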