TensorFlow: Add additional debugging info to error messages when

a node cannot be placed on a device, specifically when the cause
is due to colocation constraints.

Node colocation constraints can cause a group of nodes to be
colocated with each other in an unsatisfiable way.  For example, if we
have three ops, A, B, C, where A supports GPU and CPU, B supports only
GPU, and C supports only CPU, a colocation group has no satisfiable
assignment.  In these cases, the cause is not just the lack of a
kernel for the op that failed to place, but possibly also the set of
ops in the colocation group.

This change adds additional logging to the error message that lists
the op types and their supported devices, so a user can figure out
which combinations of ops are problematic.

Fixes #2508 in that the error message should be clearer now.
Change: 123330848
This commit is contained in:
Vijay Vasudevan 2016-05-26 09:16:41 -08:00 committed by TensorFlower Gardener
parent afa0112782
commit e8948a2d9d
2 changed files with 68 additions and 5 deletions

View File

@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/simple_placer.h"
#include <memory>
#include <set>
#include <utility>
#include <vector>
@ -182,6 +183,7 @@ class ColocationGraph {
Status ColocateNodes(const Node& x, const Node& y) {
int x_root = FindRoot(x.id());
int y_root = FindRoot(y.id());
Status s;
if (x_root != y_root) {
// Merge the sets by swinging the parent pointer of the smaller
@ -229,6 +231,12 @@ class ColocationGraph {
s.error_message());
}
// Transfer ids in the old group to the new one.
members_[new_root].ids_in_group.insert(
members_[old_root].ids_in_group.begin(),
members_[old_root].ids_in_group.end());
members_[old_root].ids_in_group.clear();
// Ensure that the common root has at least one supported device
// type, by computing the intersection of
// members_[new_root].supported_device_types and
@ -267,6 +275,9 @@ class ColocationGraph {
return Status::OK();
}
// String containing additional debugging info on failures.
string debug_info;
// We have not yet computed the possible devices for the
// colocated node set containing 'node', so we do so now using the
// constraints on the root node.
@ -310,6 +321,8 @@ class ColocationGraph {
// Return an error when a physical device that matches an explicit
// device specification is not found. This ensures that we don't
// assign a node to GPU when the user wanted to force it on CPU.
AddDebugInfo(node_root, &debug_info);
DeviceNameUtils::ParsedName specified_device_name;
if (DeviceNameUtils::ParseFullName(node->def().device(),
&specified_device_name) &&
@ -334,16 +347,17 @@ class ColocationGraph {
node->def().device(),
"' because no devices matching that specification "
"are registered in this process; available devices: ",
str_util::Join(device_names, ", "));
str_util::Join(device_names, ", "), debug_info);
} else if (specified_device_name.has_type) {
return errors::InvalidArgument(
"Could not satisfy explicit device specification '",
node->def().device(), "' because no supported kernel for ",
specified_device_name.type, " devices is available");
specified_device_name.type, " devices is available.",
debug_info);
} else {
return errors::InvalidArgument(
"Could not satisfy explicit device specification '",
node->def().device());
node->def().device(), debug_info);
}
} else {
// The specified device may be a valid device but the
@ -355,7 +369,7 @@ class ColocationGraph {
"required incompatible device '",
DeviceNameUtils::ParsedNameToString(
members_[node_root].device_name),
"'");
"'", debug_info);
}
}
} else {
@ -368,10 +382,11 @@ class ColocationGraph {
device_set_->devices(), members_[node_root].supported_device_types);
if (devices.empty()) {
AddDebugInfo(node_root, &debug_info);
return errors::InvalidArgument(
"Node had no OpKernel registered to support this operation: ",
"Operation was ", node->type_string(), " and inputs were ",
DataTypeVectorString(node->input_types()));
DataTypeVectorString(node->input_types()), debug_info);
}
}
@ -390,6 +405,15 @@ class ColocationGraph {
// id if it is a root. parent <= 0 indicates that this member is invalid.
int parent = -1;
// The set of ids that are part of the disjoint node set forest.
//
// This is only fully specified in the root of a disjoint
// node set forest.
std::set<int> ids_in_group;
// The type of the op for this node.
string op_type;
// A proxy for the depth of the tree that is used to prefer
// connecting smaller trees to larger trees when merging disjoint
// sets.
@ -410,8 +434,41 @@ class ColocationGraph {
std::vector<Device*> possible_devices;
};
// Adds debugging info to 'output' for the node referred to by
// 'node_root'.
void AddDebugInfo(const int node_root, string* output) {
if (members_[node_root].ids_in_group.size() > 1) {
strings::StrAppend(output, "\nColocation Debug Info:\n");
// If this node is part of a colocation group, then we want to
// collect the mapping of ops to supported devices, so that
// the user can see why an unsatisfiable placement occurred.
strings::StrAppend(
output, "Colocation group had the following types and devices: ");
std::unordered_map<string, string> type_to_devices;
for (const int id : members_[node_root].ids_in_group) {
const string& op_type = members_[id].op_type;
string devices_registered;
for (const auto& device_type : members_[id].supported_device_types) {
strings::StrAppend(&devices_registered, DeviceTypeString(device_type),
" ");
}
type_to_devices[op_type] = devices_registered;
}
for (const auto& td : type_to_devices) {
strings::StrAppend(output, "\n", td.first, ": ", td.second);
}
}
}
Status InitializeMember(const Node& node, Member* member) {
const int id = node.id();
member->ids_in_group.insert(id);
member->op_type = node.type_string();
if (id < 0) {
return errors::InvalidArgument("Node id was not positive: ", id);
}

View File

@ -729,6 +729,12 @@ TEST_F(SimplePlacerTest, TestHeterogeneousDeviceSetFailure) {
EXPECT_TRUE(StringPiece(s.error_message())
.contains("colocated with a group of nodes that required "
"incompatible device"));
// The error message should contain information that indicates which
// op types have which registered device types.
EXPECT_TRUE(StringPiece(s.error_message()).contains("VariableGPU: GPU")) << s;
EXPECT_TRUE(StringPiece(s.error_message()).contains("TestAssign: GPU CPU"))
<< s;
}
// Test that placement fails when an unknown device is requested.