TensorFlow: Add additional debugging info to error messages when

a node cannot be placed on a device, specifically when the failure
is caused by colocation constraints.

Node colocation can force a group of nodes to be
colocated with each other in an unsatisfiable way.  For example, if we
have three ops, A, B, C, where A supports GPU and CPU, B supports only
GPU, and C supports only CPU, a colocation group containing all three
has no satisfiable assignment.  In these cases, the cause is not just
the lack of a kernel for the op that failed to place, but possibly the
set of ops in the colocation group.

This change adds additional logging to the error message that lists
the op types and their supported devices, so a user can figure out
which combinations of ops are problematic.

Fixes #2508 in that the error message should be clearer now.
Change: 123330848
This commit is contained in:
Vijay Vasudevan 2016-05-26 09:16:41 -08:00 committed by TensorFlower Gardener
parent afa0112782
commit e8948a2d9d
2 changed files with 68 additions and 5 deletions

View File

@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/simple_placer.h" #include "tensorflow/core/common_runtime/simple_placer.h"
#include <memory> #include <memory>
#include <set>
#include <utility> #include <utility>
#include <vector> #include <vector>
@ -182,6 +183,7 @@ class ColocationGraph {
Status ColocateNodes(const Node& x, const Node& y) { Status ColocateNodes(const Node& x, const Node& y) {
int x_root = FindRoot(x.id()); int x_root = FindRoot(x.id());
int y_root = FindRoot(y.id()); int y_root = FindRoot(y.id());
Status s; Status s;
if (x_root != y_root) { if (x_root != y_root) {
// Merge the sets by swinging the parent pointer of the smaller // Merge the sets by swinging the parent pointer of the smaller
@ -229,6 +231,12 @@ class ColocationGraph {
s.error_message()); s.error_message());
} }
// Transfer ids in the old group to the new one.
members_[new_root].ids_in_group.insert(
members_[old_root].ids_in_group.begin(),
members_[old_root].ids_in_group.end());
members_[old_root].ids_in_group.clear();
// Ensure that the common root has at least one supported device // Ensure that the common root has at least one supported device
// type, by computing the intersection of // type, by computing the intersection of
// members_[new_root].supported_device_types and // members_[new_root].supported_device_types and
@ -267,6 +275,9 @@ class ColocationGraph {
return Status::OK(); return Status::OK();
} }
// String containing additional debugging info on failures.
string debug_info;
// We have not yet computed the possible devices for the // We have not yet computed the possible devices for the
// colocated node set containing 'node', so we do so now using the // colocated node set containing 'node', so we do so now using the
// constraints on the root node. // constraints on the root node.
@ -310,6 +321,8 @@ class ColocationGraph {
// Return an error when a physical device that matches an explicit // Return an error when a physical device that matches an explicit
// device specification is not found. This ensures that we don't // device specification is not found. This ensures that we don't
// assign a node to GPU when the user wanted to force it on CPU. // assign a node to GPU when the user wanted to force it on CPU.
AddDebugInfo(node_root, &debug_info);
DeviceNameUtils::ParsedName specified_device_name; DeviceNameUtils::ParsedName specified_device_name;
if (DeviceNameUtils::ParseFullName(node->def().device(), if (DeviceNameUtils::ParseFullName(node->def().device(),
&specified_device_name) && &specified_device_name) &&
@ -334,16 +347,17 @@ class ColocationGraph {
node->def().device(), node->def().device(),
"' because no devices matching that specification " "' because no devices matching that specification "
"are registered in this process; available devices: ", "are registered in this process; available devices: ",
str_util::Join(device_names, ", ")); str_util::Join(device_names, ", "), debug_info);
} else if (specified_device_name.has_type) { } else if (specified_device_name.has_type) {
return errors::InvalidArgument( return errors::InvalidArgument(
"Could not satisfy explicit device specification '", "Could not satisfy explicit device specification '",
node->def().device(), "' because no supported kernel for ", node->def().device(), "' because no supported kernel for ",
specified_device_name.type, " devices is available"); specified_device_name.type, " devices is available.",
debug_info);
} else { } else {
return errors::InvalidArgument( return errors::InvalidArgument(
"Could not satisfy explicit device specification '", "Could not satisfy explicit device specification '",
node->def().device()); node->def().device(), debug_info);
} }
} else { } else {
// The specified device may be a valid device but the // The specified device may be a valid device but the
@ -355,7 +369,7 @@ class ColocationGraph {
"required incompatible device '", "required incompatible device '",
DeviceNameUtils::ParsedNameToString( DeviceNameUtils::ParsedNameToString(
members_[node_root].device_name), members_[node_root].device_name),
"'"); "'", debug_info);
} }
} }
} else { } else {
@ -368,10 +382,11 @@ class ColocationGraph {
device_set_->devices(), members_[node_root].supported_device_types); device_set_->devices(), members_[node_root].supported_device_types);
if (devices.empty()) { if (devices.empty()) {
AddDebugInfo(node_root, &debug_info);
return errors::InvalidArgument( return errors::InvalidArgument(
"Node had no OpKernel registered to support this operation: ", "Node had no OpKernel registered to support this operation: ",
"Operation was ", node->type_string(), " and inputs were ", "Operation was ", node->type_string(), " and inputs were ",
DataTypeVectorString(node->input_types())); DataTypeVectorString(node->input_types()), debug_info);
} }
} }
@ -390,6 +405,15 @@ class ColocationGraph {
// id if it is a root. parent <= 0 indicates that this member is invalid. // id if it is a root. parent <= 0 indicates that this member is invalid.
int parent = -1; int parent = -1;
// The set of ids that are part of the disjoint node set forest.
//
// This is only fully specified in the root of a disjoint
// node set forest.
std::set<int> ids_in_group;
// The type of the op for this node.
string op_type;
// A proxy for the depth of the tree that is used to prefer // A proxy for the depth of the tree that is used to prefer
// connecting smaller trees to larger trees when merging disjoint // connecting smaller trees to larger trees when merging disjoint
// sets. // sets.
@ -410,8 +434,41 @@ class ColocationGraph {
std::vector<Device*> possible_devices; std::vector<Device*> possible_devices;
}; };
// Adds debugging info to 'output' for the node referred to by
// 'node_root'.
void AddDebugInfo(const int node_root, string* output) {
if (members_[node_root].ids_in_group.size() > 1) {
strings::StrAppend(output, "\nColocation Debug Info:\n");
// If this node is part of a colocation group, then we want to
// collect the mapping of ops to supported devices, so that
// the user can see why an unsatisfiable placement occurred.
strings::StrAppend(
output, "Colocation group had the following types and devices: ");
std::unordered_map<string, string> type_to_devices;
for (const int id : members_[node_root].ids_in_group) {
const string& op_type = members_[id].op_type;
string devices_registered;
for (const auto& device_type : members_[id].supported_device_types) {
strings::StrAppend(&devices_registered, DeviceTypeString(device_type),
" ");
}
type_to_devices[op_type] = devices_registered;
}
for (const auto& td : type_to_devices) {
strings::StrAppend(output, "\n", td.first, ": ", td.second);
}
}
}
Status InitializeMember(const Node& node, Member* member) { Status InitializeMember(const Node& node, Member* member) {
const int id = node.id(); const int id = node.id();
member->ids_in_group.insert(id);
member->op_type = node.type_string();
if (id < 0) { if (id < 0) {
return errors::InvalidArgument("Node id was not positive: ", id); return errors::InvalidArgument("Node id was not positive: ", id);
} }

View File

@ -729,6 +729,12 @@ TEST_F(SimplePlacerTest, TestHeterogeneousDeviceSetFailure) {
EXPECT_TRUE(StringPiece(s.error_message()) EXPECT_TRUE(StringPiece(s.error_message())
.contains("colocated with a group of nodes that required " .contains("colocated with a group of nodes that required "
"incompatible device")); "incompatible device"));
// The error message should contain information that indicates which
// op types have which registered device types.
EXPECT_TRUE(StringPiece(s.error_message()).contains("VariableGPU: GPU")) << s;
EXPECT_TRUE(StringPiece(s.error_message()).contains("TestAssign: GPU CPU"))
<< s;
} }
// Test that placement fails when an unknown device is requested. // Test that placement fails when an unknown device is requested.