Limit colocation group devices after a node is assigned

This change makes Placer's code less brittle. A quick recap of Placer:
Placer first groups nodes into colocation groups. Each group is a set of
nodes and a set of possible devices that can be used for the nodes in
the group. The set of possible devices for a group is encoded in the root
Member of the tree representing this group. All the nodes in a group must
be assigned to the same device.

Before this change, the "all nodes in a group must be assigned to the same
device" property was ensured by a relying on specifics of GetDevicesForNode
and Placer's heuristics.

This change explicitly limits the possible set of devices to the single device
that a node in the group was assigned to. All future calls to GetDevicesForNode
will return just this device making it harder for a heuristic to choose
another device.

I did not find an easy way to test this change because there is no behavior
change. My future changes include a test for this that utilizes multi-device
function calls.

PiperOrigin-RevId: 231628964
This commit is contained in:
Igor Ganichev 2019-01-30 11:05:49 -08:00 committed by TensorFlower Gardener
parent 57c8ec08ab
commit 34b64c3af3
2 changed files with 77 additions and 24 deletions

View File

@ -408,6 +408,44 @@ class ColocationGraph {
return Status::OK();
}
// Limits the possible devices of `node`'s colocation group to the device
// to which `node` is assigned. This makes sure that all nodes in this
// colocation group will be assigned to the same device. Without this
// explicit restriction, heuristics can choose a different possible device
// for other nodes in the group.
Status LimitToAssignedDevice(const Node& node) {
if (node.assigned_device_name_index() < 0) {
return errors::Internal(
"Expected an assigned node as argument to LimitToAssignedDevice but "
"got: ",
node.DebugString());
}
int root = FindRoot(node.id());
Member& root_member = members_[root];
if (node.assigned_device_name_index() ==
root_member.assigned_device_name_index) {
return Status::OK();
}
DeviceNameUtils::ParsedName parsed;
DeviceNameUtils::ParseFullName(node.assigned_device_name(), &parsed);
Status s = DeviceNameUtils::MergeDevNames(&root_member.device_name, parsed,
allow_soft_placement_);
if (!s.ok()) {
return errors::Internal(
"Constraining by assigned device should not cause an error. Original "
"root device name: ",
DeviceNameUtils::ParsedNameToString(root_member.device_name),
" assigned device name \"", node.assigned_device_name(),
". Error: ", s.error_message());
}
root_member.assigned_device_name_index = node.assigned_device_name_index();
// Clear cached possible_devices, if any.
root_member.possible_devices.clear();
return Status::OK();
}
// For the given node, subject to the constraints previously given
// to this ColocationGraph, set its assigned_device_name. Returns OK
// if a satisfying device can be found, otherwise an error.
@ -587,6 +625,16 @@ class ColocationGraph {
// those of all of its children.
DeviceNameUtils::ParsedName device_name;
// Once colocation groups have been formed and we assigned at least
// one node in this group to a device, assigned_device_name_index will
// contain this device name's index in the graph. The `device_name` will
// contain the parsed name of this device and `possible_devices`, if
// computed, will contain just this device.
// `assigned_device_name_index` is an optimization to avoid parsing and
// comparing device names. The value of -1 signals that a single device
// has not been chosen yet.
int assigned_device_name_index = -1;
// If this node is a root, stores a list of Devices to which this node
// and all of its children have been assigned, or nullptr if this
// has not yet been computed.
@ -891,6 +939,30 @@ class ColocationGraph {
const bool allow_soft_placement_;
const bool log_device_placement_;
};
void LogDeviceAssignment(const Node* node, bool log_device_placement) {
// Log placement if log_device_placement is set.
if (log_device_placement) {
printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
node->assigned_device_name().c_str());
LOG(INFO) << node->name() << ": "
<< "(" << node->type_string() << ")"
<< node->assigned_device_name();
}
}
Status AssignAndLog(int assigned_device, Node* node,
ColocationGraph* colocation_graph,
bool log_device_placement) {
node->set_assigned_device_name_index(assigned_device);
// Constraint the group of node to the assigned device.
TF_RETURN_IF_ERROR(colocation_graph->LimitToAssignedDevice(*node));
LogDeviceAssignment(node, log_device_placement);
return Status::OK();
}
} // namespace
Placer::Placer(Graph* graph, const DeviceSet* devices,
@ -927,7 +999,7 @@ Status Placer::Run() {
// devices (e.g., for stateful placements), so the placer should not try to
// place nodes that are already placed.
if (node->has_assigned_device_name()) {
LogDeviceAssignment(node);
LogDeviceAssignment(node, log_device_placement_);
continue;
}
@ -983,7 +1055,8 @@ Status Placer::Run() {
assigned_device = graph_->InternDeviceName((*devices)[0]->name());
}
AssignAndLog(assigned_device, node);
TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
log_device_placement_));
}
// Perform a second pass assignment for those nodes explicitly
@ -1022,7 +1095,8 @@ Status Placer::Run() {
assigned_device = graph_->InternDeviceName((*devices)[0]->name());
}
AssignAndLog(assigned_device, node);
TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
log_device_placement_));
}
return Status::OK();
@ -1045,20 +1119,4 @@ bool Placer::CanAssignToDevice(const string& candidate_device_name,
return false;
}
void Placer::AssignAndLog(int assigned_device, Node* node) const {
node->set_assigned_device_name_index(assigned_device);
LogDeviceAssignment(node);
}
void Placer::LogDeviceAssignment(const Node* node) const {
// Log placement if log_device_placement is set.
if (log_device_placement_) {
printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
node->assigned_device_name().c_str());
LOG(INFO) << node->name() << ": "
<< "(" << node->type_string() << ")"
<< node->assigned_device_name();
}
}
} // namespace tensorflow

View File

@ -88,11 +88,6 @@ class Placer {
bool CanAssignToDevice(const string& candidate_device_name,
const std::vector<Device*>& devices) const;
// Assigns 'node's devices to 'assigned_device', and logs the
// placement if the SessionOptions entry in 'options_' requests it.
void AssignAndLog(int assigned_device, Node* node) const;
void LogDeviceAssignment(const Node* node) const;
Graph* const graph_; // Not owned.
const DeviceSet* const devices_; // Not owned.
const SessionOptions* options_; // Not owned.