Limit colocation group devices after a node is assigned
This change makes Placer's code less brittle. A quick recap of Placer: Placer first groups nodes into colocation groups. Each group is a set of nodes and a set of possible devices that can be used for the nodes in the group. The set of possible devices for a group is encoded in the root Member of the tree representing this group. All the nodes in a group must be assigned to the same device. Before this change, the "all nodes in a group must be assigned to the same device" property was ensured by a relying on specifics of GetDevicesForNode and Placer's heuristics. This change explicitly limits the possible set of devices to the single device that a node in the group was assigned to. All future calls to GetDevicesForNode will return just this device making it harder for a heuristic to choose another device. I did not find an easy way to test this change because there is no behavior change. My future changes include a test for this that utilizes multi-device function calls. PiperOrigin-RevId: 231628964
This commit is contained in:
parent
57c8ec08ab
commit
34b64c3af3
@ -408,6 +408,44 @@ class ColocationGraph {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Limits the possible devices of `node`'s colocation group to the device
|
||||
// to which `node` is assigned. This makes sure that all nodes in this
|
||||
// colocation group will be assigned to the same device. Without this
|
||||
// explicit restriction, heuristics can choose a different possible device
|
||||
// for other nodes in the group.
|
||||
Status LimitToAssignedDevice(const Node& node) {
|
||||
if (node.assigned_device_name_index() < 0) {
|
||||
return errors::Internal(
|
||||
"Expected an assigned node as argument to LimitToAssignedDevice but "
|
||||
"got: ",
|
||||
node.DebugString());
|
||||
}
|
||||
int root = FindRoot(node.id());
|
||||
Member& root_member = members_[root];
|
||||
if (node.assigned_device_name_index() ==
|
||||
root_member.assigned_device_name_index) {
|
||||
return Status::OK();
|
||||
}
|
||||
DeviceNameUtils::ParsedName parsed;
|
||||
DeviceNameUtils::ParseFullName(node.assigned_device_name(), &parsed);
|
||||
Status s = DeviceNameUtils::MergeDevNames(&root_member.device_name, parsed,
|
||||
allow_soft_placement_);
|
||||
if (!s.ok()) {
|
||||
return errors::Internal(
|
||||
"Constraining by assigned device should not cause an error. Original "
|
||||
"root device name: ",
|
||||
DeviceNameUtils::ParsedNameToString(root_member.device_name),
|
||||
" assigned device name \"", node.assigned_device_name(),
|
||||
". Error: ", s.error_message());
|
||||
}
|
||||
|
||||
root_member.assigned_device_name_index = node.assigned_device_name_index();
|
||||
// Clear cached possible_devices, if any.
|
||||
root_member.possible_devices.clear();
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// For the given node, subject to the constraints previously given
|
||||
// to this ColocationGraph, set its assigned_device_name. Returns OK
|
||||
// if a satisfying device can be found, otherwise an error.
|
||||
@ -587,6 +625,16 @@ class ColocationGraph {
|
||||
// those of all of its children.
|
||||
DeviceNameUtils::ParsedName device_name;
|
||||
|
||||
// Once colocation groups have been formed and we assigned at least
|
||||
// one node in this group to a device, assigned_device_name_index will
|
||||
// contain this device name's index in the graph. The `device_name` will
|
||||
// contain the parsed name of this device and `possible_devices`, if
|
||||
// computed, will contain just this device.
|
||||
// `assigned_device_name_index` is an optimization to avoid parsing and
|
||||
// comparing device names. The value of -1 signals that a single device
|
||||
// has not been chosen yet.
|
||||
int assigned_device_name_index = -1;
|
||||
|
||||
// If this node is a root, stores a list of Devices to which this node
|
||||
// and all of its children have been assigned, or nullptr if this
|
||||
// has not yet been computed.
|
||||
@ -891,6 +939,30 @@ class ColocationGraph {
|
||||
const bool allow_soft_placement_;
|
||||
const bool log_device_placement_;
|
||||
};
|
||||
|
||||
void LogDeviceAssignment(const Node* node, bool log_device_placement) {
|
||||
// Log placement if log_device_placement is set.
|
||||
if (log_device_placement) {
|
||||
printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
|
||||
node->assigned_device_name().c_str());
|
||||
LOG(INFO) << node->name() << ": "
|
||||
<< "(" << node->type_string() << ")"
|
||||
<< node->assigned_device_name();
|
||||
}
|
||||
}
|
||||
|
||||
Status AssignAndLog(int assigned_device, Node* node,
|
||||
ColocationGraph* colocation_graph,
|
||||
bool log_device_placement) {
|
||||
node->set_assigned_device_name_index(assigned_device);
|
||||
|
||||
// Constraint the group of node to the assigned device.
|
||||
TF_RETURN_IF_ERROR(colocation_graph->LimitToAssignedDevice(*node));
|
||||
|
||||
LogDeviceAssignment(node, log_device_placement);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Placer::Placer(Graph* graph, const DeviceSet* devices,
|
||||
@ -927,7 +999,7 @@ Status Placer::Run() {
|
||||
// devices (e.g., for stateful placements), so the placer should not try to
|
||||
// place nodes that are already placed.
|
||||
if (node->has_assigned_device_name()) {
|
||||
LogDeviceAssignment(node);
|
||||
LogDeviceAssignment(node, log_device_placement_);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -983,7 +1055,8 @@ Status Placer::Run() {
|
||||
assigned_device = graph_->InternDeviceName((*devices)[0]->name());
|
||||
}
|
||||
|
||||
AssignAndLog(assigned_device, node);
|
||||
TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
|
||||
log_device_placement_));
|
||||
}
|
||||
|
||||
// Perform a second pass assignment for those nodes explicitly
|
||||
@ -1022,7 +1095,8 @@ Status Placer::Run() {
|
||||
assigned_device = graph_->InternDeviceName((*devices)[0]->name());
|
||||
}
|
||||
|
||||
AssignAndLog(assigned_device, node);
|
||||
TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
|
||||
log_device_placement_));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
@ -1045,20 +1119,4 @@ bool Placer::CanAssignToDevice(const string& candidate_device_name,
|
||||
return false;
|
||||
}
|
||||
|
||||
void Placer::AssignAndLog(int assigned_device, Node* node) const {
|
||||
node->set_assigned_device_name_index(assigned_device);
|
||||
LogDeviceAssignment(node);
|
||||
}
|
||||
|
||||
void Placer::LogDeviceAssignment(const Node* node) const {
|
||||
// Log placement if log_device_placement is set.
|
||||
if (log_device_placement_) {
|
||||
printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
|
||||
node->assigned_device_name().c_str());
|
||||
LOG(INFO) << node->name() << ": "
|
||||
<< "(" << node->type_string() << ")"
|
||||
<< node->assigned_device_name();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -88,11 +88,6 @@ class Placer {
|
||||
bool CanAssignToDevice(const string& candidate_device_name,
|
||||
const std::vector<Device*>& devices) const;
|
||||
|
||||
// Assigns 'node's devices to 'assigned_device', and logs the
|
||||
// placement if the SessionOptions entry in 'options_' requests it.
|
||||
void AssignAndLog(int assigned_device, Node* node) const;
|
||||
void LogDeviceAssignment(const Node* node) const;
|
||||
|
||||
Graph* const graph_; // Not owned.
|
||||
const DeviceSet* const devices_; // Not owned.
|
||||
const SessionOptions* options_; // Not owned.
|
||||
|
Loading…
x
Reference in New Issue
Block a user