Limit colocation group devices after a node is assigned
This change makes Placer's code less brittle. A quick recap of Placer: Placer first groups nodes into colocation groups. Each group is a set of nodes and a set of possible devices that can be used for the nodes in the group. The set of possible devices for a group is encoded in the root Member of the tree representing this group. All the nodes in a group must be assigned to the same device. Before this change, the "all nodes in a group must be assigned to the same device" property was ensured by a relying on specifics of GetDevicesForNode and Placer's heuristics. This change explicitly limits the possible set of devices to the single device that a node in the group was assigned to. All future calls to GetDevicesForNode will return just this device making it harder for a heuristic to choose another device. I did not find an easy way to test this change because there is no behavior change. My future changes include a test for this that utilizes multi-device function calls. PiperOrigin-RevId: 231628964
This commit is contained in:
parent
57c8ec08ab
commit
34b64c3af3
@ -408,6 +408,44 @@ class ColocationGraph {
|
|||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Limits the possible devices of `node`'s colocation group to the device
|
||||||
|
// to which `node` is assigned. This makes sure that all nodes in this
|
||||||
|
// colocation group will be assigned to the same device. Without this
|
||||||
|
// explicit restriction, heuristics can choose a different possible device
|
||||||
|
// for other nodes in the group.
|
||||||
|
Status LimitToAssignedDevice(const Node& node) {
|
||||||
|
if (node.assigned_device_name_index() < 0) {
|
||||||
|
return errors::Internal(
|
||||||
|
"Expected an assigned node as argument to LimitToAssignedDevice but "
|
||||||
|
"got: ",
|
||||||
|
node.DebugString());
|
||||||
|
}
|
||||||
|
int root = FindRoot(node.id());
|
||||||
|
Member& root_member = members_[root];
|
||||||
|
if (node.assigned_device_name_index() ==
|
||||||
|
root_member.assigned_device_name_index) {
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
DeviceNameUtils::ParsedName parsed;
|
||||||
|
DeviceNameUtils::ParseFullName(node.assigned_device_name(), &parsed);
|
||||||
|
Status s = DeviceNameUtils::MergeDevNames(&root_member.device_name, parsed,
|
||||||
|
allow_soft_placement_);
|
||||||
|
if (!s.ok()) {
|
||||||
|
return errors::Internal(
|
||||||
|
"Constraining by assigned device should not cause an error. Original "
|
||||||
|
"root device name: ",
|
||||||
|
DeviceNameUtils::ParsedNameToString(root_member.device_name),
|
||||||
|
" assigned device name \"", node.assigned_device_name(),
|
||||||
|
". Error: ", s.error_message());
|
||||||
|
}
|
||||||
|
|
||||||
|
root_member.assigned_device_name_index = node.assigned_device_name_index();
|
||||||
|
// Clear cached possible_devices, if any.
|
||||||
|
root_member.possible_devices.clear();
|
||||||
|
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
// For the given node, subject to the constraints previously given
|
// For the given node, subject to the constraints previously given
|
||||||
// to this ColocationGraph, set its assigned_device_name. Returns OK
|
// to this ColocationGraph, set its assigned_device_name. Returns OK
|
||||||
// if a satisfying device can be found, otherwise an error.
|
// if a satisfying device can be found, otherwise an error.
|
||||||
@ -587,6 +625,16 @@ class ColocationGraph {
|
|||||||
// those of all of its children.
|
// those of all of its children.
|
||||||
DeviceNameUtils::ParsedName device_name;
|
DeviceNameUtils::ParsedName device_name;
|
||||||
|
|
||||||
|
// Once colocation groups have been formed and we assigned at least
|
||||||
|
// one node in this group to a device, assigned_device_name_index will
|
||||||
|
// contain this device name's index in the graph. The `device_name` will
|
||||||
|
// contain the parsed name of this device and `possible_devices`, if
|
||||||
|
// computed, will contain just this device.
|
||||||
|
// `assigned_device_name_index` is an optimization to avoid parsing and
|
||||||
|
// comparing device names. The value of -1 signals that a single device
|
||||||
|
// has not been chosen yet.
|
||||||
|
int assigned_device_name_index = -1;
|
||||||
|
|
||||||
// If this node is a root, stores a list of Devices to which this node
|
// If this node is a root, stores a list of Devices to which this node
|
||||||
// and all of its children have been assigned, or nullptr if this
|
// and all of its children have been assigned, or nullptr if this
|
||||||
// has not yet been computed.
|
// has not yet been computed.
|
||||||
@ -891,6 +939,30 @@ class ColocationGraph {
|
|||||||
const bool allow_soft_placement_;
|
const bool allow_soft_placement_;
|
||||||
const bool log_device_placement_;
|
const bool log_device_placement_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void LogDeviceAssignment(const Node* node, bool log_device_placement) {
|
||||||
|
// Log placement if log_device_placement is set.
|
||||||
|
if (log_device_placement) {
|
||||||
|
printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
|
||||||
|
node->assigned_device_name().c_str());
|
||||||
|
LOG(INFO) << node->name() << ": "
|
||||||
|
<< "(" << node->type_string() << ")"
|
||||||
|
<< node->assigned_device_name();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Status AssignAndLog(int assigned_device, Node* node,
|
||||||
|
ColocationGraph* colocation_graph,
|
||||||
|
bool log_device_placement) {
|
||||||
|
node->set_assigned_device_name_index(assigned_device);
|
||||||
|
|
||||||
|
// Constraint the group of node to the assigned device.
|
||||||
|
TF_RETURN_IF_ERROR(colocation_graph->LimitToAssignedDevice(*node));
|
||||||
|
|
||||||
|
LogDeviceAssignment(node, log_device_placement);
|
||||||
|
return Status::OK();
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
Placer::Placer(Graph* graph, const DeviceSet* devices,
|
Placer::Placer(Graph* graph, const DeviceSet* devices,
|
||||||
@ -927,7 +999,7 @@ Status Placer::Run() {
|
|||||||
// devices (e.g., for stateful placements), so the placer should not try to
|
// devices (e.g., for stateful placements), so the placer should not try to
|
||||||
// place nodes that are already placed.
|
// place nodes that are already placed.
|
||||||
if (node->has_assigned_device_name()) {
|
if (node->has_assigned_device_name()) {
|
||||||
LogDeviceAssignment(node);
|
LogDeviceAssignment(node, log_device_placement_);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -983,7 +1055,8 @@ Status Placer::Run() {
|
|||||||
assigned_device = graph_->InternDeviceName((*devices)[0]->name());
|
assigned_device = graph_->InternDeviceName((*devices)[0]->name());
|
||||||
}
|
}
|
||||||
|
|
||||||
AssignAndLog(assigned_device, node);
|
TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
|
||||||
|
log_device_placement_));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Perform a second pass assignment for those nodes explicitly
|
// Perform a second pass assignment for those nodes explicitly
|
||||||
@ -1022,7 +1095,8 @@ Status Placer::Run() {
|
|||||||
assigned_device = graph_->InternDeviceName((*devices)[0]->name());
|
assigned_device = graph_->InternDeviceName((*devices)[0]->name());
|
||||||
}
|
}
|
||||||
|
|
||||||
AssignAndLog(assigned_device, node);
|
TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
|
||||||
|
log_device_placement_));
|
||||||
}
|
}
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
@ -1045,20 +1119,4 @@ bool Placer::CanAssignToDevice(const string& candidate_device_name,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Placer::AssignAndLog(int assigned_device, Node* node) const {
|
|
||||||
node->set_assigned_device_name_index(assigned_device);
|
|
||||||
LogDeviceAssignment(node);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Placer::LogDeviceAssignment(const Node* node) const {
|
|
||||||
// Log placement if log_device_placement is set.
|
|
||||||
if (log_device_placement_) {
|
|
||||||
printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
|
|
||||||
node->assigned_device_name().c_str());
|
|
||||||
LOG(INFO) << node->name() << ": "
|
|
||||||
<< "(" << node->type_string() << ")"
|
|
||||||
<< node->assigned_device_name();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace tensorflow
|
} // namespace tensorflow
|
||||||
|
@ -88,11 +88,6 @@ class Placer {
|
|||||||
bool CanAssignToDevice(const string& candidate_device_name,
|
bool CanAssignToDevice(const string& candidate_device_name,
|
||||||
const std::vector<Device*>& devices) const;
|
const std::vector<Device*>& devices) const;
|
||||||
|
|
||||||
// Assigns 'node's devices to 'assigned_device', and logs the
|
|
||||||
// placement if the SessionOptions entry in 'options_' requests it.
|
|
||||||
void AssignAndLog(int assigned_device, Node* node) const;
|
|
||||||
void LogDeviceAssignment(const Node* node) const;
|
|
||||||
|
|
||||||
Graph* const graph_; // Not owned.
|
Graph* const graph_; // Not owned.
|
||||||
const DeviceSet* const devices_; // Not owned.
|
const DeviceSet* const devices_; // Not owned.
|
||||||
const SessionOptions* options_; // Not owned.
|
const SessionOptions* options_; // Not owned.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user