RefreshRemoteAttributes() is used to initialize device attributes between workers when we use CollectiveOp. We need it to send the GetStatus RPC with fail_fast = false so that each worker can block while waiting for the other workers to start up.
PiperOrigin-RevId: 266207854
This commit is contained in:
parent
988882bb84
commit
42652b1699
@@ -98,7 +98,7 @@ void DeviceResolverDistributed::RefreshRemoteAttributes(
|
||||
WorkerInterface* worker = worker_cache_->GetOrCreateWorker(task);
|
||||
CHECK(worker) << "Failed to get worker for " << task;
|
||||
worker->GetStatusAsync(
|
||||
req, resp, /*fail_fast=*/true,
|
||||
req, resp, /*fail_fast=*/false,
|
||||
[this, device, task, req, resp, worker, done](Status s) {
|
||||
if (s.ok()) {
|
||||
mutex_lock l(mu_);
|
||||
|
Loading…
Reference in New Issue
Block a user