Add the default worker name to AbortedError messages.
This should help track down which worker is "this worker" in "this worker has restarted" errors.

PiperOrigin-RevId: 221635865
This commit is contained in:
parent
82eb2d85c1
commit
a302885171
@ -30,7 +30,6 @@ limitations under the License.
|
||||
#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
|
||||
#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
|
||||
#include "tensorflow/core/distributed_runtime/session_mgr.h"
|
||||
#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
|
||||
#include "tensorflow/core/framework/rendezvous.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
@ -1028,7 +1027,10 @@ Status RdmaTensorResponse::PrepareRecvTensor(
|
||||
return errors::Aborted(
|
||||
"RecvTensor expects a different device incarnation: ",
|
||||
parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
|
||||
". Your worker job was probably restarted. Check your "
|
||||
". Your worker job (\"",
|
||||
channel_->adapter_->worker_env_->session_mgr->LegacySession()
|
||||
->worker_name,
|
||||
"\") was probably restarted. Check your "
|
||||
"worker job for the reason why it was restarted.");
|
||||
}
|
||||
|
||||
|
@ -122,7 +122,9 @@ Status SessionMgr::WorkerSessionForSessionLocked(
|
||||
auto it = sessions_.find(session_handle);
|
||||
if (it == sessions_.end()) {
|
||||
return errors::Aborted("Session handle is not found: ", session_handle,
|
||||
". Possibly this worker just restarted.");
|
||||
". Possibly this worker (\"",
|
||||
legacy_session_->worker_name,
|
||||
"\") just restarted.");
|
||||
} else {
|
||||
*out_session = it->second;
|
||||
}
|
||||
|
@ -438,7 +438,9 @@ Status Worker::PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
|
||||
return errors::Aborted(
|
||||
"RecvTensor expects a different device incarnation: ",
|
||||
parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
|
||||
". Your worker job was probably restarted. Check your "
|
||||
". Your worker job (\"",
|
||||
env_->session_mgr->LegacySession()->worker_name,
|
||||
"\") was probably restarted. Check your "
|
||||
"worker job for the reason why it was restarted.");
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user