Add the default worker name to AbortedError messages.

This should help identify which worker "this worker" refers to in errors such as "Possibly this worker just restarted."

PiperOrigin-RevId: 221635865
This commit is contained in:
Derek Murray 2018-11-15 09:25:11 -08:00 committed by TensorFlower Gardener
parent 82eb2d85c1
commit a302885171
3 changed files with 10 additions and 4 deletions

View File

@ -30,7 +30,6 @@ limitations under the License.
#include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
#include "tensorflow/core/distributed_runtime/session_mgr.h"
#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
#include "tensorflow/core/framework/rendezvous.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
@ -1028,7 +1027,10 @@ Status RdmaTensorResponse::PrepareRecvTensor(
return errors::Aborted(
"RecvTensor expects a different device incarnation: ",
parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
". Your worker job was probably restarted. Check your "
". Your worker job (\"",
channel_->adapter_->worker_env_->session_mgr->LegacySession()
->worker_name,
"\") was probably restarted. Check your "
"worker job for the reason why it was restarted.");
}

View File

@ -122,7 +122,9 @@ Status SessionMgr::WorkerSessionForSessionLocked(
auto it = sessions_.find(session_handle);
if (it == sessions_.end()) {
return errors::Aborted("Session handle is not found: ", session_handle,
". Possibly this worker just restarted.");
". Possibly this worker (\"",
legacy_session_->worker_name,
"\") just restarted.");
} else {
*out_session = it->second;
}

View File

@ -438,7 +438,9 @@ Status Worker::PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
return errors::Aborted(
"RecvTensor expects a different device incarnation: ",
parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
". Your worker job was probably restarted. Check your "
". Your worker job (\"",
env_->session_mgr->LegacySession()->worker_name,
"\") was probably restarted. Check your "
"worker job for the reason why it was restarted.");
}