Use a separate mutex for updating context state.

Instead of reusing the lock that guards the context map, use a separate mutex for accessing and updating the state of the EagerContext. The two locks serve different purposes, and separating them avoids deadlocks between executing long-running functions and the various other accesses to the context map.
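As a rough sketch of the resulting two-lock scheme (toy standard C++, not TensorFlow source: std::mutex / std::shared_mutex stand in for TF's `mutex`, `mutex_lock`, and `tf_shared_lock`, and every name below is a placeholder):

#include <cstdint>
#include <mutex>
#include <shared_mutex>
#include <unordered_map>

struct ToyContext { /* per-context state */ };

class ToyEagerService {
 public:
  // Map membership keeps its own mutex, so a lookup never waits behind a
  // long-running context update.
  ToyContext* GetContext(uint64_t id) {
    std::lock_guard<std::mutex> l(contexts_mu_);
    auto it = contexts_.find(id);
    return it == contexts_.end() ? nullptr : it->second;
  }

  // Enqueue-style requests take the state lock shared, so many can run
  // concurrently...
  void Enqueue(ToyContext* /*ctx*/) {
    std::shared_lock<std::shared_mutex> l(context_update_mu_);
    // ... execute ops against the current context state ...
  }

  // ...while an update takes it exclusively, draining all of them first.
  void UpdateContext(ToyContext* /*ctx*/) {
    std::unique_lock<std::shared_mutex> l(context_update_mu_);
    // ... clear caches and swap in the new remote workers ...
  }

 private:
  std::mutex contexts_mu_;                              // guards the map itself
  std::unordered_map<uint64_t, ToyContext*> contexts_;  // id -> context
  std::shared_mutex context_update_mu_;                 // guards context state
};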

PiperOrigin-RevId: 285239370
Change-Id: I497e857775101fa1f5773c693330dc0611c3426b
Author:    Haoyu Zhang
Date:      2019-12-12 11:56:29 -08:00
Committer: TensorFlower Gardener
Parent:    88dbc2d867
Commit:    d5cee81f13
2 changed files with 15 additions and 11 deletions

--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc

@@ -277,14 +277,12 @@ Status EagerServiceImpl::UpdateContext(const UpdateContextRequest* request,
       eager::CreateClusterFLR(request->context_id(), ctx, worker_session.get());
   {
-    // Hold `contexts_mu_` exclusively, wait for all pending nodes to finish
-    // (implicitly calling WaitForAllPendingNodes inside `ctx->ClearCaches`),
-    // and update the context state.
-    // This lock prevents other threads from handling enqueue requests at the
-    // same time. Each enqueue request will be processed either with context
-    // state before or after the update, but the exact ordering needs to be
-    // determined by the client if desired.
-    mutex_lock lock(contexts_mu_);
+    // Hold `context_update_mu_` exclusively to update the context state.
+    // This lock prevents other threads from processing an enqueued request
+    // at the same time. Each enqueue request will be processed either with
+    // context state before or after the update, but the exact ordering
+    // needs to be enforced by the client if desired.
+    mutex_lock l(context_update_mu_);
     ctx->ClearCaches();
     Status s = ctx->UpdateRemoteWorker(
         device_mgr, std::move(remote_eager_workers),
@@ -420,9 +418,6 @@ Status EagerServiceImpl::Enqueue(const EnqueueRequest* request,
   TF_RETURN_IF_ERROR(GetServerContext(request->context_id(), &context));
   core::ScopedUnref context_unref(context);
-  // Acquire shared lock to prevent handling enqueue requests while updating
-  // context (see UpdateContext).
-  tf_shared_lock lock(contexts_mu_);
   EagerExecutor& executor =
       stream_id == kInvalidStreamId
           ? context->Context()->Executor()
@@ -431,6 +426,9 @@ Status EagerServiceImpl::Enqueue(const EnqueueRequest* request,
   Status s;
   for (const auto& item : request->queue()) {
     auto* queue_response = response->add_queue_response();
+    // Acquire shared lock to prevent handling enqueue requests while updating
+    // context (see UpdateContext).
+    tf_shared_lock l(context_update_mu_);
     if (item.has_operation()) {
       s = ExecuteOp(item.operation(), context->Context(), &executor,
                     queue_response);

--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h

@@ -213,6 +213,12 @@ class EagerServiceImpl {
   mutex contexts_mu_;
   std::unordered_map<uint64, ServerContext*> contexts_ GUARDED_BY(contexts_mu_);
+  // Mutex to guard access to the EagerContexts stored in `contexts_`. Unlike
+  // `contexts_mu_`, which guards adding items to and removing items from the
+  // map, this mutex prevents concurrently reading and updating the state of
+  // an individual EagerContext inside the map.
+  mutex context_update_mu_;
   std::unique_ptr<Thread> gc_thread_;
   mutex gc_thread_shutdown_mu_;
   condition_variable gc_thread_cv_;
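
A side note on lock scope: the shared lock in Enqueue is now taken per queue item rather than for the whole request, so an exclusive update can interleave between items. A toy illustration under the same stand-in types as the sketch above (hypothetical names, not TF source):

#include <mutex>
#include <shared_mutex>
#include <vector>

struct Item { int op = 0; };
std::shared_mutex context_update_mu;  // stand-in for the new member

void ProcessItem(const Item& /*item*/) { /* execute or register one op */ }

void EnqueueAll(const std::vector<Item>& queue) {
  for (const Item& item : queue) {
    std::shared_lock<std::shared_mutex> l(context_update_mu);
    ProcessItem(item);  // this item sees one consistent context state
  }  // lock released; an exclusive UpdateContext may run before the next item
}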