Fix a segfault when deleting FLR.

When deleting FLR, it will delete all the items in it. Then item will try to release itself from the same FLR which causes segfault. This change adds an 'active_' field to avoid releasing handle from an already deleted FLR.

PiperOrigin-RevId: 256427569
This commit is contained in:
Xiao Yu 2019-07-03 13:24:16 -07:00 committed by TensorFlower Gardener
parent 73ed7102b7
commit 3b239555fe
3 changed files with 48 additions and 19 deletions

View File

@ -410,7 +410,8 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
delete this->overlay_flr; delete this->overlay_flr;
} }
}; };
std::unordered_map<Handle, std::unique_ptr<Item>> items_ GUARDED_BY(mu_); std::unique_ptr<std::unordered_map<Handle, std::unique_ptr<Item>>> items_
GUARDED_BY(mu_);
ProcessFunctionLibraryRuntime* parent_ = nullptr; // not owned. ProcessFunctionLibraryRuntime* parent_ = nullptr; // not owned.
@ -460,6 +461,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
? ProcessFunctionLibraryRuntime::kDefaultFLRDevice ? ProcessFunctionLibraryRuntime::kDefaultFLRDevice
: device_->name()), : device_->name()),
next_handle_(0), next_handle_(0),
items_(new std::unordered_map<Handle, std::unique_ptr<Item>>),
parent_(parent) { parent_(parent) {
get_func_sig_ = [this](const string& op, const OpDef** sig) { get_func_sig_ = [this](const string& op, const OpDef** sig) {
return base_lib_def_->LookUpOpDef(op, sig); return base_lib_def_->LookUpOpDef(op, sig);
@ -481,7 +483,16 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
} }
} }
FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {} FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {
// Deleting the items_ list will delete all the function handles registered in
// this object. A function may contains a few sub-functions which have also
// been registered in this object. Deleting the parent function will call
// ReleaseHandle in this class again for each of the sub-functions. These
// circular calls may cause segfault since the items_ may have already been
// partially deleted when releasing handles of sub-functions. Explicitly
// release items_ here and check it in ReleaseHandle to avoid this.
items_.reset();
}
// An asynchronous op kernel which executes an instantiated function // An asynchronous op kernel which executes an instantiated function
// defined in a library. // defined in a library.
@ -549,8 +560,8 @@ const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) {
} }
tf_shared_lock l(mu_); tf_shared_lock l(mu_);
auto iter = items_.find(local_handle); auto iter = items_->find(local_handle);
CHECK(iter != items_.end()); CHECK(iter != items_->end());
return iter->second->func_graph; return iter->second->func_graph;
} }
@ -727,8 +738,8 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
return errors::Internal("LocalHandle not found for handle ", *handle, return errors::Internal("LocalHandle not found for handle ", *handle,
"."); ".");
} }
auto item_handle = items_.find(handle_on_device); auto item_handle = items_->find(handle_on_device);
if (item_handle == items_.end()) { if (item_handle == items_->end()) {
return errors::Internal("LocalHandle ", handle_on_device, return errors::Internal("LocalHandle ", handle_on_device,
" for handle ", *handle, " for handle ", *handle,
" not found in items."); " not found in items.");
@ -769,7 +780,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
*handle = parent_->GetHandle(key); *handle = parent_->GetHandle(key);
if (*handle != kInvalidHandle) { if (*handle != kInvalidHandle) {
local_handle = parent_->GetHandleOnDevice(device_name_, *handle); local_handle = parent_->GetHandleOnDevice(device_name_, *handle);
++items_[local_handle]->instantiation_counter; ++(*items_)[local_handle]->instantiation_counter;
} else { } else {
*handle = parent_->AddHandle(key, device_name_, next_handle_); *handle = parent_->AddHandle(key, device_name_, next_handle_);
Item* item = new Item; Item* item = new Item;
@ -786,7 +797,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
return Status::OK(); return Status::OK();
}; };
local_handle = next_handle_++; local_handle = next_handle_++;
items_.emplace(local_handle, std::unique_ptr<Item>(item)); items_->emplace(local_handle, std::unique_ptr<Item>(item));
} }
} }
@ -803,13 +814,15 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
if (h == kInvalidLocalHandle) { if (h == kInvalidLocalHandle) {
return parent_->ReleaseHandle(handle); return parent_->ReleaseHandle(handle);
} }
std::unique_ptr<Item> item_to_delete; std::unique_ptr<Item> item_to_delete;
Status parent_status; Status parent_status;
{ {
mutex_lock l(mu_); mutex_lock l(mu_);
auto it = items_.find(h); // Return directly if all items has already been released.
if (it == items_.end()) { if (items_ == nullptr) return Status::OK();
auto it = items_->find(h);
if (it == items_->end()) {
return errors::Internal( return errors::Internal(
"Inconsistent FunctionLibraryRuntime. Expected to find an item for " "Inconsistent FunctionLibraryRuntime. Expected to find an item for "
"handle ", "handle ",
@ -824,7 +837,7 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
// CallOp or PartitionCallOp, their destructors will release cached // CallOp or PartitionCallOp, their destructors will release cached
// function handles, resulting in deadlock here. // function handles, resulting in deadlock here.
item_to_delete = std::move(item); item_to_delete = std::move(item);
items_.erase(h); items_->erase(h);
parent_status = parent_->RemoveHandle(handle); parent_status = parent_->RemoveHandle(handle);
} }
} }
@ -955,8 +968,8 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(LocalHandle local_handle,
Item** item) { Item** item) {
{ {
tf_shared_lock l(mu_); tf_shared_lock l(mu_);
auto iter = items_.find(local_handle); auto iter = items_->find(local_handle);
if (iter == items_.end()) { if (iter == items_->end()) {
return errors::Internal("Local function handle ", local_handle, return errors::Internal("Local function handle ", local_handle,
" is not valid. Likely an internal error."); " is not valid. Likely an internal error.");
} }

View File

@ -71,16 +71,18 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
device_mgr_(device_mgr), device_mgr_(device_mgr),
lib_def_(lib_def), lib_def_(lib_def),
default_thread_pool_(default_thread_pool), default_thread_pool_(default_thread_pool),
flr_map_(new std::unordered_map<Device*,
std::unique_ptr<FunctionLibraryRuntime>>),
next_handle_(0), next_handle_(0),
parent_(parent) { parent_(parent) {
if (device_mgr == nullptr) { if (device_mgr == nullptr) {
flr_map_[nullptr] = NewFunctionLibraryRuntime( (*flr_map_)[nullptr] = NewFunctionLibraryRuntime(
nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool, nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool,
optimizer_options, custom_kernel_creator, this); optimizer_options, custom_kernel_creator, this);
return; return;
} }
for (Device* d : device_mgr->ListDevices()) { for (Device* d : device_mgr->ListDevices()) {
flr_map_[d] = NewFunctionLibraryRuntime( (*flr_map_)[d] = NewFunctionLibraryRuntime(
device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool, device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool,
optimizer_options, custom_kernel_creator, this); optimizer_options, custom_kernel_creator, this);
} }
@ -197,8 +199,8 @@ FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR(
return nullptr; return nullptr;
} }
} }
const auto& iter = flr_map_.find(device); const auto& iter = flr_map_->find(device);
if (iter == flr_map_.end()) { if (iter == flr_map_->end()) {
LOG(ERROR) << "Could not find device: " << device_name; LOG(ERROR) << "Could not find device: " << device_name;
return nullptr; return nullptr;
} }
@ -1037,6 +1039,9 @@ Status ProcessFunctionLibraryRuntime::ReleaseMultiDeviceHandle(
Status ProcessFunctionLibraryRuntime::ReleaseHandle( Status ProcessFunctionLibraryRuntime::ReleaseHandle(
FunctionLibraryRuntime::Handle handle) { FunctionLibraryRuntime::Handle handle) {
// Return directly if all function handles has already been released.
if (flr_map_ == nullptr) return Status::OK();
if (IsMultiDevice(handle)) { if (IsMultiDevice(handle)) {
return ReleaseMultiDeviceHandle(handle); return ReleaseMultiDeviceHandle(handle);
} }

View File

@ -40,6 +40,15 @@ class ProcessFunctionLibraryRuntime {
DistributedFunctionLibraryRuntime* parent = nullptr, DistributedFunctionLibraryRuntime* parent = nullptr,
const CustomKernelCreator* custom_kernel_creator = nullptr); const CustomKernelCreator* custom_kernel_creator = nullptr);
~ProcessFunctionLibraryRuntime() {
// Deleting the FunctionLibraryRuntime map will delete the function handles
// registered in it, which may call ReleaseHandle in this class again to
// release their sub-function. These circular calls may casue segfault
// since the flr_map_ may has already been deleted. Explicitly releasing
// flr_map_ here and checking flr_map_ in ReleaseHandle to avoid this.
flr_map_.reset();
}
// Sends `tensors_to_send` from `source_device` to `target_device` using // Sends `tensors_to_send` from `source_device` to `target_device` using
// `rendezvous`. `key_prefix` is used as a prefix for the keys sent to the // `rendezvous`. `key_prefix` is used as a prefix for the keys sent to the
// Rendezvous. `device_context` should be the DeviceContext of the device // Rendezvous. `device_context` should be the DeviceContext of the device
@ -346,7 +355,9 @@ class ProcessFunctionLibraryRuntime {
std::unique_ptr<MultiDeviceFunctionData>> std::unique_ptr<MultiDeviceFunctionData>>
mdevice_data_ GUARDED_BY(mu_); mdevice_data_ GUARDED_BY(mu_);
std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>> flr_map_; std::unique_ptr<
std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>>>
flr_map_;
int next_handle_ GUARDED_BY(mu_); int next_handle_ GUARDED_BY(mu_);
DistributedFunctionLibraryRuntime* const parent_; DistributedFunctionLibraryRuntime* const parent_;
}; };