Fix a segfault when deleting FLR.
When deleting FLR, it will delete all the items in it. Then item will try to release itself from the same FLR which causes segfault. This change adds an 'active_' field to avoid releasing handle from an already deleted FLR. PiperOrigin-RevId: 256427569
This commit is contained in:
parent
73ed7102b7
commit
3b239555fe
@ -410,7 +410,8 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
|
||||
delete this->overlay_flr;
|
||||
}
|
||||
};
|
||||
std::unordered_map<Handle, std::unique_ptr<Item>> items_ GUARDED_BY(mu_);
|
||||
std::unique_ptr<std::unordered_map<Handle, std::unique_ptr<Item>>> items_
|
||||
GUARDED_BY(mu_);
|
||||
|
||||
ProcessFunctionLibraryRuntime* parent_ = nullptr; // not owned.
|
||||
|
||||
@ -460,6 +461,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
|
||||
? ProcessFunctionLibraryRuntime::kDefaultFLRDevice
|
||||
: device_->name()),
|
||||
next_handle_(0),
|
||||
items_(new std::unordered_map<Handle, std::unique_ptr<Item>>),
|
||||
parent_(parent) {
|
||||
get_func_sig_ = [this](const string& op, const OpDef** sig) {
|
||||
return base_lib_def_->LookUpOpDef(op, sig);
|
||||
@ -481,7 +483,16 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
|
||||
}
|
||||
}
|
||||
|
||||
FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {}
|
||||
FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {
|
||||
// Deleting the items_ list will delete all the function handles registered in
|
||||
// this object. A function may contains a few sub-functions which have also
|
||||
// been registered in this object. Deleting the parent function will call
|
||||
// ReleaseHandle in this class again for each of the sub-functions. These
|
||||
// circular calls may cause segfault since the items_ may have already been
|
||||
// partially deleted when releasing handles of sub-functions. Explicitly
|
||||
// release items_ here and check it in ReleaseHandle to avoid this.
|
||||
items_.reset();
|
||||
}
|
||||
|
||||
// An asynchronous op kernel which executes an instantiated function
|
||||
// defined in a library.
|
||||
@ -549,8 +560,8 @@ const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) {
|
||||
}
|
||||
|
||||
tf_shared_lock l(mu_);
|
||||
auto iter = items_.find(local_handle);
|
||||
CHECK(iter != items_.end());
|
||||
auto iter = items_->find(local_handle);
|
||||
CHECK(iter != items_->end());
|
||||
return iter->second->func_graph;
|
||||
}
|
||||
|
||||
@ -727,8 +738,8 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
|
||||
return errors::Internal("LocalHandle not found for handle ", *handle,
|
||||
".");
|
||||
}
|
||||
auto item_handle = items_.find(handle_on_device);
|
||||
if (item_handle == items_.end()) {
|
||||
auto item_handle = items_->find(handle_on_device);
|
||||
if (item_handle == items_->end()) {
|
||||
return errors::Internal("LocalHandle ", handle_on_device,
|
||||
" for handle ", *handle,
|
||||
" not found in items.");
|
||||
@ -769,7 +780,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
|
||||
*handle = parent_->GetHandle(key);
|
||||
if (*handle != kInvalidHandle) {
|
||||
local_handle = parent_->GetHandleOnDevice(device_name_, *handle);
|
||||
++items_[local_handle]->instantiation_counter;
|
||||
++(*items_)[local_handle]->instantiation_counter;
|
||||
} else {
|
||||
*handle = parent_->AddHandle(key, device_name_, next_handle_);
|
||||
Item* item = new Item;
|
||||
@ -786,7 +797,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
|
||||
return Status::OK();
|
||||
};
|
||||
local_handle = next_handle_++;
|
||||
items_.emplace(local_handle, std::unique_ptr<Item>(item));
|
||||
items_->emplace(local_handle, std::unique_ptr<Item>(item));
|
||||
}
|
||||
}
|
||||
|
||||
@ -803,13 +814,15 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
|
||||
if (h == kInvalidLocalHandle) {
|
||||
return parent_->ReleaseHandle(handle);
|
||||
}
|
||||
|
||||
std::unique_ptr<Item> item_to_delete;
|
||||
Status parent_status;
|
||||
{
|
||||
mutex_lock l(mu_);
|
||||
auto it = items_.find(h);
|
||||
if (it == items_.end()) {
|
||||
// Return directly if all items has already been released.
|
||||
if (items_ == nullptr) return Status::OK();
|
||||
|
||||
auto it = items_->find(h);
|
||||
if (it == items_->end()) {
|
||||
return errors::Internal(
|
||||
"Inconsistent FunctionLibraryRuntime. Expected to find an item for "
|
||||
"handle ",
|
||||
@ -824,7 +837,7 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
|
||||
// CallOp or PartitionCallOp, their destructors will release cached
|
||||
// function handles, resulting in deadlock here.
|
||||
item_to_delete = std::move(item);
|
||||
items_.erase(h);
|
||||
items_->erase(h);
|
||||
parent_status = parent_->RemoveHandle(handle);
|
||||
}
|
||||
}
|
||||
@ -955,8 +968,8 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(LocalHandle local_handle,
|
||||
Item** item) {
|
||||
{
|
||||
tf_shared_lock l(mu_);
|
||||
auto iter = items_.find(local_handle);
|
||||
if (iter == items_.end()) {
|
||||
auto iter = items_->find(local_handle);
|
||||
if (iter == items_->end()) {
|
||||
return errors::Internal("Local function handle ", local_handle,
|
||||
" is not valid. Likely an internal error.");
|
||||
}
|
||||
|
@ -71,16 +71,18 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
|
||||
device_mgr_(device_mgr),
|
||||
lib_def_(lib_def),
|
||||
default_thread_pool_(default_thread_pool),
|
||||
flr_map_(new std::unordered_map<Device*,
|
||||
std::unique_ptr<FunctionLibraryRuntime>>),
|
||||
next_handle_(0),
|
||||
parent_(parent) {
|
||||
if (device_mgr == nullptr) {
|
||||
flr_map_[nullptr] = NewFunctionLibraryRuntime(
|
||||
(*flr_map_)[nullptr] = NewFunctionLibraryRuntime(
|
||||
nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool,
|
||||
optimizer_options, custom_kernel_creator, this);
|
||||
return;
|
||||
}
|
||||
for (Device* d : device_mgr->ListDevices()) {
|
||||
flr_map_[d] = NewFunctionLibraryRuntime(
|
||||
(*flr_map_)[d] = NewFunctionLibraryRuntime(
|
||||
device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool,
|
||||
optimizer_options, custom_kernel_creator, this);
|
||||
}
|
||||
@ -197,8 +199,8 @@ FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR(
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
const auto& iter = flr_map_.find(device);
|
||||
if (iter == flr_map_.end()) {
|
||||
const auto& iter = flr_map_->find(device);
|
||||
if (iter == flr_map_->end()) {
|
||||
LOG(ERROR) << "Could not find device: " << device_name;
|
||||
return nullptr;
|
||||
}
|
||||
@ -1037,6 +1039,9 @@ Status ProcessFunctionLibraryRuntime::ReleaseMultiDeviceHandle(
|
||||
|
||||
Status ProcessFunctionLibraryRuntime::ReleaseHandle(
|
||||
FunctionLibraryRuntime::Handle handle) {
|
||||
// Return directly if all function handles has already been released.
|
||||
if (flr_map_ == nullptr) return Status::OK();
|
||||
|
||||
if (IsMultiDevice(handle)) {
|
||||
return ReleaseMultiDeviceHandle(handle);
|
||||
}
|
||||
|
@ -40,6 +40,15 @@ class ProcessFunctionLibraryRuntime {
|
||||
DistributedFunctionLibraryRuntime* parent = nullptr,
|
||||
const CustomKernelCreator* custom_kernel_creator = nullptr);
|
||||
|
||||
~ProcessFunctionLibraryRuntime() {
|
||||
// Deleting the FunctionLibraryRuntime map will delete the function handles
|
||||
// registered in it, which may call ReleaseHandle in this class again to
|
||||
// release their sub-function. These circular calls may casue segfault
|
||||
// since the flr_map_ may has already been deleted. Explicitly releasing
|
||||
// flr_map_ here and checking flr_map_ in ReleaseHandle to avoid this.
|
||||
flr_map_.reset();
|
||||
}
|
||||
|
||||
// Sends `tensors_to_send` from `source_device` to `target_device` using
|
||||
// `rendezvous`. `key_prefix` is used as a prefix for the keys sent to the
|
||||
// Rendezvous. `device_context` should be the DeviceContext of the device
|
||||
@ -346,7 +355,9 @@ class ProcessFunctionLibraryRuntime {
|
||||
std::unique_ptr<MultiDeviceFunctionData>>
|
||||
mdevice_data_ GUARDED_BY(mu_);
|
||||
|
||||
std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>> flr_map_;
|
||||
std::unique_ptr<
|
||||
std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>>>
|
||||
flr_map_;
|
||||
int next_handle_ GUARDED_BY(mu_);
|
||||
DistributedFunctionLibraryRuntime* const parent_;
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user