diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc index 471660b857e..8e66e10cc3f 100644 --- a/tensorflow/core/common_runtime/function.cc +++ b/tensorflow/core/common_runtime/function.cc @@ -410,7 +410,8 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime { delete this->overlay_flr; } }; - std::unordered_map> items_ GUARDED_BY(mu_); + std::unique_ptr>> items_ + GUARDED_BY(mu_); ProcessFunctionLibraryRuntime* parent_ = nullptr; // not owned. @@ -460,6 +461,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl( ? ProcessFunctionLibraryRuntime::kDefaultFLRDevice : device_->name()), next_handle_(0), + items_(new std::unordered_map>), parent_(parent) { get_func_sig_ = [this](const string& op, const OpDef** sig) { return base_lib_def_->LookUpOpDef(op, sig); @@ -481,7 +483,16 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl( } } -FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {} +FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() { + // Deleting the items_ list will delete all the function handles registered in + // this object. A function may contains a few sub-functions which have also + // been registered in this object. Deleting the parent function will call + // ReleaseHandle in this class again for each of the sub-functions. These + // circular calls may cause segfault since the items_ may have already been + // partially deleted when releasing handles of sub-functions. Explicitly + // release items_ here and check it in ReleaseHandle to avoid this. + items_.reset(); +} // An asynchronous op kernel which executes an instantiated function // defined in a library. @@ -549,8 +560,8 @@ const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) { } tf_shared_lock l(mu_); - auto iter = items_.find(local_handle); - CHECK(iter != items_.end()); + auto iter = items_->find(local_handle); + CHECK(iter != items_->end()); return iter->second->func_graph; } @@ -727,8 +738,8 @@ Status FunctionLibraryRuntimeImpl::Instantiate( return errors::Internal("LocalHandle not found for handle ", *handle, "."); } - auto item_handle = items_.find(handle_on_device); - if (item_handle == items_.end()) { + auto item_handle = items_->find(handle_on_device); + if (item_handle == items_->end()) { return errors::Internal("LocalHandle ", handle_on_device, " for handle ", *handle, " not found in items."); @@ -769,7 +780,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate( *handle = parent_->GetHandle(key); if (*handle != kInvalidHandle) { local_handle = parent_->GetHandleOnDevice(device_name_, *handle); - ++items_[local_handle]->instantiation_counter; + ++(*items_)[local_handle]->instantiation_counter; } else { *handle = parent_->AddHandle(key, device_name_, next_handle_); Item* item = new Item; @@ -786,7 +797,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate( return Status::OK(); }; local_handle = next_handle_++; - items_.emplace(local_handle, std::unique_ptr(item)); + items_->emplace(local_handle, std::unique_ptr(item)); } } @@ -803,13 +814,15 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) { if (h == kInvalidLocalHandle) { return parent_->ReleaseHandle(handle); } - std::unique_ptr item_to_delete; Status parent_status; { mutex_lock l(mu_); - auto it = items_.find(h); - if (it == items_.end()) { + // Return directly if all items has already been released. + if (items_ == nullptr) return Status::OK(); + + auto it = items_->find(h); + if (it == items_->end()) { return errors::Internal( "Inconsistent FunctionLibraryRuntime. Expected to find an item for " "handle ", @@ -824,7 +837,7 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) { // CallOp or PartitionCallOp, their destructors will release cached // function handles, resulting in deadlock here. item_to_delete = std::move(item); - items_.erase(h); + items_->erase(h); parent_status = parent_->RemoveHandle(handle); } } @@ -955,8 +968,8 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(LocalHandle local_handle, Item** item) { { tf_shared_lock l(mu_); - auto iter = items_.find(local_handle); - if (iter == items_.end()) { + auto iter = items_->find(local_handle); + if (iter == items_->end()) { return errors::Internal("Local function handle ", local_handle, " is not valid. Likely an internal error."); } diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc index 0d104fcc8f5..42c5d59b400 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc @@ -71,16 +71,18 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime( device_mgr_(device_mgr), lib_def_(lib_def), default_thread_pool_(default_thread_pool), + flr_map_(new std::unordered_map>), next_handle_(0), parent_(parent) { if (device_mgr == nullptr) { - flr_map_[nullptr] = NewFunctionLibraryRuntime( + (*flr_map_)[nullptr] = NewFunctionLibraryRuntime( nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool, optimizer_options, custom_kernel_creator, this); return; } for (Device* d : device_mgr->ListDevices()) { - flr_map_[d] = NewFunctionLibraryRuntime( + (*flr_map_)[d] = NewFunctionLibraryRuntime( device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool, optimizer_options, custom_kernel_creator, this); } @@ -197,8 +199,8 @@ FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR( return nullptr; } } - const auto& iter = flr_map_.find(device); - if (iter == flr_map_.end()) { + const auto& iter = flr_map_->find(device); + if (iter == flr_map_->end()) { LOG(ERROR) << "Could not find device: " << device_name; return nullptr; } @@ -1037,6 +1039,9 @@ Status ProcessFunctionLibraryRuntime::ReleaseMultiDeviceHandle( Status ProcessFunctionLibraryRuntime::ReleaseHandle( FunctionLibraryRuntime::Handle handle) { + // Return directly if all function handles has already been released. + if (flr_map_ == nullptr) return Status::OK(); + if (IsMultiDevice(handle)) { return ReleaseMultiDeviceHandle(handle); } diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h index cef1e49e13a..8ca6f3b9221 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.h +++ b/tensorflow/core/common_runtime/process_function_library_runtime.h @@ -40,6 +40,15 @@ class ProcessFunctionLibraryRuntime { DistributedFunctionLibraryRuntime* parent = nullptr, const CustomKernelCreator* custom_kernel_creator = nullptr); + ~ProcessFunctionLibraryRuntime() { + // Deleting the FunctionLibraryRuntime map will delete the function handles + // registered in it, which may call ReleaseHandle in this class again to + // release their sub-function. These circular calls may casue segfault + // since the flr_map_ may has already been deleted. Explicitly releasing + // flr_map_ here and checking flr_map_ in ReleaseHandle to avoid this. + flr_map_.reset(); + } + // Sends `tensors_to_send` from `source_device` to `target_device` using // `rendezvous`. `key_prefix` is used as a prefix for the keys sent to the // Rendezvous. `device_context` should be the DeviceContext of the device @@ -346,7 +355,9 @@ class ProcessFunctionLibraryRuntime { std::unique_ptr> mdevice_data_ GUARDED_BY(mu_); - std::unordered_map> flr_map_; + std::unique_ptr< + std::unordered_map>> + flr_map_; int next_handle_ GUARDED_BY(mu_); DistributedFunctionLibraryRuntime* const parent_; };