Remove mutex lock from the ProcessState::GetCPUAllocator.

PiperOrigin-RevId: 322064244
Change-Id: I87f4abd2a8d578bb6c0c2d9ef84a2fc9e552d8cc
This commit is contained in:
Eugene Zhulenev 2020-07-19 18:44:03 -07:00 committed by TensorFlower Gardener
parent 19a8ce8888
commit 304bfa2fb4
2 changed files with 20 additions and 1 deletion

View File

@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/process_state.h"
#include <atomic>
#include <cstring>
#include <vector>
@@ -42,7 +43,8 @@ namespace tensorflow {
return instance;
}
ProcessState::ProcessState() : numa_enabled_(false) {}
ProcessState::ProcessState()
: numa_enabled_(false), cpu_allocators_cached_(0) {}
string ProcessState::MemDesc::DebugString() {
return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index,
@@ -61,6 +63,12 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
Allocator* ProcessState::GetCPUAllocator(int numa_node) {
if (!numa_enabled_ || numa_node == port::kNUMANoAffinity) numa_node = 0;
// Check if allocator for the numa node is in lock-free cache.
if (numa_node < cpu_allocators_cached_.load(std::memory_order_acquire)) {
return cpu_allocators_cache_[numa_node];
}
mutex_lock lock(mu_);
while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
// If visitors have been defined we need an Allocator built from
@@ -115,6 +123,10 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
allocator = new TrackingAllocator(allocator, true);
}
cpu_allocators_.push_back(allocator);
if (cpu_allocators_.size() < cpu_allocators_cache_.max_size()) {
cpu_allocators_cache_[cpu_allocators_.size() - 1] = allocator;
cpu_allocators_cached_.fetch_add(1, std::memory_order_release);
}
if (!sub_allocator) {
DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty());
}

View File

@@ -102,6 +102,13 @@ class ProcessState : public ProcessStateInterface {
std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ TF_GUARDED_BY(mu_);
std::vector<SubAllocator::Visitor> cpu_free_visitors_ TF_GUARDED_BY(mu_);
// A cache of cpu allocators indexed by a numa node. Used as a fast path to
// get CPU allocator by numa node id without locking the mutex. We can't use
// `cpu_allocators_` storage in the lock-free path because concurrent
// operation can deallocate the vector storage.
std::atomic<int> cpu_allocators_cached_;
std::array<Allocator*, 8> cpu_allocators_cache_;
// Optional RecordingAllocators that wrap the corresponding
// Allocators for runtime attribute use analysis.
MDMap mem_desc_map_;