diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc
index 19f7a985f3e..300e5b9c6ea 100644
--- a/tensorflow/core/common_runtime/process_state.cc
+++ b/tensorflow/core/common_runtime/process_state.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/process_state.h"
 
+#include <atomic>
 #include <cstring>
 #include <vector>
 
@@ -42,7 +43,8 @@ namespace tensorflow {
   return instance;
 }
 
-ProcessState::ProcessState() : numa_enabled_(false) {}
+ProcessState::ProcessState()
+    : numa_enabled_(false), cpu_allocators_cached_(0) {}
 
 string ProcessState::MemDesc::DebugString() {
   return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index,
@@ -61,6 +63,12 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
 
 Allocator* ProcessState::GetCPUAllocator(int numa_node) {
   if (!numa_enabled_ || numa_node == port::kNUMANoAffinity) numa_node = 0;
+
+  // Check if allocator for the numa node is in lock-free cache.
+  if (numa_node < cpu_allocators_cached_.load(std::memory_order_acquire)) {
+    return cpu_allocators_cache_[numa_node];
+  }
+
   mutex_lock lock(mu_);
   while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
     // If visitors have been defined we need an Allocator built from
@@ -115,6 +123,10 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
       allocator = new TrackingAllocator(allocator, true);
     }
     cpu_allocators_.push_back(allocator);
+    if (cpu_allocators_.size() < cpu_allocators_cache_.max_size()) {
+      cpu_allocators_cache_[cpu_allocators_.size() - 1] = allocator;
+      cpu_allocators_cached_.fetch_add(1, std::memory_order_release);
+    }
     if (!sub_allocator) {
       DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty());
     }
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index a833c22db1c..92dd680ca1a 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -102,6 +102,13 @@ class ProcessState : public ProcessStateInterface {
   std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ TF_GUARDED_BY(mu_);
   std::vector<SubAllocator::Visitor> cpu_free_visitors_ TF_GUARDED_BY(mu_);
 
+  // A cache of cpu allocators indexed by a numa node. Used as a fast path to
+  // get CPU allocator by numa node id without locking the mutex. We can't use
+  // `cpu_allocators_` storage in the lock-free path because concurrent
+  // operation can deallocate the vector storage.
+  std::atomic<int> cpu_allocators_cached_;
+  std::array<Allocator*, 8> cpu_allocators_cache_;
+
   // Optional RecordingAllocators that wrap the corresponding
   // Allocators for runtime attribute use analysis.
   MDMap mem_desc_map_;