Remove mutex lock from the ProcessState::GetCPUAllocator.

PiperOrigin-RevId: 322064244
Change-Id: I87f4abd2a8d578bb6c0c2d9ef84a2fc9e552d8cc
This commit is contained in:
Eugene Zhulenev 2020-07-19 18:44:03 -07:00 committed by TensorFlower Gardener
parent 19a8ce8888
commit 304bfa2fb4
2 changed files with 20 additions and 1 deletion

View File

@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/process_state.h"
#include <atomic>
#include <cstring>
#include <vector>
@@ -42,7 +43,8 @@ namespace tensorflow {
return instance;
}
ProcessState::ProcessState() : numa_enabled_(false) {}
ProcessState::ProcessState()
: numa_enabled_(false), cpu_allocators_cached_(0) {}
string ProcessState::MemDesc::DebugString() {
return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index,
@@ -61,6 +63,12 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
Allocator* ProcessState::GetCPUAllocator(int numa_node) {
if (!numa_enabled_ || numa_node == port::kNUMANoAffinity) numa_node = 0;
// Check if allocator for the numa node is in lock-free cache.
if (numa_node < cpu_allocators_cached_.load(std::memory_order_acquire)) {
return cpu_allocators_cache_[numa_node];
}
mutex_lock lock(mu_);
while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
// If visitors have been defined we need an Allocator built from
@@ -115,6 +123,10 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
allocator = new TrackingAllocator(allocator, true);
}
cpu_allocators_.push_back(allocator);
if (cpu_allocators_.size() < cpu_allocators_cache_.max_size()) {
cpu_allocators_cache_[cpu_allocators_.size() - 1] = allocator;
cpu_allocators_cached_.fetch_add(1, std::memory_order_release);
}
if (!sub_allocator) {
DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty());
}

View File

@@ -102,6 +102,13 @@ class ProcessState : public ProcessStateInterface {
std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ TF_GUARDED_BY(mu_);
std::vector<SubAllocator::Visitor> cpu_free_visitors_ TF_GUARDED_BY(mu_);
// A cache of cpu allocators indexed by a numa node. Used as a fast path to
// get CPU allocator by numa node id without locking the mutex. We can't use
// `cpu_allocators_` storage in the lock-free path because concurrent
// operation can deallocate the vector storage.
std::atomic<int> cpu_allocators_cached_;
std::array<Allocator*, 8> cpu_allocators_cache_;
// Optional RecordingAllocators that wrap the corresponding
// Allocators for runtime attribute use analysis.
MDMap mem_desc_map_;