From 4dfc9584461ac5a0f83a1c7bb2ef564c08ac2204 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Wed, 13 Jan 2021 15:42:40 -0800
Subject: [PATCH] Support coalescing of adjacent allocation regions in BFC
 allocator.

When "allow_growth" is enabled, this may reduce fragmentation by making
larger contiguous regions available to the allocator if two regions happen
to be allocated contiguously.

Coalescing is disabled for suballocator types where memory can't be treated
as contiguous (i.e. page-locked memory, or cuMemAlloc memory).

PiperOrigin-RevId: 351680385
Change-Id: I1ddf1022a72e2fd4703a748c3a66305c911b2f26
---
 .../core/common_runtime/bfc_allocator.cc     | 91 ++++++++++++-------
 .../core/common_runtime/bfc_allocator.h      | 58 ++++++++++--
 .../device/device_host_allocator.h           |  2 +
 .../device/device_mem_allocator.h            |  2 +
 .../common_runtime/gpu/gpu_process_state.cc  |  2 +-
 .../gpu/gpu_virtual_mem_allocator.h          |  2 +
 .../core/common_runtime/pool_allocator.h     |  2 +
 tensorflow/core/framework/allocator.h        |  4 +
 .../core/framework/cpu_allocator_impl.cc     |  2 +
 9 files changed, 123 insertions(+), 42 deletions(-)
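The condition the patch relies on can be stated in one line: a later sub-allocation can be folded into an earlier region only when it starts exactly at that region's end pointer. The stand-alone sketch below is not part of the patch; Region and CanCoalesce are illustrative names for what AllocationRegion and RegionManager do in the real code.

// Standalone illustration of the coalescing condition used by this patch.
#include <cstddef>
#include <iostream>

struct Region {
  char* ptr;     // start of the region
  size_t size;   // size in bytes
  char* end() const { return ptr + size; }
};

// True only if `next` begins exactly where `prev` ends, i.e. the two
// sub-allocations form one contiguous range and may be treated as one region.
bool CanCoalesce(const Region& prev, const Region& next) {
  return prev.end() == next.ptr;
}

int main() {
  char backing[256];
  Region a{backing, 128};
  Region b{backing + 128, 128};  // starts at a.end(): contiguous
  Region c{backing + 200, 56};   // leaves a gap: not contiguous
  std::cout << CanCoalesce(a, b) << "\n";  // prints 1
  std::cout << CanCoalesce(a, c) << "\n";  // prints 0
}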
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 06af68bdb65..c68093d863b 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -42,6 +42,7 @@ BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                            bool allow_growth, const string& name,
                            bool garbage_collection)
     : garbage_collection_(garbage_collection),
+      coalesce_regions_(sub_allocator->SupportsCoalescing()),
       sub_allocator_(sub_allocator),
       name_(name),
       free_chunks_list_(kInvalidChunkHandle),
@@ -150,23 +151,30 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
     curr_region_allocation_bytes_ *= 2;
   }
 
-  VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes)
-          << " bytes.";
+  VLOG(1) << "Extending allocation by "
+          << strings::HumanReadableNumBytes(bytes_received) << " bytes.";
 
-  total_region_allocated_bytes_ += bytes;
+  total_region_allocated_bytes_ += bytes_received;
   VLOG(1) << "Total allocated bytes: "
           << strings::HumanReadableNumBytes(total_region_allocated_bytes_);
 
   VLOG(1) << "Allocated memory at " << mem_addr << " to "
-          << static_cast<void*>(static_cast<char*>(mem_addr) + bytes);
-  region_manager_.AddAllocationRegion(mem_addr, bytes_received);
+          << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);
+
+  AllocationRegion* maybe_extended_region = nullptr;
+  if (coalesce_regions_) {
+    maybe_extended_region =
+        region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
+  } else {
+    region_manager_.AddAllocationRegion(mem_addr, bytes_received);
+  }
 
   // Create one large chunk for the whole memory space that will
   // be chunked later.
   ChunkHandle h = AllocateChunk();
   BFCAllocator::Chunk* c = ChunkFromHandle(h);
   c->ptr = mem_addr;
-  c->size = bytes;
+  c->size = bytes_received;
   c->allocation_id = -1;
   c->prev = kInvalidChunkHandle;
   c->next = kInvalidChunkHandle;
@@ -174,8 +182,23 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
 
   region_manager_.set_handle(c->ptr, h);
 
-  // Insert the chunk into the right bin.
-  InsertFreeChunkIntoBin(h);
+  // If the region was extended, then there exists a previous chunk that should
+  // be linked to the new chunk.
+  if (maybe_extended_region != nullptr) {
+    ChunkHandle prev =
+        maybe_extended_region->get_handle(maybe_extended_region->ptr());
+    BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
+    // Find the last recorded chunk in the extended region.
+    while (prev_chunk->next != kInvalidChunkHandle) {
+      prev = prev_chunk->next;
+      prev_chunk = ChunkFromHandle(prev);
+    }
+    c->prev = prev;
+    prev_chunk->next = h;
+  }
+
+  // Maybe merge adjacent chunks and insert the chunk into the right bin.
+  InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));
 
   return true;
 }
@@ -469,32 +492,32 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
                               const void* chunk_ptr, int64 req_bytes,
                               int64 alloc_bytes) {
   tensorflow::profiler::TraceMe::InstantActivity(
-      [this, traceme_name, chunk_ptr, req_bytes,
-       alloc_bytes]() TF_NO_THREAD_SAFETY_ANALYSIS {
-        int64 bytes_available =
-            memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
-        const auto& annotation =
-            ScopedMemoryDebugAnnotation::CurrentAnnotation();
-        std::string tensor_shape;
-        if (annotation.pending_shape) {
-          tensor_shape = annotation.pending_shape->DebugString();
-        }
-        return tensorflow::profiler::TraceMeEncode(
-            traceme_name, {{"allocator_name", name_},
-                           {"bytes_reserved", stats_.bytes_reserved},
-                           {"bytes_allocated", stats_.bytes_in_use},
-                           {"bytes_available", bytes_available},
-                           {"fragmentation", GetFragmentation()},
-                           {"peak_bytes_in_use", stats_.peak_bytes_in_use},
-                           {"requested_bytes", req_bytes},
-                           {"allocation_bytes", alloc_bytes},
-                           {"addr", reinterpret_cast<uint64>(chunk_ptr)},
-                           {"tf_op", annotation.pending_op_name},
-                           {"id", annotation.pending_step_id},
-                           {"region_type", annotation.pending_region_type},
-                           {"data_type", annotation.pending_data_type},
-                           {"shape", tensor_shape}});
-      },
+      [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
+          TF_NO_THREAD_SAFETY_ANALYSIS {
+            int64 bytes_available =
+                memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
+            const auto& annotation =
+                ScopedMemoryDebugAnnotation::CurrentAnnotation();
+            std::string tensor_shape;
+            if (annotation.pending_shape) {
+              tensor_shape = annotation.pending_shape->DebugString();
+            }
+            return tensorflow::profiler::TraceMeEncode(
+                traceme_name, {{"allocator_name", name_},
+                               {"bytes_reserved", stats_.bytes_reserved},
+                               {"bytes_allocated", stats_.bytes_in_use},
+                               {"bytes_available", bytes_available},
+                               {"fragmentation", GetFragmentation()},
+                               {"peak_bytes_in_use", stats_.peak_bytes_in_use},
+                               {"requested_bytes", req_bytes},
+                               {"allocation_bytes", alloc_bytes},
+                               {"addr", reinterpret_cast<uint64>(chunk_ptr)},
+                               {"tf_op", annotation.pending_op_name},
+                               {"id", annotation.pending_step_id},
+                               {"region_type", annotation.pending_region_type},
+                               {"data_type", annotation.pending_data_type},
+                               {"shape", tensor_shape}});
+          },
       /*level=*/profiler::TraceMeLevel::kInfo);
 }
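The new block in Extend() above walks to the last chunk already recorded for the extended region and appends the chunk created for the fresh memory, so that TryToCoalesce() can then merge it with a trailing free neighbour before binning. A stripped-down sketch of that walk follows; Chunk, the index-based handle values and LinkNewChunkAtEnd are hypothetical stand-ins for the allocator's real chunk table, not TensorFlow code.

// Simplified sketch of the linking step Extend() performs when a region
// was grown in place.
#include <vector>

struct Chunk {
  int prev = -1;  // index of the neighbouring chunk before this one, -1 if none
  int next = -1;  // index of the neighbouring chunk after this one, -1 if none
};

// Walks from the region's first chunk to its last chunk and appends the
// chunk newly created for the extension, mirroring the while-loop added in
// BFCAllocator::Extend().
void LinkNewChunkAtEnd(std::vector<Chunk>& chunks, int first_chunk,
                       int new_chunk) {
  int last = first_chunk;
  while (chunks[last].next != -1) {
    last = chunks[last].next;
  }
  chunks[new_chunk].prev = last;
  chunks[last].next = new_chunk;
  // A real allocator would now try to merge the new chunk with `last` if
  // `last` is free (cf. TryToCoalesce) before inserting it into a bin.
}

int main() {
  std::vector<Chunk> chunks(3);
  chunks[0].next = 1;  // the region originally holds chunks 0 -> 1
  chunks[1].prev = 0;
  LinkNewChunkAtEnd(chunks, /*first_chunk=*/0, /*new_chunk=*/2);
  // chunks now form the neighbour list 0 -> 1 -> 2
}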
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index f79a6048bbb..aea1331b196 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -276,10 +276,7 @@ class BFCAllocator : public Allocator {
       DCHECK_EQ(0, memory_size % kMinAllocationSize);
       const size_t n_handles =
           (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
-      handles_.reset(new ChunkHandle[n_handles]);
-      for (size_t i = 0; i < n_handles; i++) {
-        handles_[i] = kInvalidChunkHandle;
-      }
+      handles_.resize(n_handles, kInvalidChunkHandle);
     }
 
     AllocationRegion() = default;
@@ -292,6 +289,15 @@ class BFCAllocator : public Allocator {
     void* ptr() const { return ptr_; }
     void* end_ptr() const { return end_ptr_; }
     size_t memory_size() const { return memory_size_; }
+    void extend(size_t size) {
+      memory_size_ += size;
+      DCHECK_EQ(0, memory_size_ % kMinAllocationSize);
+
+      end_ptr_ = static_cast<void*>(static_cast<char*>(end_ptr_) + size);
+      const size_t n_handles =
+          (memory_size_ + kMinAllocationSize - 1) / kMinAllocationSize;
+      handles_.resize(n_handles, kInvalidChunkHandle);
+    }
     ChunkHandle get_handle(const void* p) const {
       return handles_[IndexFor(p)];
     }
@@ -322,7 +328,7 @@ class BFCAllocator : public Allocator {
     // Array of size "memory_size / kMinAllocationSize". It is
     // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
     // for the memory allocation represented by "p"
-    std::unique_ptr<ChunkHandle[]> handles_;
+    std::vector<ChunkHandle> handles_;
 
     TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
   };
@@ -338,12 +344,43 @@ class BFCAllocator : public Allocator {
     ~RegionManager() {}
 
     void AddAllocationRegion(void* ptr, size_t memory_size) {
-      // Insert sorted by end_ptr
+      // Insert sorted by end_ptr.
       auto entry =
           std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
       regions_.insert(entry, AllocationRegion(ptr, memory_size));
     }
 
+    // Adds an allocation region for the given ptr and size, potentially
+    // extending a region if ptr matches the end_ptr of an existing region.
+    // If a region is extended, returns a pointer to the extended region so that
+    // the BFC allocator can reason about chunkification.
+    AllocationRegion* AddOrExtendAllocationRegion(void* ptr,
+                                                  size_t memory_size) {
+      // Insert sorted by end_ptr.
+      auto entry =
+          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
+      // Check if can be coalesced with preceding region.
+      if (entry != regions_.begin()) {
+        auto preceding_region = entry - 1;
+        if (preceding_region->end_ptr() == ptr) {
+          if (VLOG_IS_ON(1)) {
+            LOG(INFO) << "Extending region " << preceding_region->ptr()
+                      << " of "
+                      << strings::HumanReadableNumBytes(
+                             preceding_region->memory_size())
+                      << " by " << strings::HumanReadableNumBytes(memory_size)
+                      << " bytes";
+          }
+          preceding_region->extend(memory_size);
+          return &*preceding_region;
+        }
+      }
+      VLOG(1) << "Inserting new region " << ptr << " of "
+              << strings::HumanReadableNumBytes(memory_size);
+      regions_.insert(entry, AllocationRegion(ptr, memory_size));
+      return nullptr;
+    }
+
     std::vector<AllocationRegion>::iterator RemoveAllocationRegion(
         std::vector<AllocationRegion>::iterator it) {
       return regions_.erase(it);
@@ -525,7 +562,14 @@ class BFCAllocator : public Allocator {
 
   // Whether the allocator will deallocate free regions to avoid OOM due to
   // memory fragmentation.
-  bool garbage_collection_;
+  const bool garbage_collection_;
+
+  // Whether the allocator will coalesce adjacent sub-allocator provided
+  // AllocationRegions. This may be disabled if discrete sub-allocator
+  // regions can't be treated as contiguous (e.g. if the allocation refers to
+  // device-visible memory which is not adjacent to the other region in the
+  // device's address space).
+  const bool coalesce_regions_;
 
   std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
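RegionManager::AddOrExtendAllocationRegion() keeps regions sorted by end_ptr, finds the insertion point with std::upper_bound, and grows the preceding region in place when that region ends exactly at the new pointer. The self-contained sketch below mirrors that control flow; SimpleRegion and AddOrExtend are illustrative stand-ins, not the patched classes.

// Sketch of "extend the preceding region if it ends exactly at ptr", using a
// plain sorted vector of (start, size) pairs instead of AllocationRegion.
#include <algorithm>
#include <cstddef>
#include <vector>

struct SimpleRegion {
  char* start;
  size_t size;
  char* end() const { return start + size; }
};

// Returns a pointer to the extended region, or nullptr if a new region was
// inserted. `regions` is kept sorted by end().
SimpleRegion* AddOrExtend(std::vector<SimpleRegion>& regions, char* ptr,
                          size_t size) {
  auto entry = std::upper_bound(
      regions.begin(), regions.end(), ptr,
      [](const char* p, const SimpleRegion& r) { return p < r.end(); });
  if (entry != regions.begin()) {
    auto preceding = entry - 1;
    if (preceding->end() == ptr) {  // contiguous: grow in place
      preceding->size += size;
      return &*preceding;
    }
  }
  regions.insert(entry, SimpleRegion{ptr, size});
  return nullptr;
}

int main() {
  std::vector<char> arena(1 << 20);
  std::vector<SimpleRegion> regions;
  AddOrExtend(regions, arena.data(), 4096);        // inserts a new region
  AddOrExtend(regions, arena.data() + 4096, 4096);  // extends the first one
  // regions.size() == 1 and regions[0].size == 8192
}

In the real AllocationRegion, extend() additionally resizes handles_ so the region keeps one ChunkHandle slot per kMinAllocationSize bytes, which is what lets get_handle() and set_handle() keep working across the extended range.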
diff --git a/tensorflow/core/common_runtime/device/device_host_allocator.h b/tensorflow/core/common_runtime/device/device_host_allocator.h
index 9d11705eadf..5bd1ab304a5 100644
--- a/tensorflow/core/common_runtime/device/device_host_allocator.h
+++ b/tensorflow/core/common_runtime/device/device_host_allocator.h
@@ -59,6 +59,8 @@ class DeviceHostAllocator : public SubAllocator {
     }
   }
 
+  bool SupportsCoalescing() const override { return false; }
+
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
   const int numa_node_;
diff --git a/tensorflow/core/common_runtime/device/device_mem_allocator.h b/tensorflow/core/common_runtime/device/device_mem_allocator.h
index 16a14c7114b..bad824f7388 100644
--- a/tensorflow/core/common_runtime/device/device_mem_allocator.h
+++ b/tensorflow/core/common_runtime/device/device_mem_allocator.h
@@ -68,6 +68,8 @@ class DeviceMemAllocator : public SubAllocator {
     }
   }
 
+  bool SupportsCoalescing() const override { return false; }
+
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
   const PlatformDeviceId device_id_;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 7a58f10bb9a..ea69bfa6b7e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -260,7 +260,7 @@ Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
 
     Allocator* allocator =
         new BFCAllocator(sub_allocator, gpu_host_mem_limit,
-                         true /*allow_growth*/, "gpu_host_bfc" /*name*/);
+                         /*allow_growth=*/true, /*name=*/"gpu_host_bfc");
 
     if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
       // Wrap the allocator to track allocation ids for better logging
diff --git a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
index 86bb00f1cf9..23572262c42 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
@@ -68,6 +68,8 @@ class GpuVirtualMemAllocator : public SubAllocator {
   // this free function should never be invoked.
   void Free(void* ptr, size_t num_bytes) override;
 
+  bool SupportsCoalescing() const override { return true; }
+
  private:
   GpuVirtualMemAllocator(const std::vector<Visitor>& alloc_visitors,
                          const std::vector<Visitor>& free_visitors,
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index da1a3796830..55d09341267 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -160,6 +160,8 @@ class BasicCPUAllocator : public SubAllocator {
 
   void Free(void* ptr, size_t num_bytes) override;
 
+  bool SupportsCoalescing() const override { return false; }
+
  private:
   int numa_node_;
 
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2899a59c548..4b16c51a337 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -446,6 +446,10 @@ class SubAllocator {
                       size_t* bytes_received) = 0;
   virtual void Free(void* ptr, size_t num_bytes) = 0;
 
+  // Returns true if the BFC allocator can safely coalesce adjacent regions
+  // returned by this allocator.
+  virtual bool SupportsCoalescing() const = 0;
+
  protected:
   // Implementation of Alloc() method must call this on newly allocated
   // value.
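The new pure virtual SupportsCoalescing() asks each SubAllocator whether adjacency in the address space implies contiguity. Page-locked host memory and per-call device allocations answer no; GpuVirtualMemAllocator answers yes, since its allocations are carved out of a single reserved virtual address range. The mock below shows how an implementation might choose its answer; the interface is a simplified stand-in for SubAllocator and MallocSubAllocator is a hypothetical example, neither is TensorFlow code.

// Minimal mock of the SupportsCoalescing() contract added in this patch.
#include <cstddef>
#include <cstdlib>

class MockSubAllocator {
 public:
  virtual ~MockSubAllocator() = default;
  virtual void* Alloc(size_t alignment, size_t num_bytes,
                      size_t* bytes_received) = 0;
  virtual void Free(void* ptr, size_t num_bytes) = 0;
  // Should return true only if two allocations that happen to be adjacent in
  // the address space may safely be treated as one contiguous region.
  virtual bool SupportsCoalescing() const = 0;
};

// Hands out independent heap blocks. Even when two results happen to be
// numerically adjacent, nothing guarantees the combined range behaves like a
// single allocation, so coalescing stays disabled; this is the same
// conclusion the patch reaches for DeviceHostAllocator, DeviceMemAllocator,
// BasicCPUAllocator and the CPU allocator factory's sub-allocator.
class MallocSubAllocator : public MockSubAllocator {
 public:
  void* Alloc(size_t alignment, size_t num_bytes,
              size_t* bytes_received) override {
    // std::aligned_alloc requires the size to be a multiple of the alignment.
    size_t rounded = (num_bytes + alignment - 1) / alignment * alignment;
    *bytes_received = rounded;
    return std::aligned_alloc(alignment, rounded);
  }
  void Free(void* ptr, size_t num_bytes) override { std::free(ptr); }
  bool SupportsCoalescing() const override { return false; }
};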
diff --git a/tensorflow/core/framework/cpu_allocator_impl.cc b/tensorflow/core/framework/cpu_allocator_impl.cc
index 567454a3295..f3d7fdc2a1f 100644
--- a/tensorflow/core/framework/cpu_allocator_impl.cc
+++ b/tensorflow/core/framework/cpu_allocator_impl.cc
@@ -166,6 +166,8 @@ class CPUAllocatorFactory : public AllocatorFactory {
       cpu_allocator_->DeallocateRaw(ptr);
     }
 
+    bool SupportsCoalescing() const override { return false; }
+
    private:
     CPUAllocator* cpu_allocator_;
   };
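Taken together, the change means that when allow_growth triggers several small extensions that happen to land back to back, the allocator sees one large region and one large free chunk instead of several small ones. The rough stand-alone illustration below shows why that matters for the largest request that can be served; the Region struct and the sizes used are only for demonstration.

// Illustration of the fragmentation effect described in the commit message:
// two contiguous 1 MiB extensions either stay separate (old behaviour) or
// merge into one 2 MiB region (new behaviour).
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

struct Region {
  char* start;
  size_t size;
};

// Largest request a set of regions can serve if every region is still one
// free chunk (the state right after Extend(), before any user allocations).
size_t LargestServableRequest(const std::vector<Region>& regions) {
  size_t best = 0;
  for (const Region& r : regions) best = std::max(best, r.size);
  return best;
}

int main() {
  constexpr size_t kMiB = 1 << 20;
  std::vector<char> arena(2 * kMiB);  // stands in for device memory

  // Two contiguous sub-allocations, as AddOrExtendAllocationRegion sees them.
  std::vector<Region> separate = {{arena.data(), kMiB},
                                  {arena.data() + kMiB, kMiB}};
  std::vector<Region> coalesced = {{arena.data(), 2 * kMiB}};

  std::cout << "separate:  " << LargestServableRequest(separate) / kMiB
            << " MiB max request\n";   // 1 MiB
  std::cout << "coalesced: " << LargestServableRequest(coalesced) / kMiB
            << " MiB max request\n";   // 2 MiB
}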