Support coalescing of adjacent allocation regions in BFC allocator.
When "allow_growth" is enabled, this may reduce fragmentation by making larger contiguous regions available to the allocator if two regions happen to be allocated contiguously. Coalescing is disabled for suballocator types where memory can't be treated as contiguous (i.e. page-locked memory, or cuMemAlloc memory) PiperOrigin-RevId: 351680385 Change-Id: I1ddf1022a72e2fd4703a748c3a66305c911b2f26
commit 4dfc958446 (parent 50c714cde2)
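To illustrate the idea behind coalescing (a minimal standalone sketch, not the code in this change; the Region/RegionTable names are hypothetical): a new allocation whose base address equals the end of an already-tracked region is folded into that region instead of being recorded separately, so the allocator sees one contiguous span.

#include <cstddef>
#include <map>

// One entry per contiguous region, keyed by the region's one-past-the-end
// address so that an exactly adjacent successor is an exact-match lookup.
struct Region {
  char* base;
  std::size_t size;
};

class RegionTable {
 public:
  // Records [ptr, ptr + size). If ptr equals the end of an existing region,
  // that region is extended in place and returned so the caller can treat the
  // two allocations as one span; otherwise a fresh region is inserted and
  // nullptr is returned.
  Region* AddOrExtend(void* ptr, std::size_t size) {
    char* base = static_cast<char*>(ptr);
    auto it = regions_.find(base);  // a region ending exactly at ptr?
    if (it != regions_.end()) {
      Region merged = it->second;
      merged.size += size;
      regions_.erase(it);
      auto inserted = regions_.emplace(merged.base + merged.size, merged);
      return &inserted.first->second;
    }
    regions_.emplace(base + size, Region{base, size});
    return nullptr;
  }

 private:
  std::map<char*, Region> regions_;  // key: region end pointer
};

The change below does the analogous bookkeeping inside BFCAllocator's RegionManager and, in addition, links the allocator's chunk list across the seam so the trailing free chunk of the old region can later be merged with the newly added one.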
@@ -42,6 +42,7 @@ BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                            bool allow_growth, const string& name,
                            bool garbage_collection)
     : garbage_collection_(garbage_collection),
+      coalesce_regions_(sub_allocator->SupportsCoalescing()),
       sub_allocator_(sub_allocator),
       name_(name),
       free_chunks_list_(kInvalidChunkHandle),
@@ -150,23 +151,30 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
     curr_region_allocation_bytes_ *= 2;
   }
 
-  VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes)
-          << " bytes.";
+  VLOG(1) << "Extending allocation by "
+          << strings::HumanReadableNumBytes(bytes_received) << " bytes.";
 
-  total_region_allocated_bytes_ += bytes;
+  total_region_allocated_bytes_ += bytes_received;
   VLOG(1) << "Total allocated bytes: "
           << strings::HumanReadableNumBytes(total_region_allocated_bytes_);
 
   VLOG(1) << "Allocated memory at " << mem_addr << " to "
-          << static_cast<void*>(static_cast<char*>(mem_addr) + bytes);
-  region_manager_.AddAllocationRegion(mem_addr, bytes_received);
+          << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);
+
+  AllocationRegion* maybe_extended_region = nullptr;
+  if (coalesce_regions_) {
+    maybe_extended_region =
+        region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
+  } else {
+    region_manager_.AddAllocationRegion(mem_addr, bytes_received);
+  }
 
   // Create one large chunk for the whole memory space that will
   // be chunked later.
   ChunkHandle h = AllocateChunk();
   BFCAllocator::Chunk* c = ChunkFromHandle(h);
   c->ptr = mem_addr;
-  c->size = bytes;
+  c->size = bytes_received;
   c->allocation_id = -1;
   c->prev = kInvalidChunkHandle;
   c->next = kInvalidChunkHandle;
@@ -174,8 +182,23 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
 
   region_manager_.set_handle(c->ptr, h);
 
-  // Insert the chunk into the right bin.
-  InsertFreeChunkIntoBin(h);
+  // If the region was extended, then there exists a previous chunk that should
+  // be linked to the new chunk.
+  if (maybe_extended_region != nullptr) {
+    ChunkHandle prev =
+        maybe_extended_region->get_handle(maybe_extended_region->ptr());
+    BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
+    // Find the last recorded chunk in the extended region.
+    while (prev_chunk->next != kInvalidChunkHandle) {
+      prev = prev_chunk->next;
+      prev_chunk = ChunkFromHandle(prev);
+    }
+    c->prev = prev;
+    prev_chunk->next = h;
+  }
+
+  // Maybe merge adjacent chunks and insert the chunk into the right bin.
+  InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));
 
   return true;
 }
@@ -469,32 +492,32 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
                               const void* chunk_ptr, int64 req_bytes,
                               int64 alloc_bytes) {
   tensorflow::profiler::TraceMe::InstantActivity(
-      [this, traceme_name, chunk_ptr, req_bytes,
-       alloc_bytes]() TF_NO_THREAD_SAFETY_ANALYSIS {
-        int64 bytes_available =
-            memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
-        const auto& annotation =
-            ScopedMemoryDebugAnnotation::CurrentAnnotation();
-        std::string tensor_shape;
-        if (annotation.pending_shape) {
-          tensor_shape = annotation.pending_shape->DebugString();
-        }
-        return tensorflow::profiler::TraceMeEncode(
-            traceme_name, {{"allocator_name", name_},
-                           {"bytes_reserved", stats_.bytes_reserved},
-                           {"bytes_allocated", stats_.bytes_in_use},
-                           {"bytes_available", bytes_available},
-                           {"fragmentation", GetFragmentation()},
-                           {"peak_bytes_in_use", stats_.peak_bytes_in_use},
-                           {"requested_bytes", req_bytes},
-                           {"allocation_bytes", alloc_bytes},
-                           {"addr", reinterpret_cast<uint64>(chunk_ptr)},
-                           {"tf_op", annotation.pending_op_name},
-                           {"id", annotation.pending_step_id},
-                           {"region_type", annotation.pending_region_type},
-                           {"data_type", annotation.pending_data_type},
-                           {"shape", tensor_shape}});
-      },
+      [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
+          TF_NO_THREAD_SAFETY_ANALYSIS {
+            int64 bytes_available =
+                memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
+            const auto& annotation =
+                ScopedMemoryDebugAnnotation::CurrentAnnotation();
+            std::string tensor_shape;
+            if (annotation.pending_shape) {
+              tensor_shape = annotation.pending_shape->DebugString();
+            }
+            return tensorflow::profiler::TraceMeEncode(
+                traceme_name, {{"allocator_name", name_},
+                               {"bytes_reserved", stats_.bytes_reserved},
+                               {"bytes_allocated", stats_.bytes_in_use},
+                               {"bytes_available", bytes_available},
+                               {"fragmentation", GetFragmentation()},
+                               {"peak_bytes_in_use", stats_.peak_bytes_in_use},
+                               {"requested_bytes", req_bytes},
+                               {"allocation_bytes", alloc_bytes},
+                               {"addr", reinterpret_cast<uint64>(chunk_ptr)},
+                               {"tf_op", annotation.pending_op_name},
+                               {"id", annotation.pending_step_id},
+                               {"region_type", annotation.pending_region_type},
+                               {"data_type", annotation.pending_data_type},
+                               {"shape", tensor_shape}});
+          },
       /*level=*/profiler::TraceMeLevel::kInfo);
 }
@@ -276,10 +276,7 @@ class BFCAllocator : public Allocator {
       DCHECK_EQ(0, memory_size % kMinAllocationSize);
       const size_t n_handles =
           (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
-      handles_.reset(new ChunkHandle[n_handles]);
-      for (size_t i = 0; i < n_handles; i++) {
-        handles_[i] = kInvalidChunkHandle;
-      }
+      handles_.resize(n_handles, kInvalidChunkHandle);
     }
 
     AllocationRegion() = default;
@@ -292,6 +289,15 @@ class BFCAllocator : public Allocator {
     void* ptr() const { return ptr_; }
     void* end_ptr() const { return end_ptr_; }
     size_t memory_size() const { return memory_size_; }
+    void extend(size_t size) {
+      memory_size_ += size;
+      DCHECK_EQ(0, memory_size_ % kMinAllocationSize);
+
+      end_ptr_ = static_cast<void*>(static_cast<char*>(end_ptr_) + size);
+      const size_t n_handles =
+          (memory_size_ + kMinAllocationSize - 1) / kMinAllocationSize;
+      handles_.resize(n_handles, kInvalidChunkHandle);
+    }
     ChunkHandle get_handle(const void* p) const {
       return handles_[IndexFor(p)];
     }
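The vector-backed handle table is what makes extend() above cheap: growing a region just appends handle slots, and std::vector::resize fills the new slots with kInvalidChunkHandle, matching what the old new[]-plus-fill loop produced for a fresh region. A small worked example of the index arithmetic; the 256-byte kMinAllocationSize used below is only an assumption for illustration (the allocator defines its own constant in bfc_allocator.h):

#include <cstddef>
#include <cstdio>

int main() {
  // Assumed value for illustration only.
  constexpr std::size_t kMinAllocationSize = 256;

  // A 1 MiB region needs one ChunkHandle slot per kMinAllocationSize bytes.
  std::size_t memory_size = 1 << 20;
  std::size_t n_handles =
      (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
  std::printf("handles before extend: %zu\n", n_handles);  // 4096

  // Extending the region by 512 KiB grows the table by 2048 slots.
  memory_size += 512 << 10;
  n_handles = (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
  std::printf("handles after extend:  %zu\n", n_handles);  // 6144
  return 0;
}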
@@ -322,7 +328,7 @@ class BFCAllocator : public Allocator {
     // Array of size "memory_size / kMinAllocationSize".  It is
     // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
     // for the memory allocation represented by "p"
-    std::unique_ptr<ChunkHandle[]> handles_;
+    std::vector<ChunkHandle> handles_;
 
     TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
   };
@@ -338,12 +344,43 @@ class BFCAllocator : public Allocator {
     ~RegionManager() {}
 
     void AddAllocationRegion(void* ptr, size_t memory_size) {
-      // Insert sorted by end_ptr
+      // Insert sorted by end_ptr.
       auto entry =
           std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
       regions_.insert(entry, AllocationRegion(ptr, memory_size));
     }
 
+    // Adds an allocation region for the given ptr and size, potentially
+    // extending a region if ptr matches the end_ptr of an existing region.
+    // If a region is extended, returns a pointer to the extended region so that
+    // the BFC allocator can reason about chunkification.
+    AllocationRegion* AddOrExtendAllocationRegion(void* ptr,
+                                                  size_t memory_size) {
+      // Insert sorted by end_ptr.
+      auto entry =
+          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
+      // Check if can be coalesced with preceding region.
+      if (entry != regions_.begin()) {
+        auto preceding_region = entry - 1;
+        if (preceding_region->end_ptr() == ptr) {
+          if (VLOG_IS_ON(1)) {
+            LOG(INFO) << "Extending region " << preceding_region->ptr()
+                      << " of "
+                      << strings::HumanReadableNumBytes(
+                             preceding_region->memory_size())
+                      << " by " << strings::HumanReadableNumBytes(memory_size)
+                      << " bytes";
+          }
+          preceding_region->extend(memory_size);
+          return &*preceding_region;
+        }
+      }
+      VLOG(1) << "Inserting new region " << ptr << " of "
+              << strings::HumanReadableNumBytes(memory_size);
+      regions_.insert(entry, AllocationRegion(ptr, memory_size));
+      return nullptr;
+    }
+
     std::vector<AllocationRegion>::iterator RemoveAllocationRegion(
         std::vector<AllocationRegion>::iterator it) {
       return regions_.erase(it);
@@ -525,7 +562,14 @@ class BFCAllocator : public Allocator {
 
   // Whether the allocator will deallocate free regions to avoid OOM due to
   // memory fragmentation.
-  bool garbage_collection_;
+  const bool garbage_collection_;
+
+  // Whether the allocator will coalesce adjacent sub allocator provided
+  // AllocationRegions. This may be disabled if discrete sub allocator
+  // regions can't be treated as contiguous (e.g. if the allocation refers to
+  // device visible memory which is not adjacent to the other region in the
+  // device's address space).
+  const bool coalesce_regions_;
 
   std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
 
@@ -59,6 +59,8 @@ class DeviceHostAllocator : public SubAllocator {
     }
   }
 
+  bool SupportsCoalescing() const override { return false; }
+
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
   const int numa_node_;
 
@@ -68,6 +68,8 @@ class DeviceMemAllocator : public SubAllocator {
     }
   }
 
+  bool SupportsCoalescing() const override { return false; }
+
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
   const PlatformDeviceId device_id_;
 
@@ -260,7 +260,7 @@ Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
 
     Allocator* allocator =
         new BFCAllocator(sub_allocator, gpu_host_mem_limit,
-                         true /*allow_growth*/, "gpu_host_bfc" /*name*/);
+                         /*allow_growth=*/true, /*name=*/"gpu_host_bfc");
 
     if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
       // Wrap the allocator to track allocation ids for better logging
@@ -68,6 +68,8 @@ class GpuVirtualMemAllocator : public SubAllocator {
   // this free function should never be invoked.
   void Free(void* ptr, size_t num_bytes) override;
 
+  bool SupportsCoalescing() const override { return true; }
+
  private:
   GpuVirtualMemAllocator(const std::vector<Visitor>& alloc_visitors,
                          const std::vector<Visitor>& free_visitors,
@@ -160,6 +160,8 @@ class BasicCPUAllocator : public SubAllocator {
 
   void Free(void* ptr, size_t num_bytes) override;
 
+  bool SupportsCoalescing() const override { return false; }
+
  private:
   int numa_node_;
 
@@ -446,6 +446,10 @@ class SubAllocator {
                       size_t* bytes_received) = 0;
   virtual void Free(void* ptr, size_t num_bytes) = 0;
 
+  // Returns true if the BFC allocator can safely coalesce adjacent regions
+  // returned by this allocator.
+  virtual bool SupportsCoalescing() const = 0;
+
  protected:
   // Implementation of Alloc() method must call this on newly allocated
   // value.
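Because SupportsCoalescing() is pure virtual, every SubAllocator implementation (including out-of-tree ones) now has to answer the question explicitly. A hedged sketch of what that could look like for a hypothetical host-memory sub-allocator; the class name and the use of port::AlignedMalloc/AlignedFree are assumptions for illustration, not part of this change:

#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/platform/mem.h"

namespace tensorflow {

// Hypothetical sub-allocator that hands out plain host memory.
class MyHostSubAllocator : public SubAllocator {
 public:
  MyHostSubAllocator()
      : SubAllocator(/*alloc_visitors=*/{}, /*free_visitors=*/{}) {}

  void* Alloc(size_t alignment, size_t num_bytes,
              size_t* bytes_received) override {
    void* ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
    *bytes_received = num_bytes;  // no rounding up in this sketch
    VisitAlloc(ptr, /*index=*/0, num_bytes);
    return ptr;
  }

  void Free(void* ptr, size_t num_bytes) override {
    VisitFree(ptr, /*index=*/0, num_bytes);
    port::AlignedFree(ptr);
  }

  // Separate malloc-style allocations are not guaranteed to be adjacent in
  // the address space, so the BFC allocator must not merge the regions they
  // back. Only allocators that carve allocations out of a reserved virtual
  // range (such as GpuVirtualMemAllocator above) should return true.
  bool SupportsCoalescing() const override { return false; }
};

}  // namespace tensorflow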
@@ -166,6 +166,8 @@ class CPUAllocatorFactory : public AllocatorFactory {
       cpu_allocator_->DeallocateRaw(ptr);
     }
 
+    bool SupportsCoalescing() const override { return false; }
+
    private:
     CPUAllocator* cpu_allocator_;
   };