diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 06af68bdb65..c68093d863b 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -42,6 +42,7 @@ BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                            bool allow_growth, const string& name,
                            bool garbage_collection)
     : garbage_collection_(garbage_collection),
+      coalesce_regions_(sub_allocator->SupportsCoalescing()),
       sub_allocator_(sub_allocator),
       name_(name),
       free_chunks_list_(kInvalidChunkHandle),
@@ -150,23 +151,30 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
     curr_region_allocation_bytes_ *= 2;
   }

-  VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes)
-          << " bytes.";
+  VLOG(1) << "Extending allocation by "
+          << strings::HumanReadableNumBytes(bytes_received) << " bytes.";

-  total_region_allocated_bytes_ += bytes;
+  total_region_allocated_bytes_ += bytes_received;
   VLOG(1) << "Total allocated bytes: "
           << strings::HumanReadableNumBytes(total_region_allocated_bytes_);

   VLOG(1) << "Allocated memory at " << mem_addr << " to "
-          << static_cast<void*>(static_cast<char*>(mem_addr) + bytes);
-  region_manager_.AddAllocationRegion(mem_addr, bytes_received);
+          << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);
+
+  AllocationRegion* maybe_extended_region = nullptr;
+  if (coalesce_regions_) {
+    maybe_extended_region =
+        region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
+  } else {
+    region_manager_.AddAllocationRegion(mem_addr, bytes_received);
+  }

   // Create one large chunk for the whole memory space that will
   // be chunked later.
   ChunkHandle h = AllocateChunk();
   BFCAllocator::Chunk* c = ChunkFromHandle(h);
   c->ptr = mem_addr;
-  c->size = bytes;
+  c->size = bytes_received;
   c->allocation_id = -1;
   c->prev = kInvalidChunkHandle;
   c->next = kInvalidChunkHandle;
@@ -174,8 +182,23 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {

   region_manager_.set_handle(c->ptr, h);

-  // Insert the chunk into the right bin.
-  InsertFreeChunkIntoBin(h);
+  // If the region was extended, then there exists a previous chunk that should
+  // be linked to the new chunk.
+  if (maybe_extended_region != nullptr) {
+    ChunkHandle prev =
+        maybe_extended_region->get_handle(maybe_extended_region->ptr());
+    BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
+    // Find the last recorded chunk in the extended region.
+    while (prev_chunk->next != kInvalidChunkHandle) {
+      prev = prev_chunk->next;
+      prev_chunk = ChunkFromHandle(prev);
+    }
+    c->prev = prev;
+    prev_chunk->next = h;
+  }
+
+  // Maybe merge adjacent chunks and insert the chunk into the right bin.
+  InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));

   return true;
 }
@@ -469,32 +492,32 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
                               const void* chunk_ptr, int64 req_bytes,
                               int64 alloc_bytes) {
   tensorflow::profiler::TraceMe::InstantActivity(
-      [this, traceme_name, chunk_ptr, req_bytes,
-       alloc_bytes]() TF_NO_THREAD_SAFETY_ANALYSIS {
-        int64 bytes_available =
-            memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
-        const auto& annotation =
-            ScopedMemoryDebugAnnotation::CurrentAnnotation();
-        std::string tensor_shape;
-        if (annotation.pending_shape) {
-          tensor_shape = annotation.pending_shape->DebugString();
-        }
-        return tensorflow::profiler::TraceMeEncode(
-            traceme_name, {{"allocator_name", name_},
-                           {"bytes_reserved", stats_.bytes_reserved},
-                           {"bytes_allocated", stats_.bytes_in_use},
-                           {"bytes_available", bytes_available},
-                           {"fragmentation", GetFragmentation()},
-                           {"peak_bytes_in_use", stats_.peak_bytes_in_use},
-                           {"requested_bytes", req_bytes},
-                           {"allocation_bytes", alloc_bytes},
-                           {"addr", reinterpret_cast<uint64>(chunk_ptr)},
-                           {"tf_op", annotation.pending_op_name},
-                           {"id", annotation.pending_step_id},
-                           {"region_type", annotation.pending_region_type},
-                           {"data_type", annotation.pending_data_type},
-                           {"shape", tensor_shape}});
-      },
+      [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
+          TF_NO_THREAD_SAFETY_ANALYSIS {
+            int64 bytes_available =
+                memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
+            const auto& annotation =
+                ScopedMemoryDebugAnnotation::CurrentAnnotation();
+            std::string tensor_shape;
+            if (annotation.pending_shape) {
+              tensor_shape = annotation.pending_shape->DebugString();
+            }
+            return tensorflow::profiler::TraceMeEncode(
+                traceme_name, {{"allocator_name", name_},
+                               {"bytes_reserved", stats_.bytes_reserved},
+                               {"bytes_allocated", stats_.bytes_in_use},
+                               {"bytes_available", bytes_available},
+                               {"fragmentation", GetFragmentation()},
+                               {"peak_bytes_in_use", stats_.peak_bytes_in_use},
+                               {"requested_bytes", req_bytes},
+                               {"allocation_bytes", alloc_bytes},
+                               {"addr", reinterpret_cast<uint64>(chunk_ptr)},
+                               {"tf_op", annotation.pending_op_name},
+                               {"id", annotation.pending_step_id},
+                               {"region_type", annotation.pending_region_type},
+                               {"data_type", annotation.pending_data_type},
+                               {"shape", tensor_shape}});
+          },
       /*level=*/profiler::TraceMeLevel::kInfo);
 }
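Note on the Extend() path above: when the region manager reports that an existing region grew, the fresh chunk is first stitched onto the tail of that region's chunk list, and TryToCoalesce then merges it with the tail whenever the tail is free. Below is a minimal standalone sketch of that link-and-merge step, assuming an index-based chunk table; Chunk, AppendAndCoalesce, and kInvalid are illustrative stand-ins, not the allocator's real types:

    // Sketch of the tail-link + backward-merge performed in Extend() when a
    // region grows. The merged-away slot is simply abandoned here; the real
    // allocator recycles chunk handles.
    #include <cassert>
    #include <cstddef>
    #include <vector>

    constexpr int kInvalid = -1;

    struct Chunk {
      std::size_t size = 0;
      bool in_use = false;
      int prev = kInvalid;  // physically preceding chunk in the region
      int next = kInvalid;  // physically following chunk in the region
    };

    // Appends a free chunk for freshly received memory, links it to the
    // region's last chunk, then merges backwards while neighbors are free.
    int AppendAndCoalesce(std::vector<Chunk>& chunks, int tail,
                          std::size_t bytes_received) {
      chunks.push_back(Chunk{bytes_received, /*in_use=*/false, tail, kInvalid});
      int h = static_cast<int>(chunks.size()) - 1;
      if (tail != kInvalid) chunks[tail].next = h;
      while (chunks[h].prev != kInvalid && !chunks[chunks[h].prev].in_use) {
        int p = chunks[h].prev;
        chunks[p].size += chunks[h].size;  // absorb the new chunk
        chunks[p].next = kInvalid;         // p is the region tail again
        h = p;
      }
      return h;
    }

    int main() {
      // One 1 MiB region whose single chunk is free.
      std::vector<Chunk> chunks{{1 << 20, /*in_use=*/false, kInvalid, kInvalid}};
      // A second SubAllocator allocation lands directly after it, so the two
      // 1 MiB chunks collapse into one 2 MiB free chunk.
      int h = AppendAndCoalesce(chunks, /*tail=*/0, 1 << 20);
      assert(h == 0 && chunks[0].size == (2u << 20));
      return 0;
    }

The payoff is that a later request larger than any single SubAllocator grant can still be satisfied from the merged chunk.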
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index f79a6048bbb..aea1331b196 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -276,10 +276,7 @@ class BFCAllocator : public Allocator {
       DCHECK_EQ(0, memory_size % kMinAllocationSize);
       const size_t n_handles =
           (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
-      handles_.reset(new ChunkHandle[n_handles]);
-      for (size_t i = 0; i < n_handles; i++) {
-        handles_[i] = kInvalidChunkHandle;
-      }
+      handles_.resize(n_handles, kInvalidChunkHandle);
     }

     AllocationRegion() = default;
@@ -292,6 +289,15 @@ class BFCAllocator : public Allocator {
     void* ptr() const { return ptr_; }
     void* end_ptr() const { return end_ptr_; }
     size_t memory_size() const { return memory_size_; }
+    void extend(size_t size) {
+      memory_size_ += size;
+      DCHECK_EQ(0, memory_size_ % kMinAllocationSize);
+
+      end_ptr_ = static_cast<void*>(static_cast<char*>(end_ptr_) + size);
+      const size_t n_handles =
+          (memory_size_ + kMinAllocationSize - 1) / kMinAllocationSize;
+      handles_.resize(n_handles, kInvalidChunkHandle);
+    }
     ChunkHandle get_handle(const void* p) const {
       return handles_[IndexFor(p)];
     }
@@ -322,7 +328,7 @@ class BFCAllocator : public Allocator {
     // Array of size "memory_size / kMinAllocationSize". It is
     // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
    // for the memory allocation represented by "p"
-    std::unique_ptr<ChunkHandle[]> handles_;
+    std::vector<ChunkHandle> handles_;

     TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
   };
@@ -338,12 +344,43 @@ class BFCAllocator : public Allocator {
     ~RegionManager() {}

     void AddAllocationRegion(void* ptr, size_t memory_size) {
-      // Insert sorted by end_ptr
+      // Insert sorted by end_ptr.
       auto entry =
           std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
       regions_.insert(entry, AllocationRegion(ptr, memory_size));
     }

+    // Adds an allocation region for the given ptr and size, potentially
+    // extending a region if ptr matches the end_ptr of an existing region.
+    // If a region is extended, returns a pointer to the extended region so that
+    // the BFC allocator can reason about chunkification.
+    AllocationRegion* AddOrExtendAllocationRegion(void* ptr,
+                                                  size_t memory_size) {
+      // Insert sorted by end_ptr.
+      auto entry =
+          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
+      // Check if it can be coalesced with the preceding region.
+      if (entry != regions_.begin()) {
+        auto preceding_region = entry - 1;
+        if (preceding_region->end_ptr() == ptr) {
+          if (VLOG_IS_ON(1)) {
+            LOG(INFO) << "Extending region " << preceding_region->ptr()
+                      << " of "
+                      << strings::HumanReadableNumBytes(
+                             preceding_region->memory_size())
+                      << " by " << strings::HumanReadableNumBytes(memory_size)
+                      << " bytes";
+          }
+          preceding_region->extend(memory_size);
+          return &*preceding_region;
+        }
+      }
+      VLOG(1) << "Inserting new region " << ptr << " of "
+              << strings::HumanReadableNumBytes(memory_size);
+      regions_.insert(entry, AllocationRegion(ptr, memory_size));
+      return nullptr;
+    }
+
     std::vector<AllocationRegion>::iterator RemoveAllocationRegion(
         std::vector<AllocationRegion>::iterator it) {
       return regions_.erase(it);
@@ -525,7 +562,14 @@ class BFCAllocator : public Allocator {
   // Whether the allocator will deallocate free regions to avoid OOM due to
   // memory fragmentation.
-  bool garbage_collection_;
+  const bool garbage_collection_;
+
+  // Whether the allocator will coalesce adjacent sub-allocator-provided
+  // AllocationRegions. This may be disabled if discrete sub-allocator
+  // regions can't be treated as contiguous (e.g. if the allocation refers to
+  // device-visible memory which is not adjacent to the other region in the
+  // device's address space).
+  const bool coalesce_regions_;

   std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
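The invariant AddOrExtendAllocationRegion leans on is visible in the header above: regions_ stays sorted by end_ptr, so the only possible coalescing candidate is the region immediately before the upper_bound insertion point. Below is a compilable sketch of just that lookup, with integer addresses standing in for raw pointers and Region standing in for AllocationRegion:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct Region {
      std::uintptr_t ptr;  // region start
      std::size_t size;    // region length in bytes
      std::uintptr_t end() const { return ptr + size; }
    };

    // Returns the extended region, or nullptr if a new one was inserted,
    // mirroring the contract of AddOrExtendAllocationRegion.
    Region* AddOrExtend(std::vector<Region>& regions, std::uintptr_t ptr,
                        std::size_t size) {
      auto entry = std::upper_bound(
          regions.begin(), regions.end(), ptr,
          [](std::uintptr_t p, const Region& r) { return p < r.end(); });
      if (entry != regions.begin()) {
        auto preceding = entry - 1;
        if (preceding->end() == ptr) {  // exactly adjacent: grow in place
          preceding->size += size;
          return &*preceding;
        }
      }
      regions.insert(entry, Region{ptr, size});
      return nullptr;
    }

    int main() {
      std::vector<Region> regions;
      assert(AddOrExtend(regions, 0x1000, 0x1000) == nullptr);  // new region
      Region* r = AddOrExtend(regions, 0x2000, 0x1000);         // adjacent
      assert(r != nullptr && r->size == 0x2000 && regions.size() == 1);
      return 0;
    }

Returning the extended region rather than a bool is deliberate: Extend() needs it to locate the existing chunk list that the new chunk must be linked into.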
diff --git a/tensorflow/core/common_runtime/device/device_host_allocator.h b/tensorflow/core/common_runtime/device/device_host_allocator.h
index 9d11705eadf..5bd1ab304a5 100644
--- a/tensorflow/core/common_runtime/device/device_host_allocator.h
+++ b/tensorflow/core/common_runtime/device/device_host_allocator.h
@@ -59,6 +59,8 @@ class DeviceHostAllocator : public SubAllocator {
     }
   }

+  bool SupportsCoalescing() const override { return false; }
+
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
   const int numa_node_;
diff --git a/tensorflow/core/common_runtime/device/device_mem_allocator.h b/tensorflow/core/common_runtime/device/device_mem_allocator.h
index 16a14c7114b..bad824f7388 100644
--- a/tensorflow/core/common_runtime/device/device_mem_allocator.h
+++ b/tensorflow/core/common_runtime/device/device_mem_allocator.h
@@ -68,6 +68,8 @@ class DeviceMemAllocator : public SubAllocator {
     }
   }

+  bool SupportsCoalescing() const override { return false; }
+
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
   const PlatformDeviceId device_id_;
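Both device sub-allocators above answer false: they hand back whatever StreamExecutor returns, and consecutive device allocations carry no adjacency guarantee, so merging their regions would be unsound. The shape the new interface enforces can be mocked in a few lines; MiniSubAllocator and BumpSubAllocator are hypothetical illustration-only classes, not TF types:

    #include <cstddef>
    #include <cstdlib>
    #include <iostream>

    class MiniSubAllocator {
     public:
      virtual ~MiniSubAllocator() = default;
      virtual void* Alloc(std::size_t num_bytes) = 0;
      virtual void Free(void* ptr) = 0;
      // Pure virtual, like SubAllocator::SupportsCoalescing: every
      // implementation must make an explicit choice.
      virtual bool SupportsCoalescing() const = 0;
    };

    // Carves allocations out of one reserved range, so consecutive
    // allocations are contiguous and coalescing is safe to report.
    class BumpSubAllocator : public MiniSubAllocator {
     public:
      explicit BumpSubAllocator(std::size_t capacity)
          : base_(static_cast<char*>(std::malloc(capacity))) {}
      ~BumpSubAllocator() override { std::free(base_); }
      void* Alloc(std::size_t num_bytes) override {
        void* p = base_ + offset_;
        offset_ += num_bytes;
        return p;
      }
      void Free(void*) override {}  // everything is released in the destructor
      bool SupportsCoalescing() const override { return true; }

     private:
      char* base_;
      std::size_t offset_ = 0;
    };

    int main() {
      BumpSubAllocator sub(1 << 20);
      char* a = static_cast<char*>(sub.Alloc(4096));
      char* b = static_cast<char*>(sub.Alloc(4096));
      std::cout << "adjacent: " << (a + 4096 == b) << "\n";  // prints 1
      return 0;
    }

GpuVirtualMemAllocator (next in this diff) is the one implementation that can honestly return true, since it reserves a single virtual address range up front and maps each new allocation directly after the previous one.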
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 7a58f10bb9a..ea69bfa6b7e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -260,7 +260,7 @@ Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
     Allocator* allocator =
         new BFCAllocator(sub_allocator, gpu_host_mem_limit,
-                         true /*allow_growth*/, "gpu_host_bfc" /*name*/);
+                         /*allow_growth=*/true, /*name=*/"gpu_host_bfc");

     if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
       // Wrap the allocator to track allocation ids for better logging
diff --git a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
index 86bb00f1cf9..23572262c42 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
@@ -68,6 +68,8 @@ class GpuVirtualMemAllocator : public SubAllocator {
   // this free function should never be invoked.
   void Free(void* ptr, size_t num_bytes) override;

+  bool SupportsCoalescing() const override { return true; }
+
  private:
   GpuVirtualMemAllocator(const std::vector<Visitor>& alloc_visitors,
                          const std::vector<Visitor>& free_visitors,
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index da1a3796830..55d09341267 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -160,6 +160,8 @@ class BasicCPUAllocator : public SubAllocator {

   void Free(void* ptr, size_t num_bytes) override;

+  bool SupportsCoalescing() const override { return false; }
+
  private:
   int numa_node_;
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2899a59c548..4b16c51a337 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -446,6 +446,10 @@ class SubAllocator {
                        size_t* bytes_received) = 0;
   virtual void Free(void* ptr, size_t num_bytes) = 0;

+  // Returns true if the BFC allocator can safely coalesce adjacent regions
+  // returned by this allocator.
+  virtual bool SupportsCoalescing() const = 0;
+
 protected:
   // Implementation of Alloc() method must call this on newly allocated
   // value.
diff --git a/tensorflow/core/framework/cpu_allocator_impl.cc b/tensorflow/core/framework/cpu_allocator_impl.cc
index 567454a3295..f3d7fdc2a1f 100644
--- a/tensorflow/core/framework/cpu_allocator_impl.cc
+++ b/tensorflow/core/framework/cpu_allocator_impl.cc
@@ -166,6 +166,8 @@ class CPUAllocatorFactory : public AllocatorFactory {
      cpu_allocator_->DeallocateRaw(ptr);
     }

+    bool SupportsCoalescing() const override { return false; }
+
   private:
    CPUAllocator* cpu_allocator_;
  };
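Making SupportsCoalescing pure virtual in allocator.h forces every existing SubAllocator, including the CPUAllocatorFactory one above, to opt in or out explicitly rather than inherit a silent default; BFCAllocator then samples the answer once at construction (the bfc_allocator.cc hunk at the top of this diff) and never pays for the virtual call on the allocation path. A minimal sketch of that query-once-and-cache shape, with illustrative stand-in classes:

    #include <iostream>

    struct Sub {
      virtual ~Sub() = default;
      virtual bool SupportsCoalescing() const = 0;
    };

    struct VirtualMemSub : Sub {
      bool SupportsCoalescing() const override { return true; }
    };

    class MiniBFC {
     public:
      // The capability is read once and frozen in a const member, exactly
      // the shape of coalesce_regions_ in the real constructor.
      explicit MiniBFC(const Sub& sub)
          : coalesce_regions_(sub.SupportsCoalescing()) {}
      void Extend() const {
        std::cout << (coalesce_regions_ ? "extend existing region\n"
                                        : "insert new region\n");
      }

     private:
      const bool coalesce_regions_;
    };

    int main() {
      VirtualMemSub sub;
      MiniBFC bfc(sub);
      bfc.Extend();  // prints "extend existing region"
      return 0;
    }

One last detail worth calling out: the handles_ migration in the bfc_allocator.h hunk (std::unique_ptr to std::vector) is what makes AllocationRegion::extend() a one-liner, because std::vector::resize with a fill value keeps all existing handle entries and initializes only the newly added tail. A small compilable demonstration, with ChunkHandle modeled as int:

    #include <cassert>
    #include <vector>

    int main() {
      using ChunkHandle = int;
      const ChunkHandle kInvalidChunkHandle = -1;

      // Handle table for a region of 4 minimum-sized slots.
      std::vector<ChunkHandle> handles(4, kInvalidChunkHandle);
      handles[2] = 7;  // a chunk already recorded in the region

      // AllocationRegion::extend() grows the table exactly this way.
      handles.resize(8, kInvalidChunkHandle);

      assert(handles[2] == 7);                    // old entries preserved
      assert(handles[7] == kInvalidChunkHandle);  // new tail initialized
      return 0;
    }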