diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 06af68bdb65..c68093d863b 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -42,6 +42,7 @@ BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                            bool allow_growth, const string& name,
                            bool garbage_collection)
     : garbage_collection_(garbage_collection),
+      coalesce_regions_(sub_allocator->SupportsCoalescing()),
       sub_allocator_(sub_allocator),
       name_(name),
       free_chunks_list_(kInvalidChunkHandle),
@@ -150,23 +151,30 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
     curr_region_allocation_bytes_ *= 2;
   }

-  VLOG(1) << "Extending allocation by " << strings::HumanReadableNumBytes(bytes)
-          << " bytes.";
+  VLOG(1) << "Extending allocation by "
+          << strings::HumanReadableNumBytes(bytes_received) << " bytes.";

-  total_region_allocated_bytes_ += bytes;
+  total_region_allocated_bytes_ += bytes_received;
   VLOG(1) << "Total allocated bytes: "
           << strings::HumanReadableNumBytes(total_region_allocated_bytes_);

   VLOG(1) << "Allocated memory at " << mem_addr << " to "
-          << static_cast<void*>(static_cast<char*>(mem_addr) + bytes);
-  region_manager_.AddAllocationRegion(mem_addr, bytes_received);
+          << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);
+
+  AllocationRegion* maybe_extended_region = nullptr;
+  if (coalesce_regions_) {
+    maybe_extended_region =
+        region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
+  } else {
+    region_manager_.AddAllocationRegion(mem_addr, bytes_received);
+  }

   // Create one large chunk for the whole memory space that will
   // be chunked later.
   ChunkHandle h = AllocateChunk();
   BFCAllocator::Chunk* c = ChunkFromHandle(h);
   c->ptr = mem_addr;
-  c->size = bytes;
+  c->size = bytes_received;
   c->allocation_id = -1;
   c->prev = kInvalidChunkHandle;
   c->next = kInvalidChunkHandle;
@@ -174,8 +182,23 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {

   region_manager_.set_handle(c->ptr, h);

-  // Insert the chunk into the right bin.
-  InsertFreeChunkIntoBin(h);
+  // If the region was extended, then there exists a previous chunk that should
+  // be linked to the new chunk.
+  if (maybe_extended_region != nullptr) {
+    ChunkHandle prev =
+        maybe_extended_region->get_handle(maybe_extended_region->ptr());
+    BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
+    // Find the last recorded chunk in the extended region.
+    while (prev_chunk->next != kInvalidChunkHandle) {
+      prev = prev_chunk->next;
+      prev_chunk = ChunkFromHandle(prev);
+    }
+    c->prev = prev;
+    prev_chunk->next = h;
+  }
+
+  // Maybe merge adjacent chunks and insert the chunk into the right bin.
+  InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));

   return true;
 }
@@ -469,32 +492,32 @@ void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
                               const void* chunk_ptr, int64 req_bytes,
                               int64 alloc_bytes) {
   tensorflow::profiler::TraceMe::InstantActivity(
-      [this, traceme_name, chunk_ptr, req_bytes,
-       alloc_bytes]() TF_NO_THREAD_SAFETY_ANALYSIS {
-        int64 bytes_available =
-            memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
-        const auto& annotation =
-            ScopedMemoryDebugAnnotation::CurrentAnnotation();
-        std::string tensor_shape;
-        if (annotation.pending_shape) {
-          tensor_shape = annotation.pending_shape->DebugString();
-        }
-        return tensorflow::profiler::TraceMeEncode(
-            traceme_name, {{"allocator_name", name_},
-                           {"bytes_reserved", stats_.bytes_reserved},
-                           {"bytes_allocated", stats_.bytes_in_use},
-                           {"bytes_available", bytes_available},
-                           {"fragmentation", GetFragmentation()},
-                           {"peak_bytes_in_use", stats_.peak_bytes_in_use},
-                           {"requested_bytes", req_bytes},
-                           {"allocation_bytes", alloc_bytes},
-                           {"addr", reinterpret_cast<uint64>(chunk_ptr)},
-                           {"tf_op", annotation.pending_op_name},
-                           {"id", annotation.pending_step_id},
-                           {"region_type", annotation.pending_region_type},
-                           {"data_type", annotation.pending_data_type},
-                           {"shape", tensor_shape}});
-      },
+      [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
+          TF_NO_THREAD_SAFETY_ANALYSIS {
+            int64 bytes_available =
+                memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
+            const auto& annotation =
+                ScopedMemoryDebugAnnotation::CurrentAnnotation();
+            std::string tensor_shape;
+            if (annotation.pending_shape) {
+              tensor_shape = annotation.pending_shape->DebugString();
+            }
+            return tensorflow::profiler::TraceMeEncode(
+                traceme_name, {{"allocator_name", name_},
+                               {"bytes_reserved", stats_.bytes_reserved},
+                               {"bytes_allocated", stats_.bytes_in_use},
+                               {"bytes_available", bytes_available},
+                               {"fragmentation", GetFragmentation()},
+                               {"peak_bytes_in_use", stats_.peak_bytes_in_use},
+                               {"requested_bytes", req_bytes},
+                               {"allocation_bytes", alloc_bytes},
+                               {"addr", reinterpret_cast<uint64>(chunk_ptr)},
+                               {"tf_op", annotation.pending_op_name},
+                               {"id", annotation.pending_step_id},
+                               {"region_type", annotation.pending_region_type},
+                               {"data_type", annotation.pending_data_type},
+                               {"shape", tensor_shape}});
+          },
       /*level=*/profiler::TraceMeLevel::kInfo);
 }
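Note on the Extend() path above: when the region manager reports that an existing region grew, the fresh chunk is first stitched onto the tail of that region's chunk list, and TryToCoalesce then merges it with the tail whenever the tail is free. Below is a minimal standalone sketch of that link-and-merge step, assuming an index-based chunk table; Chunk, AppendAndCoalesce, and kInvalid are illustrative stand-ins, not the allocator's real types:

    // Sketch of the tail-link + backward-merge performed in Extend() when a
    // region grows. The merged-away slot is simply abandoned here; the real
    // allocator recycles chunk handles.
    #include <cassert>
    #include <cstddef>
    #include <vector>

    constexpr int kInvalid = -1;

    struct Chunk {
      std::size_t size = 0;
      bool in_use = false;
      int prev = kInvalid;  // physically preceding chunk in the region
      int next = kInvalid;  // physically following chunk in the region
    };

    // Appends a free chunk for freshly received memory, links it to the
    // region's last chunk, then merges backwards while neighbors are free.
    int AppendAndCoalesce(std::vector<Chunk>& chunks, int tail,
                          std::size_t bytes_received) {
      chunks.push_back(Chunk{bytes_received, /*in_use=*/false, tail, kInvalid});
      int h = static_cast<int>(chunks.size()) - 1;
      if (tail != kInvalid) chunks[tail].next = h;
      while (chunks[h].prev != kInvalid && !chunks[chunks[h].prev].in_use) {
        int p = chunks[h].prev;
        chunks[p].size += chunks[h].size;  // absorb the new chunk
        chunks[p].next = kInvalid;         // p is the region tail again
        h = p;
      }
      return h;
    }

    int main() {
      // One 1 MiB region whose single chunk is free.
      std::vector<Chunk> chunks{{1 << 20, /*in_use=*/false, kInvalid, kInvalid}};
      // A second SubAllocator allocation lands directly after it, so the two
      // 1 MiB chunks collapse into one 2 MiB free chunk.
      int h = AppendAndCoalesce(chunks, /*tail=*/0, 1 << 20);
      assert(h == 0 && chunks[0].size == (2u << 20));
      return 0;
    }

The payoff is that a later request larger than any single SubAllocator grant can still be satisfied from the merged chunk.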
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index f79a6048bbb..aea1331b196 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -276,10 +276,7 @@ class BFCAllocator : public Allocator {
       DCHECK_EQ(0, memory_size % kMinAllocationSize);
       const size_t n_handles =
           (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
-      handles_.reset(new ChunkHandle[n_handles]);
-      for (size_t i = 0; i < n_handles; i++) {
-        handles_[i] = kInvalidChunkHandle;
-      }
+      handles_.resize(n_handles, kInvalidChunkHandle);
     }

     AllocationRegion() = default;
@@ -292,6 +289,15 @@ class BFCAllocator : public Allocator {
     void* ptr() const { return ptr_; }
     void* end_ptr() const { return end_ptr_; }
     size_t memory_size() const { return memory_size_; }
+    void extend(size_t size) {
+      memory_size_ += size;
+      DCHECK_EQ(0, memory_size_ % kMinAllocationSize);
+
+      end_ptr_ = static_cast<void*>(static_cast<char*>(end_ptr_) + size);
+      const size_t n_handles =
+          (memory_size_ + kMinAllocationSize - 1) / kMinAllocationSize;
+      handles_.resize(n_handles, kInvalidChunkHandle);
+    }
     ChunkHandle get_handle(const void* p) const {
       return handles_[IndexFor(p)];
     }
@@ -322,7 +328,7 @@ class BFCAllocator : public Allocator {
     // Array of size "memory_size / kMinAllocationSize". It is
     // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
    // for the memory allocation represented by "p"
-    std::unique_ptr<ChunkHandle[]> handles_;
+    std::vector<ChunkHandle> handles_;

     TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
   };
@@ -338,12 +344,43 @@ class BFCAllocator : public Allocator {
     ~RegionManager() {}

     void AddAllocationRegion(void* ptr, size_t memory_size) {
-      // Insert sorted by end_ptr
+      // Insert sorted by end_ptr.
       auto entry =
           std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
       regions_.insert(entry, AllocationRegion(ptr, memory_size));
     }

+    // Adds an allocation region for the given ptr and size, potentially
+    // extending a region if ptr matches the end_ptr of an existing region.
+    // If a region is extended, returns a pointer to the extended region so that
+    // the BFC allocator can reason about chunkification.
+    AllocationRegion* AddOrExtendAllocationRegion(void* ptr,
+                                                  size_t memory_size) {
+      // Insert sorted by end_ptr.
+      auto entry =
+          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
+      // Check if it can be coalesced with the preceding region.
+      if (entry != regions_.begin()) {
+        auto preceding_region = entry - 1;
+        if (preceding_region->end_ptr() == ptr) {
+          if (VLOG_IS_ON(1)) {
+            LOG(INFO) << "Extending region " << preceding_region->ptr()
+                      << " of "
+                      << strings::HumanReadableNumBytes(
+                             preceding_region->memory_size())
+                      << " by " << strings::HumanReadableNumBytes(memory_size)
+                      << " bytes";
+          }
+          preceding_region->extend(memory_size);
+          return &*preceding_region;
+        }
+      }
+      VLOG(1) << "Inserting new region " << ptr << " of "
+              << strings::HumanReadableNumBytes(memory_size);
+      regions_.insert(entry, AllocationRegion(ptr, memory_size));
+      return nullptr;
+    }
+
     std::vector<AllocationRegion>::iterator RemoveAllocationRegion(
         std::vector<AllocationRegion>::iterator it) {
       return regions_.erase(it);
@@ -525,7 +562,14 @@ class BFCAllocator : public Allocator {
   // Whether the allocator will deallocate free regions to avoid OOM due to
   // memory fragmentation.
-  bool garbage_collection_;
+  const bool garbage_collection_;
+
+  // Whether the allocator will coalesce adjacent sub-allocator-provided
+  // AllocationRegions. This may be disabled if discrete sub-allocator
+  // regions can't be treated as contiguous (e.g. if the allocation refers to
+  // device-visible memory which is not adjacent to the other region in the
+  // device's address space).
+  const bool coalesce_regions_;

   std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
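The invariant AddOrExtendAllocationRegion leans on is visible in the header above: regions_ stays sorted by end_ptr, so the only possible coalescing candidate is the region immediately before the upper_bound insertion point. Below is a compilable sketch of just that lookup, with integer addresses standing in for raw pointers and Region standing in for AllocationRegion:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct Region {
      std::uintptr_t ptr;  // region start
      std::size_t size;    // region length in bytes
      std::uintptr_t end() const { return ptr + size; }
    };

    // Returns the extended region, or nullptr if a new one was inserted,
    // mirroring the contract of AddOrExtendAllocationRegion.
    Region* AddOrExtend(std::vector<Region>& regions, std::uintptr_t ptr,
                        std::size_t size) {
      auto entry = std::upper_bound(
          regions.begin(), regions.end(), ptr,
          [](std::uintptr_t p, const Region& r) { return p < r.end(); });
      if (entry != regions.begin()) {
        auto preceding = entry - 1;
        if (preceding->end() == ptr) {  // exactly adjacent: grow in place
          preceding->size += size;
          return &*preceding;
        }
      }
      regions.insert(entry, Region{ptr, size});
      return nullptr;
    }

    int main() {
      std::vector<Region> regions;
      assert(AddOrExtend(regions, 0x1000, 0x1000) == nullptr);  // new region
      Region* r = AddOrExtend(regions, 0x2000, 0x1000);         // adjacent
      assert(r != nullptr && r->size == 0x2000 && regions.size() == 1);
      return 0;
    }

Returning the extended region rather than a bool is deliberate: Extend() needs it to locate the existing chunk list that the new chunk must be linked into.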
diff --git a/tensorflow/core/common_runtime/device/device_host_allocator.h b/tensorflow/core/common_runtime/device/device_host_allocator.h
index 9d11705eadf..5bd1ab304a5 100644
--- a/tensorflow/core/common_runtime/device/device_host_allocator.h
+++ b/tensorflow/core/common_runtime/device/device_host_allocator.h
@@ -59,6 +59,8 @@ class DeviceHostAllocator : public SubAllocator {
     }
   }

+  bool SupportsCoalescing() const override { return false; }
+
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
   const int numa_node_;
diff --git a/tensorflow/core/common_runtime/device/device_mem_allocator.h b/tensorflow/core/common_runtime/device/device_mem_allocator.h
index 16a14c7114b..bad824f7388 100644
--- a/tensorflow/core/common_runtime/device/device_mem_allocator.h
+++ b/tensorflow/core/common_runtime/device/device_mem_allocator.h
@@ -68,6 +68,8 @@ class DeviceMemAllocator : public SubAllocator {
     }
   }

+  bool SupportsCoalescing() const override { return false; }
+
  private:
   se::StreamExecutor* stream_exec_;  // not owned, non-null
   const PlatformDeviceId device_id_;
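Both device sub-allocators above answer false: they hand back whatever StreamExecutor returns, and consecutive device allocations carry no adjacency guarantee, so merging their regions would be unsound. The shape the new interface enforces can be mocked in a few lines; MiniSubAllocator and BumpSubAllocator are hypothetical illustration-only classes, not TF types:

    #include <cstddef>
    #include <cstdlib>
    #include <iostream>

    class MiniSubAllocator {
     public:
      virtual ~MiniSubAllocator() = default;
      virtual void* Alloc(std::size_t num_bytes) = 0;
      virtual void Free(void* ptr) = 0;
      // Pure virtual, like SubAllocator::SupportsCoalescing: every
      // implementation must make an explicit choice.
      virtual bool SupportsCoalescing() const = 0;
    };

    // Carves allocations out of one reserved range, so consecutive
    // allocations are contiguous and coalescing is safe to report.
    class BumpSubAllocator : public MiniSubAllocator {
     public:
      explicit BumpSubAllocator(std::size_t capacity)
          : base_(static_cast<char*>(std::malloc(capacity))) {}
      ~BumpSubAllocator() override { std::free(base_); }
      void* Alloc(std::size_t num_bytes) override {
        void* p = base_ + offset_;
        offset_ += num_bytes;
        return p;
      }
      void Free(void*) override {}  // everything is released in the destructor
      bool SupportsCoalescing() const override { return true; }

     private:
      char* base_;
      std::size_t offset_ = 0;
    };

    int main() {
      BumpSubAllocator sub(1 << 20);
      char* a = static_cast<char*>(sub.Alloc(4096));
      char* b = static_cast<char*>(sub.Alloc(4096));
      std::cout << "adjacent: " << (a + 4096 == b) << "\n";  // prints 1
      return 0;
    }

GpuVirtualMemAllocator (next in this diff) is the one implementation that can honestly return true, since it reserves a single virtual address range up front and maps each new allocation directly after the previous one.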
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 7a58f10bb9a..ea69bfa6b7e 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -260,7 +260,7 @@ Allocator* GPUProcessState::GetGpuHostAllocator(int numa_node) {
     Allocator* allocator =
         new BFCAllocator(sub_allocator, gpu_host_mem_limit,
-                         true /*allow_growth*/, "gpu_host_bfc" /*name*/);
+                         /*allow_growth=*/true, /*name=*/"gpu_host_bfc");

     if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
       // Wrap the allocator to track allocation ids for better logging
diff --git a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
index 86bb00f1cf9..23572262c42 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
@@ -68,6 +68,8 @@ class GpuVirtualMemAllocator : public SubAllocator {
   // this free function should never be invoked.
   void Free(void* ptr, size_t num_bytes) override;

+  bool SupportsCoalescing() const override { return true; }
+
  private:
   GpuVirtualMemAllocator(const std::vector<Visitor>& alloc_visitors,
                          const std::vector<Visitor>& free_visitors,
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index da1a3796830..55d09341267 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -160,6 +160,8 @@ class BasicCPUAllocator : public SubAllocator {

   void Free(void* ptr, size_t num_bytes) override;

+  bool SupportsCoalescing() const override { return false; }
+
  private:
   int numa_node_;
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2899a59c548..4b16c51a337 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -446,6 +446,10 @@ class SubAllocator {
                        size_t* bytes_received) = 0;
   virtual void Free(void* ptr, size_t num_bytes) = 0;

+  // Returns true if the BFC allocator can safely coalesce adjacent regions
+  // returned by this allocator.
+  virtual bool SupportsCoalescing() const = 0;
+
 protected:
   // Implementation of Alloc() method must call this on newly allocated
   // value.
diff --git a/tensorflow/core/framework/cpu_allocator_impl.cc b/tensorflow/core/framework/cpu_allocator_impl.cc
index 567454a3295..f3d7fdc2a1f 100644
--- a/tensorflow/core/framework/cpu_allocator_impl.cc
+++ b/tensorflow/core/framework/cpu_allocator_impl.cc
@@ -166,6 +166,8 @@ class CPUAllocatorFactory : public AllocatorFactory {
      cpu_allocator_->DeallocateRaw(ptr);
     }

+    bool SupportsCoalescing() const override { return false; }
+
   private:
    CPUAllocator* cpu_allocator_;
  };
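Making SupportsCoalescing pure virtual in allocator.h forces every existing SubAllocator, including the CPUAllocatorFactory one above, to opt in or out explicitly rather than inherit a silent default; BFCAllocator then samples the answer once at construction (the bfc_allocator.cc hunk at the top of this diff) and never pays for the virtual call on the allocation path. A minimal sketch of that query-once-and-cache shape, with illustrative stand-in classes:

    #include <iostream>

    struct Sub {
      virtual ~Sub() = default;
      virtual bool SupportsCoalescing() const = 0;
    };

    struct VirtualMemSub : Sub {
      bool SupportsCoalescing() const override { return true; }
    };

    class MiniBFC {
     public:
      // The capability is read once and frozen in a const member, exactly
      // the shape of coalesce_regions_ in the real constructor.
      explicit MiniBFC(const Sub& sub)
          : coalesce_regions_(sub.SupportsCoalescing()) {}
      void Extend() const {
        std::cout << (coalesce_regions_ ? "extend existing region\n"
                                        : "insert new region\n");
      }

     private:
      const bool coalesce_regions_;
    };

    int main() {
      VirtualMemSub sub;
      MiniBFC bfc(sub);
      bfc.Extend();  // prints "extend existing region"
      return 0;
    }

One last detail worth calling out: the handles_ migration in the bfc_allocator.h hunk (std::unique_ptr to std::vector) is what makes AllocationRegion::extend() a one-liner, because std::vector::resize with a fill value keeps all existing handle entries and initializes only the newly added tail. A small compilable demonstration, with ChunkHandle modeled as int:

    #include <cassert>
    #include <vector>

    int main() {
      using ChunkHandle = int;
      const ChunkHandle kInvalidChunkHandle = -1;

      // Handle table for a region of 4 minimum-sized slots.
      std::vector<ChunkHandle> handles(4, kInvalidChunkHandle);
      handles[2] = 7;  // a chunk already recorded in the region

      // AllocationRegion::extend() grows the table exactly this way.
      handles.resize(8, kInvalidChunkHandle);

      assert(handles[2] == 7);                    // old entries preserved
      assert(handles[7] == kInvalidChunkHandle);  // new tail initialized
      return 0;
    }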