diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index 2dd7acb2f67..201ac346bad 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -73,6 +73,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_deterministic_reductions(false);
   opts.set_xla_cpu_enable_xprof_traceme(false);
   opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false);
+  opts.set_xla_multiheap_size_constraint_per_heap(-1);
   return opts;
 }
 
@@ -571,6 +572,16 @@ static void AllocateFlags() {
       "that falling back to the driver can have drawbacks like using more "
       "memory and/or other bugs during compilation, so we recommend setting "
       "this flag to false."));
+  flag_objects->push_back(tensorflow::Flag(
+      "xla_multiheap_size_constraint_per_heap",
+      int32_setter_for(
+          &DebugOptions::set_xla_multiheap_size_constraint_per_heap),
+      flag_values->xla_multiheap_size_constraint_per_heap(),
+      "Generates multiple heaps (i.e., temp buffers) with a size "
+      "constraint on each heap to avoid Out-of-Memory due to memory "
+      "fragmentation. The constraint is soft, so it works with tensors "
+      "larger than the given constraint size."));
+
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }
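A quick usage sketch (not part of the patch): the flag rides the standard `XLA_FLAGS` mechanism parsed above; `module_config` below is a hypothetical `HloModuleConfig`, and the 512 MiB limit is only an illustrative value.

```c++
// Via the environment (parsed by ParseFlagsFromEnvAndDieIfUnknown above):
//   XLA_FLAGS=--xla_multiheap_size_constraint_per_heap=536870912
//
// Or programmatically, before compiling a module:
xla::DebugOptions debug_options = xla::GetDebugOptionsFromFlags();
debug_options.set_xla_multiheap_size_constraint_per_heap(512 * 1024 * 1024);
module_config.set_debug_options(debug_options);  // hypothetical HloModuleConfig
```

Leaving the option at its default of -1 keeps the single-heap behavior on every code path touched below.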
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index db34f054f35..1744bf57748 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -1330,12 +1330,23 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
     auto get_heap_algorithm = [&](int64 alignment) {
       auto algorithms = absl::make_unique<
           std::vector<std::unique_ptr<HeapAlgorithm<HloValue>>>>();
-      algorithms->push_back(
-          absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
-              alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
-      algorithms->push_back(
-          absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
-              alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
+      if (assignment->multiheap_size_constraint_per_heap() == -1) {
+        algorithms->push_back(
+            absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
+                alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
+        algorithms->push_back(
+            absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
+                alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
+      } else {
+        algorithms->push_back(
+            absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
+                assignment->multiheap_size_constraint_per_heap(), alignment,
+                GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
+        algorithms->push_back(
+            absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
+                assignment->multiheap_size_constraint_per_heap(), alignment,
+                GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
+      }
       return absl::make_unique<ChooseBestHeapAlgorithm<HloValue>>(
           std::move(algorithms));
     };
@@ -1500,20 +1511,25 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
   }
   VLOG(1) << "Result size from heap simulator: " << result.heap_size;
 
-  BufferAllocation* allocation =
-      assignment->NewEmptyAllocation(result.heap_size, color);
-  for (const auto& buffer_chunk : result.chunk_map) {
-    const HloValue& value = *buffer_chunk.first;
-    const HeapSimulator::Chunk& chunk = buffer_chunk.second;
-    assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
+  for (auto& heap_result : result.heap_results) {
+    BufferAllocation* allocation =
+        assignment->NewEmptyAllocation(heap_result.heap_size, color);
+    for (const auto& buffer_chunk : heap_result.chunk_map) {
+      const HloValue& value = *buffer_chunk.first;
+      const HeapSimulator::Chunk& chunk = buffer_chunk.second;
+      assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
+    }
+    // Compute peak_buffers only when the multiheap mode is off; in multiheap
+    // mode, leave peak_buffers_ as an empty vector.
+    if (assignment->multiheap_size_constraint_per_heap() == -1) {
+      allocation->peak_buffers_ =
+          ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
+    }
+
+    XLA_VLOG_LINES(2, allocation->ToString());
+
+    allocation->AddHeapTrace(result.debug_trace);
   }
-  allocation->peak_buffers_ =
-      ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
-
-  VLOG(1) << "Ran heap simulation for allocation: ";
-  XLA_VLOG_LINES(2, allocation->ToString());
-
-  allocation->AddHeapTrace(result.debug_trace);
 }
 
 StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
@@ -1580,6 +1596,10 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
       buffers_to_assign_sequentially.size() == global_computations.size();
   VLOG(2) << "Running whole module heap simulation: "
           << run_whole_module_heap_simulation;
+  const int32 multiheap_size_constraint_per_heap =
+      module->config().debug_options().xla_multiheap_size_constraint_per_heap();
+  VLOG(2) << "Multiheap per heap size limit: "
+          << multiheap_size_constraint_per_heap;
   TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
       buffers_to_assign_sequentially, run_whole_module_heap_simulation,
       assignment.get()));
@@ -1618,7 +1638,9 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
   // This can only be performed after all buffers have been assigned, and
   // after maybe_live_out is marked, since it is used to determine whether an
   // allocation contains temporary buffers or not.
-  assignment->CombineTempAllocations();
+  if (multiheap_size_constraint_per_heap == -1) {
+    assignment->CombineTempAllocations();
+  }
 
   XLA_VLOG_LINES(2, assignment->ToString());
   TF_RETURN_IF_ERROR(assignment->ComputeSummaryStats());
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index dfde46ca4b1..c8c430988f5 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -363,6 +363,10 @@ class BufferAssignment {
     return temp_allocation_total_size_;
   }
 
+  int32 multiheap_size_constraint_per_heap() const {
+    return multiheap_size_constraint_per_heap_;
+  }
+
   // Returns whether the given buffer has been assigned an allocation.
   bool HasAllocation(const HloValue& value) const;
 
@@ -491,7 +495,11 @@ class BufferAssignment {
         buffer_size_(std::move(buffer_size)),
         color_alignment_(std::move(color_alignment)),
         alias_analysis_(std::move(alias_analysis)),
-        hlo_live_range_(std::move(hlo_live_range)) {}
+        hlo_live_range_(std::move(hlo_live_range)),
+        multiheap_size_constraint_per_heap_(
+            module->config()
+                .debug_options()
+                .xla_multiheap_size_constraint_per_heap()) {}
 
   // Creates and returns a new BufferAllocation, with no assigned
   // LogicalBuffers. Ownership is maintained internally.
@@ -535,6 +543,8 @@ class BufferAssignment {
   // The total size of all temporary buffers.
   int64 temp_allocation_total_size_ = 0;
 
+  int32 multiheap_size_constraint_per_heap_;
+
   // Maps Buffers to the index of the BufferAllocation which holds the buffer.
   absl::flat_hash_map<const HloValue*, BufferAllocation::Index>
       allocation_index_for_value_;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 8f01d7e3c41..b382ff8b708 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -1857,7 +1857,8 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices(
     absl::string_view name, Thunk::ThunkInfo thunk_info,
     absl::Span<const BufferSlice* const> slices,
     std::function<void(const BufferSlice*, llvm::Value*)>
-        bind_slice_to_ir_value) {
+        bind_slice_to_ir_value,
+    bool insist_single_temp_buffer) {
   const auto& buffer_assn = ir_emitter_context_->buffer_assignment();
 
   // Figure out which buffer allocations need to be passed as arguments to our
@@ -1874,7 +1875,7 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices(
     if (alloc.IsPreallocatedTempBuffer()) {
       if (!temp_buffer.has_value()) {
         temp_buffer = &alloc;
-      } else {
+      } else if (insist_single_temp_buffer) {
        LOG(FATAL) << "Multiple temp buffers found, but only one is allowed!";
       }
     }
@@ -1995,7 +1996,13 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
             << hlo_buffer_slice->gte_index.ToString();
 
         bindings_.BindHloToIrValue(*instr, value, index);
-      });
+      },
+      // Insist on a single temp buffer only when the multiheap mode is off.
+      /*insist_single_temp_buffer=*/inst->parent()
+          ->parent()
+          ->config()
+          .debug_options()
+          .xla_multiheap_size_constraint_per_heap() == -1);
 }
 
 std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunkForMlir(
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index b83af8799d3..1ad12840c9e 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -547,7 +547,8 @@ class IrEmitterUnnested : public IrEmitter,
       absl::string_view name, Thunk::ThunkInfo thunk_info,
       absl::Span<const BufferSlice* const> slices,
      std::function<void(const BufferSlice*, llvm::Value*)>
-          bind_slice_to_ir_value);
+          bind_slice_to_ir_value,
+      bool insist_single_temp_buffer = true);
 
   // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
   // caller needs to make sure `inst` outlives the lifetime of the returned
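To make the relaxed check above concrete: with the constraint enabled, buffer assignment can emit one preallocated temp allocation per generated heap instead of at most one. A hypothetical helper (not in the patch) that surfaces this:

```c++
// Counts the preallocated temp allocations in a computed assignment.
// Single-heap mode produces at most one; multiheap mode may produce one per
// generated heap, which is why the LOG(FATAL) above is now gated on
// insist_single_temp_buffer.
int64 CountTempBuffers(const xla::BufferAssignment& assignment) {
  int64 count = 0;
  for (const xla::BufferAllocation& alloc : assignment.Allocations()) {
    if (alloc.IsPreallocatedTempBuffer()) {
      ++count;
    }
  }
  return count;
}
```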
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 2e2b668eba7..7b0220753fa 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -409,11 +409,15 @@ HeapSimulator::Result<HloValue> HeapSimulator::Finish() {
   // Post-process the result to add chunks for shared buffers. An empty chunk
   // map means that either no buffers were allocated, or the heap was only
   // collecting statistics, e.g. NoFragmentationStatsHeap.
-  if (!result.chunk_map.empty()) {
+  size_t total_chunk_count = 0;
+  absl::c_for_each(result.heap_results, [&](const HeapResult<HloValue>& hr) {
+    total_chunk_count += hr.chunk_map.size();
+  });
+  if (total_chunk_count != 0) {
     // If we were told to assign specific buffers, make sure we've assigned
     // exactly that many buffers.
     if (options_.buffers_to_assign != nullptr) {
-      CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size());
+      CHECK_EQ(options_.buffers_to_assign->size(), total_chunk_count);
     }
   }
 
@@ -825,7 +829,10 @@ GlobalDecreasingSizeBestFitHeap<BufferType>::Finish() {
     CommitChunk(buffer_interval, chunk_candidate);
   }
   VLOG(1) << "result heap_size: " << result_.heap_size;
-  return result_;
+  Result result;
+  result.heap_size = result_.heap_size;
+  result.heap_results.emplace_back(result_);
+  return result;
 }
 
 template <typename BufferType>
@@ -968,6 +975,58 @@ void GlobalDecreasingSizeBestFitHeap<BufferType>::AddToChunkMap(
   DCHECK(emplace_result.second);
 }
 
+HeapSimulator::Result<HloValue>
+ConstrainedGlobalDecreasingSizeBestFitHeap::Finish() {
+  std::vector<BufferInterval> sorted_buffer_vec = GetSortedBufferIntervals();
+  // Convert into std::list so that erase() is O(1).
+  std::list<BufferInterval> sorted_buffer_intervals(sorted_buffer_vec.begin(),
+                                                    sorted_buffer_vec.end());
+
+  // Use do-while here, because we need to create at least one heap in
+  // `multi_heap_result` even if `sorted_buffer_intervals` is empty.
+  Result multi_heap_result;
+  do {
+    // Place as many buffers as possible into the heap currently being built.
+    for (auto it = sorted_buffer_intervals.begin();
+         it != sorted_buffer_intervals.end();) {
+      BufferInterval buffer_interval = *it;
+      if (!buffer_interval.need_allocation) {
+        it = sorted_buffer_intervals.erase(it);
+        continue;
+      }
+      if (buffer_interval.size > size_limit_per_heap_) {
+        LOG(WARNING) << "Alloc buffer size " << buffer_interval.size
+                     << " larger than the per-heap size limit "
+                     << size_limit_per_heap_;
+      }
+
+      ChunkCandidate chunk_candidate = FindChunkCandidate(buffer_interval);
+      if (chunk_candidate.heap_size <= size_limit_per_heap_ ||
+          // Commit the chunk as long as the heap is empty. We do this because
+          // we want the size constraint to be soft, meaning that results are
+          // successfully generated even if there are some buffer sizes larger
+          // than the given constraint size.
+          result_.heap_size == 0) {
+        CommitChunk(buffer_interval, chunk_candidate);
+        it = sorted_buffer_intervals.erase(it);
+        continue;
+      }
+
+      ++it;
+    }
+    // Collect the result from the just-built heap and reset the heap state
+    // for the next round.
+    multi_heap_result.heap_size += result_.heap_size;
+    multi_heap_result.heap_results.push_back(std::move(result_));
+    result_ = {};
+    interval_tree_ = {};
+  } while (!sorted_buffer_intervals.empty());
+
+  VLOG(1) << "Number of heaps produced = "
+          << multi_heap_result.heap_results.size();
+  return multi_heap_result;
+}
+
 template <typename BufferType>
 HeapSimulator::Result<BufferType>
 ChooseBestHeapAlgorithm<BufferType>::Finish() {
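A sketch of how the soft constraint in `ConstrainedGlobalDecreasingSizeBestFitHeap::Finish()` is expected to behave for an oversized buffer, written as a hypothetical extra test against the fixture used in heap_simulator_test.cc below (the expected values follow from the commit-if-empty rule and are assumptions, not part of the patch):

```c++
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, SoftConstraint) {
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
                                                  /*alignment=*/1);
  heap.Alloc(buffer_a_, 40);
  heap.Alloc(buffer_b_, 90);  // Larger than the 50-byte limit on its own.
  heap.Alloc(buffer_c_, 30);
  heap.Free(buffer_a_, 40);
  heap.Free(buffer_b_, 90);
  heap.Free(buffer_c_, 30);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  // The constraint is soft: an empty heap accepts its first chunk
  // unconditionally, so the 90-byte buffer is committed into a heap of its
  // own with only a LOG(WARNING). All three buffers are live simultaneously
  // and no two of them fit under the limit together, so each lands in its
  // own heap.
  EXPECT_EQ(3, result.heap_results.size());
  EXPECT_EQ(160, result.heap_size);  // 90 + 40 + 30
}
```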
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index b47ff685139..0c7425d8edf 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -67,14 +67,23 @@ class HeapSimulator {
     }
   };
 
-  // Result represents the result of the heap simulation.
   template <typename BufferType>
-  struct Result {
+  struct HeapResult {
     // The assignment of buffers to chunks.
     absl::flat_hash_map<const BufferType*, Chunk> chunk_map;
 
     // The total size in bytes of the heap, containing all assigned chunks.
     int64 heap_size = 0;
+  };
+  // Result represents the result of the heap simulation.
+  template <typename BufferType>
+  struct Result {
+    // Heap results.
+    std::vector<HeapResult<BufferType>> heap_results;
+
+    // The total size in bytes of the heaps.
+    // heap_size == sum([hr.heap_size for hr in heap_results]).
+    int64 heap_size = 0;
 
     // The total size in bytes of heap fragmentation.
     int64 fragmentation_size = 0;
@@ -229,6 +238,7 @@ class HeapAlgorithm {
  public:
   using Chunk = HeapSimulator::Chunk;
   using Result = HeapSimulator::Result<BufferType>;
+  using HeapResult = HeapSimulator::HeapResult<BufferType>;
 
   virtual ~HeapAlgorithm() = default;
 
@@ -347,6 +357,7 @@ class BufferIntervalTree {
 template <typename BufferType>
 class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
  public:
+  using HeapResult = HeapSimulator::HeapResult<BufferType>;
   using Result = HeapSimulator::Result<BufferType>;
   using Chunk = HeapSimulator::Chunk;
 
@@ -415,6 +426,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
       int64 preferred_offset = -1) const;
   void CommitChunk(const BufferInterval& buffer_interval,
                    ChunkCandidate chunk_candidate);
+  // Adds the buffer and the chunk to the result chunk map.
   virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk);
 
@@ -426,7 +438,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
   BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
 
   absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
-  Result result_;
+  HeapResult result_;
   BufferIntervalCompare buffer_interval_compare_;
   BufferIntervalTree interval_tree_;
 
@@ -444,6 +456,25 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
       const BufferInterval& interval) const;
 };
 
+// This class implements an algorithm that produces multiple heaps, where each
+// heap's size is constrained by a given limit. Note that the constraint is
+// soft, meaning that a valid heap result is generated even if there are some
+// buffer sizes larger than the given constraint size.
+class ConstrainedGlobalDecreasingSizeBestFitHeap
+    : public GlobalDecreasingSizeBestFitHeap<HloValue> {
+ public:
+  explicit ConstrainedGlobalDecreasingSizeBestFitHeap(
+      size_t size_limit_per_heap, int64 alignment, Type type = kSpatial)
+      : GlobalDecreasingSizeBestFitHeap<HloValue>(alignment, type),
+        size_limit_per_heap_(size_limit_per_heap) {}
+  ~ConstrainedGlobalDecreasingSizeBestFitHeap() override {}
+
+  Result Finish() override;
+
+ private:
+  size_t size_limit_per_heap_;
+};
+
 // A heap algorithm that chooses the best results from other algorithms added to
 // it.
 template <typename BufferType>
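A hypothetical helper (not in the patch) that spells out the invariant documented on `Result` above:

```c++
// Sums the per-heap sizes and checks that they match the aggregate heap_size,
// i.e. heap_size == sum([hr.heap_size for hr in heap_results]).
template <typename BufferType>
int64 TotalHeapSize(const xla::HeapSimulator::Result<BufferType>& result) {
  int64 total = 0;
  for (const auto& heap_result : result.heap_results) {
    total += heap_result.heap_size;
  }
  DCHECK_EQ(total, result.heap_size);
  return total;
}
```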
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 8f7668b4965..26305eebb0d 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -256,12 +256,15 @@ class HeapCallRecorder : public HeapAlgorithm<HloValue> {
   }
   Result Finish() override {
     calls_->emplace_back(kFinish, nullptr);
-    return result_;
+    HeapSimulator::Result<HloValue> result;
+    result.heap_size = result_.heap_size;
+    result.heap_results.emplace_back(std::move(result_));
+    return result;
   }
 
  private:
   CallSequence* calls_;
-  Result result_;
+  HeapSimulator::HeapResult<HloValue> result_;
 };
 
 // HeapSimulatorTracker runs the heap simulator, recording the sequence of calls
@@ -335,7 +338,8 @@ class HeapSimulatorTracker {
 
   int64 OffsetAt(const HloInstruction* instruction, const ShapeIndex& index) {
     const HloValue* buffer = BufferAt(instruction, index);
-    return result_.chunk_map.at(buffer).offset;
+    CHECK_EQ(1, result_.heap_results.size());
+    return result_.heap_results.at(0).chunk_map.at(buffer).offset;
   }
 
   // Ensures the expected sequence of Alloc/Free/Finish calls was performed.
@@ -1051,7 +1055,8 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Empty) {
   GlobalDecreasingSizeBestFitHeap<HloValue> heap(/*alignment=*/1);
   const HeapSimulator::Result<HloValue> result = heap.Finish();
   EXPECT_EQ(0, result.heap_size);
-  EXPECT_EQ(0, result.chunk_map.size());
+  EXPECT_EQ(1, result.heap_results.size());
+  EXPECT_EQ(0, result.heap_results.at(0).chunk_map.size());
 }
 
 TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
@@ -1078,7 +1083,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
   heap.Free(buffer_c_, 20);
   heap.Free(buffer_d_, 40);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(100, result.heap_size);
   EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
@@ -1117,7 +1125,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) {
   heap.Free(buffer_c_, 50);
   heap.Free(buffer_d_, 40);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(120, result.heap_size);
   EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@@ -1160,7 +1171,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) {
   heap.Free(buffer_d_, 30);
   heap.Free(buffer_e_, 50);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(140, result.heap_size);
   EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@@ -1192,7 +1206,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) {
   heap.ShareWith(buffer_c_, buffer_a_, 40);
   heap.Free(buffer_c_, 40);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(40, result.heap_size);
   EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@@ -1221,7 +1238,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
   heap.Free(buffer_c_, 40);
   heap.Free(buffer_b_, 20);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(60, result.heap_size);
   EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@@ -1251,7 +1271,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) {
   heap.Free(buffer_c_, 10);
   heap.Free(buffer_b_, 30);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(40, result.heap_size);
   EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
@@ -1311,6 +1334,122 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ChunkCandidate) {
   // Preferred offset 15 could not be given because it is occupied.
 }
 
+class ConstrainedGlobalDecreasingSizeBestFitHeapTest
+    : public HeapAlgorithmTestBase {};
+
+TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
+  // space
+  //   ^
+  //   |      +-------+
+  //   |      +---c---+
+  //   |    +-------+
+  //   |    |   b   |
+  //   |    +-------+
+  //   | ................ // split into two allocations.
+  //   |  +---a---+
+  //   |  +-------+
+  //   |  |       |
+  //   |  |   d   |
+  //   |  +-------+
+  //   -----------------> time
+  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
+                                                  /*alignment=*/1);
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 30);
+  heap.Alloc(buffer_c_, 20);
+  heap.Alloc(buffer_d_, 40);
+  heap.Free(buffer_a_, 10);
+  heap.Free(buffer_b_, 30);
+  heap.Free(buffer_c_, 20);
+  heap.Free(buffer_d_, 40);
+
+  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  EXPECT_EQ(100, result.heap_size);
+  EXPECT_EQ(2, result.heap_results.size());
+
+  EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_a_));
+  EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_d_));
+  EXPECT_EQ(10, result.heap_results[0].chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(40, result.heap_results[0].chunk_map.at(buffer_d_).size);
+  EXPECT_EQ(40, result.heap_results[0].chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_d_).offset);
+}
+
+TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest,
+       DecreasingSizeWithAlignment) {
+  // space
+  //   ^
+  //   |         +-------+
+  //   |         +---b---+
+  //   |      +-------+
+  //   |      |       |
+  //   |      |   d   |
+  //   |      +-------+
+  //   | ...................
+  //   |  +---a---+
+  //   |
+  //   |  +-------+
+  //   |  |       |
+  //   |  |   c   |
+  //   |  |       |
+  //   |  +-------+
+  //   ---------------------> time
+  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/70,
+                                                  /*alignment=*/20);
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 20);
+  heap.Alloc(buffer_c_, 50);
+  heap.Free(buffer_a_, 10);
+  heap.Alloc(buffer_d_, 40);
+  heap.Free(buffer_b_, 20);
+  heap.Free(buffer_c_, 50);
+  heap.Free(buffer_d_, 40);
+
+  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  EXPECT_EQ(130, result.heap_size);  // 70 + 60
+  EXPECT_EQ(2, result.heap_results.size());
+
+  EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_a_));
+  EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_c_));
+  EXPECT_EQ(10, result.heap_results[0].chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(50, result.heap_results[0].chunk_map.at(buffer_c_).size);
+  EXPECT_EQ(60, result.heap_results[0].chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_c_).offset);
+}
+
+TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
+  // space
+  //   ^
+  //   |       +---------------+
+  //   |       +-------b-------+
+  //   | ....................
+ // |+------+ +-------+ + // || | | | + // || | | | <--- colocate with a + // |+--a---+ +---c---+ + // ---------------------> time + ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50, + /*alignment=*/20); + heap.Alloc(buffer_a_, 30); + heap.Free(buffer_a_, 30); + heap.Alloc(buffer_b_, 20); + + heap.ShareWith(buffer_c_, buffer_a_, 40); + heap.Free(buffer_c_, 40); + heap.Free(buffer_b_, 20); + + const HeapSimulator::Result result = heap.Finish(); + EXPECT_EQ(50, result.heap_size); + EXPECT_EQ(2, result.heap_results.size()); + + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_a_)); + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_c_)); + EXPECT_EQ(30, result.heap_results[0].chunk_map.at(buffer_a_).size); + EXPECT_EQ(30, result.heap_results[0].chunk_map.at(buffer_c_).size); + EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_a_).offset); + EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_c_).offset); +} + class IntervalTreeTest : public ::testing::Test {}; TEST_F(IntervalTreeTest, InsertAndRemove) { diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 6d4b0e65010..604146e3981 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -1127,7 +1127,10 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { VLOG(3) << allocation_info_str_; DumpDebugStringsIfEnabled(); - return result_; + HeapSimulator::Result result; + result.heap_size = result_.heap_size; + result.heap_results.emplace_back(std::move(result_)); + return std::move(result); } void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals( diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 1cf30b10373..f2488c39504 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -290,7 +290,11 @@ message DebugOptions { // Extra parameters to pass the GPU assembler. string xla_gpu_asm_extra_flags = 141; - // Next id: 142 + // Per-heap size constraint. New heaps will be created if per-heap max size is + // reached. + int32 xla_multiheap_size_constraint_per_heap = 142; + + // Next id: 143 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend.