diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index 2dd7acb2f67..201ac346bad 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -73,6 +73,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_deterministic_reductions(false);
   opts.set_xla_cpu_enable_xprof_traceme(false);
   opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false);
+  opts.set_xla_multiheap_size_constraint_per_heap(-1);
   return opts;
 }
 
@@ -571,6 +572,16 @@ static void AllocateFlags() {
       "that falling back to the driver can have drawbacks like using more "
       "memory and/or other bugs during compilation, so we recommend setting "
       "this flag to false."));
+  flag_objects->push_back(tensorflow::Flag(
+      "xla_multiheap_size_constraint_per_heap",
+      int32_setter_for(
+          &DebugOptions::set_xla_multiheap_size_constraint_per_heap),
+      flag_values->xla_multiheap_size_constraint_per_heap(),
+      "Generates multiple heaps (i.e., temp buffers) with a size "
+      "constraint on each heap to avoid Out-of-Memory due to memory "
+      "fragmentation. The constraint is soft, so it works with tensors "
+      "larger than the given constraint size."));
+
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }
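A quick usage sketch (not part of the patch): the flag rides the standard `XLA_FLAGS` mechanism parsed above; `module_config` below is a hypothetical `HloModuleConfig`, and the 512 MiB limit is only an illustrative value.

```c++
// Via the environment (parsed by ParseFlagsFromEnvAndDieIfUnknown above):
//   XLA_FLAGS=--xla_multiheap_size_constraint_per_heap=536870912
//
// Or programmatically, before compiling a module:
xla::DebugOptions debug_options = xla::GetDebugOptionsFromFlags();
debug_options.set_xla_multiheap_size_constraint_per_heap(512 * 1024 * 1024);
module_config.set_debug_options(debug_options);  // hypothetical HloModuleConfig
```

Leaving the option at its default of -1 keeps the single-heap behavior on every code path touched below.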
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index db34f054f35..1744bf57748 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -1330,12 +1330,23 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
     auto get_heap_algorithm = [&](int64 alignment) {
       auto algorithms = absl::make_unique<
           std::vector<std::unique_ptr<HeapAlgorithm<HloValue>>>>();
-      algorithms->push_back(
-          absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
-              alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
-      algorithms->push_back(
-          absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
-              alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
+      if (assignment->multiheap_size_constraint_per_heap() == -1) {
+        algorithms->push_back(
+            absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
+                alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
+        algorithms->push_back(
+            absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
+                alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
+      } else {
+        algorithms->push_back(
+            absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
+                assignment->multiheap_size_constraint_per_heap(), alignment,
+                GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
+        algorithms->push_back(
+            absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
+                assignment->multiheap_size_constraint_per_heap(), alignment,
+                GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
+      }
       return absl::make_unique<ChooseBestHeapAlgorithm<HloValue>>(
           std::move(algorithms));
     };
@@ -1500,20 +1511,25 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
   }
   VLOG(1) << "Result size from heap simulator: " << result.heap_size;
 
-  BufferAllocation* allocation =
-      assignment->NewEmptyAllocation(result.heap_size, color);
-  for (const auto& buffer_chunk : result.chunk_map) {
-    const HloValue& value = *buffer_chunk.first;
-    const HeapSimulator::Chunk& chunk = buffer_chunk.second;
-    assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
+  for (auto& heap_result : result.heap_results) {
+    BufferAllocation* allocation =
+        assignment->NewEmptyAllocation(heap_result.heap_size, color);
+    for (const auto& buffer_chunk : heap_result.chunk_map) {
+      const HloValue& value = *buffer_chunk.first;
+      const HeapSimulator::Chunk& chunk = buffer_chunk.second;
+      assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
+    }
+    // Compute peak_buffers only when the multiheap mode is off; in multiheap
+    // mode, leave peak_buffers_ as an empty vector.
+    if (assignment->multiheap_size_constraint_per_heap() == -1) {
+      allocation->peak_buffers_ =
+          ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
+    }
+
+    XLA_VLOG_LINES(2, allocation->ToString());
+
+    allocation->AddHeapTrace(result.debug_trace);
   }
-  allocation->peak_buffers_ =
-      ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
-
-  VLOG(1) << "Ran heap simulation for allocation: ";
-  XLA_VLOG_LINES(2, allocation->ToString());
-
-  allocation->AddHeapTrace(result.debug_trace);
 }
 
 StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
@@ -1580,6 +1596,10 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
       buffers_to_assign_sequentially.size() == global_computations.size();
   VLOG(2) << "Running whole module heap simulation: "
           << run_whole_module_heap_simulation;
+  const int32 multiheap_size_constraint_per_heap =
+      module->config().debug_options().xla_multiheap_size_constraint_per_heap();
+  VLOG(2) << "Multiheap per heap size limit: "
+          << multiheap_size_constraint_per_heap;
   TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
       buffers_to_assign_sequentially, run_whole_module_heap_simulation,
       assignment.get()));
@@ -1618,7 +1638,9 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
   // This can only be performed after all buffers have been assigned, and
   // after maybe_live_out is marked, since it is used to determine whether an
   // allocation contains temporary buffers or not.
-  assignment->CombineTempAllocations();
+  if (multiheap_size_constraint_per_heap == -1) {
+    assignment->CombineTempAllocations();
+  }
 
   XLA_VLOG_LINES(2, assignment->ToString());
   TF_RETURN_IF_ERROR(assignment->ComputeSummaryStats());
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index dfde46ca4b1..c8c430988f5 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -363,6 +363,10 @@ class BufferAssignment {
     return temp_allocation_total_size_;
   }
 
+  int32 multiheap_size_constraint_per_heap() const {
+    return multiheap_size_constraint_per_heap_;
+  }
+
   // Returns whether the given buffer has been assigned an allocation.
   bool HasAllocation(const HloValue& value) const;
 
@@ -491,7 +495,11 @@ class BufferAssignment {
         buffer_size_(std::move(buffer_size)),
         color_alignment_(std::move(color_alignment)),
         alias_analysis_(std::move(alias_analysis)),
-        hlo_live_range_(std::move(hlo_live_range)) {}
+        hlo_live_range_(std::move(hlo_live_range)),
+        multiheap_size_constraint_per_heap_(
+            module->config()
+                .debug_options()
+                .xla_multiheap_size_constraint_per_heap()) {}
 
   // Creates and returns a new BufferAllocation, with no assigned
   // LogicalBuffers. Ownership is maintained internally.
@@ -535,6 +543,8 @@ class BufferAssignment {
   // The total size of all temporary buffers.
   int64 temp_allocation_total_size_ = 0;
 
+  int32 multiheap_size_constraint_per_heap_;
+
   // Maps Buffers to the index of the BufferAllocation which holds the buffer.
   absl::flat_hash_map<const HloValue*, BufferAllocation::Index>
       allocation_index_for_value_;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 8f01d7e3c41..b382ff8b708 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -1857,7 +1857,8 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices(
     absl::string_view name, Thunk::ThunkInfo thunk_info,
     absl::Span<const BufferSlice* const> slices,
     std::function<void(const BufferSlice*, llvm::Value*)>
-        bind_slice_to_ir_value) {
+        bind_slice_to_ir_value,
+    bool insist_single_temp_buffer) {
   const auto& buffer_assn = ir_emitter_context_->buffer_assignment();
 
   // Figure out which buffer allocations need to be passed as arguments to our
@@ -1874,7 +1875,7 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices(
     if (alloc.IsPreallocatedTempBuffer()) {
       if (!temp_buffer.has_value()) {
         temp_buffer = &alloc;
-      } else {
+      } else if (insist_single_temp_buffer) {
        LOG(FATAL) << "Multiple temp buffers found, but only one is allowed!";
       }
     }
@@ -1995,7 +1996,13 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
             << hlo_buffer_slice->gte_index.ToString();
 
         bindings_.BindHloToIrValue(*instr, value, index);
-      });
+      },
+      // Insist on a single temp buffer only when the multiheap mode is off.
+      /*insist_single_temp_buffer=*/inst->parent()
+          ->parent()
+          ->config()
+          .debug_options()
+          .xla_multiheap_size_constraint_per_heap() == -1);
 }
 
 std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunkForMlir(
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index b83af8799d3..1ad12840c9e 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -547,7 +547,8 @@ class IrEmitterUnnested : public IrEmitter,
       absl::string_view name, Thunk::ThunkInfo thunk_info,
       absl::Span<const BufferSlice* const> slices,
      std::function<void(const BufferSlice*, llvm::Value*)>
-          bind_slice_to_ir_value);
+          bind_slice_to_ir_value,
+      bool insist_single_temp_buffer = true);
 
   // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
   // caller needs to make sure `inst` outlives the lifetime of the returned
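To make the relaxed check above concrete: with the constraint enabled, buffer assignment can emit one preallocated temp allocation per generated heap instead of at most one. A hypothetical helper (not in the patch) that surfaces this:

```c++
// Counts the preallocated temp allocations in a computed assignment.
// Single-heap mode produces at most one; multiheap mode may produce one per
// generated heap, which is why the LOG(FATAL) above is now gated on
// insist_single_temp_buffer.
int64 CountTempBuffers(const xla::BufferAssignment& assignment) {
  int64 count = 0;
  for (const xla::BufferAllocation& alloc : assignment.Allocations()) {
    if (alloc.IsPreallocatedTempBuffer()) {
      ++count;
    }
  }
  return count;
}
```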
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 2e2b668eba7..7b0220753fa 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -409,11 +409,15 @@ HeapSimulator::Result<HloValue> HeapSimulator::Finish() {
   // Post-process the result to add chunks for shared buffers. An empty chunk
   // map means that either no buffers were allocated, or the heap was only
   // collecting statistics, e.g. NoFragmentationStatsHeap.
-  if (!result.chunk_map.empty()) {
+  size_t total_chunk_count = 0;
+  absl::c_for_each(result.heap_results, [&](const HeapResult<HloValue>& hr) {
+    total_chunk_count += hr.chunk_map.size();
+  });
+  if (total_chunk_count != 0) {
     // If we were told to assign specific buffers, make sure we've assigned
     // exactly that many buffers.
     if (options_.buffers_to_assign != nullptr) {
-      CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size());
+      CHECK_EQ(options_.buffers_to_assign->size(), total_chunk_count);
     }
   }
 
@@ -825,7 +829,10 @@ GlobalDecreasingSizeBestFitHeap<BufferType>::Finish() {
     CommitChunk(buffer_interval, chunk_candidate);
   }
   VLOG(1) << "result heap_size: " << result_.heap_size;
-  return result_;
+  Result result;
+  result.heap_size = result_.heap_size;
+  result.heap_results.emplace_back(result_);
+  return result;
 }
 
 template <typename BufferType>
@@ -968,6 +975,58 @@ void GlobalDecreasingSizeBestFitHeap<BufferType>::AddToChunkMap(
   DCHECK(emplace_result.second);
 }
 
+HeapSimulator::Result<HloValue>
+ConstrainedGlobalDecreasingSizeBestFitHeap::Finish() {
+  std::vector<BufferInterval> sorted_buffer_vec = GetSortedBufferIntervals();
+  // Convert into std::list so that erase() is O(1).
+  std::list<BufferInterval> sorted_buffer_intervals(sorted_buffer_vec.begin(),
+                                                    sorted_buffer_vec.end());
+
+  // Use do-while here, because we need to create at least one heap in
+  // `multi_heap_result` even if `sorted_buffer_intervals` is empty.
+  Result multi_heap_result;
+  do {
+    // Place as many buffers as possible into the heap currently being built.
+    for (auto it = sorted_buffer_intervals.begin();
+         it != sorted_buffer_intervals.end();) {
+      BufferInterval buffer_interval = *it;
+      if (!buffer_interval.need_allocation) {
+        it = sorted_buffer_intervals.erase(it);
+        continue;
+      }
+      if (buffer_interval.size > size_limit_per_heap_) {
+        LOG(WARNING) << "Alloc buffer size " << buffer_interval.size
+                     << " larger than the per-heap size limit "
+                     << size_limit_per_heap_;
+      }
+
+      ChunkCandidate chunk_candidate = FindChunkCandidate(buffer_interval);
+      if (chunk_candidate.heap_size <= size_limit_per_heap_ ||
+          // Commit the chunk as long as the heap is empty. We do this because
+          // we want the size constraint to be soft, meaning that results are
+          // successfully generated even if there are some buffer sizes larger
+          // than the given constraint size.
+          result_.heap_size == 0) {
+        CommitChunk(buffer_interval, chunk_candidate);
+        it = sorted_buffer_intervals.erase(it);
+        continue;
+      }
+
+      ++it;
+    }
+    // Collect the result from the just-built heap and reset the heap state
+    // for the next round.
+    multi_heap_result.heap_size += result_.heap_size;
+    multi_heap_result.heap_results.push_back(std::move(result_));
+    result_ = {};
+    interval_tree_ = {};
+  } while (!sorted_buffer_intervals.empty());
+
+  VLOG(1) << "Number of heaps produced = "
+          << multi_heap_result.heap_results.size();
+  return multi_heap_result;
+}
+
 template <typename BufferType>
 HeapSimulator::Result<BufferType>
 ChooseBestHeapAlgorithm<BufferType>::Finish() {
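A sketch of how the soft constraint in `ConstrainedGlobalDecreasingSizeBestFitHeap::Finish()` is expected to behave for an oversized buffer, written as a hypothetical extra test against the fixture used in heap_simulator_test.cc below (the expected values follow from the commit-if-empty rule and are assumptions, not part of the patch):

```c++
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, SoftConstraint) {
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
                                                  /*alignment=*/1);
  heap.Alloc(buffer_a_, 40);
  heap.Alloc(buffer_b_, 90);  // Larger than the 50-byte limit on its own.
  heap.Alloc(buffer_c_, 30);
  heap.Free(buffer_a_, 40);
  heap.Free(buffer_b_, 90);
  heap.Free(buffer_c_, 30);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  // The constraint is soft: an empty heap accepts its first chunk
  // unconditionally, so the 90-byte buffer is committed into a heap of its
  // own with only a LOG(WARNING). All three buffers are live simultaneously
  // and no two of them fit under the limit together, so each lands in its
  // own heap.
  EXPECT_EQ(3, result.heap_results.size());
  EXPECT_EQ(160, result.heap_size);  // 90 + 40 + 30
}
```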
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index b47ff685139..0c7425d8edf 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -67,14 +67,23 @@ class HeapSimulator {
     }
   };
 
-  // Result represents the result of the heap simulation.
   template <typename BufferType>
-  struct Result {
+  struct HeapResult {
     // The assignment of buffers to chunks.
     absl::flat_hash_map<const BufferType*, Chunk> chunk_map;
 
     // The total size in bytes of the heap, containing all assigned chunks.
     int64 heap_size = 0;
+  };
+  // Result represents the result of the heap simulation.
+  template <typename BufferType>
+  struct Result {
+    // Heap results.
+    std::vector<HeapResult<BufferType>> heap_results;
+
+    // The total size in bytes of the heaps.
+    // heap_size == sum([hr.heap_size for hr in heap_results]).
+    int64 heap_size = 0;
 
     // The total size in bytes of heap fragmentation.
     int64 fragmentation_size = 0;
@@ -229,6 +238,7 @@ class HeapAlgorithm {
  public:
   using Chunk = HeapSimulator::Chunk;
   using Result = HeapSimulator::Result<BufferType>;
+  using HeapResult = HeapSimulator::HeapResult<BufferType>;
 
   virtual ~HeapAlgorithm() = default;
 
@@ -347,6 +357,7 @@ class BufferIntervalTree {
 template <typename BufferType>
 class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
  public:
+  using HeapResult = HeapSimulator::HeapResult<BufferType>;
   using Result = HeapSimulator::Result<BufferType>;
   using Chunk = HeapSimulator::Chunk;
 
@@ -415,6 +426,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
       int64 preferred_offset = -1) const;
   void CommitChunk(const BufferInterval& buffer_interval,
                    ChunkCandidate chunk_candidate);
+  // Adds the buffer and the chunk to the result chunk map.
   virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk);
 
@@ -426,7 +438,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
   BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
 
   absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
-  Result result_;
+  HeapResult result_;
   BufferIntervalCompare buffer_interval_compare_;
   BufferIntervalTree interval_tree_;
 
@@ -444,6 +456,25 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
       const BufferInterval& interval) const;
 };
 
+// This class implements an algorithm that produces multiple heaps, where each
+// heap's size is constrained by a given limit. Note that the constraint is
+// soft, meaning that a valid heap result is generated even if there are some
+// buffer sizes larger than the given constraint size.
+class ConstrainedGlobalDecreasingSizeBestFitHeap
+    : public GlobalDecreasingSizeBestFitHeap<HloValue> {
+ public:
+  explicit ConstrainedGlobalDecreasingSizeBestFitHeap(
+      size_t size_limit_per_heap, int64 alignment, Type type = kSpatial)
+      : GlobalDecreasingSizeBestFitHeap<HloValue>(alignment, type),
+        size_limit_per_heap_(size_limit_per_heap) {}
+  ~ConstrainedGlobalDecreasingSizeBestFitHeap() override {}
+
+  Result Finish() override;
+
+ private:
+  size_t size_limit_per_heap_;
+};
+
 // A heap algorithm that chooses the best results from other algorithms added to
 // it.
 template <typename BufferType>
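A hypothetical helper (not in the patch) that spells out the invariant documented on `Result` above:

```c++
// Sums the per-heap sizes and checks that they match the aggregate heap_size,
// i.e. heap_size == sum([hr.heap_size for hr in heap_results]).
template <typename BufferType>
int64 TotalHeapSize(const xla::HeapSimulator::Result<BufferType>& result) {
  int64 total = 0;
  for (const auto& heap_result : result.heap_results) {
    total += heap_result.heap_size;
  }
  DCHECK_EQ(total, result.heap_size);
  return total;
}
```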
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 8f7668b4965..26305eebb0d 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -256,12 +256,15 @@ class HeapCallRecorder : public HeapAlgorithm<HloValue> {
   }
   Result Finish() override {
     calls_->emplace_back(kFinish, nullptr);
-    return result_;
+    HeapSimulator::Result<HloValue> result;
+    result.heap_size = result_.heap_size;
+    result.heap_results.emplace_back(std::move(result_));
+    return result;
   }
 
  private:
   CallSequence* calls_;
-  Result result_;
+  HeapSimulator::HeapResult<HloValue> result_;
 };
 
 // HeapSimulatorTracker runs the heap simulator, recording the sequence of calls
@@ -335,7 +338,8 @@ class HeapSimulatorTracker {
 
   int64 OffsetAt(const HloInstruction* instruction, const ShapeIndex& index) {
     const HloValue* buffer = BufferAt(instruction, index);
-    return result_.chunk_map.at(buffer).offset;
+    CHECK_EQ(1, result_.heap_results.size());
+    return result_.heap_results.at(0).chunk_map.at(buffer).offset;
   }
 
   // Ensures the expected sequence of Alloc/Free/Finish calls was performed.
@@ -1051,7 +1055,8 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Empty) {
   GlobalDecreasingSizeBestFitHeap<HloValue> heap(/*alignment=*/1);
   const HeapSimulator::Result<HloValue> result = heap.Finish();
   EXPECT_EQ(0, result.heap_size);
-  EXPECT_EQ(0, result.chunk_map.size());
+  EXPECT_EQ(1, result.heap_results.size());
+  EXPECT_EQ(0, result.heap_results.at(0).chunk_map.size());
 }
 
 TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
@@ -1078,7 +1083,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
   heap.Free(buffer_c_, 20);
   heap.Free(buffer_d_, 40);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(100, result.heap_size);
   EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
@@ -1117,7 +1125,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) {
   heap.Free(buffer_c_, 50);
   heap.Free(buffer_d_, 40);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(120, result.heap_size);
   EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@@ -1160,7 +1171,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) {
   heap.Free(buffer_d_, 30);
   heap.Free(buffer_e_, 50);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(140, result.heap_size);
   EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@@ -1192,7 +1206,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) {
   heap.ShareWith(buffer_c_, buffer_a_, 40);
   heap.Free(buffer_c_, 40);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(40, result.heap_size);
   EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@@ -1221,7 +1238,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
   heap.Free(buffer_c_, 40);
   heap.Free(buffer_b_, 20);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(60, result.heap_size);
   EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@@ -1251,7 +1271,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) {
   heap.Free(buffer_c_, 10);
   heap.Free(buffer_b_, 30);
 
-  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  const HeapSimulator::Result<HloValue> results = heap.Finish();
+  EXPECT_EQ(1, results.heap_results.size());
+  const HeapSimulator::HeapResult<HloValue>& result =
+      results.heap_results.at(0);
   EXPECT_EQ(40, result.heap_size);
   EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
   EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
@@ -1311,6 +1334,122 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ChunkCandidate) {
   // Preferred offset 15 could not be given because it is occupied.
 }
 
+class ConstrainedGlobalDecreasingSizeBestFitHeapTest
+    : public HeapAlgorithmTestBase {};
+
+TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
+  // space
+  //   ^
+  //   |      +-------+
+  //   |      +---c---+
+  //   |    +-------+
+  //   |    |   b   |
+  //   |    +-------+
+  //   | ................ // split into two allocations.
+  //   |  +---a---+
+  //   |  +-------+
+  //   |  |       |
+  //   |  |   d   |
+  //   |  +-------+
+  //   -----------------> time
+  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
+                                                  /*alignment=*/1);
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 30);
+  heap.Alloc(buffer_c_, 20);
+  heap.Alloc(buffer_d_, 40);
+  heap.Free(buffer_a_, 10);
+  heap.Free(buffer_b_, 30);
+  heap.Free(buffer_c_, 20);
+  heap.Free(buffer_d_, 40);
+
+  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  EXPECT_EQ(100, result.heap_size);
+  EXPECT_EQ(2, result.heap_results.size());
+
+  EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_a_));
+  EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_d_));
+  EXPECT_EQ(10, result.heap_results[0].chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(40, result.heap_results[0].chunk_map.at(buffer_d_).size);
+  EXPECT_EQ(40, result.heap_results[0].chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_d_).offset);
+}
+
+TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest,
+       DecreasingSizeWithAlignment) {
+  // space
+  //   ^
+  //   |         +-------+
+  //   |         +---b---+
+  //   |      +-------+
+  //   |      |       |
+  //   |      |   d   |
+  //   |      +-------+
+  //   | ...................
+  //   |  +---a---+
+  //   |
+  //   |  +-------+
+  //   |  |       |
+  //   |  |   c   |
+  //   |  |       |
+  //   |  +-------+
+  //   ---------------------> time
+  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/70,
+                                                  /*alignment=*/20);
+  heap.Alloc(buffer_a_, 10);
+  heap.Alloc(buffer_b_, 20);
+  heap.Alloc(buffer_c_, 50);
+  heap.Free(buffer_a_, 10);
+  heap.Alloc(buffer_d_, 40);
+  heap.Free(buffer_b_, 20);
+  heap.Free(buffer_c_, 50);
+  heap.Free(buffer_d_, 40);
+
+  const HeapSimulator::Result<HloValue> result = heap.Finish();
+  EXPECT_EQ(130, result.heap_size);  // 70 + 60
+  EXPECT_EQ(2, result.heap_results.size());
+
+  EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_a_));
+  EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_c_));
+  EXPECT_EQ(10, result.heap_results[0].chunk_map.at(buffer_a_).size);
+  EXPECT_EQ(50, result.heap_results[0].chunk_map.at(buffer_c_).size);
+  EXPECT_EQ(60, result.heap_results[0].chunk_map.at(buffer_a_).offset);
+  EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_c_).offset);
+}
+
+TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
+  // space
+  //   ^
+  //   |       +---------------+
+  //   |       +-------b-------+
+  //   | ....................
+ // |+------+ +-------+ + // || | | | + // || | | | <--- colocate with a + // |+--a---+ +---c---+ + // ---------------------> time + ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50, + /*alignment=*/20); + heap.Alloc(buffer_a_, 30); + heap.Free(buffer_a_, 30); + heap.Alloc(buffer_b_, 20); + + heap.ShareWith(buffer_c_, buffer_a_, 40); + heap.Free(buffer_c_, 40); + heap.Free(buffer_b_, 20); + + const HeapSimulator::Result result = heap.Finish(); + EXPECT_EQ(50, result.heap_size); + EXPECT_EQ(2, result.heap_results.size()); + + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_a_)); + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_c_)); + EXPECT_EQ(30, result.heap_results[0].chunk_map.at(buffer_a_).size); + EXPECT_EQ(30, result.heap_results[0].chunk_map.at(buffer_c_).size); + EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_a_).offset); + EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_c_).offset); +} + class IntervalTreeTest : public ::testing::Test {}; TEST_F(IntervalTreeTest, InsertAndRemove) { diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 6d4b0e65010..604146e3981 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -1127,7 +1127,10 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { VLOG(3) << allocation_info_str_; DumpDebugStringsIfEnabled(); - return result_; + HeapSimulator::Result result; + result.heap_size = result_.heap_size; + result.heap_results.emplace_back(std::move(result_)); + return std::move(result); } void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals( diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 1cf30b10373..f2488c39504 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -290,7 +290,11 @@ message DebugOptions { // Extra parameters to pass the GPU assembler. string xla_gpu_asm_extra_flags = 141; - // Next id: 142 + // Per-heap size constraint. New heaps will be created if per-heap max size is + // reached. + int32 xla_multiheap_size_constraint_per_heap = 142; + + // Next id: 143 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend.