Merge pull request #43850 from trentlo:size_constrained_buf_alloc_upstream

PiperOrigin-RevId: 339431959
Change-Id: Ie2aa8488f0606ea318a7d1b6c2985effdb52d243
This commit is contained in:
TensorFlower Gardener 2020-10-28 04:19:38 -07:00
commit 6c43f200d0
9 changed files with 365 additions and 50 deletions

View File

@ -73,6 +73,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
opts.set_xla_gpu_deterministic_reductions(false);
opts.set_xla_cpu_enable_xprof_traceme(false);
opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false);
opts.set_xla_multiheap_size_constraint_per_heap(-1);
return opts;
}
@ -571,6 +572,17 @@ static void AllocateFlags() {
"that falling back to the driver can have drawbacks like using more "
"memory and/or other bugs during compilation, so we recommend setting "
"this flag to false."));
flag_objects->push_back(tensorflow::Flag(
"xla_multiheap_size_constraint_per_heap",
int32_setter_for(
&DebugOptions::set_xla_multiheap_size_constraint_per_heap),
flag_values->xla_multiheap_size_constraint_per_heap(),
"Generates multiple heaps (i.e., temp buffers) with a size "
"constraint on each heap to avoid Out-of-Memory due to memory "
"fragmentation. The constraint is soft, so it works with tensors "
"larger than the given constraint size. -1 corresponds to no "
"constraints."));
ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
}

View File

@ -607,7 +607,10 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation,
// BufferAllocation.
void BufferAssignment::CombineTempAllocations() {
VLOG(1) << "CombineTempAllocations()";
flat_hash_map<BufferValue::Color, BufferAllocation> combined_allocation_map;
// Stores the combined allocations.
std::deque<BufferAllocation> combined_allocations;
// Holds the pointer to a combined allocation of each color, if any.
flat_hash_map<BufferValue::Color, BufferAllocation*> combined_allocation_map;
// Move all temp allocations into a single run at the end of the allocations
// vector.
@ -621,19 +624,31 @@ void BufferAssignment::CombineTempAllocations() {
// to the same color.
if (first_temp_it != allocations_.end()) {
for (auto it = first_temp_it; it != allocations_.end(); ++it) {
const BufferAllocation& temp_allocation = *it;
BufferAllocation& temp_allocation = *it;
BufferValue::Color color = temp_allocation.color();
auto combined_it = combined_allocation_map.find(color);
if (combined_it == combined_allocation_map.end()) {
// We have found the first temp allocation of this color. Collect
// the other temp allocations of the same color into it.
// the other temp allocations of the same color into it subject to the
// size constraint.
VLOG(1) << "Combined temp allocation for color " << color
<< " is: " << temp_allocation;
combined_allocation_map.emplace(color, temp_allocation);
combined_allocations.emplace_back(temp_allocation);
combined_allocation_map.emplace(color, &combined_allocations.back());
continue;
}
if (combined_it->second->size() + it->size() >=
multiheap_size_constraint_per_heap_) {
// We cannot put more into the current combined_it. So, appoint a new
// combined_it.
VLOG(1) << "Due to size constraint, reset temp allocation for color "
<< color << " to: " << temp_allocation;
combined_allocations.emplace_back(temp_allocation);
combined_allocation_map.emplace(color, &combined_allocations.back());
continue;
}
auto* combined_allocation = &combined_it->second;
BufferAllocation* combined_allocation = combined_it->second;
VLOG(1) << "Combined allocation absorbing temp allocation: "
<< temp_allocation;
@ -663,9 +678,9 @@ void BufferAssignment::CombineTempAllocations() {
// Replace all existing temporary allocations with the new combined
// allocations.
allocations_.erase(first_temp_it, allocations_.end());
for (auto& combined : combined_allocation_map) {
allocations_.push_back(combined.second);
temp_allocation_total_size_ += combined.second.size();
for (BufferAllocation& combined : combined_allocations) {
temp_allocation_total_size_ += combined.size();
allocations_.push_back(std::move(combined));
}
}
@ -1331,11 +1346,13 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
auto algorithms = absl::make_unique<
std::vector<std::unique_ptr<HeapAlgorithm<HloValue>>>>();
algorithms->push_back(
absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
assignment->multiheap_size_constraint_per_heap(), alignment,
GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
algorithms->push_back(
absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
assignment->multiheap_size_constraint_per_heap(), alignment,
GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
return absl::make_unique<ChooseBestHeapAlgorithm<HloValue>>(
std::move(algorithms));
};
@ -1444,6 +1461,12 @@ std::vector<const HloValue*> ComputePeakMemoryLogicalBuffers(
int64 max_live_size = 0;
int64 live_size = 0;
for (const auto& event : heap_trace.events()) {
if (!id_to_value.contains(event.buffer_id())) {
// Skip as the buffer associated with this trace event is not placed into
// this allocation. This can happen when size constraints are given to the
// heap simulator.
continue;
}
live_size += memory_delta(event);
if (max_live_size < live_size) {
max_live_size = live_size;
@ -1455,6 +1478,12 @@ std::vector<const HloValue*> ComputePeakMemoryLogicalBuffers(
absl::flat_hash_set<const HloValue*> live_values;
live_size = 0;
for (const auto& event : heap_trace.events()) {
if (!id_to_value.contains(event.buffer_id())) {
// Skip as the buffer associated with this trace event is not placed into
// this allocation. This can happen when size constraints are given to the
// heap simulator.
continue;
}
const HloValue* value = id_to_value.at(event.buffer_id());
if (event.kind() == HeapSimulatorTrace::Event::ALLOC ||
event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) {
@ -1500,20 +1529,24 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
}
VLOG(1) << "Result size from heap simulator: " << result.heap_size;
BufferAllocation* allocation =
assignment->NewEmptyAllocation(result.heap_size, color);
for (const auto& buffer_chunk : result.chunk_map) {
const HloValue& value = *buffer_chunk.first;
const HeapSimulator::Chunk& chunk = buffer_chunk.second;
assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
// Iterate through heap_results. For each heap_result, create a new allocation
// in `assignment`.
for (const HeapSimulator::HeapResult<HloValue>& heap_result :
result.heap_results) {
BufferAllocation* allocation =
assignment->NewEmptyAllocation(heap_result.heap_size, color);
for (const auto& buffer_chunk : heap_result.chunk_map) {
const HloValue& value = *buffer_chunk.first;
const HeapSimulator::Chunk& chunk = buffer_chunk.second;
assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
}
allocation->peak_buffers_ =
ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
XLA_VLOG_LINES(2, allocation->ToString());
allocation->AddHeapTrace(result.debug_trace);
}
allocation->peak_buffers_ =
ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
VLOG(1) << "Ran heap simulation for allocation: ";
XLA_VLOG_LINES(2, allocation->ToString());
allocation->AddHeapTrace(result.debug_trace);
}
StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
@ -1580,6 +1613,10 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
buffers_to_assign_sequentially.size() == global_computations.size();
VLOG(2) << "Running whole module heap simulation: "
<< run_whole_module_heap_simulation;
const int32 multiheap_size_constraint_per_heap =
module->config().debug_options().xla_multiheap_size_constraint_per_heap();
VLOG(2) << "Multiheap per heap size limit: "
<< multiheap_size_constraint_per_heap;
TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
buffers_to_assign_sequentially, run_whole_module_heap_simulation,
assignment.get()));
@ -1614,10 +1651,11 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
}
}
// Combines allocations of temporary buffers into one big BufferAllocation.
// This can only be performed after all buffers have been assigned, and
// after maybe_live_out is marked, since it is used to determine whether an
// allocation contains temporary buffers or not.
// Combines allocations of temporary buffers into big BufferAllocations
// subject to the buffer allocation size constraint. This can only be
// performed after all buffers have been assigned, and after maybe_live_out
// is marked, since it is used to determine whether an allocation contains
// temporary buffers or not.
assignment->CombineTempAllocations();
XLA_VLOG_LINES(2, assignment->ToString());

View File

@ -363,6 +363,10 @@ class BufferAssignment {
return temp_allocation_total_size_;
}
// Returns the per-heap size limit applied when temp buffers are split into
// multiple heaps. The value is compared against heap/allocation sizes; it is
// UINT64_MAX when the xla_multiheap_size_constraint_per_heap debug option is
// -1, i.e. when no constraint is requested.
uint64 multiheap_size_constraint_per_heap() const {
return multiheap_size_constraint_per_heap_;
}
// Returns whether the given buffer has been assigned an allocation.
bool HasAllocation(const HloValue& value) const;
@ -491,7 +495,14 @@ class BufferAssignment {
buffer_size_(std::move(buffer_size)),
color_alignment_(std::move(color_alignment)),
alias_analysis_(std::move(alias_analysis)),
hlo_live_range_(std::move(hlo_live_range)) {}
hlo_live_range_(std::move(hlo_live_range)) {
int32 raw_value = module->config()
.debug_options()
.xla_multiheap_size_constraint_per_heap();
// -1 means no constraint.
multiheap_size_constraint_per_heap_ =
(raw_value == -1) ? UINT64_MAX : raw_value;
}
// Creates and returns a new BufferAllocation, with no assigned
// LogicalBuffers. Ownership is maintained internally.
@ -535,6 +546,8 @@ class BufferAssignment {
// The total size of all temporary buffers.
int64 temp_allocation_total_size_ = 0;
uint64 multiheap_size_constraint_per_heap_;
// Maps Buffers to the index of the BufferAllocation which holds the buffer.
absl::flat_hash_map<const HloValue*, BufferAllocation::Index>
allocation_index_for_value_;

View File

@ -2346,9 +2346,8 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices(
for (const BufferAllocation& alloc : buffer_assn.Allocations()) {
if (alloc.IsPreallocatedTempBuffer()) {
if (!temp_buffer.has_value()) {
// Retrieve the first seen temp buffer.
temp_buffer = &alloc;
} else {
LOG(FATAL) << "Multiple temp buffers found, but only one is allowed!";
}
}
}

View File

@ -409,11 +409,16 @@ HeapSimulator::Result<HloValue> HeapSimulator::Finish() {
// Post-process the result to add chunks for shared buffers. An empty chunk
// map means that either no buffers were allocated, or the heap was only
// collecting statistics, e.g. NoFragmentationStatsHeap.
if (!result.chunk_map.empty()) {
size_t total_chunk_count = absl::c_accumulate(
result.heap_results, static_cast<size_t>(0),
[&](size_t lhs, const HeapResult<HloValue>& rhs) -> size_t {
return lhs + rhs.chunk_map.size();
});
if (total_chunk_count != 0) {
// If we were told to assign specific buffers, make sure we've assigned
// exactly that many buffers.
if (options_.buffers_to_assign != nullptr) {
CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size());
CHECK_EQ(options_.buffers_to_assign->size(), total_chunk_count);
}
}
@ -825,7 +830,10 @@ GlobalDecreasingSizeBestFitHeap<BufferType>::Finish() {
CommitChunk(buffer_interval, chunk_candidate);
}
VLOG(1) << "result heap_size: " << result_.heap_size;
return result_;
Result result;
result.heap_size = result_.heap_size;
result.heap_results.emplace_back(result_);
return result;
}
template <typename BufferType>
@ -968,6 +976,58 @@ void GlobalDecreasingSizeBestFitHeap<BufferType>::AddToChunkMap(
DCHECK(emplace_result.second);
}
// Distributes the recorded buffers over one or more heaps, each limited by
// `size_limit_per_heap_`, and returns one HeapResult per heap.
//
// Buffers are visited in the order produced by GetSortedBufferIntervals().
// While a heap is being filled, every still-unplaced buffer is tried once: it
// is committed when the resulting heap size stays within the limit, or when
// the current heap size is still 0 (the constraint is soft, so a buffer larger
// than the limit is still placed). Buffers that do not fit stay in the pending
// list and are retried against the next heap. Intervals that do not need an
// allocation are dropped from the pending list without being placed.
//
// The returned Result::heap_size is the sum of the sizes of all produced
// heaps. At least one HeapResult is always returned, even when there are no
// buffers at all.
HeapSimulator::Result<HloValue>
ConstrainedGlobalDecreasingSizeBestFitHeap::Finish() {
  std::vector<BufferInterval> sorted_buffer_vec = GetSortedBufferIntervals();
  // Convert into std::list so that erase() is O(1).
  std::list<BufferInterval> sorted_buffer_intervals(sorted_buffer_vec.begin(),
                                                    sorted_buffer_vec.end());

  // Use do-while here, because we need to create 1 heap in `multi_heap_result`
  // even if `sorted_buffer_intervals` is empty.
  Result multi_heap_result;
  do {
    // Place as many buffers as possible into the currently processed heap.
    for (auto it = sorted_buffer_intervals.begin();
         it != sorted_buffer_intervals.end();) {
      // Bind by reference instead of copying: the interval is only read, and
      // every read happens before its list node is erased below.
      const BufferInterval& buffer_interval = *it;
      if (!buffer_interval.need_allocation) {
        it = sorted_buffer_intervals.erase(it);
        continue;
      }
      if (buffer_interval.size > size_limit_per_heap_) {
        LOG(WARNING) << "Alloc buffer size " << buffer_interval.size
                     << " larger than the per-heap size limit "
                     << size_limit_per_heap_;
      }

      ChunkCandidate chunk_candidate = FindChunkCandidate(buffer_interval);
      if (chunk_candidate.heap_size <= size_limit_per_heap_ ||
          // Commit the chunk as long as the heap is empty. We do this because
          // we want the size constraint to be soft, meaning that results are
          // successfully generated even if there are some buffer sizes larger
          // than the given constraint size.
          result_.heap_size == 0) {
        CommitChunk(buffer_interval, chunk_candidate);
        it = sorted_buffer_intervals.erase(it);
        continue;
      }

      // Does not fit into this heap; keep it pending for the next one.
      ++it;
    }
    // Collect the result from the currently processed heap and reset the heap
    // states so that the next pass starts from an empty heap.
    multi_heap_result.heap_size += result_.heap_size;
    multi_heap_result.heap_results.push_back(std::move(result_));
    result_ = {};
    interval_tree_ = {};
  } while (!sorted_buffer_intervals.empty());

  VLOG(1) << "Number of heaps produced = "
          << multi_heap_result.heap_results.size();
  return multi_heap_result;
}
template <typename BufferType>
HeapSimulator::Result<BufferType>
ChooseBestHeapAlgorithm<BufferType>::Finish() {

View File

@ -67,14 +67,23 @@ class HeapSimulator {
}
};
// Result represents the result of the heap simulation.
template <typename BufferType>
struct Result {
struct HeapResult {
// The assignment of buffers to chunks.
absl::flat_hash_map<const BufferType*, Chunk> chunk_map;
// The total size in bytes of the heap, containing all assigned chunks.
int64 heap_size = 0;
};
// Result represents the result of the heap simulation.
template <typename BufferType>
struct Result {
// Heap results.
std::vector<HeapResult<BufferType>> heap_results;
// The total size in bytes of the heaps.
// heap_size == sum([hr.heap_size for hr in heap_results]).
int64 heap_size = 0;
// The total size in bytes of heap fragmentation.
int64 fragmentation_size = 0;
@ -229,6 +238,7 @@ class HeapAlgorithm {
public:
using Chunk = HeapSimulator::Chunk;
using Result = HeapSimulator::Result<BufferType>;
using HeapResult = HeapSimulator::HeapResult<BufferType>;
virtual ~HeapAlgorithm() = default;
@ -347,6 +357,7 @@ class BufferIntervalTree {
template <typename BufferType>
class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
public:
using HeapResult = HeapSimulator::HeapResult<BufferType>;
using Result = HeapSimulator::Result<BufferType>;
using Chunk = HeapSimulator::Chunk;
@ -415,6 +426,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
int64 preferred_offset = -1) const;
void CommitChunk(const BufferInterval& buffer_interval,
ChunkCandidate chunk_candidate);
// Adds the buffer and the chunk to the result chunk map.
virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk);
@ -426,7 +438,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
Result result_;
HeapResult result_;
BufferIntervalCompare buffer_interval_compare_;
BufferIntervalTree interval_tree_;
@ -444,6 +456,41 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
const BufferInterval& interval) const;
};
// A GlobalDecreasingSizeBestFitHeap variant that may emit several heaps
// instead of one, keeping each heap within `size_limit_per_heap`.
//
// The limit is soft: a buffer that is larger than the limit is still placed
// (into a heap that is otherwise empty), so a valid result is always
// produced.
//
// Sketch of the algorithm:
//   while (unplaced buffers remain) {
//     start a new heap `h`
//     for (each unplaced buffer `buf`, in size-decreasing order) {
//       if (`h` is empty || `h` with `buf` added stays within the limit) {
//         place `buf` into `h` and mark it as placed
//       }
//       // Otherwise `buf` is left for a later heap.
//     }
//   }
class ConstrainedGlobalDecreasingSizeBestFitHeap
    : public GlobalDecreasingSizeBestFitHeap<HloValue> {
 public:
  // `size_limit_per_heap` is the soft per-heap size limit; `alignment` and
  // `type` are forwarded unchanged to the base heap.
  explicit ConstrainedGlobalDecreasingSizeBestFitHeap(
      uint64 size_limit_per_heap, int64 alignment, Type type = kSpatial)
      : GlobalDecreasingSizeBestFitHeap<HloValue>(alignment, type),
        size_limit_per_heap_(size_limit_per_heap) {}
  ~ConstrainedGlobalDecreasingSizeBestFitHeap() override = default;

  // Produces the multi-heap result.
  Result Finish() override;

 private:
  // Soft upper bound on the size of each produced heap.
  uint64 size_limit_per_heap_;
};
// A heap algorithm that chooses the best results from other algorithms added to
// it.
template <typename BufferType>

View File

@ -256,12 +256,15 @@ class HeapCallRecorder : public HeapAlgorithm<HloValue> {
}
Result Finish() override {
calls_->emplace_back(kFinish, nullptr);
return result_;
HeapSimulator::Result<HloValue> result;
result.heap_size = result_.heap_size;
result.heap_results.emplace_back(std::move(result_));
return result;
}
private:
CallSequence* calls_;
Result result_;
HeapSimulator::HeapResult<HloValue> result_;
};
// HeapSimulatorTracker runs the heap simulator, recording the sequence of calls
@ -335,7 +338,8 @@ class HeapSimulatorTracker {
int64 OffsetAt(const HloInstruction* instruction, const ShapeIndex& index) {
const HloValue* buffer = BufferAt(instruction, index);
return result_.chunk_map.at(buffer).offset;
CHECK_EQ(1, result_.heap_results.size());
return result_.heap_results.at(0).chunk_map.at(buffer).offset;
}
// Ensures the expected sequence of Alloc/Free/Finish calls was performed.
@ -1051,7 +1055,8 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Empty) {
GlobalDecreasingSizeBestFitHeap<HloValue> heap(/*alignment=*/1);
const HeapSimulator::Result<HloValue> result = heap.Finish();
EXPECT_EQ(0, result.heap_size);
EXPECT_EQ(0, result.chunk_map.size());
EXPECT_EQ(1, result.heap_results.size());
EXPECT_EQ(0, result.heap_results.at(0).chunk_map.size());
}
TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
@ -1078,7 +1083,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
heap.Free(buffer_c_, 20);
heap.Free(buffer_d_, 40);
const HeapSimulator::Result<HloValue> result = heap.Finish();
const HeapSimulator::Result<HloValue> results = heap.Finish();
EXPECT_EQ(1, results.heap_results.size());
const HeapSimulator::HeapResult<HloValue>& result =
results.heap_results.at(0);
EXPECT_EQ(100, result.heap_size);
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
@ -1117,7 +1125,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) {
heap.Free(buffer_c_, 50);
heap.Free(buffer_d_, 40);
const HeapSimulator::Result<HloValue> result = heap.Finish();
const HeapSimulator::Result<HloValue> results = heap.Finish();
EXPECT_EQ(1, results.heap_results.size());
const HeapSimulator::HeapResult<HloValue>& result =
results.heap_results.at(0);
EXPECT_EQ(120, result.heap_size);
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@ -1160,7 +1171,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) {
heap.Free(buffer_d_, 30);
heap.Free(buffer_e_, 50);
const HeapSimulator::Result<HloValue> result = heap.Finish();
const HeapSimulator::Result<HloValue> results = heap.Finish();
EXPECT_EQ(1, results.heap_results.size());
const HeapSimulator::HeapResult<HloValue>& result =
results.heap_results.at(0);
EXPECT_EQ(140, result.heap_size);
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@ -1192,7 +1206,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) {
heap.ShareWith(buffer_c_, buffer_a_, 40);
heap.Free(buffer_c_, 40);
const HeapSimulator::Result<HloValue> result = heap.Finish();
const HeapSimulator::Result<HloValue> results = heap.Finish();
EXPECT_EQ(1, results.heap_results.size());
const HeapSimulator::HeapResult<HloValue>& result =
results.heap_results.at(0);
EXPECT_EQ(40, result.heap_size);
EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@ -1221,7 +1238,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
heap.Free(buffer_c_, 40);
heap.Free(buffer_b_, 20);
const HeapSimulator::Result<HloValue> result = heap.Finish();
const HeapSimulator::Result<HloValue> results = heap.Finish();
EXPECT_EQ(1, results.heap_results.size());
const HeapSimulator::HeapResult<HloValue>& result =
results.heap_results.at(0);
EXPECT_EQ(60, result.heap_size);
EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
@ -1251,7 +1271,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) {
heap.Free(buffer_c_, 10);
heap.Free(buffer_b_, 30);
const HeapSimulator::Result<HloValue> result = heap.Finish();
const HeapSimulator::Result<HloValue> results = heap.Finish();
EXPECT_EQ(1, results.heap_results.size());
const HeapSimulator::HeapResult<HloValue>& result =
results.heap_results.at(0);
EXPECT_EQ(40, result.heap_size);
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
@ -1311,6 +1334,122 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ChunkCandidate) {
// Preferred offset 15 could not be given because it is occupied.
}
// Fixture for the ConstrainedGlobalDecreasingSizeBestFitHeap tests. It adds
// nothing of its own; the tests rely on buffer_a_ .. buffer_d_, which are
// presumably provided by HeapAlgorithmTestBase (not visible here; confirm).
class ConstrainedGlobalDecreasingSizeBestFitHeapTest
: public HeapAlgorithmTestBase {};
// Four simultaneously live buffers with a per-heap limit of 50 must be split
// into two heaps: {d, a} and {b, c}.
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
  // space
  //   ^
  //   |      +-------+
  //   |      +---c---+
  //   |    +-------+
  //   |    |   b   |
  //   |    +-------+
  //   | ................ // split into two allocations.
  //   |  +---a---+
  //   |         +-------+
  //   |         |       |
  //   |         |   d   |
  //   |         +-------+
  //   -----------------> time
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
                                                  /*alignment=*/1);
  heap.Alloc(buffer_a_, 10);
  heap.Alloc(buffer_b_, 30);
  heap.Alloc(buffer_c_, 20);
  heap.Alloc(buffer_d_, 40);
  heap.Free(buffer_a_, 10);
  heap.Free(buffer_b_, 30);
  heap.Free(buffer_c_, 20);
  heap.Free(buffer_d_, 40);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  EXPECT_EQ(100, result.heap_size);
  // Abort on a wrong heap count: the checks below index both heaps.
  ASSERT_EQ(2, result.heap_results.size());

  // First heap: d (40) is placed first, then a (10) fills it up to the limit.
  const HeapSimulator::HeapResult<HloValue>& first_heap =
      result.heap_results[0];
  EXPECT_EQ(50, first_heap.heap_size);
  EXPECT_TRUE(first_heap.chunk_map.contains(buffer_a_));
  EXPECT_TRUE(first_heap.chunk_map.contains(buffer_d_));
  EXPECT_EQ(10, first_heap.chunk_map.at(buffer_a_).size);
  EXPECT_EQ(40, first_heap.chunk_map.at(buffer_d_).size);
  EXPECT_EQ(40, first_heap.chunk_map.at(buffer_a_).offset);
  EXPECT_EQ(0, first_heap.chunk_map.at(buffer_d_).offset);

  // Second heap: the buffers that did not fit, b (30) followed by c (20).
  const HeapSimulator::HeapResult<HloValue>& second_heap =
      result.heap_results[1];
  EXPECT_EQ(50, second_heap.heap_size);
  EXPECT_TRUE(second_heap.chunk_map.contains(buffer_b_));
  EXPECT_TRUE(second_heap.chunk_map.contains(buffer_c_));
  EXPECT_EQ(30, second_heap.chunk_map.at(buffer_b_).size);
  EXPECT_EQ(20, second_heap.chunk_map.at(buffer_c_).size);
  EXPECT_EQ(0, second_heap.chunk_map.at(buffer_b_).offset);
  EXPECT_EQ(30, second_heap.chunk_map.at(buffer_c_).offset);
}
// With alignment 20 and a per-heap limit of 70, the buffers are split into two
// heaps: {c, a} (size 70) and {d, b} (size 60).
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest,
       DecreasingSizeWithAlignment) {
  // space
  //   ^
  //   |      +-------+
  //   |      +---b---+
  //   |            +-------+
  //   |            |       |
  //   |            |   d   |
  //   |            +-------+
  //   | ...................
  //   |  +---a---+
  //   |
  //   |         +-------+
  //   |         |       |
  //   |         |   c   |
  //   |         |       |
  //   |         +-------+
  //   ---------------------> time
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/70,
                                                  /*alignment=*/20);
  heap.Alloc(buffer_a_, 10);
  heap.Alloc(buffer_b_, 20);
  heap.Alloc(buffer_c_, 50);
  heap.Free(buffer_a_, 10);
  heap.Alloc(buffer_d_, 40);
  heap.Free(buffer_b_, 20);
  heap.Free(buffer_c_, 50);
  heap.Free(buffer_d_, 40);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  EXPECT_EQ(130, result.heap_size);  // 70 + 60
  // Abort on a wrong heap count: the checks below index both heaps.
  ASSERT_EQ(2, result.heap_results.size());

  // First heap: c at offset 0, a at the next aligned offset (60).
  const HeapSimulator::HeapResult<HloValue>& first_heap =
      result.heap_results[0];
  EXPECT_EQ(70, first_heap.heap_size);
  EXPECT_TRUE(first_heap.chunk_map.contains(buffer_a_));
  EXPECT_TRUE(first_heap.chunk_map.contains(buffer_c_));
  EXPECT_EQ(10, first_heap.chunk_map.at(buffer_a_).size);
  EXPECT_EQ(50, first_heap.chunk_map.at(buffer_c_).size);
  EXPECT_EQ(60, first_heap.chunk_map.at(buffer_a_).offset);
  EXPECT_EQ(0, first_heap.chunk_map.at(buffer_c_).offset);

  // Second heap: d at offset 0, b at the next aligned offset (40).
  const HeapSimulator::HeapResult<HloValue>& second_heap =
      result.heap_results[1];
  EXPECT_EQ(60, second_heap.heap_size);
  EXPECT_TRUE(second_heap.chunk_map.contains(buffer_b_));
  EXPECT_TRUE(second_heap.chunk_map.contains(buffer_d_));
  EXPECT_EQ(20, second_heap.chunk_map.at(buffer_b_).size);
  EXPECT_EQ(40, second_heap.chunk_map.at(buffer_d_).size);
  EXPECT_EQ(40, second_heap.chunk_map.at(buffer_b_).offset);
  EXPECT_EQ(0, second_heap.chunk_map.at(buffer_d_).offset);
}
// A colocated pair (a, c) shares one chunk in the first heap; b does not fit
// under the per-heap limit of 50 next to it and goes into a second heap.
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
  // space
  //   ^
  //   |       +---------------+
  //   |       +-------b-------+
  //   | ....................
  //   |+------+      +-------+
  //   ||      |      |       |
  //   ||      |      |       | <--- colocate with a
  //   |+--a---+      +---c---+
  //   ---------------------> time
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
                                                  /*alignment=*/20);
  heap.Alloc(buffer_a_, 30);
  heap.Free(buffer_a_, 30);
  heap.Alloc(buffer_b_, 20);

  // NOTE(review): c is registered with size 40 although a is 30; the
  // assertions below expect c to reuse a's 30-sized chunk. Confirm whether
  // 40 is intentional.
  heap.ShareWith(buffer_c_, buffer_a_, 40);
  heap.Free(buffer_c_, 40);
  heap.Free(buffer_b_, 20);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  EXPECT_EQ(50, result.heap_size);
  // Abort on a wrong heap count: the checks below index both heaps.
  ASSERT_EQ(2, result.heap_results.size());

  // First heap: a and its colocated buffer c share the chunk at offset 0.
  const HeapSimulator::HeapResult<HloValue>& first_heap =
      result.heap_results[0];
  EXPECT_EQ(30, first_heap.heap_size);
  EXPECT_TRUE(first_heap.chunk_map.contains(buffer_a_));
  EXPECT_TRUE(first_heap.chunk_map.contains(buffer_c_));
  EXPECT_EQ(30, first_heap.chunk_map.at(buffer_a_).size);
  EXPECT_EQ(30, first_heap.chunk_map.at(buffer_c_).size);
  EXPECT_EQ(0, first_heap.chunk_map.at(buffer_a_).offset);
  EXPECT_EQ(0, first_heap.chunk_map.at(buffer_c_).offset);

  // Second heap: b alone, at the start of a fresh heap.
  const HeapSimulator::HeapResult<HloValue>& second_heap =
      result.heap_results[1];
  EXPECT_EQ(20, second_heap.heap_size);
  EXPECT_TRUE(second_heap.chunk_map.contains(buffer_b_));
  EXPECT_EQ(20, second_heap.chunk_map.at(buffer_b_).size);
  EXPECT_EQ(0, second_heap.chunk_map.at(buffer_b_).offset);
}
class IntervalTreeTest : public ::testing::Test {};
TEST_F(IntervalTreeTest, InsertAndRemove) {

View File

@ -1250,7 +1250,10 @@ HeapSimulator::Result<HloValue> AlternateMemoryBestFitHeap::Finish() {
VLOG(3) << allocation_info_str_;
DumpDebugStringsIfEnabled();
return result_;
HeapSimulator::Result<HloValue> result;
result.heap_size = result_.heap_size;
result.heap_results.emplace_back(std::move(result_));
return result;
}
void AlternateMemoryBestFitHeap::AddRequiredAssignmentsForColocatedIntervals(

View File

@ -290,7 +290,11 @@ message DebugOptions {
// Extra parameters to pass the GPU assembler.
string xla_gpu_asm_extra_flags = 141;
// Next id: 142
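// Per-heap size constraint, in bytes. When set, temp buffers are spread over
// multiple heaps so that each heap stays within this size; the limit is soft,
// so a single buffer larger than it is still allocated. -1 means no constraint.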
int32 xla_multiheap_size_constraint_per_heap = 142;
// Next id: 143
// Extra options to pass to the compilation backend (e.g. LLVM); specific
// interpretation of these values is left to the backend.