[XLA/GPU] Size-constrained buffer allocation.
This change gives XLA the capability to generate multiple heaps (i.e., temp buffers) with a size constraint on each heap, in order to avoid out-of-memory errors caused by memory fragmentation. Note that larger allocations are more susceptible to fragmentation.
This commit is contained in:
parent
9fd6313f37
commit
34dcd396d9
@ -73,6 +73,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
|
||||
opts.set_xla_gpu_deterministic_reductions(false);
|
||||
opts.set_xla_cpu_enable_xprof_traceme(false);
|
||||
opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false);
|
||||
opts.set_xla_multiheap_size_constraint_per_heap(-1);
|
||||
|
||||
return opts;
|
||||
}
|
||||
@ -571,6 +572,16 @@ static void AllocateFlags() {
|
||||
"that falling back to the driver can have drawbacks like using more "
|
||||
"memory and/or other bugs during compilation, so we recommend setting "
|
||||
"this flag to false."));
|
||||
flag_objects->push_back(tensorflow::Flag(
|
||||
"xla_multiheap_size_constraint_per_heap",
|
||||
int32_setter_for(
|
||||
&DebugOptions::set_xla_multiheap_size_constraint_per_heap),
|
||||
flag_values->xla_multiheap_size_constraint_per_heap(),
|
||||
"Generates multiple heaps (i.e., temp buffers) with a size "
|
||||
"constraint on each heap to avoid Out-of-Memory due to memory "
|
||||
"fragmentation. The constraint is soft, so it works with tensors "
|
||||
"larger than the given constraint size."));
|
||||
|
||||
ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
|
||||
}
|
||||
|
||||
|
||||
@ -1330,12 +1330,23 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
|
||||
auto get_heap_algorithm = [&](int64 alignment) {
|
||||
auto algorithms = absl::make_unique<
|
||||
std::vector<std::unique_ptr<HeapAlgorithm<HloValue>>>>();
|
||||
algorithms->push_back(
|
||||
absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
|
||||
alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
|
||||
algorithms->push_back(
|
||||
absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
|
||||
alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
|
||||
if (assignment->multiheap_size_constraint_per_heap() == -1) {
|
||||
algorithms->push_back(
|
||||
absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
|
||||
alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
|
||||
algorithms->push_back(
|
||||
absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
|
||||
alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
|
||||
} else {
|
||||
algorithms->push_back(
|
||||
absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
|
||||
assignment->multiheap_size_constraint_per_heap(), alignment,
|
||||
GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
|
||||
algorithms->push_back(
|
||||
absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
|
||||
assignment->multiheap_size_constraint_per_heap(), alignment,
|
||||
GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
|
||||
}
|
||||
return absl::make_unique<ChooseBestHeapAlgorithm<HloValue>>(
|
||||
std::move(algorithms));
|
||||
};
|
||||
@ -1500,20 +1511,25 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
|
||||
}
|
||||
VLOG(1) << "Result size from heap simulator: " << result.heap_size;
|
||||
|
||||
BufferAllocation* allocation =
|
||||
assignment->NewEmptyAllocation(result.heap_size, color);
|
||||
for (const auto& buffer_chunk : result.chunk_map) {
|
||||
const HloValue& value = *buffer_chunk.first;
|
||||
const HeapSimulator::Chunk& chunk = buffer_chunk.second;
|
||||
assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
|
||||
for (auto& heap_result : result.heap_results) {
|
||||
BufferAllocation* allocation =
|
||||
assignment->NewEmptyAllocation(heap_result.heap_size, color);
|
||||
for (const auto& buffer_chunk : heap_result.chunk_map) {
|
||||
const HloValue& value = *buffer_chunk.first;
|
||||
const HeapSimulator::Chunk& chunk = buffer_chunk.second;
|
||||
assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
|
||||
}
|
||||
// Compute peak_buffers only when the multiheap mode is off. Simply return
|
||||
// an empty vector in the multiheap mode.
|
||||
if (assignment->multiheap_size_constraint_per_heap() == -1) {
|
||||
allocation->peak_buffers_ =
|
||||
ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
|
||||
}
|
||||
|
||||
XLA_VLOG_LINES(2, allocation->ToString());
|
||||
|
||||
allocation->AddHeapTrace(result.debug_trace);
|
||||
}
|
||||
allocation->peak_buffers_ =
|
||||
ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
|
||||
|
||||
VLOG(1) << "Ran heap simulation for allocation: ";
|
||||
XLA_VLOG_LINES(2, allocation->ToString());
|
||||
|
||||
allocation->AddHeapTrace(result.debug_trace);
|
||||
}
|
||||
|
||||
StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
|
||||
@ -1580,6 +1596,10 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
|
||||
buffers_to_assign_sequentially.size() == global_computations.size();
|
||||
VLOG(2) << "Running whole module heap simulation: "
|
||||
<< run_whole_module_heap_simulation;
|
||||
const int32 multiheap_size_constraint_per_heap =
|
||||
module->config().debug_options().xla_multiheap_size_constraint_per_heap();
|
||||
VLOG(2) << "Multiheap per heap size limit: "
|
||||
<< multiheap_size_constraint_per_heap;
|
||||
TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
|
||||
buffers_to_assign_sequentially, run_whole_module_heap_simulation,
|
||||
assignment.get()));
|
||||
@ -1618,7 +1638,9 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
|
||||
// This can only be performed after all buffers have been assigned, and
|
||||
// after maybe_live_out is marked, since it is used to determine whether an
|
||||
// allocation contains temporary buffers or not.
|
||||
assignment->CombineTempAllocations();
|
||||
if (multiheap_size_constraint_per_heap == -1) {
|
||||
assignment->CombineTempAllocations();
|
||||
}
|
||||
|
||||
XLA_VLOG_LINES(2, assignment->ToString());
|
||||
TF_RETURN_IF_ERROR(assignment->ComputeSummaryStats());
|
||||
|
||||
@ -363,6 +363,10 @@ class BufferAssignment {
|
||||
return temp_allocation_total_size_;
|
||||
}
|
||||
|
||||
// Returns the per-heap size constraint captured from the module's
// xla_multiheap_size_constraint_per_heap debug option when this assignment was
// constructed. Callers in the buffer assigner treat -1 as "multiheap mode off".
int32 multiheap_size_constraint_per_heap() const {
|
||||
return multiheap_size_constraint_per_heap_;
|
||||
}
|
||||
|
||||
// Returns whether the given buffer has been assigned an allocation.
|
||||
bool HasAllocation(const HloValue& value) const;
|
||||
|
||||
@ -491,7 +495,11 @@ class BufferAssignment {
|
||||
buffer_size_(std::move(buffer_size)),
|
||||
color_alignment_(std::move(color_alignment)),
|
||||
alias_analysis_(std::move(alias_analysis)),
|
||||
hlo_live_range_(std::move(hlo_live_range)) {}
|
||||
hlo_live_range_(std::move(hlo_live_range)),
|
||||
multiheap_size_constraint_per_heap_(
|
||||
module->config()
|
||||
.debug_options()
|
||||
.xla_multiheap_size_constraint_per_heap()) {}
|
||||
|
||||
// Creates and returns a new BufferAllocation, with no assigned
|
||||
// LogicalBuffers. Ownership is maintained internally.
|
||||
@ -535,6 +543,8 @@ class BufferAssignment {
|
||||
// The total size of all temporary buffers.
|
||||
int64 temp_allocation_total_size_ = 0;
|
||||
|
||||
int32 multiheap_size_constraint_per_heap_;
|
||||
|
||||
// Maps Buffers to the index of the BufferAllocation which holds the buffer.
|
||||
absl::flat_hash_map<const HloValue*, BufferAllocation::Index>
|
||||
allocation_index_for_value_;
|
||||
|
||||
@ -1857,7 +1857,8 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices(
|
||||
absl::string_view name, Thunk::ThunkInfo thunk_info,
|
||||
absl::Span<const BufferSlice* const> slices,
|
||||
std::function<void(const BufferSlice*, llvm::Value*)>
|
||||
bind_slice_to_ir_value) {
|
||||
bind_slice_to_ir_value,
|
||||
bool insist_single_temp_buffer) {
|
||||
const auto& buffer_assn = ir_emitter_context_->buffer_assignment();
|
||||
|
||||
// Figure out which buffer allocations need to be passed as arguments to our
|
||||
@ -1874,7 +1875,7 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices(
|
||||
if (alloc.IsPreallocatedTempBuffer()) {
|
||||
if (!temp_buffer.has_value()) {
|
||||
temp_buffer = &alloc;
|
||||
} else {
|
||||
} else if (insist_single_temp_buffer) {
|
||||
LOG(FATAL) << "Multiple temp buffers found, but only one is allowed!";
|
||||
}
|
||||
}
|
||||
@ -1995,7 +1996,13 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
|
||||
<< hlo_buffer_slice->gte_index.ToString();
|
||||
|
||||
bindings_.BindHloToIrValue(*instr, value, index);
|
||||
});
|
||||
},
|
||||
// Check temp buffer numbers only when the multiheap mode is off.
|
||||
/*insist_single_temp_buffer=*/inst->parent()
|
||||
->parent()
|
||||
->config()
|
||||
.debug_options()
|
||||
.xla_multiheap_size_constraint_per_heap() == -1);
|
||||
}
|
||||
|
||||
std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunkForMlir(
|
||||
|
||||
@ -547,7 +547,8 @@ class IrEmitterUnnested : public IrEmitter,
|
||||
absl::string_view name, Thunk::ThunkInfo thunk_info,
|
||||
absl::Span<const BufferSlice* const> slices,
|
||||
std::function<void(const BufferSlice*, llvm::Value*)>
|
||||
bind_slice_to_ir_value);
|
||||
bind_slice_to_ir_value,
|
||||
bool insist_single_temp_buffer = true);
|
||||
|
||||
// Returns a KernelThunk that invokes the kernel emitted for `inst`. The
|
||||
// caller needs to make sure `inst` outlives the lifetime of the returned
|
||||
|
||||
@ -409,11 +409,15 @@ HeapSimulator::Result<HloValue> HeapSimulator::Finish() {
|
||||
// Post-process the result to add chunks for shared buffers. An empty chunk
|
||||
// map means that either no buffers were allocated, or the heap was only
|
||||
// collecting statistics, e.g. NoFragmentationStatsHeap.
|
||||
if (!result.chunk_map.empty()) {
|
||||
size_t total_chunk_count = 0;
|
||||
absl::c_for_each(result.heap_results, [&](const HeapResult<HloValue>& hr) {
|
||||
total_chunk_count += hr.chunk_map.size();
|
||||
});
|
||||
if (total_chunk_count != 0) {
|
||||
// If we were told to assign specific buffers, make sure we've assigned
|
||||
// exactly that many buffers.
|
||||
if (options_.buffers_to_assign != nullptr) {
|
||||
CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size());
|
||||
CHECK_EQ(options_.buffers_to_assign->size(), total_chunk_count);
|
||||
}
|
||||
}
|
||||
|
||||
@ -825,7 +829,10 @@ GlobalDecreasingSizeBestFitHeap<BufferType>::Finish() {
|
||||
CommitChunk(buffer_interval, chunk_candidate);
|
||||
}
|
||||
VLOG(1) << "result heap_size: " << result_.heap_size;
|
||||
return result_;
|
||||
Result result;
|
||||
result.heap_size = result_.heap_size;
|
||||
result.heap_results.emplace_back(result_);
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename BufferType>
|
||||
@ -968,6 +975,58 @@ void GlobalDecreasingSizeBestFitHeap<BufferType>::AddToChunkMap(
|
||||
DCHECK(emplace_result.second);
|
||||
}
|
||||
|
||||
// Packs the sorted buffer intervals into one or more heaps and returns one
// HeapResult per heap. `Result::heap_size` is the sum of the per-heap sizes.
//
// Each pass over the remaining intervals fills a single heap: an interval is
// committed if the heap would still fit within `size_limit_per_heap_`, or if
// the heap is still empty (which makes the limit soft for oversized buffers).
// Intervals that do not fit are left in the list for a later heap.
HeapSimulator::Result<HloValue>
ConstrainedGlobalDecreasingSizeBestFitHeap::Finish() {
  std::vector<BufferInterval> sorted_buffer_vec = GetSortedBufferIntervals();
  // Convert into std::list so that erase() is O(1).
  std::list<BufferInterval> sorted_buffer_intervals(sorted_buffer_vec.begin(),
                                                    sorted_buffer_vec.end());

  // Use do-while here, because we need to create 1 heap in `multi_heap_result`
  // even if `sorted_buffer_intervals` is empty.
  Result multi_heap_result;
  do {
    // Place as many buffers as possible into the heap currently being built.
    for (auto it = sorted_buffer_intervals.begin();
         it != sorted_buffer_intervals.end();) {
      // Bind by reference instead of copying the interval on every iteration
      // of every pass. The reference is never used after `it` is erased.
      const BufferInterval& buffer_interval = *it;
      if (!buffer_interval.need_allocation) {
        it = sorted_buffer_intervals.erase(it);
        continue;
      }
      if (buffer_interval.size > size_limit_per_heap_) {
        LOG(WARNING) << "Alloc buffer size " << buffer_interval.size
                     << " larger than the per-heap size limit "
                     << size_limit_per_heap_;
      }

      ChunkCandidate chunk_candidate = FindChunkCandidate(buffer_interval);
      if (chunk_candidate.heap_size <= size_limit_per_heap_ ||
          // Commit the chunk as long as the heap is empty. We do this because
          // we want the size constraint to be soft, meaning that results are
          // successfully generated even if there are some buffer sizes larger
          // than the given constraint size.
          result_.heap_size == 0) {
        CommitChunk(buffer_interval, chunk_candidate);
        it = sorted_buffer_intervals.erase(it);
        continue;
      }

      // Does not fit into this heap; retry when the next heap is built.
      ++it;
    }
    // Collect the result from the currently processed heap and reset the heap
    // states.
    multi_heap_result.heap_size += result_.heap_size;
    multi_heap_result.heap_results.push_back(std::move(result_));
    result_ = {};
    interval_tree_ = {};
  } while (!sorted_buffer_intervals.empty());

  VLOG(1) << "Number of heaps produced = "
          << multi_heap_result.heap_results.size();
  return multi_heap_result;
}
|
||||
|
||||
template <typename BufferType>
|
||||
HeapSimulator::Result<BufferType>
|
||||
ChooseBestHeapAlgorithm<BufferType>::Finish() {
|
||||
|
||||
@ -67,14 +67,23 @@ class HeapSimulator {
|
||||
}
|
||||
};
|
||||
|
||||
// Result represents the result of the heap simulation.
|
||||
template <typename BufferType>
|
||||
struct Result {
|
||||
struct HeapResult {
|
||||
// The assignment of buffers to chunks.
|
||||
absl::flat_hash_map<const BufferType*, Chunk> chunk_map;
|
||||
|
||||
// The total size in bytes of the heap, containing all assigned chunks.
|
||||
int64 heap_size = 0;
|
||||
};
|
||||
// Result represents the result of the heap simulation.
|
||||
template <typename BufferType>
|
||||
struct Result {
|
||||
// Heap results.
|
||||
std::vector<HeapResult<BufferType>> heap_results;
|
||||
|
||||
// The total size in bytes of the heaps.
|
||||
// heap_size == sum([hr.heap_size for hr in heap_results]).
|
||||
int64 heap_size = 0;
|
||||
|
||||
// The total size in bytes of heap fragmentation.
|
||||
int64 fragmentation_size = 0;
|
||||
@ -229,6 +238,7 @@ class HeapAlgorithm {
|
||||
public:
|
||||
using Chunk = HeapSimulator::Chunk;
|
||||
using Result = HeapSimulator::Result<BufferType>;
|
||||
using HeapResult = HeapSimulator::HeapResult<BufferType>;
|
||||
|
||||
virtual ~HeapAlgorithm() = default;
|
||||
|
||||
@ -347,6 +357,7 @@ class BufferIntervalTree {
|
||||
template <typename BufferType>
|
||||
class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
|
||||
public:
|
||||
using HeapResult = HeapSimulator::HeapResult<BufferType>;
|
||||
using Result = HeapSimulator::Result<BufferType>;
|
||||
using Chunk = HeapSimulator::Chunk;
|
||||
|
||||
@ -415,6 +426,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
|
||||
int64 preferred_offset = -1) const;
|
||||
void CommitChunk(const BufferInterval& buffer_interval,
|
||||
ChunkCandidate chunk_candidate);
|
||||
|
||||
// Adds the buffer and the chunk to the result chunk map.
|
||||
virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk);
|
||||
|
||||
@ -426,7 +438,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
|
||||
BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
|
||||
|
||||
absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
|
||||
Result result_;
|
||||
HeapResult result_;
|
||||
BufferIntervalCompare buffer_interval_compare_;
|
||||
BufferIntervalTree interval_tree_;
|
||||
|
||||
@ -444,6 +456,25 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
|
||||
const BufferInterval& interval) const;
|
||||
};
|
||||
|
||||
// This class implements an algorithm that will output multiple heaps. Each heap
|
||||
// size is constrained by a given limit. Note that the constraint is soft,
|
||||
// meaning that valid heap results are generated even if there are some buffer
|
||||
// sizes larger than the given constraint size.
|
||||
class ConstrainedGlobalDecreasingSizeBestFitHeap
|
||||
: public GlobalDecreasingSizeBestFitHeap<HloValue> {
|
||||
public:
|
||||
explicit ConstrainedGlobalDecreasingSizeBestFitHeap(
|
||||
size_t size_limit_per_heap, int64 alignment, Type type = kSpatial)
|
||||
: size_limit_per_heap_(size_limit_per_heap),
|
||||
GlobalDecreasingSizeBestFitHeap<HloValue>(alignment, type) {}
|
||||
~ConstrainedGlobalDecreasingSizeBestFitHeap() override {}
|
||||
|
||||
Result Finish() override;
|
||||
|
||||
private:
|
||||
size_t size_limit_per_heap_;
|
||||
};
|
||||
|
||||
// A heap algorithm that chooses the best results from other algorithms added to
|
||||
// it.
|
||||
template <typename BufferType>
|
||||
|
||||
@ -256,12 +256,15 @@ class HeapCallRecorder : public HeapAlgorithm<HloValue> {
|
||||
}
|
||||
Result Finish() override {
|
||||
calls_->emplace_back(kFinish, nullptr);
|
||||
return result_;
|
||||
HeapSimulator::Result<HloValue> result;
|
||||
result.heap_size = result_.heap_size;
|
||||
result.heap_results.emplace_back(std::move(result_));
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
CallSequence* calls_;
|
||||
Result result_;
|
||||
HeapSimulator::HeapResult<HloValue> result_;
|
||||
};
|
||||
|
||||
// HeapSimulatorTracker runs the heap simulator, recording the sequence of calls
|
||||
@ -335,7 +338,8 @@ class HeapSimulatorTracker {
|
||||
|
||||
int64 OffsetAt(const HloInstruction* instruction, const ShapeIndex& index) {
|
||||
const HloValue* buffer = BufferAt(instruction, index);
|
||||
return result_.chunk_map.at(buffer).offset;
|
||||
CHECK_EQ(1, result_.heap_results.size());
|
||||
return result_.heap_results.at(0).chunk_map.at(buffer).offset;
|
||||
}
|
||||
|
||||
// Ensures the expected sequence of Alloc/Free/Finish calls was performed.
|
||||
@ -1051,7 +1055,8 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Empty) {
|
||||
GlobalDecreasingSizeBestFitHeap<HloValue> heap(/*alignment=*/1);
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
EXPECT_EQ(0, result.heap_size);
|
||||
EXPECT_EQ(0, result.chunk_map.size());
|
||||
EXPECT_EQ(1, result.heap_results.size());
|
||||
EXPECT_EQ(0, result.heap_results.at(0).chunk_map.size());
|
||||
}
|
||||
|
||||
TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
|
||||
@ -1078,7 +1083,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
|
||||
heap.Free(buffer_c_, 20);
|
||||
heap.Free(buffer_d_, 40);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(100, result.heap_size);
|
||||
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1117,7 +1125,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) {
|
||||
heap.Free(buffer_c_, 50);
|
||||
heap.Free(buffer_d_, 40);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(120, result.heap_size);
|
||||
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1160,7 +1171,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) {
|
||||
heap.Free(buffer_d_, 30);
|
||||
heap.Free(buffer_e_, 50);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(140, result.heap_size);
|
||||
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1192,7 +1206,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) {
|
||||
heap.ShareWith(buffer_c_, buffer_a_, 40);
|
||||
heap.Free(buffer_c_, 40);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(40, result.heap_size);
|
||||
EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1221,7 +1238,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
|
||||
heap.Free(buffer_c_, 40);
|
||||
heap.Free(buffer_b_, 20);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(60, result.heap_size);
|
||||
EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1251,7 +1271,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) {
|
||||
heap.Free(buffer_c_, 10);
|
||||
heap.Free(buffer_b_, 30);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(40, result.heap_size);
|
||||
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1311,6 +1334,122 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ChunkCandidate) {
|
||||
// Preferred offset 15 could not be given because it is occupied.
|
||||
}
|
||||
|
||||
// Test fixture for the ConstrainedGlobalDecreasingSizeBestFitHeap tests below.
// It adds nothing of its own on top of HeapAlgorithmTestBase.
class ConstrainedGlobalDecreasingSizeBestFitHeapTest
|
||||
: public HeapAlgorithmTestBase {};
|
||||
|
||||
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
  // All four buffers are live at the same time, and the per-heap limit of 50
  // forces them to be split across two heaps.
  //
  // space
  //   ^
  //   |      +-------+
  //   |      +---c---+
  //   |    +-------+
  //   |    |   b   |
  //   |    +-------+
  //   | ................ // split into two allocations.
  //   |  +---a---+
  //   |         +-------+
  //   |         |       |
  //   |         |   d   |
  //   |         +-------+
  //   -----------------> time
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
                                                  /*alignment=*/1);
  heap.Alloc(buffer_a_, 10);
  heap.Alloc(buffer_b_, 30);
  heap.Alloc(buffer_c_, 20);
  heap.Alloc(buffer_d_, 40);
  heap.Free(buffer_a_, 10);
  heap.Free(buffer_b_, 30);
  heap.Free(buffer_c_, 20);
  heap.Free(buffer_d_, 40);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  EXPECT_EQ(100, result.heap_size);
  // Fatal assertion: the heaps are indexed below, so bail out instead of
  // reading past the end of the vector if the count is wrong.
  ASSERT_EQ(2, result.heap_results.size());

  // First heap: d at the bottom, a stacked on top of it.
  const HeapSimulator::HeapResult<HloValue>& first_heap =
      result.heap_results[0];
  EXPECT_EQ(50, first_heap.heap_size);
  // Fatal assertions: `at()` must not be called on a missing key.
  ASSERT_TRUE(first_heap.chunk_map.contains(buffer_a_));
  ASSERT_TRUE(first_heap.chunk_map.contains(buffer_d_));
  EXPECT_EQ(10, first_heap.chunk_map.at(buffer_a_).size);
  EXPECT_EQ(40, first_heap.chunk_map.at(buffer_d_).size);
  EXPECT_EQ(40, first_heap.chunk_map.at(buffer_a_).offset);
  EXPECT_EQ(0, first_heap.chunk_map.at(buffer_d_).offset);

  // Second heap: the two buffers that did not fit into the first one.
  const HeapSimulator::HeapResult<HloValue>& second_heap =
      result.heap_results[1];
  EXPECT_EQ(50, second_heap.heap_size);
  ASSERT_TRUE(second_heap.chunk_map.contains(buffer_b_));
  ASSERT_TRUE(second_heap.chunk_map.contains(buffer_c_));
  EXPECT_EQ(30, second_heap.chunk_map.at(buffer_b_).size);
  EXPECT_EQ(20, second_heap.chunk_map.at(buffer_c_).size);
}
|
||||
|
||||
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest,
       DecreasingSizeWithAlignment) {
  // With a per-heap limit of 70 and an alignment of 20, the buffers are split
  // into two heaps.
  //
  // space
  //   ^
  //   |      +-------+
  //   |      +---b---+
  //   |            +-------+
  //   |            |       |
  //   |            |   d   |
  //   |            +-------+
  //   | ...................
  //   |  +---a---+
  //   |
  //   |         +-------+
  //   |         |       |
  //   |         |   c   |
  //   |         |       |
  //   |         +-------+
  //   ---------------------> time
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/70,
                                                  /*alignment=*/20);
  heap.Alloc(buffer_a_, 10);
  heap.Alloc(buffer_b_, 20);
  heap.Alloc(buffer_c_, 50);
  heap.Free(buffer_a_, 10);
  heap.Alloc(buffer_d_, 40);
  heap.Free(buffer_b_, 20);
  heap.Free(buffer_c_, 50);
  heap.Free(buffer_d_, 40);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  EXPECT_EQ(130, result.heap_size);  // 70 + 60
  // Fatal assertion: the heaps are indexed below, so bail out instead of
  // reading past the end of the vector if the count is wrong.
  ASSERT_EQ(2, result.heap_results.size());

  // First heap: c at the bottom, a at the next aligned offset above it.
  const HeapSimulator::HeapResult<HloValue>& first_heap =
      result.heap_results[0];
  EXPECT_EQ(70, first_heap.heap_size);
  // Fatal assertions: `at()` must not be called on a missing key.
  ASSERT_TRUE(first_heap.chunk_map.contains(buffer_a_));
  ASSERT_TRUE(first_heap.chunk_map.contains(buffer_c_));
  EXPECT_EQ(10, first_heap.chunk_map.at(buffer_a_).size);
  EXPECT_EQ(50, first_heap.chunk_map.at(buffer_c_).size);
  EXPECT_EQ(60, first_heap.chunk_map.at(buffer_a_).offset);
  EXPECT_EQ(0, first_heap.chunk_map.at(buffer_c_).offset);

  // Second heap: the two buffers that did not fit into the first one.
  const HeapSimulator::HeapResult<HloValue>& second_heap =
      result.heap_results[1];
  EXPECT_EQ(60, second_heap.heap_size);
  ASSERT_TRUE(second_heap.chunk_map.contains(buffer_b_));
  ASSERT_TRUE(second_heap.chunk_map.contains(buffer_d_));
  EXPECT_EQ(20, second_heap.chunk_map.at(buffer_b_).size);
  EXPECT_EQ(40, second_heap.chunk_map.at(buffer_d_).size);
}
|
||||
|
||||
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
  // c is colocated with a, so both share one chunk in the first heap. b does
  // not fit under the per-heap limit of 50 and goes into a second heap.
  //
  // space
  //   ^
  //   |       +---------------+
  //   |       +-------b-------+
  //   | ....................
  //   |+------+      +-------+
  //   ||      |      |       |
  //   ||      |      |       | <--- colocate with a
  //   |+--a---+      +---c---+
  //   ---------------------> time
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
                                                  /*alignment=*/20);
  heap.Alloc(buffer_a_, 30);
  heap.Free(buffer_a_, 30);
  heap.Alloc(buffer_b_, 20);

  heap.ShareWith(buffer_c_, buffer_a_, 40);
  heap.Free(buffer_c_, 40);
  heap.Free(buffer_b_, 20);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  EXPECT_EQ(50, result.heap_size);
  // Fatal assertion: the heaps are indexed below, so bail out instead of
  // reading past the end of the vector if the count is wrong.
  ASSERT_EQ(2, result.heap_results.size());

  // First heap: a and its colocated buffer c share the same chunk.
  const HeapSimulator::HeapResult<HloValue>& first_heap =
      result.heap_results[0];
  EXPECT_EQ(30, first_heap.heap_size);
  // Fatal assertions: `at()` must not be called on a missing key.
  ASSERT_TRUE(first_heap.chunk_map.contains(buffer_a_));
  ASSERT_TRUE(first_heap.chunk_map.contains(buffer_c_));
  EXPECT_EQ(30, first_heap.chunk_map.at(buffer_a_).size);
  EXPECT_EQ(30, first_heap.chunk_map.at(buffer_c_).size);
  EXPECT_EQ(0, first_heap.chunk_map.at(buffer_a_).offset);
  EXPECT_EQ(0, first_heap.chunk_map.at(buffer_c_).offset);

  // Second heap: only b.
  const HeapSimulator::HeapResult<HloValue>& second_heap =
      result.heap_results[1];
  EXPECT_EQ(20, second_heap.heap_size);
  ASSERT_TRUE(second_heap.chunk_map.contains(buffer_b_));
  EXPECT_EQ(20, second_heap.chunk_map.at(buffer_b_).size);
}
|
||||
|
||||
class IntervalTreeTest : public ::testing::Test {};
|
||||
|
||||
TEST_F(IntervalTreeTest, InsertAndRemove) {
|
||||
|
||||
@ -1127,7 +1127,10 @@ HeapSimulator::Result<HloValue> AlternateMemoryBestFitHeap::Finish() {
|
||||
VLOG(3) << allocation_info_str_;
|
||||
DumpDebugStringsIfEnabled();
|
||||
|
||||
return result_;
|
||||
HeapSimulator::Result<HloValue> result;
|
||||
result.heap_size = result_.heap_size;
|
||||
result.heap_results.emplace_back(std::move(result_));
|
||||
return std::move(result);
|
||||
}
|
||||
|
||||
void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals(
|
||||
|
||||
@ -290,7 +290,11 @@ message DebugOptions {
|
||||
// Extra parameters to pass the GPU assembler.
|
||||
string xla_gpu_asm_extra_flags = 141;
|
||||
|
||||
// Next id: 142
|
||||
// Per-heap size constraint. New heaps will be created if per-heap max size is
|
||||
// reached.
|
||||
int32 xla_multiheap_size_constraint_per_heap = 142;
|
||||
|
||||
// Next id: 143
|
||||
|
||||
// Extra options to pass to the compilation backend (e.g. LLVM); specific
|
||||
// interpretation of these values is left to the backend.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user