Merge pull request #43850 from trentlo:size_constrained_buf_alloc_upstream
PiperOrigin-RevId: 339431959 Change-Id: Ie2aa8488f0606ea318a7d1b6c2985effdb52d243
This commit is contained in:
commit
6c43f200d0
@ -73,6 +73,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
|
||||
opts.set_xla_gpu_deterministic_reductions(false);
|
||||
opts.set_xla_cpu_enable_xprof_traceme(false);
|
||||
opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false);
|
||||
opts.set_xla_multiheap_size_constraint_per_heap(-1);
|
||||
|
||||
return opts;
|
||||
}
|
||||
@ -571,6 +572,17 @@ static void AllocateFlags() {
|
||||
"that falling back to the driver can have drawbacks like using more "
|
||||
"memory and/or other bugs during compilation, so we recommend setting "
|
||||
"this flag to false."));
|
||||
flag_objects->push_back(tensorflow::Flag(
|
||||
"xla_multiheap_size_constraint_per_heap",
|
||||
int32_setter_for(
|
||||
&DebugOptions::set_xla_multiheap_size_constraint_per_heap),
|
||||
flag_values->xla_multiheap_size_constraint_per_heap(),
|
||||
"Generates multiple heaps (i.e., temp buffers) with a size "
|
||||
"constraint on each heap to avoid Out-of-Memory due to memory "
|
||||
"fragmentation. The constraint is soft, so it works with tensors "
|
||||
"larger than the given constraint size. -1 corresponds to no "
|
||||
"constraints."));
|
||||
|
||||
ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
|
||||
}
|
||||
|
||||
|
@ -607,7 +607,10 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation,
|
||||
// BufferAllocation.
|
||||
void BufferAssignment::CombineTempAllocations() {
|
||||
VLOG(1) << "CombineTempAllocations()";
|
||||
flat_hash_map<BufferValue::Color, BufferAllocation> combined_allocation_map;
|
||||
// Stores the combined allocations.
|
||||
std::deque<BufferAllocation> combined_allocations;
|
||||
// Holds the pointer to a combined allocation of each color, if any.
|
||||
flat_hash_map<BufferValue::Color, BufferAllocation*> combined_allocation_map;
|
||||
|
||||
// Move all temp allocations into a single run at the end of the allocations
|
||||
// vector.
|
||||
@ -621,19 +624,31 @@ void BufferAssignment::CombineTempAllocations() {
|
||||
// to the same color.
|
||||
if (first_temp_it != allocations_.end()) {
|
||||
for (auto it = first_temp_it; it != allocations_.end(); ++it) {
|
||||
const BufferAllocation& temp_allocation = *it;
|
||||
BufferAllocation& temp_allocation = *it;
|
||||
BufferValue::Color color = temp_allocation.color();
|
||||
auto combined_it = combined_allocation_map.find(color);
|
||||
if (combined_it == combined_allocation_map.end()) {
|
||||
// We have found the first temp allocation of this color. Collect
|
||||
// the other temp allocations of the same color into it.
|
||||
// the other temp allocations of the same color into it subject to the
|
||||
// size constraint.
|
||||
VLOG(1) << "Combined temp allocation for color " << color
|
||||
<< " is: " << temp_allocation;
|
||||
combined_allocation_map.emplace(color, temp_allocation);
|
||||
combined_allocations.emplace_back(temp_allocation);
|
||||
combined_allocation_map.emplace(color, &combined_allocations.back());
|
||||
continue;
|
||||
}
|
||||
if (combined_it->second->size() + it->size() >=
|
||||
multiheap_size_constraint_per_heap_) {
|
||||
// We cannot put more into the current combined_it. So, appoint a new
|
||||
// combined_it.
|
||||
VLOG(1) << "Due to size constraint, reset temp allocation for color "
|
||||
<< color << " to: " << temp_allocation;
|
||||
combined_allocations.emplace_back(temp_allocation);
|
||||
combined_allocation_map.emplace(color, &combined_allocations.back());
|
||||
continue;
|
||||
}
|
||||
|
||||
auto* combined_allocation = &combined_it->second;
|
||||
BufferAllocation* combined_allocation = combined_it->second;
|
||||
VLOG(1) << "Combined allocation absorbing temp allocation: "
|
||||
<< temp_allocation;
|
||||
|
||||
@ -663,9 +678,9 @@ void BufferAssignment::CombineTempAllocations() {
|
||||
// Replace all existing temporary allocations with the new combined
|
||||
// allocations.
|
||||
allocations_.erase(first_temp_it, allocations_.end());
|
||||
for (auto& combined : combined_allocation_map) {
|
||||
allocations_.push_back(combined.second);
|
||||
temp_allocation_total_size_ += combined.second.size();
|
||||
for (BufferAllocation& combined : combined_allocations) {
|
||||
temp_allocation_total_size_ += combined.size();
|
||||
allocations_.push_back(std::move(combined));
|
||||
}
|
||||
}
|
||||
|
||||
@ -1331,11 +1346,13 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
|
||||
auto algorithms = absl::make_unique<
|
||||
std::vector<std::unique_ptr<HeapAlgorithm<HloValue>>>>();
|
||||
algorithms->push_back(
|
||||
absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
|
||||
alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
|
||||
absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
|
||||
assignment->multiheap_size_constraint_per_heap(), alignment,
|
||||
GlobalDecreasingSizeBestFitHeap<HloValue>::kSpatial));
|
||||
algorithms->push_back(
|
||||
absl::make_unique<GlobalDecreasingSizeBestFitHeap<HloValue>>(
|
||||
alignment, GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
|
||||
absl::make_unique<ConstrainedGlobalDecreasingSizeBestFitHeap>(
|
||||
assignment->multiheap_size_constraint_per_heap(), alignment,
|
||||
GlobalDecreasingSizeBestFitHeap<HloValue>::kTemporal));
|
||||
return absl::make_unique<ChooseBestHeapAlgorithm<HloValue>>(
|
||||
std::move(algorithms));
|
||||
};
|
||||
@ -1444,6 +1461,12 @@ std::vector<const HloValue*> ComputePeakMemoryLogicalBuffers(
|
||||
int64 max_live_size = 0;
|
||||
int64 live_size = 0;
|
||||
for (const auto& event : heap_trace.events()) {
|
||||
if (!id_to_value.contains(event.buffer_id())) {
|
||||
// Skip as the buffer associated with this trace event is not placed into
|
||||
// this allocation. This can happen when size constraints are given to the
|
||||
// heap simulator.
|
||||
continue;
|
||||
}
|
||||
live_size += memory_delta(event);
|
||||
if (max_live_size < live_size) {
|
||||
max_live_size = live_size;
|
||||
@ -1455,6 +1478,12 @@ std::vector<const HloValue*> ComputePeakMemoryLogicalBuffers(
|
||||
absl::flat_hash_set<const HloValue*> live_values;
|
||||
live_size = 0;
|
||||
for (const auto& event : heap_trace.events()) {
|
||||
if (!id_to_value.contains(event.buffer_id())) {
|
||||
// Skip as the buffer associated with this trace event is not placed into
|
||||
// this allocation. This can happen when size constraints are given to the
|
||||
// heap simulator.
|
||||
continue;
|
||||
}
|
||||
const HloValue* value = id_to_value.at(event.buffer_id());
|
||||
if (event.kind() == HeapSimulatorTrace::Event::ALLOC ||
|
||||
event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) {
|
||||
@ -1500,20 +1529,24 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
|
||||
}
|
||||
VLOG(1) << "Result size from heap simulator: " << result.heap_size;
|
||||
|
||||
BufferAllocation* allocation =
|
||||
assignment->NewEmptyAllocation(result.heap_size, color);
|
||||
for (const auto& buffer_chunk : result.chunk_map) {
|
||||
const HloValue& value = *buffer_chunk.first;
|
||||
const HeapSimulator::Chunk& chunk = buffer_chunk.second;
|
||||
assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
|
||||
// Iterate through heap_results. For each heap_result, create a new allocation
|
||||
// in `assignment`.
|
||||
for (const HeapSimulator::HeapResult<HloValue>& heap_result :
|
||||
result.heap_results) {
|
||||
BufferAllocation* allocation =
|
||||
assignment->NewEmptyAllocation(heap_result.heap_size, color);
|
||||
for (const auto& buffer_chunk : heap_result.chunk_map) {
|
||||
const HloValue& value = *buffer_chunk.first;
|
||||
const HeapSimulator::Chunk& chunk = buffer_chunk.second;
|
||||
assignment->AddAssignment(allocation, value, chunk.offset, chunk.size);
|
||||
}
|
||||
allocation->peak_buffers_ =
|
||||
ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
|
||||
|
||||
XLA_VLOG_LINES(2, allocation->ToString());
|
||||
|
||||
allocation->AddHeapTrace(result.debug_trace);
|
||||
}
|
||||
allocation->peak_buffers_ =
|
||||
ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace);
|
||||
|
||||
VLOG(1) << "Ran heap simulation for allocation: ";
|
||||
XLA_VLOG_LINES(2, allocation->ToString());
|
||||
|
||||
allocation->AddHeapTrace(result.debug_trace);
|
||||
}
|
||||
|
||||
StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
|
||||
@ -1580,6 +1613,10 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
|
||||
buffers_to_assign_sequentially.size() == global_computations.size();
|
||||
VLOG(2) << "Running whole module heap simulation: "
|
||||
<< run_whole_module_heap_simulation;
|
||||
const int32 multiheap_size_constraint_per_heap =
|
||||
module->config().debug_options().xla_multiheap_size_constraint_per_heap();
|
||||
VLOG(2) << "Multiheap per heap size limit: "
|
||||
<< multiheap_size_constraint_per_heap;
|
||||
TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering(
|
||||
buffers_to_assign_sequentially, run_whole_module_heap_simulation,
|
||||
assignment.get()));
|
||||
@ -1614,10 +1651,11 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
|
||||
}
|
||||
}
|
||||
|
||||
// Combines allocations of temporary buffers into one big BufferAllocation.
|
||||
// This can only be performed after all buffers have been assigned, and
|
||||
// after maybe_live_out is marked, since it is used to determine whether an
|
||||
// allocation contains temporary buffers or not.
|
||||
// Combines allocations of temporary buffers into big BufferAllocations
|
||||
// subject to the buffer allocation size constraint. This can only be
|
||||
// performed after all buffers have been assigned, and after maybe_live_out
|
||||
// is marked, since it is used to determine whether an allocation contains
|
||||
// temporary buffers or not.
|
||||
assignment->CombineTempAllocations();
|
||||
|
||||
XLA_VLOG_LINES(2, assignment->ToString());
|
||||
|
@ -363,6 +363,10 @@ class BufferAssignment {
|
||||
return temp_allocation_total_size_;
|
||||
}
|
||||
|
||||
// Returns the per-heap size limit used when temp buffers are split across
// multiple heaps. The constructor stores UINT64_MAX here when the
// xla_multiheap_size_constraint_per_heap debug option is -1 (no constraint).
uint64 multiheap_size_constraint_per_heap() const {
|
||||
return multiheap_size_constraint_per_heap_;
|
||||
}
|
||||
|
||||
// Returns whether the given buffer has been assigned an allocation.
|
||||
bool HasAllocation(const HloValue& value) const;
|
||||
|
||||
@ -491,7 +495,14 @@ class BufferAssignment {
|
||||
buffer_size_(std::move(buffer_size)),
|
||||
color_alignment_(std::move(color_alignment)),
|
||||
alias_analysis_(std::move(alias_analysis)),
|
||||
hlo_live_range_(std::move(hlo_live_range)) {}
|
||||
hlo_live_range_(std::move(hlo_live_range)) {
|
||||
int32 raw_value = module->config()
|
||||
.debug_options()
|
||||
.xla_multiheap_size_constraint_per_heap();
|
||||
// -1 means no constraint.
|
||||
multiheap_size_constraint_per_heap_ =
|
||||
(raw_value == -1) ? UINT64_MAX : raw_value;
|
||||
}
|
||||
|
||||
// Creates and returns a new BufferAllocation, with no assigned
|
||||
// LogicalBuffers. Ownership is maintained internally.
|
||||
@ -535,6 +546,8 @@ class BufferAssignment {
|
||||
// The total size of all temporary buffers.
|
||||
int64 temp_allocation_total_size_ = 0;
|
||||
|
||||
uint64 multiheap_size_constraint_per_heap_;
|
||||
|
||||
// Maps Buffers to the index of the BufferAllocation which holds the buffer.
|
||||
absl::flat_hash_map<const HloValue*, BufferAllocation::Index>
|
||||
allocation_index_for_value_;
|
||||
|
@ -2346,9 +2346,8 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices(
|
||||
for (const BufferAllocation& alloc : buffer_assn.Allocations()) {
|
||||
if (alloc.IsPreallocatedTempBuffer()) {
|
||||
if (!temp_buffer.has_value()) {
|
||||
// Retrieve the first seen temp buffer.
|
||||
temp_buffer = &alloc;
|
||||
} else {
|
||||
LOG(FATAL) << "Multiple temp buffers found, but only one is allowed!";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -409,11 +409,16 @@ HeapSimulator::Result<HloValue> HeapSimulator::Finish() {
|
||||
// Post-process the result to add chunks for shared buffers. An empty chunk
|
||||
// map means that either no buffers were allocated, or the heap was only
|
||||
// collecting statistics, e.g. NoFragmentationStatsHeap.
|
||||
if (!result.chunk_map.empty()) {
|
||||
size_t total_chunk_count = absl::c_accumulate(
|
||||
result.heap_results, static_cast<size_t>(0),
|
||||
[&](size_t lhs, const HeapResult<HloValue>& rhs) -> size_t {
|
||||
return lhs + rhs.chunk_map.size();
|
||||
});
|
||||
if (total_chunk_count != 0) {
|
||||
// If we were told to assign specific buffers, make sure we've assigned
|
||||
// exactly that many buffers.
|
||||
if (options_.buffers_to_assign != nullptr) {
|
||||
CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size());
|
||||
CHECK_EQ(options_.buffers_to_assign->size(), total_chunk_count);
|
||||
}
|
||||
}
|
||||
|
||||
@ -825,7 +830,10 @@ GlobalDecreasingSizeBestFitHeap<BufferType>::Finish() {
|
||||
CommitChunk(buffer_interval, chunk_candidate);
|
||||
}
|
||||
VLOG(1) << "result heap_size: " << result_.heap_size;
|
||||
return result_;
|
||||
Result result;
|
||||
result.heap_size = result_.heap_size;
|
||||
result.heap_results.emplace_back(result_);
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename BufferType>
|
||||
@ -968,6 +976,58 @@ void GlobalDecreasingSizeBestFitHeap<BufferType>::AddToChunkMap(
|
||||
DCHECK(emplace_result.second);
|
||||
}
|
||||
|
||||
// Distributes the recorded buffers over one or more heaps and returns one
// HeapResult per heap.
//
// Each heap is filled greedily, in the order returned by
// GetSortedBufferIntervals(), with every buffer whose placement keeps the heap
// within `size_limit_per_heap_`. The limit is soft: the first buffer offered to
// an empty heap is always committed, even if it alone exceeds the limit, so a
// result is produced for any input.
//
// The returned Result::heap_size is the sum of the sizes of all produced
// heaps, and Result::heap_results always contains at least one heap (an empty
// one when there is nothing to allocate).
HeapSimulator::Result<HloValue>
ConstrainedGlobalDecreasingSizeBestFitHeap::Finish() {
  std::vector<BufferInterval> sorted_buffer_vec = GetSortedBufferIntervals();
  // Convert into std::list so that erase() is O(1).
  std::list<BufferInterval> sorted_buffer_intervals(sorted_buffer_vec.begin(),
                                                    sorted_buffer_vec.end());

  // Use do-while here, because we need to create 1 heap in `multi_heap_result`
  // even if `sorted_buffer_intervals` is empty.
  //
  // Termination: `result_` is reset before every pass, so the first interval
  // that needs an allocation is always committed (empty-heap rule below) and
  // the list strictly shrinks on each pass.
  Result multi_heap_result;
  do {
    // Place as many buffers as possible into the heap currently being built.
    for (auto it = sorted_buffer_intervals.begin();
         it != sorted_buffer_intervals.end();) {
      // Bind by reference instead of copying the interval on every visit; an
      // interval that does not fit is revisited once per heap. The reference
      // is never used after the element has been erased.
      const BufferInterval& buffer_interval = *it;
      if (!buffer_interval.need_allocation) {
        it = sorted_buffer_intervals.erase(it);
        continue;
      }
      if (buffer_interval.size > size_limit_per_heap_) {
        LOG(WARNING) << "Alloc buffer size " << buffer_interval.size
                     << " larger than the per-heap size limit "
                     << size_limit_per_heap_;
      }

      ChunkCandidate chunk_candidate = FindChunkCandidate(buffer_interval);
      if (chunk_candidate.heap_size <= size_limit_per_heap_ ||
          // Commit the chunk as long as the heap is empty. We do this because
          // we want the size constraint to be soft, meaning that results are
          // successfully generated even if there are some buffer sizes larger
          // than the given constraint size.
          result_.heap_size == 0) {
        CommitChunk(buffer_interval, chunk_candidate);
        it = sorted_buffer_intervals.erase(it);
        continue;
      }

      // Does not fit into this heap; leave it for a later heap.
      ++it;
    }
    // Collect the result from the currently processed heap and reset the heap
    // states.
    multi_heap_result.heap_size += result_.heap_size;
    multi_heap_result.heap_results.push_back(std::move(result_));
    result_ = {};
    interval_tree_ = {};
  } while (!sorted_buffer_intervals.empty());

  VLOG(1) << "Number of heaps produced = "
          << multi_heap_result.heap_results.size();
  return multi_heap_result;
}
|
||||
|
||||
template <typename BufferType>
|
||||
HeapSimulator::Result<BufferType>
|
||||
ChooseBestHeapAlgorithm<BufferType>::Finish() {
|
||||
|
@ -67,14 +67,23 @@ class HeapSimulator {
|
||||
}
|
||||
};
|
||||
|
||||
// Result represents the result of the heap simulation.
|
||||
template <typename BufferType>
|
||||
struct Result {
|
||||
struct HeapResult {
|
||||
// The assignment of buffers to chunks.
|
||||
absl::flat_hash_map<const BufferType*, Chunk> chunk_map;
|
||||
|
||||
// The total size in bytes of the heap, containing all assigned chunks.
|
||||
int64 heap_size = 0;
|
||||
};
|
||||
// Result represents the result of the heap simulation.
|
||||
template <typename BufferType>
|
||||
struct Result {
|
||||
// Heap results.
|
||||
std::vector<HeapResult<BufferType>> heap_results;
|
||||
|
||||
// The total size in bytes of the heaps.
|
||||
// heap_size == sum([hr.heap_size for hr in heap_results]).
|
||||
int64 heap_size = 0;
|
||||
|
||||
// The total size in bytes of heap fragmentation.
|
||||
int64 fragmentation_size = 0;
|
||||
@ -229,6 +238,7 @@ class HeapAlgorithm {
|
||||
public:
|
||||
using Chunk = HeapSimulator::Chunk;
|
||||
using Result = HeapSimulator::Result<BufferType>;
|
||||
using HeapResult = HeapSimulator::HeapResult<BufferType>;
|
||||
|
||||
virtual ~HeapAlgorithm() = default;
|
||||
|
||||
@ -347,6 +357,7 @@ class BufferIntervalTree {
|
||||
template <typename BufferType>
|
||||
class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
|
||||
public:
|
||||
using HeapResult = HeapSimulator::HeapResult<BufferType>;
|
||||
using Result = HeapSimulator::Result<BufferType>;
|
||||
using Chunk = HeapSimulator::Chunk;
|
||||
|
||||
@ -415,6 +426,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
|
||||
int64 preferred_offset = -1) const;
|
||||
void CommitChunk(const BufferInterval& buffer_interval,
|
||||
ChunkCandidate chunk_candidate);
|
||||
|
||||
// Adds the buffer and the chunk to the result chunk map.
|
||||
virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk);
|
||||
|
||||
@ -426,7 +438,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
|
||||
BufferIntervalCompare GetTemporalBufferIntervalCompare() const;
|
||||
|
||||
absl::flat_hash_map<const BufferType*, BufferInterval> buffer_intervals_;
|
||||
Result result_;
|
||||
HeapResult result_;
|
||||
BufferIntervalCompare buffer_interval_compare_;
|
||||
BufferIntervalTree interval_tree_;
|
||||
|
||||
@ -444,6 +456,41 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm<BufferType> {
|
||||
const BufferInterval& interval) const;
|
||||
};
|
||||
|
||||
// This class implements an algorithm that will produce multiple heaps, where
|
||||
// each heap size is constrained by a given limit. Note that the constraint is
|
||||
// soft, meaning that a valid heap result is generated even if there are some
|
||||
// buffer sizes larger than the given constraint size.
|
||||
//
|
||||
// Pseudocode:
|
||||
// while( `buffers` is not empty ) {
|
||||
// create a new heap `h`
|
||||
// for (each buffer `buf` in `buffers` in the size-decreasing order) {
|
||||
// if (buf.size() is larger than the heap size limit &&
|
||||
// `h` is empty) {
|
||||
// h.place(buf)
|
||||
// buffers.remove(buf)
|
||||
// } else if (placing `buf` into `h` does not violate size
|
||||
// constraint) {
|
||||
// h.place(buf)
|
||||
// buffers.remove(buf)
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
class ConstrainedGlobalDecreasingSizeBestFitHeap
|
||||
: public GlobalDecreasingSizeBestFitHeap<HloValue> {
|
||||
public:
|
||||
explicit ConstrainedGlobalDecreasingSizeBestFitHeap(
|
||||
uint64 size_limit_per_heap, int64 alignment, Type type = kSpatial)
|
||||
: GlobalDecreasingSizeBestFitHeap<HloValue>(alignment, type),
|
||||
size_limit_per_heap_(size_limit_per_heap) {}
|
||||
~ConstrainedGlobalDecreasingSizeBestFitHeap() override {}
|
||||
|
||||
Result Finish() override;
|
||||
|
||||
private:
|
||||
uint64 size_limit_per_heap_;
|
||||
};
|
||||
|
||||
// A heap algorithm that chooses the best results from other algorithms added to
|
||||
// it.
|
||||
template <typename BufferType>
|
||||
|
@ -256,12 +256,15 @@ class HeapCallRecorder : public HeapAlgorithm<HloValue> {
|
||||
}
|
||||
Result Finish() override {
|
||||
calls_->emplace_back(kFinish, nullptr);
|
||||
return result_;
|
||||
HeapSimulator::Result<HloValue> result;
|
||||
result.heap_size = result_.heap_size;
|
||||
result.heap_results.emplace_back(std::move(result_));
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
CallSequence* calls_;
|
||||
Result result_;
|
||||
HeapSimulator::HeapResult<HloValue> result_;
|
||||
};
|
||||
|
||||
// HeapSimulatorTracker runs the heap simulator, recording the sequence of calls
|
||||
@ -335,7 +338,8 @@ class HeapSimulatorTracker {
|
||||
|
||||
int64 OffsetAt(const HloInstruction* instruction, const ShapeIndex& index) {
|
||||
const HloValue* buffer = BufferAt(instruction, index);
|
||||
return result_.chunk_map.at(buffer).offset;
|
||||
CHECK_EQ(1, result_.heap_results.size());
|
||||
return result_.heap_results.at(0).chunk_map.at(buffer).offset;
|
||||
}
|
||||
|
||||
// Ensures the expected sequence of Alloc/Free/Finish calls was performed.
|
||||
@ -1051,7 +1055,8 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Empty) {
|
||||
GlobalDecreasingSizeBestFitHeap<HloValue> heap(/*alignment=*/1);
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
EXPECT_EQ(0, result.heap_size);
|
||||
EXPECT_EQ(0, result.chunk_map.size());
|
||||
EXPECT_EQ(1, result.heap_results.size());
|
||||
EXPECT_EQ(0, result.heap_results.at(0).chunk_map.size());
|
||||
}
|
||||
|
||||
TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
|
||||
@ -1078,7 +1083,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
|
||||
heap.Free(buffer_c_, 20);
|
||||
heap.Free(buffer_d_, 40);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(100, result.heap_size);
|
||||
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1117,7 +1125,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) {
|
||||
heap.Free(buffer_c_, 50);
|
||||
heap.Free(buffer_d_, 40);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(120, result.heap_size);
|
||||
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1160,7 +1171,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) {
|
||||
heap.Free(buffer_d_, 30);
|
||||
heap.Free(buffer_e_, 50);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(140, result.heap_size);
|
||||
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1192,7 +1206,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) {
|
||||
heap.ShareWith(buffer_c_, buffer_a_, 40);
|
||||
heap.Free(buffer_c_, 40);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(40, result.heap_size);
|
||||
EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1221,7 +1238,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
|
||||
heap.Free(buffer_c_, 40);
|
||||
heap.Free(buffer_b_, 20);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(60, result.heap_size);
|
||||
EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1251,7 +1271,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) {
|
||||
heap.Free(buffer_c_, 10);
|
||||
heap.Free(buffer_b_, 30);
|
||||
|
||||
const HeapSimulator::Result<HloValue> result = heap.Finish();
|
||||
const HeapSimulator::Result<HloValue> results = heap.Finish();
|
||||
EXPECT_EQ(1, results.heap_results.size());
|
||||
const HeapSimulator::HeapResult<HloValue>& result =
|
||||
results.heap_results.at(0);
|
||||
EXPECT_EQ(40, result.heap_size);
|
||||
EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size);
|
||||
EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size);
|
||||
@ -1311,6 +1334,122 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ChunkCandidate) {
|
||||
// Preferred offset 15 could not be given because it is occupied.
|
||||
}
|
||||
|
||||
// Test fixture for ConstrainedGlobalDecreasingSizeBestFitHeap. It adds nothing
// of its own on top of HeapAlgorithmTestBase.
class ConstrainedGlobalDecreasingSizeBestFitHeapTest
|
||||
: public HeapAlgorithmTestBase {};
|
||||
|
||||
// Four buffers that are all live at once, with a per-heap limit of 50: the
// result must be split into two heaps, the first of which holds `d` and `a`.
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, DecreasingSize) {
  // space
  //   ^
  //   |      +-------+
  //   |      +---c---+
  //   |    +-------+
  //   |    |   b   |
  //   |    +-------+
  //   | ................ // split into two allocations.
  //   |  +---a---+
  //   |         +-------+
  //   |         |       |
  //   |         |   d   |
  //   |         +-------+
  //   -----------------> time
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
                                                  /*alignment=*/1);
  heap.Alloc(buffer_a_, 10);
  heap.Alloc(buffer_b_, 30);
  heap.Alloc(buffer_c_, 20);
  heap.Alloc(buffer_d_, 40);
  heap.Free(buffer_a_, 10);
  heap.Free(buffer_b_, 30);
  heap.Free(buffer_c_, 20);
  heap.Free(buffer_d_, 40);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  EXPECT_EQ(100, result.heap_size);
  EXPECT_EQ(2, result.heap_results.size());

  // All remaining expectations concern the first heap only.
  const auto& first_heap_chunks = result.heap_results[0].chunk_map;
  EXPECT_TRUE(first_heap_chunks.contains(buffer_a_));
  EXPECT_TRUE(first_heap_chunks.contains(buffer_d_));
  EXPECT_EQ(10, first_heap_chunks.at(buffer_a_).size);
  EXPECT_EQ(40, first_heap_chunks.at(buffer_d_).size);
  EXPECT_EQ(40, first_heap_chunks.at(buffer_a_).offset);
  EXPECT_EQ(0, first_heap_chunks.at(buffer_d_).offset);
}
|
||||
|
||||
// Same idea as DecreasingSize, but with alignment 20 and a per-heap limit of
// 70: two heaps are expected, the first of which holds `c` and `a`.
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest,
       DecreasingSizeWithAlignment) {
  // space
  //   ^
  //   |      +-------+
  //   |      +---b---+
  //   |            +-------+
  //   |            |       |
  //   |            |   d   |
  //   |            +-------+
  //   | ...................
  //   |  +---a---+
  //   |
  //   |         +-------+
  //   |         |       |
  //   |         |   c   |
  //   |         |       |
  //   |         +-------+
  //   ---------------------> time
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/70,
                                                  /*alignment=*/20);
  heap.Alloc(buffer_a_, 10);
  heap.Alloc(buffer_b_, 20);
  heap.Alloc(buffer_c_, 50);
  heap.Free(buffer_a_, 10);
  heap.Alloc(buffer_d_, 40);
  heap.Free(buffer_b_, 20);
  heap.Free(buffer_c_, 50);
  heap.Free(buffer_d_, 40);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  EXPECT_EQ(130, result.heap_size);  // 70 + 60
  EXPECT_EQ(2, result.heap_results.size());

  // All remaining expectations concern the first heap only.
  const auto& first_heap_chunks = result.heap_results[0].chunk_map;
  EXPECT_TRUE(first_heap_chunks.contains(buffer_a_));
  EXPECT_TRUE(first_heap_chunks.contains(buffer_c_));
  EXPECT_EQ(10, first_heap_chunks.at(buffer_a_).size);
  EXPECT_EQ(50, first_heap_chunks.at(buffer_c_).size);
  EXPECT_EQ(60, first_heap_chunks.at(buffer_a_).offset);
  EXPECT_EQ(0, first_heap_chunks.at(buffer_c_).offset);
}
|
||||
|
||||
// `c` shares its chunk with `a`; with a per-heap limit of 50 and alignment 20
// two heaps are expected, and the first one holds both `a` and `c` at offset 0.
TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, ColocatedII) {
  // space
  //   ^
  //   |       +---------------+
  //   |       +-------b-------+
  //   | ....................
  //   |+------+      +-------+
  //   ||      |      |       |
  //   ||      |      |       | <--- colocate with a
  //   |+--a---+      +---c---+
  //   ---------------------> time
  ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50,
                                                  /*alignment=*/20);
  heap.Alloc(buffer_a_, 30);
  heap.Free(buffer_a_, 30);
  heap.Alloc(buffer_b_, 20);

  heap.ShareWith(buffer_c_, buffer_a_, 40);
  heap.Free(buffer_c_, 40);
  heap.Free(buffer_b_, 20);

  const HeapSimulator::Result<HloValue> result = heap.Finish();
  EXPECT_EQ(50, result.heap_size);
  EXPECT_EQ(2, result.heap_results.size());

  // All remaining expectations concern the first heap only.
  const auto& first_heap_chunks = result.heap_results[0].chunk_map;
  EXPECT_TRUE(first_heap_chunks.contains(buffer_a_));
  EXPECT_TRUE(first_heap_chunks.contains(buffer_c_));
  EXPECT_EQ(30, first_heap_chunks.at(buffer_a_).size);
  EXPECT_EQ(30, first_heap_chunks.at(buffer_c_).size);
  EXPECT_EQ(0, first_heap_chunks.at(buffer_a_).offset);
  EXPECT_EQ(0, first_heap_chunks.at(buffer_c_).offset);
}
|
||||
|
||||
class IntervalTreeTest : public ::testing::Test {};
|
||||
|
||||
TEST_F(IntervalTreeTest, InsertAndRemove) {
|
||||
|
@ -1250,7 +1250,10 @@ HeapSimulator::Result<HloValue> AlternateMemoryBestFitHeap::Finish() {
|
||||
VLOG(3) << allocation_info_str_;
|
||||
DumpDebugStringsIfEnabled();
|
||||
|
||||
return result_;
|
||||
HeapSimulator::Result<HloValue> result;
|
||||
result.heap_size = result_.heap_size;
|
||||
result.heap_results.emplace_back(std::move(result_));
|
||||
return result;
|
||||
}
|
||||
|
||||
void AlternateMemoryBestFitHeap::AddRequiredAssignmentsForColocatedIntervals(
|
||||
|
@ -290,7 +290,11 @@ message DebugOptions {
|
||||
// Extra parameters to pass the GPU assembler.
|
||||
string xla_gpu_asm_extra_flags = 141;
|
||||
|
||||
// Next id: 142
|
||||
// Per-heap size constraint. New heaps will be created if per-heap max size is
|
||||
// reached. A value of -1 (the default) means no constraint.
|
||||
int32 xla_multiheap_size_constraint_per_heap = 142;
|
||||
|
||||
// Next id: 143
|
||||
|
||||
// Extra options to pass to the compilation backend (e.g. LLVM); specific
|
||||
// interpretation of these values is left to the backend.
|
||||
|
Loading…
Reference in New Issue
Block a user