[XLA] Implement mechanism to repack allocations to reduce fragmentation.

This CL defines an interface for allocation repackers. A repacker can be
specified in MemorySpaceAssignment::Options. If an HloValue couldn't be
allocated due to running out of alternate memory, we now export the allocations
done so far to the repacker, run the repacker, and import the new offsets back
into memory space assignment for better memory packing.

PiperOrigin-RevId: 326040207
Change-Id: Icc518874781eb74e38701514d8f6fb20d23a4124
This commit is contained in:
Berkin Ilbeyi 2020-08-11 09:47:25 -07:00 committed by TensorFlower Gardener
parent b4297ce0a8
commit 0f35ef2abc
5 changed files with 347 additions and 6 deletions

View File

@ -3426,6 +3426,15 @@ cc_library(
],
)
# Header-only interface for allocation repacking algorithms used by memory
# space assignment. The header refers to absl::Span directly, so the span
# target is listed as a direct dependency instead of relying on a transitive
# one.
cc_library(
    name = "memory_space_assignment_repacking",
    hdrs = ["memory_space_assignment_repacking.h"],
    deps = [
        "//tensorflow/compiler/xla:statusor",
        "//tensorflow/compiler/xla:types",
        "@com_google_absl//absl/types:span",
    ],
)
cc_library(
name = "memory_space_assignment",
srcs = ["memory_space_assignment.cc"],
@ -3433,6 +3442,7 @@ cc_library(
deps = [
":heap_simulator",
":hlo_cost_analysis",
":memory_space_assignment_repacking",
":memory_space_assignment_utils",
"//tensorflow/compiler/xla:debug_options_flags",
"//tensorflow/core/lib/math:math_util",

View File

@ -1057,10 +1057,28 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() {
options_.prefetch_interval_picker->SetRetryNumber(retry_number);
Result result =
AllocateAllocationValues(absl::MakeSpan(allocation_values));
VLOG(2) << "Allocation result = "
<< absl::StrFormat("%x", static_cast<int>(result));
if (result_requires_uncommit(result) ||
(!final_retry && result_failed_because_of_async_copy(result))) {
UncommitPendingChunks(absl::MakeSpan(allocation_values));
VLOG(2) << "Couldn't allocate. Retry number " << retry_number;
} else if (result_is(result, Result::kFailOutOfMemory) &&
num_repacks_ < options_.max_repacks) {
UncommitPendingChunks(absl::MakeSpan(allocation_values));
++num_repacks_;
CHECK_NE(options_.repacker, nullptr);
std::vector<RepackAllocationBlock*> repack_allocation_blocks;
ExportAllocationsForRepacking(repack_allocation_blocks);
VLOG(2) << "Repacking.";
auto repack_status =
options_.repacker->Repack(absl::MakeSpan(repack_allocation_blocks));
CHECK_EQ(repack_status.status(), Status::OK());
VLOG(2) << "Repack complete. Modified = " << *repack_status;
if (*repack_status) {
ImportRepackedAllocations(absl::MakeSpan(repack_allocation_blocks));
--retry_number;
}
} else {
FinalizeAllocations(absl::MakeSpan(allocation_values));
break;
@ -1541,6 +1559,33 @@ bool AlternateMemoryBestFitHeap::AreIntervalsReservedInAlternateMemory(
return false;
}
// Appends a pointer to every block currently stored in
// repack_allocation_blocks_ to `allocations`, in container order. The blocks
// themselves are not copied; the caller receives pointers into the member
// container.
void AlternateMemoryBestFitHeap::ExportAllocationsForRepacking(
    std::vector<AlternateMemoryBestFitHeap::RepackAllocationBlock*>&
        allocations) {
  for (auto block_it = repack_allocation_blocks_.begin();
       block_it != repack_allocation_blocks_.end(); ++block_it) {
    RepackAllocationBlock* exported_block = &*block_it;
    allocations.push_back(exported_block);
  }
}
// Applies the offsets chosen by the repacker to the allocations referenced by
// `repacked_allocations` and rebuilds interval_tree_ from scratch so that it
// reflects the new packing. After a block is imported, its new offset becomes
// its initial_offset and its offset field is reset to -1.
void AlternateMemoryBestFitHeap::ImportRepackedAllocations(
    absl::Span<AlternateMemoryBestFitHeap::RepackAllocationBlock*>
        repacked_allocations) {
  // Start from an empty tree; every repacked block is re-added below.
  interval_tree_ = {};
  for (RepackAllocationBlock* block : repacked_allocations) {
    MemorySpaceAssignment::Allocation* moved_allocation = block->opaque;
    const int64 new_offset = block->offset;
    VLOG(3) << "Moved " << moved_allocation->ToString() << ", size "
            << moved_allocation->chunk().size << " from "
            << block->initial_offset << " to " << new_offset;
    // Write the new offset into the allocation's own chunk.
    moved_allocation->mutable_chunk()->offset = new_offset;
    interval_tree_.Add(block->start_time, block->end_time,
                       {new_offset, block->size});
    // Prepare the block for a possible later repack.
    block->initial_offset = new_offset;
    block->offset = -1;
  }
}
void AlternateMemoryBestFitHeap::UncommitPendingChunks(
absl::Span<AllocationValue> allocation_values) {
// Clear the allocation sequence of the allocation values so that in case we
@ -1591,11 +1636,37 @@ void AlternateMemoryBestFitHeap::UncommitPendingChunks(
void AlternateMemoryBestFitHeap::FinalizeAllocations(
absl::Span<AllocationValue> allocation_values) {
absl::flat_hash_map<int64, std::vector<MemorySpaceAssignment::Allocation*>>
colocation_map;
for (AllocationValue& allocation_value : allocation_values) {
for (auto& allocation : *allocation_value.allocation_sequence()) {
AppendAllocationInfoDebugString(allocation_value, *allocation,
allocation_info_str_);
allocations_->push_back(std::move(allocation));
MemorySpaceAssignment::Allocation* inserted_allocation =
allocations_->back().get();
if (inserted_allocation->memory_space() == MemorySpace::kAlternate) {
colocation_map[inserted_allocation->chunk().offset].push_back(
inserted_allocation);
}
}
}
// Assume allocations that received the same offset need to be colocated.
// Export these to repack_allocation_blocks_ so that we can repack them to
// reduce fragmentation.
for (auto& colocation : colocation_map) {
std::vector<RepackAllocationBlock*> colocations;
for (MemorySpaceAssignment::Allocation* colocated_allocation :
colocation.second) {
repack_allocation_blocks_.push_back(
{colocated_allocation->start_time(), colocated_allocation->end_time(),
colocated_allocation->chunk().size, /*offset=*/-1,
colocated_allocation->chunk().offset, /*colocations=*/{},
colocated_allocation});
colocations.push_back(&repack_allocation_blocks_.back());
}
for (RepackAllocationBlock* repack_block : colocations) {
repack_block->colocations = colocations;
}
}
ClearPendingChunks();

View File

@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/heap_simulator.h"
#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
#include "tensorflow/compiler/xla/service/memory_space_assignment_repacking.h"
namespace xla {
@ -379,6 +380,9 @@ class MemorySpaceAssignment {
// space and a fast and small alternate memory space.
enum class MemorySpace { kDefault, kAlternate };
// Forward declaration for Allocation.
class Allocation;
// The different options to be passed to the Run() API.
struct Options {
// Backend-specific integer value that describes the alternate memory.
@ -424,6 +428,15 @@ class MemorySpaceAssignment {
// copies or asynchronous copy ordering.
int64 max_retries = 1;
// The maximum number of repacks that we are willing to perform in case we
// can't allocate a buffer due to running out of memory. If this value is
// greater than 0, repacker must be non-nullptr.
int64 max_repacks = 0;
// The repacking algorithm to reduce fragmentation. Must be non-null if
// max_repacks is greater than 0.
MemorySpaceAssignmentRepacker<Allocation*>* repacker = nullptr;
// If true, tries allocating buffers across (e.g., before and inside a while
// loop body) sequential calls (kWhile, kCall, and kConditional).
bool allocate_across_sequential_calls = false;
@ -511,6 +524,7 @@ class MemorySpaceAssignment {
const std::vector<HloUse>& uses() const { return uses_; }
MemorySpace memory_space() const { return memory_space_; }
Chunk chunk() const { return *chunk_; }
// Returns a pointer through which the chunk stored in chunk_ can be modified
// in place, unlike chunk(), which returns a copy. The pointer is obtained by
// dereferencing chunk_.
// NOTE(review): presumably chunk_ must hold a value when this is called --
// confirm against callers.
Chunk* mutable_chunk() { return &*chunk_; }
void set_start_time(int64 start_time) { start_time_ = start_time; }
int64 start_time() const { return start_time_; }
int64 end_time() const { return end_time_; }
@ -929,6 +943,9 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap {
HeapSimulator::Result Finish() override;
private:
using RepackAllocationBlock = MemorySpaceAssignmentRepacker<
MemorySpaceAssignment::Allocation*>::AllocationBlock;
// An allocation request for a use segment. A use segment is the time segment
// between the definition and the first use, and the time segment between the
// uses of a buffer. For example, the time between the definition and Use1, is
@ -1149,6 +1166,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap {
absl::optional<AsynchronousCopy> ViolatesAsyncCopyOrdering(
int64 start_time, int64 end_time) const;
// Exports the allocations for repacking and puts them into the vector in the
// parameter.
void ExportAllocationsForRepacking(
std::vector<RepackAllocationBlock*>& allocations);
// Imports repacked allocations and updates the internal data structures
// consistent with the new packing.
void ImportRepackedAllocations(
absl::Span<RepackAllocationBlock*> repacked_allocations);
// Adds an asynchronous copy to the allocations.
void AddAsyncCopy(const MemorySpaceAssignment::Allocation& prev_allocation,
MemorySpace memory_space, absl::optional<Chunk> chunk,
@ -1197,6 +1224,11 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap {
BufferIntervalTree prefetch_interval_tree_;
BufferIntervalTree eviction_interval_tree_;
AsynchronousCopyOrdering async_copy_ordering_;
// A list of RepackAllocationBlock objects that mirrors allocation sequences,
// used for repacking. We use a list here because we need pointer stability
// for aliased allocations.
std::list<RepackAllocationBlock> repack_allocation_blocks_;
int64 num_repacks_ = 0;
std::vector<std::pair<BufferInterval, ChunkCandidate>> pending_chunks_;
std::vector<AsynchronousCopy> pending_async_copies_;
std::vector<std::pair<const HloValue*, RequiredMemoryAssignment>>

View File

@ -0,0 +1,57 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_
#include <vector>

#include "absl/types/span.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
namespace xla {
// An interface to define allocation repacking algorithms.
//
// The template parameter O is the type of the opaque payload that each
// AllocationBlock carries on behalf of the caller; the repacking algorithm
// only sees it as the `opaque` field.
template <typename O>
class MemorySpaceAssignmentRepacker {
public:
MemorySpaceAssignmentRepacker() = default;
virtual ~MemorySpaceAssignmentRepacker() = default;
// A contiguous block of allocation consisting of start and end (logical)
// times, size, and the initial offset. After repacking, if the repacking was
// successful and the allocations were modified, the offset field holds the
// new offset. To support aliased allocations, AllocationBlock also includes a
// vector of AllocationBlock pointers, called colocations. All AllocationBlock
// objects within the colocations must get the same offset. The opaque field
// is used by the MemorySpaceAssignment pass and should not be accessed by the
// repacking algorithm.
struct AllocationBlock {
// Logical time at which the block starts being live.
int64 start_time;
// Logical time at which the block stops being live.
int64 end_time;
// Size of the block.
int64 size;
// Output of the repacker: the offset assigned by repacking.
int64 offset;
// Input to the repacker: the offset the block had before repacking.
int64 initial_offset;
// Blocks that must be assigned the same offset as this one.
std::vector<AllocationBlock*> colocations;
// Caller-owned payload; not to be accessed by the repacking algorithm.
O opaque;
};
// Repack the AllocationBlocks provided in the parameter. Returns true if
// allocations have been modified and false if not. Returns a non-ok status if
// there was an error.
virtual StatusOr<bool> Repack(absl::Span<AllocationBlock*> allocations) = 0;
};
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_REPACKING_H_

View File

@ -71,19 +71,22 @@ class MemorySpaceAssignmentTest : public HloTestBase,
std::unique_ptr<PresetAssignments> AssignMemorySpace(
HloModule* module, int64 max_outstanding_async_copies = -1,
int64 max_prefetch_interval = 10, int64 min_prefetch_interval = 2) {
int64 max_prefetch_interval = 10, int64 min_prefetch_interval = 2,
absl::optional<MemorySpaceAssignment::Options> options = absl::nullopt) {
InstructionCountPrefetchIntervalPicker prefetch_interval_picker(
min_prefetch_interval, max_prefetch_interval);
return AssignMemorySpace(module, max_outstanding_async_copies,
/*buffer_interval_compare=*/{},
&prefetch_interval_picker);
&prefetch_interval_picker, options);
}
std::unique_ptr<PresetAssignments> AssignMemorySpace(
HloModule* module, int64 max_outstanding_async_copies,
absl::optional<MemorySpaceAssignment::BufferIntervalCompare>
buffer_interval_compare,
PrefetchIntervalPicker* prefetch_interval_picker) {
PrefetchIntervalPicker* prefetch_interval_picker,
absl::optional<MemorySpaceAssignment::Options>
memory_space_assignment_options = absl::nullopt) {
auto size_fn = [](const BufferValue& buffer) {
return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
};
@ -117,9 +120,15 @@ class MemorySpaceAssignmentTest : public HloTestBase,
}
MemorySpaceAssignment::Options options;
if (memory_space_assignment_options) {
options = *memory_space_assignment_options;
} else {
options.max_size_in_bytes = 128;
options.alignment_in_bytes = 8;
options.verify = true;
}
options.alternate_memory_space = kAlternateMemorySpace;
options.max_size_in_bytes = 128;
options.alignment_in_bytes = 8;
options.buffer_interval_compare = buffer_interval_compare;
options.prefetch_interval_picker = prefetch_interval_picker;
options.size_fn = size_fn;
@ -127,7 +136,6 @@ class MemorySpaceAssignmentTest : public HloTestBase,
options.max_outstanding_prefetches = max_outstanding_async_copies;
options.max_outstanding_evictions = max_outstanding_async_copies;
options.allocate_across_sequential_calls = GetParam();
options.verify = true;
auto alias_analysis = HloAliasAnalysis::Run(module).ValueOrDie();
std::unique_ptr<HloLiveRange> hlo_live_range =
@ -4058,6 +4066,169 @@ TEST_P(MemorySpaceAssignmentTest, MoveCopyDoneEarlier) {
find_schedule_index(cos->operand(0)));
}
// A fake MemorySpaceAssignmentRepacker class that accepts a map of
// (start_time, offset) -> new_offset values. Using this map, the repacker
// moves every block whose (start_time, initial_offset) appears in the map,
// together with all of its colocations, to new_offset. Every other block keeps
// its initial offset.
class FakeMemorySpaceAssignmentRepacker
    : public MemorySpaceAssignmentRepacker<MemorySpaceAssignment::Allocation*> {
 public:
  // Copies `repack_map`; later changes to the caller's map have no effect on
  // this repacker.
  explicit FakeMemorySpaceAssignmentRepacker(
      const absl::flat_hash_map<std::pair<int64, int64>, int64>& repack_map)
      : repack_map_(repack_map) {}

  // Assigns an offset to every block in `allocations` (and to the blocks
  // reachable through their colocations). Returns true if at least one block
  // matched an entry of the repack map, false otherwise. Never returns a
  // non-ok status.
  StatusOr<bool> Repack(absl::Span<AllocationBlock*> allocations) override {
    // Pass 1: log the request and default every block to its current
    // placement. Doing this before applying any move keeps the result
    // independent of the order in which the blocks are visited: an unmapped
    // block visited late can no longer undo a move that was applied to one of
    // its colocations earlier.
    for (AllocationBlock* block : allocations) {
      VLOG(1) << "Alloc time: [" << block->start_time << ", " << block->end_time
              << "] size: " << block->size
              << " init offset: " << block->initial_offset;
      block->offset = block->initial_offset;
      for (AllocationBlock* colocation : block->colocations) {
        VLOG(1) << " [" << colocation->start_time << ", "
                << colocation->end_time << "]";
        colocation->offset = colocation->initial_offset;
      }
    }

    // Pass 2: apply the requested moves. A move is propagated to every
    // colocation so that all colocated blocks end up with the same offset.
    bool modified = false;
    for (AllocationBlock* block : allocations) {
      const auto it =
          repack_map_.find({block->start_time, block->initial_offset});
      if (it == repack_map_.end()) {
        continue;
      }
      modified = true;
      block->offset = it->second;
      for (AllocationBlock* colocation : block->colocations) {
        colocation->offset = it->second;
      }
    }
    return modified;
  }

 private:
  // A map from (start_time, offset) to new_offset.
  absl::flat_hash_map<std::pair<int64, int64>, int64> repack_map_;
};
// Verifies that, when an allocation fails because alternate memory is
// fragmented, the configured repacker is invoked and the allocation succeeds
// after the existing allocations have been moved.
TEST_P(MemorySpaceAssignmentTest, Repack) {
  // We initially perform the following allocations at these offsets.
  //
  //    Max memory
  //  -------------------------------------------
  //
  //
  //
  //
  //      +------------+
  //      |     b      |
  //      +------------+
  //  +-------+                 +------------+
  //  |   a   |                 |     n      |
  //  +-------+                 +------------+
  //  -------------------------------------------
  //    Min memory          time ->
  //
  // Next up, we try to allocate the buffer for d. However due to
  // fragmentation, this won't be possible:
  //
  //    Max memory
  //  -------------------------------------------
  //
  //
  //
  //                +---------+
  //      +------------+      |
  //      |     b      |      |
  //      +------------+      |
  //  +-------+     |         | +------------+
  //  |   a   |     |    d    | |     n      |
  //  +-------+     +---------+ +------------+
  //  -------------------------------------------
  //    Min memory          time ->
  //
  // We then call repack to repack the existing allocations which allows us to
  // allocate the buffer for d:
  //
  //    Max memory
  //  -------------------------------------------
  //                +---------+
  //                |         |
  //                |         |
  //                |         |
  //  +-------+     |         |
  //  |   a   |     |    d    |
  //  +-------+     +---------+
  //      +------------+        +------------+
  //      |     b      |        |     n      |
  //      +------------+        +------------+
  //  -------------------------------------------
  //    Min memory          time ->
  absl::string_view hlo_string = R"(
HloModule bug, is_scheduled=true
ENTRY Entry {
param0 = f32[8,3] parameter(0)
param1 = f32[2,4] parameter(1)
a = f32[2,4] sine(param1)
b = f32[2,4] cosine(param1)
c = f32[8,3] negate(param0)
j = f32[2,4] negate(a)
d = f32[8,3] tanh(param0)
k = f32[2,4] negate(j)
l = f32[2,4] add(b, k)
m = f32[8,3] negate(d)
n = f32[2,4] sine(l)
o = f32[8,3] negate(m)
p = f32[2,4] negate(n)
q = f32[8,3] negate(m)
ROOT tuple = (f32[2,4], f32[8,3], f32[8,3]) tuple(p, q, o)
}
)";

  // Order the buffers so that the sine results (a, n) are considered first,
  // then the cosine result (b), then the tanh result (d), then everything
  // else. This produces the fragmented layout drawn above.
  MemorySpaceAssignment::BufferIntervalCompare buffer_interval_compare =
      [](const MemorySpaceAssignment::BufferInterval& a,
         const MemorySpaceAssignment::BufferInterval& b) {
        auto get_opcode_priority = [](const HloOpcode& opcode) {
          switch (opcode) {
            case HloOpcode::kSin:
              return 0;
            case HloOpcode::kCos:
              return 1;
            case HloOpcode::kTanh:
              return 2;
            default:
              return 3;
          }
        };

        return get_opcode_priority(a.buffer->defining_instruction()->opcode()) <
               get_opcode_priority(b.buffer->defining_instruction()->opcode());
      };
  TF_ASSERT_OK_AND_ASSIGN(auto module,
                          ParseAndReturnVerifiedModule(hlo_string));

  InstructionCountPrefetchIntervalPicker prefetch_interval_picker(2, 10);
  absl::flat_hash_map<std::pair<int64, int64>, int64> repack_map;
  // Move "a" from offset 0 to 32.
  repack_map[{2, 0}] = 32;
  // Move "b" from offset 32 to 0.
  repack_map[{3, 32}] = 0;
  // Direct-initialize the repacker instead of copy-initializing it from a
  // temporary.
  FakeMemorySpaceAssignmentRepacker repacker(repack_map);
  MemorySpaceAssignment::Options options;
  options.max_size_in_bytes = 128;
  options.alignment_in_bytes = 8;
  options.verify = true;
  // Allow exactly one repack; max_repacks > 0 requires a non-null repacker.
  options.max_repacks = 1;
  options.repacker = &repacker;
  AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1,
                    buffer_interval_compare, &prefetch_interval_picker,
                    options);

  // If repacking succeeds, we should find the buffer for d in alternate memory.
  const HloInstruction* d =
      module->entry_computation()->GetInstructionWithName("d");
  EXPECT_EQ(d->shape().layout().memory_space(), kAlternateMemorySpace);
}
TEST_P(MemorySpaceAssignmentTest, Determinism) {
// Run memory space assignment a few times to make sure every time it compiles
// to the same thing.