STT-tensorflow/tensorflow/compiler/xla/service/buffer_assignment.h
A. Unique TensorFlower 4718ac6b15 Internal change.
Change: 151587999
2017-03-29 10:23:43 -07:00

502 lines
21 KiB
C++

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_ASSIGNMENT_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_ASSIGNMENT_H_
#include <functional>
#include <iosfwd>
#include <memory>
#include <string>
#include <vector>
#include "tensorflow/compiler/xla/service/buffer_liveness.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/gtl/flatmap.h"
#include "tensorflow/core/lib/gtl/flatset.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"
namespace xla {
// This class abstracts an allocation of contiguous memory which can hold the
// values described by LogicalBuffers. Each LogicalBuffer occupies a sub-range
// of the allocation, represented by a Slice. A single BufferAllocation may hold
// LogicalBuffers with disjoint liveness, which may have overlapping Slices. A
// single BufferAllocation may also hold LogicalBuffers with overlapping
// liveness, which must have disjoint Slices.
//
// The abstraction includes information required by the backends for allocation,
// use, and deallocation of the buffer. This includes the LogicalBuffers which
// are held in this allocation through the execution of the computation.
class BufferAllocation {
public:
// Holds a unique identifier for each allocation. Values are assigned
// contiguously and can be used as array indexes.
using Index = int64;
BufferAllocation(Index index, int64 size, bool is_thread_local,
bool is_reusable)
: index_(index),
size_(size),
is_thread_local_(is_thread_local),
is_reusable_(is_reusable) {}
~BufferAllocation() {}
// Returns the index of this allocation.
Index index() const { return index_; }
// Whether this allocation is used in a parallel calling context such as
// inside of a map or reduce computation. Such allocations need to be thread
// local.
bool is_thread_local() const { return is_thread_local_; }
// Whether this allocation can be used by more than one logical buffer.
bool is_reusable() const { return is_reusable_; }
// Whether this allocation holds a LogicalBuffer from a parameter of the entry
// computation. These buffers have lifetimes which may be longer than the
// XLA computation.
bool is_entry_computation_parameter() const {
return is_entry_computation_parameter_;
}
// If this allocation holds a Buffer from a parameter of the entry
// computation, this methods returns the parameter number. CHECKs otherwise.
int64 parameter_number() const {
CHECK(is_entry_computation_parameter_);
return parameter_number_;
}
// Returns whether this allocation is assigned a LogicalBuffer which may
// be live out of the entry computation.
bool maybe_live_out() const { return maybe_live_out_; }
// Returns the size of the allocation. Necessarily this must be at least as
// large as any LogicalBuffer assigned to this allocation.
int64 size() const { return size_; }
struct OffsetSize {
int64 offset = 0;
int64 size = 0;
};
// Access to the logical buffers assigned to this allocation, and their
// associated logical offsets and sizes.
const tensorflow::gtl::FlatMap<const LogicalBuffer*, OffsetSize>&
assigned_buffers() const {
return assigned_buffers_;
}
// A Slice represents a contiguous portion of a memory allocation. It is used
// to identify the memory range that a LogicalBuffer corresponds to.
class Slice {
public:
Slice() {}
Slice(const BufferAllocation* allocation, int64 offset, int64 size)
: allocation_(allocation), offset_(offset), size_(size) {}
const BufferAllocation* allocation() const { return allocation_; }
Index index() const { return allocation_->index(); }
int64 offset() const { return offset_; }
int64 size() const { return size_; }
bool operator==(const Slice& other) const {
return index() == other.index() && offset_ == other.offset_ &&
size_ == other.size_;
}
bool operator!=(const Slice& other) const { return !(*this == other); }
bool operator<(const Slice& other) const {
if (index() != other.index()) return index() < other.index();
if (offset_ != other.offset_) return offset_ < other.offset_;
return size_ < other.size_;
}
// Returns true iff this slice's memory range has a non-empty intersection
// with the other slice's memory range.
bool OverlapsWith(const Slice& other) const {
const int64 end = offset_ + size_;
const int64 other_end = other.offset_ + other.size_;
return index() == other.index() && offset_ < other_end &&
end > other.offset_;
}
struct Hasher {
size_t operator()(Slice s) const;
};
string ToString() const;
private:
const BufferAllocation* allocation_ = nullptr;
int64 offset_ = 0;
int64 size_ = 0;
};
// GetSlice returns the Slice of contiguous memory that holds the value
// described by the given 'buffer'.
// REQUIRES: 'buffer' must be assigned to this allocation.
Slice GetSlice(const LogicalBuffer& buffer) const;
string ToString() const;
// Whether the buffer is a parameter to or live out of the entry computation.
bool IsInputOrOutput() const {
return is_entry_computation_parameter() || maybe_live_out();
}
// Whether the buffer is a temporary buffer allocated before
// Executable::ExecuteOnStream.
bool IsPreallocatedTempBuffer() const {
// Parameters do not need temporary buffers.
return !is_entry_computation_parameter() &&
// LogicalBuffers that maybe pointed to by the output should live out
// of the computation.
!maybe_live_out() &&
// Thread-local buffers are allocated using `alloca`s.
!is_thread_local();
}
bool operator==(const BufferAllocation& other) const {
return index_ == other.index_;
}
bool operator!=(const BufferAllocation& other) const {
return !(*this == other);
}
bool operator<(const BufferAllocation& other) const {
return index() < other.index();
}
private:
// Only BufferAssigner and BufferAssignment can modify BufferAllocation.
friend class BufferAssigner;
friend class BufferAssignment;
// Adds a LogicalBuffer to the set assigned to this buffer.
void AddAssignment(const LogicalBuffer& buffer, int64 offset, int64 size);
void set_entry_computation_parameter(int64 parameter_number) {
is_entry_computation_parameter_ = true;
parameter_number_ = parameter_number;
}
void set_maybe_live_out(bool value) { maybe_live_out_ = value; }
void set_index(Index index) { index_ = index; }
void set_size(int64 size) { size_ = size; }
// The index of the allocation in the BufferAssignment.
Index index_;
// Size of the allocation in bytes.
int64 size_;
// Whether this buffer needs to be thread-local.
bool is_thread_local_;
// Whether this buffer is usable by more than one logical buffer.
bool is_reusable_;
// Whether this allocation holds an entry computation parameter. Entry
// computation parameters are special be cause they have lifetimes which may
// outlast the computation.
bool is_entry_computation_parameter_ = false;
// If this allocation holds an entry computation parameter, this field
// indicates the index (starting from 0) of the parameter.
int64 parameter_number_ = 0;
// Whether the allocation contains a LogicalBuffer which may be live-out of
// the entry computation. Note that this flag is conservatively computed by
// TuplePointsToAnalysis. That is, an allocation marked `maybe_live_out_`
// might not actually escape.
bool maybe_live_out_ = false;
// Mapping from the set of buffers assigned to this allocation to their
// logical offsets and sizes.
tensorflow::gtl::FlatMap<const LogicalBuffer*, OffsetSize> assigned_buffers_;
};
// Add stream operators for nicer output of CHECK/RET_CHECK failures.
std::ostream& operator<<(std::ostream& out, const BufferAllocation& s);
std::ostream& operator<<(std::ostream& out, const BufferAllocation::Slice& s);
// This class encapsulates an assignment of the LogicalBuffers in an XLA
// module to a set of BufferAllocations.
class BufferAssignment {
public:
// Returns the vector containing all buffer allocations in this assignment.
const std::vector<BufferAllocation>& Allocations() const {
return allocations_;
}
// Returns the single allocation holding all temporary buffers. Returns
// nullptr if there are no temporary buffers, or if the assignment uses more
// than one allocation to hold temporary buffers.
const BufferAllocation* GetTempAllocation() const { return temp_allocation_; }
// Returns whether the given buffer has been assigned an allocation.
bool HasAllocation(const LogicalBuffer& buffer) const;
// Returns the allocation that a particular LogicalBuffer has been assigned
// to. CHECKs if buffer has not been assigned an allocation.
const BufferAllocation& GetAssignedAllocation(
const LogicalBuffer& buffer) const;
// Returns the allocation with the given index. CHECKs if no allocation exists
// with the given index.
const BufferAllocation& GetAllocation(BufferAllocation::Index index) const;
// Builds and returns a vector containing the slices which might contain the
// subvalue at the given index of given instruction.
std::set<BufferAllocation::Slice> GetAllSlices(
const HloInstruction* instruction, const ShapeIndex& index) const;
// Convenience function which returns whether the top-level buffer of the
// instruction (index == {}) is assigned an allocation.
bool HasTopLevelAllocation(const HloInstruction* instruction) const;
// Convenience function which returns the unique slice containing the buffer
// at the given index of the given instruction. If a slice is not assigned or
// the slice cannot be determined at compile time then an error is returned.
StatusOr<BufferAllocation::Slice> GetUniqueSlice(
const HloInstruction* instruction, const ShapeIndex& index) const;
// Like GetUniqueSlice but fixes the index to the top-level of the shape
// (index = {}).
StatusOr<BufferAllocation::Slice> GetUniqueTopLevelSlice(
const HloInstruction* instruction) const;
// Like GetUniqueTopLevelSlice but returns the slice for the output of the
// entry computation of the HLO module (ie, the result of the XLA
// computation).
StatusOr<BufferAllocation::Slice> GetUniqueTopLevelOutputSlice() const;
// Returns the set LogicalBuffers which may be the source of the value at the
// given index and instruction.
const std::vector<const LogicalBuffer*>& GetSourceBuffers(
const HloInstruction* instruction, const ShapeIndex& index) const {
return GetPointsToSet(instruction).element(index);
}
// Returns true if 'hlo_a{shape_index_a}' and 'hlo_b{shape_index_b}'
// share the same BufferAllocation::Slice.
// Returns false otherwise.
// REQUIRES: BufferAssignment assigned allocations to both instructions.
bool SharesSliceAtIndex(const HloInstruction* hlo_a,
const ShapeIndex& shape_index_a,
const HloInstruction* hlo_b,
const ShapeIndex& shape_index_b) const;
// Returns the underlying points-to analysis used for this assignment.
const TuplePointsToAnalysis& points_to_analysis() const {
return liveness_->points_to_analysis();
}
string ToString() const;
// Statistics for the assignment. Values initialized to -1 are not always
// collected; fragmentation is only collected for instructions that have a
// sequential total ordering.
struct Stats {
int64 parameter_allocation_count = 0;
int64 parameter_allocation_bytes = 0;
int64 maybe_live_out_allocation_count = 0;
int64 maybe_live_out_allocation_bytes = 0;
int64 preallocated_temp_allocation_count = 0;
int64 preallocated_temp_allocation_bytes = 0;
int64 preallocated_temp_fragmentation_bytes = -1;
int64 total_allocation_count = 0;
int64 total_allocation_bytes = 0;
int64 total_fragmentation_bytes = -1;
string ToString() const;
};
const Stats& GetStats() const { return stats_; }
private:
// Only BufferAssigner can build or modify BufferAssignments.
friend class BufferAssigner;
explicit BufferAssignment(const HloModule* module,
std::unique_ptr<BufferLiveness> liveness,
int64 alignment)
: module_(module),
liveness_(std::move(liveness)),
alignment_(alignment) {}
// Creates and returns a new BufferAllocation, with no assigned
// LogicalBuffers. Ownership is maintained internally.
BufferAllocation* NewEmptyAllocation(int64 size, bool is_thread_local,
bool is_reusable);
// Helper that calls NewEmptyAllocation and AddAssignment in one call,
// creating an allocation containing a single LogicalBuffer.
BufferAllocation* NewAllocation(const LogicalBuffer& buffer, int64 size,
bool is_thread_local, bool is_reusable);
// Adds a LogicalBuffer to the set assigned to the given allocation.
void AddAssignment(BufferAllocation* allocation, const LogicalBuffer& buffer,
int64 offset, int64 size);
// Returns the BufferLiveness object used to construct this assignment.
const BufferLiveness& liveness() { return *liveness_; }
// Convenience function which returns the PointsToSet for the given
// instruction. Extracted from the liveness object.
const PointsToSet& GetPointsToSet(const HloInstruction* instruction) const;
// Mutable accessors for allocations.
BufferAllocation* GetMutableAssignedAllocation(const LogicalBuffer& buffer);
BufferAllocation* GetMutableAllocation(BufferAllocation::Index index);
// Combines allocations of temporary buffers into one big BufferAllocation.
void CombineTempAllocations();
// Computes stats for the assignment, to be retrieved by GetStats.
Status ComputeSummaryStats(const LogicalBuffer::SizeFunction& buffer_size);
// The vector of buffer allocations. Indexed by BufferAllocation::Index.
std::vector<BufferAllocation> allocations_;
// The single allocation holding all temporary buffers.
BufferAllocation* temp_allocation_ = nullptr;
// Maps Buffers to the index of the BufferAllocation which holds the buffer.
tensorflow::gtl::FlatMap<const LogicalBuffer*, BufferAllocation::Index>
allocation_index_for_buffer_;
const HloModule* module_;
const std::unique_ptr<BufferLiveness> liveness_;
const int64 alignment_;
Stats stats_;
TF_DISALLOW_COPY_AND_ASSIGN(BufferAssignment);
};
// A class which constructs a buffer assignment.
class BufferAssigner {
public:
// Build and return a BufferAssignment for the given module. The given
// HloOrdering is used to determine buffer liveness. buffer_size is a function
// which returns the size of a LogicalBuffer. Alignment is the the minimum
// alignment of any buffer. If hlos_to_allocate is not null then only
// instructions in this vector are considered for buffer assignment. If
// hlos_to_allocate is null then all instructions are considered. If
// 'colocate_related_buffers' is true, related LogicalBuffers will be
// colocated in the same allocation (i.e buffers for while result will share
// an allocation with buffers related to that same while instruction: init
// operand, condition/body parameter and body result).
static StatusOr<std::unique_ptr<BufferAssignment>> Run(
const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
LogicalBuffer::SizeFunction buffer_size, int64 alignment,
bool colocate_related_buffers,
const std::vector<const HloInstruction*>* hlos_to_allocate = nullptr);
// Overload of Run which uses ShapeUtil::ByteSizeOf to determine buffer size
// and assigns buffers to all HLO instructions in the module.
static StatusOr<std::unique_ptr<BufferAssignment>> Run(
const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
LogicalBuffer::SizeFunction buffer_size, int64 alignment);
private:
explicit BufferAssigner(LogicalBuffer::SizeFunction buffer_size,
int64 alignment, bool colocate_related_buffers)
: buffer_size_(std::move(buffer_size)),
alignment_(alignment),
colocate_related_buffers_(colocate_related_buffers) {}
virtual ~BufferAssigner() = default;
// Create a buffer assignment.
StatusOr<std::unique_ptr<BufferAssignment>> CreateAssignment(
const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
const std::vector<const HloInstruction*>* hlos_to_allocate = nullptr);
// Assigns buffers to the instructions in the given computation. "assignment"
// is modified to reflect the new buffer assignments. If is_thread_local is
// true, then all assigned buffers have the is_thread_local flag set to
// true. If hlos_to_allocate is not null it indicates which HLOs to include in
// buffer assignment. If null, all instructions in the computation are
// included.
Status AssignBuffersForComputation(
const HloComputation* computation, bool is_thread_local,
const tensorflow::gtl::FlatSet<const HloInstruction*>* hlos_to_allocate,
const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers,
const tensorflow::gtl::FlatSet<BufferAllocation::Index>&
colocated_allocations,
BufferAssignment* assignment);
// Assigns 'buffers_to_assign' assuming the HLO instructions will be executed
// in the given 'sequential_order'.
Status AssignBuffersWithSequentialOrdering(
const std::vector<const HloInstruction*>& sequential_order,
const tensorflow::gtl::FlatSet<const LogicalBuffer*>& buffers_to_assign,
const HloComputation& computation, BufferAssignment* assignment);
// Tries to assign the given instruction to the given buffer. Returns if the
// assignment was successful.
bool MaybeAssignBuffer(BufferAllocation* allocation,
const LogicalBuffer& buffer,
BufferAssignment* assignment);
// Colocated buffers are logical buffers from different computations which
// alias. Explicitly handling these colocated buffers is necessary because
// points-to analysis is computation level scope and does not recognize
// aliasing across computations (b/32491382).
using ColocatedBufferSet = tensorflow::gtl::FlatSet<const LogicalBuffer*>;
// Returns a vector of ColocatedBufferSet objects, where each
// ColocatedBufferSet aggregates a set of related LogicalBuffers from 'module'
// which should be colocated in the same buffer allocation.
void BuildColocatedBufferSets(
const HloModule* module, const TuplePointsToAnalysis& points_to_analysis,
std::vector<ColocatedBufferSet>* colocated_buffer_sets);
// For each buffer set in 'colocated_buffer_sets', assigns all buffers in the
// same set to the same buffer allocation in 'assignment'.
void AssignColocatedBufferSets(
const std::vector<ColocatedBufferSet>& colocated_buffer_sets,
BufferAssignment* assignment,
tensorflow::gtl::FlatSet<const LogicalBuffer*>* colocated_buffers,
tensorflow::gtl::FlatSet<BufferAllocation::Index>* colocated_allocations);
// Adds the 'colocated_set' of buffers to 'colocated_buffer_sets', maintaining
// the invariant that all sets in 'colocated_buffer_sets' are disjoint.
void AddSetToColocatedBufferSets(
const std::vector<const LogicalBuffer*>& colocated_set,
std::vector<ColocatedBufferSet>* colocated_buffer_sets);
const HloModule* module_;
// Function which returns the buffer size for a given logical buffer (shape).
LogicalBuffer::SizeFunction buffer_size_;
// Minimum alignment of any buffer.
int64 alignment_;
// Indicates whether related buffers should share the same buffer allocation.
const bool colocate_related_buffers_;
TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner);
};
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_ASSIGNMENT_H_