Use separate allocator for cached prepacked matrix allocations.

This CL splits out the SystemAlignedAlloc/Free functions so that they are usable independently of ruy::Allocator.

ruy::Allocator is a highly specialized allocator designed for the hot path of multiple GEMMs.

The use case of cached pre-packing has a very different set of tradeoffs.

PiperOrigin-RevId: 283555950
Change-Id: I012a6ba4386e1727866e677965b857f62992d5f1
Author: Sean Silva, 2019-12-03 09:01:54 -08:00; committed by TensorFlower Gardener
parent 2454d7bd23
commit 6547b6511b
4 changed files with 78 additions and 57 deletions
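
To make the tradeoff concrete (illustration only, not part of the commit): ruy::Allocator is arena-shaped, reclaiming everything at once via FreeAll so the hot GEMM loop can bump-allocate with no per-block bookkeeping, while an LRU cache must be able to free one entry's buffers when it is ejected. A minimal sketch of the two interface shapes, with hypothetical names and plain malloc standing in for the aligned system calls:

    #include <cstdlib>
    #include <vector>

    // Arena shape (what ruy::Allocator optimizes for): individual blocks
    // cannot be freed; everything is reclaimed together.
    class ArenaShape {
     public:
      void* AllocateBytes(std::size_t n) {
        blocks_.push_back(std::malloc(n));
        return blocks_.back();
      }
      void FreeAll() {
        for (void* p : blocks_) std::free(p);
        blocks_.clear();
      }
      ~ArenaShape() { FreeAll(); }

     private:
      std::vector<void*> blocks_;
    };

    // Cache shape (what cached pre-packing needs): any single block can be
    // freed on its own when its cache entry is ejected.
    class CacheShape {
     public:
      void* Alloc(std::size_t n) {
        blocks_.push_back(std::malloc(n));
        return blocks_.back();
      }
      void Free(void* p) {
        for (auto it = blocks_.begin(); it != blocks_.end(); ++it) {
          if (*it == p) {
            std::free(p);
            blocks_.erase(it);
            return;
          }
        }
      }
      ~CacheShape() {
        for (void* p : blocks_) std::free(p);
      }

     private:
      std::vector<void*> blocks_;
    };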

tensorflow/lite/experimental/ruy/allocator.cc

@@ -26,19 +26,19 @@ namespace ruy {
 namespace detail {
 
-void *AlignedAllocator::SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
+void *SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
 #ifdef _WIN32
-  return _aligned_malloc(num_bytes, kAlignment);
+  return _aligned_malloc(num_bytes, kMinimumBlockAlignment);
 #else
   void *ptr;
-  if (posix_memalign(&ptr, kAlignment, num_bytes)) {
+  if (posix_memalign(&ptr, kMinimumBlockAlignment, num_bytes)) {
     return nullptr;
   }
   return ptr;
 #endif
 }
 
-void AlignedAllocator::SystemAlignedFree(void *ptr) {
+void SystemAlignedFree(void *ptr) {
 #ifdef _WIN32
   _aligned_free(ptr);
 #else
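
The excerpt above is cut off inside SystemAlignedFree's #else branch; on POSIX, memory from posix_memalign is presumably released with plain free. For reference, a self-contained sketch of the same portability split (the constant mirrors kMinimumBlockAlignment from allocator.h; the free(ptr) ending is an assumption):

    #include <cstddef>
    #include <stdlib.h>  // posix_memalign, free
    #ifdef _WIN32
    #include <malloc.h>  // _aligned_malloc, _aligned_free
    #endif

    constexpr std::ptrdiff_t kMinimumBlockAlignment = 64;

    void* SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
    #ifdef _WIN32
      return _aligned_malloc(num_bytes, kMinimumBlockAlignment);
    #else
      void* ptr;
      // posix_memalign returns nonzero on failure and leaves ptr untouched.
      if (posix_memalign(&ptr, kMinimumBlockAlignment, num_bytes)) {
        return nullptr;
      }
      return ptr;
    #endif
    }

    void SystemAlignedFree(void* ptr) {
    #ifdef _WIN32
      _aligned_free(ptr);
    #else
      free(ptr);  // assumed: the posix_memalign counterpart
    #endif
    }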

tensorflow/lite/experimental/ruy/allocator.h

@@ -34,38 +34,49 @@ inline void* VoidPtrAdd(void* p, std::ptrdiff_t offset) {
   return reinterpret_cast<void*>(addr);
 }
 
-// Simple allocator designed to converge to a steady-state where all
+// Minimum alignment for blocks.
+//
+// Considerations:
+// - This needs to be at least the alignment of any usual data type.
+// - It's useful that this is at least the size of a cache line to limit
+//   possible cache side effects (if only on performance behavior).
+// - It's useful that this is at least the size of SIMD registers, as
+//   some SIMD instruction sets have at least performance behavior
+//   differences (e.g. NEON) or even different requirements (e.g. SSE)
+//   based on that.
+// - It's useful that this is at least the size of an "exclusive reservation
+//   granule" on ARM, meaning that if we use this Allocator to allocate
+//   an atomic variable, there will be no side effects from other things
+//   contending for exclusive/atomic memory accesses to it. While the
+//   ARM reference manual mentions that this granule size may be as large
+//   as 2048 bytes, in practice we observe it to be 64 bytes. It can
+//   be queried cheaply, at runtime, from userspace, if needed.
+static constexpr std::ptrdiff_t kMinimumBlockAlignment = 64;
+
+// Primitive allocation functions obtaining aligned memory from the
+// operating system.
+void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
+void SystemAlignedFree(void* ptr);
+
+// Specialized allocator designed to converge to a steady-state where all
 // allocations are bump-ptr allocations from an already-allocated buffer.
 //
 // To support these constraints, this allocator only supports two
 // operations.
 // - AllocateAlignedBytes: allocates a pointer to storage of a specified
-//   size, which must be aligned to kAlignment.
+//   size, which must be aligned to kMinimumBlockAlignment.
 // - FreeAll: frees all previous allocations (but retains the internal
 //   buffer to minimize future calls into the system allocator).
 //
+// This class is specialized for supporting just those two operations
+// under this specific steady-state usage pattern. Extending this class
+// with new allocation interfaces that don't fit that pattern is probably not
+// the right choice. Instead, build a new class on top of
+// SystemAlignedAlloc/SystemAlignedFree.
+//
 // All operations happen on aligned blocks for simplicity.
 class AlignedAllocator {
  public:
-  // Alignment of allocated blocks.
-  //
-  // Considerations:
-  // - This needs to be at least the alignment of any usual data type.
-  // - It's useful that this is at least the size of a cache line to limit
-  //   possible cache side effects (if only on performance behavior).
-  // - It's useful that this is at least the size of SIMD registers, as
-  //   some SIMD instruction sets have at least performance behavior
-  //   differences (e.g. NEON) or even different requirements (e.g. SSE)
-  //   based on that.
-  // - It's useful that this is at least the size of an "exclusive reservation
-  //   granule" on ARM, meaning that if we use this Allocator to allocate
-  //   an atomic variable, there will be no side effects from other things
-  //   contending for exclusive/atomic memory accesses to it. While the
-  //   ARM reference manual mentions that this granule size may be as large
-  //   as 2048 bytes, in practice we observe it to be 64 bytes. It can
-  //   be queried cheaply, at runtime, from userspace, if needed.
-  static constexpr std::ptrdiff_t kAlignment = 64;
-
   void operator=(const AlignedAllocator&) = delete;
   ~AlignedAllocator() {
     FreeAll();
@@ -74,7 +85,7 @@ class AlignedAllocator {
 
   void* AllocateAlignedBytes(std::ptrdiff_t num_bytes) {
     RUY_DCHECK_GT(num_bytes, 0);
-    RUY_DCHECK((num_bytes & (kAlignment - 1)) == 0);
+    RUY_DCHECK((num_bytes & (kMinimumBlockAlignment - 1)) == 0);
     if (void* p = AllocateFast(num_bytes)) {
       return p;
     }
@@ -105,17 +116,7 @@ class AlignedAllocator {
     fallback_blocks_total_size_ = 0;
   }
 
-  void FreeOne(void* ptr) {
-    for (auto p = fallback_blocks_.begin(); p != fallback_blocks_.end(); ++p) {
-      if (*p == ptr) {
-        SystemAlignedFree(ptr);
-        fallback_blocks_.erase(p);
-        return;
-      }
-    }
-    RUY_DCHECK(false);  // Trying to free pointer we did not allocate.
-  }
-
+ private:
   void* AllocateFast(std::ptrdiff_t num_bytes) {
     if (current_ + num_bytes > size_) {
       return nullptr;
@@ -132,12 +133,6 @@ class AlignedAllocator {
     return p;
   }
 
- private:
-  // Primitive allocation functions obtaining aligned memory from the
-  // operating system.
-  void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
-  void SystemAlignedFree(void* ptr);
-
   // Theory of operation:
   //
   // - ptr_, current_, and size_ implement a basic bump-ptr allocator.
@@ -171,7 +166,7 @@ class Allocator {
       return nullptr;
     }
     return aligned.AllocateAlignedBytes(
-        round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment));
+        round_up_pot(num_bytes, detail::kMinimumBlockAlignment));
   }
   template <typename Pointer>
   void Allocate(std::ptrdiff_t count, Pointer* out) {
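
The last hunk above rounds every request up to kMinimumBlockAlignment before delegating to AllocateAlignedBytes, which DCHECKs that sizes are multiples of that alignment. round_up_pot is not shown in this diff; assuming the usual power-of-two rounding idiom, it behaves like this sketch:

    #include <cassert>
    #include <cstddef>

    // Assumed semantics of round_up_pot: round x up to the next multiple of
    // pot, where pot is a power of two. Adding (pot - 1) and masking off the
    // low bits does this in two arithmetic operations.
    constexpr std::ptrdiff_t round_up_pot(std::ptrdiff_t x, std::ptrdiff_t pot) {
      return (x + pot - 1) & ~(pot - 1);
    }

    int main() {
      assert(round_up_pot(1, 64) == 64);    // small requests become one block
      assert(round_up_pot(64, 64) == 64);   // exact multiples are unchanged
      assert(round_up_pot(65, 64) == 128);  // anything over rounds up
      return 0;
    }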

tensorflow/lite/experimental/ruy/prepacked_cache.cc

@@ -58,19 +58,14 @@ void PrepackedCache::EjectOne() {
   PrepackedMatrix &pmatrix = oldest->second.first;
   cache_size_ -= pmatrix.data_size;
   cache_size_ -= pmatrix.sums_size;
-  allocator_.FreeOne(pmatrix.data);
-  allocator_.FreeOne(pmatrix.sums);
+  allocator_.Free(pmatrix.data);
+  allocator_.Free(pmatrix.sums);
   cache_.erase(oldest);
 }
 
 void PrepackedCache::AllocatePrepackedMatrix(PrepackedMatrix *pmatrix) {
-  pmatrix->data = AllocateBytes(pmatrix->data_size);
-  pmatrix->sums = AllocateBytes(pmatrix->sums_size);
-}
-
-void *PrepackedCache::AllocateBytes(std::ptrdiff_t num_bytes) {
-  // Force system allocation for now to enable easy ejections.
-  return allocator_.AllocateSlow(num_bytes);
+  pmatrix->data = allocator_.Alloc(pmatrix->data_size);
+  pmatrix->sums = allocator_.Alloc(pmatrix->sums_size);
 }
 
 void PrepackedCache::DoInsert(const CacheKey &key,
void PrepackedCache::DoInsert(const CacheKey &key,
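
The EjectOne path above is what forces individually freeable blocks: evicting one cache entry must release exactly that entry's data and sums buffers while every other cached matrix stays live, which an arena-style FreeAll cannot do. A standalone miniature of that bookkeeping (hypothetical names; map order stands in for the real least-recently-used timestamp ordering):

    #include <cstdlib>
    #include <map>
    #include <utility>

    struct MiniCache {
      // key -> (buffer, size); the real cache stores prepacked matrices.
      std::map<int, std::pair<void*, std::size_t>> entries;
      std::size_t cache_size = 0;
      std::size_t budget = 4096;

      void Insert(int key, std::size_t size) {
        // Eject until the new entry fits under the byte budget.
        while (cache_size + size > budget && !entries.empty()) EjectOne();
        entries[key] = {std::malloc(size), size};
        cache_size += size;
      }

      void EjectOne() {
        auto oldest = entries.begin();  // stand-in for the LRU entry
        cache_size -= oldest->second.second;
        std::free(oldest->second.first);  // per-entry free: the key requirement
        entries.erase(oldest);
      }

      ~MiniCache() {
        for (auto& kv : entries) std::free(kv.second.first);
      }
    };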

tensorflow/lite/experimental/ruy/prepacked_cache.h

@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACKED_CACHE_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACKED_CACHE_H_
 
+#include <cstddef>
 #include <iostream>
 #include <map>
 #include <queue>
@@ -27,6 +28,40 @@ limitations under the License.
 
 namespace ruy {
 
+namespace detail {
+
+// Tracks a set of blocks allocated from the underlying system allocator.
+class SystemBlockAllocator {
+ public:
+  void *Alloc(std::ptrdiff_t num_bytes) {
+    void *p = detail::SystemAlignedAlloc(num_bytes);
+    blocks_.push_back(p);
+    return p;
+  }
+
+  void Free(void *block) {
+    for (auto it = blocks_.begin(); it != blocks_.end(); ++it) {
+      if (*it == block) {
+        detail::SystemAlignedFree(block);
+        blocks_.erase(it);
+        return;
+      }
+    }
+    RUY_DCHECK(false);  // Trying to free pointer we did not allocate.
+  }
+
+  ~SystemBlockAllocator() {
+    for (void *block : blocks_) {
+      detail::SystemAlignedFree(block);
+    }
+  }
+
+ private:
+  std::vector<void *> blocks_;
+};
+
+}  // namespace detail
+
 enum CachePolicy { kNoCache, kCacheLHSOnGemV };
 
 // "Low effort" Least Recently Used Cache for Prepacked Matrices
@@ -80,12 +115,8 @@ class PrepackedCache {
  private:
   void EjectOne();
-  void *AllocateBytes(std::ptrdiff_t num_bytes);
   void DoInsert(const CacheKey &key, const PrepackedMatrix &matrix);
 
-  // Since this cache is used in the context of "pre-packing", we need to
-  // handle allocating the space for the packed matrix ourselves, so we need
-  // our own allocator.
-  AlignedAllocator allocator_;
+  detail::SystemBlockAllocator allocator_;
   std::map<CacheKey, MatrixWithTimeStamp> cache_;
   const int32_t ejection_threshold_;
   size_t cache_size_;
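
A usage sketch for the SystemBlockAllocator added above, assuming the header exposes it as shown in this diff: each Alloc hands out one system-aligned block, Free releases a single block via a linear scan (fine for the small number of cached matrices), and the destructor releases whatever is still outstanding.

    #include "tensorflow/lite/experimental/ruy/prepacked_cache.h"

    int main() {
      ruy::detail::SystemBlockAllocator allocator;
      void* data = allocator.Alloc(1024);  // one system-aligned block per call
      void* sums = allocator.Alloc(256);
      allocator.Free(data);  // frees just this block; others stay live
      // `sums` is released by the destructor when `allocator` goes out of scope.
      return 0;
    }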