Use separate allocator for cached prepacked matrix allocations.

This CL splits out the SystemAlignedAlloc/Free functions so that they are
independently usable outside of ruy::Allocator. ruy::Allocator is a highly
specialized allocator designed for the hot path of multiple gemms; the use
case of cached pre-packing has a very different set of tradeoffs.

PiperOrigin-RevId: 283555950
Change-Id: I012a6ba4386e1727866e677965b857f62992d5f1
parent 2454d7bd23
commit 6547b6511b
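Why a separate allocator: ruy::Allocator recycles all scratch wholesale per gemm, while the prepacked cache ejects entries one at a time. A minimal sketch of the two patterns (illustrative only; the include paths are assumed from the file layout, all other names are from the diff below):

    #include "tensorflow/lite/experimental/ruy/allocator.h"
    #include "tensorflow/lite/experimental/ruy/prepacked_cache.h"

    void ContrastSketch() {
      // Hot gemm path: scratch is allocated per call and recycled wholesale;
      // bump-ptr allocation makes this nearly free in the steady state.
      ruy::detail::AlignedAllocator scratch;
      void* packed = scratch.AllocateAlignedBytes(4096);  // size must be 64-aligned
      (void)packed;
      scratch.FreeAll();  // bulk release; the internal buffer is retained

      // Cached pre-packing: blocks are long-lived and ejected one entry at a
      // time, so each block must be individually freeable.
      ruy::detail::SystemBlockAllocator cache_blocks;
      void* cached = cache_blocks.Alloc(4096);
      cache_blocks.Free(cached);  // frees exactly this block
    }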
tensorflow/lite/experimental/ruy/allocator.cc
@@ -26,19 +26,19 @@ namespace ruy {
 
 namespace detail {
 
-void *AlignedAllocator::SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
+void *SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
 #ifdef _WIN32
-  return _aligned_malloc(num_bytes, kAlignment);
+  return _aligned_malloc(num_bytes, kMinimumBlockAlignment);
 #else
   void *ptr;
-  if (posix_memalign(&ptr, kAlignment, num_bytes)) {
+  if (posix_memalign(&ptr, kMinimumBlockAlignment, num_bytes)) {
     return nullptr;
   }
   return ptr;
 #endif
 }
 
-void AlignedAllocator::SystemAlignedFree(void *ptr) {
+void SystemAlignedFree(void *ptr) {
 #ifdef _WIN32
   _aligned_free(ptr);
 #else
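A quick illustration of the contract these primitives provide (a hedged sketch, assuming the declarations shown in allocator.h below): blocks come back aligned to kMinimumBlockAlignment, and must be released through the matching SystemAlignedFree, since memory from _aligned_malloc cannot be passed to plain free().

    #include <cassert>
    #include <cstdint>

    #include "tensorflow/lite/experimental/ruy/allocator.h"

    void AlignmentSmokeTest() {
      void* p = ruy::detail::SystemAlignedAlloc(256);
      assert(p != nullptr);
      // 64 == detail::kMinimumBlockAlignment in the new code.
      assert(reinterpret_cast<std::uintptr_t>(p) % 64 == 0);
      // Must pair with SystemAlignedFree: on _WIN32 this is _aligned_free,
      // and _aligned_malloc memory may not be released with plain free().
      ruy::detail::SystemAlignedFree(p);
    }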
tensorflow/lite/experimental/ruy/allocator.h
@@ -34,38 +34,49 @@ inline void* VoidPtrAdd(void* p, std::ptrdiff_t offset) {
   return reinterpret_cast<void*>(addr);
 }
 
-// Simple allocator designed to converge to a steady-state where all
+// Minimum alignment for blocks.
+//
+// Considerations:
+//  - This needs to be at least the alignment of any usual data type.
+//  - It's useful that this is at least the size of a cache line to limit
+//    possible cache side effects (if only on performance behavior).
+//  - It's useful that this is at least the size of SIMD registers, as
+//    some SIMD instruction sets have at least performance behavior
+//    differences (e.g. NEON) or even different requirements (e.g. SSE)
+//    based on that.
+//  - It's useful that this is at least the size of an "exclusive reservation
+//    granule" on ARM, meaning that if we use this Allocator to allocate
+//    an atomic variable, there will be no side effects from other things
+//    contending for exclusive/atomic memory accesses to it. While the
+//    ARM reference manual mentions that this granule size may be as large
+//    as 2048 bytes, in practice we observe it to be 64 bytes. It can
+//    be queried cheaply, at runtime, from userspace, if needed.
+static constexpr std::ptrdiff_t kMinimumBlockAlignment = 64;
+
+// Primitive allocation functions obtaining aligned memory from the
+// operating system.
+void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
+void SystemAlignedFree(void* ptr);
+
+// Specialized allocator designed to converge to a steady-state where all
 // allocations are bump-ptr allocations from an already-allocated buffer.
 //
 // To support these constraints, this allocator only supports two
 // operations.
 // - AllocateAlignedBytes: allocates a pointer to storage of a specified
-//   size, which must be aligned to kAlignment.
+//   size, which must be aligned to kMinimumBlockAlignment.
 // - FreeAll: frees all previous allocations (but retains the internal
 //   buffer to minimize future calls into the system allocator).
 //
+// This class is specialized for supporting just those two operations
+// under this specific steady-state usage pattern. Extending this class
+// with new allocation interfaces that don't fit that pattern is probably not
+// the right choice. Instead, build a new class on top of
+// SystemAlignedAlloc/SystemAlignedFree.
+//
 // All operations happen on aligned blocks for simplicity.
 class AlignedAllocator {
  public:
-  // Alignment of allocated blocks.
-  //
-  // Considerations:
-  //  - This needs to be at least the alignment of any usual data type.
-  //  - It's useful that this is at least the size of a cache line to limit
-  //    possible cache side effects (if only on performance behavior).
-  //  - It's useful that this is at least the size of SIMD registers, as
-  //    some SIMD instruction sets have at least performance behavior
-  //    differences (e.g. NEON) or even different requirements (e.g. SSE)
-  //    based on that.
-  //  - It's useful that this is at least the size of an "exclusive reservation
-  //    granule" on ARM, meaning that if we use this Allocator to allocate
-  //    an atomic variable, there will be no side effects from other things
-  //    contending for exclusive/atomic memory accesses to it. While the
-  //    ARM reference manual mentions that this granule size may be as large
-  //    as 2048 bytes, in practice we observe it to be 64 bytes. It can
-  //    be queried cheaply, at runtime, from userspace, if needed.
-  static constexpr std::ptrdiff_t kAlignment = 64;
-
   void operator=(const AlignedAllocator&) = delete;
   ~AlignedAllocator() {
     FreeAll();
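The remark that the exclusive reservation granule "can be queried cheaply, at runtime, from userspace" refers to the CTR_EL0 system register on AArch64. A sketch of such a query (not part of this CL; it assumes the kernel permits or emulates user-mode CTR_EL0 reads, as Linux does):

    #include <cstdint>

    // ERG is bits [23:20] of CTR_EL0; the granule is 2^ERG words, i.e.
    // (4 << ERG) bytes. ERG == 0 means the register does not report it and
    // the architectural maximum of 512 words (2048 bytes) must be assumed.
    inline std::int64_t ExclusiveReservationGranuleBytes() {
    #if defined(__aarch64__)
      std::uint64_t ctr;
      asm volatile("mrs %0, ctr_el0" : "=r"(ctr));
      const unsigned erg = (ctr >> 20) & 0xf;
      return erg != 0 ? (std::int64_t{4} << erg) : 2048;
    #else
      return -1;  // query not implemented for other targets in this sketch
    #endif
    }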
@@ -74,7 +85,7 @@ class AlignedAllocator {
 
   void* AllocateAlignedBytes(std::ptrdiff_t num_bytes) {
     RUY_DCHECK_GT(num_bytes, 0);
-    RUY_DCHECK((num_bytes & (kAlignment - 1)) == 0);
+    RUY_DCHECK((num_bytes & (kMinimumBlockAlignment - 1)) == 0);
     if (void* p = AllocateFast(num_bytes)) {
       return p;
     }
@@ -105,17 +116,7 @@ class AlignedAllocator {
     fallback_blocks_total_size_ = 0;
   }
 
-  void FreeOne(void* ptr) {
-    for (auto p = fallback_blocks_.begin(); p != fallback_blocks_.end(); ++p) {
-      if (*p == ptr) {
-        SystemAlignedFree(ptr);
-        fallback_blocks_.erase(p);
-        return;
-      }
-    }
-    RUY_DCHECK(false);  // Trying to free pointer we did not allocate.
-  }
-
+ private:
   void* AllocateFast(std::ptrdiff_t num_bytes) {
     if (current_ + num_bytes > size_) {
       return nullptr;
@@ -132,12 +133,6 @@ class AlignedAllocator {
     return p;
   }
 
- private:
-  // Primitive allocation functions obtaining aligned memory from the
-  // operating system.
-  void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
-  void SystemAlignedFree(void* ptr);
-
   // Theory of operation:
   //
   // - ptr_, current_, and size_ implement a basic bump-ptr allocator.
@@ -171,7 +166,7 @@ class Allocator {
       return nullptr;
     }
     return aligned.AllocateAlignedBytes(
-        round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment));
+        round_up_pot(num_bytes, detail::kMinimumBlockAlignment));
   }
   template <typename Pointer>
   void Allocate(std::ptrdiff_t count, Pointer* out) {
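The round_up_pot call above is what lets Allocator::AllocateBytes accept arbitrary sizes while AlignedAllocator only ever sees multiples of kMinimumBlockAlignment. A usage sketch of the steady-state pattern (illustrative; AllocateBytes and FreeAll as in ruy's Allocator):

    #include "tensorflow/lite/experimental/ruy/allocator.h"

    void PerGemmScratch(ruy::Allocator* allocator) {
      // Arbitrary sizes are fine; AllocateBytes rounds them up to a multiple
      // of detail::kMinimumBlockAlignment (64) before bump-ptr allocation.
      void* a = allocator->AllocateBytes(100);   // seen as a 128-byte request
      void* b = allocator->AllocateBytes(1000);  // seen as a 1024-byte request
      (void)a;
      (void)b;
      // After the gemm: one bulk release. The underlying buffer is retained,
      // so the steady state performs no system allocator calls at all.
      allocator->FreeAll();
    }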
tensorflow/lite/experimental/ruy/prepacked_cache.cc
@@ -58,19 +58,14 @@ void PrepackedCache::EjectOne() {
   PrepackedMatrix &pmatrix = oldest->second.first;
   cache_size_ -= pmatrix.data_size;
   cache_size_ -= pmatrix.sums_size;
-  allocator_.FreeOne(pmatrix.data);
-  allocator_.FreeOne(pmatrix.sums);
+  allocator_.Free(pmatrix.data);
+  allocator_.Free(pmatrix.sums);
   cache_.erase(oldest);
 }
 
 void PrepackedCache::AllocatePrepackedMatrix(PrepackedMatrix *pmatrix) {
-  pmatrix->data = AllocateBytes(pmatrix->data_size);
-  pmatrix->sums = AllocateBytes(pmatrix->sums_size);
-}
-
-void *PrepackedCache::AllocateBytes(std::ptrdiff_t num_bytes) {
-  // Force system allocation for now to enable easy ejections.
-  return allocator_.AllocateSlow(num_bytes);
+  pmatrix->data = allocator_.Alloc(pmatrix->data_size);
+  pmatrix->sums = allocator_.Alloc(pmatrix->sums_size);
 }
 
 void PrepackedCache::DoInsert(const CacheKey &key,
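In isolation, the cache's new allocation pattern looks like this (a sketch using SystemBlockAllocator from prepacked_cache.h below; the sizes are hypothetical):

    #include "tensorflow/lite/experimental/ruy/prepacked_cache.h"

    void CacheLifecycleSketch() {
      ruy::detail::SystemBlockAllocator allocator;
      // Insertion: each prepacked matrix gets its own system-allocated blocks.
      void* data = allocator.Alloc(64 * 1024);  // hypothetical data_size
      void* sums = allocator.Alloc(1024);       // hypothetical sums_size
      // LRU ejection of this one entry: free exactly its blocks, leaving all
      // other cached matrices untouched. This per-block Free is the operation
      // the hot-path AlignedAllocator no longer needs to support.
      allocator.Free(data);
      allocator.Free(sums);
      // Any blocks still live are reclaimed by the destructor.
    }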
tensorflow/lite/experimental/ruy/prepacked_cache.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACKED_CACHE_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACKED_CACHE_H_
 
+#include <cstddef>
 #include <iostream>
 #include <map>
 #include <queue>
@@ -27,6 +28,40 @@ limitations under the License.
 
 namespace ruy {
 
+namespace detail {
+
+// Tracks a set of blocks allocated from the underlying system allocator.
+class SystemBlockAllocator {
+ public:
+  void *Alloc(std::ptrdiff_t num_bytes) {
+    void *p = detail::SystemAlignedAlloc(num_bytes);
+    blocks_.push_back(p);
+    return p;
+  }
+
+  void Free(void *block) {
+    for (auto it = blocks_.begin(); it != blocks_.end(); ++it) {
+      if (*it == block) {
+        detail::SystemAlignedFree(block);
+        blocks_.erase(it);
+        return;
+      }
+    }
+    RUY_DCHECK(false);  // Trying to free pointer we did not allocate.
+  }
+
+  ~SystemBlockAllocator() {
+    for (void *block : blocks_) {
+      detail::SystemAlignedFree(block);
+    }
+  }
+
+ private:
+  std::vector<void *> blocks_;
+};
+
+}  // namespace detail
+
 enum CachePolicy { kNoCache, kCacheLHSOnGemV };
 
 // "Low effort" Least Recently Used Cache for Prepacked Matrices
@@ -80,12 +115,8 @@ class PrepackedCache {
 
  private:
   void EjectOne();
-  void *AllocateBytes(std::ptrdiff_t num_bytes);
   void DoInsert(const CacheKey &key, const PrepackedMatrix &matrix);
-  // Since this cache is used in the context of "pre-packing", we need to
-  // handle allocating the space for the packed matrix ourselves, so we need
-  // our own allocator.
-  AlignedAllocator allocator_;
+  detail::SystemBlockAllocator allocator_;
   std::map<CacheKey, MatrixWithTimeStamp> cache_;
   const int32_t ejection_threshold_;
   size_t cache_size_;