Use separate allocator for cached prepacked matrix allocations.

This CL splits out the SystemAlignedAlloc/Free functions so that they are usable independently of ruy::Allocator.

ruy::Allocator is a highly specialized allocator designed for the hot path of multiple GEMMs.

The use case of cached pre-packing has a very different set of tradeoffs.

PiperOrigin-RevId: 283555950
Change-Id: I012a6ba4386e1727866e677965b857f62992d5f1
Author: Sean Silva, 2019-12-03 09:01:54 -08:00; committed by TensorFlower Gardener
parent 2454d7bd23
commit 6547b6511b
4 changed files with 78 additions and 57 deletions
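
To make the tradeoff concrete (illustration only, not part of the commit): ruy::Allocator is arena-shaped, reclaiming everything at once via FreeAll so the hot GEMM loop can bump-allocate with no per-block bookkeeping, while an LRU cache must be able to free one entry's buffers when it is ejected. A minimal sketch of the two interface shapes, with hypothetical names and plain malloc standing in for the aligned system calls:

    #include <cstdlib>
    #include <vector>

    // Arena shape (what ruy::Allocator optimizes for): individual blocks
    // cannot be freed; everything is reclaimed together.
    class ArenaShape {
     public:
      void* AllocateBytes(std::size_t n) {
        blocks_.push_back(std::malloc(n));
        return blocks_.back();
      }
      void FreeAll() {
        for (void* p : blocks_) std::free(p);
        blocks_.clear();
      }
      ~ArenaShape() { FreeAll(); }

     private:
      std::vector<void*> blocks_;
    };

    // Cache shape (what cached pre-packing needs): any single block can be
    // freed on its own when its cache entry is ejected.
    class CacheShape {
     public:
      void* Alloc(std::size_t n) {
        blocks_.push_back(std::malloc(n));
        return blocks_.back();
      }
      void Free(void* p) {
        for (auto it = blocks_.begin(); it != blocks_.end(); ++it) {
          if (*it == p) {
            std::free(p);
            blocks_.erase(it);
            return;
          }
        }
      }
      ~CacheShape() {
        for (void* p : blocks_) std::free(p);
      }

     private:
      std::vector<void*> blocks_;
    };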

tensorflow/lite/experimental/ruy/allocator.cc

@@ -26,19 +26,19 @@ namespace ruy {
 namespace detail {
 
-void *AlignedAllocator::SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
+void *SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
 #ifdef _WIN32
-  return _aligned_malloc(num_bytes, kAlignment);
+  return _aligned_malloc(num_bytes, kMinimumBlockAlignment);
 #else
   void *ptr;
-  if (posix_memalign(&ptr, kAlignment, num_bytes)) {
+  if (posix_memalign(&ptr, kMinimumBlockAlignment, num_bytes)) {
     return nullptr;
   }
   return ptr;
 #endif
 }
 
-void AlignedAllocator::SystemAlignedFree(void *ptr) {
+void SystemAlignedFree(void *ptr) {
 #ifdef _WIN32
   _aligned_free(ptr);
 #else
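
The excerpt above is cut off inside SystemAlignedFree's #else branch; on POSIX, memory from posix_memalign is presumably released with plain free. For reference, a self-contained sketch of the same portability split (the constant mirrors kMinimumBlockAlignment from allocator.h; the free(ptr) ending is an assumption):

    #include <cstddef>
    #include <stdlib.h>  // posix_memalign, free
    #ifdef _WIN32
    #include <malloc.h>  // _aligned_malloc, _aligned_free
    #endif

    constexpr std::ptrdiff_t kMinimumBlockAlignment = 64;

    void* SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
    #ifdef _WIN32
      return _aligned_malloc(num_bytes, kMinimumBlockAlignment);
    #else
      void* ptr;
      // posix_memalign returns nonzero on failure and leaves ptr untouched.
      if (posix_memalign(&ptr, kMinimumBlockAlignment, num_bytes)) {
        return nullptr;
      }
      return ptr;
    #endif
    }

    void SystemAlignedFree(void* ptr) {
    #ifdef _WIN32
      _aligned_free(ptr);
    #else
      free(ptr);  // assumed: the posix_memalign counterpart
    #endif
    }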

tensorflow/lite/experimental/ruy/allocator.h

@@ -34,38 +34,49 @@ inline void* VoidPtrAdd(void* p, std::ptrdiff_t offset) {
   return reinterpret_cast<void*>(addr);
 }
 
-// Simple allocator designed to converge to a steady-state where all
+// Minimum alignment for blocks.
+//
+// Considerations:
+// - This needs to be at least the alignment of any usual data type.
+// - It's useful that this is at least the size of a cache line to limit
+//   possible cache side effects (if only on performance behavior).
+// - It's useful that this is at least the size of SIMD registers, as
+//   some SIMD instruction sets have at least performance behavior
+//   differences (e.g. NEON) or even different requirements (e.g. SSE)
+//   based on that.
+// - It's useful that this is at least the size of an "exclusive reservation
+//   granule" on ARM, meaning that if we use this Allocator to allocate
+//   an atomic variable, there will be no side effects from other things
+//   contending for exclusive/atomic memory accesses to it. While the
+//   ARM reference manual mentions that this granule size may be as large
+//   as 2048 bytes, in practice we observe it to be 64 bytes. It can
+//   be queried cheaply, at runtime, from userspace, if needed.
+static constexpr std::ptrdiff_t kMinimumBlockAlignment = 64;
+
+// Primitive allocation functions obtaining aligned memory from the
+// operating system.
+void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
+void SystemAlignedFree(void* ptr);
+
+// Specialized allocator designed to converge to a steady-state where all
 // allocations are bump-ptr allocations from an already-allocated buffer.
 //
 // To support these constraints, this allocator only supports two
 // operations.
 // - AllocateAlignedBytes: allocates a pointer to storage of a specified
-//   size, which must be aligned to kAlignment.
+//   size, which must be aligned to kMinimumBlockAlignment.
 // - FreeAll: frees all previous allocations (but retains the internal
 //   buffer to minimize future calls into the system allocator).
 //
+// This class is specialized for supporting just those two operations
+// under this specific steady-state usage pattern. Extending this class
+// with new allocation interfaces that don't fit that pattern is probably not
+// the right choice. Instead, build a new class on top of
+// SystemAlignedAlloc/SystemAlignedFree.
+//
 // All operations happen on aligned blocks for simplicity.
 class AlignedAllocator {
  public:
-  // Alignment of allocated blocks.
-  //
-  // Considerations:
-  // - This needs to be at least the alignment of any usual data type.
-  // - It's useful that this is at least the size of a cache line to limit
-  //   possible cache side effects (if only on performance behavior).
-  // - It's useful that this is at least the size of SIMD registers, as
-  //   some SIMD instruction sets have at least performance behavior
-  //   differences (e.g. NEON) or even different requirements (e.g. SSE)
-  //   based on that.
-  // - It's useful that this is at least the size of an "exclusive reservation
-  //   granule" on ARM, meaning that if we use this Allocator to allocate
-  //   an atomic variable, there will be no side effects from other things
-  //   contending for exclusive/atomic memory accesses to it. While the
-  //   ARM reference manual mentions that this granule size may be as large
-  //   as 2048 bytes, in practice we observe it to be 64 bytes. It can
-  //   be queried cheaply, at runtime, from userspace, if needed.
-  static constexpr std::ptrdiff_t kAlignment = 64;
-
   void operator=(const AlignedAllocator&) = delete;
   ~AlignedAllocator() {
     FreeAll();
@@ -74,7 +85,7 @@ class AlignedAllocator {
 
   void* AllocateAlignedBytes(std::ptrdiff_t num_bytes) {
     RUY_DCHECK_GT(num_bytes, 0);
-    RUY_DCHECK((num_bytes & (kAlignment - 1)) == 0);
+    RUY_DCHECK((num_bytes & (kMinimumBlockAlignment - 1)) == 0);
     if (void* p = AllocateFast(num_bytes)) {
       return p;
     }
@@ -105,17 +116,7 @@ class AlignedAllocator {
     fallback_blocks_total_size_ = 0;
   }
 
-  void FreeOne(void* ptr) {
-    for (auto p = fallback_blocks_.begin(); p != fallback_blocks_.end(); ++p) {
-      if (*p == ptr) {
-        SystemAlignedFree(ptr);
-        fallback_blocks_.erase(p);
-        return;
-      }
-    }
-    RUY_DCHECK(false);  // Trying to free pointer we did not allocate.
-  }
-
+ private:
   void* AllocateFast(std::ptrdiff_t num_bytes) {
     if (current_ + num_bytes > size_) {
       return nullptr;
@@ -132,12 +133,6 @@ class AlignedAllocator {
     return p;
   }
 
- private:
-  // Primitive allocation functions obtaining aligned memory from the
-  // operating system.
-  void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
-  void SystemAlignedFree(void* ptr);
-
   // Theory of operation:
   //
   // - ptr_, current_, and size_ implement a basic bump-ptr allocator.
@@ -171,7 +166,7 @@ class Allocator {
       return nullptr;
     }
     return aligned.AllocateAlignedBytes(
-        round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment));
+        round_up_pot(num_bytes, detail::kMinimumBlockAlignment));
   }
   template <typename Pointer>
   void Allocate(std::ptrdiff_t count, Pointer* out) {
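
The last hunk above rounds every request up to kMinimumBlockAlignment before delegating to AllocateAlignedBytes, which DCHECKs that sizes are multiples of that alignment. round_up_pot is not shown in this diff; assuming the usual power-of-two rounding idiom, it behaves like this sketch:

    #include <cassert>
    #include <cstddef>

    // Assumed semantics of round_up_pot: round x up to the next multiple of
    // pot, where pot is a power of two. Adding (pot - 1) and masking off the
    // low bits does this in two arithmetic operations.
    constexpr std::ptrdiff_t round_up_pot(std::ptrdiff_t x, std::ptrdiff_t pot) {
      return (x + pot - 1) & ~(pot - 1);
    }

    int main() {
      assert(round_up_pot(1, 64) == 64);    // small requests become one block
      assert(round_up_pot(64, 64) == 64);   // exact multiples are unchanged
      assert(round_up_pot(65, 64) == 128);  // anything over rounds up
      return 0;
    }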

tensorflow/lite/experimental/ruy/prepacked_cache.cc

@@ -58,19 +58,14 @@ void PrepackedCache::EjectOne() {
   PrepackedMatrix &pmatrix = oldest->second.first;
   cache_size_ -= pmatrix.data_size;
   cache_size_ -= pmatrix.sums_size;
-  allocator_.FreeOne(pmatrix.data);
-  allocator_.FreeOne(pmatrix.sums);
+  allocator_.Free(pmatrix.data);
+  allocator_.Free(pmatrix.sums);
   cache_.erase(oldest);
 }
 
 void PrepackedCache::AllocatePrepackedMatrix(PrepackedMatrix *pmatrix) {
-  pmatrix->data = AllocateBytes(pmatrix->data_size);
-  pmatrix->sums = AllocateBytes(pmatrix->sums_size);
-}
-
-void *PrepackedCache::AllocateBytes(std::ptrdiff_t num_bytes) {
-  // Force system allocation for now to enable easy ejections.
-  return allocator_.AllocateSlow(num_bytes);
+  pmatrix->data = allocator_.Alloc(pmatrix->data_size);
+  pmatrix->sums = allocator_.Alloc(pmatrix->sums_size);
 }
 
 void PrepackedCache::DoInsert(const CacheKey &key,
void PrepackedCache::DoInsert(const CacheKey &key,
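
The EjectOne path above is what forces individually freeable blocks: evicting one cache entry must release exactly that entry's data and sums buffers while every other cached matrix stays live, which an arena-style FreeAll cannot do. A standalone miniature of that bookkeeping (hypothetical names; map order stands in for the real least-recently-used timestamp ordering):

    #include <cstdlib>
    #include <map>
    #include <utility>

    struct MiniCache {
      // key -> (buffer, size); the real cache stores prepacked matrices.
      std::map<int, std::pair<void*, std::size_t>> entries;
      std::size_t cache_size = 0;
      std::size_t budget = 4096;

      void Insert(int key, std::size_t size) {
        // Eject until the new entry fits under the byte budget.
        while (cache_size + size > budget && !entries.empty()) EjectOne();
        entries[key] = {std::malloc(size), size};
        cache_size += size;
      }

      void EjectOne() {
        auto oldest = entries.begin();  // stand-in for the LRU entry
        cache_size -= oldest->second.second;
        std::free(oldest->second.first);  // per-entry free: the key requirement
        entries.erase(oldest);
      }

      ~MiniCache() {
        for (auto& kv : entries) std::free(kv.second.first);
      }
    };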

tensorflow/lite/experimental/ruy/prepacked_cache.h

@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACKED_CACHE_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PREPACKED_CACHE_H_
 
+#include <cstddef>
 #include <iostream>
 #include <map>
 #include <queue>
@@ -27,6 +28,40 @@ limitations under the License.
 
 namespace ruy {
 
+namespace detail {
+
+// Tracks a set of blocks allocated from the underlying system allocator.
+class SystemBlockAllocator {
+ public:
+  void *Alloc(std::ptrdiff_t num_bytes) {
+    void *p = detail::SystemAlignedAlloc(num_bytes);
+    blocks_.push_back(p);
+    return p;
+  }
+
+  void Free(void *block) {
+    for (auto it = blocks_.begin(); it != blocks_.end(); ++it) {
+      if (*it == block) {
+        detail::SystemAlignedFree(block);
+        blocks_.erase(it);
+        return;
+      }
+    }
+    RUY_DCHECK(false);  // Trying to free pointer we did not allocate.
+  }
+
+  ~SystemBlockAllocator() {
+    for (void *block : blocks_) {
+      detail::SystemAlignedFree(block);
+    }
+  }
+
+ private:
+  std::vector<void *> blocks_;
+};
+
+}  // namespace detail
+
 enum CachePolicy { kNoCache, kCacheLHSOnGemV };
 
 // "Low effort" Least Recently Used Cache for Prepacked Matrices
@@ -80,12 +115,8 @@ class PrepackedCache {
  private:
   void EjectOne();
-  void *AllocateBytes(std::ptrdiff_t num_bytes);
   void DoInsert(const CacheKey &key, const PrepackedMatrix &matrix);
 
-  // Since this cache is used in the context of "pre-packing", we need to
-  // handle allocating the space for the packed matrix ourselves, so we need
-  // our own allocator.
-  AlignedAllocator allocator_;
+  detail::SystemBlockAllocator allocator_;
   std::map<CacheKey, MatrixWithTimeStamp> cache_;
   const int32_t ejection_threshold_;
   size_t cache_size_;
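
A usage sketch for the SystemBlockAllocator added above, assuming the header exposes it as shown in this diff: each Alloc hands out one system-aligned block, Free releases a single block via a linear scan (fine for the small number of cached matrices), and the destructor releases whatever is still outstanding.

    #include "tensorflow/lite/experimental/ruy/prepacked_cache.h"

    int main() {
      ruy::detail::SystemBlockAllocator allocator;
      void* data = allocator.Alloc(1024);  // one system-aligned block per call
      void* sums = allocator.Alloc(256);
      allocator.Free(data);  // frees just this block; others stay live
      // `sums` is released by the destructor when `allocator` goes out of scope.
      return 0;
    }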