Merge pull request #31862 from jdduke/cherrypicks_DVR9A

Fix regression in memory consumption on arm64 devices
Goldie Gadde, 2019-08-21 21:03:14 -07:00, committed by GitHub
commit b12998c174
5 changed files with 171 additions and 28 deletions

tensorflow/lite/experimental/ruy/BUILD

@@ -55,6 +55,15 @@ cc_library(
deps = [":check_macros"],
)
cc_test(
name = "size_util_test",
srcs = ["size_util_test.cc"],
deps = [
":size_util",
"@com_google_googletest//:gtest",
],
)
cc_library(
name = "tune",
srcs = [

tensorflow/lite/experimental/ruy/allocator.cc

@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/lite/experimental/ruy/allocator.h"
#include <cstdint>
#include <cstdlib>
#ifdef _WIN32
@@ -25,7 +26,7 @@ namespace ruy {
namespace detail {
void *AlignedAllocator::SystemAlignedAlloc(std::size_t num_bytes) {
void *AlignedAllocator::SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
#ifdef _WIN32
return _aligned_malloc(num_bytes, kAlignment);
#else
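
As context for the signature change above, a minimal sketch of what the full function might look like after this commit; only the Windows branch appears in the hunk, and the POSIX branch shown here (posix_memalign) is an assumption, not taken from the diff.

void* AlignedAllocator::SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
#ifdef _WIN32
  // Windows branch, as in the hunk above.
  return _aligned_malloc(num_bytes, kAlignment);
#else
  // Assumed POSIX branch: posix_memalign returns non-zero on failure and
  // leaves ptr unspecified, so report failure as nullptr.
  void* ptr;
  if (posix_memalign(&ptr, kAlignment, num_bytes)) {
    return nullptr;
  }
  return ptr;
#endif
}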

tensorflow/lite/experimental/ruy/allocator.h

@@ -27,7 +27,8 @@ namespace ruy {
namespace detail {
inline void* VoidPtrAdd(void* p, std::size_t offset) {
inline void* VoidPtrAdd(void* p, std::ptrdiff_t offset) {
RUY_DCHECK(p);
std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p) + offset;
return reinterpret_cast<void*>(addr);
}
@@ -62,7 +63,7 @@ class AlignedAllocator {
// ARM reference manual mentions that this granule size may be as large
// as 2048 bytes, in practice we observe it to be 64 bytes. It can
// be queried cheaply, at runtime, from userspace, if needed.
static constexpr std::size_t kAlignment = 64;
static constexpr std::ptrdiff_t kAlignment = 64;
void operator=(const AlignedAllocator&) = delete;
~AlignedAllocator() {
@@ -70,7 +71,7 @@ class AlignedAllocator {
SystemAlignedFree(ptr_);
}
void* AllocateAlignedBytes(std::size_t num_bytes) {
void* AllocateAlignedBytes(std::ptrdiff_t num_bytes) {
RUY_DCHECK(num_bytes > 0);
RUY_DCHECK((num_bytes & (kAlignment - 1)) == 0);
if (void* p = AllocateFast(num_bytes)) {
@@ -85,7 +86,13 @@ class AlignedAllocator {
return;
}
std::size_t new_size = round_up_pot(size_ + fallback_blocks_total_size_);
// No rounding-up of the size means a linear instead of logarithmic
// bound on the number of allocations in some worst-case calling patterns.
// This is considered worth it because minimizing memory usage is important
// and actual calling patterns in applications that we care about still
// reach the no-further-allocations steady state in a small finite number
// of iterations.
std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_;
SystemAlignedFree(ptr_);
ptr_ = SystemAlignedAlloc(new_size);
size_ = new_size;
@@ -98,16 +105,16 @@ class AlignedAllocator {
}
private:
void* AllocateFast(std::size_t num_bytes) {
if (current_ + num_bytes <= size_) {
void* ret = VoidPtrAdd(ptr_, current_);
current_ += num_bytes;
return ret;
void* AllocateFast(std::ptrdiff_t num_bytes) {
if (current_ + num_bytes > size_) {
return nullptr;
}
return nullptr;
void* ret = VoidPtrAdd(ptr_, current_);
current_ += num_bytes;
return ret;
}
void* AllocateSlow(std::size_t num_bytes) {
void* AllocateSlow(std::ptrdiff_t num_bytes) {
void* p = SystemAlignedAlloc(num_bytes);
fallback_blocks_total_size_ += num_bytes;
fallback_blocks_.push_back(p);
@@ -116,7 +123,7 @@ class AlignedAllocator {
// Primitive allocation functions obtaining aligned memory from the
// operating system.
void* SystemAlignedAlloc(std::size_t num_bytes);
void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
void SystemAlignedFree(void* ptr);
// Theory of operation:
@@ -135,10 +142,10 @@ class AlignedAllocator {
// bump-ptr allocator's buffer so that the next sequence of allocations
// will hopefully not need any fallback blocks.
void* ptr_ = nullptr;
std::size_t current_ = 0;
std::size_t size_ = 0;
std::ptrdiff_t current_ = 0;
std::ptrdiff_t size_ = 0;
std::vector<void*> fallback_blocks_;
std::size_t fallback_blocks_total_size_ = 0;
std::ptrdiff_t fallback_blocks_total_size_ = 0;
};
} // namespace detail
@@ -147,7 +154,7 @@ class AlignedAllocator {
// typed buffer.
class Allocator {
public:
void* AllocateBytes(std::size_t num_bytes) {
void* AllocateBytes(std::ptrdiff_t num_bytes) {
if (num_bytes == 0) {
return nullptr;
}
@@ -155,7 +162,7 @@ class Allocator {
round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment));
}
template <typename Pointer>
void Allocate(std::size_t count, Pointer* out) {
void Allocate(std::ptrdiff_t count, Pointer* out) {
using T = typename std::pointer_traits<Pointer>::element_type;
*out = static_cast<T*>(AllocateBytes(count * sizeof(T)));
}
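
The comment added above explains the new growth policy: the main buffer is resized to exactly the total the previous sequence of allocations needed, trading a worst-case linear number of reallocations for lower steady-state memory use. Below is a hypothetical usage sketch, not part of this diff, of how that steady state is reached; the FreeAll() reset method is assumed here, since only AllocateBytes() and Allocate() appear in the hunks above.

#include <cstdint>
#include "tensorflow/lite/experimental/ruy/allocator.h"

void ExampleSteadyState() {
  ruy::Allocator allocator;
  for (int pass = 0; pass < 3; ++pass) {
    float* scratch_a = nullptr;
    std::int32_t* scratch_b = nullptr;
    allocator.Allocate(1024, &scratch_a);  // 4096 bytes, a multiple of the 64-byte granule
    allocator.Allocate(512, &scratch_b);   // 2048 bytes
    // ... use the scratch buffers ...
    // Assumed reset method: frees the fallback blocks and grows the main
    // buffer to size_ + fallback_blocks_total_size_, so the second and later
    // passes are served entirely by the bump-pointer fast path.
    allocator.FreeAll();
  }
}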

tensorflow/lite/experimental/ruy/size_util.h

@@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
#include <type_traits>
#include "tensorflow/lite/experimental/ruy/check_macros.h"
#ifdef _WIN32
@@ -24,40 +26,64 @@ limitations under the License.
namespace ruy {
inline int floor_log2(int n) {
template <typename Integer>
inline Integer floor_log2(Integer n) {
static_assert(std::is_integral<Integer>::value, "");
static_assert(std::is_signed<Integer>::value, "");
static_assert(sizeof(Integer) == 4 || sizeof(Integer) == 8, "");
RUY_DCHECK_GE(n, 1);
#ifdef _WIN32
unsigned long result;
_BitScanReverse(&result, n);
if (sizeof(Integer) == 4) {
_BitScanReverse(&result, n);
} else {
_BitScanReverse64(&result, n);
}
return result;
#else
return 31 - __builtin_clz(n);
if (sizeof(Integer) == 4) {
return 31 - __builtin_clz(n);
} else {
return 63 - __builtin_clzll(n);
}
#endif
}
inline int ceil_log2(int n) {
template <typename Integer>
Integer ceil_log2(Integer n) {
RUY_DCHECK_GE(n, 1);
return n == 1 ? 0 : floor_log2(n - 1) + 1;
}
inline bool is_pot(int value) {
template <typename Integer>
bool is_pot(Integer value) {
return (value > 0) && ((value & (value - 1)) == 0);
}
inline int round_down_pot(int value) { return 1 << floor_log2(value); }
template <typename Integer>
Integer round_down_pot(Integer value) {
return static_cast<Integer>(1) << floor_log2(value);
}
inline int round_up_pot(int value) { return 1 << ceil_log2(value); }
template <typename Integer>
Integer round_up_pot(Integer value) {
return static_cast<Integer>(1) << ceil_log2(value);
}
inline int round_down_pot(int value, int modulo) {
template <typename Integer, typename Modulo>
Integer round_down_pot(Integer value, Modulo modulo) {
RUY_DCHECK_EQ(modulo & (modulo - 1), 0);
return value & ~(modulo - 1);
}
inline int round_up_pot(int value, int modulo) {
template <typename Integer, typename Modulo>
Integer round_up_pot(Integer value, Modulo modulo) {
return round_down_pot(value + modulo - 1, modulo);
}
inline int clamp(int x, int lo, int hi) {
template <typename Integer>
Integer clamp(Integer x, Integer lo, Integer hi) {
if (x < lo) {
return lo;
} else if (x > hi) {
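
A small standalone sketch, not part of the diff, showing that the templated helpers now operate on 64-bit signed sizes such as std::ptrdiff_t, which the previous int-only versions could not represent.

#include <cstddef>
#include <cstdio>
#include "tensorflow/lite/experimental/ruy/size_util.h"

int main() {
  // Roughly 1 TiB: well outside the range of a 32-bit int.
  const std::ptrdiff_t n = (std::ptrdiff_t{1} << 40) + 123;
  std::printf("%td\n", ruy::floor_log2(n));        // prints 40
  std::printf("%td\n", ruy::ceil_log2(n));         // prints 41
  std::printf("%td\n", ruy::round_up_pot(n, 64));  // next multiple of 64 above n
  return 0;
}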

tensorflow/lite/experimental/ruy/size_util_test.cc

@@ -0,0 +1,100 @@
/* Copyright 2019 Google LLC. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/experimental/ruy/size_util.h"
#include <cstddef>
#include <cstdint>
#include <limits>
#include <gtest/gtest.h>
namespace ruy {
namespace {
template <typename Integer>
void SizeUtilTestValue(Integer value) {
if (value == 0) {
return;
}
EXPECT_LE(0, floor_log2(value));
EXPECT_LE(floor_log2(value), ceil_log2(value));
EXPECT_LE(ceil_log2(value), 8 * sizeof(Integer));
if (is_pot(value)) {
EXPECT_EQ(floor_log2(value), ceil_log2(value));
} else {
EXPECT_EQ(floor_log2(value) + 1, ceil_log2(value));
}
EXPECT_EQ(value >> floor_log2(value), 1);
EXPECT_EQ(round_down_pot(value), static_cast<Integer>(1)
<< floor_log2(value));
EXPECT_LE(round_down_pot(value), value);
EXPECT_GE(round_down_pot(value), value >> 1);
EXPECT_TRUE(is_pot(round_down_pot(value)));
if (ceil_log2(value) < 8 * sizeof(Integer) - 1) {
EXPECT_EQ(value >> ceil_log2(value), is_pot(value) ? 1 : 0);
EXPECT_EQ(round_up_pot(value), static_cast<Integer>(1) << ceil_log2(value));
EXPECT_GE(round_up_pot(value), value);
EXPECT_LE(round_up_pot(value) >> 1, value);
EXPECT_TRUE(is_pot(round_up_pot(value)));
}
for (std::uint8_t modulo : {1, 2, 8, 32, 128}) {
EXPECT_GE(value, round_down_pot(value, modulo));
EXPECT_EQ(round_down_pot(value, modulo) % modulo, 0);
if (value <= std::numeric_limits<Integer>::max() - modulo) {
EXPECT_LE(value, round_up_pot(value, modulo));
EXPECT_EQ(round_up_pot(value, modulo) % modulo, 0);
}
}
}
template <typename Integer>
void SizeUtilTest() {
for (int exponent = 0; exponent < 8 * sizeof(Integer) - 1; exponent++) {
const Integer pot = static_cast<Integer>(1) << exponent;
SizeUtilTestValue(pot - 1);
SizeUtilTestValue(pot);
SizeUtilTestValue(pot + 1);
SizeUtilTestValue(pot + 12);
SizeUtilTestValue(pot + 123);
}
SizeUtilTestValue(std::numeric_limits<Integer>::max() - 1);
SizeUtilTestValue(std::numeric_limits<Integer>::max());
}
TEST(SizeUtilTest, Int) { SizeUtilTest<int>(); }
TEST(SizeUtilTest, Long) { SizeUtilTest<long int>(); } // NOLINT
TEST(SizeUtilTest, LongLong) { SizeUtilTest<long long int>(); } // NOLINT
TEST(SizeUtilTest, Int32) { SizeUtilTest<std::int32_t>(); }
TEST(SizeUtilTest, Int64) { SizeUtilTest<std::int64_t>(); }
TEST(SizeUtilTest, Ptrdiff) { SizeUtilTest<std::ptrdiff_t>(); }
} // namespace
} // namespace ruy
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}