Merge pull request #31862 from jdduke/cherrypicks_DVR9A

Fix regression in memory consumption on arm64 devices

commit b12998c174
tensorflow/lite/experimental/ruy/BUILD
@@ -55,6 +55,15 @@ cc_library(
     deps = [":check_macros"],
 )
 
+cc_test(
+    name = "size_util_test",
+    srcs = ["size_util_test.cc"],
+    deps = [
+        ":size_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "tune",
     srcs = [
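Usage note: assuming the standard TensorFlow Bazel workspace (the target name and package path follow from the BUILD entry above; the exact invocation is an assumption, not part of this patch), the new test should be runnable with `bazel test //tensorflow/lite/experimental/ruy:size_util_test`.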
tensorflow/lite/experimental/ruy/allocator.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/ruy/allocator.h"
 
+#include <cstdint>
 #include <cstdlib>
 
 #ifdef _WIN32
@@ -25,7 +26,7 @@ namespace ruy {
 
 namespace detail {
 
-void *AlignedAllocator::SystemAlignedAlloc(std::size_t num_bytes) {
+void *AlignedAllocator::SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
 #ifdef _WIN32
   return _aligned_malloc(num_bytes, kAlignment);
 #else
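For context, only the _WIN32 branch of SystemAlignedAlloc is visible in this hunk. A minimal sketch of what the POSIX counterpart typically looks like — an assumption for illustration, not the code in this patch:

    #include <cstddef>
    #include <cstdlib>

    // Hypothetical portable analogue of the _aligned_malloc branch above.
    void* SystemAlignedAllocSketch(std::ptrdiff_t num_bytes) {
      constexpr std::size_t kAlignment = 64;  // matches AlignedAllocator::kAlignment
      void* ptr = nullptr;
      // posix_memalign returns 0 on success and fills `ptr` with aligned storage.
      if (posix_memalign(&ptr, kAlignment, num_bytes) != 0) {
        return nullptr;
      }
      return ptr;
    }

    // Unlike _aligned_malloc, posix_memalign results are released with plain free().
    void SystemAlignedFreeSketch(void* ptr) { std::free(ptr); }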
tensorflow/lite/experimental/ruy/allocator.h
@@ -27,7 +27,8 @@ namespace ruy {
 
 namespace detail {
 
-inline void* VoidPtrAdd(void* p, std::size_t offset) {
+inline void* VoidPtrAdd(void* p, std::ptrdiff_t offset) {
+  RUY_DCHECK(p);
   std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p) + offset;
   return reinterpret_cast<void*>(addr);
 }
@@ -62,7 +63,7 @@ class AlignedAllocator {
   // ARM reference manual mentions that this granule size may be as large
   // as 2048 bytes, in practice we observe it to be 64 bytes. It can
   // be queried cheaply, at runtime, from userspace, if needed.
-  static constexpr std::size_t kAlignment = 64;
+  static constexpr std::ptrdiff_t kAlignment = 64;
 
   void operator=(const AlignedAllocator&) = delete;
   ~AlignedAllocator() {
@@ -70,7 +71,7 @@ class AlignedAllocator {
     SystemAlignedFree(ptr_);
   }
 
-  void* AllocateAlignedBytes(std::size_t num_bytes) {
+  void* AllocateAlignedBytes(std::ptrdiff_t num_bytes) {
     RUY_DCHECK(num_bytes > 0);
     RUY_DCHECK((num_bytes & (kAlignment - 1)) == 0);
     if (void* p = AllocateFast(num_bytes)) {
@@ -85,7 +86,13 @@ class AlignedAllocator {
       return;
     }
 
-    std::size_t new_size = round_up_pot(size_ + fallback_blocks_total_size_);
+    // No rounding-up of the size means a linear instead of logarithmic
+    // bound on the number of allocations in some worst-case calling patterns.
+    // This is considered worth it because minimizing memory usage is important
+    // and actual calling patterns in applications that we care about still
+    // reach the no-further-allocations steady state in a small finite number
+    // of iterations.
+    std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_;
     SystemAlignedFree(ptr_);
     ptr_ = SystemAlignedAlloc(new_size);
     size_ = new_size;
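The removed round_up_pot call is what traded memory for a logarithmic reallocation bound; the replacement grows to the exact observed size. A minimal standalone sketch of the tradeoff (hypothetical numbers, not from the patch), for a caller whose demand grows by one 64-byte granule per free-all cycle:

    #include <cstdio>

    int main() {
      long needed = 64, exact = 0, rounded = 0;
      int exact_reallocs = 0, rounded_reallocs = 0;
      for (int cycle = 0; cycle < 1000; ++cycle, needed += 64) {
        // New policy: grow the buffer to exactly what was needed.
        if (needed > exact) { exact = needed; ++exact_reallocs; }
        // Old policy: grow the buffer to the next power of two.
        while (needed > rounded) { rounded = rounded ? 2 * rounded : 64; ++rounded_reallocs; }
      }
      // Prints "1000 vs 11 reallocs; 64000 vs 65536 bytes": a linear number of
      // reallocations, but no power-of-two over-allocation at steady state.
      std::printf("%d vs %d reallocs; %ld vs %ld bytes\n",
                  exact_reallocs, rounded_reallocs, exact, rounded);
    }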
@@ -98,16 +105,16 @@ class AlignedAllocator {
   }
 
  private:
-  void* AllocateFast(std::size_t num_bytes) {
-    if (current_ + num_bytes <= size_) {
-      void* ret = VoidPtrAdd(ptr_, current_);
-      current_ += num_bytes;
-      return ret;
-    }
-    return nullptr;
+  void* AllocateFast(std::ptrdiff_t num_bytes) {
+    if (current_ + num_bytes > size_) {
+      return nullptr;
+    }
+    void* ret = VoidPtrAdd(ptr_, current_);
+    current_ += num_bytes;
+    return ret;
   }
 
-  void* AllocateSlow(std::size_t num_bytes) {
+  void* AllocateSlow(std::ptrdiff_t num_bytes) {
     void* p = SystemAlignedAlloc(num_bytes);
     fallback_blocks_total_size_ += num_bytes;
     fallback_blocks_.push_back(p);
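One hazard that signed std::ptrdiff_t sizes avoid in bounds checks like AllocateFast's (shown as an illustration; the PR itself frames the change as a memory-consumption fix): with unsigned std::size_t, an absurdly large request wraps around in `current_ + num_bytes`, so the old `<= size_` comparison can pass spuriously, whereas with a signed type the bad value is negative and trips RUY_DCHECK(num_bytes > 0). A minimal sketch:

    #include <cstddef>
    #include <cstdio>

    int main() {
      std::size_t current = 128, size = 1024;
      std::size_t huge = static_cast<std::size_t>(-64);  // near SIZE_MAX
      // The sum wraps to 64, so the bounds check wrongly succeeds: prints 1.
      std::printf("%d\n", static_cast<int>(current + huge <= size));
    }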
@@ -116,7 +123,7 @@ class AlignedAllocator {
 
   // Primitive allocation functions obtaining aligned memory from the
   // operating system.
-  void* SystemAlignedAlloc(std::size_t num_bytes);
+  void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
   void SystemAlignedFree(void* ptr);
 
   // Theory of operation:
@@ -135,10 +142,10 @@ class AlignedAllocator {
   // bump-ptr allocator's buffer so that the next sequence of allocations
   // will hopefully not need any fallback blocks.
   void* ptr_ = nullptr;
-  std::size_t current_ = 0;
-  std::size_t size_ = 0;
+  std::ptrdiff_t current_ = 0;
+  std::ptrdiff_t size_ = 0;
   std::vector<void*> fallback_blocks_;
-  std::size_t fallback_blocks_total_size_ = 0;
+  std::ptrdiff_t fallback_blocks_total_size_ = 0;
 };
 
 }  // namespace detail
@@ -147,7 +154,7 @@ class AlignedAllocator {
 // typed buffer.
 class Allocator {
  public:
-  void* AllocateBytes(std::size_t num_bytes) {
+  void* AllocateBytes(std::ptrdiff_t num_bytes) {
     if (num_bytes == 0) {
       return nullptr;
     }
@@ -155,7 +162,7 @@ class Allocator {
         round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment));
   }
   template <typename Pointer>
-  void Allocate(std::size_t count, Pointer* out) {
+  void Allocate(std::ptrdiff_t count, Pointer* out) {
     using T = typename std::pointer_traits<Pointer>::element_type;
     *out = static_cast<T*>(AllocateBytes(count * sizeof(T)));
   }
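A usage sketch for the typed Allocate entry point above, assuming only what this header shows (Allocate computes count * sizeof(T) and AllocateBytes rounds up to kAlignment):

    #include "tensorflow/lite/experimental/ruy/allocator.h"

    void ExampleUse() {
      ruy::Allocator allocator;
      float* buf = nullptr;
      allocator.Allocate(256, &buf);  // 256 floats; the byte size is rounded
                                      // up to a multiple of kAlignment.
      for (int i = 0; i < 256; ++i) buf[i] = 0.0f;
      // No per-pointer free; the storage lives until the allocator reclaims it.
    }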
tensorflow/lite/experimental/ruy/size_util.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
 
+#include <type_traits>
+
 #include "tensorflow/lite/experimental/ruy/check_macros.h"
 
 #ifdef _WIN32
@@ -24,40 +26,64 @@ limitations under the License.
 
 namespace ruy {
 
-inline int floor_log2(int n) {
+template <typename Integer>
+inline Integer floor_log2(Integer n) {
+  static_assert(std::is_integral<Integer>::value, "");
+  static_assert(std::is_signed<Integer>::value, "");
+  static_assert(sizeof(Integer) == 4 || sizeof(Integer) == 8, "");
+
   RUY_DCHECK_GE(n, 1);
 #ifdef _WIN32
   unsigned long result;
-  _BitScanReverse(&result, n);
+  if (sizeof(Integer) == 4) {
+    _BitScanReverse(&result, n);
+  } else {
+    _BitScanReverse64(&result, n);
+  }
   return result;
 #else
-  return 31 - __builtin_clz(n);
+  if (sizeof(Integer) == 4) {
+    return 31 - __builtin_clz(n);
+  } else {
+    return 63 - __builtin_clzll(n);
+  }
 #endif
 }
 
-inline int ceil_log2(int n) {
+template <typename Integer>
+Integer ceil_log2(Integer n) {
   RUY_DCHECK_GE(n, 1);
   return n == 1 ? 0 : floor_log2(n - 1) + 1;
 }
 
-inline bool is_pot(int value) {
+template <typename Integer>
+bool is_pot(Integer value) {
   return (value > 0) && ((value & (value - 1)) == 0);
 }
 
-inline int round_down_pot(int value) { return 1 << floor_log2(value); }
+template <typename Integer>
+Integer round_down_pot(Integer value) {
+  return static_cast<Integer>(1) << floor_log2(value);
+}
 
-inline int round_up_pot(int value) { return 1 << ceil_log2(value); }
+template <typename Integer>
+Integer round_up_pot(Integer value) {
+  return static_cast<Integer>(1) << ceil_log2(value);
+}
 
-inline int round_down_pot(int value, int modulo) {
+template <typename Integer, typename Modulo>
+Integer round_down_pot(Integer value, Modulo modulo) {
   RUY_DCHECK_EQ(modulo & (modulo - 1), 0);
   return value & ~(modulo - 1);
 }
 
-inline int round_up_pot(int value, int modulo) {
+template <typename Integer, typename Modulo>
+Integer round_up_pot(Integer value, Modulo modulo) {
   return round_down_pot(value + modulo - 1, modulo);
 }
 
-inline int clamp(int x, int lo, int hi) {
+template <typename Integer>
+Integer clamp(Integer x, Integer lo, Integer hi) {
   if (x < lo) {
     return lo;
   } else if (x > hi) {
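A minimal sketch exercising the now-templated helpers; the numeric results follow directly from the definitions above:

    #include "tensorflow/lite/experimental/ruy/size_util.h"

    #include <cassert>
    #include <cstddef>

    int main() {
      assert(ruy::floor_log2(1000) == 9);   // 512 = 2^9 <= 1000 < 2^10
      assert(ruy::ceil_log2(1000) == 10);
      assert(ruy::round_down_pot(1000) == 512);
      assert(ruy::round_up_pot(1000) == 1024);
      // Two-argument forms pad to a power-of-two modulo, e.g. kAlignment = 64,
      // which is how AllocateBytes sizes its requests.
      assert(ruy::round_down_pot(std::ptrdiff_t{1000}, 64) == 960);
      assert(ruy::round_up_pot(std::ptrdiff_t{1000}, 64) == 1024);
    }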
tensorflow/lite/experimental/ruy/size_util_test.cc (new file, 100 lines)
@@ -0,0 +1,100 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/ruy/size_util.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+#include <gtest/gtest.h>
+
+namespace ruy {
+namespace {
+
+template <typename Integer>
+void SizeUtilTestValue(Integer value) {
+  if (value == 0) {
+    return;
+  }
+
+  EXPECT_LE(0, floor_log2(value));
+  EXPECT_LE(floor_log2(value), ceil_log2(value));
+  EXPECT_LE(ceil_log2(value), 8 * sizeof(Integer));
+
+  if (is_pot(value)) {
+    EXPECT_EQ(floor_log2(value), ceil_log2(value));
+  } else {
+    EXPECT_EQ(floor_log2(value) + 1, ceil_log2(value));
+  }
+  EXPECT_EQ(value >> floor_log2(value), 1);
+  EXPECT_EQ(round_down_pot(value), static_cast<Integer>(1)
+                                       << floor_log2(value));
+  EXPECT_LE(round_down_pot(value), value);
+  EXPECT_GE(round_down_pot(value), value >> 1);
+  EXPECT_TRUE(is_pot(round_down_pot(value)));
+
+  if (ceil_log2(value) < 8 * sizeof(Integer) - 1) {
+    EXPECT_EQ(value >> ceil_log2(value), is_pot(value) ? 1 : 0);
+    EXPECT_EQ(round_up_pot(value), static_cast<Integer>(1) << ceil_log2(value));
+    EXPECT_GE(round_up_pot(value), value);
+    EXPECT_LE(round_up_pot(value) >> 1, value);
+    EXPECT_TRUE(is_pot(round_up_pot(value)));
+  }
+
+  for (std::uint8_t modulo : {1, 2, 8, 32, 128}) {
+    EXPECT_GE(value, round_down_pot(value, modulo));
+    EXPECT_EQ(round_down_pot(value, modulo) % modulo, 0);
+
+    if (value <= std::numeric_limits<Integer>::max() - modulo) {
+      EXPECT_LE(value, round_up_pot(value, modulo));
+      EXPECT_EQ(round_up_pot(value, modulo) % modulo, 0);
+    }
+  }
+}
+
+template <typename Integer>
+void SizeUtilTest() {
+  for (int exponent = 0; exponent < 8 * sizeof(Integer) - 1; exponent++) {
+    const Integer pot = static_cast<Integer>(1) << exponent;
+    SizeUtilTestValue(pot - 1);
+    SizeUtilTestValue(pot);
+    SizeUtilTestValue(pot + 1);
+    SizeUtilTestValue(pot + 12);
+    SizeUtilTestValue(pot + 123);
+  }
+  SizeUtilTestValue(std::numeric_limits<Integer>::max() - 1);
+  SizeUtilTestValue(std::numeric_limits<Integer>::max());
+}
+
+TEST(SizeUtilTest, Int) { SizeUtilTest<int>(); }
+
+TEST(SizeUtilTest, Long) { SizeUtilTest<long int>(); }  // NOLINT
+
+TEST(SizeUtilTest, LongLong) { SizeUtilTest<long long int>(); }  // NOLINT
+
+TEST(SizeUtilTest, Int32) { SizeUtilTest<std::int32_t>(); }
+
+TEST(SizeUtilTest, Int64) { SizeUtilTest<std::int64_t>(); }
+
+TEST(SizeUtilTest, Ptrdiff) { SizeUtilTest<std::ptrdiff_t>(); }
+
+}  // namespace
+}  // namespace ruy
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}