Merge pull request #31862 from jdduke/cherrypicks_DVR9A
Fix regression in memory consumption on arm64 devices
This commit is contained in:
commit
b12998c174
@ -55,6 +55,15 @@ cc_library(
|
|||||||
deps = [":check_macros"],
|
deps = [":check_macros"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
cc_test(
|
||||||
|
name = "size_util_test",
|
||||||
|
srcs = ["size_util_test.cc"],
|
||||||
|
deps = [
|
||||||
|
":size_util",
|
||||||
|
"@com_google_googletest//:gtest",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "tune",
|
name = "tune",
|
||||||
srcs = [
|
srcs = [
|
||||||
|
@ -15,6 +15,7 @@ limitations under the License.
|
|||||||
|
|
||||||
#include "tensorflow/lite/experimental/ruy/allocator.h"
|
#include "tensorflow/lite/experimental/ruy/allocator.h"
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@ -25,7 +26,7 @@ namespace ruy {
|
|||||||
|
|
||||||
namespace detail {
|
namespace detail {
|
||||||
|
|
||||||
void *AlignedAllocator::SystemAlignedAlloc(std::size_t num_bytes) {
|
void *AlignedAllocator::SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
return _aligned_malloc(num_bytes, kAlignment);
|
return _aligned_malloc(num_bytes, kAlignment);
|
||||||
#else
|
#else
|
||||||
|
@ -27,7 +27,8 @@ namespace ruy {
|
|||||||
|
|
||||||
namespace detail {
|
namespace detail {
|
||||||
|
|
||||||
inline void* VoidPtrAdd(void* p, std::size_t offset) {
|
inline void* VoidPtrAdd(void* p, std::ptrdiff_t offset) {
|
||||||
|
RUY_DCHECK(p);
|
||||||
std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p) + offset;
|
std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p) + offset;
|
||||||
return reinterpret_cast<void*>(addr);
|
return reinterpret_cast<void*>(addr);
|
||||||
}
|
}
|
||||||
@ -62,7 +63,7 @@ class AlignedAllocator {
|
|||||||
// ARM reference manual mentions that this granule size may be as large
|
// ARM reference manual mentions that this granule size may be as large
|
||||||
// as 2048 bytes, in practice we observe it to be 64 bytes. It can
|
// as 2048 bytes, in practice we observe it to be 64 bytes. It can
|
||||||
// be queried cheaply, at runtime, from userspace, if needed.
|
// be queried cheaply, at runtime, from userspace, if needed.
|
||||||
static constexpr std::size_t kAlignment = 64;
|
static constexpr std::ptrdiff_t kAlignment = 64;
|
||||||
|
|
||||||
void operator=(const AlignedAllocator&) = delete;
|
void operator=(const AlignedAllocator&) = delete;
|
||||||
~AlignedAllocator() {
|
~AlignedAllocator() {
|
||||||
@ -70,7 +71,7 @@ class AlignedAllocator {
|
|||||||
SystemAlignedFree(ptr_);
|
SystemAlignedFree(ptr_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void* AllocateAlignedBytes(std::size_t num_bytes) {
|
void* AllocateAlignedBytes(std::ptrdiff_t num_bytes) {
|
||||||
RUY_DCHECK(num_bytes > 0);
|
RUY_DCHECK(num_bytes > 0);
|
||||||
RUY_DCHECK((num_bytes & (kAlignment - 1)) == 0);
|
RUY_DCHECK((num_bytes & (kAlignment - 1)) == 0);
|
||||||
if (void* p = AllocateFast(num_bytes)) {
|
if (void* p = AllocateFast(num_bytes)) {
|
||||||
@ -85,7 +86,13 @@ class AlignedAllocator {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::size_t new_size = round_up_pot(size_ + fallback_blocks_total_size_);
|
// No rounding-up of the size means a linear instead of a logarithmic
|
||||||
|
// bound on the number of allocations in some worst-case calling patterns.
|
||||||
|
// This is considered worth it because minimizing memory usage is important
|
||||||
|
// and actual calling patterns in applications that we care about still
|
||||||
|
// reach the no-further-allocations steady state in a small finite number
|
||||||
|
// of iterations.
|
||||||
|
std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_;
|
||||||
SystemAlignedFree(ptr_);
|
SystemAlignedFree(ptr_);
|
||||||
ptr_ = SystemAlignedAlloc(new_size);
|
ptr_ = SystemAlignedAlloc(new_size);
|
||||||
size_ = new_size;
|
size_ = new_size;
|
||||||
@ -98,16 +105,16 @@ class AlignedAllocator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void* AllocateFast(std::size_t num_bytes) {
|
void* AllocateFast(std::ptrdiff_t num_bytes) {
|
||||||
if (current_ + num_bytes <= size_) {
|
if (current_ + num_bytes > size_) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
void* ret = VoidPtrAdd(ptr_, current_);
|
void* ret = VoidPtrAdd(ptr_, current_);
|
||||||
current_ += num_bytes;
|
current_ += num_bytes;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void* AllocateSlow(std::size_t num_bytes) {
|
void* AllocateSlow(std::ptrdiff_t num_bytes) {
|
||||||
void* p = SystemAlignedAlloc(num_bytes);
|
void* p = SystemAlignedAlloc(num_bytes);
|
||||||
fallback_blocks_total_size_ += num_bytes;
|
fallback_blocks_total_size_ += num_bytes;
|
||||||
fallback_blocks_.push_back(p);
|
fallback_blocks_.push_back(p);
|
||||||
@ -116,7 +123,7 @@ class AlignedAllocator {
|
|||||||
|
|
||||||
// Primitive allocation functions obtaining aligned memory from the
|
// Primitive allocation functions obtaining aligned memory from the
|
||||||
// operating system.
|
// operating system.
|
||||||
void* SystemAlignedAlloc(std::size_t num_bytes);
|
void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
|
||||||
void SystemAlignedFree(void* ptr);
|
void SystemAlignedFree(void* ptr);
|
||||||
|
|
||||||
// Theory of operation:
|
// Theory of operation:
|
||||||
@ -135,10 +142,10 @@ class AlignedAllocator {
|
|||||||
// bump-ptr allocator's buffer so that the next sequence of allocations
|
// bump-ptr allocator's buffer so that the next sequence of allocations
|
||||||
// will hopefully not need any fallback blocks.
|
// will hopefully not need any fallback blocks.
|
||||||
void* ptr_ = nullptr;
|
void* ptr_ = nullptr;
|
||||||
std::size_t current_ = 0;
|
std::ptrdiff_t current_ = 0;
|
||||||
std::size_t size_ = 0;
|
std::ptrdiff_t size_ = 0;
|
||||||
std::vector<void*> fallback_blocks_;
|
std::vector<void*> fallback_blocks_;
|
||||||
std::size_t fallback_blocks_total_size_ = 0;
|
std::ptrdiff_t fallback_blocks_total_size_ = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace detail
|
} // namespace detail
|
||||||
@ -147,7 +154,7 @@ class AlignedAllocator {
|
|||||||
// typed buffer.
|
// typed buffer.
|
||||||
class Allocator {
|
class Allocator {
|
||||||
public:
|
public:
|
||||||
void* AllocateBytes(std::size_t num_bytes) {
|
void* AllocateBytes(std::ptrdiff_t num_bytes) {
|
||||||
if (num_bytes == 0) {
|
if (num_bytes == 0) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -155,7 +162,7 @@ class Allocator {
|
|||||||
round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment));
|
round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment));
|
||||||
}
|
}
|
||||||
template <typename Pointer>
|
template <typename Pointer>
|
||||||
void Allocate(std::size_t count, Pointer* out) {
|
void Allocate(std::ptrdiff_t count, Pointer* out) {
|
||||||
using T = typename std::pointer_traits<Pointer>::element_type;
|
using T = typename std::pointer_traits<Pointer>::element_type;
|
||||||
*out = static_cast<T*>(AllocateBytes(count * sizeof(T)));
|
*out = static_cast<T*>(AllocateBytes(count * sizeof(T)));
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,8 @@ limitations under the License.
|
|||||||
#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
|
#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
|
||||||
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
|
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
|
||||||
|
|
||||||
|
#include <type_traits>
|
||||||
|
|
||||||
#include "tensorflow/lite/experimental/ruy/check_macros.h"
|
#include "tensorflow/lite/experimental/ruy/check_macros.h"
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@ -24,40 +26,64 @@ limitations under the License.
|
|||||||
|
|
||||||
namespace ruy {
|
namespace ruy {
|
||||||
|
|
||||||
inline int floor_log2(int n) {
|
template <typename Integer>
|
||||||
|
inline Integer floor_log2(Integer n) {
|
||||||
|
static_assert(std::is_integral<Integer>::value, "");
|
||||||
|
static_assert(std::is_signed<Integer>::value, "");
|
||||||
|
static_assert(sizeof(Integer) == 4 || sizeof(Integer) == 8, "");
|
||||||
|
|
||||||
RUY_DCHECK_GE(n, 1);
|
RUY_DCHECK_GE(n, 1);
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
unsigned long result;
|
unsigned long result;
|
||||||
|
if (sizeof(Integer) == 4) {
|
||||||
_BitScanReverse(&result, n);
|
_BitScanReverse(&result, n);
|
||||||
|
} else {
|
||||||
|
_BitScanReverse64(&result, n);
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
|
if (sizeof(Integer) == 4) {
|
||||||
return 31 - __builtin_clz(n);
|
return 31 - __builtin_clz(n);
|
||||||
|
} else {
|
||||||
|
return 63 - __builtin_clzll(n);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int ceil_log2(int n) {
|
template <typename Integer>
|
||||||
|
Integer ceil_log2(Integer n) {
|
||||||
RUY_DCHECK_GE(n, 1);
|
RUY_DCHECK_GE(n, 1);
|
||||||
return n == 1 ? 0 : floor_log2(n - 1) + 1;
|
return n == 1 ? 0 : floor_log2(n - 1) + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool is_pot(int value) {
|
template <typename Integer>
|
||||||
|
bool is_pot(Integer value) {
|
||||||
return (value > 0) && ((value & (value - 1)) == 0);
|
return (value > 0) && ((value & (value - 1)) == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int round_down_pot(int value) { return 1 << floor_log2(value); }
|
template <typename Integer>
|
||||||
|
Integer round_down_pot(Integer value) {
|
||||||
|
return static_cast<Integer>(1) << floor_log2(value);
|
||||||
|
}
|
||||||
|
|
||||||
inline int round_up_pot(int value) { return 1 << ceil_log2(value); }
|
template <typename Integer>
|
||||||
|
Integer round_up_pot(Integer value) {
|
||||||
|
return static_cast<Integer>(1) << ceil_log2(value);
|
||||||
|
}
|
||||||
|
|
||||||
inline int round_down_pot(int value, int modulo) {
|
template <typename Integer, typename Modulo>
|
||||||
|
Integer round_down_pot(Integer value, Modulo modulo) {
|
||||||
RUY_DCHECK_EQ(modulo & (modulo - 1), 0);
|
RUY_DCHECK_EQ(modulo & (modulo - 1), 0);
|
||||||
return value & ~(modulo - 1);
|
return value & ~(modulo - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int round_up_pot(int value, int modulo) {
|
template <typename Integer, typename Modulo>
|
||||||
|
Integer round_up_pot(Integer value, Modulo modulo) {
|
||||||
return round_down_pot(value + modulo - 1, modulo);
|
return round_down_pot(value + modulo - 1, modulo);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int clamp(int x, int lo, int hi) {
|
template <typename Integer>
|
||||||
|
Integer clamp(Integer x, Integer lo, Integer hi) {
|
||||||
if (x < lo) {
|
if (x < lo) {
|
||||||
return lo;
|
return lo;
|
||||||
} else if (x > hi) {
|
} else if (x > hi) {
|
||||||
|
100
tensorflow/lite/experimental/ruy/size_util_test.cc
Normal file
100
tensorflow/lite/experimental/ruy/size_util_test.cc
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
/* Copyright 2019 Google LLC. All Rights Reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
==============================================================================*/
|
||||||
|
|
||||||
|
#include "tensorflow/lite/experimental/ruy/size_util.h"
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <limits>
|
||||||
|
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
namespace ruy {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
template <typename Integer>
|
||||||
|
void SizeUtilTestValue(Integer value) {
|
||||||
|
if (value == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
EXPECT_LE(0, floor_log2(value));
|
||||||
|
EXPECT_LE(floor_log2(value), ceil_log2(value));
|
||||||
|
EXPECT_LE(ceil_log2(value), 8 * sizeof(Integer));
|
||||||
|
|
||||||
|
if (is_pot(value)) {
|
||||||
|
EXPECT_EQ(floor_log2(value), ceil_log2(value));
|
||||||
|
} else {
|
||||||
|
EXPECT_EQ(floor_log2(value) + 1, ceil_log2(value));
|
||||||
|
}
|
||||||
|
EXPECT_EQ(value >> floor_log2(value), 1);
|
||||||
|
EXPECT_EQ(round_down_pot(value), static_cast<Integer>(1)
|
||||||
|
<< floor_log2(value));
|
||||||
|
EXPECT_LE(round_down_pot(value), value);
|
||||||
|
EXPECT_GE(round_down_pot(value), value >> 1);
|
||||||
|
EXPECT_TRUE(is_pot(round_down_pot(value)));
|
||||||
|
|
||||||
|
if (ceil_log2(value) < 8 * sizeof(Integer) - 1) {
|
||||||
|
EXPECT_EQ(value >> ceil_log2(value), is_pot(value) ? 1 : 0);
|
||||||
|
EXPECT_EQ(round_up_pot(value), static_cast<Integer>(1) << ceil_log2(value));
|
||||||
|
EXPECT_GE(round_up_pot(value), value);
|
||||||
|
EXPECT_LE(round_up_pot(value) >> 1, value);
|
||||||
|
EXPECT_TRUE(is_pot(round_up_pot(value)));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (std::uint8_t modulo : {1, 2, 8, 32, 128}) {
|
||||||
|
EXPECT_GE(value, round_down_pot(value, modulo));
|
||||||
|
EXPECT_EQ(round_down_pot(value, modulo) % modulo, 0);
|
||||||
|
|
||||||
|
if (value <= std::numeric_limits<Integer>::max() - modulo) {
|
||||||
|
EXPECT_LE(value, round_up_pot(value, modulo));
|
||||||
|
EXPECT_EQ(round_up_pot(value, modulo) % modulo, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Integer>
|
||||||
|
void SizeUtilTest() {
|
||||||
|
for (int exponent = 0; exponent < 8 * sizeof(Integer) - 1; exponent++) {
|
||||||
|
const Integer pot = static_cast<Integer>(1) << exponent;
|
||||||
|
SizeUtilTestValue(pot - 1);
|
||||||
|
SizeUtilTestValue(pot);
|
||||||
|
SizeUtilTestValue(pot + 1);
|
||||||
|
SizeUtilTestValue(pot + 12);
|
||||||
|
SizeUtilTestValue(pot + 123);
|
||||||
|
}
|
||||||
|
SizeUtilTestValue(std::numeric_limits<Integer>::max() - 1);
|
||||||
|
SizeUtilTestValue(std::numeric_limits<Integer>::max());
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(SizeUtilTest, Int) { SizeUtilTest<int>(); }
|
||||||
|
|
||||||
|
TEST(SizeUtilTest, Long) { SizeUtilTest<long int>(); } // NOLINT
|
||||||
|
|
||||||
|
TEST(SizeUtilTest, LongLong) { SizeUtilTest<long long int>(); } // NOLINT
|
||||||
|
|
||||||
|
TEST(SizeUtilTest, Int32) { SizeUtilTest<std::int32_t>(); }
|
||||||
|
|
||||||
|
TEST(SizeUtilTest, Int64) { SizeUtilTest<std::int64_t>(); }
|
||||||
|
|
||||||
|
TEST(SizeUtilTest, Ptrdiff) { SizeUtilTest<std::ptrdiff_t>(); }
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace ruy
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
::testing::InitGoogleTest(&argc, argv);
|
||||||
|
return RUN_ALL_TESTS();
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user