Merge pull request #31862 from jdduke/cherrypicks_DVR9A

Fix regression in memory consumption on arm64 devices
Goldie Gadde, 2019-08-21 21:03:14 -07:00, committed by GitHub
commit b12998c174
5 changed files with 171 additions and 28 deletions

tensorflow/lite/experimental/ruy/BUILD

@@ -55,6 +55,15 @@ cc_library(
deps = [":check_macros"],
)
cc_test(
name = "size_util_test",
srcs = ["size_util_test.cc"],
deps = [
":size_util",
"@com_google_googletest//:gtest",
],
)
cc_library(
name = "tune",
srcs = [

tensorflow/lite/experimental/ruy/allocator.cc

@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/lite/experimental/ruy/allocator.h"
#include <cstdint>
#include <cstdlib>
#ifdef _WIN32
@@ -25,7 +26,7 @@ namespace ruy {
namespace detail {
void *AlignedAllocator::SystemAlignedAlloc(std::size_t num_bytes) {
void *AlignedAllocator::SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
#ifdef _WIN32
return _aligned_malloc(num_bytes, kAlignment);
#else
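
As context for the signature change above, a minimal sketch of what the full function might look like after this commit; only the Windows branch appears in the hunk, and the POSIX branch shown here (posix_memalign) is an assumption, not taken from the diff.

void* AlignedAllocator::SystemAlignedAlloc(std::ptrdiff_t num_bytes) {
#ifdef _WIN32
  // Windows branch, as in the hunk above.
  return _aligned_malloc(num_bytes, kAlignment);
#else
  // Assumed POSIX branch: posix_memalign returns non-zero on failure and
  // leaves ptr unspecified, so report failure as nullptr.
  void* ptr;
  if (posix_memalign(&ptr, kAlignment, num_bytes)) {
    return nullptr;
  }
  return ptr;
#endif
}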

tensorflow/lite/experimental/ruy/allocator.h

@@ -27,7 +27,8 @@ namespace ruy {
namespace detail {
inline void* VoidPtrAdd(void* p, std::size_t offset) {
inline void* VoidPtrAdd(void* p, std::ptrdiff_t offset) {
RUY_DCHECK(p);
std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p) + offset;
return reinterpret_cast<void*>(addr);
}
@@ -62,7 +63,7 @@ class AlignedAllocator {
// ARM reference manual mentions that this granule size may be as large
// as 2048 bytes, in practice we observe it to be 64 bytes. It can
// be queried cheaply, at runtime, from userspace, if needed.
static constexpr std::size_t kAlignment = 64;
static constexpr std::ptrdiff_t kAlignment = 64;
void operator=(const AlignedAllocator&) = delete;
~AlignedAllocator() {
@@ -70,7 +71,7 @@ class AlignedAllocator {
SystemAlignedFree(ptr_);
}
void* AllocateAlignedBytes(std::size_t num_bytes) {
void* AllocateAlignedBytes(std::ptrdiff_t num_bytes) {
RUY_DCHECK(num_bytes > 0);
RUY_DCHECK((num_bytes & (kAlignment - 1)) == 0);
if (void* p = AllocateFast(num_bytes)) {
@@ -85,7 +86,13 @@ class AlignedAllocator {
return;
}
std::size_t new_size = round_up_pot(size_ + fallback_blocks_total_size_);
// No rounding-up of the size means a linear instead of logarithmic
// bound on the number of allocations in some worst-case calling patterns.
// This is considered worth it because minimizing memory usage is important
// and actual calling patterns in applications that we care about still
// reach the no-further-allocations steady state in a small finite number
// of iterations.
std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_;
SystemAlignedFree(ptr_);
ptr_ = SystemAlignedAlloc(new_size);
size_ = new_size;
@@ -98,16 +105,16 @@ class AlignedAllocator {
}
private:
void* AllocateFast(std::size_t num_bytes) {
if (current_ + num_bytes <= size_) {
void* ret = VoidPtrAdd(ptr_, current_);
current_ += num_bytes;
return ret;
void* AllocateFast(std::ptrdiff_t num_bytes) {
if (current_ + num_bytes > size_) {
return nullptr;
}
return nullptr;
void* ret = VoidPtrAdd(ptr_, current_);
current_ += num_bytes;
return ret;
}
void* AllocateSlow(std::size_t num_bytes) {
void* AllocateSlow(std::ptrdiff_t num_bytes) {
void* p = SystemAlignedAlloc(num_bytes);
fallback_blocks_total_size_ += num_bytes;
fallback_blocks_.push_back(p);
@@ -116,7 +123,7 @@ class AlignedAllocator {
// Primitive allocation functions obtaining aligned memory from the
// operating system.
void* SystemAlignedAlloc(std::size_t num_bytes);
void* SystemAlignedAlloc(std::ptrdiff_t num_bytes);
void SystemAlignedFree(void* ptr);
// Theory of operation:
@@ -135,10 +142,10 @@ class AlignedAllocator {
// bump-ptr allocator's buffer so that the next sequence of allocations
// will hopefully not need any fallback blocks.
void* ptr_ = nullptr;
std::size_t current_ = 0;
std::size_t size_ = 0;
std::ptrdiff_t current_ = 0;
std::ptrdiff_t size_ = 0;
std::vector<void*> fallback_blocks_;
std::size_t fallback_blocks_total_size_ = 0;
std::ptrdiff_t fallback_blocks_total_size_ = 0;
};
} // namespace detail
@@ -147,7 +154,7 @@ class AlignedAllocator {
// typed buffer.
class Allocator {
public:
void* AllocateBytes(std::size_t num_bytes) {
void* AllocateBytes(std::ptrdiff_t num_bytes) {
if (num_bytes == 0) {
return nullptr;
}
@@ -155,7 +162,7 @@ class Allocator {
round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment));
}
template <typename Pointer>
void Allocate(std::size_t count, Pointer* out) {
void Allocate(std::ptrdiff_t count, Pointer* out) {
using T = typename std::pointer_traits<Pointer>::element_type;
*out = static_cast<T*>(AllocateBytes(count * sizeof(T)));
}
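
The comment added above explains the new growth policy: the main buffer is resized to exactly the total the previous sequence of allocations needed, trading a worst-case linear number of reallocations for lower steady-state memory use. Below is a hypothetical usage sketch, not part of this diff, of how that steady state is reached; the FreeAll() reset method is assumed here, since only AllocateBytes() and Allocate() appear in the hunks above.

#include <cstdint>
#include "tensorflow/lite/experimental/ruy/allocator.h"

void ExampleSteadyState() {
  ruy::Allocator allocator;
  for (int pass = 0; pass < 3; ++pass) {
    float* scratch_a = nullptr;
    std::int32_t* scratch_b = nullptr;
    allocator.Allocate(1024, &scratch_a);  // 4096 bytes, a multiple of the 64-byte granule
    allocator.Allocate(512, &scratch_b);   // 2048 bytes
    // ... use the scratch buffers ...
    // Assumed reset method: frees the fallback blocks and grows the main
    // buffer to size_ + fallback_blocks_total_size_, so the second and later
    // passes are served entirely by the bump-pointer fast path.
    allocator.FreeAll();
  }
}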

tensorflow/lite/experimental/ruy/size_util.h

@@ -16,6 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIZE_UTIL_H_
#include <type_traits>
#include "tensorflow/lite/experimental/ruy/check_macros.h"
#ifdef _WIN32
@@ -24,40 +26,64 @@ limitations under the License.
namespace ruy {
inline int floor_log2(int n) {
template <typename Integer>
inline Integer floor_log2(Integer n) {
static_assert(std::is_integral<Integer>::value, "");
static_assert(std::is_signed<Integer>::value, "");
static_assert(sizeof(Integer) == 4 || sizeof(Integer) == 8, "");
RUY_DCHECK_GE(n, 1);
#ifdef _WIN32
unsigned long result;
_BitScanReverse(&result, n);
if (sizeof(Integer) == 4) {
_BitScanReverse(&result, n);
} else {
_BitScanReverse64(&result, n);
}
return result;
#else
return 31 - __builtin_clz(n);
if (sizeof(Integer) == 4) {
return 31 - __builtin_clz(n);
} else {
return 63 - __builtin_clzll(n);
}
#endif
}
inline int ceil_log2(int n) {
template <typename Integer>
Integer ceil_log2(Integer n) {
RUY_DCHECK_GE(n, 1);
return n == 1 ? 0 : floor_log2(n - 1) + 1;
}
inline bool is_pot(int value) {
template <typename Integer>
bool is_pot(Integer value) {
return (value > 0) && ((value & (value - 1)) == 0);
}
inline int round_down_pot(int value) { return 1 << floor_log2(value); }
template <typename Integer>
Integer round_down_pot(Integer value) {
return static_cast<Integer>(1) << floor_log2(value);
}
inline int round_up_pot(int value) { return 1 << ceil_log2(value); }
template <typename Integer>
Integer round_up_pot(Integer value) {
return static_cast<Integer>(1) << ceil_log2(value);
}
inline int round_down_pot(int value, int modulo) {
template <typename Integer, typename Modulo>
Integer round_down_pot(Integer value, Modulo modulo) {
RUY_DCHECK_EQ(modulo & (modulo - 1), 0);
return value & ~(modulo - 1);
}
inline int round_up_pot(int value, int modulo) {
template <typename Integer, typename Modulo>
Integer round_up_pot(Integer value, Modulo modulo) {
return round_down_pot(value + modulo - 1, modulo);
}
inline int clamp(int x, int lo, int hi) {
template <typename Integer>
Integer clamp(Integer x, Integer lo, Integer hi) {
if (x < lo) {
return lo;
} else if (x > hi) {
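
A small standalone sketch, not part of the diff, showing that the templated helpers now operate on 64-bit signed sizes such as std::ptrdiff_t, which the previous int-only versions could not represent.

#include <cstddef>
#include <cstdio>
#include "tensorflow/lite/experimental/ruy/size_util.h"

int main() {
  // Roughly 1 TiB: well outside the range of a 32-bit int.
  const std::ptrdiff_t n = (std::ptrdiff_t{1} << 40) + 123;
  std::printf("%td\n", ruy::floor_log2(n));        // prints 40
  std::printf("%td\n", ruy::ceil_log2(n));         // prints 41
  std::printf("%td\n", ruy::round_up_pot(n, 64));  // next multiple of 64 above n
  return 0;
}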

tensorflow/lite/experimental/ruy/size_util_test.cc

@@ -0,0 +1,100 @@
/* Copyright 2019 Google LLC. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/experimental/ruy/size_util.h"
#include <cstddef>
#include <cstdint>
#include <limits>
#include <gtest/gtest.h>
namespace ruy {
namespace {
template <typename Integer>
void SizeUtilTestValue(Integer value) {
if (value == 0) {
return;
}
EXPECT_LE(0, floor_log2(value));
EXPECT_LE(floor_log2(value), ceil_log2(value));
EXPECT_LE(ceil_log2(value), 8 * sizeof(Integer));
if (is_pot(value)) {
EXPECT_EQ(floor_log2(value), ceil_log2(value));
} else {
EXPECT_EQ(floor_log2(value) + 1, ceil_log2(value));
}
EXPECT_EQ(value >> floor_log2(value), 1);
EXPECT_EQ(round_down_pot(value), static_cast<Integer>(1)
<< floor_log2(value));
EXPECT_LE(round_down_pot(value), value);
EXPECT_GE(round_down_pot(value), value >> 1);
EXPECT_TRUE(is_pot(round_down_pot(value)));
if (ceil_log2(value) < 8 * sizeof(Integer) - 1) {
EXPECT_EQ(value >> ceil_log2(value), is_pot(value) ? 1 : 0);
EXPECT_EQ(round_up_pot(value), static_cast<Integer>(1) << ceil_log2(value));
EXPECT_GE(round_up_pot(value), value);
EXPECT_LE(round_up_pot(value) >> 1, value);
EXPECT_TRUE(is_pot(round_up_pot(value)));
}
for (std::uint8_t modulo : {1, 2, 8, 32, 128}) {
EXPECT_GE(value, round_down_pot(value, modulo));
EXPECT_EQ(round_down_pot(value, modulo) % modulo, 0);
if (value <= std::numeric_limits<Integer>::max() - modulo) {
EXPECT_LE(value, round_up_pot(value, modulo));
EXPECT_EQ(round_up_pot(value, modulo) % modulo, 0);
}
}
}
template <typename Integer>
void SizeUtilTest() {
for (int exponent = 0; exponent < 8 * sizeof(Integer) - 1; exponent++) {
const Integer pot = static_cast<Integer>(1) << exponent;
SizeUtilTestValue(pot - 1);
SizeUtilTestValue(pot);
SizeUtilTestValue(pot + 1);
SizeUtilTestValue(pot + 12);
SizeUtilTestValue(pot + 123);
}
SizeUtilTestValue(std::numeric_limits<Integer>::max() - 1);
SizeUtilTestValue(std::numeric_limits<Integer>::max());
}
TEST(SizeUtilTest, Int) { SizeUtilTest<int>(); }
TEST(SizeUtilTest, Long) { SizeUtilTest<long int>(); } // NOLINT
TEST(SizeUtilTest, LongLong) { SizeUtilTest<long long int>(); } // NOLINT
TEST(SizeUtilTest, Int32) { SizeUtilTest<std::int32_t>(); }
TEST(SizeUtilTest, Int64) { SizeUtilTest<std::int64_t>(); }
TEST(SizeUtilTest, Ptrdiff) { SizeUtilTest<std::ptrdiff_t>(); }
} // namespace
} // namespace ruy
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}