Use GpuVirtualMemAllocator in GPUBFCAllocator.

The GpuVirtualMemAllocator is a suballocator that guarantees contiguous
suballocations, allowing the BFC allocator to grow without fragmentation.
This should reduce the risk of OOMing when the allow_growth flag is set.

PiperOrigin-RevId: 354642117
Change-Id: I6b20bd87a155443ed32755368c665e6cf438d079
A. Unique TensorFlower 2021-01-29 17:27:00 -08:00 committed by TensorFlower Gardener
parent 222861851e
commit 8a998b3213
12 changed files with 307 additions and 157 deletions
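
Background for the contiguity claim in the message above: the GpuVirtualMemAllocator builds on the CUDA virtual memory management APIs (CUDA 10.2+), which let an allocator reserve a large virtual address range once and back it with physical memory piece by piece, so each growth step lands directly after the previous one. Below is a minimal standalone sketch of that reserve-then-map pattern; it is an illustration only, not code from this commit, and assumes device 0 with asserts standing in for real error handling.

// Sketch: contiguous growth via CUDA virtual memory management (CUDA 10.2+).
// Illustration only -- not code from this commit.
#include <cassert>
#include <cstddef>
#include <cuda.h>

int main() {
  assert(cuInit(0) == CUDA_SUCCESS);
  CUdevice dev;
  CUcontext ctx;
  assert(cuDeviceGet(&dev, 0) == CUDA_SUCCESS);
  assert(cuCtxCreate(&ctx, 0, dev) == CUDA_SUCCESS);

  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = dev;

  // Physical chunks must be a multiple of the device granularity.
  size_t granularity = 0;
  assert(cuMemGetAllocationGranularity(&granularity, &prop,
                                       CU_MEM_ALLOC_GRANULARITY_MINIMUM) ==
         CUDA_SUCCESS);

  // Reserve a large virtual range once; this consumes no device memory.
  const size_t va_size = 1ull << 30;  // 1 GiB of address space
  CUdeviceptr va_base;
  assert(cuMemAddressReserve(&va_base, va_size, /*alignment=*/0,
                             /*addr=*/0, /*flags=*/0) == CUDA_SUCCESS);

  // "Grow" twice. Each new physical chunk is mapped directly after the
  // previous one, so the backed region stays contiguous.
  size_t mapped = 0;
  for (int i = 0; i < 2; ++i) {
    CUmemGenericAllocationHandle handle;
    assert(cuMemCreate(&handle, granularity, &prop, 0) == CUDA_SUCCESS);
    assert(cuMemMap(va_base + mapped, granularity, /*offset=*/0, handle, 0) ==
           CUDA_SUCCESS);
    // The mapping holds its own reference; the handle can be released now.
    assert(cuMemRelease(handle) == CUDA_SUCCESS);
    CUmemAccessDesc access = {};
    access.location = prop.location;
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    assert(cuMemSetAccess(va_base + mapped, granularity, &access, 1) ==
           CUDA_SUCCESS);
    mapped += granularity;
  }

  for (size_t off = 0; off < mapped; off += granularity) {
    assert(cuMemUnmap(va_base + off, granularity) == CUDA_SUCCESS);
  }
  assert(cuMemAddressFree(va_base, va_size) == CUDA_SUCCESS);
  assert(cuCtxDestroy(ctx) == CUDA_SUCCESS);
  return 0;
}

Because each mapping extends the same reserved range, the BFC allocator above it sees one coalescable region rather than the scattered segments cuMemAlloc would return.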

View File

@@ -645,7 +645,7 @@ std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
     // allocator must have been initialized already, so the
     // GetGPUAllocator() call won't create a new allocator.
     dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
-        gpu_options, tf_gpu_id, 1);
+        gpu_options, tf_gpu_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
   }
   return std::make_pair(cuda_device_id, dev_allocator);
 }

View File

@@ -48,10 +48,10 @@ BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
       free_chunks_list_(kInvalidChunkHandle),
       next_allocation_id_(1) {
   if (allow_growth) {
-    // 1MiB smallest initial allocation, unless total memory available
+    // 2MiB smallest initial allocation, unless total memory available
     // is less.
     curr_region_allocation_bytes_ =
-        RoundedBytes(std::min(total_memory, size_t{1048576}));
+        RoundedBytes(std::min(total_memory, size_t{2 << 20}));
   } else {
     curr_region_allocation_bytes_ = RoundedBytes(total_memory);
   }
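
The 2 MiB floor lines up with the physical-allocation granularity of the CUDA virtual memory APIs, which is typically 2 MiB per device, so even the first region of a growing allocator can be backed by whole physical pages; that connection is an inference here, not stated in the commit. A quick check of the constant and of the device's actual granularity (assumes CUDA 10.2+, device 0; illustration only):

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cuda.h>

int main() {
  static_assert(size_t{2 << 20} == size_t{2} * 1024 * 1024,
                "2 << 20 is exactly 2 MiB");

  assert(cuInit(0) == CUDA_SUCCESS);
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = 0;  // device 0

  // Reports the smallest chunk the VMM APIs can map on this device.
  size_t granularity = 0;
  assert(cuMemGetAllocationGranularity(&granularity, &prop,
                                       CU_MEM_ALLOC_GRANULARITY_MINIMUM) ==
         CUDA_SUCCESS);
  std::printf("minimum mapping granularity: %zu bytes\n", granularity);
  return 0;
}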

View File

@@ -600,6 +600,7 @@ class BFCAllocator : public Allocator {
 #endif
   friend class GPUBFCAllocatorPrivateMethodsTest;
+  friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;
   TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
 };

View File

@@ -149,7 +149,7 @@ class GPUDeviceTestHelper {
         DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0");
     gpu_.reset(reinterpret_cast<BaseGPUDevice*>(device_.release()));
     gpu_allocator_ = GPUProcessState::singleton()->GetGPUAllocator(
-        GPUOptions(), TfGpuId(0), memory_limit);
+        GPUOptions(), TfGpuId(0), memory_limit, /*peer_gpu_ids=*/{});
     host_allocator_ = GPUProcessState::singleton()->GetGpuHostAllocator(0);
   }

View File

@@ -158,6 +158,7 @@ tf_cuda_library(
         "//tensorflow/core/profiler/lib:annotated_traceme",
         "//tensorflow/core/profiler/lib:scoped_annotation",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
     alwayslink = 1,
 )

View File

@@ -73,11 +73,11 @@ bool GPUBFCAllocator::GetGarbageCollectionValue() {
   return true;
 }
 
-GPUBFCAllocator::GPUBFCAllocator(DeviceMemAllocator* sub_allocator,
+GPUBFCAllocator::GPUBFCAllocator(SubAllocator* sub_allocator,
                                  size_t total_memory, const string& name)
     : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {}
 
-GPUBFCAllocator::GPUBFCAllocator(DeviceMemAllocator* sub_allocator,
+GPUBFCAllocator::GPUBFCAllocator(SubAllocator* sub_allocator,
                                  size_t total_memory,
                                  const GPUOptions& gpu_options,
                                  const string& name)

View File

@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/device/device_mem_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -33,9 +34,9 @@ namespace tensorflow {
 // algorithm.
 class GPUBFCAllocator : public BFCAllocator {
  public:
-  GPUBFCAllocator(DeviceMemAllocator* sub_allocator, size_t total_memory,
+  GPUBFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                   const string& name);
-  GPUBFCAllocator(DeviceMemAllocator* sub_allocator, size_t total_memory,
+  GPUBFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                   const GPUOptions& gpu_options, const string& name);
   ~GPUBFCAllocator() override {}

View File

@@ -21,26 +21,29 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device/device_id.h"
 #include "tensorflow/core/common_runtime/device/device_id_utils.h"
 #include "tensorflow/core/common_runtime/device/device_mem_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/strcat.h"
+#include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
 
 namespace tensorflow {
 namespace {
 
-static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
-                       int64 peak_bytes_in_use, int64 largest_alloc_size) {
+void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
                int64 peak_bytes_in_use, int64 largest_alloc_size) {
   absl::optional<AllocatorStats> stats = a->GetStats();
   EXPECT_TRUE(stats);
   if (!stats) {
@@ -53,19 +56,54 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
   EXPECT_EQ(stats->largest_alloc_size, largest_alloc_size);
 }
 
-se::StreamExecutor* ExecutorForPlatformGpuId(
-    PlatformDeviceId platform_device_id) {
-  return DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                   platform_device_id)
-      .ValueOrDie();
-}
+class GPUBFCAllocatorTest
+    : public ::testing::TestWithParam<SubAllocator* (*)(size_t)> {};
+
+#if CUDA_VERSION >= 10020
+SubAllocator* CreateVirtualMemorySubAllocator(
+    size_t virtual_address_space_size = 1ull << 32) {
+  PlatformDeviceId gpu_id(0);
+  auto executor =
+      DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
+          .ValueOrDie();
+  auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
+      executor->implementation()->GpuContextHack());
+  return GpuVirtualMemAllocator::Create({}, {}, *gpu_context, gpu_id,
+                                        virtual_address_space_size, {})
+      .ValueOrDie()
+      .release();
+}
+#endif
+
+SubAllocator* CreateGPUMemAllocator(size_t) {
+  PlatformDeviceId gpu_id(0);
+  return new DeviceMemAllocator(
+      DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
+          .ValueOrDie(),
+      gpu_id,
+      /*use_unified_memory=*/false, {}, {});
+}
 
-TEST(GPUBFCAllocatorTest, NoDups) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+SubAllocator* CreateSubAllocator(size_t virtual_address_space_size = 1ull
+                                                                     << 32) {
+#if CUDA_VERSION >= 10020
+  return CreateVirtualMemorySubAllocator(virtual_address_space_size);
+#else
+  return CreateGPUMemAllocator(virtual_address_space_size);
+#endif
+}
+
+auto TestSuiteValues() {
+#if CUDA_VERSION >= 10020
+  return ::testing::Values(&CreateGPUMemAllocator,
+                           &CreateVirtualMemorySubAllocator);
+#else
+  return ::testing::Values(&CreateGPUMemAllocator);
+#endif
+}
+
+TEST_P(GPUBFCAllocatorTest, NoDups) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   // Allocate a lot of raw pointers
@@ -93,12 +131,8 @@ TEST(GPUBFCAllocatorTest, NoDups) {
   CheckStats(&a, 1023, 0, 654336, 1024);
 }
 
-TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   // Allocate 256 raw pointers of sizes between 100 bytes and about
   // a meg
   random::PhiloxRandom philox(123, 17);
@@ -155,12 +189,8 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
   }
 }
 
-TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, ExerciseCoalescing) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   float* first_ptr = TypedAllocator::Allocate<float>(&a, 1024, {});
@@ -194,63 +224,43 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
   a.DeallocateRaw(first_ptr_after);
 }
 
-TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, AllocateZeroBufSize) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   float* ptr = TypedAllocator::Allocate<float>(&a, 0, {});
   EXPECT_EQ(nullptr, ptr);
 }
 
-TEST(GPUBFCAllocatorTest, TracksSizes) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, TracksSizes) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
-TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, AllocatedVsRequested) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
   a.DeallocateRaw(t1);
 }
 
-TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  // Configure a 1MiB byte limit
-  GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
+  // Configure a 2MiB byte limit
+  GPUBFCAllocator a(GetParam()(1ull << 32), 2 << 20, "GPU_0_bfc");
 
   float* first_ptr = TypedAllocator::Allocate<float>(&a, 1 << 6, {});
-  float* second_ptr = TypedAllocator::Allocate<float>(&a, 1 << 20, {});
+  float* second_ptr = TypedAllocator::Allocate<float>(&a, 2 << 20, {});
 
   EXPECT_NE(nullptr, first_ptr);
   EXPECT_EQ(nullptr, second_ptr);
   a.DeallocateRaw(first_ptr);
 }
 
-TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
+TEST_P(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   GPUOptions options;
   options.set_allow_growth(true);
 
   // Max of 2GiB, but starts out small.
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1LL << 31, "GPU_0_bfc");
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1LL << 31, "GPU_0_bfc");
 
   // Allocate 10 raw pointers of sizes between 100 bytes and about
   // 64 megs.
@@ -311,28 +321,20 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   }
 }
 
-TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc");
-  sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
+  GPUBFCAllocator a(GetParam()(1ul << 62), 1UL << 60, "GPU_0_bfc");
+  GPUBFCAllocator b(GetParam()(1ul << 62), 1UL << 60, "GPU_0_bfc");
   void* amem = a.AllocateRaw(1, 1);
   void* bmem = b.AllocateRaw(1, 1 << 30);
   a.DeallocateRaw(amem);
   b.DeallocateRaw(bmem);
 }
 
+INSTANTIATE_TEST_SUITE_P(GPUBFCAllocatorTestSuite, GPUBFCAllocatorTest,
+                         TestSuiteValues());
+
 static void BM_Allocation(int iters) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+  GPUBFCAllocator a(CreateSubAllocator(1ul << 36), 1uLL << 33, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<size_t> sizes = {256, 4096, 16384, 524288,
                                512, 1048576, 10485760, 104857600,
@@ -348,11 +350,7 @@ static void BM_Allocation(int iters) {
 BENCHMARK(BM_Allocation);
 
 static void BM_AllocationThreaded(int iters, int num_threads) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+  GPUBFCAllocator a(CreateSubAllocator(1ul << 36), 1uLL << 33, "GPU_0_bfc");
   thread::ThreadPool pool(Env::Default(), "test", num_threads);
   std::atomic_int_fast32_t count(iters);
   mutex done_lock;
@@ -388,11 +386,7 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
 // A more complex benchmark that defers deallocation of an object for
 // "delay" allocations.
 static void BM_AllocationDelayed(int iters, int delay) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CreateSubAllocator(1ull << 32), 1 << 30, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
   int size_index = 0;
@@ -423,7 +417,8 @@ BENCHMARK(BM_AllocationDelayed)->Arg(1)->Arg(10)->Arg(100)->Arg(1000);
 
 }  // namespace
 
-class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
+class GPUBFCAllocatorPrivateMethodsTest
+    : public ::testing::TestWithParam<SubAllocator* (*)(size_t)> {
  protected:
   void SetUp() override { CHECK_EQ(unsetenv("TF_FORCE_GPU_ALLOW_GROWTH"), 0); }
@@ -432,11 +427,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   // only methods inside this class can access private members of BFCAllocator.
   void TestBinDebugInfo() {
-    PlatformGpuId platform_gpu_id(0);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+    GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
 
     std::vector<void*> initial_ptrs;
     std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -532,11 +523,8 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestLog2FloorNonZeroSlow() {
-    PlatformGpuId platform_gpu_id(0);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc");
+    GPUBFCAllocator a(GetParam()(1ull << 32), 1 /* total_memory */,
+                      "GPU_0_bfc");
     EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
     EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
     EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
@@ -547,65 +535,126 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestForceAllowGrowth() {
-    PlatformGpuId platform_gpu_id(0);
     GPUOptions options;
     // Unset flag value uses provided option.
     unsetenv("TF_FORCE_GPU_ALLOW_GROWTH");
     options.set_allow_growth(true);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator unset_flag_allocator(sub_allocator, 1LL << 31, options,
-                                         "GPU_0_bfc");
-    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{1048576}),
+    GPUBFCAllocator unset_flag_allocator(GetParam()(1ull << 32), 1LL << 31,
+                                         options, "GPU_0_bfc");
+    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{2 << 20}),
               unset_flag_allocator.curr_region_allocation_bytes_);
 
     // Unparseable flag value uses provided option.
     setenv("TF_FORCE_GPU_ALLOW_GROWTH", "unparseable", 1);
     options.set_allow_growth(true);
-    sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator unparsable_flag_allocator(sub_allocator, 1LL << 31, options,
-                                              "GPU_1_bfc");
-    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{1048576}),
+    GPUBFCAllocator unparsable_flag_allocator(GetParam()(1ull << 32), 1LL << 31,
+                                              options, "GPU_1_bfc");
+    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{2 << 20}),
               unparsable_flag_allocator.curr_region_allocation_bytes_);
 
     // Max of 2GiB total memory. Env variable set forces allow_growth, which
     // does an initial allocation of 1MiB.
     setenv("TF_FORCE_GPU_ALLOW_GROWTH", "true", 1);
     options.set_allow_growth(false);
-    sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator force_allow_growth_allocator(sub_allocator, 1LL << 31,
-                                                 options, "GPU_2_bfc");
-    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{1048576}),
+    GPUBFCAllocator force_allow_growth_allocator(
+        GetParam()(1ull << 32), 1LL << 31, options, "GPU_2_bfc");
+    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{2 << 20}),
              force_allow_growth_allocator.curr_region_allocation_bytes_);
 
     // If env variable forces allow_growth disabled, all available memory is
     // allocated.
     setenv("TF_FORCE_GPU_ALLOW_GROWTH", "false", 1);
     options.set_allow_growth(true);
-    sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator force_no_allow_growth_allocator(sub_allocator, 1LL << 31,
-                                                    options, "GPU_3_bfc");
+    GPUBFCAllocator force_no_allow_growth_allocator(
+        GetParam()(1ull << 32), 1LL << 31, options, "GPU_3_bfc");
     EXPECT_EQ(GPUBFCAllocator::RoundedBytes(1LL << 31),
               force_no_allow_growth_allocator.curr_region_allocation_bytes_);
   }
+};
+
+TEST_P(GPUBFCAllocatorPrivateMethodsTest, BinDebugInfo) { TestBinDebugInfo(); }
+
+TEST_P(GPUBFCAllocatorPrivateMethodsTest, Log2FloorNonZeroSlow) {
+  TestLog2FloorNonZeroSlow();
+}
+
+TEST_P(GPUBFCAllocatorPrivateMethodsTest, ForceAllowGrowth) {
+  TestForceAllowGrowth();
+}
+
+INSTANTIATE_TEST_SUITE_P(GPUBFCAllocatorPrivateMethodTestSuite,
+                         GPUBFCAllocatorPrivateMethodsTest, TestSuiteValues());
+
+// Tests that cannot be trivially parameterized for both suballocator types.
+class GPUBFCAllocatorTest_SubAllocatorSpecific : public ::testing::Test {};
+
+#if CUDA_VERSION >= 10020
+// Benchmark for measuring "high water mark" for BFCAllocator owned memory.
+TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
+       VirtualAllocatorPromotesReuse) {
+  GPUOptions options;
+  options.set_allow_growth(true);
+
+  constexpr size_t k512MiB = 512ull << 20;
+
+  // 512 MiB allocator.
+  GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1ull << 32), k512MiB,
+                    options, "GPU_0_bfc");
+  // Allocate 128 raw pointers of 4 megs.
+  const size_t size = 1LL << 22;
+  std::vector<void*> initial_ptrs;
+  for (size_t s = 0; s < 128; s++) {
+    void* raw = a.AllocateRaw(1, size);
+    initial_ptrs.push_back(raw);
+  }
+  // Deallocate all but the last one so the big chunk cannot be GC'd
+  for (int i = 0; i < 127; ++i) {
+    a.DeallocateRaw(initial_ptrs[i]);
+  }
+  void* big_alloc = a.AllocateRaw(1, k512MiB - size);
+  EXPECT_NE(big_alloc, nullptr);
+}
+#endif
+
+TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
+       PhysicalAllocatorOomsFragmentation) {
+  GPUOptions options;
+  options.set_allow_growth(true);
+
+  constexpr size_t k512MiB = 512ull << 20;
+
+  // 512 MiB allocator. Garbage Collection turned off to simulate a situation
+  // where there is memory pressure.
+  GPUBFCAllocator a(CreateGPUMemAllocator(/*ignored*/ 0), k512MiB, options,
+                    "GPU_0_bfc");
+  // Allocate 128 raw pointers of 4 megs.
+  const size_t size = 1LL << 22;
+  std::vector<void*> initial_ptrs;
+  for (size_t s = 0; s < 128; s++) {
+    void* raw = a.AllocateRaw(1, size);
+    initial_ptrs.push_back(raw);
+  }
+  // Deallocate all but the last one so the big chunk cannot be GC'd
+  for (int i = 0; i < 127; ++i) {
+    a.DeallocateRaw(initial_ptrs[i]);
+  }
+
+  void* big_alloc = a.AllocateRaw(1, k512MiB - size);
+  EXPECT_EQ(big_alloc, nullptr);
+}
+
+// Tests that use private functions and cannot be trivially parameterized for
+// both suballocator types.
+class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific
+    : public ::testing::Test {
+ protected:
+  void SetUp() override { CHECK_EQ(unsetenv("TF_FORCE_GPU_ALLOW_GROWTH"), 0); }
 
   void TestRegionDeallocation() {
     GPUOptions options;
     options.set_allow_growth(true);
 
     // Max of 2GiB, but starts out small.
-    PlatformGpuId platform_gpu_id(0);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        /*use_unified_memory=*/false, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1LL << 31, options, "GPU_0_bfc");
+    GPUBFCAllocator a(CreateGPUMemAllocator(/*ignored*/ 0), 1LL << 31, options,
+                      "GPU_0_bfc");
 
     // Allocate 128 raw pointers of 4 megs.
     const size_t size = 1LL << 22;
@@ -641,22 +690,59 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
     }
     EXPECT_EQ(1, num_chunks_in_bins);
   }
 
+#if CUDA_VERSION >= 10020
+  // Counterpart to the GPUMemAllocator test suite TestRegionDeallocation tests.
+  // Here we expect no deallocations because all allocations are coalesced into
+  // a single region.
+  void TestNoRegionDeallocation() {
+    GPUOptions options;
+    options.set_allow_growth(true);
+
+    // Max of 2GiB, but starts out small.
+    GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1uLL << 32), 1LL << 31,
+                      options, "GPU_0_bfc");
+
+    // Allocate 128 raw pointers of 4 megs.
+    const size_t size = 1LL << 22;
+    std::vector<void*> initial_ptrs;
+    for (size_t s = 0; s < 128; s++) {
+      void* raw = a.AllocateRaw(1, size);
+      initial_ptrs.push_back(raw);
+    }
+
+    {
+      mutex_lock l(a.lock_);
+      EXPECT_EQ(1, a.region_manager_.regions().size());
+    }
+
+    // Deallocate all the memories except the last one.
+    for (size_t i = 0; i < initial_ptrs.size() - 1; i++) {
+      a.DeallocateRaw(initial_ptrs[i]);
+    }
+
+    // Deallocate free regions and there should still be only one.
+    EXPECT_EQ(false, a.DeallocateFreeRegions(/*rounded_bytes=*/0));
+    {
+      mutex_lock l(a.lock_);
+      EXPECT_EQ(1, a.region_manager_.regions().size());
+    }
+  }
+#endif
 };
 
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, BinDebugInfo) { TestBinDebugInfo(); }
-
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, Log2FloorNonZeroSlow) {
-  TestLog2FloorNonZeroSlow();
-}
-
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, ForceAllowGrowth) {
-  TestForceAllowGrowth();
-}
-
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, TestRegionDeallocation) {
+TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
+       TestRegionDeallocation) {
   TestRegionDeallocation();
 }
 
+#if CUDA_VERSION >= 10020
+TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
+       TestNoRegionDeallocation) {
+  TestNoRegionDeallocation();
+}
+#endif
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -1371,7 +1371,7 @@ Status BaseGPUDeviceFactory::CreateDevices(
                                      tf_gpu_id.value());
     }
     TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
-                                       it->second, devices));
+                                       it->second, num_tf_gpus, devices));
   }
   return Status::OK();
 }
@@ -1400,7 +1400,7 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
 Status BaseGPUDeviceFactory::CreateGPUDevice(
     const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
-    int64 memory_limit, const DeviceLocality& dev_locality,
+    int64 memory_limit, const DeviceLocality& dev_locality, size_t num_tf_gpus,
     std::vector<std::unique_ptr<Device>>* devices) {
   CHECK_GE(tf_gpu_id.value(), 0);
   const string device_name =
@@ -1418,9 +1418,19 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
     return desc_status.status();
   }
   auto desc = desc_status.ConsumeValueOrDie();
+
+  std::vector<TfGpuId> peer_gpu_ids;
+  peer_gpu_ids.reserve(num_tf_gpus);
+  for (int id = 0; id < num_tf_gpus; ++id) {
+    TfGpuId peer_tf_gpu_id(id);
+    if (peer_tf_gpu_id != tf_gpu_id) {
+      peer_gpu_ids.push_back(peer_tf_gpu_id);
+    }
+  }
+
   GPUProcessState* process_state = GPUProcessState::singleton();
   Allocator* gpu_allocator = process_state->GetGPUAllocator(
-      options.config.gpu_options(), tf_gpu_id, memory_limit);
+      options.config.gpu_options(), tf_gpu_id, memory_limit, peer_gpu_ids);
   if (gpu_allocator == nullptr) {
     return errors::Internal("Failed to get memory allocator for TF GPU ",
                             tf_gpu_id.value(), " with ", memory_limit,

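Why the factory now passes every other TF GPU id down to the allocator: memory mapped through the CUDA virtual memory APIs is visible only to the devices named when access is granted on each mapping, so the allocator must know its peers up front rather than relying on runtime peer-access enablement. A hedged sketch of the underlying driver call; GrantPeerAccess is a hypothetical helper for illustration, not part of this commit.

#include <cassert>
#include <cstddef>
#include <cuda.h>

// Grant device `peer` read/write access to a VMM-backed range mapped on
// device `owner`. With the CUDA virtual memory APIs this must be done per
// mapping, which is why the allocator wants the peer list at construction.
void GrantPeerAccess(CUdeviceptr ptr, size_t size, int owner, int peer) {
  CUmemAccessDesc desc[2] = {};
  desc[0].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  desc[0].location.id = owner;
  desc[0].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  desc[1].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  desc[1].location.id = peer;
  desc[1].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  assert(cuMemSetAccess(ptr, size, desc, 2) == CUDA_SUCCESS);
}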
View File

@@ -354,6 +354,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   Status CreateGPUDevice(const SessionOptions& options,
                          const std::string& name_prefix, TfGpuId tf_gpu_id,
                          int64 memory_limit, const DeviceLocality& dev_locality,
+                         size_t num_tf_gpus,
                          std::vector<std::unique_ptr<Device>>* devices);
 
   virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(

View File

@@ -18,6 +18,7 @@ limitations under the License.
 #include <cstring>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/common_runtime/device/device_host_allocator.h"
 #include "tensorflow/core/common_runtime/device/device_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
@@ -26,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -77,9 +79,61 @@ int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
   return numa_node >= 0 ? numa_node : 0;
 }
 
-Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
-                                            TfGpuId tf_gpu_id,
-                                            size_t total_bytes) {
+// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
+static SubAllocator* CreateSubAllocator(
+    const GPUOptions& options, PlatformGpuId platform_gpu_id,
+    const std::vector<SubAllocator::Visitor>& alloc_visitors,
+    size_t total_bytes, const std::vector<TfGpuId>& peer_gpu_ids) {
+  auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
+                                                            platform_gpu_id)
+                      .ValueOrDie();
+
+#if defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
+  // Use the old allocator when unified memory is required.
+  // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
+  if (options.per_process_gpu_memory_fraction() > 1.0 ||
+      options.experimental().use_unified_memory()) {
+    return new DeviceMemAllocator(executor, platform_gpu_id,
+                                  /*use_unified_memory=*/true, alloc_visitors,
+                                  {});
+  } else {
+    auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
+        executor->implementation()->GpuContextHack());
+
+    absl::flat_hash_set<PlatformGpuId> platform_peer_gpu_ids;
+    platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
+    for (const TfGpuId tf_gpu_id : peer_gpu_ids) {
+      PlatformGpuId platform_gpu_id;
+      TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
+      platform_peer_gpu_ids.insert(platform_gpu_id);
+    }
+    std::vector<PlatformGpuId> platform_peer_gpu_ids_vec(
+        platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
+
+    // Adjust virtual address space to be slightly larger than the physical
+    // address space in case the BFC allocator performs suboptimal garbage
+    // collection.
+    // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
+    // the va space.
+    return GpuVirtualMemAllocator::Create(
+               alloc_visitors, {}, *gpu_context, platform_gpu_id,
+               /*virtual_address_space_size=*/total_bytes * 2,
+               platform_peer_gpu_ids_vec)
+        .ValueOrDie()
+        .release();
+  }
+#else
+  return new DeviceMemAllocator(
+      executor, platform_gpu_id,
+      (options.per_process_gpu_memory_fraction() > 1.0 ||
+       options.experimental().use_unified_memory()),
+      alloc_visitors, {});
+#endif
+}
+
+Allocator* GPUProcessState::GetGPUAllocator(
+    const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes,
+    const std::vector<TfGpuId>& peer_gpu_ids) {
   CHECK(process_state_);
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
@@ -107,14 +161,9 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
     while (bus_id >= gpu_visitors_.size()) {
       gpu_visitors_.push_back({});
     }
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                  platform_gpu_id)
-            .ValueOrDie(),
-        platform_gpu_id,
-        (options.per_process_gpu_memory_fraction() > 1.0 ||
-         options.experimental().use_unified_memory()),
-        gpu_visitors_[bus_id], {});
+    auto* sub_allocator =
+        CreateSubAllocator(options, platform_gpu_id, gpu_visitors_[bus_id],
+                           total_bytes, peer_gpu_ids);
     GPUBFCAllocator* gpu_bfc_allocator =
         new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));

View File

@@ -82,7 +82,8 @@ class GPUProcessState {
   // REQUIRES: tf_gpu_id must be a valid id for a BaseGPUDevice available in the
   // current system environment. Otherwise returns nullptr.
   virtual Allocator* GetGPUAllocator(const GPUOptions& options,
-                                     TfGpuId tf_gpu_id, size_t total_bytes);
+                                     TfGpuId tf_gpu_id, size_t total_bytes,
+                                     const std::vector<TfGpuId>& peer_gpu_ids);
 
   int NumGPUAllocators() {
     mutex_lock l(mu_);