Use GpuVirtualMemAllocator in GPUBFCAllocator.
The GpuVirtualMemAllocator is a suballocator that guarantees contiguous suballocations, allowing the BFC allocator to grow without fragmentation. This should reduce the risk of OOMing when the allow_growth flag is set.

PiperOrigin-RevId: 354642117
Change-Id: I6b20bd87a155443ed32755368c665e6cf438d079
parent 222861851e
commit 8a998b3213
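For orientation, the wiring this change enables looks roughly like the sketch below, modeled on the CreateVirtualMemorySubAllocator / CreateGPUMemAllocator test helpers added in this diff. The helper name MakeSubAllocator and the address-space size are illustrative only, not part of the change: on CUDA 10.2+ the BFC allocator can sit on top of a GpuVirtualMemAllocator, whose suballocations stay contiguous inside one reserved virtual address range, so regions added under allow_growth do not fragment device memory; otherwise it falls back to the existing DeviceMemAllocator.

// Illustrative sketch only; mirrors the test helpers introduced in the diff below.
#if CUDA_VERSION >= 10020
SubAllocator* MakeSubAllocator(se::StreamExecutor* executor,
                               PlatformDeviceId gpu_id,
                               size_t virtual_address_space_size) {
  // The virtual-memory sub-allocator reserves one large VA range up front and
  // maps physical memory into it on demand, so every suballocation is
  // contiguous with the previous ones.
  auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
      executor->implementation()->GpuContextHack());
  return GpuVirtualMemAllocator::Create({}, {}, *gpu_context, gpu_id,
                                        virtual_address_space_size,
                                        /*peer_gpu_ids=*/{})
      .ValueOrDie()
      .release();
}
#else
SubAllocator* MakeSubAllocator(se::StreamExecutor* executor,
                               PlatformDeviceId gpu_id, size_t) {
  // Pre-CUDA-10.2 fallback: plain device-memory sub-allocator.
  return new DeviceMemAllocator(executor, gpu_id,
                                /*use_unified_memory=*/false, {}, {});
}
#endif

// Either way, the BFC allocator is constructed on top of the sub-allocator:
//   GPUBFCAllocator bfc(sub_allocator, total_memory, GPUOptions(), "GPU_0_bfc");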
@@ -645,7 +645,7 @@ std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
     // allocator must have been initialized already, so the
     // GetGPUAllocator() call won't create a new allocator.
     dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
-        gpu_options, tf_gpu_id, 1);
+        gpu_options, tf_gpu_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
   }
   return std::make_pair(cuda_device_id, dev_allocator);
 }
@@ -48,10 +48,10 @@ BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
       free_chunks_list_(kInvalidChunkHandle),
       next_allocation_id_(1) {
   if (allow_growth) {
-    // 1MiB smallest initial allocation, unless total memory available
+    // 2MiB smallest initial allocation, unless total memory available
     // is less.
     curr_region_allocation_bytes_ =
-        RoundedBytes(std::min(total_memory, size_t{1048576}));
+        RoundedBytes(std::min(total_memory, size_t{2 << 20}));
   } else {
     curr_region_allocation_bytes_ = RoundedBytes(total_memory);
   }
@@ -600,6 +600,7 @@ class BFCAllocator : public Allocator {
 #endif

   friend class GPUBFCAllocatorPrivateMethodsTest;
+  friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;
   TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
 };

@@ -149,7 +149,7 @@ class GPUDeviceTestHelper {
         DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0");
     gpu_.reset(reinterpret_cast<BaseGPUDevice*>(device_.release()));
     gpu_allocator_ = GPUProcessState::singleton()->GetGPUAllocator(
-        GPUOptions(), TfGpuId(0), memory_limit);
+        GPUOptions(), TfGpuId(0), memory_limit, /*peer_gpu_ids=*/{});
     host_allocator_ = GPUProcessState::singleton()->GetGpuHostAllocator(0);
   }

@@ -158,6 +158,7 @@ tf_cuda_library(
         "//tensorflow/core/profiler/lib:annotated_traceme",
         "//tensorflow/core/profiler/lib:scoped_annotation",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
     alwayslink = 1,
 )
@@ -73,11 +73,11 @@ bool GPUBFCAllocator::GetGarbageCollectionValue() {
   return true;
 }

-GPUBFCAllocator::GPUBFCAllocator(DeviceMemAllocator* sub_allocator,
+GPUBFCAllocator::GPUBFCAllocator(SubAllocator* sub_allocator,
                                  size_t total_memory, const string& name)
     : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {}

-GPUBFCAllocator::GPUBFCAllocator(DeviceMemAllocator* sub_allocator,
+GPUBFCAllocator::GPUBFCAllocator(SubAllocator* sub_allocator,
                                  size_t total_memory,
                                  const GPUOptions& gpu_options,
                                  const string& name)
@@ -23,6 +23,7 @@ limitations under the License.

 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/device/device_mem_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -33,9 +34,9 @@ namespace tensorflow {
 // algorithm.
 class GPUBFCAllocator : public BFCAllocator {
  public:
-  GPUBFCAllocator(DeviceMemAllocator* sub_allocator, size_t total_memory,
+  GPUBFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                   const string& name);
-  GPUBFCAllocator(DeviceMemAllocator* sub_allocator, size_t total_memory,
+  GPUBFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                   const GPUOptions& gpu_options, const string& name);
   ~GPUBFCAllocator() override {}

@@ -21,26 +21,29 @@ limitations under the License.
 #include <algorithm>
 #include <vector>

+#include "tensorflow/core/common_runtime/device/device_id.h"
 #include "tensorflow/core/common_runtime/device/device_id_utils.h"
+#include "tensorflow/core/common_runtime/device/device_mem_allocator.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/strcat.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"

 namespace tensorflow {
 namespace {

-static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
-                       int64 peak_bytes_in_use, int64 largest_alloc_size) {
+void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
+                int64 peak_bytes_in_use, int64 largest_alloc_size) {
   absl::optional<AllocatorStats> stats = a->GetStats();
   EXPECT_TRUE(stats);
   if (!stats) {
@@ -53,19 +56,54 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
   EXPECT_EQ(stats->largest_alloc_size, largest_alloc_size);
 }

-se::StreamExecutor* ExecutorForPlatformGpuId(
-    PlatformDeviceId platform_device_id) {
-  return DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                   platform_device_id)
-      .ValueOrDie();
+class GPUBFCAllocatorTest
+    : public ::testing::TestWithParam<SubAllocator* (*)(size_t)> {};
+
+#if CUDA_VERSION >= 10020
+SubAllocator* CreateVirtualMemorySubAllocator(
+    size_t virtual_address_space_size = 1ull << 32) {
+  PlatformDeviceId gpu_id(0);
+  auto executor =
+      DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
+          .ValueOrDie();
+  auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
+      executor->implementation()->GpuContextHack());
+  return GpuVirtualMemAllocator::Create({}, {}, *gpu_context, gpu_id,
+                                        virtual_address_space_size, {})
+      .ValueOrDie()
+      .release();
 }
+#endif

+SubAllocator* CreateGPUMemAllocator(size_t) {
+  PlatformDeviceId gpu_id(0);
+  return new DeviceMemAllocator(
+      DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
+          .ValueOrDie(),
+      gpu_id,
+      /*use_unified_memory=*/false, {}, {});
+}

-TEST(GPUBFCAllocatorTest, NoDups) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+SubAllocator* CreateSubAllocator(size_t virtual_address_space_size = 1ull
+                                                                     << 32) {
+#if CUDA_VERSION >= 10020
+  return CreateVirtualMemorySubAllocator(virtual_address_space_size);
+#else
+  return CreateGPUMemAllocator(virtual_address_space_size);
+#endif
+}
+
+auto TestSuiteValues() {
+#if CUDA_VERSION >= 10020
+  return ::testing::Values(&CreateGPUMemAllocator,
+                           &CreateVirtualMemorySubAllocator);
+#else
+  return ::testing::Values(&CreateGPUMemAllocator);
+#endif
+}
+
+TEST_P(GPUBFCAllocatorTest, NoDups) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);

   // Allocate a lot of raw pointers
@@ -93,12 +131,8 @@ TEST(GPUBFCAllocatorTest, NoDups) {
   CheckStats(&a, 1023, 0, 654336, 1024);
 }

-TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   // Allocate 256 raw pointers of sizes between 100 bytes and about
   // a meg
   random::PhiloxRandom philox(123, 17);
@@ -155,12 +189,8 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
   }
 }

-TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, ExerciseCoalescing) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);

   float* first_ptr = TypedAllocator::Allocate<float>(&a, 1024, {});
@@ -194,63 +224,43 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
   a.DeallocateRaw(first_ptr_after);
 }

-TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, AllocateZeroBufSize) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   float* ptr = TypedAllocator::Allocate<float>(&a, 0, {});
   EXPECT_EQ(nullptr, ptr);
 }

-TEST(GPUBFCAllocatorTest, TracksSizes) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, TracksSizes) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }

-TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, AllocatedVsRequested) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
   a.DeallocateRaw(t1);
 }

-TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  // Configure a 1MiB byte limit
-  GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
+  // Configure a 2MiB byte limit
+  GPUBFCAllocator a(GetParam()(1ull << 32), 2 << 20, "GPU_0_bfc");

   float* first_ptr = TypedAllocator::Allocate<float>(&a, 1 << 6, {});
-  float* second_ptr = TypedAllocator::Allocate<float>(&a, 1 << 20, {});
+  float* second_ptr = TypedAllocator::Allocate<float>(&a, 2 << 20, {});

   EXPECT_NE(nullptr, first_ptr);
   EXPECT_EQ(nullptr, second_ptr);
   a.DeallocateRaw(first_ptr);
 }

-TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
+TEST_P(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   GPUOptions options;
   options.set_allow_growth(true);

   // Max of 2GiB, but starts out small.
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1LL << 31, "GPU_0_bfc");
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1LL << 31, "GPU_0_bfc");

   // Allocate 10 raw pointers of sizes between 100 bytes and about
   // 64 megs.
@@ -311,28 +321,20 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   }
 }

-TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc");
-  sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
+  GPUBFCAllocator a(GetParam()(1ul << 62), 1UL << 60, "GPU_0_bfc");
+  GPUBFCAllocator b(GetParam()(1ul << 62), 1UL << 60, "GPU_0_bfc");
   void* amem = a.AllocateRaw(1, 1);
   void* bmem = b.AllocateRaw(1, 1 << 30);
   a.DeallocateRaw(amem);
   b.DeallocateRaw(bmem);
 }

+INSTANTIATE_TEST_SUITE_P(GPUBFCAllocatorTestSuite, GPUBFCAllocatorTest,
+                         TestSuiteValues());
+
 static void BM_Allocation(int iters) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+  GPUBFCAllocator a(CreateSubAllocator(1ul << 36), 1uLL << 33, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<size_t> sizes = {256, 4096, 16384, 524288,
                                512, 1048576, 10485760, 104857600,
@@ -348,11 +350,7 @@ static void BM_Allocation(int iters) {
 BENCHMARK(BM_Allocation);

 static void BM_AllocationThreaded(int iters, int num_threads) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+  GPUBFCAllocator a(CreateSubAllocator(1ul << 36), 1uLL << 33, "GPU_0_bfc");
   thread::ThreadPool pool(Env::Default(), "test", num_threads);
   std::atomic_int_fast32_t count(iters);
   mutex done_lock;
@@ -388,11 +386,7 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
 // A more complex benchmark that defers deallocation of an object for
 // "delay" allocations.
 static void BM_AllocationDelayed(int iters, int delay) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CreateSubAllocator(1ull << 32), 1 << 30, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
   int size_index = 0;
@@ -423,7 +417,8 @@ BENCHMARK(BM_AllocationDelayed)->Arg(1)->Arg(10)->Arg(100)->Arg(1000);

 }  // namespace

-class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
+class GPUBFCAllocatorPrivateMethodsTest
+    : public ::testing::TestWithParam<SubAllocator* (*)(size_t)> {
  protected:
  void SetUp() override { CHECK_EQ(unsetenv("TF_FORCE_GPU_ALLOW_GROWTH"), 0); }

@@ -432,11 +427,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   // only methods inside this class can access private members of BFCAllocator.

   void TestBinDebugInfo() {
-    PlatformGpuId platform_gpu_id(0);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+    GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");

     std::vector<void*> initial_ptrs;
     std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -532,11 +523,8 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }

   void TestLog2FloorNonZeroSlow() {
-    PlatformGpuId platform_gpu_id(0);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc");
+    GPUBFCAllocator a(GetParam()(1ull << 32), 1 /* total_memory */,
+                      "GPU_0_bfc");
     EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
     EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
     EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
@@ -547,65 +535,126 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }

   void TestForceAllowGrowth() {
-    PlatformGpuId platform_gpu_id(0);
     GPUOptions options;
     // Unset flag value uses provided option.
     unsetenv("TF_FORCE_GPU_ALLOW_GROWTH");
     options.set_allow_growth(true);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator unset_flag_allocator(sub_allocator, 1LL << 31, options,
-                                         "GPU_0_bfc");
-    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{1048576}),
+    GPUBFCAllocator unset_flag_allocator(GetParam()(1ull << 32), 1LL << 31,
+                                         options, "GPU_0_bfc");
+    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{2 << 20}),
               unset_flag_allocator.curr_region_allocation_bytes_);

     // Unparseable flag value uses provided option.
     setenv("TF_FORCE_GPU_ALLOW_GROWTH", "unparseable", 1);
     options.set_allow_growth(true);
-    sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator unparsable_flag_allocator(sub_allocator, 1LL << 31, options,
-                                              "GPU_1_bfc");
-    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{1048576}),
+    GPUBFCAllocator unparsable_flag_allocator(GetParam()(1ull << 32), 1LL << 31,
+                                              options, "GPU_1_bfc");
+    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{2 << 20}),
               unparsable_flag_allocator.curr_region_allocation_bytes_);

     // Max of 2GiB total memory. Env variable set forces allow_growth, which
     // does an initial allocation of 1MiB.
     setenv("TF_FORCE_GPU_ALLOW_GROWTH", "true", 1);
     options.set_allow_growth(false);
-    sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator force_allow_growth_allocator(sub_allocator, 1LL << 31,
-                                                 options, "GPU_2_bfc");
-    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{1048576}),
+    GPUBFCAllocator force_allow_growth_allocator(
+        GetParam()(1ull << 32), 1LL << 31, options, "GPU_2_bfc");
+    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{2 << 20}),
               force_allow_growth_allocator.curr_region_allocation_bytes_);

     // If env variable forces allow_growth disabled, all available memory is
     // allocated.
     setenv("TF_FORCE_GPU_ALLOW_GROWTH", "false", 1);
     options.set_allow_growth(true);
-    sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator force_no_allow_growth_allocator(sub_allocator, 1LL << 31,
-                                                    options, "GPU_3_bfc");
+    GPUBFCAllocator force_no_allow_growth_allocator(
+        GetParam()(1ull << 32), 1LL << 31, options, "GPU_3_bfc");
     EXPECT_EQ(GPUBFCAllocator::RoundedBytes(1LL << 31),
               force_no_allow_growth_allocator.curr_region_allocation_bytes_);
   }
+};
+
+TEST_P(GPUBFCAllocatorPrivateMethodsTest, BinDebugInfo) { TestBinDebugInfo(); }
+
+TEST_P(GPUBFCAllocatorPrivateMethodsTest, Log2FloorNonZeroSlow) {
+  TestLog2FloorNonZeroSlow();
+}
+
+TEST_P(GPUBFCAllocatorPrivateMethodsTest, ForceAllowGrowth) {
+  TestForceAllowGrowth();
+}
+
+INSTANTIATE_TEST_SUITE_P(GPUBFCAllocatorPrivateMethodTestSuite,
+                         GPUBFCAllocatorPrivateMethodsTest, TestSuiteValues());
+
+// Tests that cannot be trivially parameterized for both suballocator types.
+class GPUBFCAllocatorTest_SubAllocatorSpecific : public ::testing::Test {};
+
+#if CUDA_VERSION >= 10020
+// Benchmark for measuring "high water mark" for BFCAllocator owned memory.
+TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
+       VirtualAllocatorPromotesReuse) {
+  GPUOptions options;
+  options.set_allow_growth(true);
+
+  constexpr size_t k512MiB = 512ull << 20;
+
+  // 512 MiB allocator.
+  GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1ull << 32), k512MiB,
+                    options, "GPU_0_bfc");
+  // Allocate 128 raw pointers of 4 megs.
+  const size_t size = 1LL << 22;
+  std::vector<void*> initial_ptrs;
+  for (size_t s = 0; s < 128; s++) {
+    void* raw = a.AllocateRaw(1, size);
+    initial_ptrs.push_back(raw);
+  }
+  // Deallocate all but the last one so the big chunk cannot be GC'd
+  for (int i = 0; i < 127; ++i) {
+    a.DeallocateRaw(initial_ptrs[i]);
+  }
+  void* big_alloc = a.AllocateRaw(1, k512MiB - size);
+  EXPECT_NE(big_alloc, nullptr);
+}
+#endif
+
+TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
+       PhysicalAllocatorOomsFragmentation) {
+  GPUOptions options;
+  options.set_allow_growth(true);
+  constexpr size_t k512MiB = 512ull << 20;
+
+  // 512 MiB allocator. Garbage Collection turned off to simulate a situation
+  // where there is memory pressure.
+  GPUBFCAllocator a(CreateGPUMemAllocator(/*ignored*/ 0), k512MiB, options,
+                    "GPU_0_bfc");
+  // Allocate 128 raw pointers of 4 megs.
+  const size_t size = 1LL << 22;
+  std::vector<void*> initial_ptrs;
+  for (size_t s = 0; s < 128; s++) {
+    void* raw = a.AllocateRaw(1, size);
+    initial_ptrs.push_back(raw);
+  }
+  // Deallocate all but the last one so the big chunk cannot be GC'd
+  for (int i = 0; i < 127; ++i) {
+    a.DeallocateRaw(initial_ptrs[i]);
+  }
+  void* big_alloc = a.AllocateRaw(1, k512MiB - size);
+  EXPECT_EQ(big_alloc, nullptr);
+}
+
+// Tests that use private functions and cannot be trivially parameterized for
+// both suballocator types.
+class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific
+    : public ::testing::Test {
+ protected:
+  void SetUp() override { CHECK_EQ(unsetenv("TF_FORCE_GPU_ALLOW_GROWTH"), 0); }

   void TestRegionDeallocation() {
     GPUOptions options;
     options.set_allow_growth(true);

     // Max of 2GiB, but starts out small.
-    PlatformGpuId platform_gpu_id(0);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        /*use_unified_memory=*/false, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1LL << 31, options, "GPU_0_bfc");
+    GPUBFCAllocator a(CreateGPUMemAllocator(/*ignored*/ 0), 1LL << 31, options,
+                      "GPU_0_bfc");

     // Allocate 128 raw pointers of 4 megs.
     const size_t size = 1LL << 22;
@@ -641,22 +690,59 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
     }
     EXPECT_EQ(1, num_chunks_in_bins);
   }

+#if CUDA_VERSION >= 10020
+  // Counterpart to the GPUMemAllocator test suite TestRegionDeallocation tests.
+  // Here we expect no deallocations because all allocations are coalesced into
+  // a single region.
+  void TestNoRegionDeallocation() {
+    GPUOptions options;
+    options.set_allow_growth(true);
+
+    // Max of 2GiB, but starts out small.
+    GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1uLL << 32), 1LL << 31,
+                      options, "GPU_0_bfc");
+
+    // Allocate 128 raw pointers of 4 megs.
+    const size_t size = 1LL << 22;
+    std::vector<void*> initial_ptrs;
+    for (size_t s = 0; s < 128; s++) {
+      void* raw = a.AllocateRaw(1, size);
+      initial_ptrs.push_back(raw);
+    }
+
+    {
+      mutex_lock l(a.lock_);
+      EXPECT_EQ(1, a.region_manager_.regions().size());
+    }
+
+    // Deallocate all the memories except the last one.
+    for (size_t i = 0; i < initial_ptrs.size() - 1; i++) {
+      a.DeallocateRaw(initial_ptrs[i]);
+    }
+
+    // Deallocate free regions and there should still be only one.
+    EXPECT_EQ(false, a.DeallocateFreeRegions(/*rounded_bytes=*/0));
+    {
+      mutex_lock l(a.lock_);
+      EXPECT_EQ(1, a.region_manager_.regions().size());
+    }
+  }
+#endif
 };

-TEST_F(GPUBFCAllocatorPrivateMethodsTest, BinDebugInfo) { TestBinDebugInfo(); }
-
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, Log2FloorNonZeroSlow) {
-  TestLog2FloorNonZeroSlow();
-}
-
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, ForceAllowGrowth) {
-  TestForceAllowGrowth();
-}
-
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, TestRegionDeallocation) {
+TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
+       TestRegionDeallocation) {
   TestRegionDeallocation();
 }

+#if CUDA_VERSION >= 10020
+TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
+       TestNoRegionDeallocation) {
+  TestNoRegionDeallocation();
+}
+#endif
+
 }  // namespace tensorflow

 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -1371,7 +1371,7 @@ Status BaseGPUDeviceFactory::CreateDevices(
                               tf_gpu_id.value());
     }
     TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
-                                       it->second, devices));
+                                       it->second, num_tf_gpus, devices));
   }
   return Status::OK();
 }
@@ -1400,7 +1400,7 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,

 Status BaseGPUDeviceFactory::CreateGPUDevice(
     const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
-    int64 memory_limit, const DeviceLocality& dev_locality,
+    int64 memory_limit, const DeviceLocality& dev_locality, size_t num_tf_gpus,
     std::vector<std::unique_ptr<Device>>* devices) {
   CHECK_GE(tf_gpu_id.value(), 0);
   const string device_name =
@@ -1418,9 +1418,19 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
     return desc_status.status();
   }
   auto desc = desc_status.ConsumeValueOrDie();
+
+  std::vector<TfGpuId> peer_gpu_ids;
+  peer_gpu_ids.reserve(num_tf_gpus);
+  for (int id = 0; id < num_tf_gpus; ++id) {
+    TfGpuId peer_tf_gpu_id(id);
+    if (peer_tf_gpu_id != tf_gpu_id) {
+      peer_gpu_ids.push_back(peer_tf_gpu_id);
+    }
+  }
+
   GPUProcessState* process_state = GPUProcessState::singleton();
   Allocator* gpu_allocator = process_state->GetGPUAllocator(
-      options.config.gpu_options(), tf_gpu_id, memory_limit);
+      options.config.gpu_options(), tf_gpu_id, memory_limit, peer_gpu_ids);
   if (gpu_allocator == nullptr) {
     return errors::Internal("Failed to get memory allocator for TF GPU ",
                             tf_gpu_id.value(), " with ", memory_limit,
@@ -354,6 +354,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   Status CreateGPUDevice(const SessionOptions& options,
                          const std::string& name_prefix, TfGpuId tf_gpu_id,
                          int64 memory_limit, const DeviceLocality& dev_locality,
+                         size_t num_tf_gpus,
                          std::vector<std::unique_ptr<Device>>* devices);

   virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
@@ -18,6 +18,7 @@ limitations under the License.
 #include <cstring>
 #include <vector>

+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/common_runtime/device/device_host_allocator.h"
 #include "tensorflow/core/common_runtime/device/device_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
@@ -26,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -77,9 +79,61 @@ int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
   return numa_node >= 0 ? numa_node : 0;
 }

-Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
-                                            TfGpuId tf_gpu_id,
-                                            size_t total_bytes) {
+// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
+static SubAllocator* CreateSubAllocator(
+    const GPUOptions& options, PlatformGpuId platform_gpu_id,
+    const std::vector<SubAllocator::Visitor>& alloc_visitors,
+    size_t total_bytes, const std::vector<TfGpuId>& peer_gpu_ids) {
+  auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
+                                                            platform_gpu_id)
+                      .ValueOrDie();
+
+#if defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
+  // Use the old allocator when unified memory is required.
+  // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
+  if (options.per_process_gpu_memory_fraction() > 1.0 ||
+      options.experimental().use_unified_memory()) {
+    return new DeviceMemAllocator(executor, platform_gpu_id,
+                                  /*use_unified_memory=*/true, alloc_visitors,
+                                  {});
+  } else {
+    auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
+        executor->implementation()->GpuContextHack());
+
+    absl::flat_hash_set<PlatformGpuId> platform_peer_gpu_ids;
+    platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
+    for (const TfGpuId tf_gpu_id : peer_gpu_ids) {
+      PlatformGpuId platform_gpu_id;
+      TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
+      platform_peer_gpu_ids.insert(platform_gpu_id);
+    }
+    std::vector<PlatformGpuId> platform_peer_gpu_ids_vec(
+        platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
+
+    // Adjust virtual address space to be slightly larger than the physical
+    // address space in case the BFC allocator performs suboptimal garbage
+    // collection.
+    // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
+    // the va space.
+    return GpuVirtualMemAllocator::Create(
+               alloc_visitors, {}, *gpu_context, platform_gpu_id,
+               /*virtual_address_space_size=*/total_bytes * 2,
+               platform_peer_gpu_ids_vec)
+        .ValueOrDie()
+        .release();
+  }
+#else
+  return new DeviceMemAllocator(
+      executor, platform_gpu_id,
+      (options.per_process_gpu_memory_fraction() > 1.0 ||
+       options.experimental().use_unified_memory()),
+      alloc_visitors, {});
+#endif
+}
+
+Allocator* GPUProcessState::GetGPUAllocator(
+    const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes,
+    const std::vector<TfGpuId>& peer_gpu_ids) {
   CHECK(process_state_);
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
@@ -107,14 +161,9 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
     while (bus_id >= gpu_visitors_.size()) {
       gpu_visitors_.push_back({});
     }
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                  platform_gpu_id)
-            .ValueOrDie(),
-        platform_gpu_id,
-        (options.per_process_gpu_memory_fraction() > 1.0 ||
-         options.experimental().use_unified_memory()),
-        gpu_visitors_[bus_id], {});
+    auto* sub_allocator =
+        CreateSubAllocator(options, platform_gpu_id, gpu_visitors_[bus_id],
+                           total_bytes, peer_gpu_ids);
     GPUBFCAllocator* gpu_bfc_allocator =
         new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
@@ -82,7 +82,8 @@ class GPUProcessState {
   // REQUIRES: tf_gpu_id must be a valid id for a BaseGPUDevice available in the
   // current system environment. Otherwise returns nullptr.
   virtual Allocator* GetGPUAllocator(const GPUOptions& options,
-                                     TfGpuId tf_gpu_id, size_t total_bytes);
+                                     TfGpuId tf_gpu_id, size_t total_bytes,
+                                     const std::vector<TfGpuId>& peer_gpu_ids);

   int NumGPUAllocators() {
     mutex_lock l(mu_);
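As a usage note, the call sites in this change fill the new peer_gpu_ids argument in one of two ways: the GetDeviceAndAllocator and test-helper call sites pass an empty list, while BaseGPUDeviceFactory::CreateGPUDevice (see the hunk above) passes every other TF GPU id so the virtual-memory sub-allocator can map its allocations into peer devices. A minimal illustrative call, with placeholder values, looks like this:

// Illustrative only; mirrors the call assembled in BaseGPUDeviceFactory::CreateGPUDevice.
std::vector<TfGpuId> peer_gpu_ids;  // every TfGpuId other than tf_gpu_id
Allocator* gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator(
    gpu_options, tf_gpu_id, /*total_bytes=*/1ull << 30, peer_gpu_ids);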