Use GpuVirtualMemAllocator in GPUBFCAllocator.

The GpuVirtualMemAllocator is a suballocator that guarantees contiguous
suballocations, allowing the BFC allocator to grow without fragmentation.
This should reduce the risk of OOMing when the allow_growth flag is set.

PiperOrigin-RevId: 354642117
Change-Id: I6b20bd87a155443ed32755368c665e6cf438d079
A. Unique TensorFlower 2021-01-29 17:27:00 -08:00 committed by TensorFlower Gardener
parent 222861851e
commit 8a998b3213
12 changed files with 307 additions and 157 deletions
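
Background for the contiguity claim in the message above: the GpuVirtualMemAllocator builds on the CUDA virtual memory management APIs (CUDA 10.2+), which let an allocator reserve a large virtual address range once and back it with physical memory piece by piece, so each growth step lands directly after the previous one. Below is a minimal standalone sketch of that reserve-then-map pattern; it is an illustration only, not code from this commit, and assumes device 0 with asserts standing in for real error handling.

// Sketch: contiguous growth via CUDA virtual memory management (CUDA 10.2+).
// Illustration only -- not code from this commit.
#include <cassert>
#include <cstddef>
#include <cuda.h>

int main() {
  assert(cuInit(0) == CUDA_SUCCESS);
  CUdevice dev;
  CUcontext ctx;
  assert(cuDeviceGet(&dev, 0) == CUDA_SUCCESS);
  assert(cuCtxCreate(&ctx, 0, dev) == CUDA_SUCCESS);

  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = dev;

  // Physical chunks must be a multiple of the device granularity.
  size_t granularity = 0;
  assert(cuMemGetAllocationGranularity(&granularity, &prop,
                                       CU_MEM_ALLOC_GRANULARITY_MINIMUM) ==
         CUDA_SUCCESS);

  // Reserve a large virtual range once; this consumes no device memory.
  const size_t va_size = 1ull << 30;  // 1 GiB of address space
  CUdeviceptr va_base;
  assert(cuMemAddressReserve(&va_base, va_size, /*alignment=*/0,
                             /*addr=*/0, /*flags=*/0) == CUDA_SUCCESS);

  // "Grow" twice. Each new physical chunk is mapped directly after the
  // previous one, so the backed region stays contiguous.
  size_t mapped = 0;
  for (int i = 0; i < 2; ++i) {
    CUmemGenericAllocationHandle handle;
    assert(cuMemCreate(&handle, granularity, &prop, 0) == CUDA_SUCCESS);
    assert(cuMemMap(va_base + mapped, granularity, /*offset=*/0, handle, 0) ==
           CUDA_SUCCESS);
    // The mapping holds its own reference; the handle can be released now.
    assert(cuMemRelease(handle) == CUDA_SUCCESS);
    CUmemAccessDesc access = {};
    access.location = prop.location;
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    assert(cuMemSetAccess(va_base + mapped, granularity, &access, 1) ==
           CUDA_SUCCESS);
    mapped += granularity;
  }

  for (size_t off = 0; off < mapped; off += granularity) {
    assert(cuMemUnmap(va_base + off, granularity) == CUDA_SUCCESS);
  }
  assert(cuMemAddressFree(va_base, va_size) == CUDA_SUCCESS);
  assert(cuCtxDestroy(ctx) == CUDA_SUCCESS);
  return 0;
}

Because each mapping extends the same reserved range, the BFC allocator above it sees one coalescable region rather than the scattered segments cuMemAlloc would return.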

View File

@@ -645,7 +645,7 @@ std::pair<int, Allocator*> GetDeviceAndAllocator(const ConversionParams& params,
     // allocator must have been initialized already, so the
     // GetGPUAllocator() call won't create a new allocator.
     dev_allocator = GPUProcessState::singleton()->GetGPUAllocator(
-        gpu_options, tf_gpu_id, 1);
+        gpu_options, tf_gpu_id, /*total_bytes=*/1, /*peer_gpu_ids=*/{});
   }
   return std::make_pair(cuda_device_id, dev_allocator);
 }

View File

@@ -48,10 +48,10 @@ BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
       free_chunks_list_(kInvalidChunkHandle),
       next_allocation_id_(1) {
   if (allow_growth) {
-    // 1MiB smallest initial allocation, unless total memory available
+    // 2MiB smallest initial allocation, unless total memory available
     // is less.
     curr_region_allocation_bytes_ =
-        RoundedBytes(std::min(total_memory, size_t{1048576}));
+        RoundedBytes(std::min(total_memory, size_t{2 << 20}));
   } else {
     curr_region_allocation_bytes_ = RoundedBytes(total_memory);
   }
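
The 2 MiB floor lines up with the physical-allocation granularity of the CUDA virtual memory APIs, which is typically 2 MiB per device, so even the first region of a growing allocator can be backed by whole physical pages; that connection is an inference here, not stated in the commit. A quick check of the constant and of the device's actual granularity (assumes CUDA 10.2+, device 0; illustration only):

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cuda.h>

int main() {
  static_assert(size_t{2 << 20} == size_t{2} * 1024 * 1024,
                "2 << 20 is exactly 2 MiB");

  assert(cuInit(0) == CUDA_SUCCESS);
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = 0;  // device 0

  // Reports the smallest chunk the VMM APIs can map on this device.
  size_t granularity = 0;
  assert(cuMemGetAllocationGranularity(&granularity, &prop,
                                       CU_MEM_ALLOC_GRANULARITY_MINIMUM) ==
         CUDA_SUCCESS);
  std::printf("minimum mapping granularity: %zu bytes\n", granularity);
  return 0;
}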

View File

@@ -600,6 +600,7 @@ class BFCAllocator : public Allocator {
 #endif
   friend class GPUBFCAllocatorPrivateMethodsTest;
+  friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;
   TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
 };

View File

@@ -149,7 +149,7 @@ class GPUDeviceTestHelper {
         DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0");
     gpu_.reset(reinterpret_cast<BaseGPUDevice*>(device_.release()));
     gpu_allocator_ = GPUProcessState::singleton()->GetGPUAllocator(
-        GPUOptions(), TfGpuId(0), memory_limit);
+        GPUOptions(), TfGpuId(0), memory_limit, /*peer_gpu_ids=*/{});
     host_allocator_ = GPUProcessState::singleton()->GetGpuHostAllocator(0);
   }

View File

@@ -158,6 +158,7 @@ tf_cuda_library(
         "//tensorflow/core/profiler/lib:annotated_traceme",
         "//tensorflow/core/profiler/lib:scoped_annotation",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
     alwayslink = 1,
 )

View File

@@ -73,11 +73,11 @@ bool GPUBFCAllocator::GetGarbageCollectionValue() {
   return true;
 }
 
-GPUBFCAllocator::GPUBFCAllocator(DeviceMemAllocator* sub_allocator,
+GPUBFCAllocator::GPUBFCAllocator(SubAllocator* sub_allocator,
                                  size_t total_memory, const string& name)
     : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {}
 
-GPUBFCAllocator::GPUBFCAllocator(DeviceMemAllocator* sub_allocator,
+GPUBFCAllocator::GPUBFCAllocator(SubAllocator* sub_allocator,
                                  size_t total_memory,
                                  const GPUOptions& gpu_options,
                                  const string& name)

View File

@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/device/device_mem_allocator.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -33,9 +34,9 @@ namespace tensorflow {
 // algorithm.
 class GPUBFCAllocator : public BFCAllocator {
  public:
-  GPUBFCAllocator(DeviceMemAllocator* sub_allocator, size_t total_memory,
+  GPUBFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                   const string& name);
-  GPUBFCAllocator(DeviceMemAllocator* sub_allocator, size_t total_memory,
+  GPUBFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
                   const GPUOptions& gpu_options, const string& name);
   ~GPUBFCAllocator() override {}

View File

@@ -21,26 +21,29 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device/device_id.h"
 #include "tensorflow/core/common_runtime/device/device_id_utils.h"
 #include "tensorflow/core/common_runtime/device/device_mem_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/strcat.h"
+#include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
 
 namespace tensorflow {
 namespace {
 
-static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
-                       int64 peak_bytes_in_use, int64 largest_alloc_size) {
+void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
                int64 peak_bytes_in_use, int64 largest_alloc_size) {
   absl::optional<AllocatorStats> stats = a->GetStats();
   EXPECT_TRUE(stats);
   if (!stats) {
@@ -53,19 +56,54 @@ static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
   EXPECT_EQ(stats->largest_alloc_size, largest_alloc_size);
 }
 
-se::StreamExecutor* ExecutorForPlatformGpuId(
-    PlatformDeviceId platform_device_id) {
-  return DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                   platform_device_id)
-      .ValueOrDie();
-}
+class GPUBFCAllocatorTest
+    : public ::testing::TestWithParam<SubAllocator* (*)(size_t)> {};
+
+#if CUDA_VERSION >= 10020
+SubAllocator* CreateVirtualMemorySubAllocator(
+    size_t virtual_address_space_size = 1ull << 32) {
+  PlatformDeviceId gpu_id(0);
+  auto executor =
+      DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
+          .ValueOrDie();
+  auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
+      executor->implementation()->GpuContextHack());
+  return GpuVirtualMemAllocator::Create({}, {}, *gpu_context, gpu_id,
+                                        virtual_address_space_size, {})
+      .ValueOrDie()
+      .release();
+}
+#endif
+
+SubAllocator* CreateGPUMemAllocator(size_t) {
+  PlatformDeviceId gpu_id(0);
+  return new DeviceMemAllocator(
+      DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(), gpu_id)
+          .ValueOrDie(),
+      gpu_id,
+      /*use_unified_memory=*/false, {}, {});
+}
 
-TEST(GPUBFCAllocatorTest, NoDups) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+SubAllocator* CreateSubAllocator(size_t virtual_address_space_size = 1ull
+                                                                     << 32) {
+#if CUDA_VERSION >= 10020
+  return CreateVirtualMemorySubAllocator(virtual_address_space_size);
+#else
+  return CreateGPUMemAllocator(virtual_address_space_size);
+#endif
+}
+
+auto TestSuiteValues() {
+#if CUDA_VERSION >= 10020
+  return ::testing::Values(&CreateGPUMemAllocator,
+                           &CreateVirtualMemorySubAllocator);
+#else
+  return ::testing::Values(&CreateGPUMemAllocator);
+#endif
+}
+
+TEST_P(GPUBFCAllocatorTest, NoDups) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   // Allocate a lot of raw pointers
@@ -93,12 +131,8 @@ TEST(GPUBFCAllocatorTest, NoDups) {
   CheckStats(&a, 1023, 0, 654336, 1024);
 }
 
-TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   // Allocate 256 raw pointers of sizes between 100 bytes and about
   // a meg
   random::PhiloxRandom philox(123, 17);
@@ -155,12 +189,8 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
   }
 }
 
-TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, ExerciseCoalescing) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   CheckStats(&a, 0, 0, 0, 0);
 
   float* first_ptr = TypedAllocator::Allocate<float>(&a, 1024, {});
@@ -194,63 +224,43 @@ TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
   a.DeallocateRaw(first_ptr_after);
 }
 
-TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, AllocateZeroBufSize) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   float* ptr = TypedAllocator::Allocate<float>(&a, 0, {});
   EXPECT_EQ(nullptr, ptr);
 }
 
-TEST(GPUBFCAllocatorTest, TracksSizes) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, TracksSizes) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   EXPECT_EQ(true, a.TracksAllocationSizes());
 }
 
-TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, AllocatedVsRequested) {
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
   float* t1 = TypedAllocator::Allocate<float>(&a, 1, {});
   EXPECT_EQ(4, a.RequestedSize(t1));
   EXPECT_EQ(256, a.AllocatedSize(t1));
   a.DeallocateRaw(t1);
 }
 
-TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  // Configure a 1MiB byte limit
-  GPUBFCAllocator a(sub_allocator, 1 << 20, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
+  // Configure a 2MiB byte limit
+  GPUBFCAllocator a(GetParam()(1ull << 32), 2 << 20, "GPU_0_bfc");
 
   float* first_ptr = TypedAllocator::Allocate<float>(&a, 1 << 6, {});
-  float* second_ptr = TypedAllocator::Allocate<float>(&a, 1 << 20, {});
+  float* second_ptr = TypedAllocator::Allocate<float>(&a, 2 << 20, {});
 
   EXPECT_NE(nullptr, first_ptr);
   EXPECT_EQ(nullptr, second_ptr);
   a.DeallocateRaw(first_ptr);
 }
 
-TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
+TEST_P(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   GPUOptions options;
   options.set_allow_growth(true);
 
   // Max of 2GiB, but starts out small.
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1LL << 31, "GPU_0_bfc");
+  GPUBFCAllocator a(GetParam()(1ull << 32), 1LL << 31, "GPU_0_bfc");
 
   // Allocate 10 raw pointers of sizes between 100 bytes and about
   // 64 megs.
@@ -311,28 +321,20 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
   }
 }
 
-TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1UL << 60, "GPU_0_bfc");
-  sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator b(sub_allocator, 1UL << 60, "GPU_0_bfc");
+TEST_P(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
+  GPUBFCAllocator a(GetParam()(1ul << 62), 1UL << 60, "GPU_0_bfc");
+  GPUBFCAllocator b(GetParam()(1ul << 62), 1UL << 60, "GPU_0_bfc");
   void* amem = a.AllocateRaw(1, 1);
   void* bmem = b.AllocateRaw(1, 1 << 30);
   a.DeallocateRaw(amem);
   b.DeallocateRaw(bmem);
 }
 
+INSTANTIATE_TEST_SUITE_P(GPUBFCAllocatorTestSuite, GPUBFCAllocatorTest,
+                         TestSuiteValues());
+
 static void BM_Allocation(int iters) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+  GPUBFCAllocator a(CreateSubAllocator(1ul << 36), 1uLL << 33, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<size_t> sizes = {256, 4096, 16384, 524288,
                                512, 1048576, 10485760, 104857600,
@@ -348,11 +350,7 @@ static void BM_Allocation(int iters) {
 BENCHMARK(BM_Allocation);
 
 static void BM_AllocationThreaded(int iters, int num_threads) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1uLL << 33, "GPU_0_bfc");
+  GPUBFCAllocator a(CreateSubAllocator(1ul << 36), 1uLL << 33, "GPU_0_bfc");
   thread::ThreadPool pool(Env::Default(), "test", num_threads);
   std::atomic_int_fast32_t count(iters);
   mutex done_lock;
@@ -388,11 +386,7 @@ BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);
 // A more complex benchmark that defers deallocation of an object for
 // "delay" allocations.
 static void BM_AllocationDelayed(int iters, int delay) {
-  PlatformGpuId platform_gpu_id(0);
-  DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-      ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-      false /*use_unified_memory*/, {}, {});
-  GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+  GPUBFCAllocator a(CreateSubAllocator(1ull << 32), 1 << 30, "GPU_0_bfc");
   // Exercise a few different allocation sizes
   std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
   int size_index = 0;
@@ -423,7 +417,8 @@ BENCHMARK(BM_AllocationDelayed)->Arg(1)->Arg(10)->Arg(100)->Arg(1000);
 
 }  // namespace
 
-class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
+class GPUBFCAllocatorPrivateMethodsTest
+    : public ::testing::TestWithParam<SubAllocator* (*)(size_t)> {
  protected:
   void SetUp() override { CHECK_EQ(unsetenv("TF_FORCE_GPU_ALLOW_GROWTH"), 0); }
@@ -432,11 +427,7 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   // only methods inside this class can access private members of BFCAllocator.
   void TestBinDebugInfo() {
-    PlatformGpuId platform_gpu_id(0);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1 << 30, "GPU_0_bfc");
+    GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc");
 
     std::vector<void*> initial_ptrs;
     std::vector<size_t> initial_ptrs_allocated_sizes;
@@ -532,11 +523,8 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestLog2FloorNonZeroSlow() {
-    PlatformGpuId platform_gpu_id(0);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1 /* total_memory */, "GPU_0_bfc");
+    GPUBFCAllocator a(GetParam()(1ull << 32), 1 /* total_memory */,
+                      "GPU_0_bfc");
     EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
     EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
     EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
@@ -547,65 +535,126 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
   }
 
   void TestForceAllowGrowth() {
-    PlatformGpuId platform_gpu_id(0);
     GPUOptions options;
     // Unset flag value uses provided option.
     unsetenv("TF_FORCE_GPU_ALLOW_GROWTH");
     options.set_allow_growth(true);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator unset_flag_allocator(sub_allocator, 1LL << 31, options,
-                                         "GPU_0_bfc");
-    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{1048576}),
+    GPUBFCAllocator unset_flag_allocator(GetParam()(1ull << 32), 1LL << 31,
+                                         options, "GPU_0_bfc");
+    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{2 << 20}),
               unset_flag_allocator.curr_region_allocation_bytes_);
 
     // Unparseable flag value uses provided option.
     setenv("TF_FORCE_GPU_ALLOW_GROWTH", "unparseable", 1);
     options.set_allow_growth(true);
-    sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator unparsable_flag_allocator(sub_allocator, 1LL << 31, options,
-                                              "GPU_1_bfc");
-    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{1048576}),
+    GPUBFCAllocator unparsable_flag_allocator(GetParam()(1ull << 32), 1LL << 31,
+                                              options, "GPU_1_bfc");
+    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{2 << 20}),
               unparsable_flag_allocator.curr_region_allocation_bytes_);
 
     // Max of 2GiB total memory. Env variable set forces allow_growth, which
     // does an initial allocation of 1MiB.
     setenv("TF_FORCE_GPU_ALLOW_GROWTH", "true", 1);
     options.set_allow_growth(false);
-    sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator force_allow_growth_allocator(sub_allocator, 1LL << 31,
-                                                 options, "GPU_2_bfc");
-    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{1048576}),
+    GPUBFCAllocator force_allow_growth_allocator(
+        GetParam()(1ull << 32), 1LL << 31, options, "GPU_2_bfc");
+    EXPECT_EQ(GPUBFCAllocator::RoundedBytes(size_t{2 << 20}),
              force_allow_growth_allocator.curr_region_allocation_bytes_);
 
     // If env variable forces allow_growth disabled, all available memory is
     // allocated.
     setenv("TF_FORCE_GPU_ALLOW_GROWTH", "false", 1);
     options.set_allow_growth(true);
-    sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        false /*use_unified_memory*/, {}, {});
-    GPUBFCAllocator force_no_allow_growth_allocator(sub_allocator, 1LL << 31,
-                                                    options, "GPU_3_bfc");
+    GPUBFCAllocator force_no_allow_growth_allocator(
+        GetParam()(1ull << 32), 1LL << 31, options, "GPU_3_bfc");
     EXPECT_EQ(GPUBFCAllocator::RoundedBytes(1LL << 31),
               force_no_allow_growth_allocator.curr_region_allocation_bytes_);
   }
+};
+
+TEST_P(GPUBFCAllocatorPrivateMethodsTest, BinDebugInfo) { TestBinDebugInfo(); }
+
+TEST_P(GPUBFCAllocatorPrivateMethodsTest, Log2FloorNonZeroSlow) {
+  TestLog2FloorNonZeroSlow();
+}
+
+TEST_P(GPUBFCAllocatorPrivateMethodsTest, ForceAllowGrowth) {
+  TestForceAllowGrowth();
+}
+
+INSTANTIATE_TEST_SUITE_P(GPUBFCAllocatorPrivateMethodTestSuite,
+                         GPUBFCAllocatorPrivateMethodsTest, TestSuiteValues());
+
+// Tests that cannot be trivially parameterized for both suballocator types.
+class GPUBFCAllocatorTest_SubAllocatorSpecific : public ::testing::Test {};
+
+#if CUDA_VERSION >= 10020
+// Benchmark for measuring "high water mark" for BFCAllocator owned memory.
+TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
+       VirtualAllocatorPromotesReuse) {
+  GPUOptions options;
+  options.set_allow_growth(true);
+
+  constexpr size_t k512MiB = 512ull << 20;
+
+  // 512 MiB allocator.
+  GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1ull << 32), k512MiB,
+                    options, "GPU_0_bfc");
+  // Allocate 128 raw pointers of 4 megs.
+  const size_t size = 1LL << 22;
+  std::vector<void*> initial_ptrs;
+  for (size_t s = 0; s < 128; s++) {
+    void* raw = a.AllocateRaw(1, size);
+    initial_ptrs.push_back(raw);
+  }
+  // Deallocate all but the last one so the big chunk cannot be GC'd
+  for (int i = 0; i < 127; ++i) {
+    a.DeallocateRaw(initial_ptrs[i]);
+  }
+  void* big_alloc = a.AllocateRaw(1, k512MiB - size);
+  EXPECT_NE(big_alloc, nullptr);
+}
+#endif
+
+TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
+       PhysicalAllocatorOomsFragmentation) {
+  GPUOptions options;
+  options.set_allow_growth(true);
+
+  constexpr size_t k512MiB = 512ull << 20;
+
+  // 512 MiB allocator. Garbage Collection turned off to simulate a situation
+  // where there is memory pressure.
+  GPUBFCAllocator a(CreateGPUMemAllocator(/*ignored*/ 0), k512MiB, options,
+                    "GPU_0_bfc");
+  // Allocate 128 raw pointers of 4 megs.
+  const size_t size = 1LL << 22;
+  std::vector<void*> initial_ptrs;
+  for (size_t s = 0; s < 128; s++) {
+    void* raw = a.AllocateRaw(1, size);
+    initial_ptrs.push_back(raw);
+  }
+  // Deallocate all but the last one so the big chunk cannot be GC'd
+  for (int i = 0; i < 127; ++i) {
+    a.DeallocateRaw(initial_ptrs[i]);
+  }
+
+  void* big_alloc = a.AllocateRaw(1, k512MiB - size);
+  EXPECT_EQ(big_alloc, nullptr);
+}
+
+// Tests that use private functions and cannot be trivially parameterized for
+// both suballocator types.
+class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific
+    : public ::testing::Test {
+ protected:
+  void SetUp() override { CHECK_EQ(unsetenv("TF_FORCE_GPU_ALLOW_GROWTH"), 0); }
 
   void TestRegionDeallocation() {
     GPUOptions options;
     options.set_allow_growth(true);
 
     // Max of 2GiB, but starts out small.
-    PlatformGpuId platform_gpu_id(0);
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        ExecutorForPlatformGpuId(platform_gpu_id), platform_gpu_id,
-        /*use_unified_memory=*/false, {}, {});
-    GPUBFCAllocator a(sub_allocator, 1LL << 31, options, "GPU_0_bfc");
+    GPUBFCAllocator a(CreateGPUMemAllocator(/*ignored*/ 0), 1LL << 31, options,
+                      "GPU_0_bfc");
 
     // Allocate 128 raw pointers of 4 megs.
     const size_t size = 1LL << 22;
@@ -641,22 +690,59 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
     }
     EXPECT_EQ(1, num_chunks_in_bins);
   }
 
+#if CUDA_VERSION >= 10020
+  // Counterpart to the GPUMemAllocator test suite TestRegionDeallocation tests.
+  // Here we expect no deallocations because all allocations are coalesced into
+  // a single region.
+  void TestNoRegionDeallocation() {
+    GPUOptions options;
+    options.set_allow_growth(true);
+
+    // Max of 2GiB, but starts out small.
+    GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1uLL << 32), 1LL << 31,
+                      options, "GPU_0_bfc");
+
+    // Allocate 128 raw pointers of 4 megs.
+    const size_t size = 1LL << 22;
+    std::vector<void*> initial_ptrs;
+    for (size_t s = 0; s < 128; s++) {
+      void* raw = a.AllocateRaw(1, size);
+      initial_ptrs.push_back(raw);
+    }
+
+    {
+      mutex_lock l(a.lock_);
+      EXPECT_EQ(1, a.region_manager_.regions().size());
+    }
+
+    // Deallocate all the memories except the last one.
+    for (size_t i = 0; i < initial_ptrs.size() - 1; i++) {
+      a.DeallocateRaw(initial_ptrs[i]);
+    }
+
+    // Deallocate free regions and there should still be only one.
+    EXPECT_EQ(false, a.DeallocateFreeRegions(/*rounded_bytes=*/0));
+    {
+      mutex_lock l(a.lock_);
+      EXPECT_EQ(1, a.region_manager_.regions().size());
+    }
+  }
+#endif
 };
 
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, BinDebugInfo) { TestBinDebugInfo(); }
-
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, Log2FloorNonZeroSlow) {
-  TestLog2FloorNonZeroSlow();
-}
-
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, ForceAllowGrowth) {
-  TestForceAllowGrowth();
-}
-
-TEST_F(GPUBFCAllocatorPrivateMethodsTest, TestRegionDeallocation) {
+TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
+       TestRegionDeallocation) {
   TestRegionDeallocation();
 }
 
+#if CUDA_VERSION >= 10020
+TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
+       TestNoRegionDeallocation) {
+  TestNoRegionDeallocation();
+}
+#endif
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@@ -1371,7 +1371,7 @@ Status BaseGPUDeviceFactory::CreateDevices(
                                      tf_gpu_id.value());
     }
     TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
-                                       it->second, devices));
+                                       it->second, num_tf_gpus, devices));
   }
   return Status::OK();
 }
@@ -1400,7 +1400,7 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
 Status BaseGPUDeviceFactory::CreateGPUDevice(
     const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
-    int64 memory_limit, const DeviceLocality& dev_locality,
+    int64 memory_limit, const DeviceLocality& dev_locality, size_t num_tf_gpus,
     std::vector<std::unique_ptr<Device>>* devices) {
   CHECK_GE(tf_gpu_id.value(), 0);
   const string device_name =
@@ -1418,9 +1418,19 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
     return desc_status.status();
   }
   auto desc = desc_status.ConsumeValueOrDie();
+
+  std::vector<TfGpuId> peer_gpu_ids;
+  peer_gpu_ids.reserve(num_tf_gpus);
+  for (int id = 0; id < num_tf_gpus; ++id) {
+    TfGpuId peer_tf_gpu_id(id);
+    if (peer_tf_gpu_id != tf_gpu_id) {
+      peer_gpu_ids.push_back(peer_tf_gpu_id);
+    }
+  }
+
   GPUProcessState* process_state = GPUProcessState::singleton();
   Allocator* gpu_allocator = process_state->GetGPUAllocator(
-      options.config.gpu_options(), tf_gpu_id, memory_limit);
+      options.config.gpu_options(), tf_gpu_id, memory_limit, peer_gpu_ids);
   if (gpu_allocator == nullptr) {
     return errors::Internal("Failed to get memory allocator for TF GPU ",
                             tf_gpu_id.value(), " with ", memory_limit,

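Why the factory now passes every other TF GPU id down to the allocator: memory mapped through the CUDA virtual memory APIs is visible only to the devices named when access is granted on each mapping, so the allocator must know its peers up front rather than relying on runtime peer-access enablement. A hedged sketch of the underlying driver call; GrantPeerAccess is a hypothetical helper for illustration, not part of this commit.

#include <cassert>
#include <cstddef>
#include <cuda.h>

// Grant device `peer` read/write access to a VMM-backed range mapped on
// device `owner`. With the CUDA virtual memory APIs this must be done per
// mapping, which is why the allocator wants the peer list at construction.
void GrantPeerAccess(CUdeviceptr ptr, size_t size, int owner, int peer) {
  CUmemAccessDesc desc[2] = {};
  desc[0].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  desc[0].location.id = owner;
  desc[0].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  desc[1].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  desc[1].location.id = peer;
  desc[1].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  assert(cuMemSetAccess(ptr, size, desc, 2) == CUDA_SUCCESS);
}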
View File

@@ -354,6 +354,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   Status CreateGPUDevice(const SessionOptions& options,
                          const std::string& name_prefix, TfGpuId tf_gpu_id,
                          int64 memory_limit, const DeviceLocality& dev_locality,
+                         size_t num_tf_gpus,
                          std::vector<std::unique_ptr<Device>>* devices);
 
   virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(

View File

@@ -18,6 +18,7 @@ limitations under the License.
 #include <cstring>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/common_runtime/device/device_host_allocator.h"
 #include "tensorflow/core/common_runtime/device/device_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
@@ -26,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -77,9 +79,61 @@ int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
   return numa_node >= 0 ? numa_node : 0;
 }
 
-Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
-                                            TfGpuId tf_gpu_id,
-                                            size_t total_bytes) {
+// NOLINTNEXTLINE: clang-tidy complains this is unused because of build flags.
+static SubAllocator* CreateSubAllocator(
+    const GPUOptions& options, PlatformGpuId platform_gpu_id,
+    const std::vector<SubAllocator::Visitor>& alloc_visitors,
+    size_t total_bytes, const std::vector<TfGpuId>& peer_gpu_ids) {
+  auto executor = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
+                                                            platform_gpu_id)
+                      .ValueOrDie();
+
+#if defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
+  // Use the old allocator when unified memory is required.
+  // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
+  if (options.per_process_gpu_memory_fraction() > 1.0 ||
+      options.experimental().use_unified_memory()) {
+    return new DeviceMemAllocator(executor, platform_gpu_id,
+                                  /*use_unified_memory=*/true, alloc_visitors,
+                                  {});
+  } else {
+    auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
+        executor->implementation()->GpuContextHack());
+
+    absl::flat_hash_set<PlatformGpuId> platform_peer_gpu_ids;
+    platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
+    for (const TfGpuId tf_gpu_id : peer_gpu_ids) {
+      PlatformGpuId platform_gpu_id;
+      TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
+      platform_peer_gpu_ids.insert(platform_gpu_id);
+    }
+    std::vector<PlatformGpuId> platform_peer_gpu_ids_vec(
+        platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
+
+    // Adjust virtual address space to be slightly larger than the physical
+    // address space in case the BFC allocator performs suboptimal garbage
+    // collection.
+    // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
+    // the va space.
+    return GpuVirtualMemAllocator::Create(
+               alloc_visitors, {}, *gpu_context, platform_gpu_id,
+               /*virtual_address_space_size=*/total_bytes * 2,
+               platform_peer_gpu_ids_vec)
+        .ValueOrDie()
+        .release();
+  }
+#else
+  return new DeviceMemAllocator(
+      executor, platform_gpu_id,
+      (options.per_process_gpu_memory_fraction() > 1.0 ||
+       options.experimental().use_unified_memory()),
+      alloc_visitors, {});
+#endif
+}
+
+Allocator* GPUProcessState::GetGPUAllocator(
+    const GPUOptions& options, TfGpuId tf_gpu_id, size_t total_bytes,
+    const std::vector<TfGpuId>& peer_gpu_ids) {
   CHECK(process_state_);
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
@@ -107,14 +161,9 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
     while (bus_id >= gpu_visitors_.size()) {
       gpu_visitors_.push_back({});
     }
-    DeviceMemAllocator* sub_allocator = new DeviceMemAllocator(
-        DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
-                                                  platform_gpu_id)
-            .ValueOrDie(),
-        platform_gpu_id,
-        (options.per_process_gpu_memory_fraction() > 1.0 ||
-         options.experimental().use_unified_memory()),
-        gpu_visitors_[bus_id], {});
+    auto* sub_allocator =
+        CreateSubAllocator(options, platform_gpu_id, gpu_visitors_[bus_id],
+                           total_bytes, peer_gpu_ids);
     GPUBFCAllocator* gpu_bfc_allocator =
         new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));

View File

@@ -82,7 +82,8 @@ class GPUProcessState {
   // REQUIRES: tf_gpu_id must be a valid id for a BaseGPUDevice available in the
   // current system environment. Otherwise returns nullptr.
   virtual Allocator* GetGPUAllocator(const GPUOptions& options,
-                                     TfGpuId tf_gpu_id, size_t total_bytes);
+                                     TfGpuId tf_gpu_id, size_t total_bytes,
+                                     const std::vector<TfGpuId>& peer_gpu_ids);
 
   int NumGPUAllocators() {
     mutex_lock l(mu_);