From c7e57df25686a9a924b0e766095e634637cdd36b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Jan 2021 16:03:30 -0800
Subject: [PATCH] Use device handle instead of gpu ordinal in
 GpuVirtualMemAllocator for configuring peer access.

Check that peers support virtual address management and have peer access at
GpuVirtualMemAllocator creation time.

PiperOrigin-RevId: 353124035
Change-Id: I8549253b84af35a664e9a6810c6b40deba20a532
---
 tensorflow/core/common_runtime/gpu/BUILD     |  1 +
 .../gpu/gpu_virtual_mem_allocator.cc         | 56 +++++++++++++++----
 .../gpu/gpu_virtual_mem_allocator.h          | 15 +++--
 .../stream_executor/cuda/cuda_driver.cc      | 26 +++++----
 tensorflow/stream_executor/gpu/gpu_driver.h  | 16 ++++--
 5 files changed, 80 insertions(+), 34 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD
index fbd14215a40..f413ba6b609 100644
--- a/tensorflow/core/common_runtime/gpu/BUILD
+++ b/tensorflow/core/common_runtime/gpu/BUILD
@@ -220,6 +220,7 @@ tf_cuda_library(
         "//tensorflow/stream_executor:platform",
         "//tensorflow/stream_executor:stream_executor_headers",
         "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.cc
index 4e0f97691f9..d2b632208b5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.cc
@@ -15,7 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 
+#include "absl/strings/str_format.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/stream_executor/lib/status.h"
 
 #if CUDA_VERSION >= 10020
 
@@ -23,8 +25,11 @@ namespace tensorflow {
 namespace {
 
 using ::stream_executor::gpu::GpuContext;
+using ::stream_executor::gpu::GpuDeviceHandle;
 using ::stream_executor::gpu::GpuDevicePtr;
 using ::stream_executor::gpu::GpuDriver;
+using ::stream_executor::port::Status;
+using ::stream_executor::port::StatusOr;
 
 // Rounds value up to the specified power of two alignment.
 size_t AlignUp(size_t value, size_t alignment) {
@@ -33,6 +38,23 @@ size_t AlignUp(size_t value, size_t alignment) {
   return (value + alignment - 1) & ~(alignment - 1);
 }
 
+StatusOr<bool> SupportsVirtualAddressManagement(GpuDeviceHandle device) {
+  return GpuDriver::GetDeviceAttribute(
+      CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device);
+}
+
+Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
+                                            PlatformGpuId gpu_id) {
+  TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
+                      SupportsVirtualAddressManagement(device));
+  if (!supports_virtual_address_management) {
+    return stream_executor::port::InternalError(absl::StrFormat(
+        "GPU %d does not support virtual memory address management.",
+        gpu_id.value()));
+  }
+  return {};
+}
+
 }  // namespace
 
 /* static */ stream_executor::port::StatusOr<
@@ -42,19 +64,31 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
                                GpuContext& gpu_context, PlatformGpuId gpu_id,
                                size_t virtual_address_space_size,
                                const std::vector<PlatformGpuId>& peer_gpu_ids) {
-  std::vector<int> access_gpu_ordinals;
-  access_gpu_ordinals.reserve(peer_gpu_ids.size() + 1);
-  access_gpu_ordinals.push_back(gpu_id.value());
+  std::vector<GpuDeviceHandle> access_gpu_handles;
+  access_gpu_handles.reserve(peer_gpu_ids.size() + 1);
+
+  GpuDeviceHandle gpu_handle;
+  TF_RETURN_IF_ERROR(GpuDriver::GetDevice(gpu_id.value(), &gpu_handle));
+  TF_RETURN_IF_ERROR(CheckVirtualAddressManagementSupport(gpu_handle, gpu_id));
+
+  access_gpu_handles.push_back(gpu_handle);
   for (const auto& peer_id : peer_gpu_ids) {
-    access_gpu_ordinals.push_back(peer_id.value());
+    GpuDeviceHandle peer_handle;
+    TF_RETURN_IF_ERROR(GpuDriver::GetDevice(peer_id.value(), &peer_handle));
+    TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
+                        SupportsVirtualAddressManagement(peer_handle));
+    if (GpuDriver::CanEnablePeerAccess(gpu_handle, peer_handle) &&
+        supports_virtual_address_management) {
+      access_gpu_handles.push_back(peer_handle);
+    }
   }
 
   // Find the min granularity for all devices that have access to this memory;
   // that is, the maximum min granularity among all devices.
   size_t max_granularity = 1;
-  for (const int device_ordinal : access_gpu_ordinals) {
+  for (const auto device_handle : access_gpu_handles) {
     TF_ASSIGN_OR_RETURN(size_t granularity,
-                        GpuDriver::GetMinAllocationGranularity(device_ordinal));
+                        GpuDriver::GetMinAllocationGranularity(device_handle));
     max_granularity = std::max(max_granularity, granularity);
   }
 
@@ -71,18 +105,18 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
 
   return std::unique_ptr<GpuVirtualMemAllocator>(new GpuVirtualMemAllocator(
       alloc_visitors, free_visitors, gpu_context, gpu_id,
-      std::move(access_gpu_ordinals), vmem, max_granularity));
+      std::move(access_gpu_handles), vmem, max_granularity));
 }
 
 GpuVirtualMemAllocator::GpuVirtualMemAllocator(
     const std::vector<Visitor>& alloc_visitors,
    const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
-    PlatformGpuId gpu_id, const std::vector<int> access_gpu_ordinals,
+    PlatformGpuId gpu_id, const std::vector<GpuDeviceHandle> access_gpu_handles,
     GpuDriver::VmemSpan vmem, size_t granularity)
     : SubAllocator(alloc_visitors, free_visitors),
       gpu_context_(gpu_context),
       gpu_id_(gpu_id),
-      access_gpu_ordinals_(access_gpu_ordinals),
+      access_gpu_handles_(access_gpu_handles),
       vmem_(vmem),
       granularity_(granularity) {}
 
@@ -122,8 +156,8 @@ void* GpuVirtualMemAllocator::Alloc(size_t alignment, size_t num_bytes,
   GpuDriver::GenericMemoryHandle handle = std::move(maybe_handle).ValueOrDie();
 
   // Map VAs for this physical memory.
-  auto status = GpuDriver::MapMemory(&gpu_context_, next_va, handle,
-                                     access_gpu_ordinals_);
+  auto status =
+      GpuDriver::MapMemory(&gpu_context_, next_va, handle, access_gpu_handles_);
   if (!status.ok()) {
     LOG(ERROR) << status;
     GpuDriver::ReleaseMemoryHandle(&gpu_context_, std::move(handle));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
index 23572262c42..65a716e3b3f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
@@ -71,13 +71,12 @@ class GpuVirtualMemAllocator : public SubAllocator {
   bool SupportsCoalescing() const override { return true; }
 
  private:
-  GpuVirtualMemAllocator(const std::vector<Visitor>& alloc_visitors,
-                         const std::vector<Visitor>& free_visitors,
-                         ::stream_executor::gpu::GpuContext& gpu_context,
-                         PlatformGpuId gpu_id,
-                         std::vector<int> access_gpu_ordinals,
-                         stream_executor::gpu::GpuDriver::VmemSpan vmem,
-                         size_t granularity);
+  GpuVirtualMemAllocator(
+      const std::vector<Visitor>& alloc_visitors,
+      const std::vector<Visitor>& free_visitors,
+      stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
+      std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles,
+      stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity);
 
   stream_executor::gpu::GpuContext& gpu_context_;
   PlatformGpuId gpu_id_;
@@ -86,7 +85,7 @@ class GpuVirtualMemAllocator : public SubAllocator {
   // all gpus that may want to read the memory. This list also includes the
   // above gpu_id_ to facilitate the invocation of the GpuDriver::MapMemory
   // function.
-  const std::vector<int> access_gpu_ordinals_;
+  const std::vector<stream_executor::gpu::GpuDeviceHandle> access_gpu_handles_;
 
   // The virtual memory span held by this allocator.
   stream_executor::gpu::GpuDriver::VmemSpan vmem_;
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index c168e67b28b..61735e3870c 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -916,11 +916,11 @@ GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
 }
 
 /* static */ port::StatusOr<size_t> GpuDriver::GetMinAllocationGranularity(
-    int device_ordinal) {
+    GpuDeviceHandle device) {
   CUmemAllocationProp props = {};
   props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
   props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  props.location.id = device_ordinal;
+  props.location.id = device;
 
   size_t granularity;
   CUresult res = cuMemGetAllocationGranularity(
@@ -970,7 +970,7 @@ GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
 /* static */ port::Status GpuDriver::MapMemory(
     GpuContext* context, CUdeviceptr va,
     const GpuDriver::GenericMemoryHandle& handle,
-    const std::vector<int>& device_ordinals) {
+    const std::vector<GpuDeviceHandle>& device_handles) {
   ScopedActivateContext activation(context);
 
   auto device = DeviceFromContext(context);
@@ -986,9 +986,9 @@ GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
         "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
   }
 
-  std::vector<CUmemAccessDesc> access_descriptors(device_ordinals.size());
+  std::vector<CUmemAccessDesc> access_descriptors(device_handles.size());
   for (int i = 0; i < access_descriptors.size(); ++i) {
-    access_descriptors[i].location.id = device_ordinals[i];
+    access_descriptors[i].location.id = device_handles[i];
     access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
     access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   }
@@ -1574,7 +1574,6 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
     return true;  // A context can always access its own memory.
   }
 
-  int can_access_peer = -1;
   auto from_device = DeviceFromContext(from);
   if (!from_device.ok()) {
     LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
                << from_device.status();
     return false;
   }
@@ -1587,13 +1586,18 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                << to_device.status();
     return false;
   }
-  CUresult res = cuDeviceCanAccessPeer(
-      &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
-  if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
+  return CanEnablePeerAccess(from_device.ValueOrDie(), to_device.ValueOrDie());
+}
+
+/* static */ bool GpuDriver::CanEnablePeerAccess(GpuDeviceHandle from,
+                                                 GpuDeviceHandle to) {
+  int can_access_peer = -1;
+  CUresult result = cuDeviceCanAccessPeer(&can_access_peer, from, to);
+  if (result != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to detect peer access capability: "
+               << ToString(result);
     return false;
   }
-  return can_access_peer;
 }
diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h
index 955ed59926a..af1febcbc85 100644
--- a/tensorflow/stream_executor/gpu/gpu_driver.h
+++ b/tensorflow/stream_executor/gpu/gpu_driver.h
@@ -163,7 +163,8 @@ class GpuDriver {
   // Calculates the minimum alignment for memory allocations done through
   // cuMemCreate via cuMemGetAllocationGranularity.
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g30ee906c2cf66a0347b3dfec3d7eb31a
-  static port::StatusOr<size_t> GetMinAllocationGranularity(int device_ordinal);
+  static port::StatusOr<size_t> GetMinAllocationGranularity(
+      GpuDeviceHandle device);
 
   // Allocates physical memory and returns a handle that can be mapped to
   // virtual addresses via cuMemCreate. bytes must be a multiple of the
@@ -185,9 +186,9 @@ class GpuDriver {
   // cuMemMap and sets the appropriate access settings via cuMemSetAccess.
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gff1d395423af5c5c75375516959dae56
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g1b6b12b10e8324bf462ecab4e7ef30e1
-  static port::Status MapMemory(GpuContext* context, GpuDevicePtr va,
-                                const GenericMemoryHandle& handle,
-                                const std::vector<int>& device_ordinals);
+  static port::Status MapMemory(
+      GpuContext* context, GpuDevicePtr va, const GenericMemoryHandle& handle,
+      const std::vector<GpuDeviceHandle>& device_handles);
 
   // Unmaps the backing memory from the given virtual address range. This range
   // must fully unmap a memory handle that was mapped using MapMemory; partial
@@ -408,6 +409,13 @@ class GpuDriver {
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
   static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);
 
+  // Returns whether the from device can access memory in the to
+  // device via cuDeviceCanAccessPeer. Because of differences between ROCM and
+  // CUDA, this API is not supported in ROCM builds and will result in a link
+  // error if used.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
+  static bool CanEnablePeerAccess(GpuDeviceHandle from, GpuDeviceHandle to);
+
   // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
   static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);
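
For readers who want to see the creation-time gating in isolation, the sketch below restates the checks that GpuVirtualMemAllocator::Create now performs, written directly against the CUDA driver API rather than the GpuDriver wrappers used in the patch. It is illustrative only: the helper names SupportsVaManagement and AccessibleDevices are hypothetical, cuInit/context setup is omitted, and failures simply skip a device, whereas Create returns an error outright if the owning GPU lacks virtual address management support.

// Illustrative sketch only; not part of the patch above.
#include <cuda.h>

#include <vector>

// True if `device` supports the cuMemCreate/cuMemMap virtual address
// management APIs (CUDA 10.2+), per cuDeviceGetAttribute.
static bool SupportsVaManagement(CUdevice device) {
  int supported = 0;
  CUresult result = cuDeviceGetAttribute(
      &supported, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
      device);
  return result == CUDA_SUCCESS && supported != 0;
}

// Mirrors the filtering added to GpuVirtualMemAllocator::Create: resolve
// ordinals to CUdevice handles with cuDeviceGet, always keep the owning
// device, and keep a peer only if it both reports peer access with the owner
// (cuDeviceCanAccessPeer) and supports virtual address management.
static std::vector<CUdevice> AccessibleDevices(
    CUdevice owner, const std::vector<int>& peer_ordinals) {
  std::vector<CUdevice> handles;
  handles.push_back(owner);
  for (int ordinal : peer_ordinals) {
    CUdevice peer;
    if (cuDeviceGet(&peer, ordinal) != CUDA_SUCCESS) continue;
    int can_access = 0;
    if (cuDeviceCanAccessPeer(&can_access, owner, peer) == CUDA_SUCCESS &&
        can_access != 0 && SupportsVaManagement(peer)) {
      handles.push_back(peer);
    }
  }
  return handles;
}

The resulting handle list corresponds to what the allocator passes to GpuDriver::MapMemory, which builds one CUmemAccessDesc per device; doing the capability checks up front at creation time is what lets the patch drop unsupported or non-peer devices before any mapping is attempted.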