From c7e57df25686a9a924b0e766095e634637cdd36b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 21 Jan 2021 16:03:30 -0800
Subject: [PATCH] Use device handle instead of gpu ordinal in
 GpuVirtualMemAllocator for configuring peer access.

Check that peers support virtual address management and have peer access at
GpuVirtualMemAllocator creation time.

PiperOrigin-RevId: 353124035
Change-Id: I8549253b84af35a664e9a6810c6b40deba20a532
---
 tensorflow/core/common_runtime/gpu/BUILD     |  1 +
 .../gpu/gpu_virtual_mem_allocator.cc         | 56 +++++++++++++++----
 .../gpu/gpu_virtual_mem_allocator.h          | 15 +++--
 .../stream_executor/cuda/cuda_driver.cc      | 26 +++++----
 tensorflow/stream_executor/gpu/gpu_driver.h  | 16 ++++--
 5 files changed, 80 insertions(+), 34 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD
index fbd14215a40..f413ba6b609 100644
--- a/tensorflow/core/common_runtime/gpu/BUILD
+++ b/tensorflow/core/common_runtime/gpu/BUILD
@@ -220,6 +220,7 @@ tf_cuda_library(
         "//tensorflow/stream_executor:platform",
         "//tensorflow/stream_executor:stream_executor_headers",
         "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.cc
index 4e0f97691f9..d2b632208b5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.cc
@@ -15,7 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 
+#include "absl/strings/str_format.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/stream_executor/lib/status.h"
 
 #if CUDA_VERSION >= 10020
 
@@ -23,8 +25,11 @@ namespace tensorflow {
 namespace {
 
 using ::stream_executor::gpu::GpuContext;
+using ::stream_executor::gpu::GpuDeviceHandle;
 using ::stream_executor::gpu::GpuDevicePtr;
 using ::stream_executor::gpu::GpuDriver;
+using ::stream_executor::port::Status;
+using ::stream_executor::port::StatusOr;
 
 // Rounds value up to the specified power of two alignment.
 size_t AlignUp(size_t value, size_t alignment) {
@@ -33,6 +38,23 @@ size_t AlignUp(size_t value, size_t alignment) {
   return (value + alignment - 1) & ~(alignment - 1);
 }
 
+StatusOr<bool> SupportsVirtualAddressManagement(GpuDeviceHandle device) {
+  return GpuDriver::GetDeviceAttribute(
+      CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device);
+}
+
+Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
+                                            PlatformGpuId gpu_id) {
+  TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
+                      SupportsVirtualAddressManagement(device));
+  if (!supports_virtual_address_management) {
+    return stream_executor::port::InternalError(absl::StrFormat(
+        "GPU %d does not support virtual memory address management.",
+        gpu_id.value()));
+  }
+  return {};
+}
+
 }  // namespace
 
 /* static */ stream_executor::port::StatusOr<
@@ -42,19 +64,31 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
                                GpuContext& gpu_context, PlatformGpuId gpu_id,
                                size_t virtual_address_space_size,
                                const std::vector<PlatformGpuId>& peer_gpu_ids) {
-  std::vector<int> access_gpu_ordinals;
-  access_gpu_ordinals.reserve(peer_gpu_ids.size() + 1);
-  access_gpu_ordinals.push_back(gpu_id.value());
+  std::vector<GpuDeviceHandle> access_gpu_handles;
+  access_gpu_handles.reserve(peer_gpu_ids.size() + 1);
+
+  GpuDeviceHandle gpu_handle;
+  TF_RETURN_IF_ERROR(GpuDriver::GetDevice(gpu_id.value(), &gpu_handle));
+  TF_RETURN_IF_ERROR(CheckVirtualAddressManagementSupport(gpu_handle, gpu_id));
+
+  access_gpu_handles.push_back(gpu_handle);
   for (const auto& peer_id : peer_gpu_ids) {
-    access_gpu_ordinals.push_back(peer_id.value());
+    GpuDeviceHandle peer_handle;
+    TF_RETURN_IF_ERROR(GpuDriver::GetDevice(peer_id.value(), &peer_handle));
+    TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
+                        SupportsVirtualAddressManagement(peer_handle));
+    if (GpuDriver::CanEnablePeerAccess(gpu_handle, peer_handle) &&
+        supports_virtual_address_management) {
+      access_gpu_handles.push_back(peer_handle);
+    }
   }
 
   // Find the min granularity for all devices that have access to this memory;
   // that is, the maximum min granularity among all devices.
   size_t max_granularity = 1;
-  for (const int device_ordinal : access_gpu_ordinals) {
+  for (const auto device_handle : access_gpu_handles) {
     TF_ASSIGN_OR_RETURN(size_t granularity,
-                        GpuDriver::GetMinAllocationGranularity(device_ordinal));
+                        GpuDriver::GetMinAllocationGranularity(device_handle));
     max_granularity = std::max(max_granularity, granularity);
   }
 
@@ -71,18 +105,18 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
 
   return std::unique_ptr<GpuVirtualMemAllocator>(new GpuVirtualMemAllocator(
       alloc_visitors, free_visitors, gpu_context, gpu_id,
-      std::move(access_gpu_ordinals), vmem, max_granularity));
+      std::move(access_gpu_handles), vmem, max_granularity));
 }
 
 GpuVirtualMemAllocator::GpuVirtualMemAllocator(
     const std::vector<Visitor>& alloc_visitors,
    const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
-    PlatformGpuId gpu_id, const std::vector<int> access_gpu_ordinals,
+    PlatformGpuId gpu_id, const std::vector<GpuDeviceHandle> access_gpu_handles,
     GpuDriver::VmemSpan vmem, size_t granularity)
     : SubAllocator(alloc_visitors, free_visitors),
       gpu_context_(gpu_context),
       gpu_id_(gpu_id),
-      access_gpu_ordinals_(access_gpu_ordinals),
+      access_gpu_handles_(access_gpu_handles),
       vmem_(vmem),
       granularity_(granularity) {}
 
@@ -122,8 +156,8 @@ void* GpuVirtualMemAllocator::Alloc(size_t alignment, size_t num_bytes,
   GpuDriver::GenericMemoryHandle handle = std::move(maybe_handle).ValueOrDie();
 
   // Map VAs for this physical memory.
-  auto status = GpuDriver::MapMemory(&gpu_context_, next_va, handle,
-                                     access_gpu_ordinals_);
+  auto status =
+      GpuDriver::MapMemory(&gpu_context_, next_va, handle, access_gpu_handles_);
   if (!status.ok()) {
     LOG(ERROR) << status;
     GpuDriver::ReleaseMemoryHandle(&gpu_context_, std::move(handle));
diff --git a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
index 23572262c42..65a716e3b3f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h
@@ -71,13 +71,12 @@ class GpuVirtualMemAllocator : public SubAllocator {
   bool SupportsCoalescing() const override { return true; }
 
  private:
-  GpuVirtualMemAllocator(const std::vector<Visitor>& alloc_visitors,
-                         const std::vector<Visitor>& free_visitors,
-                         ::stream_executor::gpu::GpuContext& gpu_context,
-                         PlatformGpuId gpu_id,
-                         std::vector<int> access_gpu_ordinals,
-                         stream_executor::gpu::GpuDriver::VmemSpan vmem,
-                         size_t granularity);
+  GpuVirtualMemAllocator(
+      const std::vector<Visitor>& alloc_visitors,
+      const std::vector<Visitor>& free_visitors,
+      stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
+      std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles,
+      stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity);
 
   stream_executor::gpu::GpuContext& gpu_context_;
   PlatformGpuId gpu_id_;
@@ -86,7 +85,7 @@ class GpuVirtualMemAllocator : public SubAllocator {
   // all gpus that may want to read the memory. This list also includes the
   // above gpu_id_ to facilitate the invocation of the GpuDriver::MapMemory
   // function.
-  const std::vector<int> access_gpu_ordinals_;
+  const std::vector<stream_executor::gpu::GpuDeviceHandle> access_gpu_handles_;
 
   // The virtual memory span held by this allocator.
   stream_executor::gpu::GpuDriver::VmemSpan vmem_;
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index c168e67b28b..61735e3870c 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -916,11 +916,11 @@ GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
 }
 
 /* static */ port::StatusOr<size_t> GpuDriver::GetMinAllocationGranularity(
-    int device_ordinal) {
+    GpuDeviceHandle device) {
   CUmemAllocationProp props = {};
   props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
   props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  props.location.id = device_ordinal;
+  props.location.id = device;
 
   size_t granularity;
   CUresult res = cuMemGetAllocationGranularity(
@@ -970,7 +970,7 @@ GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
 /* static */ port::Status GpuDriver::MapMemory(
     GpuContext* context, CUdeviceptr va,
     const GpuDriver::GenericMemoryHandle& handle,
-    const std::vector<int>& device_ordinals) {
+    const std::vector<GpuDeviceHandle>& device_handles) {
   ScopedActivateContext activation(context);
 
   auto device = DeviceFromContext(context);
@@ -986,9 +986,9 @@ GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
         "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
   }
 
-  std::vector<CUmemAccessDesc> access_descriptors(device_ordinals.size());
+  std::vector<CUmemAccessDesc> access_descriptors(device_handles.size());
   for (int i = 0; i < access_descriptors.size(); ++i) {
-    access_descriptors[i].location.id = device_ordinals[i];
+    access_descriptors[i].location.id = device_handles[i];
     access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
     access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   }
@@ -1574,7 +1574,6 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
     return true;  // A context can always access its own memory.
   }
 
-  int can_access_peer = -1;
   auto from_device = DeviceFromContext(from);
   if (!from_device.ok()) {
     LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
                << from_device.status();
     return false;
   }
@@ -1587,13 +1586,18 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                << to_device.status();
     return false;
   }
-  CUresult res = cuDeviceCanAccessPeer(
-      &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
-  if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
+  return CanEnablePeerAccess(from_device.ValueOrDie(), to_device.ValueOrDie());
+}
+
+/* static */ bool GpuDriver::CanEnablePeerAccess(GpuDeviceHandle from,
+                                                 GpuDeviceHandle to) {
+  int can_access_peer = -1;
+  CUresult result = cuDeviceCanAccessPeer(&can_access_peer, from, to);
+  if (result != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to detect peer access capability: "
+               << ToString(result);
     return false;
   }
-  return can_access_peer;
 }
diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h
index 955ed59926a..af1febcbc85 100644
--- a/tensorflow/stream_executor/gpu/gpu_driver.h
+++ b/tensorflow/stream_executor/gpu/gpu_driver.h
@@ -163,7 +163,8 @@ class GpuDriver {
   // Calculates the minimum alignment for memory allocations done through
   // cuMemCreate via cuMemGetAllocationGranularity.
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g30ee906c2cf66a0347b3dfec3d7eb31a
-  static port::StatusOr<size_t> GetMinAllocationGranularity(int device_ordinal);
+  static port::StatusOr<size_t> GetMinAllocationGranularity(
+      GpuDeviceHandle device);
 
   // Allocates physical memory and returns a handle that can be mapped to
   // virtual addresses via cuMemCreate. bytes must be a multiple of the
@@ -185,9 +186,9 @@ class GpuDriver {
   // cuMemMap and sets the appropriate access settings via cuMemSetAccess.
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gff1d395423af5c5c75375516959dae56
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g1b6b12b10e8324bf462ecab4e7ef30e1
-  static port::Status MapMemory(GpuContext* context, GpuDevicePtr va,
-                                const GenericMemoryHandle& handle,
-                                const std::vector<int>& device_ordinals);
+  static port::Status MapMemory(
+      GpuContext* context, GpuDevicePtr va, const GenericMemoryHandle& handle,
+      const std::vector<GpuDeviceHandle>& device_handles);
 
   // Unmaps the backing memory from the given virtual address range. This range
   // must fully unmap a memory handle that was mapped using MapMemory; partial
@@ -408,6 +409,13 @@ class GpuDriver {
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
   static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);
 
+  // Returns whether the from device can access memory in the to
+  // device via cuDeviceCanAccessPeer. Because of differences between ROCM and
+  // CUDA, this API is not supported in ROCM builds and will result in a link
+  // error if used.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
+  static bool CanEnablePeerAccess(GpuDeviceHandle from, GpuDeviceHandle to);
+
   // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
   static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);
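
For readers who want to see the creation-time gating in isolation, the sketch below restates the checks that GpuVirtualMemAllocator::Create now performs, written directly against the CUDA driver API rather than the GpuDriver wrappers used in the patch. It is illustrative only: the helper names SupportsVaManagement and AccessibleDevices are hypothetical, cuInit/context setup is omitted, and failures simply skip a device, whereas Create returns an error outright if the owning GPU lacks virtual address management support.

// Illustrative sketch only; not part of the patch above.
#include <cuda.h>

#include <vector>

// True if `device` supports the cuMemCreate/cuMemMap virtual address
// management APIs (CUDA 10.2+), per cuDeviceGetAttribute.
static bool SupportsVaManagement(CUdevice device) {
  int supported = 0;
  CUresult result = cuDeviceGetAttribute(
      &supported, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
      device);
  return result == CUDA_SUCCESS && supported != 0;
}

// Mirrors the filtering added to GpuVirtualMemAllocator::Create: resolve
// ordinals to CUdevice handles with cuDeviceGet, always keep the owning
// device, and keep a peer only if it both reports peer access with the owner
// (cuDeviceCanAccessPeer) and supports virtual address management.
static std::vector<CUdevice> AccessibleDevices(
    CUdevice owner, const std::vector<int>& peer_ordinals) {
  std::vector<CUdevice> handles;
  handles.push_back(owner);
  for (int ordinal : peer_ordinals) {
    CUdevice peer;
    if (cuDeviceGet(&peer, ordinal) != CUDA_SUCCESS) continue;
    int can_access = 0;
    if (cuDeviceCanAccessPeer(&can_access, owner, peer) == CUDA_SUCCESS &&
        can_access != 0 && SupportsVaManagement(peer)) {
      handles.push_back(peer);
    }
  }
  return handles;
}

The resulting handle list corresponds to what the allocator passes to GpuDriver::MapMemory, which builds one CUmemAccessDesc per device; doing the capability checks up front at creation time is what lets the patch drop unsupported or non-peer devices before any mapping is attempted.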