Use device handle instead of gpu ordinal in GpuVirtualMemAllocator for
configuring peer access.

Check that peers support virtual address management and have peer access
at GpuVirtualMemAllocator creation time.

PiperOrigin-RevId: 353124035
Change-Id: I8549253b84af35a664e9a6810c6b40deba20a532
parent 2d3d381d5f
commit c7e57df256

Changed directories: tensorflow/core/common_runtime/gpu,
tensorflow/stream_executor
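Before the per-file hunks, a minimal standalone sketch (not part of this
commit) of the peer filtering that GpuVirtualMemAllocator::Create now
performs, written directly against the CUDA driver API; the helper name
FilterPeerDevices is hypothetical:

// Hypothetical illustration; CUresult return codes are ignored for brevity,
// and device handles are assumed to come from cuDeviceGet after cuInit.
#include <cuda.h>

#include <vector>

std::vector<CUdevice> FilterPeerDevices(CUdevice gpu,
                                        const std::vector<CUdevice>& peers) {
  std::vector<CUdevice> access_devices;
  access_devices.push_back(gpu);  // The owning device always has access.
  for (CUdevice peer : peers) {
    int supports_vmm = 0;
    cuDeviceGetAttribute(
        &supports_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
        peer);
    int can_access_peer = 0;
    cuDeviceCanAccessPeer(&can_access_peer, gpu, peer);
    // Mirror the commit: silently skip peers that cannot participate,
    // rather than failing the whole allocation.
    if (supports_vmm && can_access_peer) access_devices.push_back(peer);
  }
  return access_devices;
}

The commit's version additionally fails Create outright if the allocating
device itself lacks virtual address management support.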
tensorflow/core/common_runtime/gpu/BUILD

@@ -220,6 +220,7 @@ tf_cuda_library(
         "//tensorflow/stream_executor:platform",
         "//tensorflow/stream_executor:stream_executor_headers",
         "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.cc

@@ -15,7 +15,9 @@ limitations under the License.

 #include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"

+#include "absl/strings/str_format.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/stream_executor/lib/status.h"

 #if CUDA_VERSION >= 10020

@@ -23,8 +25,11 @@ namespace tensorflow {
 namespace {

 using ::stream_executor::gpu::GpuContext;
+using ::stream_executor::gpu::GpuDeviceHandle;
 using ::stream_executor::gpu::GpuDevicePtr;
 using ::stream_executor::gpu::GpuDriver;
+using ::stream_executor::port::Status;
+using ::stream_executor::port::StatusOr;

 // Rounds value up to the specified power of two alignment.
 size_t AlignUp(size_t value, size_t alignment) {

@@ -33,6 +38,23 @@ size_t AlignUp(size_t value, size_t alignment) {
   return (value + alignment - 1) & ~(alignment - 1);
 }

+StatusOr<bool> SupportsVirtualAddressManagement(GpuDeviceHandle device) {
+  return GpuDriver::GetDeviceAttribute(
+      CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, device);
+}
+
+Status CheckVirtualAddressManagementSupport(GpuDeviceHandle device,
+                                            PlatformGpuId gpu_id) {
+  TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
+                      SupportsVirtualAddressManagement(device));
+  if (!supports_virtual_address_management) {
+    return stream_executor::port::InternalError(absl::StrFormat(
+        "GPU %d does not support virtual memory address management.",
+        gpu_id.value()));
+  }
+  return {};
+}
+
 }  // namespace

 /* static */ stream_executor::port::StatusOr<

@@ -42,19 +64,31 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,
     GpuContext& gpu_context, PlatformGpuId gpu_id,
     size_t virtual_address_space_size,
     const std::vector<PlatformGpuId>& peer_gpu_ids) {
-  std::vector<int> access_gpu_ordinals;
-  access_gpu_ordinals.reserve(peer_gpu_ids.size() + 1);
-  access_gpu_ordinals.push_back(gpu_id.value());
+  std::vector<GpuDeviceHandle> access_gpu_handles;
+  access_gpu_handles.reserve(peer_gpu_ids.size() + 1);
+
+  GpuDeviceHandle gpu_handle;
+  TF_RETURN_IF_ERROR(GpuDriver::GetDevice(gpu_id.value(), &gpu_handle));
+  TF_RETURN_IF_ERROR(CheckVirtualAddressManagementSupport(gpu_handle, gpu_id));
+
+  access_gpu_handles.push_back(gpu_handle);
   for (const auto& peer_id : peer_gpu_ids) {
-    access_gpu_ordinals.push_back(peer_id.value());
+    GpuDeviceHandle peer_handle;
+    TF_RETURN_IF_ERROR(GpuDriver::GetDevice(peer_id.value(), &peer_handle));
+    TF_ASSIGN_OR_RETURN(bool supports_virtual_address_management,
+                        SupportsVirtualAddressManagement(peer_handle));
+    if (GpuDriver::CanEnablePeerAccess(gpu_handle, peer_handle) &&
+        supports_virtual_address_management) {
+      access_gpu_handles.push_back(peer_handle);
+    }
   }

   // Find the min granularity for all devices that have access to this memory;
   // that is, the maximum min granularity among all devices.
   size_t max_granularity = 1;
-  for (const int device_ordinal : access_gpu_ordinals) {
+  for (const auto device_handle : access_gpu_handles) {
     TF_ASSIGN_OR_RETURN(size_t granularity,
-                        GpuDriver::GetMinAllocationGranularity(device_ordinal));
+                        GpuDriver::GetMinAllocationGranularity(device_handle));
     max_granularity = std::max(max_granularity, granularity);
   }

@@ -71,18 +105,18 @@ GpuVirtualMemAllocator::Create(const std::vector<Visitor>& alloc_visitors,

   return std::unique_ptr<GpuVirtualMemAllocator>(new GpuVirtualMemAllocator(
       alloc_visitors, free_visitors, gpu_context, gpu_id,
-      std::move(access_gpu_ordinals), vmem, max_granularity));
+      std::move(access_gpu_handles), vmem, max_granularity));
 }

 GpuVirtualMemAllocator::GpuVirtualMemAllocator(
     const std::vector<Visitor>& alloc_visitors,
     const std::vector<Visitor>& free_visitors, GpuContext& gpu_context,
-    PlatformGpuId gpu_id, const std::vector<int> access_gpu_ordinals,
+    PlatformGpuId gpu_id, const std::vector<GpuDeviceHandle> access_gpu_handles,
     GpuDriver::VmemSpan vmem, size_t granularity)
     : SubAllocator(alloc_visitors, free_visitors),
       gpu_context_(gpu_context),
       gpu_id_(gpu_id),
-      access_gpu_ordinals_(access_gpu_ordinals),
+      access_gpu_handles_(access_gpu_handles),
       vmem_(vmem),
       granularity_(granularity) {}

@@ -122,8 +156,8 @@ void* GpuVirtualMemAllocator::Alloc(size_t alignment, size_t num_bytes,
   GpuDriver::GenericMemoryHandle handle = std::move(maybe_handle).ValueOrDie();

   // Map VAs for this physical memory.
-  auto status = GpuDriver::MapMemory(&gpu_context_, next_va, handle,
-                                     access_gpu_ordinals_);
+  auto status =
+      GpuDriver::MapMemory(&gpu_context_, next_va, handle, access_gpu_handles_);
   if (!status.ok()) {
     LOG(ERROR) << status;
     GpuDriver::ReleaseMemoryHandle(&gpu_context_, std::move(handle));
tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h

@@ -71,13 +71,12 @@ class GpuVirtualMemAllocator : public SubAllocator {
   bool SupportsCoalescing() const override { return true; }

  private:
-  GpuVirtualMemAllocator(const std::vector<Visitor>& alloc_visitors,
-                         const std::vector<Visitor>& free_visitors,
-                         ::stream_executor::gpu::GpuContext& gpu_context,
-                         PlatformGpuId gpu_id,
-                         std::vector<int> access_gpu_ordinals,
-                         stream_executor::gpu::GpuDriver::VmemSpan vmem,
-                         size_t granularity);
+  GpuVirtualMemAllocator(
+      const std::vector<Visitor>& alloc_visitors,
+      const std::vector<Visitor>& free_visitors,
+      stream_executor::gpu::GpuContext& gpu_context, PlatformGpuId gpu_id,
+      std::vector<stream_executor::gpu::GpuDeviceHandle> access_device_handles,
+      stream_executor::gpu::GpuDriver::VmemSpan vmem, size_t granularity);

   stream_executor::gpu::GpuContext& gpu_context_;
   PlatformGpuId gpu_id_;

@@ -86,7 +85,7 @@ class GpuVirtualMemAllocator : public SubAllocator {
   // all gpus that may want to read the memory. This list also includes the
   // above gpu_id_ to facilitate the invocation of the GpuDriver::MapMemory
   // function.
-  const std::vector<int> access_gpu_ordinals_;
+  const std::vector<stream_executor::gpu::GpuDeviceHandle> access_gpu_handles_;

   // The virtual memory span held by this allocator.
   stream_executor::gpu::GpuDriver::VmemSpan vmem_;
tensorflow/stream_executor/cuda/cuda_driver.cc

@@ -916,11 +916,11 @@ GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
 }

 /* static */ port::StatusOr<uint64> GpuDriver::GetMinAllocationGranularity(
-    int device_ordinal) {
+    GpuDeviceHandle device) {
   CUmemAllocationProp props = {};
   props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
   props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  props.location.id = device_ordinal;
+  props.location.id = device;

   size_t granularity;
   CUresult res = cuMemGetAllocationGranularity(
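For reference, a self-contained sketch of the driver query this method wraps
(assuming cuInit has run and `device` came from cuDeviceGet), including the
CU_MEM_ALLOC_GRANULARITY_MINIMUM flag that the truncated hunk above elides;
MinGranularityFor is a hypothetical helper name:

#include <cuda.h>

size_t MinGranularityFor(CUdevice device) {
  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device;  // CUdevice is an int in the CUDA driver API.

  size_t granularity = 0;
  // Smallest size that cuMemCreate/cuMemMap can operate on for this device.
  cuMemGetAllocationGranularity(&granularity, &props,
                                CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  return granularity;
}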
@@ -970,7 +970,7 @@ GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
 /* static */ port::Status GpuDriver::MapMemory(
     GpuContext* context, CUdeviceptr va,
     const GpuDriver::GenericMemoryHandle& handle,
-    const std::vector<int>& device_ordinals) {
+    const std::vector<GpuDeviceHandle>& device_handles) {
   ScopedActivateContext activation(context);

   auto device = DeviceFromContext(context);

@@ -986,9 +986,9 @@ GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
         "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
   }

-  std::vector<CUmemAccessDesc> access_descriptors(device_ordinals.size());
+  std::vector<CUmemAccessDesc> access_descriptors(device_handles.size());
   for (int i = 0; i < access_descriptors.size(); ++i) {
-    access_descriptors[i].location.id = device_ordinals[i];
+    access_descriptors[i].location.id = device_handles[i];
     access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
     access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   }

@@ -1574,7 +1574,6 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
     return true;  // A context can always access its own memory.
   }

-  int can_access_peer = -1;
   auto from_device = DeviceFromContext(from);
   if (!from_device.ok()) {
     LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "

@@ -1587,13 +1586,18 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                << to_device.status();
     return false;
   }
-  CUresult res = cuDeviceCanAccessPeer(
-      &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
-  if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
+  return CanEnablePeerAccess(from_device.ValueOrDie(), to_device.ValueOrDie());
+}
+
+/* static */ bool GpuDriver::CanEnablePeerAccess(GpuDeviceHandle from,
+                                                 GpuDeviceHandle to) {
+  int can_access_peer = -1;
+  CUresult result = cuDeviceCanAccessPeer(&can_access_peer, from, to);
+  if (result != CUDA_SUCCESS) {
+    LOG(ERROR) << "failed to detect peer access capability: "
+               << ToString(result);
     return false;
   }

   return can_access_peer;
 }
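To place the access descriptors in context, a compressed sketch of the CUDA
virtual-memory sequence that ReserveVirtualMemory, CreateMemoryHandle, and
MapMemory wrap (error handling elided; MapWithAccess is a hypothetical
helper, and `bytes` must be a multiple of the allocation granularity):

#include <cuda.h>

#include <vector>

// Reserve a VA range, back it with physical memory on `owner`, and grant
// read/write access to every device in `access_devices`.
void MapWithAccess(size_t bytes, CUdevice owner,
                   const std::vector<CUdevice>& access_devices) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = owner;

  CUdeviceptr va;
  cuMemAddressReserve(&va, bytes, /*alignment=*/0, /*addr=*/0, /*flags=*/0);

  CUmemGenericAllocationHandle handle;
  cuMemCreate(&handle, bytes, &prop, /*flags=*/0);
  cuMemMap(va, bytes, /*offset=*/0, handle, /*flags=*/0);

  // One access descriptor per device that may touch this memory, as in the
  // MapMemory hunk above.
  std::vector<CUmemAccessDesc> desc(access_devices.size());
  for (size_t i = 0; i < desc.size(); ++i) {
    desc[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    desc[i].location.id = access_devices[i];
    desc[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  }
  cuMemSetAccess(va, bytes, desc.data(), desc.size());
}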
tensorflow/stream_executor/gpu/gpu_driver.h

@@ -163,7 +163,8 @@ class GpuDriver {
   // Calculates the minimum alignment for memory allocations done through
   // cuMemCreate via cuMemGetAllocationGranularity.
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g30ee906c2cf66a0347b3dfec3d7eb31a
-  static port::StatusOr<uint64> GetMinAllocationGranularity(int device_ordinal);
+  static port::StatusOr<uint64> GetMinAllocationGranularity(
+      GpuDeviceHandle device);

   // Allocates physical memory and returns a handle that can be mapped to
   // virtual addresses via cuMemCreate. bytes must be a multiple of the

@@ -185,9 +186,9 @@ class GpuDriver {
   // cuMemMap and sets the appropriate access settings via cuMemSetAccess.
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gff1d395423af5c5c75375516959dae56
   // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g1b6b12b10e8324bf462ecab4e7ef30e1
-  static port::Status MapMemory(GpuContext* context, GpuDevicePtr va,
-                                const GenericMemoryHandle& handle,
-                                const std::vector<int>& device_ordinals);
+  static port::Status MapMemory(
+      GpuContext* context, GpuDevicePtr va, const GenericMemoryHandle& handle,
+      const std::vector<GpuDeviceHandle>& device_handles);

   // Unmaps the backing memory from the given virtual address range. This range
   // must fully unmap a memory handle that was mapped using MapMemory; partial

@@ -408,6 +409,13 @@ class GpuDriver {
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
   static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);

+  // Returns whether the from device can access memory in the to
+  // device via cuDeviceCanAccessPeer. Because of differences between ROCM and
+  // CUDA, this API is not supported in ROCM builds and will result in a link
+  // error if used.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
+  static bool CanEnablePeerAccess(GpuDeviceHandle from, GpuDeviceHandle to);
+
   // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
   static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);