Add virtual memory management function wrappers to GpuDriver.
PiperOrigin-RevId: 346716222 Change-Id: I6e80e10ae76c772326411a3bafa00421ff6bf7b5
This commit is contained in:
parent
5c26b3ca2c
commit
c8bca7b4ac
tensorflow/stream_executor
@ -890,6 +890,137 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
#if CUDA_VERSION >= 10020
|
||||
// Reserves `bytes` of GPU virtual address space via cuMemAddressReserve.
// No physical memory is allocated; the returned span must eventually be
// released with FreeVirtualMemory. Returns an error status on failure.
/* static */ port::StatusOr<GpuDriver::VmemSpan>
GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr base;
  const CUresult result = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
                                              /*addr=*/0, /*flags=*/0);
  if (result != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
                        bytes, ToString(result)));
  }
  return {{base, bytes}};
}
|
||||
|
||||
// Frees a virtual address range previously obtained from
// ReserveVirtualMemory via cuMemAddressFree. Failures are logged rather
// than returned, since callers typically free during teardown.
/* static */ void GpuDriver::FreeVirtualMemory(
    GpuContext* context, GpuDriver::VmemSpan reservation) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
  if (res != CUDA_SUCCESS) {
    // Include the driver's error string so the failure is diagnosable from
    // logs, matching the sibling error paths in this file.
    LOG(ERROR) << "error freeing vmem reservation of size "
               << reservation.size_bytes << " at address " << reservation.base
               << ": " << ToString(res);
  }
}
|
||||
|
||||
// Queries the minimum allocation granularity, in bytes, for pinned device
// allocations on `device_ordinal` via cuMemGetAllocationGranularity.
// Sizes passed to cuMemCreate must be multiples of this value.
/* static */ port::StatusOr<uint64> GpuDriver::GetMinAllocationGranularity(
    int device_ordinal) {
  // Describe a pinned allocation located on the requested device.
  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device_ordinal;

  size_t granularity;
  const CUresult result = cuMemGetAllocationGranularity(
      &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  if (result != CUDA_SUCCESS) {
    return port::InternalError(absl::StrCat(
        "failed to get min allocation granularity: ", ToString(result)));
  }
  return granularity;
}
|
||||
|
||||
// Allocates `bytes` of physical device memory via cuMemCreate and returns a
// generic handle that can later be mapped into a reserved virtual range with
// MapMemory. `bytes` must be a multiple of the value returned by
// GetMinAllocationGranularity. Returns an error status on failure.
/* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation(context);
  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    // Separator added so the status does not run into the message text.
    LOG(ERROR) << "Failed to get device from context: " << device.status();
    return device.status();
  }

  // Pinned allocation physically resident on the context's device.
  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device.ValueOrDie();

  CUmemGenericAllocationHandle mem_handle;
  CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("failed to create memory allocation of size %d: %s",
                        bytes, ToString(res)));
  }
  return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
}
|
||||
|
||||
// Releases the physical allocation behind `handle` via cuMemRelease.
// Failures are logged rather than returned.
/* static */ void GpuDriver::ReleaseMemoryHandle(
    GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
  ScopedActivateContext activation(context);

  const CUresult result = cuMemRelease(handle.handle);
  if (result != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to release memory handle " << handle.handle
               << " of size " << handle.bytes << ": " << ToString(result);
  }
}
|
||||
|
||||
// Maps the physical allocation `handle` at virtual address `va` via cuMemMap,
// then grants read/write access to every device in `device_ordinals` via
// cuMemSetAccess. On a failed access grant the mapping is rolled back with
// cuMemUnmap. Returns OK on success, an internal error otherwise.
/* static */ port::Status GpuDriver::MapMemory(
    GpuContext* context, CUdeviceptr va,
    const GpuDriver::GenericMemoryHandle& handle,
    const std::vector<int>& device_ordinals) {
  ScopedActivateContext activation(context);

  // Validates that the context has a resolvable device before mapping.
  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    return device.status();
  }

  // NB: Zero is the only valid value for both flags and offset.
  CUresult res =
      cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrFormat(
        "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
  }

  // One read/write access descriptor per requested device.
  std::vector<CUmemAccessDesc> access_descriptors(device_ordinals.size());
  // size_t index avoids a signed/unsigned comparison with vector::size().
  for (size_t i = 0; i < access_descriptors.size(); ++i) {
    access_descriptors[i].location.id = device_ordinals[i];
    access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  }

  res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
                       access_descriptors.size());
  if (res != CUDA_SUCCESS) {
    // Unmap the memory that we failed to set access for.
    if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
      LOG(ERROR)
          << "Failed to unmap memory in GpuDriver::MapMemory error path.";
    }
    return port::InternalError(absl::StrFormat(
        "Failed to set read/write access on memory mapped at %d: %s", va,
        ToString(res)));
  }
  return port::Status::OK();
}
|
||||
|
||||
// Unmaps previously mapped physical memory from the virtual range starting
// at `va` via cuMemUnmap. Failures are logged rather than returned.
/* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
                                         uint64 bytes) {
  ScopedActivateContext activation(context);

  const CUresult result = cuMemUnmap(va, bytes);
  if (result != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
               << ": " << ToString(result);
  }
}
|
||||
|
||||
#endif
|
||||
|
||||
/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
|
||||
CUevent* event) {
|
||||
if (*event == nullptr) {
|
||||
|
@ -140,6 +140,63 @@ class GpuDriver {
|
||||
// previously registered.
|
||||
static bool HostUnregister(GpuContext* context, void* location);
|
||||
|
||||
// Virtual memory support was added to CUDA in 10.2
|
||||
#if CUDA_VERSION >= 10020
|
||||
|
||||
// Reserves a range of virtual device memory addresses via
|
||||
// cuMemAddressReserve. bytes must be a multiple of the host page size.
|
||||
// Returns an error status if the reservation fails.
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1ge489256c107df2a07ddf96d80c86cd9b
|
||||
struct VmemSpan {
|
||||
GpuDevicePtr base;
|
||||
// Size in bytes.
|
||||
uint64 size_bytes;
|
||||
};
|
||||
static port::StatusOr<VmemSpan> ReserveVirtualMemory(GpuContext* context,
|
||||
uint64 bytes);
|
||||
|
||||
// Frees a range of virtual addresses that were previously reserved through
|
||||
// ReserveVirtualMemory via cuMemAddressFree.
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g6993ecea2ea03e1b802b8255edc2da5b
|
||||
static void FreeVirtualMemory(GpuContext* context, VmemSpan reservation);
|
||||
|
||||
// Calculates the minimum alignment for memory allocations done through
|
||||
// cuMemCreate via cuMemGetAllocationGranularity.
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g30ee906c2cf66a0347b3dfec3d7eb31a
|
||||
static port::StatusOr<uint64> GetMinAllocationGranularity(int device_ordinal);
|
||||
|
||||
// Allocates physical memory and returns a handle that can be mapped to
|
||||
// virtual addresses via cuMemCreate. bytes must be a multiple of the
|
||||
// granularity returned by GetMinAllocationGranularity.
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c
|
||||
struct GenericMemoryHandle {
|
||||
uint64 handle;
|
||||
uint64 bytes;
|
||||
};
|
||||
static port::StatusOr<GenericMemoryHandle> CreateMemoryHandle(
|
||||
GpuContext* context, uint64 bytes);
|
||||
|
||||
// Frees memory represented by the provided MemoryHandle via cuMemRelease.
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g3014f0759f43a8d82db951b8e4b91d68
|
||||
static void ReleaseMemoryHandle(GpuContext* context,
|
||||
GenericMemoryHandle handle);
|
||||
|
||||
// Maps a memory allocation handle to a reserved virtual address range via
|
||||
// cuMemMap and sets the appropriate access settings via cuMemSetAccess.
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gff1d395423af5c5c75375516959dae56
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g1b6b12b10e8324bf462ecab4e7ef30e1
|
||||
static port::Status MapMemory(GpuContext* context, GpuDevicePtr va,
|
||||
const GenericMemoryHandle& handle,
|
||||
const std::vector<int>& device_ordinals);
|
||||
|
||||
// Unmaps the backing memory from the given virtual address range. This range
|
||||
// must fully unmap a memory handle that was mapped using MapMemory; partial
|
||||
// unmapping is not supported.
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gfb50aac00c848fd7087e858f59bf7e2a
|
||||
static void UnmapMemory(GpuContext* context, GpuDevicePtr va, uint64 bytes);
|
||||
|
||||
#endif  // CUDA_VERSION >= 10020
|
||||
|
||||
// Given a device ordinal, returns a device handle into the device outparam,
|
||||
// which must not be null.
|
||||
//
|
||||
|
Loading…
Reference in New Issue
Block a user