Add virtual memory management function wrappers to GpuDriver.

PiperOrigin-RevId: 346716222
Change-Id: I6e80e10ae76c772326411a3bafa00421ff6bf7b5
Authored by A. Unique TensorFlower on 2020-12-09 23:04:51 -08:00; committed by TensorFlower Gardener
parent 5c26b3ca2c
commit c8bca7b4ac
2 changed files with 188 additions and 0 deletions
tensorflow/stream_executor

@@ -890,6 +890,137 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  return true;
}

#if CUDA_VERSION >= 10020
/* static */ port::StatusOr<GpuDriver::VmemSpan>
GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation(context);
  CUdeviceptr base;
  CUresult res = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
                                     /*addr=*/0, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
                        bytes, ToString(res)));
  }
  return {{base, bytes}};
}

/* static */ void GpuDriver::FreeVirtualMemory(
    GpuContext* context, GpuDriver::VmemSpan reservation) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "error freeing vmem reservation of size "
               << reservation.size_bytes << " at address " << reservation.base;
  }
}

/* static */ port::StatusOr<uint64> GpuDriver::GetMinAllocationGranularity(
    int device_ordinal) {
  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device_ordinal;

  size_t granularity;
  CUresult res = cuMemGetAllocationGranularity(
      &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrCat(
        "failed to get min allocation granularity: ", ToString(res)));
  }
  return granularity;
}
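
// Sketch (not part of this change): callers are expected to round allocation
// sizes up to the granularity returned above before calling
// CreateMemoryHandle. The helper below is hypothetical and shown only to
// illustrate the arithmetic.
static uint64 RoundUpToGranularity(uint64 bytes, uint64 granularity) {
  // E.g. with the common 2 MiB granularity, a 5 MiB request rounds to 6 MiB.
  return ((bytes + granularity - 1) / granularity) * granularity;
}
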
/* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation(context);
  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    LOG(ERROR) << "Failed to get device from context: " << device.status();
    return device.status();
  }

  CUmemAllocationProp props = {};
  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  props.location.id = device.ValueOrDie();

  CUmemGenericAllocationHandle mem_handle;
  CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(
        absl::StrFormat("failed to create memory allocation of size %d: %s",
                        bytes, ToString(res)));
  }
  return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
}

/* static */ void GpuDriver::ReleaseMemoryHandle(
    GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
  ScopedActivateContext activation(context);
  CUresult res = cuMemRelease(handle.handle);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to release memory handle " << handle.handle
               << " of size " << handle.bytes << ": " << ToString(res);
  }
}

/* static */ port::Status GpuDriver::MapMemory(
    GpuContext* context, CUdeviceptr va,
    const GpuDriver::GenericMemoryHandle& handle,
    const std::vector<int>& device_ordinals) {
  ScopedActivateContext activation(context);

  auto device = DeviceFromContext(context);
  if (!device.ok()) {
    return device.status();
  }

  // NB: Zero is the only valid value for both flags and offset.
  CUresult res =
      cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
  if (res != CUDA_SUCCESS) {
    return port::InternalError(absl::StrFormat(
        "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
  }

  std::vector<CUmemAccessDesc> access_descriptors(device_ordinals.size());
  for (int i = 0; i < access_descriptors.size(); ++i) {
    access_descriptors[i].location.id = device_ordinals[i];
    access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  }

  res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
                       access_descriptors.size());
  if (res != CUDA_SUCCESS) {
    // Unmap the memory that we failed to set access for.
    if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
      LOG(ERROR)
          << "Failed to unmap memory in GpuDriver::MapMemory error path.";
    }
    return port::InternalError(absl::StrFormat(
        "Failed to set read/write access on memory mapped at %d: %s", va,
        ToString(res)));
  }
  return port::Status::OK();
}

/* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
                                         uint64 bytes) {
  ScopedActivateContext activation(context);

  CUresult res = cuMemUnmap(va, bytes);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
               << ": " << ToString(res);
  }
}

#endif  // CUDA_VERSION >= 10020
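
// Sketch (not part of this change): how the wrappers above compose into one
// reserve -> create -> map -> use -> unmap -> release -> free round trip.
// Assumes CUDA_VERSION >= 10020, a live GpuContext `ctx`, and that device
// ordinal 0 is valid; real callers would propagate each status instead of
// using ValueOrDie()/CHECK.
static void VmemRoundTripSketch(GpuContext* ctx) {
  uint64 granularity =
      GpuDriver::GetMinAllocationGranularity(/*device_ordinal=*/0)
          .ValueOrDie();
  uint64 bytes = 4 * granularity;  // cuMemCreate sizes must be aligned.

  GpuDriver::VmemSpan span =
      GpuDriver::ReserveVirtualMemory(ctx, bytes).ValueOrDie();
  GpuDriver::GenericMemoryHandle handle =
      GpuDriver::CreateMemoryHandle(ctx, bytes).ValueOrDie();

  // MapMemory both maps the handle at span.base and grants read/write access
  // to device 0 via cuMemSetAccess.
  CHECK(GpuDriver::MapMemory(ctx, span.base, handle,
                             /*device_ordinals=*/{0})
            .ok());

  // ... span.base is now usable as an ordinary device pointer ...

  GpuDriver::UnmapMemory(ctx, span.base, handle.bytes);
  GpuDriver::ReleaseMemoryHandle(ctx, handle);
  GpuDriver::FreeVirtualMemory(ctx, span);
}
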
/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  CUevent* event) {
  if (*event == nullptr) {

@@ -140,6 +140,63 @@ class GpuDriver {
  // previously registered.
  static bool HostUnregister(GpuContext* context, void* location);

  // Virtual memory support was added to CUDA in 10.2
#if CUDA_VERSION >= 10020

  // Reserves a range of virtual device memory addresses via
  // cuMemAddressReserve. bytes must be a multiple of the host page size.
  // Returns an error status if the reservation fails.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1ge489256c107df2a07ddf96d80c86cd9b
  struct VmemSpan {
    GpuDevicePtr base;
    // Size in bytes.
    uint64 size_bytes;
  };
  static port::StatusOr<VmemSpan> ReserveVirtualMemory(GpuContext* context,
                                                       uint64 bytes);

  // Frees a range of virtual addresses that were previously reserved through
  // ReserveVirtualMemory via cuMemAddressFree.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g6993ecea2ea03e1b802b8255edc2da5b
  static void FreeVirtualMemory(GpuContext* context, VmemSpan reservation);

  // Calculates the minimum alignment for memory allocations done through
  // cuMemCreate via cuMemGetAllocationGranularity.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g30ee906c2cf66a0347b3dfec3d7eb31a
  static port::StatusOr<uint64> GetMinAllocationGranularity(
      int device_ordinal);

  // Allocates physical memory and returns a handle that can be mapped to
  // virtual addresses via cuMemCreate. bytes must be a multiple of the
  // granularity returned by GetMinAllocationGranularity.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c
  struct GenericMemoryHandle {
    uint64 handle;
    uint64 bytes;
  };
  static port::StatusOr<GenericMemoryHandle> CreateMemoryHandle(
      GpuContext* context, uint64 bytes);

  // Frees memory represented by the provided GenericMemoryHandle via
  // cuMemRelease.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g3014f0759f43a8d82db951b8e4b91d68
  static void ReleaseMemoryHandle(GpuContext* context,
                                  GenericMemoryHandle handle);

  // Maps a memory allocation handle to a reserved virtual address range via
  // cuMemMap and sets the appropriate access settings via cuMemSetAccess.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gff1d395423af5c5c75375516959dae56
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g1b6b12b10e8324bf462ecab4e7ef30e1
  static port::Status MapMemory(GpuContext* context, GpuDevicePtr va,
                                const GenericMemoryHandle& handle,
                                const std::vector<int>& device_ordinals);

  // Unmaps the backing memory from the given virtual address range. This
  // range must fully unmap a memory handle that was mapped using MapMemory;
  // partial unmapping is not supported.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gfb50aac00c848fd7087e858f59bf7e2a
  static void UnmapMemory(GpuContext* context, GpuDevicePtr va, uint64 bytes);
#endif  // CUDA_VERSION >= 10020
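
  // Sketch (not part of this change): a payoff of splitting reservation from
  // physical backing is that a buffer can grow in place. Reserve a large
  // virtual range once, then map additional granularity-sized physical
  // handles at the end of the mapped prefix as the buffer grows; existing
  // device pointers into the prefix stay valid, with no realloc-style copy.
  // All names below are illustrative, assuming CUDA_VERSION >= 10020:
  //
  //   GpuDriver::VmemSpan r =
  //       GpuDriver::ReserveVirtualMemory(ctx, max_bytes).ValueOrDie();
  //   uint64 mapped = 0;
  //   // Grow by one chunk: back [r.base + mapped, r.base + mapped + chunk).
  //   GpuDriver::GenericMemoryHandle h =
  //       GpuDriver::CreateMemoryHandle(ctx, chunk_bytes).ValueOrDie();
  //   GpuDriver::MapMemory(ctx, r.base + mapped, h, /*device_ordinals=*/{0});
  //   mapped += chunk_bytes;
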
// Given a device ordinal, returns a device handle into the device outparam,
// which must not be null.
//