From c8bca7b4ace348d2fdba12b3d9f5c70ba721144c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 9 Dec 2020 23:04:51 -0800
Subject: [PATCH] Add virtual memory management function wrappers to GpuDriver.

PiperOrigin-RevId: 346716222
Change-Id: I6e80e10ae76c772326411a3bafa00421ff6bf7b5
---
 .../stream_executor/cuda/cuda_driver.cc      | 131 ++++++++++++++++++
 tensorflow/stream_executor/gpu/gpu_driver.h  |  57 ++++++++
 2 files changed, 188 insertions(+)

diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 67fd72d52f3..42db563a0bd 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -890,6 +890,137 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   return true;
 }
 
+#if CUDA_VERSION >= 10020
+/* static */ port::StatusOr<GpuDriver::VmemSpan>
+GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
+  ScopedActivateContext activation(context);
+  CUdeviceptr base;
+  CUresult res = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
+                                     /*addr=*/0, /*flags=*/0);
+  if (res != CUDA_SUCCESS) {
+    return port::InternalError(
+        absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
+                        bytes, ToString(res)));
+  }
+  return {{base, bytes}};
+}
+
+/* static */ void GpuDriver::FreeVirtualMemory(
+    GpuContext* context, GpuDriver::VmemSpan reservation) {
+  ScopedActivateContext activation(context);
+  CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "error freeing vmem reservation of size "
+               << reservation.size_bytes << " at address " << reservation.base;
+  }
+}
+
+/* static */ port::StatusOr<uint64> GpuDriver::GetMinAllocationGranularity(
+    int device_ordinal) {
+  CUmemAllocationProp props = {};
+  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  props.location.id = device_ordinal;
+
+  size_t granularity;
+  CUresult res = cuMemGetAllocationGranularity(
+      &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+  if (res != CUDA_SUCCESS) {
+    return port::InternalError(absl::StrCat(
+        "failed to get min allocation granularity: ", ToString(res)));
+  }
+  return granularity;
+}
+
+/* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
+GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
+  ScopedActivateContext activation(context);
+  auto device = DeviceFromContext(context);
+  if (!device.ok()) {
+    LOG(ERROR) << "Failed to get device from context: " << device.status();
+    return device.status();
+  }
+
+  CUmemAllocationProp props = {};
+  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  props.location.id = device.ValueOrDie();
+
+  CUmemGenericAllocationHandle mem_handle;
+  CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
+  if (res != CUDA_SUCCESS) {
+    return port::InternalError(
+        absl::StrFormat("failed to create memory allocation of size %d: %s",
+                        bytes, ToString(res)));
+  }
+  return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
+}
+
+/* static */ void GpuDriver::ReleaseMemoryHandle(
+    GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
+  ScopedActivateContext activation(context);
+
+  CUresult res = cuMemRelease(handle.handle);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "Failed to release memory handle " << handle.handle
+               << " of size " << handle.bytes << ": " << ToString(res);
+  }
+}
+
+/* static */ port::Status GpuDriver::MapMemory(
+    GpuContext* context, CUdeviceptr va,
+    const GpuDriver::GenericMemoryHandle& handle,
+    const std::vector<int>& device_ordinals) {
+  ScopedActivateContext activation(context);
+
+  auto device = DeviceFromContext(context);
+  if (!device.ok()) {
+    return device.status();
+  }
+
+  // NB: Zero is the only valid value for both flags and offset.
+  CUresult res =
+      cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
+  if (res != CUDA_SUCCESS) {
+    return port::InternalError(absl::StrFormat(
+        "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
+  }
+
+  std::vector<CUmemAccessDesc> access_descriptors(device_ordinals.size());
+  for (int i = 0; i < access_descriptors.size(); ++i) {
+    access_descriptors[i].location.id = device_ordinals[i];
+    access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  }
+
+  res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
+                       access_descriptors.size());
+  if (res != CUDA_SUCCESS) {
+    // Unmap the memory that we failed to set access for.
+    if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
+      LOG(ERROR)
+          << "Failed to unmap memory in GpuDriver::MapMemory error path.";
+    }
+    return port::InternalError(absl::StrFormat(
+        "Failed to set read/write access on memory mapped at %d: %s", va,
+        ToString(res)));
+  }
+  return port::Status::OK();
+}
+
+/* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
+                                         uint64 bytes) {
+  ScopedActivateContext activation(context);
+
+  CUresult res = cuMemUnmap(va, bytes);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
+               << ": " << ToString(res);
+  }
+}
+
+#endif
+
 /* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                   CUevent* event) {
   if (*event == nullptr) {
diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h
index 25b90be1bd2..3cd13dcc013 100644
--- a/tensorflow/stream_executor/gpu/gpu_driver.h
+++ b/tensorflow/stream_executor/gpu/gpu_driver.h
@@ -140,6 +140,63 @@ class GpuDriver {
   // previously registered.
   static bool HostUnregister(GpuContext* context, void* location);
 
+  // Virtual memory support was added to CUDA in 10.2.
+#if CUDA_VERSION >= 10020
+
+  // Reserves a range of virtual device memory addresses via
+  // cuMemAddressReserve. bytes must be a multiple of the host page size.
+  // Returns an error if the reservation fails.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1ge489256c107df2a07ddf96d80c86cd9b
+  struct VmemSpan {
+    GpuDevicePtr base;
+    // Size in bytes.
+    uint64 size_bytes;
+  };
+  static port::StatusOr<VmemSpan> ReserveVirtualMemory(GpuContext* context,
+                                                       uint64 bytes);
+
+  // Frees a range of virtual addresses that were previously reserved through
+  // ReserveVirtualMemory via cuMemAddressFree.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g6993ecea2ea03e1b802b8255edc2da5b
+  static void FreeVirtualMemory(GpuContext* context, VmemSpan reservation);
+
+  // Calculates the minimum alignment for memory allocations done through
+  // cuMemCreate via cuMemGetAllocationGranularity.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g30ee906c2cf66a0347b3dfec3d7eb31a
+  static port::StatusOr<uint64> GetMinAllocationGranularity(int device_ordinal);
+
+  // Allocates physical memory and returns a handle that can be mapped to
+  // virtual addresses via cuMemCreate. bytes must be a multiple of the
+  // granularity returned by GetMinAllocationGranularity.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c
+  struct GenericMemoryHandle {
+    uint64 handle;
+    uint64 bytes;
+  };
+  static port::StatusOr<GenericMemoryHandle> CreateMemoryHandle(
+      GpuContext* context, uint64 bytes);
+
+  // Frees memory represented by the provided handle via cuMemRelease.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g3014f0759f43a8d82db951b8e4b91d68
+  static void ReleaseMemoryHandle(GpuContext* context,
+                                  GenericMemoryHandle handle);
+
+  // Maps a memory allocation handle to a reserved virtual address range via
+  // cuMemMap and sets the appropriate access settings via cuMemSetAccess.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gff1d395423af5c5c75375516959dae56
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g1b6b12b10e8324bf462ecab4e7ef30e1
+  static port::Status MapMemory(GpuContext* context, GpuDevicePtr va,
+                                const GenericMemoryHandle& handle,
+                                const std::vector<int>& device_ordinals);
+
+  // Unmaps the backing memory from the given virtual address range. This range
+  // must fully unmap a memory handle that was mapped using MapMemory; partial
+  // unmapping is not supported.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gfb50aac00c848fd7087e858f59bf7e2a
+  static void UnmapMemory(GpuContext* context, GpuDevicePtr va, uint64 bytes);
+
+#endif  // CUDA_VERSION >= 10020
+
   // Given a device ordinal, returns a device handle into the device outparam,
   // which must not be null.
   //
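Reviewer note, not part of the patch: the sketch below shows how the new wrappers are intended to compose; the size is rounded up to the cuMemCreate granularity, a virtual range is reserved, a physical handle is created and mapped with read/write access for one device, and teardown runs in reverse order. The function name VmemRoundTripDemo, the inline round-up arithmetic, and the error-path cleanup order are illustrative assumptions; only the GpuDriver calls themselves come from this patch.

// Illustrative sketch only -- not part of this patch. Assumes the caller
// already has a valid GpuContext* and the matching device ordinal.
#include <vector>

#include "tensorflow/stream_executor/gpu/gpu_driver.h"

namespace stream_executor {
namespace gpu {

#if CUDA_VERSION >= 10020
port::Status VmemRoundTripDemo(GpuContext* context, int device_ordinal,
                               uint64 requested_bytes) {
  // Physical allocations must be a multiple of the minimum granularity.
  auto granularity = GpuDriver::GetMinAllocationGranularity(device_ordinal);
  if (!granularity.ok()) return granularity.status();
  uint64 g = granularity.ValueOrDie();
  uint64 bytes = (requested_bytes + g - 1) / g * g;  // round up to granularity

  // Reserve a virtual address range, then back it with physical memory.
  auto va = GpuDriver::ReserveVirtualMemory(context, bytes);
  if (!va.ok()) return va.status();
  auto handle = GpuDriver::CreateMemoryHandle(context, bytes);
  if (!handle.ok()) {
    GpuDriver::FreeVirtualMemory(context, va.ValueOrDie());
    return handle.status();
  }

  // Map the handle into the reserved range; MapMemory also grants
  // read/write access to the listed devices via cuMemSetAccess.
  port::Status mapped = GpuDriver::MapMemory(
      context, va.ValueOrDie().base, handle.ValueOrDie(), {device_ordinal});
  if (!mapped.ok()) {
    GpuDriver::ReleaseMemoryHandle(context, handle.ValueOrDie());
    GpuDriver::FreeVirtualMemory(context, va.ValueOrDie());
    return mapped;
  }

  // va.ValueOrDie().base is now usable as an ordinary device pointer.

  // Tear down in reverse order: unmap, release the handle, free the range.
  GpuDriver::UnmapMemory(context, va.ValueOrDie().base, bytes);
  GpuDriver::ReleaseMemoryHandle(context, handle.ValueOrDie());
  GpuDriver::FreeVirtualMemory(context, va.ValueOrDie());
  return port::Status::OK();
}
#endif  // CUDA_VERSION >= 10020

}  // namespace gpu
}  // namespace stream_executor

A real allocator built on these wrappers would presumably keep the reservation alive and map additional handles at increasing offsets to grow the region in place; the round trip above only demonstrates call order and cleanup responsibilities.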