From c8bca7b4ace348d2fdba12b3d9f5c70ba721144c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 9 Dec 2020 23:04:51 -0800
Subject: [PATCH] Add virtual memory management function wrappers to GpuDriver.
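
The new wrappers compose as reserve -> create -> map -> use -> unmap ->
release -> free. A minimal sketch of the intended sequence (assuming a valid
GpuContext* context, a request size in `bytes`, and device ordinal 0; error
handling mostly elided):

  uint64 granularity =
      GpuDriver::GetMinAllocationGranularity(/*device_ordinal=*/0)
          .ValueOrDie();
  // Round the request up to a size that can be created and mapped.
  uint64 padded = ((bytes + granularity - 1) / granularity) * granularity;

  GpuDriver::VmemSpan span =
      GpuDriver::ReserveVirtualMemory(context, padded).ValueOrDie();
  GpuDriver::GenericMemoryHandle handle =
      GpuDriver::CreateMemoryHandle(context, padded).ValueOrDie();
  port::Status status = GpuDriver::MapMemory(context, span.base, handle,
                                             /*device_ordinals=*/{0});
  CHECK(status.ok()) << status;

  // ... use the mapped range starting at span.base ...

  GpuDriver::UnmapMemory(context, span.base, handle.bytes);
  GpuDriver::ReleaseMemoryHandle(context, handle);
  GpuDriver::FreeVirtualMemory(context, span);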

PiperOrigin-RevId: 346716222
Change-Id: I6e80e10ae76c772326411a3bafa00421ff6bf7b5
---
 .../stream_executor/cuda/cuda_driver.cc       | 138 +++++++++++++++++++
 tensorflow/stream_executor/gpu/gpu_driver.h   |  62 ++++++++++
 2 files changed, 200 insertions(+)

diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 67fd72d52f3..42db563a0bd 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -890,6 +890,144 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   return true;
 }
 
+#if CUDA_VERSION >= 10020
+/* static */ port::StatusOr<GpuDriver::VmemSpan>
+GpuDriver::ReserveVirtualMemory(GpuContext* context, uint64 bytes) {
+  ScopedActivateContext activation(context);
+  CUdeviceptr base;
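+  // Passing alignment=0 requests the default alignment for the range, and
+  // addr=0 lets the driver choose the base address of the reservation.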
+  CUresult res = cuMemAddressReserve(&base, bytes, /*alignment=*/0,
+                                     /*addr=*/0, /*flags=*/0);
+  if (res != CUDA_SUCCESS) {
+    return port::InternalError(
+        absl::StrFormat("error reserving %d bytes of virtual GPU memory: %s",
+                        bytes, ToString(res)));
+  }
+  return {{base, bytes}};
+}
+
+/* static */ void GpuDriver::FreeVirtualMemory(
+    GpuContext* context, GpuDriver::VmemSpan reservation) {
+  ScopedActivateContext activation(context);
+  CUresult res = cuMemAddressFree(reservation.base, reservation.size_bytes);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "error freeing vmem reservation of size "
+               << reservation.size_bytes << " at address " << reservation.base;
+  }
+}
+
+/* static */ port::StatusOr<uint64> GpuDriver::GetMinAllocationGranularity(
+    int device_ordinal) {
+  CUmemAllocationProp props = {};
+  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  props.location.id = device_ordinal;
+
+  size_t granularity;
+  CUresult res = cuMemGetAllocationGranularity(
+      &granularity, &props, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+  if (res != CUDA_SUCCESS) {
+    return port::InternalError(absl::StrCat(
+        "failed to get min allocation granularity: ", ToString(res)));
+  }
+  return granularity;
+}
+
+/* static */ port::StatusOr<GpuDriver::GenericMemoryHandle>
+GpuDriver::CreateMemoryHandle(GpuContext* context, uint64 bytes) {
+  ScopedActivateContext activation(context);
+  auto device = DeviceFromContext(context);
+  if (!device.ok()) {
+    LOG(ERROR) << "Failed to get device from context" << device.status();
+    return device.status();
+  }
+
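+  // Describe pinned (non-migratable) physical memory resident on this
+  // context's device.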
+  CUmemAllocationProp props = {};
+  props.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  props.location.id = device.ValueOrDie();
+
+  CUmemGenericAllocationHandle mem_handle;
+  CUresult res = cuMemCreate(&mem_handle, bytes, &props, 0);
+  if (res != CUDA_SUCCESS) {
+    return port::InternalError(
+        absl::StrFormat("failed to create memory allocation of size %d: %s",
+                        bytes, ToString(res)));
+  }
+  return GpuDriver::GenericMemoryHandle{mem_handle, bytes};
+}
+
+/* static */ void GpuDriver::ReleaseMemoryHandle(
+    GpuContext* context, GpuDriver::GenericMemoryHandle handle) {
+  ScopedActivateContext activation(context);
+
+  CUresult res = cuMemRelease(handle.handle);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "Failed to release memory handle " << handle.handle
+               << " of size " << handle.bytes << ": " << ToString(res);
+  }
+}
+
+/* static */ port::Status GpuDriver::MapMemory(
+    GpuContext* context, CUdeviceptr va,
+    const GpuDriver::GenericMemoryHandle& handle,
+    const std::vector<int>& device_ordinals) {
+  ScopedActivateContext activation(context);
+
+  auto device = DeviceFromContext(context);
+  if (!device.ok()) {
+    return device.status();
+  }
+
+  // NB: Zero is the only valid value for both flags and offset.
+  CUresult res =
+      cuMemMap(va, handle.bytes, /*offset=*/0, handle.handle, /*flags=*/0);
+  if (res != CUDA_SUCCESS) {
+    return port::InternalError(absl::StrFormat(
+        "Failed to map %d bytes at %d: %s", handle.bytes, va, ToString(res)));
+  }
+
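+  // Build one access descriptor per device that should be able to read and
+  // write the range; the mapping is unusable until access is granted.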
+  std::vector<CUmemAccessDesc> access_descriptors(device_ordinals.size());
+  for (size_t i = 0; i < access_descriptors.size(); ++i) {
+    access_descriptors[i].location.id = device_ordinals[i];
+    access_descriptors[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    access_descriptors[i].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+  }
+
+  res = cuMemSetAccess(va, handle.bytes, access_descriptors.data(),
+                       access_descriptors.size());
+  if (res != CUDA_SUCCESS) {
+    // Unmap the memory that we failed to set access for.
+    if (cuMemUnmap(va, handle.bytes) != CUDA_SUCCESS) {
+      LOG(ERROR)
+          << "Failed to unmap memory in GpuDriver::MapMemory error path.";
+    }
+    return port::InternalError(absl::StrFormat(
+        "Failed to set read/write access on memory mapped at %d: %s", va,
+        ToString(res)));
+  }
+  return port::Status::OK();
+}
+
+/* static */ void GpuDriver::UnmapMemory(GpuContext* context, CUdeviceptr va,
+                                         uint64 bytes) {
+  ScopedActivateContext activation(context);
+
+  CUresult res = cuMemUnmap(va, bytes);
+  if (res != CUDA_SUCCESS) {
+    LOG(ERROR) << "Failed to unmap memory at " << va << " of size " << bytes
+               << ": " << ToString(res);
+  }
+}
+
+#endif  // CUDA_VERSION >= 10020
+
 /* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                   CUevent* event) {
   if (*event == nullptr) {
diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h
index 25b90be1bd2..3cd13dcc013 100644
--- a/tensorflow/stream_executor/gpu/gpu_driver.h
+++ b/tensorflow/stream_executor/gpu/gpu_driver.h
@@ -140,6 +140,68 @@ class GpuDriver {
   // previously registered.
   static bool HostUnregister(GpuContext* context, void* location);
 
+  // Virtual memory support was added in CUDA 10.2.
+#if CUDA_VERSION >= 10020
+
+  // Reserves a range of virtual device memory addresses via
+  // cuMemAddressReserve. bytes must be a multiple of the host page size.
+  // Returns an error if the reservation fails.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1ge489256c107df2a07ddf96d80c86cd9b
+  struct VmemSpan {
+    GpuDevicePtr base;
+    // Size in bytes.
+    uint64 size_bytes;
+  };
+  static port::StatusOr<VmemSpan> ReserveVirtualMemory(GpuContext* context,
+                                                       uint64 bytes);
+
+  // Frees a range of virtual addresses that were previously reserved through
+  // ReserveVirtualMemory via cuMemAddressFree.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g6993ecea2ea03e1b802b8255edc2da5b
+  static void FreeVirtualMemory(GpuContext* context, VmemSpan reservation);
+
+  // Queries, via cuMemGetAllocationGranularity, the minimum allocation
+  // granularity for physical memory created through cuMemCreate.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g30ee906c2cf66a0347b3dfec3d7eb31a
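+  // For example, a request can be rounded up to a mappable size with:
+  //   padded = ((bytes + granularity - 1) / granularity) * granularity;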
+  static port::StatusOr<uint64> GetMinAllocationGranularity(int device_ordinal);
+
+  // Allocates physical memory via cuMemCreate and returns a handle that can
+  // be mapped to virtual addresses. bytes must be a multiple of the
+  // granularity returned by GetMinAllocationGranularity.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c
+  struct GenericMemoryHandle {
+    uint64 handle;
+    uint64 bytes;
+  };
+  static port::StatusOr<GenericMemoryHandle> CreateMemoryHandle(
+      GpuContext* context, uint64 bytes);
+
+  // Releases the physical memory represented by handle via cuMemRelease.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g3014f0759f43a8d82db951b8e4b91d68
+  static void ReleaseMemoryHandle(GpuContext* context,
+                                  GenericMemoryHandle handle);
+
+  // Maps a memory allocation handle to a reserved virtual address range via
+  // cuMemMap, then grants read/write access to it via cuMemSetAccess.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gff1d395423af5c5c75375516959dae56
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g1b6b12b10e8324bf462ecab4e7ef30e1
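+  // For example, MapMemory(context, va, handle, /*device_ordinals=*/{0, 1})
+  // maps handle at va and makes the range readable and writable from devices
+  // 0 and 1.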
+  static port::Status MapMemory(GpuContext* context, GpuDevicePtr va,
+                                const GenericMemoryHandle& handle,
+                                const std::vector<int>& device_ordinals);
+
+  // Unmaps the backing memory from the given virtual address range. The range
+  // must cover an entire handle previously mapped with MapMemory; partial
+  // unmapping is not supported.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1gfb50aac00c848fd7087e858f59bf7e2a
+  static void UnmapMemory(GpuContext* context, GpuDevicePtr va, uint64 bytes);
+
+#endif  // CUDA_VERSION >= 10020
+
   // Given a device ordinal, returns a device handle into the device outparam,
   // which must not be null.
   //