From 834a3f7395a9db748349e0bf9dfff7af558cb4fb Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Fri, 1 Feb 2019 17:59:02 +0000 Subject: [PATCH] changing the rocm_driver API from being dynamically linked to being dynamically loaded --- tensorflow/stream_executor/rocm/BUILD | 149 +++++++++--------- .../stream_executor/rocm/rocm_driver.cc | 125 +++++++-------- .../rocm/rocm_driver_wrapper.h | 147 +++++++++++++++++ 3 files changed, 285 insertions(+), 136 deletions(-) create mode 100644 tensorflow/stream_executor/rocm/rocm_driver_wrapper.h diff --git a/tensorflow/stream_executor/rocm/BUILD b/tensorflow/stream_executor/rocm/BUILD index f0b05822703..5190b551f80 100644 --- a/tensorflow/stream_executor/rocm/BUILD +++ b/tensorflow/stream_executor/rocm/BUILD @@ -47,7 +47,7 @@ cc_library( cc_library( name = "rocm_driver", srcs = if_rocm_is_configured(["rocm_driver.cc"]), - hdrs = [], + hdrs = if_rocm_is_configured(["rocm_driver_wrapper.h"]), deps = if_rocm_is_configured([ ":rocm_diagnostics", "@com_google_absl//absl/base", @@ -57,6 +57,7 @@ cc_library( "//tensorflow/stream_executor/gpu:gpu_driver_header", "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", + "//tensorflow/stream_executor/platform:dso_loader", "@local_config_rocm//rocm:rocm_headers", ]), ) @@ -141,60 +142,60 @@ cc_library( ) cc_library( - name = "rocblas_plugin", - srcs = if_rocm_is_configured(["rocm_blas.cc"]), - hdrs = if_rocm_is_configured(["rocm_blas.h"]), - visibility = ["//visibility:public"], - deps = if_rocm_is_configured([ - ":rocm_gpu_executor", - ":rocm_platform_id", - "//third_party/eigen3", - "//tensorflow/core:lib_internal", - "//tensorflow/stream_executor", - "//tensorflow/stream_executor:event", - "//tensorflow/stream_executor:host_or_device_scalar", - "//tensorflow/stream_executor:plugin_registry", - "//tensorflow/stream_executor:scratch_allocator", - "//tensorflow/stream_executor:timer", - "//tensorflow/stream_executor/gpu:gpu_activation", - 
"//tensorflow/stream_executor/gpu:gpu_helpers_header", - "//tensorflow/stream_executor/gpu:gpu_stream_header", - "//tensorflow/stream_executor/gpu:gpu_timer_header", - "//tensorflow/stream_executor/lib", - "//tensorflow/stream_executor/platform", - "//tensorflow/stream_executor/platform:dso_loader", - "@com_google_absl//absl/strings", - "@local_config_rocm//rocm:rocm_headers", - ] + if_static([ - "@local_config_rocm//rocm:rocblas" - ])), - alwayslink = True, + name = "rocblas_plugin", + srcs = if_rocm_is_configured(["rocm_blas.cc"]), + hdrs = if_rocm_is_configured(["rocm_blas.h"]), + visibility = ["//visibility:public"], + deps = if_rocm_is_configured([ + ":rocm_gpu_executor", + ":rocm_platform_id", + "//third_party/eigen3", + "//tensorflow/core:lib_internal", + "//tensorflow/stream_executor", + "//tensorflow/stream_executor:event", + "//tensorflow/stream_executor:host_or_device_scalar", + "//tensorflow/stream_executor:plugin_registry", + "//tensorflow/stream_executor:scratch_allocator", + "//tensorflow/stream_executor:timer", + "//tensorflow/stream_executor/gpu:gpu_activation", + "//tensorflow/stream_executor/gpu:gpu_helpers_header", + "//tensorflow/stream_executor/gpu:gpu_stream_header", + "//tensorflow/stream_executor/gpu:gpu_timer_header", + "//tensorflow/stream_executor/lib", + "//tensorflow/stream_executor/platform", + "//tensorflow/stream_executor/platform:dso_loader", + "@com_google_absl//absl/strings", + "@local_config_rocm//rocm:rocm_headers", + ] + if_static([ + "@local_config_rocm//rocm:rocblas", + ])), + alwayslink = True, ) cc_library( - name = "rocfft_plugin", - srcs = if_rocm_is_configured(["rocm_fft.cc"]), - hdrs = if_rocm_is_configured(["rocm_fft.h"]), - visibility = ["//visibility:public"], - deps = if_rocm_is_configured([ - ":rocm_platform_id", - "//tensorflow/stream_executor:event", - "//tensorflow/stream_executor:fft", - "//tensorflow/stream_executor:plugin_registry", - "//tensorflow/stream_executor:scratch_allocator", - 
"//tensorflow/stream_executor/gpu:gpu_activation", - "//tensorflow/stream_executor/gpu:gpu_helpers_header", - "//tensorflow/stream_executor/gpu:gpu_executor_header", - "//tensorflow/stream_executor/gpu:gpu_stream_header", - "//tensorflow/stream_executor/gpu:gpu_kernel_header", - "//tensorflow/stream_executor/lib", - "//tensorflow/stream_executor/platform", - "//tensorflow/stream_executor/platform:dso_loader", - "@local_config_rocm//rocm:rocm_headers", - ] + if_static([ - "@local_config_rocm//rocm:rocfft" - ])), - alwayslink = True, + name = "rocfft_plugin", + srcs = if_rocm_is_configured(["rocm_fft.cc"]), + hdrs = if_rocm_is_configured(["rocm_fft.h"]), + visibility = ["//visibility:public"], + deps = if_rocm_is_configured([ + ":rocm_platform_id", + "//tensorflow/stream_executor:event", + "//tensorflow/stream_executor:fft", + "//tensorflow/stream_executor:plugin_registry", + "//tensorflow/stream_executor:scratch_allocator", + "//tensorflow/stream_executor/gpu:gpu_activation", + "//tensorflow/stream_executor/gpu:gpu_helpers_header", + "//tensorflow/stream_executor/gpu:gpu_executor_header", + "//tensorflow/stream_executor/gpu:gpu_stream_header", + "//tensorflow/stream_executor/gpu:gpu_kernel_header", + "//tensorflow/stream_executor/lib", + "//tensorflow/stream_executor/platform", + "//tensorflow/stream_executor/platform:dso_loader", + "@local_config_rocm//rocm:rocm_headers", + ] + if_static([ + "@local_config_rocm//rocm:rocfft", + ])), + alwayslink = True, ) # FIXME: enable in future PRs @@ -237,28 +238,28 @@ cc_library( #) cc_library( - name = "rocrand_plugin", - srcs = if_rocm_is_configured(["rocm_rng.cc"]), - hdrs = if_rocm_is_configured([]), - deps = if_rocm_is_configured([ - ":rocm_gpu_executor", - ":rocm_platform_id", - "@local_config_rocm//rocm:rocm_headers", - "//tensorflow/stream_executor:event", - "//tensorflow/stream_executor:plugin_registry", - "//tensorflow/stream_executor:rng", - "//tensorflow/stream_executor/gpu:gpu_activation_header", - 
"//tensorflow/stream_executor/gpu:gpu_helpers_header", - "//tensorflow/stream_executor/gpu:gpu_executor_header", - "//tensorflow/stream_executor/gpu:gpu_rng_header", - "//tensorflow/stream_executor/gpu:gpu_stream_header", - "//tensorflow/stream_executor/lib", - "//tensorflow/stream_executor/platform", - "//tensorflow/stream_executor/platform:dso_loader", - ] + if_static([ - "@local_config_rocm//rocm:hiprand" - ])), - alwayslink = True, + name = "rocrand_plugin", + srcs = if_rocm_is_configured(["rocm_rng.cc"]), + hdrs = if_rocm_is_configured([]), + deps = if_rocm_is_configured([ + ":rocm_gpu_executor", + ":rocm_platform_id", + "@local_config_rocm//rocm:rocm_headers", + "//tensorflow/stream_executor:event", + "//tensorflow/stream_executor:plugin_registry", + "//tensorflow/stream_executor:rng", + "//tensorflow/stream_executor/gpu:gpu_activation_header", + "//tensorflow/stream_executor/gpu:gpu_helpers_header", + "//tensorflow/stream_executor/gpu:gpu_executor_header", + "//tensorflow/stream_executor/gpu:gpu_rng_header", + "//tensorflow/stream_executor/gpu:gpu_stream_header", + "//tensorflow/stream_executor/lib", + "//tensorflow/stream_executor/platform", + "//tensorflow/stream_executor/platform:dso_loader", + ] + if_static([ + "@local_config_rocm//rocm:hiprand", + ])), + alwayslink = True, ) cc_library( diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc index 39d52d28304..73b1b350f71 100644 --- a/tensorflow/stream_executor/rocm/rocm_driver.cc +++ b/tensorflow/stream_executor/rocm/rocm_driver.cc @@ -36,6 +36,7 @@ limitations under the License. 
#include "tensorflow/stream_executor/platform/logging.h" #include "tensorflow/stream_executor/platform/mutex.h" #include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h" bool FLAGS_gpuexec_rocm_driver_inject_init_error = false; bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false; @@ -143,7 +144,7 @@ string MemorySpaceString(MemorySpace memory_space) { // HIP driver (e.g., this value is not our cached view of the current device). static int CurrentDeviceOrDie() { int current = -1; - hipError_t result = hipGetDevice(¤t); + hipError_t result = tensorflow::wrap::hipGetDevice(¤t); if (result != hipSuccess) { LOG(FATAL) << "failed to query current device: " << ToString(result); } @@ -154,7 +155,7 @@ namespace { // Call hipDeviceSynchronize and crash if it doesn't succeed. void SynchronizeOrDie() { - auto res = hipDeviceSynchronize(); + auto res = tensorflow::wrap::hipDeviceSynchronize(); if (res != hipSuccess) { LOG(FATAL) << "Synchronize found " << ToString(res) << " :: " << port::CurrentStackTrace(); @@ -197,7 +198,7 @@ ScopedActivateContext::ScopedActivateContext(GpuContext* context) { << tls->current_device_ordinal << " to " << context->device_ordinal(); // Set the device and update thread local. - CHECK_EQ(hipSuccess, hipSetDevice(context->device_ordinal())); + CHECK_EQ(hipSuccess, tensorflow::wrap::hipSetDevice(context->device_ordinal())); tls->current_device_ordinal = context->device_ordinal(); } @@ -225,7 +226,7 @@ ScopedActivateContext::~ScopedActivateContext() { << to_restore_->device_ordinal(); // Set context and update thread local. - CHECK_EQ(hipSuccess, hipSetDevice(to_restore_->device_ordinal())); + CHECK_EQ(hipSuccess, tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal())); tls->current_device_ordinal = to_restore_->device_ordinal(); } @@ -261,7 +262,7 @@ string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) { // in the process of querying. 
string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) { hipPointerAttribute_t from_pointerAttributes; - hipError_t result = hipPointerGetAttributes(&from_pointerAttributes, from); + hipError_t result = tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from); if (result != hipSuccess) { LOG(ERROR) << "could not retrieve source pointer's device: " << ToString(result); @@ -269,7 +270,7 @@ string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) { } hipPointerAttribute_t to_pointerAttributes; - result = hipPointerGetAttributes(&to_pointerAttributes, to); + result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to); if (result != hipSuccess) { LOG(ERROR) << "could not retrieve destination pointer's device: " << ToString(result); @@ -289,7 +290,7 @@ static port::Status InternalInit() { if (FLAGS_gpuexec_rocm_driver_inject_init_error) { LOG(ERROR) << "injecting ROCM init error; initialization will fail"; } else { - res = hipInit(0 /* = flags */); + res = tensorflow::wrap::hipInit(0 /* = flags */); } if (res == hipSuccess) { @@ -322,7 +323,7 @@ static port::Status InternalInit() { /* static */ port::Status GpuDriver::GetDevice(int device_ordinal, hipDevice_t* device) { - hipError_t res = hipDeviceGet(device, device_ordinal); + hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal); if (res == hipSuccess) { return port::Status::OK(); } @@ -336,7 +337,7 @@ static port::Status InternalInit() { string* device_name) { static const size_t kCharLimit = 64; absl::InlinedVector chars(kCharLimit); - hipError_t res = hipDeviceGetName(chars.begin(), kCharLimit - 1, device); + hipError_t res = tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device); if (res != hipSuccess) { LOG(ERROR) << "failed to get device name for " << device << ": " << ToString(res); @@ -382,7 +383,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options, /* static */ bool 
GpuDriver::FuncSetCacheConfig(hipFunction_t function, hipFuncCache_t cache_config) { - hipError_t res = hipFuncSetCacheConfig(function, cache_config); + hipError_t res = tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config); if (res != hipSuccess) { LOG(ERROR) << "failed to set ROCM kernel cache config. kernel: " << function << ", config: " << cache_config << ", result: " << ToString(res); @@ -396,7 +397,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options, GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { hipSharedMemConfig shared_mem_config; ScopedActivateContext activation{context}; - hipError_t result = hipDeviceGetSharedMemConfig(&shared_mem_config); + hipError_t result = tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config); if (result != hipSuccess) { LOG(ERROR) << "failed to get ROCM device shared memory config. " << "Context device ID: " << context->device_ordinal() @@ -411,7 +412,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::Status GpuDriver::ContextSetSharedMemConfig( GpuContext* context, hipSharedMemConfig shared_mem_config) { ScopedActivateContext activation{context}; - hipError_t result = hipDeviceSetSharedMemConfig(shared_mem_config); + hipError_t result = tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config); if (result != hipSuccess) { LOG(ERROR) << "failed to set ROCM device shared memory config. 
" << "Context device ID: " << context->device_ordinal() @@ -435,7 +436,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z << " bdx: " << block_dim_x << " bdy: " << block_dim_y << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes; - hipError_t res = hipModuleLaunchKernel( + hipError_t res = tensorflow::wrap::hipModuleLaunchKernel( function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, block_dim_z, shared_mem_bytes, stream, kernel_params, extra); if (res != hipSuccess) { @@ -471,7 +472,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { ScopedActivateContext activation{context}; void* hsaco_data = const_cast(hsaco_contents); - hipError_t res = hipModuleLoadData(module, hsaco_data); + hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data); if (res != hipSuccess) { LOG(ERROR) << "failed to load HSACO: " << ToString(res); @@ -491,7 +492,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { hipDeviceptr_t location, uint8 value, size_t size) { ScopedActivateContext activation{context}; - hipError_t res = hipMemset(location, value, size); + hipError_t res = tensorflow::wrap::hipMemset(location, value, size); if (res != hipSuccess) { LOG(ERROR) << "failed to memset memory: " << ToString(res); return false; @@ -513,7 +514,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { return false; } hipError_t res = - hipMemset(pointer, static_cast(value), uint32_count * 4); + tensorflow::wrap::hipMemset(pointer, static_cast(value), uint32_count * 4); if (res != hipSuccess) { LOG(ERROR) << "failed to memset memory: " << ToString(res); return false; @@ -527,7 +528,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { size_t uint32_count, GpuStreamHandle stream) { ScopedActivateContext activation{context}; - hipError_t res = hipMemsetAsync(location, value, uint32_count, stream); + hipError_t res = 
tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream); if (res != hipSuccess) { LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); return false; @@ -552,7 +553,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { LOG(ERROR) << "failed to memset memory"; return false; } - hipError_t res = hipMemsetAsync(pointer, value, uint32_count * 4, stream); + hipError_t res = tensorflow::wrap::hipMemsetAsync(pointer, value, uint32_count * 4, stream); if (res != hipSuccess) { LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); return false; @@ -565,7 +566,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream, StreamCallback callback, void* data) { - hipError_t res = hipStreamAddCallback(stream, (hipStreamCallback_t)callback, + hipError_t res = tensorflow::wrap::hipStreamAddCallback(stream, (hipStreamCallback_t)callback, data, 0 /* = flags */); if (res != hipSuccess) { LOG(ERROR) << "unable to add host callback: " << ToString(res); @@ -580,7 +581,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { hipFunction_t* function) { ScopedActivateContext activated{context}; CHECK(module != nullptr && kernel_name != nullptr); - hipError_t res = hipModuleGetFunction(function, module, kernel_name); + hipError_t res = tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name); if (res != hipSuccess) { LOG(ERROR) << "failed to get kernel \"" << kernel_name << "\" from module: " << ToString(res); @@ -598,7 +599,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { ScopedActivateContext activated{context}; CHECK(module != nullptr && symbol_name != nullptr && (dptr != nullptr || bytes != nullptr)); - hipError_t res = hipModuleGetGlobal(dptr, bytes, module, symbol_name); + hipError_t res = tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name); if (res != hipSuccess) { // symbol may not be found in the current module, but it may 
reside in // another module. @@ -613,7 +614,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ void GpuDriver::UnloadModule(GpuContext* context, hipModule_t module) { ScopedActivateContext activated{context}; - hipError_t res = hipModuleUnload(module); + hipError_t res = tensorflow::wrap::hipModuleUnload(module); if (res != hipSuccess) { LOG(ERROR) << "failed to unload module " << module << "; leaking: " << ToString(res); @@ -623,7 +624,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ bool GpuDriver::CreateStream(GpuContext* context, GpuStreamHandle* stream) { ScopedActivateContext activated{context}; - hipError_t res = hipStreamCreateWithFlags( + hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags( stream, hipStreamDefault); // switch to hipStreamNonBlocking? if (res != hipSuccess) { LOG(ERROR) << "could not allocate ROCM stream for device " @@ -643,7 +644,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } ScopedActivateContext activated{context}; - hipError_t res = hipStreamDestroy(*stream); + hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream); if (res != hipSuccess) { LOG(ERROR) << "failed to destroy ROCM stream for device " << context->device_ordinal() << ": " << ToString(res); @@ -658,7 +659,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { uint64 bytes) { ScopedActivateContext activated{context}; hipDeviceptr_t result = 0; - hipError_t res = hipMalloc(&result, bytes); + hipError_t res = tensorflow::wrap::hipMallocVanilla(&result, bytes); if (res != hipSuccess) { LOG(ERROR) << "failed to allocate " << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes @@ -675,7 +676,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { void* location) { ScopedActivateContext activation{context}; hipDeviceptr_t pointer = absl::bit_cast(location); - hipError_t res = hipFree(pointer); + hipError_t res = tensorflow::wrap::hipFree(pointer); if (res != 
hipSuccess) { LOG(ERROR) << "failed to free device memory at " << location << "; result: " << ToString(res); @@ -704,7 +705,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { ScopedActivateContext activation{context}; void* host_mem = nullptr; // "Portable" memory is visible to all ROCM contexts. Safe for our use model. - hipError_t res = hipHostMalloc(&host_mem, bytes, hipHostMallocPortable); + hipError_t res = tensorflow::wrap::hipHostMallocVanilla(&host_mem, bytes, hipHostMallocPortable); if (res != hipSuccess) { LOG(ERROR) << "failed to alloc " << bytes << " bytes on host: " << ToString(res); @@ -715,7 +716,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ void GpuDriver::HostDeallocate(GpuContext* context, void* location) { ScopedActivateContext activation{context}; - hipError_t res = hipHostFree(location); + hipError_t res = tensorflow::wrap::hipHostFree(location); if (res != hipSuccess) { LOG(ERROR) << "error deallocating host memory at " << location << ": " << ToString(res); @@ -726,7 +727,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { uint64 bytes) { ScopedActivateContext activation{context}; // "Portable" memory is visible to all ROCM contexts. Safe for our use model. 
- hipError_t res = hipHostRegister(location, bytes, hipHostRegisterPortable); + hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes, hipHostRegisterPortable); if (res != hipSuccess) { LOG(ERROR) << "error registering host memory at " << location << ": " << ToString(res); @@ -738,7 +739,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ bool GpuDriver::HostUnregister(GpuContext* context, void* location) { ScopedActivateContext activation{context}; - hipError_t res = hipHostUnregister(location); + hipError_t res = tensorflow::wrap::hipHostUnregister(location); if (res != hipSuccess) { LOG(ERROR) << "error unregistering host memory at " << location << ": " << ToString(res); @@ -755,7 +756,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } ScopedActivateContext activated{context}; - hipError_t res = hipEventDestroy(*event); + hipError_t res = tensorflow::wrap::hipEventDestroy(*event); *event = nullptr; switch (res) { @@ -779,7 +780,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuEventHandle event, GpuStreamHandle stream) { ScopedActivateContext activated{context}; - hipError_t res = hipEventRecord(event, stream); + hipError_t res = tensorflow::wrap::hipEventRecord(event, stream); switch (res) { case hipSuccess: return port::Status::OK(); @@ -800,7 +801,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::StatusOr GpuDriver::QueryEvent( GpuContext* context, GpuEventHandle event) { ScopedActivateContext activated{context}; - hipError_t res = hipEventQuery(event); + hipError_t res = tensorflow::wrap::hipEventQuery(event); if (res != hipSuccess && res != hipErrorNotReady) { return port::Status{ port::error::INTERNAL, @@ -817,12 +818,12 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { ScopedActivateContext activated{context}; // The stop event must have completed in order for hipEventElapsedTime to // work. 
- hipError_t res = hipEventSynchronize(stop); + hipError_t res = tensorflow::wrap::hipEventSynchronize(stop); if (res != hipSuccess) { LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res); return false; } - res = hipEventElapsedTime(elapsed_milliseconds, start, stop); + res = tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop); if (res != hipSuccess) { LOG(ERROR) << "failed to get elapsed time between events: " << ToString(res); @@ -836,7 +837,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream, GpuEventHandle event) { ScopedActivateContext activation{context}; - hipError_t res = hipStreamWaitEvent(stream, event, 0 /* = flags */); + hipError_t res = tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */); if (res != hipSuccess) { LOG(ERROR) << "could not wait stream on event: " << ToString(res); return false; @@ -847,7 +848,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) { ScopedActivateContext activation{context}; - hipError_t res = hipDeviceSynchronize(); + hipError_t res = tensorflow::wrap::hipDeviceSynchronize(); if (res != hipSuccess) { LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res) << " :: " << port::CurrentStackTrace(); @@ -861,7 +862,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream) { ScopedActivateContext activated{context}; CHECK(stream != nullptr); - hipError_t res = hipStreamSynchronize(stream); + hipError_t res = tensorflow::wrap::hipStreamSynchronize(stream); if (res != hipSuccess) { port::Status status = port::InternalError( absl::StrCat("could not synchronize on ROCM stream: ", ToString(res))); @@ -877,7 +878,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream) { ScopedActivateContext activated{context}; CHECK(stream != nullptr); - hipError_t res = hipStreamQuery(stream); + 
hipError_t res = tensorflow::wrap::hipStreamQuery(stream); if (res == hipSuccess) { return true; } @@ -891,7 +892,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::Status GpuDriver::SynchronousMemcpyD2H( GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) { ScopedActivateContext activation{context}; - hipError_t res = hipMemcpyDtoH(host_dst, gpu_src, size); + hipError_t res = tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size); if (res != hipSuccess) { return port::InternalError( absl::StrFormat("failed to synchronous memcpy from device to host: %s; " @@ -908,7 +909,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src, uint64 size) { ScopedActivateContext activation{context}; - hipError_t res = hipMemcpyHtoD(gpu_dst, const_cast(host_src), size); + hipError_t res = tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast(host_src), size); if (res != hipSuccess) { return port::InternalError(absl::StrFormat( "failed to synchronous memcpy from host to device: %s; Gpu dst: %p;" @@ -924,7 +925,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src, uint64 size) { ScopedActivateContext activation{context}; - hipError_t res = hipMemcpyDtoD(gpu_dst, gpu_src, size); + hipError_t res = tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size); if (res != hipSuccess) { return port::InternalError(absl::StrFormat( "failed to synchronous memcpy from host to device: %s; Gpu dst: %p; " @@ -942,7 +943,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { uint64 size, GpuStreamHandle stream) { ScopedActivateContext activation{context}; - hipError_t res = hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream); + hipError_t res = tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream); if (res != hipSuccess) { LOG(ERROR) << absl::StrFormat( "failed to 
enqueue async memcpy from device to host: %s; host dst: %p; " @@ -964,7 +965,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream) { ScopedActivateContext activation{context}; hipError_t res = - hipMemcpyHtoDAsync(gpu_dst, const_cast(host_src), size, stream); + tensorflow::wrap::hipMemcpyHtoDAsync(gpu_dst, const_cast(host_src), size, stream); if (res != hipSuccess) { LOG(ERROR) << absl::StrFormat( "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; " @@ -984,7 +985,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { uint64 size, GpuStreamHandle stream) { ScopedActivateContext activation{context}; - hipError_t result = hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream); + hipError_t result = tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream); if (result != hipSuccess) { LOG(ERROR) << absl::StrFormat( "failed to enqueue async memcpy from device to device: %s" @@ -1021,7 +1022,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } ScopedActivateContext activated{context}; - hipError_t res = hipEventCreateWithFlags(event, hipflags); + hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags); if (res == hipSuccess) { return port::Status::OK(); @@ -1037,7 +1038,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ int GpuDriver::GetDeviceCount() { int device_count = 0; - hipError_t res = hipGetDeviceCount(&device_count); + hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count); if (res != hipSuccess) { LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res); return 0; @@ -1061,7 +1062,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::Status GpuDriver::GetPointerAddressRange( hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) { - hipError_t result = hipMemGetAddressRange(base, size, dptr); + hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, 
dptr); if (result == hipSuccess) { return port::Status::OK(); } else if (result == hipErrorNotFound) { @@ -1106,7 +1107,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::StatusOr GpuDriver::GetPointerDevice( hipDeviceptr_t pointer) { hipPointerAttribute_t pointerAttributes; - hipError_t result = hipPointerGetAttributes(&pointerAttributes, pointer); + hipError_t result = tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer); if (result != hipSuccess) { return port::Status{ port::error::INTERNAL, @@ -1114,7 +1115,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } hipDevice_t device; - result = hipDeviceGet(&device, pointerAttributes.device); + result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device); if (result != hipSuccess) { return port::Status{ port::error::INTERNAL, @@ -1127,7 +1128,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version, hipDevice_t device) { hipDeviceProp_t props; - hipError_t result = hipGetDeviceProperties(&props, device); + hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device); if (result == hipSuccess) { *version = props.gcnArch; return port::Status::OK(); @@ -1145,7 +1146,7 @@ template static port::StatusOr GetSimpleAttribute(hipDevice_t device, hipDeviceAttribute_t attribute) { int value = -1; - hipError_t result = hipDeviceGetAttribute(&value, attribute, device); + hipError_t result = tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device); if (result != hipSuccess) { return port::Status{ port::error::NOT_FOUND, @@ -1200,21 +1201,21 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, hipDevice_t device) { int value; hipError_t res = - hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device); + tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device); if (res != hipSuccess) { LOG(ERROR) << 
"failed to query max grid dim x: " << ToString(res); return false; } *x = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device); + res = tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device); if (res != hipSuccess) { LOG(ERROR) << "failed to query max grid dim y: " << ToString(res); return false; } *y = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device); + res = tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device); if (res != hipSuccess) { LOG(ERROR) << "failed to query max grid dim z: " << ToString(res); return false; @@ -1224,7 +1225,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, } /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) { - hipError_t res = hipDriverGetVersion(driver_version); + hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version); if (res != hipSuccess) { LOG(ERROR) << "failed to query driver version: " << ToString(res); return false; @@ -1235,7 +1236,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, /* static */ bool GpuDriver::GetDeviceProperties( hipDeviceProp_t* device_properties, int device_ordinal) { - hipError_t res = hipGetDeviceProperties(device_properties, device_ordinal); + hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties, device_ordinal); if (res != hipSuccess) { LOG(ERROR) << "failed to query device properties: " << ToString(res); return false; @@ -1268,7 +1269,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, ScopedActivateContext activation{context}; size_t free = 0; size_t total = 0; - hipError_t res = hipMemGetInfo(&free, &total); + hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total); if (res != hipSuccess) { LOG(ERROR) << "failed to query device memory info: " << ToString(res); return false; @@ -1282,7 +1283,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, /* static 
*/ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device, uint64* result) { size_t value = -1; - hipError_t res = hipDeviceTotalMem(&value, device); + hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device); if (res != hipSuccess) { LOG(ERROR) << "failed to query total available memory: " << ToString(res); return false; @@ -1297,7 +1298,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, static const int kBufferSize = 64; absl::InlinedVector chars(kBufferSize); chars[kBufferSize - 1] = '\0'; - hipError_t res = hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); + hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); if (res != hipSuccess) { LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res); return pci_bus_id; @@ -1313,7 +1314,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, } int can_access_peer = -1; - hipError_t res = hipDeviceCanAccessPeer( + hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer( &can_access_peer, from->device_ordinal(), to->device_ordinal()); if (res != hipSuccess) { LOG(ERROR) << "failed to detect peer access capability: " << ToString(res); @@ -1331,7 +1332,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, ScopedActivateContext activated{from}; hipError_t result = - hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */); + tensorflow::wrap::hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */); if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) { return port::Status{ port::error::INTERNAL, diff --git a/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h new file mode 100644 index 00000000000..0a0ab3ae745 --- /dev/null +++ b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h @@ -0,0 +1,147 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps rocm driver calls with dso loader so that we don't need to
+// have explicit linking to librocm. All TF rocm driver usage should route
+// through this wrapper.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "rocm/include/hip/hip_runtime.h"
+
+
+#if defined(TENSORFLOW_USE_ROCM)
+
+#endif
+
+namespace tensorflow {
+namespace wrap {
+#ifdef PLATFORM_GOOGLE
+// Use static linked library
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                          \
+  template <typename... Args>                                            \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
+    return ::hipSymbolName(args...);                                     \
+  }
+
+// This macro wraps a global identifier, given by hipSymbolName, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#else
+#define TO_STR_(x) #x
+#define TO_STR(x) TO_STR_(x)
+
+// hipMalloc and hipHostMalloc are defined as function templates in the
+// HIP header files, and hence their names get mangled and the attempt
+// to resolve their name when trying to dynamically load them will fail
+// Updating the HIP header files to make them C functions is underway.
+// Until that change flows through, we will workaround the issue by
+// creating dummy wrappers for them here
+
+hipError_t hipMallocVanilla(void** ptr, size_t size) {
+  return hipErrorNotInitialized;
+}
+
+hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
+  return hipErrorNotInitialized;
+}
+
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                                \
+  template <typename... Args>                                                  \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) {       \
+    using FuncPtrT = std::add_pointer<decltype(::hipSymbolName(args...))>::type; \
+    static FuncPtrT loaded = []() -> FuncPtrT {                                \
+      static const char *kName = TO_STR(hipSymbolName);                        \
+      void *f;                                                                 \
+      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary(    \
+          stream_executor::internal::CachedDsoLoader::GetHipDsoHandle()        \
+              .ValueOrDie(),                                                   \
+          kName, &f);                                                          \
+      CHECK(s.ok()) << "could not find " << kName                              \
+                    << " in HIP DSO; dlerror: " << s.error_message();          \
+      return reinterpret_cast<FuncPtrT>(f);                                    \
+    }();                                                                       \
+    return loaded(args...);                                                    \
+  }
+#endif
+
+// clang-format off
+#define HIP_ROUTINE_EACH(__macro)                   \
+  __macro(hipDeviceCanAccessPeer)                   \
+  __macro(hipDeviceEnablePeerAccess)                \
+  __macro(hipDeviceGet)                             \
+  __macro(hipDeviceGetAttribute)                    \
+  __macro(hipDeviceGetName)                         \
+  __macro(hipDeviceGetPCIBusId)                     \
+  __macro(hipDeviceGetSharedMemConfig)              \
+  __macro(hipDeviceSetSharedMemConfig)              \
+  __macro(hipDeviceSynchronize)                     \
+  __macro(hipDeviceTotalMem)                        \
+  __macro(hipDriverGetVersion)                      \
+  __macro(hipEventCreateWithFlags)                  \
+  __macro(hipEventElapsedTime)                      \
+  __macro(hipEventDestroy)                          \
+  __macro(hipEventQuery)                            \
+  __macro(hipEventRecord)                           \
+  __macro(hipEventSynchronize)                      \
+  __macro(hipFree)                                  \
+ 
__macro(hipFuncSetCacheConfig) \ + __macro(hipGetDevice) \ + __macro(hipGetDeviceCount) \ + __macro(hipGetDeviceProperties) \ + __macro(hipHostFree) \ + __macro(hipHostRegister) \ + __macro(hipHostUnregister) \ + __macro(hipInit) \ + __macro(hipMemGetAddressRange) \ + __macro(hipMemGetInfo) \ + __macro(hipMemcpyDtoD) \ + __macro(hipMemcpyDtoDAsync) \ + __macro(hipMemcpyDtoH) \ + __macro(hipMemcpyDtoHAsync) \ + __macro(hipMemcpyHtoD) \ + __macro(hipMemcpyHtoDAsync) \ + __macro(hipMemset) \ + __macro(hipMemsetAsync) \ + __macro(hipModuleGetFunction) \ + __macro(hipModuleGetGlobal) \ + __macro(hipModuleLaunchKernel) \ + __macro(hipModuleLoadData) \ + __macro(hipModuleUnload) \ + __macro(hipPointerGetAttributes) \ + __macro(hipSetDevice) \ + __macro(hipStreamAddCallback) \ + __macro(hipStreamCreateWithFlags) \ + __macro(hipStreamDestroy) \ + __macro(hipStreamQuery) \ + __macro(hipStreamSynchronize) \ + __macro(hipStreamWaitEvent) \ +// clang-format on + +HIP_ROUTINE_EACH(STREAM_EXECUTOR_HIP_WRAP) +#undef HIP_ROUTINE_EACH +#undef STREAM_EXECUTOR_HIP_WRAP +#undef TO_STR +#undef TO_STR_ +} // namespace wrap +} // namespace tensorflow + +#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_