From 834a3f7395a9db748349e0bf9dfff7af558cb4fb Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Fri, 1 Feb 2019 17:59:02 +0000 Subject: [PATCH] changing the rocm_driver API from being dynamically linked to being dynamically loaded --- tensorflow/stream_executor/rocm/BUILD | 149 +++++++++--------- .../stream_executor/rocm/rocm_driver.cc | 125 +++++++-------- .../rocm/rocm_driver_wrapper.h | 147 +++++++++++++++++ 3 files changed, 285 insertions(+), 136 deletions(-) create mode 100644 tensorflow/stream_executor/rocm/rocm_driver_wrapper.h diff --git a/tensorflow/stream_executor/rocm/BUILD b/tensorflow/stream_executor/rocm/BUILD index f0b05822703..5190b551f80 100644 --- a/tensorflow/stream_executor/rocm/BUILD +++ b/tensorflow/stream_executor/rocm/BUILD @@ -47,7 +47,7 @@ cc_library( cc_library( name = "rocm_driver", srcs = if_rocm_is_configured(["rocm_driver.cc"]), - hdrs = [], + hdrs = if_rocm_is_configured(["rocm_driver_wrapper.h"]), deps = if_rocm_is_configured([ ":rocm_diagnostics", "@com_google_absl//absl/base", @@ -57,6 +57,7 @@ cc_library( "//tensorflow/stream_executor/gpu:gpu_driver_header", "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", + "//tensorflow/stream_executor/platform:dso_loader", "@local_config_rocm//rocm:rocm_headers", ]), ) @@ -141,60 +142,60 @@ cc_library( ) cc_library( - name = "rocblas_plugin", - srcs = if_rocm_is_configured(["rocm_blas.cc"]), - hdrs = if_rocm_is_configured(["rocm_blas.h"]), - visibility = ["//visibility:public"], - deps = if_rocm_is_configured([ - ":rocm_gpu_executor", - ":rocm_platform_id", - "//third_party/eigen3", - "//tensorflow/core:lib_internal", - "//tensorflow/stream_executor", - "//tensorflow/stream_executor:event", - "//tensorflow/stream_executor:host_or_device_scalar", - "//tensorflow/stream_executor:plugin_registry", - "//tensorflow/stream_executor:scratch_allocator", - "//tensorflow/stream_executor:timer", - "//tensorflow/stream_executor/gpu:gpu_activation", - 
"//tensorflow/stream_executor/gpu:gpu_helpers_header", - "//tensorflow/stream_executor/gpu:gpu_stream_header", - "//tensorflow/stream_executor/gpu:gpu_timer_header", - "//tensorflow/stream_executor/lib", - "//tensorflow/stream_executor/platform", - "//tensorflow/stream_executor/platform:dso_loader", - "@com_google_absl//absl/strings", - "@local_config_rocm//rocm:rocm_headers", - ] + if_static([ - "@local_config_rocm//rocm:rocblas" - ])), - alwayslink = True, + name = "rocblas_plugin", + srcs = if_rocm_is_configured(["rocm_blas.cc"]), + hdrs = if_rocm_is_configured(["rocm_blas.h"]), + visibility = ["//visibility:public"], + deps = if_rocm_is_configured([ + ":rocm_gpu_executor", + ":rocm_platform_id", + "//third_party/eigen3", + "//tensorflow/core:lib_internal", + "//tensorflow/stream_executor", + "//tensorflow/stream_executor:event", + "//tensorflow/stream_executor:host_or_device_scalar", + "//tensorflow/stream_executor:plugin_registry", + "//tensorflow/stream_executor:scratch_allocator", + "//tensorflow/stream_executor:timer", + "//tensorflow/stream_executor/gpu:gpu_activation", + "//tensorflow/stream_executor/gpu:gpu_helpers_header", + "//tensorflow/stream_executor/gpu:gpu_stream_header", + "//tensorflow/stream_executor/gpu:gpu_timer_header", + "//tensorflow/stream_executor/lib", + "//tensorflow/stream_executor/platform", + "//tensorflow/stream_executor/platform:dso_loader", + "@com_google_absl//absl/strings", + "@local_config_rocm//rocm:rocm_headers", + ] + if_static([ + "@local_config_rocm//rocm:rocblas", + ])), + alwayslink = True, ) cc_library( - name = "rocfft_plugin", - srcs = if_rocm_is_configured(["rocm_fft.cc"]), - hdrs = if_rocm_is_configured(["rocm_fft.h"]), - visibility = ["//visibility:public"], - deps = if_rocm_is_configured([ - ":rocm_platform_id", - "//tensorflow/stream_executor:event", - "//tensorflow/stream_executor:fft", - "//tensorflow/stream_executor:plugin_registry", - "//tensorflow/stream_executor:scratch_allocator", - 
"//tensorflow/stream_executor/gpu:gpu_activation", - "//tensorflow/stream_executor/gpu:gpu_helpers_header", - "//tensorflow/stream_executor/gpu:gpu_executor_header", - "//tensorflow/stream_executor/gpu:gpu_stream_header", - "//tensorflow/stream_executor/gpu:gpu_kernel_header", - "//tensorflow/stream_executor/lib", - "//tensorflow/stream_executor/platform", - "//tensorflow/stream_executor/platform:dso_loader", - "@local_config_rocm//rocm:rocm_headers", - ] + if_static([ - "@local_config_rocm//rocm:rocfft" - ])), - alwayslink = True, + name = "rocfft_plugin", + srcs = if_rocm_is_configured(["rocm_fft.cc"]), + hdrs = if_rocm_is_configured(["rocm_fft.h"]), + visibility = ["//visibility:public"], + deps = if_rocm_is_configured([ + ":rocm_platform_id", + "//tensorflow/stream_executor:event", + "//tensorflow/stream_executor:fft", + "//tensorflow/stream_executor:plugin_registry", + "//tensorflow/stream_executor:scratch_allocator", + "//tensorflow/stream_executor/gpu:gpu_activation", + "//tensorflow/stream_executor/gpu:gpu_helpers_header", + "//tensorflow/stream_executor/gpu:gpu_executor_header", + "//tensorflow/stream_executor/gpu:gpu_stream_header", + "//tensorflow/stream_executor/gpu:gpu_kernel_header", + "//tensorflow/stream_executor/lib", + "//tensorflow/stream_executor/platform", + "//tensorflow/stream_executor/platform:dso_loader", + "@local_config_rocm//rocm:rocm_headers", + ] + if_static([ + "@local_config_rocm//rocm:rocfft", + ])), + alwayslink = True, ) # FIXME: enable in future PRs @@ -237,28 +238,28 @@ cc_library( #) cc_library( - name = "rocrand_plugin", - srcs = if_rocm_is_configured(["rocm_rng.cc"]), - hdrs = if_rocm_is_configured([]), - deps = if_rocm_is_configured([ - ":rocm_gpu_executor", - ":rocm_platform_id", - "@local_config_rocm//rocm:rocm_headers", - "//tensorflow/stream_executor:event", - "//tensorflow/stream_executor:plugin_registry", - "//tensorflow/stream_executor:rng", - "//tensorflow/stream_executor/gpu:gpu_activation_header", - 
"//tensorflow/stream_executor/gpu:gpu_helpers_header", - "//tensorflow/stream_executor/gpu:gpu_executor_header", - "//tensorflow/stream_executor/gpu:gpu_rng_header", - "//tensorflow/stream_executor/gpu:gpu_stream_header", - "//tensorflow/stream_executor/lib", - "//tensorflow/stream_executor/platform", - "//tensorflow/stream_executor/platform:dso_loader", - ] + if_static([ - "@local_config_rocm//rocm:hiprand" - ])), - alwayslink = True, + name = "rocrand_plugin", + srcs = if_rocm_is_configured(["rocm_rng.cc"]), + hdrs = if_rocm_is_configured([]), + deps = if_rocm_is_configured([ + ":rocm_gpu_executor", + ":rocm_platform_id", + "@local_config_rocm//rocm:rocm_headers", + "//tensorflow/stream_executor:event", + "//tensorflow/stream_executor:plugin_registry", + "//tensorflow/stream_executor:rng", + "//tensorflow/stream_executor/gpu:gpu_activation_header", + "//tensorflow/stream_executor/gpu:gpu_helpers_header", + "//tensorflow/stream_executor/gpu:gpu_executor_header", + "//tensorflow/stream_executor/gpu:gpu_rng_header", + "//tensorflow/stream_executor/gpu:gpu_stream_header", + "//tensorflow/stream_executor/lib", + "//tensorflow/stream_executor/platform", + "//tensorflow/stream_executor/platform:dso_loader", + ] + if_static([ + "@local_config_rocm//rocm:hiprand", + ])), + alwayslink = True, ) cc_library( diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc index 39d52d28304..73b1b350f71 100644 --- a/tensorflow/stream_executor/rocm/rocm_driver.cc +++ b/tensorflow/stream_executor/rocm/rocm_driver.cc @@ -36,6 +36,7 @@ limitations under the License. 
#include "tensorflow/stream_executor/platform/logging.h" #include "tensorflow/stream_executor/platform/mutex.h" #include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h" bool FLAGS_gpuexec_rocm_driver_inject_init_error = false; bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false; @@ -143,7 +144,7 @@ string MemorySpaceString(MemorySpace memory_space) { // HIP driver (e.g., this value is not our cached view of the current device). static int CurrentDeviceOrDie() { int current = -1; - hipError_t result = hipGetDevice(¤t); + hipError_t result = tensorflow::wrap::hipGetDevice(¤t); if (result != hipSuccess) { LOG(FATAL) << "failed to query current device: " << ToString(result); } @@ -154,7 +155,7 @@ namespace { // Call hipDeviceSynchronize and crash if it doesn't succeed. void SynchronizeOrDie() { - auto res = hipDeviceSynchronize(); + auto res = tensorflow::wrap::hipDeviceSynchronize(); if (res != hipSuccess) { LOG(FATAL) << "Synchronize found " << ToString(res) << " :: " << port::CurrentStackTrace(); @@ -197,7 +198,7 @@ ScopedActivateContext::ScopedActivateContext(GpuContext* context) { << tls->current_device_ordinal << " to " << context->device_ordinal(); // Set the device and update thread local. - CHECK_EQ(hipSuccess, hipSetDevice(context->device_ordinal())); + CHECK_EQ(hipSuccess, tensorflow::wrap::hipSetDevice(context->device_ordinal())); tls->current_device_ordinal = context->device_ordinal(); } @@ -225,7 +226,7 @@ ScopedActivateContext::~ScopedActivateContext() { << to_restore_->device_ordinal(); // Set context and update thread local. - CHECK_EQ(hipSuccess, hipSetDevice(to_restore_->device_ordinal())); + CHECK_EQ(hipSuccess, tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal())); tls->current_device_ordinal = to_restore_->device_ordinal(); } @@ -261,7 +262,7 @@ string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) { // in the process of querying. 
string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) { hipPointerAttribute_t from_pointerAttributes; - hipError_t result = hipPointerGetAttributes(&from_pointerAttributes, from); + hipError_t result = tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from); if (result != hipSuccess) { LOG(ERROR) << "could not retrieve source pointer's device: " << ToString(result); @@ -269,7 +270,7 @@ string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) { } hipPointerAttribute_t to_pointerAttributes; - result = hipPointerGetAttributes(&to_pointerAttributes, to); + result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to); if (result != hipSuccess) { LOG(ERROR) << "could not retrieve destination pointer's device: " << ToString(result); @@ -289,7 +290,7 @@ static port::Status InternalInit() { if (FLAGS_gpuexec_rocm_driver_inject_init_error) { LOG(ERROR) << "injecting ROCM init error; initialization will fail"; } else { - res = hipInit(0 /* = flags */); + res = tensorflow::wrap::hipInit(0 /* = flags */); } if (res == hipSuccess) { @@ -322,7 +323,7 @@ static port::Status InternalInit() { /* static */ port::Status GpuDriver::GetDevice(int device_ordinal, hipDevice_t* device) { - hipError_t res = hipDeviceGet(device, device_ordinal); + hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal); if (res == hipSuccess) { return port::Status::OK(); } @@ -336,7 +337,7 @@ static port::Status InternalInit() { string* device_name) { static const size_t kCharLimit = 64; absl::InlinedVector chars(kCharLimit); - hipError_t res = hipDeviceGetName(chars.begin(), kCharLimit - 1, device); + hipError_t res = tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device); if (res != hipSuccess) { LOG(ERROR) << "failed to get device name for " << device << ": " << ToString(res); @@ -382,7 +383,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options, /* static */ bool 
GpuDriver::FuncSetCacheConfig(hipFunction_t function, hipFuncCache_t cache_config) { - hipError_t res = hipFuncSetCacheConfig(function, cache_config); + hipError_t res = tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config); if (res != hipSuccess) { LOG(ERROR) << "failed to set ROCM kernel cache config. kernel: " << function << ", config: " << cache_config << ", result: " << ToString(res); @@ -396,7 +397,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options, GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { hipSharedMemConfig shared_mem_config; ScopedActivateContext activation{context}; - hipError_t result = hipDeviceGetSharedMemConfig(&shared_mem_config); + hipError_t result = tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config); if (result != hipSuccess) { LOG(ERROR) << "failed to get ROCM device shared memory config. " << "Context device ID: " << context->device_ordinal() @@ -411,7 +412,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::Status GpuDriver::ContextSetSharedMemConfig( GpuContext* context, hipSharedMemConfig shared_mem_config) { ScopedActivateContext activation{context}; - hipError_t result = hipDeviceSetSharedMemConfig(shared_mem_config); + hipError_t result = tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config); if (result != hipSuccess) { LOG(ERROR) << "failed to set ROCM device shared memory config. 
" << "Context device ID: " << context->device_ordinal() @@ -435,7 +436,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z << " bdx: " << block_dim_x << " bdy: " << block_dim_y << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes; - hipError_t res = hipModuleLaunchKernel( + hipError_t res = tensorflow::wrap::hipModuleLaunchKernel( function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, block_dim_z, shared_mem_bytes, stream, kernel_params, extra); if (res != hipSuccess) { @@ -471,7 +472,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { ScopedActivateContext activation{context}; void* hsaco_data = const_cast(hsaco_contents); - hipError_t res = hipModuleLoadData(module, hsaco_data); + hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data); if (res != hipSuccess) { LOG(ERROR) << "failed to load HSACO: " << ToString(res); @@ -491,7 +492,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { hipDeviceptr_t location, uint8 value, size_t size) { ScopedActivateContext activation{context}; - hipError_t res = hipMemset(location, value, size); + hipError_t res = tensorflow::wrap::hipMemset(location, value, size); if (res != hipSuccess) { LOG(ERROR) << "failed to memset memory: " << ToString(res); return false; @@ -513,7 +514,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { return false; } hipError_t res = - hipMemset(pointer, static_cast(value), uint32_count * 4); + tensorflow::wrap::hipMemset(pointer, static_cast(value), uint32_count * 4); if (res != hipSuccess) { LOG(ERROR) << "failed to memset memory: " << ToString(res); return false; @@ -527,7 +528,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { size_t uint32_count, GpuStreamHandle stream) { ScopedActivateContext activation{context}; - hipError_t res = hipMemsetAsync(location, value, uint32_count, stream); + hipError_t res = 
tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream); if (res != hipSuccess) { LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); return false; @@ -552,7 +553,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { LOG(ERROR) << "failed to memset memory"; return false; } - hipError_t res = hipMemsetAsync(pointer, value, uint32_count * 4, stream); + hipError_t res = tensorflow::wrap::hipMemsetAsync(pointer, value, uint32_count * 4, stream); if (res != hipSuccess) { LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res); return false; @@ -565,7 +566,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream, StreamCallback callback, void* data) { - hipError_t res = hipStreamAddCallback(stream, (hipStreamCallback_t)callback, + hipError_t res = tensorflow::wrap::hipStreamAddCallback(stream, (hipStreamCallback_t)callback, data, 0 /* = flags */); if (res != hipSuccess) { LOG(ERROR) << "unable to add host callback: " << ToString(res); @@ -580,7 +581,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { hipFunction_t* function) { ScopedActivateContext activated{context}; CHECK(module != nullptr && kernel_name != nullptr); - hipError_t res = hipModuleGetFunction(function, module, kernel_name); + hipError_t res = tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name); if (res != hipSuccess) { LOG(ERROR) << "failed to get kernel \"" << kernel_name << "\" from module: " << ToString(res); @@ -598,7 +599,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { ScopedActivateContext activated{context}; CHECK(module != nullptr && symbol_name != nullptr && (dptr != nullptr || bytes != nullptr)); - hipError_t res = hipModuleGetGlobal(dptr, bytes, module, symbol_name); + hipError_t res = tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name); if (res != hipSuccess) { // symbol may not be found in the current module, but it may 
reside in // another module. @@ -613,7 +614,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ void GpuDriver::UnloadModule(GpuContext* context, hipModule_t module) { ScopedActivateContext activated{context}; - hipError_t res = hipModuleUnload(module); + hipError_t res = tensorflow::wrap::hipModuleUnload(module); if (res != hipSuccess) { LOG(ERROR) << "failed to unload module " << module << "; leaking: " << ToString(res); @@ -623,7 +624,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ bool GpuDriver::CreateStream(GpuContext* context, GpuStreamHandle* stream) { ScopedActivateContext activated{context}; - hipError_t res = hipStreamCreateWithFlags( + hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags( stream, hipStreamDefault); // switch to hipStreamNonBlocking? if (res != hipSuccess) { LOG(ERROR) << "could not allocate ROCM stream for device " @@ -643,7 +644,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } ScopedActivateContext activated{context}; - hipError_t res = hipStreamDestroy(*stream); + hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream); if (res != hipSuccess) { LOG(ERROR) << "failed to destroy ROCM stream for device " << context->device_ordinal() << ": " << ToString(res); @@ -658,7 +659,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { uint64 bytes) { ScopedActivateContext activated{context}; hipDeviceptr_t result = 0; - hipError_t res = hipMalloc(&result, bytes); + hipError_t res = tensorflow::wrap::hipMallocVanilla(&result, bytes); if (res != hipSuccess) { LOG(ERROR) << "failed to allocate " << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes @@ -675,7 +676,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { void* location) { ScopedActivateContext activation{context}; hipDeviceptr_t pointer = absl::bit_cast(location); - hipError_t res = hipFree(pointer); + hipError_t res = tensorflow::wrap::hipFree(pointer); if (res != 
hipSuccess) { LOG(ERROR) << "failed to free device memory at " << location << "; result: " << ToString(res); @@ -704,7 +705,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { ScopedActivateContext activation{context}; void* host_mem = nullptr; // "Portable" memory is visible to all ROCM contexts. Safe for our use model. - hipError_t res = hipHostMalloc(&host_mem, bytes, hipHostMallocPortable); + hipError_t res = tensorflow::wrap::hipHostMallocVanilla(&host_mem, bytes, hipHostMallocPortable); if (res != hipSuccess) { LOG(ERROR) << "failed to alloc " << bytes << " bytes on host: " << ToString(res); @@ -715,7 +716,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ void GpuDriver::HostDeallocate(GpuContext* context, void* location) { ScopedActivateContext activation{context}; - hipError_t res = hipHostFree(location); + hipError_t res = tensorflow::wrap::hipHostFree(location); if (res != hipSuccess) { LOG(ERROR) << "error deallocating host memory at " << location << ": " << ToString(res); @@ -726,7 +727,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { uint64 bytes) { ScopedActivateContext activation{context}; // "Portable" memory is visible to all ROCM contexts. Safe for our use model. 
- hipError_t res = hipHostRegister(location, bytes, hipHostRegisterPortable); + hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes, hipHostRegisterPortable); if (res != hipSuccess) { LOG(ERROR) << "error registering host memory at " << location << ": " << ToString(res); @@ -738,7 +739,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ bool GpuDriver::HostUnregister(GpuContext* context, void* location) { ScopedActivateContext activation{context}; - hipError_t res = hipHostUnregister(location); + hipError_t res = tensorflow::wrap::hipHostUnregister(location); if (res != hipSuccess) { LOG(ERROR) << "error unregistering host memory at " << location << ": " << ToString(res); @@ -755,7 +756,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } ScopedActivateContext activated{context}; - hipError_t res = hipEventDestroy(*event); + hipError_t res = tensorflow::wrap::hipEventDestroy(*event); *event = nullptr; switch (res) { @@ -779,7 +780,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuEventHandle event, GpuStreamHandle stream) { ScopedActivateContext activated{context}; - hipError_t res = hipEventRecord(event, stream); + hipError_t res = tensorflow::wrap::hipEventRecord(event, stream); switch (res) { case hipSuccess: return port::Status::OK(); @@ -800,7 +801,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::StatusOr GpuDriver::QueryEvent( GpuContext* context, GpuEventHandle event) { ScopedActivateContext activated{context}; - hipError_t res = hipEventQuery(event); + hipError_t res = tensorflow::wrap::hipEventQuery(event); if (res != hipSuccess && res != hipErrorNotReady) { return port::Status{ port::error::INTERNAL, @@ -817,12 +818,12 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { ScopedActivateContext activated{context}; // The stop event must have completed in order for hipEventElapsedTime to // work. 
- hipError_t res = hipEventSynchronize(stop); + hipError_t res = tensorflow::wrap::hipEventSynchronize(stop); if (res != hipSuccess) { LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res); return false; } - res = hipEventElapsedTime(elapsed_milliseconds, start, stop); + res = tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop); if (res != hipSuccess) { LOG(ERROR) << "failed to get elapsed time between events: " << ToString(res); @@ -836,7 +837,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream, GpuEventHandle event) { ScopedActivateContext activation{context}; - hipError_t res = hipStreamWaitEvent(stream, event, 0 /* = flags */); + hipError_t res = tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */); if (res != hipSuccess) { LOG(ERROR) << "could not wait stream on event: " << ToString(res); return false; @@ -847,7 +848,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) { ScopedActivateContext activation{context}; - hipError_t res = hipDeviceSynchronize(); + hipError_t res = tensorflow::wrap::hipDeviceSynchronize(); if (res != hipSuccess) { LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res) << " :: " << port::CurrentStackTrace(); @@ -861,7 +862,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream) { ScopedActivateContext activated{context}; CHECK(stream != nullptr); - hipError_t res = hipStreamSynchronize(stream); + hipError_t res = tensorflow::wrap::hipStreamSynchronize(stream); if (res != hipSuccess) { port::Status status = port::InternalError( absl::StrCat("could not synchronize on ROCM stream: ", ToString(res))); @@ -877,7 +878,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream) { ScopedActivateContext activated{context}; CHECK(stream != nullptr); - hipError_t res = hipStreamQuery(stream); + 
hipError_t res = tensorflow::wrap::hipStreamQuery(stream); if (res == hipSuccess) { return true; } @@ -891,7 +892,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::Status GpuDriver::SynchronousMemcpyD2H( GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) { ScopedActivateContext activation{context}; - hipError_t res = hipMemcpyDtoH(host_dst, gpu_src, size); + hipError_t res = tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size); if (res != hipSuccess) { return port::InternalError( absl::StrFormat("failed to synchronous memcpy from device to host: %s; " @@ -908,7 +909,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src, uint64 size) { ScopedActivateContext activation{context}; - hipError_t res = hipMemcpyHtoD(gpu_dst, const_cast(host_src), size); + hipError_t res = tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast(host_src), size); if (res != hipSuccess) { return port::InternalError(absl::StrFormat( "failed to synchronous memcpy from host to device: %s; Gpu dst: %p;" @@ -924,7 +925,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src, uint64 size) { ScopedActivateContext activation{context}; - hipError_t res = hipMemcpyDtoD(gpu_dst, gpu_src, size); + hipError_t res = tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size); if (res != hipSuccess) { return port::InternalError(absl::StrFormat( "failed to synchronous memcpy from host to device: %s; Gpu dst: %p; " @@ -942,7 +943,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { uint64 size, GpuStreamHandle stream) { ScopedActivateContext activation{context}; - hipError_t res = hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream); + hipError_t res = tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream); if (res != hipSuccess) { LOG(ERROR) << absl::StrFormat( "failed to 
enqueue async memcpy from device to host: %s; host dst: %p; " @@ -964,7 +965,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { GpuStreamHandle stream) { ScopedActivateContext activation{context}; hipError_t res = - hipMemcpyHtoDAsync(gpu_dst, const_cast(host_src), size, stream); + tensorflow::wrap::hipMemcpyHtoDAsync(gpu_dst, const_cast(host_src), size, stream); if (res != hipSuccess) { LOG(ERROR) << absl::StrFormat( "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; " @@ -984,7 +985,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { uint64 size, GpuStreamHandle stream) { ScopedActivateContext activation{context}; - hipError_t result = hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream); + hipError_t result = tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream); if (result != hipSuccess) { LOG(ERROR) << absl::StrFormat( "failed to enqueue async memcpy from device to device: %s" @@ -1021,7 +1022,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } ScopedActivateContext activated{context}; - hipError_t res = hipEventCreateWithFlags(event, hipflags); + hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags); if (res == hipSuccess) { return port::Status::OK(); @@ -1037,7 +1038,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ int GpuDriver::GetDeviceCount() { int device_count = 0; - hipError_t res = hipGetDeviceCount(&device_count); + hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count); if (res != hipSuccess) { LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res); return 0; @@ -1061,7 +1062,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::Status GpuDriver::GetPointerAddressRange( hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) { - hipError_t result = hipMemGetAddressRange(base, size, dptr); + hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, 
dptr); if (result == hipSuccess) { return port::Status::OK(); } else if (result == hipErrorNotFound) { @@ -1106,7 +1107,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::StatusOr GpuDriver::GetPointerDevice( hipDeviceptr_t pointer) { hipPointerAttribute_t pointerAttributes; - hipError_t result = hipPointerGetAttributes(&pointerAttributes, pointer); + hipError_t result = tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer); if (result != hipSuccess) { return port::Status{ port::error::INTERNAL, @@ -1114,7 +1115,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } hipDevice_t device; - result = hipDeviceGet(&device, pointerAttributes.device); + result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device); if (result != hipSuccess) { return port::Status{ port::error::INTERNAL, @@ -1127,7 +1128,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version, hipDevice_t device) { hipDeviceProp_t props; - hipError_t result = hipGetDeviceProperties(&props, device); + hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device); if (result == hipSuccess) { *version = props.gcnArch; return port::Status::OK(); @@ -1145,7 +1146,7 @@ template static port::StatusOr GetSimpleAttribute(hipDevice_t device, hipDeviceAttribute_t attribute) { int value = -1; - hipError_t result = hipDeviceGetAttribute(&value, attribute, device); + hipError_t result = tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device); if (result != hipSuccess) { return port::Status{ port::error::NOT_FOUND, @@ -1200,21 +1201,21 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, hipDevice_t device) { int value; hipError_t res = - hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device); + tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device); if (res != hipSuccess) { LOG(ERROR) << 
"failed to query max grid dim x: " << ToString(res); return false; } *x = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device); + res = tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device); if (res != hipSuccess) { LOG(ERROR) << "failed to query max grid dim y: " << ToString(res); return false; } *y = value; - res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device); + res = tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device); if (res != hipSuccess) { LOG(ERROR) << "failed to query max grid dim z: " << ToString(res); return false; @@ -1224,7 +1225,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, } /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) { - hipError_t res = hipDriverGetVersion(driver_version); + hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version); if (res != hipSuccess) { LOG(ERROR) << "failed to query driver version: " << ToString(res); return false; @@ -1235,7 +1236,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, /* static */ bool GpuDriver::GetDeviceProperties( hipDeviceProp_t* device_properties, int device_ordinal) { - hipError_t res = hipGetDeviceProperties(device_properties, device_ordinal); + hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties, device_ordinal); if (res != hipSuccess) { LOG(ERROR) << "failed to query device properties: " << ToString(res); return false; @@ -1268,7 +1269,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, ScopedActivateContext activation{context}; size_t free = 0; size_t total = 0; - hipError_t res = hipMemGetInfo(&free, &total); + hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total); if (res != hipSuccess) { LOG(ERROR) << "failed to query device memory info: " << ToString(res); return false; @@ -1282,7 +1283,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, /* static 
*/ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device, uint64* result) { size_t value = -1; - hipError_t res = hipDeviceTotalMem(&value, device); + hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device); if (res != hipSuccess) { LOG(ERROR) << "failed to query total available memory: " << ToString(res); return false; @@ -1297,7 +1298,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, static const int kBufferSize = 64; absl::InlinedVector chars(kBufferSize); chars[kBufferSize - 1] = '\0'; - hipError_t res = hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); + hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device); if (res != hipSuccess) { LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res); return pci_bus_id; @@ -1313,7 +1314,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, } int can_access_peer = -1; - hipError_t res = hipDeviceCanAccessPeer( + hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer( &can_access_peer, from->device_ordinal(), to->device_ordinal()); if (res != hipSuccess) { LOG(ERROR) << "failed to detect peer access capability: " << ToString(res); @@ -1331,7 +1332,7 @@ static port::StatusOr GetSimpleAttribute(hipDevice_t device, ScopedActivateContext activated{from}; hipError_t result = - hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */); + tensorflow::wrap::hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */); if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) { return port::Status{ port::error::INTERNAL, diff --git a/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h new file mode 100644 index 00000000000..0a0ab3ae745 --- /dev/null +++ b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h @@ -0,0 +1,147 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps rocm driver calls with dso loader so that we don't need to
+// have explicit linking to librocm. All TF rocm driver usage should route
+// through this wrapper.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "rocm/include/hip/hip_runtime.h"
+
+
+#if defined(TENSORFLOW_USE_ROCM)
+
+#endif
+
+namespace tensorflow {
+namespace wrap {
+#ifdef PLATFORM_GOOGLE
+// Use static linked library
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                          \
+  template <typename... Args>                                            \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
+    return ::hipSymbolName(args...);                                     \
+  }
+
+// This macro wraps a global identifier, given by hipSymbolName, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#else
+#define TO_STR_(x) #x
+#define TO_STR(x) TO_STR_(x)
+
+// hipMalloc and hipHostMalloc are defined as function templates in the
+// HIP header files, and hence their names get mangled and the attempt
+// to resolve their name when trying to dynamically load them will fail
+// Updating the HIP header files to make them C functions is underway.
+// Until that change flows through, we will workaround the issue by
+// creating dummy wrappers for them here
+
+hipError_t hipMallocVanilla(void** ptr, size_t size) {
+  return hipErrorNotInitialized;
+}
+
+hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
+  return hipErrorNotInitialized;
+}
+
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                                \
+  template <typename... Args>                                                  \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) {       \
+    using FuncPtrT = std::add_pointer<decltype(::hipSymbolName(args...))>::type; \
+    static FuncPtrT loaded = []() -> FuncPtrT {                                \
+      static const char *kName = TO_STR(hipSymbolName);                        \
+      void *f;                                                                 \
+      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary(    \
+          stream_executor::internal::CachedDsoLoader::GetHipDsoHandle()        \
+              .ValueOrDie(),                                                   \
+          kName, &f);                                                          \
+      CHECK(s.ok()) << "could not find " << kName                              \
+                    << " in HIP DSO; dlerror: " << s.error_message();          \
+      return reinterpret_cast<FuncPtrT>(f);                                    \
+    }();                                                                       \
+    return loaded(args...);                                                    \
+  }
+#endif
+
+// clang-format off
+#define HIP_ROUTINE_EACH(__macro)                   \
+  __macro(hipDeviceCanAccessPeer)                   \
+  __macro(hipDeviceEnablePeerAccess)                \
+  __macro(hipDeviceGet)                             \
+  __macro(hipDeviceGetAttribute)                    \
+  __macro(hipDeviceGetName)                         \
+  __macro(hipDeviceGetPCIBusId)                     \
+  __macro(hipDeviceGetSharedMemConfig)              \
+  __macro(hipDeviceSetSharedMemConfig)              \
+  __macro(hipDeviceSynchronize)                     \
+  __macro(hipDeviceTotalMem)                        \
+  __macro(hipDriverGetVersion)                      \
+  __macro(hipEventCreateWithFlags)                  \
+  __macro(hipEventElapsedTime)                      \
+  __macro(hipEventDestroy)                          \
+  __macro(hipEventQuery)                            \
+  __macro(hipEventRecord)                           \
+  __macro(hipEventSynchronize)                      \
+  __macro(hipFree)                                  \
+ 
__macro(hipFuncSetCacheConfig) \ + __macro(hipGetDevice) \ + __macro(hipGetDeviceCount) \ + __macro(hipGetDeviceProperties) \ + __macro(hipHostFree) \ + __macro(hipHostRegister) \ + __macro(hipHostUnregister) \ + __macro(hipInit) \ + __macro(hipMemGetAddressRange) \ + __macro(hipMemGetInfo) \ + __macro(hipMemcpyDtoD) \ + __macro(hipMemcpyDtoDAsync) \ + __macro(hipMemcpyDtoH) \ + __macro(hipMemcpyDtoHAsync) \ + __macro(hipMemcpyHtoD) \ + __macro(hipMemcpyHtoDAsync) \ + __macro(hipMemset) \ + __macro(hipMemsetAsync) \ + __macro(hipModuleGetFunction) \ + __macro(hipModuleGetGlobal) \ + __macro(hipModuleLaunchKernel) \ + __macro(hipModuleLoadData) \ + __macro(hipModuleUnload) \ + __macro(hipPointerGetAttributes) \ + __macro(hipSetDevice) \ + __macro(hipStreamAddCallback) \ + __macro(hipStreamCreateWithFlags) \ + __macro(hipStreamDestroy) \ + __macro(hipStreamQuery) \ + __macro(hipStreamSynchronize) \ + __macro(hipStreamWaitEvent) \ +// clang-format on + +HIP_ROUTINE_EACH(STREAM_EXECUTOR_HIP_WRAP) +#undef HIP_ROUTINE_EACH +#undef STREAM_EXECUTOR_HIP_WRAP +#undef TO_STR +#undef TO_STR_ +} // namespace wrap +} // namespace tensorflow + +#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_