changing the rocm_driver API from being dynamically linked to being dynamically loaded

Deven Desai 2019-02-01 17:59:02 +00:00
parent 07b3f34199
commit 834a3f7395
3 changed files with 285 additions and 136 deletions
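Summary: instead of linking stream_executor against the HIP runtime at build time, every HIP driver entry point is now routed through a tensorflow::wrap:: shim that resolves the symbol out of the HIP DSO on first use. As a minimal sketch of that technique, using plain POSIX dlopen/dlsym rather than the CachedDsoLoader / Env::GetSymbolFromLibrary helpers the commit actually routes through (the library name "libhip_hcc.so" is an assumption for the example; see rocm_driver_wrapper.h below for the real mechanism):

// Illustrative sketch only, NOT code from this commit.
#include <dlfcn.h>

#include <cstdio>
#include <cstdlib>

#include "hip/hip_runtime.h"  // for hipError_t and the ::hipGetDevice signature

template <typename FuncPtrT>
FuncPtrT LoadHipSymbolOrDie(const char* name) {
  // C++11 magic statics make both the dlopen and the dlsym one-time and
  // thread-safe, mirroring the static initializer lambda in the real wrapper.
  static void* handle = dlopen("libhip_hcc.so", RTLD_NOW | RTLD_LOCAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "could not load HIP DSO: %s\n", dlerror());
    std::abort();
  }
  void* sym = dlsym(handle, name);
  if (sym == nullptr) {
    std::fprintf(stderr, "could not find %s in HIP DSO: %s\n", name, dlerror());
    std::abort();
  }
  return reinterpret_cast<FuncPtrT>(sym);
}

// Same signature as the real entry point, but resolved on the first call
// instead of at dynamic-link time, so a binary built this way can start on a
// machine without the HIP runtime installed and only fails if a HIP call is
// actually made.
hipError_t WrappedHipGetDevice(int* device) {
  static auto real = LoadHipSymbolOrDie<hipError_t (*)(int*)>("hipGetDevice");
  return real(device);
}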

tensorflow/stream_executor/rocm/BUILD

@@ -47,7 +47,7 @@ cc_library(
 cc_library(
     name = "rocm_driver",
     srcs = if_rocm_is_configured(["rocm_driver.cc"]),
-    hdrs = [],
+    hdrs = if_rocm_is_configured(["rocm_driver_wrapper.h"]),
     deps = if_rocm_is_configured([
         ":rocm_diagnostics",
         "@com_google_absl//absl/base",
@@ -57,6 +57,7 @@ cc_library(
         "//tensorflow/stream_executor/gpu:gpu_driver_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
         "@local_config_rocm//rocm:rocm_headers",
     ]),
 )
@@ -141,60 +142,60 @@ cc_library(
 )
 
 cc_library(
     name = "rocblas_plugin",
     srcs = if_rocm_is_configured(["rocm_blas.cc"]),
     hdrs = if_rocm_is_configured(["rocm_blas.h"]),
     visibility = ["//visibility:public"],
     deps = if_rocm_is_configured([
         ":rocm_gpu_executor",
         ":rocm_platform_id",
         "//third_party/eigen3",
         "//tensorflow/core:lib_internal",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:host_or_device_scalar",
         "//tensorflow/stream_executor:plugin_registry",
         "//tensorflow/stream_executor:scratch_allocator",
         "//tensorflow/stream_executor:timer",
         "//tensorflow/stream_executor/gpu:gpu_activation",
         "//tensorflow/stream_executor/gpu:gpu_helpers_header",
         "//tensorflow/stream_executor/gpu:gpu_stream_header",
         "//tensorflow/stream_executor/gpu:gpu_timer_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
         "@com_google_absl//absl/strings",
         "@local_config_rocm//rocm:rocm_headers",
     ] + if_static([
-        "@local_config_rocm//rocm:rocblas"
+        "@local_config_rocm//rocm:rocblas",
     ])),
     alwayslink = True,
 )
 
 cc_library(
     name = "rocfft_plugin",
     srcs = if_rocm_is_configured(["rocm_fft.cc"]),
     hdrs = if_rocm_is_configured(["rocm_fft.h"]),
     visibility = ["//visibility:public"],
     deps = if_rocm_is_configured([
         ":rocm_platform_id",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:fft",
         "//tensorflow/stream_executor:plugin_registry",
         "//tensorflow/stream_executor:scratch_allocator",
         "//tensorflow/stream_executor/gpu:gpu_activation",
         "//tensorflow/stream_executor/gpu:gpu_helpers_header",
         "//tensorflow/stream_executor/gpu:gpu_executor_header",
         "//tensorflow/stream_executor/gpu:gpu_stream_header",
         "//tensorflow/stream_executor/gpu:gpu_kernel_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
         "@local_config_rocm//rocm:rocm_headers",
     ] + if_static([
-        "@local_config_rocm//rocm:rocfft"
+        "@local_config_rocm//rocm:rocfft",
     ])),
     alwayslink = True,
 )
 
 # FIXME: enable in future PRs
@@ -237,28 +238,28 @@ cc_library(
 #)
 
 cc_library(
     name = "rocrand_plugin",
     srcs = if_rocm_is_configured(["rocm_rng.cc"]),
     hdrs = if_rocm_is_configured([]),
     deps = if_rocm_is_configured([
         ":rocm_gpu_executor",
         ":rocm_platform_id",
         "@local_config_rocm//rocm:rocm_headers",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:plugin_registry",
         "//tensorflow/stream_executor:rng",
         "//tensorflow/stream_executor/gpu:gpu_activation_header",
         "//tensorflow/stream_executor/gpu:gpu_helpers_header",
         "//tensorflow/stream_executor/gpu:gpu_executor_header",
         "//tensorflow/stream_executor/gpu:gpu_rng_header",
         "//tensorflow/stream_executor/gpu:gpu_stream_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
     ] + if_static([
-        "@local_config_rocm//rocm:hiprand"
+        "@local_config_rocm//rocm:hiprand",
     ])),
     alwayslink = True,
 )
 
 cc_library(

tensorflow/stream_executor/rocm/rocm_driver.cc

@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
 
 bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
 bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
@@ -143,7 +144,7 @@ string MemorySpaceString(MemorySpace memory_space) {
 // HIP driver (e.g., this value is not our cached view of the current device).
 static int CurrentDeviceOrDie() {
   int current = -1;
-  hipError_t result = hipGetDevice(&current);
+  hipError_t result = tensorflow::wrap::hipGetDevice(&current);
   if (result != hipSuccess) {
     LOG(FATAL) << "failed to query current device: " << ToString(result);
   }
@@ -154,7 +155,7 @@ namespace {
 
 // Call hipDeviceSynchronize and crash if it doesn't succeed.
 void SynchronizeOrDie() {
-  auto res = hipDeviceSynchronize();
+  auto res = tensorflow::wrap::hipDeviceSynchronize();
   if (res != hipSuccess) {
     LOG(FATAL) << "Synchronize found " << ToString(res)
                << " :: " << port::CurrentStackTrace();
@@ -197,7 +198,7 @@ ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
           << tls->current_device_ordinal << " to " << context->device_ordinal();
 
   // Set the device and update thread local.
-  CHECK_EQ(hipSuccess, hipSetDevice(context->device_ordinal()));
+  CHECK_EQ(hipSuccess, tensorflow::wrap::hipSetDevice(context->device_ordinal()));
   tls->current_device_ordinal = context->device_ordinal();
 }
@@ -225,7 +226,7 @@ ScopedActivateContext::~ScopedActivateContext() {
           << to_restore_->device_ordinal();
 
   // Set context and update thread local.
-  CHECK_EQ(hipSuccess, hipSetDevice(to_restore_->device_ordinal()));
+  CHECK_EQ(hipSuccess, tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
   tls->current_device_ordinal = to_restore_->device_ordinal();
 }
@@ -261,7 +262,7 @@ string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
 // in the process of querying.
 string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
   hipPointerAttribute_t from_pointerAttributes;
-  hipError_t result = hipPointerGetAttributes(&from_pointerAttributes, from);
+  hipError_t result = tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
   if (result != hipSuccess) {
     LOG(ERROR) << "could not retrieve source pointer's device: "
                << ToString(result);
@@ -269,7 +270,7 @@ string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
   }
 
   hipPointerAttribute_t to_pointerAttributes;
-  result = hipPointerGetAttributes(&to_pointerAttributes, to);
+  result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
   if (result != hipSuccess) {
     LOG(ERROR) << "could not retrieve destination pointer's device: "
                << ToString(result);
@@ -289,7 +290,7 @@ static port::Status InternalInit() {
   if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
     LOG(ERROR) << "injecting ROCM init error; initialization will fail";
   } else {
-    res = hipInit(0 /* = flags */);
+    res = tensorflow::wrap::hipInit(0 /* = flags */);
   }
 
   if (res == hipSuccess) {
@@ -322,7 +323,7 @@ static port::Status InternalInit() {
 
 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                                hipDevice_t* device) {
-  hipError_t res = hipDeviceGet(device, device_ordinal);
+  hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
   if (res == hipSuccess) {
     return port::Status::OK();
   }
@@ -336,7 +337,7 @@ static port::Status InternalInit() {
                                                    string* device_name) {
   static const size_t kCharLimit = 64;
   absl::InlinedVector<char, 4> chars(kCharLimit);
-  hipError_t res = hipDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  hipError_t res = tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to get device name for " << device << ": "
                << ToString(res);
@@ -382,7 +383,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
 
 /* static */ bool GpuDriver::FuncSetCacheConfig(hipFunction_t function,
                                                 hipFuncCache_t cache_config) {
-  hipError_t res = hipFuncSetCacheConfig(function, cache_config);
+  hipError_t res = tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to set ROCM kernel cache config. kernel: " << function
                << ", config: " << cache_config << ", result: " << ToString(res);
@@ -396,7 +397,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
 GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   hipSharedMemConfig shared_mem_config;
   ScopedActivateContext activation{context};
-  hipError_t result = hipDeviceGetSharedMemConfig(&shared_mem_config);
+  hipError_t result = tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config);
   if (result != hipSuccess) {
     LOG(ERROR) << "failed to get ROCM device shared memory config. "
                << "Context device ID: " << context->device_ordinal()
@@ -411,7 +412,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
     GpuContext* context, hipSharedMemConfig shared_mem_config) {
   ScopedActivateContext activation{context};
-  hipError_t result = hipDeviceSetSharedMemConfig(shared_mem_config);
+  hipError_t result = tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config);
   if (result != hipSuccess) {
     LOG(ERROR) << "failed to set ROCM device shared memory config. "
                << "Context device ID: " << context->device_ordinal()
@@ -435,7 +436,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
           << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
-  hipError_t res = hipModuleLaunchKernel(
+  hipError_t res = tensorflow::wrap::hipModuleLaunchKernel(
       function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y,
       block_dim_z, shared_mem_bytes, stream, kernel_params, extra);
   if (res != hipSuccess) {
@@ -471,7 +472,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activation{context};
   void* hsaco_data = const_cast<char*>(hsaco_contents);
 
-  hipError_t res = hipModuleLoadData(module, hsaco_data);
+  hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data);
 
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to load HSACO: " << ToString(res);
@@ -491,7 +492,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                  hipDeviceptr_t location,
                                                  uint8 value, size_t size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemset(location, value, size);
+  hipError_t res = tensorflow::wrap::hipMemset(location, value, size);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -513,7 +514,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     return false;
   }
   hipError_t res =
-      hipMemset(pointer, static_cast<int>(value), uint32_count * 4);
+      tensorflow::wrap::hipMemset(pointer, static_cast<int>(value), uint32_count * 4);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -527,7 +528,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                  size_t uint32_count,
                                                  GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemsetAsync(location, value, uint32_count, stream);
+  hipError_t res = tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -552,7 +553,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     LOG(ERROR) << "failed to memset memory";
     return false;
   }
-  hipError_t res = hipMemsetAsync(pointer, value, uint32_count * 4, stream);
+  hipError_t res = tensorflow::wrap::hipMemsetAsync(pointer, value, uint32_count * 4, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -565,7 +566,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                            GpuStreamHandle stream,
                                            StreamCallback callback,
                                            void* data) {
-  hipError_t res = hipStreamAddCallback(stream, (hipStreamCallback_t)callback,
+  hipError_t res = tensorflow::wrap::hipStreamAddCallback(stream, (hipStreamCallback_t)callback,
                                         data, 0 /* = flags */);
   if (res != hipSuccess) {
     LOG(ERROR) << "unable to add host callback: " << ToString(res);
@@ -580,7 +581,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                          hipFunction_t* function) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && kernel_name != nullptr);
-  hipError_t res = hipModuleGetFunction(function, module, kernel_name);
+  hipError_t res = tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to get kernel \"" << kernel_name
                << "\" from module: " << ToString(res);
@@ -598,7 +599,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && symbol_name != nullptr &&
         (dptr != nullptr || bytes != nullptr));
-  hipError_t res = hipModuleGetGlobal(dptr, bytes, module, symbol_name);
+  hipError_t res = tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
   if (res != hipSuccess) {
     // symbol may not be found in the current module, but it may reside in
     // another module.
@@ -613,7 +614,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                           hipModule_t module) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipModuleUnload(module);
+  hipError_t res = tensorflow::wrap::hipModuleUnload(module);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to unload module " << module
                << "; leaking: " << ToString(res);
@@ -623,7 +624,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ bool GpuDriver::CreateStream(GpuContext* context,
                                           GpuStreamHandle* stream) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipStreamCreateWithFlags(
+  hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags(
       stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
   if (res != hipSuccess) {
     LOG(ERROR) << "could not allocate ROCM stream for device "
@@ -643,7 +644,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }
 
   ScopedActivateContext activated{context};
-  hipError_t res = hipStreamDestroy(*stream);
+  hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to destroy ROCM stream for device "
                << context->device_ordinal() << ": " << ToString(res);
@@ -658,7 +659,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                              uint64 bytes) {
   ScopedActivateContext activated{context};
   hipDeviceptr_t result = 0;
-  hipError_t res = hipMalloc(&result, bytes);
+  hipError_t res = tensorflow::wrap::hipMallocVanilla(&result, bytes);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to allocate "
                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
@@ -675,7 +676,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                               void* location) {
   ScopedActivateContext activation{context};
   hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
-  hipError_t res = hipFree(pointer);
+  hipError_t res = tensorflow::wrap::hipFree(pointer);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to free device memory at " << location
                << "; result: " << ToString(res);
@@ -704,7 +705,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activation{context};
   void* host_mem = nullptr;
   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
-  hipError_t res = hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
+  hipError_t res = tensorflow::wrap::hipHostMallocVanilla(&host_mem, bytes, hipHostMallocPortable);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to alloc " << bytes
                << " bytes on host: " << ToString(res);
@@ -715,7 +716,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
                                             void* location) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipHostFree(location);
+  hipError_t res = tensorflow::wrap::hipHostFree(location);
   if (res != hipSuccess) {
     LOG(ERROR) << "error deallocating host memory at " << location << ": "
                << ToString(res);
@@ -726,7 +727,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                           uint64 bytes) {
   ScopedActivateContext activation{context};
   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
-  hipError_t res = hipHostRegister(location, bytes, hipHostRegisterPortable);
+  hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes, hipHostRegisterPortable);
   if (res != hipSuccess) {
     LOG(ERROR) << "error registering host memory at " << location << ": "
                << ToString(res);
@@ -738,7 +739,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
                                             void* location) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipHostUnregister(location);
+  hipError_t res = tensorflow::wrap::hipHostUnregister(location);
   if (res != hipSuccess) {
     LOG(ERROR) << "error unregistering host memory at " << location << ": "
                << ToString(res);
@@ -755,7 +756,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }
 
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventDestroy(*event);
+  hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
   *event = nullptr;
 
   switch (res) {
@@ -779,7 +780,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                              GpuEventHandle event,
                                              GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventRecord(event, stream);
+  hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
   switch (res) {
     case hipSuccess:
       return port::Status::OK();
@@ -800,7 +801,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
     GpuContext* context, GpuEventHandle event) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventQuery(event);
+  hipError_t res = tensorflow::wrap::hipEventQuery(event);
   if (res != hipSuccess && res != hipErrorNotReady) {
     return port::Status{
         port::error::INTERNAL,
@@ -817,12 +818,12 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activated{context};
   // The stop event must have completed in order for hipEventElapsedTime to
   // work.
-  hipError_t res = hipEventSynchronize(stop);
+  hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
     return false;
   }
-  res = hipEventElapsedTime(elapsed_milliseconds, start, stop);
+  res = tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to get elapsed time between events: "
                << ToString(res);
@@ -836,7 +837,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                        GpuStreamHandle stream,
                                        GpuEventHandle event) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipStreamWaitEvent(stream, event, 0 /* = flags */);
+  hipError_t res = tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
   if (res != hipSuccess) {
     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
     return false;
@@ -847,7 +848,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 
 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipDeviceSynchronize();
+  hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
   if (res != hipSuccess) {
     LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
                << " :: " << port::CurrentStackTrace();
@@ -861,7 +862,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                   GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  hipError_t res = hipStreamSynchronize(stream);
+  hipError_t res = tensorflow::wrap::hipStreamSynchronize(stream);
   if (res != hipSuccess) {
     port::Status status = port::InternalError(
         absl::StrCat("could not synchronize on ROCM stream: ", ToString(res)));
@@ -877,7 +878,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                         GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  hipError_t res = hipStreamQuery(stream);
+  hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
   if (res == hipSuccess) {
     return true;
   }
@@ -891,7 +892,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
     GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyDtoH(host_dst, gpu_src, size);
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size);
   if (res != hipSuccess) {
     return port::InternalError(
         absl::StrFormat("failed to synchronous memcpy from device to host: %s; "
@@ -908,7 +909,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
     uint64 size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src), size);
+  hipError_t res = tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src), size);
   if (res != hipSuccess) {
     return port::InternalError(absl::StrFormat(
         "failed to synchronous memcpy from host to device: %s; Gpu dst: %p;"
@@ -924,7 +925,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
     uint64 size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyDtoD(gpu_dst, gpu_src, size);
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size);
   if (res != hipSuccess) {
     return port::InternalError(absl::StrFormat(
         "failed to synchronous memcpy from host to device: %s; Gpu dst: %p; "
@@ -942,7 +943,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                            uint64 size,
                                            GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
@@ -964,7 +965,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                            GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
   hipError_t res =
-      hipMemcpyHtoDAsync(gpu_dst, const_cast<void*>(host_src), size, stream);
+      tensorflow::wrap::hipMemcpyHtoDAsync(gpu_dst, const_cast<void*>(host_src), size, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
@@ -984,7 +985,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                            uint64 size,
                                            GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t result = hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+  hipError_t result = tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   if (result != hipSuccess) {
     LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from device to device: %s"
@@ -1021,7 +1022,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }
 
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventCreateWithFlags(event, hipflags);
+  hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);
 
   if (res == hipSuccess) {
     return port::Status::OK();
@@ -1037,7 +1038,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 
 /* static */ int GpuDriver::GetDeviceCount() {
   int device_count = 0;
-  hipError_t res = hipGetDeviceCount(&device_count);
+  hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
   if (res != hipSuccess) {
     LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
     return 0;
@@ -1061,7 +1062,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 
 /* static */ port::Status GpuDriver::GetPointerAddressRange(
     hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
-  hipError_t result = hipMemGetAddressRange(base, size, dptr);
+  hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
   if (result == hipSuccess) {
     return port::Status::OK();
   } else if (result == hipErrorNotFound) {
@@ -1106,7 +1107,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
     hipDeviceptr_t pointer) {
   hipPointerAttribute_t pointerAttributes;
-  hipError_t result = hipPointerGetAttributes(&pointerAttributes, pointer);
+  hipError_t result = tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
   if (result != hipSuccess) {
     return port::Status{
         port::error::INTERNAL,
@@ -1114,7 +1115,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }
 
   hipDevice_t device;
-  result = hipDeviceGet(&device, pointerAttributes.device);
+  result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
   if (result != hipSuccess) {
     return port::Status{
         port::error::INTERNAL,
@@ -1127,7 +1128,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
                                                       hipDevice_t device) {
   hipDeviceProp_t props;
-  hipError_t result = hipGetDeviceProperties(&props, device);
+  hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
   if (result == hipSuccess) {
     *version = props.gcnArch;
     return port::Status::OK();
@@ -1145,7 +1146,7 @@ template <typename T>
 static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
                                             hipDeviceAttribute_t attribute) {
   int value = -1;
-  hipError_t result = hipDeviceGetAttribute(&value, attribute, device);
+  hipError_t result = tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
   if (result != hipSuccess) {
     return port::Status{
         port::error::NOT_FOUND,
@@ -1200,21 +1201,21 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
                                        hipDevice_t device) {
   int value;
   hipError_t res =
-      hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device);
+      tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
     return false;
   }
   *x = value;
 
-  res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device);
+  res = tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
     return false;
   }
   *y = value;
 
-  res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device);
+  res = tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
     return false;
@@ -1224,7 +1225,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
 }
 
 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
-  hipError_t res = hipDriverGetVersion(driver_version);
+  hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query driver version: " << ToString(res);
     return false;
@@ -1235,7 +1236,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
 
 /* static */ bool GpuDriver::GetDeviceProperties(
     hipDeviceProp_t* device_properties, int device_ordinal) {
-  hipError_t res = hipGetDeviceProperties(device_properties, device_ordinal);
+  hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties, device_ordinal);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query device properties: " << ToString(res);
     return false;
@@ -1268,7 +1269,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   ScopedActivateContext activation{context};
   size_t free = 0;
   size_t total = 0;
-  hipError_t res = hipMemGetInfo(&free, &total);
+  hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
     return false;
@@ -1282,7 +1283,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
 /* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
                                                   uint64* result) {
   size_t value = -1;
-  hipError_t res = hipDeviceTotalMem(&value, device);
+  hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
     return false;
@@ -1297,7 +1298,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   static const int kBufferSize = 64;
   absl::InlinedVector<char, 4> chars(kBufferSize);
   chars[kBufferSize - 1] = '\0';
-  hipError_t res = hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
+  hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
     return pci_bus_id;
@@ -1313,7 +1314,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   }
 
   int can_access_peer = -1;
-  hipError_t res = hipDeviceCanAccessPeer(
+  hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
       &can_access_peer, from->device_ordinal(), to->device_ordinal());
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
@@ -1331,7 +1332,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   ScopedActivateContext activated{from};
   hipError_t result =
-      hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */);
+      tensorflow::wrap::hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */);
   if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
     return port::Status{
         port::error::INTERNAL,
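With the wrapper header in place, any driver call added to rocm_driver.cc later follows the same pattern as the call sites rewritten above. A hypothetical sketch, not part of this commit (hipDeviceAttributeWarpSize is a standard hipDeviceAttribute_t value; LOG and ToString come from the surrounding file):

// Hypothetical follow-on helper, for illustration only. Routing through
// tensorflow::wrap:: means the symbol is resolved via the DSO loader rather
// than at link time; the call syntax is otherwise unchanged.
static bool QueryWarpSize(hipDevice_t device, int* warp_size) {
  int value = -1;
  hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
      &value, hipDeviceAttributeWarpSize, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query warp size: " << ToString(res);
    return false;
  }
  *warp_size = value;
  return true;
}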

tensorflow/stream_executor/rocm/rocm_driver_wrapper.h (new file)

@@ -0,0 +1,147 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This file wraps rocm driver calls with dso loader so that we don't need to
// have explicit linking to librocm. All TF rocm driver usage should route
// through this wrapper.
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "rocm/include/hip/hip_runtime.h"
#if defined(TENSORFLOW_USE_ROCM)
#endif
namespace tensorflow {
namespace wrap {
#ifdef PLATFORM_GOOGLE
// Use static linked library
#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                          \
  template <typename... Args>                                            \
  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
    return ::hipSymbolName(args...);                                     \
  }

// This macro wraps a global identifier, given by hipSymbolName, in a callable
// structure that loads the DLL symbol out of the DSO handle in a thread-safe
// manner on first use. This dynamic loading technique is used to avoid DSO
// dependencies on vendor libraries which may or may not be available in the
// deployed binary environment.
#else
#define TO_STR_(x) #x
#define TO_STR(x) TO_STR_(x)

// hipMalloc and hipHostMalloc are defined as function templates in the
// HIP header files, and hence their names get mangled and the attempt
// to resolve their name when trying to dynamically load them will fail.
// Updating the HIP header files to make them C functions is underway.
// Until that change flows through, we will work around the issue by
// creating dummy wrappers for them here.
hipError_t hipMallocVanilla(void** ptr, size_t size) {
  return hipErrorNotInitialized;
}

hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
  return hipErrorNotInitialized;
}

#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                            \
  template <typename... Args>                                              \
  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) {   \
    using FuncPtrT = std::add_pointer<decltype(::hipSymbolName)>::type;    \
    static FuncPtrT loaded = []() -> FuncPtrT {                            \
      static const char *kName = TO_STR(hipSymbolName);                    \
      void *f;                                                             \
      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary( \
          stream_executor::internal::CachedDsoLoader::GetHipDsoHandle()    \
              .ValueOrDie(),                                               \
          kName, &f);                                                      \
      CHECK(s.ok()) << "could not find " << kName                          \
                    << " in HIP DSO; dlerror: " << s.error_message();      \
      return reinterpret_cast<FuncPtrT>(f);                                \
    }();                                                                   \
    return loaded(args...);                                                \
  }
#endif
// clang-format off
#define HIP_ROUTINE_EACH(__macro) \
__macro(hipDeviceCanAccessPeer) \
__macro(hipDeviceEnablePeerAccess) \
__macro(hipDeviceGet) \
__macro(hipDeviceGetAttribute) \
__macro(hipDeviceGetName) \
__macro(hipDeviceGetPCIBusId) \
__macro(hipDeviceGetSharedMemConfig) \
__macro(hipDeviceSetSharedMemConfig) \
__macro(hipDeviceSynchronize) \
__macro(hipDeviceTotalMem) \
__macro(hipDriverGetVersion) \
__macro(hipEventCreateWithFlags) \
__macro(hipEventElapsedTime) \
__macro(hipEventDestroy) \
__macro(hipEventQuery) \
__macro(hipEventRecord) \
__macro(hipEventSynchronize) \
__macro(hipFree) \
__macro(hipFuncSetCacheConfig) \
__macro(hipGetDevice) \
__macro(hipGetDeviceCount) \
__macro(hipGetDeviceProperties) \
__macro(hipHostFree) \
__macro(hipHostRegister) \
__macro(hipHostUnregister) \
__macro(hipInit) \
__macro(hipMemGetAddressRange) \
__macro(hipMemGetInfo) \
__macro(hipMemcpyDtoD) \
__macro(hipMemcpyDtoDAsync) \
__macro(hipMemcpyDtoH) \
__macro(hipMemcpyDtoHAsync) \
__macro(hipMemcpyHtoD) \
__macro(hipMemcpyHtoDAsync) \
__macro(hipMemset) \
__macro(hipMemsetAsync) \
__macro(hipModuleGetFunction) \
__macro(hipModuleGetGlobal) \
__macro(hipModuleLaunchKernel) \
__macro(hipModuleLoadData) \
__macro(hipModuleUnload) \
__macro(hipPointerGetAttributes) \
__macro(hipSetDevice) \
__macro(hipStreamAddCallback) \
__macro(hipStreamCreateWithFlags) \
__macro(hipStreamDestroy) \
__macro(hipStreamQuery) \
__macro(hipStreamSynchronize) \
__macro(hipStreamWaitEvent) \
// clang-format on
HIP_ROUTINE_EACH(STREAM_EXECUTOR_HIP_WRAP)
#undef HIP_ROUTINE_EACH
#undef STREAM_EXECUTOR_HIP_WRAP
#undef TO_STR
#undef TO_STR_
} // namespace wrap
} // namespace tensorflow
#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
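For reference, hand-expanding STREAM_EXECUTOR_HIP_WRAP(hipGetDevice) in the dynamic-loading branch above yields roughly the following (TO_STR(hipGetDevice) collapses to the string literal "hipGetDevice"). The first call through tensorflow::wrap::hipGetDevice pays a single symbol lookup; every later call is a cached indirect call:

// Hand-expanded form of STREAM_EXECUTOR_HIP_WRAP(hipGetDevice), shown for
// readability only; the macro generates exactly this shape.
template <typename... Args>
auto hipGetDevice(Args... args) -> decltype(::hipGetDevice(args...)) {
  using FuncPtrT = std::add_pointer<decltype(::hipGetDevice)>::type;
  // Resolved once, thread-safely, on first use; cached for all later calls.
  static FuncPtrT loaded = []() -> FuncPtrT {
    static const char* kName = "hipGetDevice";  // TO_STR(hipGetDevice)
    void* f;
    auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary(
        stream_executor::internal::CachedDsoLoader::GetHipDsoHandle()
            .ValueOrDie(),
        kName, &f);
    CHECK(s.ok()) << "could not find " << kName
                  << " in HIP DSO; dlerror: " << s.error_message();
    return reinterpret_cast<FuncPtrT>(f);
  }();
  return loaded(args...);  // plain indirect call after first resolution
}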