changing the rocm_driver API from being dynamically linked to being dynamically loaded
commit 834a3f7395
parent 07b3f34199
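In short: every direct HIP runtime call in rocm_driver.cc (hipInit, hipMalloc, hipMemcpyDtoH, ...) now routes through a tensorflow::wrap:: shim declared in the new rocm_driver_wrapper.h, which resolves the symbol out of the HIP DSO at runtime instead of requiring the binary to link against it. Below is a minimal standalone sketch of that pattern using plain dlopen/dlsym rather than TensorFlow's CachedDsoLoader; the DSO name and error handling are illustrative assumptions, not the TF plumbing.

#include <dlfcn.h>
#include <cstdio>

namespace wrap {
// Resolve hipGetDeviceCount from the HIP DSO on first use, then cache the
// function pointer. "libhip_hcc.so" is an assumed library name for
// illustration only.
int hipGetDeviceCount(int* count) {
  using FuncT = int (*)(int*);
  static FuncT fn = []() -> FuncT {
    void* dso = dlopen("libhip_hcc.so", RTLD_LAZY);
    return dso ? reinterpret_cast<FuncT>(dlsym(dso, "hipGetDeviceCount"))
               : nullptr;
  }();
  return fn ? fn(count) : -1;  // -1 stands in for "driver not available"
}
}  // namespace wrap

int main() {
  int count = 0;
  // Before this commit: ::hipGetDeviceCount(&count), a link-time dependency.
  // After it: the symbol is looked up at runtime, so the binary loads and
  // runs even on machines without ROCm installed.
  if (wrap::hipGetDeviceCount(&count) == 0) {
    std::printf("%d ROCm device(s)\n", count);
  } else {
    std::printf("HIP runtime not available\n");
  }
}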
tensorflow/stream_executor/rocm/BUILD
@@ -47,7 +47,7 @@ cc_library(
 cc_library(
     name = "rocm_driver",
     srcs = if_rocm_is_configured(["rocm_driver.cc"]),
-    hdrs = [],
+    hdrs = if_rocm_is_configured(["rocm_driver_wrapper.h"]),
     deps = if_rocm_is_configured([
         ":rocm_diagnostics",
         "@com_google_absl//absl/base",
@@ -57,6 +57,7 @@ cc_library(
         "//tensorflow/stream_executor/gpu:gpu_driver_header",
         "//tensorflow/stream_executor/lib",
         "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
         "@local_config_rocm//rocm:rocm_headers",
     ]),
 )
@@ -166,7 +167,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@local_config_rocm//rocm:rocm_headers",
     ] + if_static([
-        "@local_config_rocm//rocm:rocblas"
+        "@local_config_rocm//rocm:rocblas",
     ])),
     alwayslink = True,
 )
@@ -192,7 +193,7 @@ cc_library(
         "//tensorflow/stream_executor/platform:dso_loader",
         "@local_config_rocm//rocm:rocm_headers",
     ] + if_static([
-        "@local_config_rocm//rocm:rocfft"
+        "@local_config_rocm//rocm:rocfft",
     ])),
     alwayslink = True,
 )
@@ -256,7 +257,7 @@ cc_library(
         "//tensorflow/stream_executor/platform",
         "//tensorflow/stream_executor/platform:dso_loader",
     ] + if_static([
-        "@local_config_rocm//rocm:hiprand"
+        "@local_config_rocm//rocm:hiprand",
     ])),
     alwayslink = True,
 )
tensorflow/stream_executor/rocm/rocm_driver.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"

 bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
 bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
@@ -143,7 +144,7 @@ string MemorySpaceString(MemorySpace memory_space) {
 // HIP driver (e.g., this value is not our cached view of the current device).
 static int CurrentDeviceOrDie() {
   int current = -1;
-  hipError_t result = hipGetDevice(&current);
+  hipError_t result = tensorflow::wrap::hipGetDevice(&current);
   if (result != hipSuccess) {
     LOG(FATAL) << "failed to query current device: " << ToString(result);
   }
@@ -154,7 +155,7 @@ namespace {

 // Call hipDeviceSynchronize and crash if it doesn't succeed.
 void SynchronizeOrDie() {
-  auto res = hipDeviceSynchronize();
+  auto res = tensorflow::wrap::hipDeviceSynchronize();
   if (res != hipSuccess) {
     LOG(FATAL) << "Synchronize found " << ToString(res)
                << " :: " << port::CurrentStackTrace();
@@ -197,7 +198,7 @@ ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
          << tls->current_device_ordinal << " to " << context->device_ordinal();

   // Set the device and update thread local.
-  CHECK_EQ(hipSuccess, hipSetDevice(context->device_ordinal()));
+  CHECK_EQ(hipSuccess, tensorflow::wrap::hipSetDevice(context->device_ordinal()));
   tls->current_device_ordinal = context->device_ordinal();
 }

@@ -225,7 +226,7 @@ ScopedActivateContext::~ScopedActivateContext() {
          << to_restore_->device_ordinal();

   // Set context and update thread local.
-  CHECK_EQ(hipSuccess, hipSetDevice(to_restore_->device_ordinal()));
+  CHECK_EQ(hipSuccess, tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
   tls->current_device_ordinal = to_restore_->device_ordinal();
 }

@@ -261,7 +262,7 @@ string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
 // in the process of querying.
 string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
   hipPointerAttribute_t from_pointerAttributes;
-  hipError_t result = hipPointerGetAttributes(&from_pointerAttributes, from);
+  hipError_t result = tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
   if (result != hipSuccess) {
     LOG(ERROR) << "could not retrieve source pointer's device: "
                << ToString(result);
@@ -269,7 +270,7 @@ string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
   }

   hipPointerAttribute_t to_pointerAttributes;
-  result = hipPointerGetAttributes(&to_pointerAttributes, to);
+  result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
   if (result != hipSuccess) {
     LOG(ERROR) << "could not retrieve destination pointer's device: "
                << ToString(result);
@@ -289,7 +290,7 @@ static port::Status InternalInit() {
   if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
     LOG(ERROR) << "injecting ROCM init error; initialization will fail";
   } else {
-    res = hipInit(0 /* = flags */);
+    res = tensorflow::wrap::hipInit(0 /* = flags */);
   }

   if (res == hipSuccess) {
@@ -322,7 +323,7 @@ static port::Status InternalInit() {

 /* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                                hipDevice_t* device) {
-  hipError_t res = hipDeviceGet(device, device_ordinal);
+  hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
   if (res == hipSuccess) {
     return port::Status::OK();
   }
@@ -336,7 +337,7 @@ static port::Status InternalInit() {
                                            string* device_name) {
   static const size_t kCharLimit = 64;
   absl::InlinedVector<char, 4> chars(kCharLimit);
-  hipError_t res = hipDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  hipError_t res = tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to get device name for " << device << ": "
                << ToString(res);
@@ -382,7 +383,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,

 /* static */ bool GpuDriver::FuncSetCacheConfig(hipFunction_t function,
                                                 hipFuncCache_t cache_config) {
-  hipError_t res = hipFuncSetCacheConfig(function, cache_config);
+  hipError_t res = tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to set ROCM kernel cache config. kernel: " << function
                << ", config: " << cache_config << ", result: " << ToString(res);
@@ -396,7 +397,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
 GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   hipSharedMemConfig shared_mem_config;
   ScopedActivateContext activation{context};
-  hipError_t result = hipDeviceGetSharedMemConfig(&shared_mem_config);
+  hipError_t result = tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config);
   if (result != hipSuccess) {
     LOG(ERROR) << "failed to get ROCM device shared memory config. "
                << "Context device ID: " << context->device_ordinal()
@@ -411,7 +412,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
     GpuContext* context, hipSharedMemConfig shared_mem_config) {
   ScopedActivateContext activation{context};
-  hipError_t result = hipDeviceSetSharedMemConfig(shared_mem_config);
+  hipError_t result = tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config);
   if (result != hipSuccess) {
     LOG(ERROR) << "failed to set ROCM device shared memory config. "
                << "Context device ID: " << context->device_ordinal()
@@ -435,7 +436,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
-  hipError_t res = hipModuleLaunchKernel(
+  hipError_t res = tensorflow::wrap::hipModuleLaunchKernel(
       function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y,
       block_dim_z, shared_mem_bytes, stream, kernel_params, extra);
   if (res != hipSuccess) {
@@ -471,7 +472,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activation{context};
   void* hsaco_data = const_cast<char*>(hsaco_contents);

-  hipError_t res = hipModuleLoadData(module, hsaco_data);
+  hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data);

   if (res != hipSuccess) {
     LOG(ERROR) << "failed to load HSACO: " << ToString(res);
@@ -491,7 +492,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                              hipDeviceptr_t location,
                                              uint8 value, size_t size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemset(location, value, size);
+  hipError_t res = tensorflow::wrap::hipMemset(location, value, size);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -513,7 +514,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     return false;
   }
   hipError_t res =
-      hipMemset(pointer, static_cast<int>(value), uint32_count * 4);
+      tensorflow::wrap::hipMemset(pointer, static_cast<int>(value), uint32_count * 4);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -527,7 +528,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                               size_t uint32_count,
                                               GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemsetAsync(location, value, uint32_count, stream);
+  hipError_t res = tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -552,7 +553,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     LOG(ERROR) << "failed to memset memory";
     return false;
   }
-  hipError_t res = hipMemsetAsync(pointer, value, uint32_count * 4, stream);
+  hipError_t res = tensorflow::wrap::hipMemsetAsync(pointer, value, uint32_count * 4, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -565,7 +566,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                GpuStreamHandle stream,
                                                StreamCallback callback,
                                                void* data) {
-  hipError_t res = hipStreamAddCallback(stream, (hipStreamCallback_t)callback,
+  hipError_t res = tensorflow::wrap::hipStreamAddCallback(stream, (hipStreamCallback_t)callback,
                                         data, 0 /* = flags */);
   if (res != hipSuccess) {
     LOG(ERROR) << "unable to add host callback: " << ToString(res);
@@ -580,7 +581,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                              hipFunction_t* function) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && kernel_name != nullptr);
-  hipError_t res = hipModuleGetFunction(function, module, kernel_name);
+  hipError_t res = tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to get kernel \"" << kernel_name
                << "\" from module: " << ToString(res);
@@ -598,7 +599,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && symbol_name != nullptr &&
         (dptr != nullptr || bytes != nullptr));
-  hipError_t res = hipModuleGetGlobal(dptr, bytes, module, symbol_name);
+  hipError_t res = tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
   if (res != hipSuccess) {
     // symbol may not be found in the current module, but it may reside in
     // another module.
@@ -613,7 +614,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                           hipModule_t module) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipModuleUnload(module);
+  hipError_t res = tensorflow::wrap::hipModuleUnload(module);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to unload module " << module
                << "; leaking: " << ToString(res);
@@ -623,7 +624,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ bool GpuDriver::CreateStream(GpuContext* context,
                                           GpuStreamHandle* stream) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipStreamCreateWithFlags(
+  hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags(
       stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
   if (res != hipSuccess) {
     LOG(ERROR) << "could not allocate ROCM stream for device "
@@ -643,7 +644,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }

   ScopedActivateContext activated{context};
-  hipError_t res = hipStreamDestroy(*stream);
+  hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to destroy ROCM stream for device "
                << context->device_ordinal() << ": " << ToString(res);
@@ -658,7 +659,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                              uint64 bytes) {
   ScopedActivateContext activated{context};
   hipDeviceptr_t result = 0;
-  hipError_t res = hipMalloc(&result, bytes);
+  hipError_t res = tensorflow::wrap::hipMallocVanilla(&result, bytes);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to allocate "
                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
@@ -675,7 +676,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                               void* location) {
   ScopedActivateContext activation{context};
   hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
-  hipError_t res = hipFree(pointer);
+  hipError_t res = tensorflow::wrap::hipFree(pointer);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to free device memory at " << location
                << "; result: " << ToString(res);
@@ -704,7 +705,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activation{context};
   void* host_mem = nullptr;
   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
-  hipError_t res = hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
+  hipError_t res = tensorflow::wrap::hipHostMallocVanilla(&host_mem, bytes, hipHostMallocPortable);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to alloc " << bytes
                << " bytes on host: " << ToString(res);
@@ -715,7 +716,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ void GpuDriver::HostDeallocate(GpuContext* context,
                                             void* location) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipHostFree(location);
+  hipError_t res = tensorflow::wrap::hipHostFree(location);
   if (res != hipSuccess) {
     LOG(ERROR) << "error deallocating host memory at " << location << ": "
                << ToString(res);
@@ -726,7 +727,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                           uint64 bytes) {
   ScopedActivateContext activation{context};
   // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
-  hipError_t res = hipHostRegister(location, bytes, hipHostRegisterPortable);
+  hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes, hipHostRegisterPortable);
   if (res != hipSuccess) {
     LOG(ERROR) << "error registering host memory at " << location << ": "
                << ToString(res);
@@ -738,7 +739,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ bool GpuDriver::HostUnregister(GpuContext* context,
                                             void* location) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipHostUnregister(location);
+  hipError_t res = tensorflow::wrap::hipHostUnregister(location);
   if (res != hipSuccess) {
     LOG(ERROR) << "error unregistering host memory at " << location << ": "
                << ToString(res);
@@ -755,7 +756,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }

   ScopedActivateContext activated{context};
-  hipError_t res = hipEventDestroy(*event);
+  hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
   *event = nullptr;

   switch (res) {
@@ -779,7 +780,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                 GpuEventHandle event,
                                                 GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventRecord(event, stream);
+  hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
   switch (res) {
     case hipSuccess:
       return port::Status::OK();
@@ -800,7 +801,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
     GpuContext* context, GpuEventHandle event) {
   ScopedActivateContext activated{context};
-  hipError_t res = hipEventQuery(event);
+  hipError_t res = tensorflow::wrap::hipEventQuery(event);
   if (res != hipSuccess && res != hipErrorNotReady) {
     return port::Status{
         port::error::INTERNAL,
@@ -817,12 +818,12 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   ScopedActivateContext activated{context};
   // The stop event must have completed in order for hipEventElapsedTime to
   // work.
-  hipError_t res = hipEventSynchronize(stop);
+  hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
     return false;
   }
-  res = hipEventElapsedTime(elapsed_milliseconds, start, stop);
+  res = tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to get elapsed time between events: "
                << ToString(res);
@@ -836,7 +837,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                             GpuStreamHandle stream,
                                             GpuEventHandle event) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipStreamWaitEvent(stream, event, 0 /* = flags */);
+  hipError_t res = tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
   if (res != hipSuccess) {
     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
     return false;
@@ -847,7 +848,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {

 /* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipDeviceSynchronize();
+  hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
   if (res != hipSuccess) {
     LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
                << " :: " << port::CurrentStackTrace();
@@ -861,7 +862,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                    GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  hipError_t res = hipStreamSynchronize(stream);
+  hipError_t res = tensorflow::wrap::hipStreamSynchronize(stream);
   if (res != hipSuccess) {
     port::Status status = port::InternalError(
         absl::StrCat("could not synchronize on ROCM stream: ", ToString(res)));
@@ -877,7 +878,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                 GpuStreamHandle stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  hipError_t res = hipStreamQuery(stream);
+  hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
   if (res == hipSuccess) {
     return true;
   }
@@ -891,7 +892,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
     GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyDtoH(host_dst, gpu_src, size);
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size);
   if (res != hipSuccess) {
     return port::InternalError(
         absl::StrFormat("failed to synchronous memcpy from device to host: %s; "
@@ -908,7 +909,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
     uint64 size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src), size);
+  hipError_t res = tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src), size);
   if (res != hipSuccess) {
     return port::InternalError(absl::StrFormat(
         "failed to synchronous memcpy from host to device: %s; Gpu dst: %p;"
@@ -924,7 +925,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
     GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
     uint64 size) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyDtoD(gpu_dst, gpu_src, size);
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size);
   if (res != hipSuccess) {
     return port::InternalError(absl::StrFormat(
         "failed to synchronous memcpy from host to device: %s; Gpu dst: %p; "
@@ -942,7 +943,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                uint64 size,
                                                GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t res = hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
@@ -964,7 +965,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
   hipError_t res =
-      hipMemcpyHtoDAsync(gpu_dst, const_cast<void*>(host_src), size, stream);
+      tensorflow::wrap::hipMemcpyHtoDAsync(gpu_dst, const_cast<void*>(host_src), size, stream);
   if (res != hipSuccess) {
     LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
@@ -984,7 +985,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
                                                uint64 size,
                                                GpuStreamHandle stream) {
   ScopedActivateContext activation{context};
-  hipError_t result = hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+  hipError_t result = tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   if (result != hipSuccess) {
     LOG(ERROR) << absl::StrFormat(
         "failed to enqueue async memcpy from device to device: %s"
@@ -1021,7 +1022,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }

   ScopedActivateContext activated{context};
-  hipError_t res = hipEventCreateWithFlags(event, hipflags);
+  hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);

   if (res == hipSuccess) {
     return port::Status::OK();
@@ -1037,7 +1038,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {

 /* static */ int GpuDriver::GetDeviceCount() {
   int device_count = 0;
-  hipError_t res = hipGetDeviceCount(&device_count);
+  hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
   if (res != hipSuccess) {
     LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
     return 0;
@@ -1061,7 +1062,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {

 /* static */ port::Status GpuDriver::GetPointerAddressRange(
     hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
-  hipError_t result = hipMemGetAddressRange(base, size, dptr);
+  hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
   if (result == hipSuccess) {
     return port::Status::OK();
   } else if (result == hipErrorNotFound) {
@@ -1106,7 +1107,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
     hipDeviceptr_t pointer) {
   hipPointerAttribute_t pointerAttributes;
-  hipError_t result = hipPointerGetAttributes(&pointerAttributes, pointer);
+  hipError_t result = tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
   if (result != hipSuccess) {
     return port::Status{
         port::error::INTERNAL,
@@ -1114,7 +1115,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   }

   hipDevice_t device;
-  result = hipDeviceGet(&device, pointerAttributes.device);
+  result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
   if (result != hipSuccess) {
     return port::Status{
         port::error::INTERNAL,
@@ -1127,7 +1128,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
 /* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
                                                       hipDevice_t device) {
   hipDeviceProp_t props;
-  hipError_t result = hipGetDeviceProperties(&props, device);
+  hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
   if (result == hipSuccess) {
     *version = props.gcnArch;
     return port::Status::OK();
@@ -1145,7 +1146,7 @@ template <typename T>
 static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
                                             hipDeviceAttribute_t attribute) {
   int value = -1;
-  hipError_t result = hipDeviceGetAttribute(&value, attribute, device);
+  hipError_t result = tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
   if (result != hipSuccess) {
     return port::Status{
         port::error::NOT_FOUND,
@@ -1200,21 +1201,21 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
                                         hipDevice_t device) {
   int value;
   hipError_t res =
-      hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device);
+      tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimX, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
     return false;
   }
   *x = value;

-  res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device);
+  res = tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimY, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
     return false;
   }
   *y = value;

-  res = hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device);
+  res = tensorflow::wrap::hipDeviceGetAttribute(&value, hipDeviceAttributeMaxGridDimZ, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
     return false;
@@ -1224,7 +1225,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
 }

 /* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
-  hipError_t res = hipDriverGetVersion(driver_version);
+  hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query driver version: " << ToString(res);
     return false;
@@ -1235,7 +1236,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,

 /* static */ bool GpuDriver::GetDeviceProperties(
     hipDeviceProp_t* device_properties, int device_ordinal) {
-  hipError_t res = hipGetDeviceProperties(device_properties, device_ordinal);
+  hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties, device_ordinal);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query device properties: " << ToString(res);
     return false;
@@ -1268,7 +1269,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   ScopedActivateContext activation{context};
   size_t free = 0;
   size_t total = 0;
-  hipError_t res = hipMemGetInfo(&free, &total);
+  hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
     return false;
@@ -1282,7 +1283,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
 /* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
                                                   uint64* result) {
   size_t value = -1;
-  hipError_t res = hipDeviceTotalMem(&value, device);
+  hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
     return false;
@@ -1297,7 +1298,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   static const int kBufferSize = 64;
   absl::InlinedVector<char, 4> chars(kBufferSize);
   chars[kBufferSize - 1] = '\0';
-  hipError_t res = hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
+  hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
     return pci_bus_id;
@@ -1313,7 +1314,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
   }

   int can_access_peer = -1;
-  hipError_t res = hipDeviceCanAccessPeer(
+  hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
       &can_access_peer, from->device_ordinal(), to->device_ordinal());
   if (res != hipSuccess) {
     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
@@ -1331,7 +1332,7 @@ static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,

   ScopedActivateContext activated{from};
   hipError_t result =
-      hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */);
+      tensorflow::wrap::hipDeviceEnablePeerAccess(to->device_ordinal(), 0 /* = flags */);
   if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
     return port::Status{
         port::error::INTERNAL,
tensorflow/stream_executor/rocm/rocm_driver_wrapper.h (new file, 147 lines)
@@ -0,0 +1,147 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps rocm driver calls with dso loader so that we don't need to
+// have explicit linking to librocm. All TF rocm driver usage should route
+// through this wrapper.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "rocm/include/hip/hip_runtime.h"
+
+
+#if defined(TENSORFLOW_USE_ROCM)
+
+#endif
+
+namespace tensorflow {
+namespace wrap {
+#ifdef PLATFORM_GOOGLE
+// Use static linked library
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                          \
+  template <typename... Args>                                            \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
+    return ::hipSymbolName(args...);                                     \
+  }
+
+// This macro wraps a global identifier, given by hipSymbolName, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#else
+#define TO_STR_(x) #x
+#define TO_STR(x) TO_STR_(x)
+
+// hipMalloc and hipHostMalloc are defined as function templates in the
+// HIP header files, and hence their names get mangled and the attempt
+// to resolve their name when trying to dynamically load them will fail.
+// Updating the HIP header files to make them C functions is underway.
+// Until that change flows through, we will work around the issue by
+// creating dummy wrappers for them here.
+
+hipError_t hipMallocVanilla(void** ptr, size_t size) {
+  return hipErrorNotInitialized;
+}
+
+hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
+  return hipErrorNotInitialized;
+}
+
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                             \
+  template <typename... Args>                                               \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) {    \
+    using FuncPtrT = std::add_pointer<decltype(::hipSymbolName)>::type;     \
+    static FuncPtrT loaded = []() -> FuncPtrT {                             \
+      static const char *kName = TO_STR(hipSymbolName);                     \
+      void *f;                                                              \
+      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary( \
+          stream_executor::internal::CachedDsoLoader::GetHipDsoHandle()     \
+              .ValueOrDie(),                                                \
+          kName, &f);                                                       \
+      CHECK(s.ok()) << "could not find " << kName                           \
+                    << " in HIP DSO; dlerror: " << s.error_message();       \
+      return reinterpret_cast<FuncPtrT>(f);                                 \
+    }();                                                                    \
+    return loaded(args...);                                                 \
+  }
+#endif
+
+// clang-format off
+#define HIP_ROUTINE_EACH(__macro)                   \
+  __macro(hipDeviceCanAccessPeer)                   \
+  __macro(hipDeviceEnablePeerAccess)                \
+  __macro(hipDeviceGet)                             \
+  __macro(hipDeviceGetAttribute)                    \
+  __macro(hipDeviceGetName)                         \
+  __macro(hipDeviceGetPCIBusId)                     \
+  __macro(hipDeviceGetSharedMemConfig)              \
+  __macro(hipDeviceSetSharedMemConfig)              \
+  __macro(hipDeviceSynchronize)                     \
+  __macro(hipDeviceTotalMem)                        \
+  __macro(hipDriverGetVersion)                      \
+  __macro(hipEventCreateWithFlags)                  \
+  __macro(hipEventElapsedTime)                      \
+  __macro(hipEventDestroy)                          \
+  __macro(hipEventQuery)                            \
+  __macro(hipEventRecord)                           \
+  __macro(hipEventSynchronize)                      \
+  __macro(hipFree)                                  \
+  __macro(hipFuncSetCacheConfig)                    \
+  __macro(hipGetDevice)                             \
+  __macro(hipGetDeviceCount)                        \
+  __macro(hipGetDeviceProperties)                   \
+  __macro(hipHostFree)                              \
+  __macro(hipHostRegister)                          \
+  __macro(hipHostUnregister)                        \
+  __macro(hipInit)                                  \
+  __macro(hipMemGetAddressRange)                    \
+  __macro(hipMemGetInfo)                            \
+  __macro(hipMemcpyDtoD)                            \
+  __macro(hipMemcpyDtoDAsync)                       \
+  __macro(hipMemcpyDtoH)                            \
+  __macro(hipMemcpyDtoHAsync)                       \
+  __macro(hipMemcpyHtoD)                            \
+  __macro(hipMemcpyHtoDAsync)                       \
+  __macro(hipMemset)                                \
+  __macro(hipMemsetAsync)                           \
+  __macro(hipModuleGetFunction)                     \
+  __macro(hipModuleGetGlobal)                       \
+  __macro(hipModuleLaunchKernel)                    \
+  __macro(hipModuleLoadData)                        \
+  __macro(hipModuleUnload)                          \
+  __macro(hipPointerGetAttributes)                  \
+  __macro(hipSetDevice)                             \
+  __macro(hipStreamAddCallback)                     \
+  __macro(hipStreamCreateWithFlags)                 \
+  __macro(hipStreamDestroy)                         \
+  __macro(hipStreamQuery)                           \
+  __macro(hipStreamSynchronize)                     \
+  __macro(hipStreamWaitEvent)                       \
+// clang-format on
+
+HIP_ROUTINE_EACH(STREAM_EXECUTOR_HIP_WRAP)
+#undef HIP_ROUTINE_EACH
+#undef STREAM_EXECUTOR_HIP_WRAP
+#undef TO_STR
+#undef TO_STR_
+}  // namespace wrap
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
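For readers tracing the macro: in the dynamic-loading branch, STREAM_EXECUTOR_HIP_WRAP(hipGetDevice) expands to roughly the function below (a hand-expanded sketch, not compiler output). The function-local static means the symbol lookup runs once, thread-safely, on first call; because the wrapper is a perfect-forwarding template in tensorflow::wrap, call sites keep the exact HIP signatures and only gain the namespace qualifier.

template <typename... Args>
auto hipGetDevice(Args... args) -> decltype(::hipGetDevice(args...)) {
  using FuncPtrT = std::add_pointer<decltype(::hipGetDevice)>::type;
  // Runs once, guarded by C++11 magic-static initialization.
  static FuncPtrT loaded = []() -> FuncPtrT {
    void* f;
    auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary(
        stream_executor::internal::CachedDsoLoader::GetHipDsoHandle()
            .ValueOrDie(),
        "hipGetDevice", &f);
    CHECK(s.ok()) << "could not find hipGetDevice in HIP DSO";
    return reinterpret_cast<FuncPtrT>(f);
  }();
  return loaded(args...);  // forward to the runtime-resolved entry point
}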