/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <stdint.h>
#include <stdlib.h>

#include <map>
#include <set>
#include <utility>

#include "absl/base/casts.h"
#include "absl/container/inlined_vector.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "absl/synchronization/notification.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/human_readable.h"
#include "tensorflow/stream_executor/lib/stacktrace.h"
#include "tensorflow/stream_executor/lib/static_threadlocal.h"
#include "tensorflow/stream_executor/lib/threadpool.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"

bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
bool FLAGS_gpuexec_rocm_device_0_only = false;

#define RETURN_IF_ROCM_ERROR(expr, ...)                                \
  do {                                                                 \
    hipError_t _res = (expr);                                          \
    if (TF_PREDICT_FALSE(_res != hipSuccess)) {                        \
      return port::InternalError(absl::StrCat(                         \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
    }                                                                  \
  } while (0)
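
// Illustrative use, mirroring the call sites below: wrap a HIP driver call so
// that a non-hipSuccess result becomes a port::InternalError carrying the
// caller's message plus the stringified hipError_t, e.g.
//
//   RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipStreamSynchronize(stream),
//                        "Could not synchronize on ROCM stream");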

// Debugging: on each push and pop of a rocm context, verify the current device
// matches the expected one.
constexpr bool kVerifyGpuContext = false;

namespace stream_executor {
namespace gpu {

// GpuContext wraps the device_ordinal.
// Only reason we need this wrapper class is to make the GpuDriver* API
class GpuContext {
 public:
  GpuContext(const int v) : device_ordinal_(v) {}

  int device_ordinal() const { return device_ordinal_; }

  // Disallow copying and moving.
  GpuContext(GpuContext&&) = delete;
  GpuContext(const GpuContext&) = delete;
  GpuContext& operator=(GpuContext&&) = delete;
  GpuContext& operator=(const GpuContext&) = delete;

 private:
  const int device_ordinal_;
};

namespace {

// Formats hipError_t to output prettified values into a log stream.
// Error summaries taken from:
string ToString(hipError_t result) {
#define OSTREAM_ROCM_ERROR(__name) \
  case hipError##__name:           \
    return "HIP_ERROR_" #__name;

  switch (result) {
    OSTREAM_ROCM_ERROR(InvalidValue)
    OSTREAM_ROCM_ERROR(OutOfMemory)
    OSTREAM_ROCM_ERROR(NotInitialized)
    OSTREAM_ROCM_ERROR(Deinitialized)
    OSTREAM_ROCM_ERROR(NoDevice)
    OSTREAM_ROCM_ERROR(InvalidDevice)
    OSTREAM_ROCM_ERROR(InvalidImage)
    OSTREAM_ROCM_ERROR(InvalidContext)
    OSTREAM_ROCM_ERROR(InvalidHandle)
    OSTREAM_ROCM_ERROR(NotFound)
    OSTREAM_ROCM_ERROR(NotReady)
    OSTREAM_ROCM_ERROR(NoBinaryForGpu)

    // Encountered an uncorrectable ECC error during execution.
    OSTREAM_ROCM_ERROR(ECCNotCorrectable)

    // Load/store on an invalid address. Must reboot all context.
    case 700:
      return "ROCM_ERROR_ILLEGAL_ADDRESS";
    // Passed too many / wrong arguments, too many threads for register count.
    case 701:
      return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";

    OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
    OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
    OSTREAM_ROCM_ERROR(Unknown)  // Unknown internal error to ROCM.
    default:
      return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
  }
}
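
// For example, the switch above maps hipErrorOutOfMemory to
// "HIP_ERROR_OutOfMemory", while an unrecognized error code falls through to
// the generic "hipError_t(<numeric value>)" form.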

// ROCM driver routines may require a large amount of stack (particularly
// hipModuleLoadDataEx, in our experience). To avoid stack overflow when using
// stack-limited threads (such as those spawned by a default-argument
// thread::ThreadPool on some platforms), we run certain routines in this pool
// and wait for completion.
port::ThreadPool* GetDriverExecutor() {
  static port::ThreadPool* thread_pool = new port::ThreadPool(
      port::Env::Default(), port::ThreadOptions(), "rocm_driver", 1);
  return thread_pool;
}
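
// Illustrative pattern (this is how LoadHsaco below uses the pool): schedule
// the stack-heavy driver call on this executor and block on an
// absl::Notification until it completes, e.g.
//
//   absl::Notification done;
//   GetDriverExecutor()->Schedule([&] { /* driver call */ done.Notify(); });
//   done.WaitForNotification();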

}  // namespace

string MemorySpaceString(MemorySpace memory_space) {
  switch (memory_space) {
    case MemorySpace::kHost:
      return "host";
    case MemorySpace::kDevice:
      return "device";
    default:
      LOG(FATAL) << "impossible memory space";
  }
}

// Returns the current device set in HIP. This is done by calling the
// HIP driver (e.g., this value is not our cached view of the current device).
static int CurrentDeviceOrDie() {
  int current = -1;
  hipError_t result = tensorflow::wrap::hipGetDevice(&current);
  if (result != hipSuccess) {
    LOG(FATAL) << "failed to query current device: " << ToString(result);
  }
  return current;
}

namespace {

// Call hipDeviceSynchronize and crash if it doesn't succeed.
void SynchronizeOrDie() {
  auto res = tensorflow::wrap::hipDeviceSynchronize();
  if (res != hipSuccess) {
    LOG(FATAL) << "Synchronize found " << ToString(res)
               << " :: " << port::CurrentStackTrace();
  }
}

struct ThreadLocalData {
  int current_device_ordinal;
  int depth;
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);

}  // namespace

ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();
  if (tls->depth == 0) {
    tls->current_device_ordinal = CurrentDeviceOrDie();
  }

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth++;

  to_restore_ = context;

  if (context->device_ordinal() == tls->current_device_ordinal) {
    DCHECK_EQ(CurrentDeviceOrDie(), context->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to " << context->device_ordinal();

  // Set the device and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(context->device_ordinal()));
  tls->current_device_ordinal = context->device_ordinal();
}

ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);

  if (to_restore_->device_ordinal() == tls->current_device_ordinal) {
    DCHECK_EQ(CurrentDeviceOrDie(), to_restore_->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to "
          << to_restore_->device_ordinal();

  // Set context and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
  tls->current_device_ordinal = to_restore_->device_ordinal();
}
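
// Illustrative usage, as in the GpuDriver methods below: activating a context
// for the lifetime of a scope switches the current HIP device if needed and
// restores the previous one (tracked in thread-local state) on scope exit.
//
//   {
//     ScopedActivateContext activation{context};
//     // ... HIP driver calls targeting context->device_ordinal() ...
//   }  // previous device restored here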

namespace {

// Returns a stringified device number associated with pointer, primarily for
// logging purposes. Returns "?" if the device could not be successfully
// queried.
string ROCMPointerToDeviceString(hipDeviceptr_t pointer) {
  auto value = GpuDriver::GetPointerDevice(pointer);
  if (value.ok()) {
    return absl::StrCat(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query device: " << value.status();
  return "?";
}

// Returns a stringified memory space associated with pointer, primarily for
// logging purposes. Returns "?" if the memory space could not be successfully
// queried.
string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
  auto value = GpuDriver::GetPointerMemorySpace(pointer);
  if (value.ok()) {
    return MemorySpaceString(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query memory space: " << value.status();
  return "?";
}

// Returns a stringified representation of whether or not peer access is
// permitted between the "from" and "to" pointers' associated contexts,
// primarily for logging purposes. Returns "error" if an error is encountered
// in the process of querying.
string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
  hipPointerAttribute_t from_pointerAttributes;
  hipError_t result =
      tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
  if (result != hipSuccess) {
    LOG(ERROR) << "could not retrieve source pointer's device: "
               << ToString(result);
    return "error";
  }

  hipPointerAttribute_t to_pointerAttributes;
  result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
  if (result != hipSuccess) {
    LOG(ERROR) << "could not retrieve destination pointer's device: "
               << ToString(result);
    return "error";
  }

  GpuContext fromCtx(from_pointerAttributes.device);
  GpuContext toCtx(to_pointerAttributes.device);

  return GpuDriver::CanEnablePeerAccess(&fromCtx, &toCtx) ? "true" : "false";
}

// Actually performs the work of ROCM initialization. Wrapped up in one-time
// execution guard.
static port::Status InternalInit() {
  hipError_t res = hipErrorNoDevice;
  if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
    LOG(ERROR) << "injecting ROCM init error; initialization will fail";
  } else {
    res = tensorflow::wrap::hipInit(0 /* = flags */);
  }

  if (res == hipSuccess) {
    return port::Status::OK();
  }

  LOG(ERROR) << "failed call to hipInit: " << ToString(res);
  Diagnostician::LogDiagnosticInformation();
  return port::Status{port::error::ABORTED,
                      absl::StrCat("failed call to hipInit: ", ToString(res))};
}

}  // namespace

/* static */ port::Status GpuDriver::Init() {
  // Cached return value from calling InternalInit(), as hipInit need only be
  // called once, but GpuDriver::Init may be called many times.
  static port::Status* init_retval = [] {
    return new port::Status(InternalInit());
  }();
  return *init_retval;
}

/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                               hipDevice_t* device) {
  hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
  if (res == hipSuccess) {
    return port::Status::OK();
  }

  return port::Status{
      port::error::INTERNAL,
      absl::StrCat("failed call to hipDeviceGet: ", ToString(res))};
}

/* static */ port::Status GpuDriver::GetDeviceName(hipDevice_t device,
                                                   string* device_name) {
  static const size_t kCharLimit = 64;
  absl::InlinedVector<char, 4> chars(kCharLimit);
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device),
      "Failed to get device name");
  chars[kCharLimit - 1] = '\0';
  *device_name = chars.begin();
  return port::Status::OK();
}

bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
                                 int* flags) {
  static_assert(DeviceOptions::kMask == 0xf,
                "needs update for new device options");
  return true;
}

/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, hipDevice_t device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = new GpuContext(device_ordinal);
  return port::Status::OK();
}

/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
  if (context == nullptr) {
    return;
  }
  delete context;
}

/* static */ port::Status GpuDriver::FuncGetAttribute(
    hipDeviceAttribute_t attribute, hipFunction_t func, int* attribute_value) {
  // TODO(ROCm) properly implement this feature in HIP
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::FuncSetCacheConfig(
    hipFunction_t function, hipFuncCache_t cache_config) {
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config),
      "Failed to set ROCM kernel cache config.");
  return port::Status::OK();
}

/* static */ port::StatusOr<hipSharedMemConfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  hipSharedMemConfig shared_mem_config;
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config),
      "Failed to get shared memory config");
  return shared_mem_config;
}

/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
    GpuContext* context, hipSharedMemConfig shared_mem_config) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config),
      "Failed to set ROCM device shared memory config");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::LaunchKernel(
    GpuContext* context, hipFunction_t function, unsigned int grid_dim_x,
    unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
    unsigned int block_dim_y, unsigned int block_dim_z,
    unsigned int shared_mem_bytes, GpuStreamHandle stream, void** kernel_params,
    void** extra) {
  ScopedActivateContext activation{context};
  VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipModuleLaunchKernel(
                           function, grid_dim_x, grid_dim_y, grid_dim_z,
                           block_dim_x, block_dim_y, block_dim_z,
                           shared_mem_bytes, stream, kernel_params, extra),
                       "Failed to launch ROCM kernel");
  VLOG(2) << "successfully launched kernel";
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
                                             const char* ptx_contents,
                                             hipModule_t* module) {
  LOG(ERROR) << "Feature not supported on ROCm platform (LoadPtx)";
  return port::InternalError("Not Implemented");
}

/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               hipModule_t* module) {
  return port::Status{port::error::INTERNAL,
                      "Feature not supported on ROCm platform (LoadCubin)"};
}

/* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
                                               const char* hsaco_contents,
                                               hipModule_t* module) {
  absl::Notification notification;
  port::Status ret = port::Status::OK();
  GetDriverExecutor()->Schedule([context, hsaco_contents, module, &ret,
                                 &notification]() {
    ScopedActivateContext activation{context};
    void* hsaco_data = const_cast<char*>(hsaco_contents);

    hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data);

    if (res != hipSuccess) {
      ret = port::InternalError(
          absl::StrCat("Failed to load HSACO: ", ToString(res)));
      notification.Notify();
      // Return early: an absl::Notification may only be notified once.
      return;
    }

    CHECK(module != nullptr);
    notification.Notify();
  });
  notification.WaitForNotification();

  return ret;
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint8(
    GpuContext* context, hipDeviceptr_t location, uint8 value, size_t size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipMemsetD8(location, value, size),
                       "Failed to memset memory");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint32(
    GpuContext* context, hipDeviceptr_t location, uint32 value,
    size_t uint32_count) {
  ScopedActivateContext activation{context};
  void* pointer = absl::bit_cast<void*>(location);
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count),
      "Failed to memset memory");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
    GpuContext* context, hipDeviceptr_t location, uint8 value,
    size_t uint32_count, GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
    GpuContext* context, hipDeviceptr_t location, uint32 value,
    size_t uint32_count, GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  void* pointer = absl::bit_cast<void*>(location);
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  VLOG(2) << "successfully enqueued async memset operation";
  return port::Status::OK();
}

/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
                                               GpuStreamHandle stream,
                                               StreamCallback callback,
                                               void* data) {
  hipError_t res = tensorflow::wrap::hipStreamAddCallback(
      stream, (hipStreamCallback_t)callback, data, 0 /* = flags */);
  if (res != hipSuccess) {
    LOG(ERROR) << "unable to add host callback: " << ToString(res);
    return false;
  }
  return true;
}

/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
                                               hipModule_t module,
                                               const char* kernel_name,
                                               hipFunction_t* function) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && kernel_name != nullptr);
  hipError_t res =
      tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to get kernel \"" << kernel_name
               << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
                                             hipModule_t module,
                                             const char* symbol_name,
                                             hipDeviceptr_t* dptr,
                                             size_t* bytes) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
  hipError_t res =
      tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
  if (res != hipSuccess) {
    // symbol may not be found in the current module, but it may reside in
    // another module.
    VLOG(2) << "failed to get symbol \"" << symbol_name
            << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                          hipModule_t module) {
  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipModuleUnload(module);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to unload module " << module
               << "; leaking: " << ToString(res);
  }
}

/* static */ bool GpuDriver::CreateStream(GpuContext* context,
                                          GpuStreamHandle* stream,
                                          int priority) {
  ScopedActivateContext activated{context};
  hipError_t res;
  if (priority == 0) {
    res = tensorflow::wrap::hipStreamCreateWithFlags(
        stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
  } else {
    res = tensorflow::wrap::hipStreamCreateWithPriority(
        stream, hipStreamDefault, priority);  // switch to hipStreamNonBlocking?
  }
  if (res != hipSuccess) {
    LOG(ERROR) << "could not allocate ROCM stream for device "
               << context->device_ordinal() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for device "
          << context->device_ordinal() << " on thread";
  return true;
}

/* static */ void GpuDriver::DestroyStream(GpuContext* context,
                                           GpuStreamHandle* stream) {
  if (*stream == nullptr) {
    return;
  }

  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to destroy ROCM stream for device "
               << context->device_ordinal() << ": " << ToString(res);
  } else {
    VLOG(2) << "successfully destroyed stream " << *stream << " for device "
            << context->device_ordinal();
    *stream = nullptr;
  }
}

/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
                                             uint64 bytes) {
  ScopedActivateContext activated{context};
  hipDeviceptr_t result = 0;
  hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to allocate "
               << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
               << " bytes) from device: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for device " << context->device_ordinal()
          << " of " << bytes << " bytes";
  return ptr;
}

/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
                                              void* location) {
  ScopedActivateContext activation{context};
  hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
  hipError_t res = tensorflow::wrap::hipFree(pointer);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to free device memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated " << location << " for device "
            << context->device_ordinal();
  }
}

/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
                                                    uint64 bytes) {
  ScopedActivateContext activated{context};

  LOG(ERROR)
      << "Feature not supported on ROCm platform (UnifiedMemoryAllocate)";
  return nullptr;
}

/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
                                                     void* location) {
  LOG(ERROR)
      << "Feature not supported on ROCm platform (UnifiedMemoryDeallocate)";
}

/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation{context};
  void* host_mem = nullptr;
  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
  hipError_t res =
      tensorflow::wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes on host: " << ToString(res);
  }
  return host_mem;
}

/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation{context};
  hipError_t res = tensorflow::wrap::hipHostFree(location);
  if (res != hipSuccess) {
    LOG(ERROR) << "error deallocating host memory at " << location << ": "
               << ToString(res);
  }
}

/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
                                          uint64 bytes) {
  ScopedActivateContext activation{context};
  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
  hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes,
                                                     hipHostRegisterPortable);
  if (res != hipSuccess) {
    LOG(ERROR) << "error registering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}

/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation{context};
  hipError_t res = tensorflow::wrap::hipHostUnregister(location);
  if (res != hipSuccess) {
    LOG(ERROR) << "error unregistering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}

/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  GpuEventHandle* event) {
  if (*event == nullptr) {
    return port::Status{port::error::INVALID_ARGUMENT,
                        "input event cannot be null"};
  }

  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
  *event = nullptr;

  switch (res) {
    case hipSuccess:
      return port::Status::OK();
    case hipErrorDeinitialized:
    case hipErrorNotInitialized:
      return port::Status{
          port::error::FAILED_PRECONDITION,
          absl::StrFormat("error destroying ROCM event in device %d: %s",
                          context->device_ordinal(), ToString(res).c_str())};
    default:
      return port::Status{
          port::error::INTERNAL,
          absl::StrFormat("error destroying ROCM event in device %d: %s",
                          context->device_ordinal(), ToString(res).c_str())};
  }
}

/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
                                                 GpuEventHandle event,
                                                 GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
  switch (res) {
    case hipSuccess:
      return port::Status::OK();
    case hipErrorDeinitialized:
    case hipErrorNotInitialized:
      return port::Status{
          port::error::FAILED_PRECONDITION,
          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
                          ToString(res).c_str())};
    default:
      return port::Status{
          port::error::INVALID_ARGUMENT,
          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
                          ToString(res).c_str())};
  }
}

/* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
    GpuContext* context, GpuEventHandle event) {
  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventQuery(event);
  if (res != hipSuccess && res != hipErrorNotReady) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to query event: %s", ToString(res).c_str())};
  }

  return res;
}

/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
                                                 float* elapsed_milliseconds,
                                                 GpuEventHandle start,
                                                 GpuEventHandle stop) {
  ScopedActivateContext activated{context};
  // The stop event must have completed in order for hipEventElapsedTime to
  // work.
  hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
    return false;
  }
  res =
      tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to get elapsed time between events: "
               << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
                                               GpuStreamHandle stream,
                                               GpuEventHandle event) {
  ScopedActivateContext activation{context};
  hipError_t res =
      tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
  if (res != hipSuccess) {
    LOG(ERROR) << "could not wait stream on event: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
  ScopedActivateContext activation{context};
  hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
  if (res != hipSuccess) {
    LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
               << " :: " << port::CurrentStackTrace();
    return false;
  }

  return true;
}

/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipStreamSynchronize(stream),
                       "Could not synchronize on ROCM stream");
  VLOG(2) << "successfully synchronized stream " << stream << " on device "
          << context->device_ordinal();
  return port::Status::OK();
}

/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
                                          GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
  if (res == hipSuccess) {
    return true;
  }

  if (res != hipErrorNotReady) {
    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
  }
  return false;
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
    GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronous memcpy from device to host: "
                      "host dst: %p; Gpu src: %p; size: %llu=0x%llx",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(
    GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
    uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src),
                                      size),
      absl::StrFormat(
          "failed to synchronous memcpy from host to device: Gpu dst: %p;"
          " host src: %p; size: %llu=0x%llx",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(
    GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
    uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size),
      absl::StrFormat(
          "failed to synchronous memcpy from device to device: Gpu dst: %p; "
          "Gpu src: %p; size: %llu=0x%llx",
          absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
          size));
  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
  return port::Status::OK();
}

/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
                                                   void* host_dst,
                                                   hipDeviceptr_t gpu_src,
                                                   uint64 size,
                                                   GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  hipError_t res =
      tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
  if (res != hipSuccess) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
        "Gpu src: %p; size: %llu=0x%llx",
        ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
        size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
          << host_dst << " on stream " << stream;
  return true;
}

/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
                                                   hipDeviceptr_t gpu_dst,
                                                   const void* host_src,
                                                   uint64 size,
                                                   GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  hipError_t res = tensorflow::wrap::hipMemcpyHtoDAsync(
      gpu_dst, const_cast<void*>(host_src), size, stream);
  if (res != hipSuccess) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
        "host src: %p; size: %llu=0x%llx",
        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
        size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
          << " on stream " << stream;
  return true;
}

/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
                                                   hipDeviceptr_t gpu_dst,
                                                   hipDeviceptr_t gpu_src,
                                                   uint64 size,
                                                   GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  hipError_t result =
      tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
  if (result != hipSuccess) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to device: %s"
        "; Gpu dst: %p on %s %s"
        "; Gpu src: %p on %s %s"
        "; can access? %s; size: %llu=0x%llx",
        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
        ROCMPointerToMemorySpaceString(gpu_dst).c_str(),
        ROCMPointerToDeviceString(gpu_dst).c_str(),
        absl::bit_cast<void*>(gpu_src),
        ROCMPointerToMemorySpaceString(gpu_src).c_str(),
        ROCMPointerToDeviceString(gpu_src).c_str(),
        ROCMPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);

    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
  return true;
}

/* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
                                               GpuEventHandle* event,
                                               EventFlags flags) {
  int hipflags;
  switch (flags) {
    case EventFlags::kDefault:
      hipflags = hipEventDefault;
      break;
    case EventFlags::kDisableTiming:
      hipflags = hipEventDisableTiming | hipEventReleaseToSystem;
      break;
    default:
      // Report the unrecognized input flags; hipflags is not yet set here.
      LOG(FATAL) << "impossible event flags: " << static_cast<int>(flags);
  }

  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);

  if (res == hipSuccess) {
    return port::Status::OK();
  } else if (res == hipErrorMemoryAllocation) {
    return port::Status{port::error::RESOURCE_EXHAUSTED,
                        "could not create ROCM event: out of device memory"};
  } else {
    return port::Status{
        port::error::FAILED_PRECONDITION,
        absl::StrCat("could not create ROCM event: ", ToString(res))};
  }
}

/* static */ int GpuDriver::GetDeviceCount() {
  int device_count = 0;
  hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
  if (res != hipSuccess) {
    LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
    return 0;
  }

  if (FLAGS_gpuexec_rocm_device_0_only && device_count > 1) {
    device_count = 1;
  }
  return device_count;
}

/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
                                                          int* cc_minor,
                                                          hipDevice_t device) {
  return port::Status(
      port::error::INTERNAL,
      absl::StrFormat("failed to get compute capability for device: %d "
                      "(unsupported API on AMD Gpus)",
                      device));
}

/* static */ port::Status GpuDriver::GetPointerAddressRange(
    hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
  hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
  if (result == hipSuccess) {
    return port::Status::OK();
  } else if (result == hipErrorNotFound) {
    // We differentiate between "this pointer is unknown" (return here) and
    // "there was an internal error while performing this operation" (return
    // below).
    return port::Status{port::error::NOT_FOUND,
                        absl::StrFormat("not a device pointer %p; %s",
                                        reinterpret_cast<void*>(dptr),
                                        ToString(result).c_str())};
  }

  return port::Status{
      port::error::INTERNAL,
      absl::StrFormat("failed to get pointer info for device pointer %p; %s",
                      reinterpret_cast<void*>(dptr), ToString(result).c_str())};
}

/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
    hipDeviceptr_t pointer) {
  // NOTE: no HIP query is issued here; "result" is hard-coded to hipSuccess
  // and "value" is never populated before the switch below inspects it.
  unsigned int value;
  hipError_t result = hipSuccess;
  if (result == hipSuccess) {
    switch (value) {
      case hipMemoryTypeDevice:
        return MemorySpace::kDevice;
      case hipMemoryTypeHost:
        return MemorySpace::kHost;
      default:
        return port::Status{
            port::error::INTERNAL,
            absl::StrCat("unknown memory space provided by ROCM API: ", value)};
    }
  }

  return port::Status{
      port::error::INTERNAL,
      absl::StrCat("failed to query device pointer for memory space: ",
                   ToString(result))};
}

/* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
    hipDeviceptr_t pointer) {
  hipPointerAttribute_t pointerAttributes;
  hipError_t result =
      tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
  if (result != hipSuccess) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrCat("failed to get device for pointer: ", ToString(result))};
  }

  hipDevice_t device;
  result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
  if (result != hipSuccess) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrCat("failed to get device for pointer: ", ToString(result))};
  }

  return device;
}

/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
                                                      hipDevice_t device) {
  hipDeviceProp_t props;
  hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
  if (result == hipSuccess) {
    *version = props.gcnArch;
    return port::Status::OK();
  }
  *version = 0;
  return port::Status{
      port::error::INTERNAL,
      absl::StrFormat("failed to determine AMDGpu ISA version for device %d",
                      device)};
}

/* static */ port::Status GpuDriver::GetGpuGCNArchName(
    hipDevice_t device, std::string* gcnArchName) {
  hipDeviceProp_t props;
  hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
  if (result == hipSuccess) {
    *gcnArchName = props.gcnArchName;
    return port::Status::OK();
  }
  *gcnArchName = "";
  return port::Status{
      port::error::INTERNAL,
      absl::StrFormat("failed to determine AMDGpu GCN Arch Name for device %d",
                      device)};
}

// Helper function that turns the integer output of hipDeviceGetAttribute to
// type T and wraps it in a StatusOr.
template <typename T>
static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
                                            hipDeviceAttribute_t attribute) {
  int value = -1;
  hipError_t result =
      tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
  if (result != hipSuccess) {
    return port::Status{
        port::error::NOT_FOUND,
        absl::StrCat("could not retrieve ROCM device attribute (", attribute,
                     "): ", ToString(result))};
  }
  T converted = value;
  return converted;
}
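
// For example, the accessors below instantiate this helper as
//   GetSimpleAttribute<int64>(device, hipDeviceAttributeWarpSize)
// to surface a single device attribute as a port::StatusOr<int64>.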

/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
    hipDevice_t device) {
  return GetSimpleAttribute<int>(device, hipDeviceAttributeMultiprocessorCount);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(
      device, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(device,
                                   hipDeviceAttributeMaxSharedMemoryPerBlock);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(
      device, hipDeviceAttributeMaxThreadsPerMultiProcessor);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(device,
                                   hipDeviceAttributeMaxThreadsPerBlock);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(device,
                                   hipDeviceAttributeMaxRegistersPerBlock);
}

/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(device, hipDeviceAttributeWarpSize);
}

/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
                                           hipDevice_t device) {
  int value;
  hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
      &value, hipDeviceAttributeMaxGridDimX, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
    return false;
  }
  *x = value;

  res = tensorflow::wrap::hipDeviceGetAttribute(
      &value, hipDeviceAttributeMaxGridDimY, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
    return false;
  }
  *y = value;

  res = tensorflow::wrap::hipDeviceGetAttribute(
      &value, hipDeviceAttributeMaxGridDimZ, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
    return false;
  }
  *z = value;
  return true;
}

/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
  hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query driver version: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::GetDeviceProperties(
    hipDeviceProp_t* device_properties, int device_ordinal) {
  hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties,
                                                            device_ordinal);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query device properties: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
    hipDeviceAttribute_t attribute, hipDevice_t device) {
  return GetSimpleAttribute<int>(device, attribute);
}

/* static */ bool GpuDriver::IsEccEnabled(hipDevice_t device, bool* result) {
  int value = -1;
  hipError_t res = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}

/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
                                                 int64* free_out,
                                                 int64* total_out) {
  ScopedActivateContext activation{context};
  size_t free = 0;
  size_t total = 0;
  hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query device memory info: " << ToString(res);
    return false;
  }

  *free_out = free;
  *total_out = total;
  return true;
}

/* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
                                                  uint64* result) {
  size_t value = -1;
  hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query total available memory: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}

/* static */ string GpuDriver::GetPCIBusID(hipDevice_t device) {
  string pci_bus_id;
  static const int kBufferSize = 64;
  absl::InlinedVector<char, 4> chars(kBufferSize);
  chars[kBufferSize - 1] = '\0';
  hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(
      chars.begin(), kBufferSize - 1, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
    return pci_bus_id;
  }
  pci_bus_id = chars.begin();
  return pci_bus_id;
}

/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
                                                 GpuContext* to) {
  if (from->device_ordinal() == to->device_ordinal()) {
    return true;  // A device can always access its own memory.
  }

  int can_access_peer = -1;
  hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
      &can_access_peer, from->device_ordinal(), to->device_ordinal());
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
    return false;
  }

  return can_access_peer;
}

/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
                                                      GpuContext* to) {
  if (from->device_ordinal() == to->device_ordinal()) {
    return port::Status::OK();  // A device can always access its own memory.
  }

  ScopedActivateContext activated{from};
  hipError_t result = tensorflow::wrap::hipDeviceEnablePeerAccess(
      to->device_ordinal(), 0 /* = flags */);
  if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to enable peer access from %d to %d: %s",
                        from->device_ordinal(), to->device_ordinal(),
                        ToString(result).c_str())};
  }

  return port::Status::OK();
}

/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, hipFunction_t kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation{context};

  int max_blocks = 0;
  hipError_t result = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (result != hipSuccess) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
                        kernel, ToString(result).c_str())};
  }

  return max_blocks;
}

}  // namespace gpu
}  // namespace stream_executor