/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <stdint.h>
#include <stdlib.h>

#include <map>
#include <set>
#include <utility>

#include "absl/base/casts.h"
#include "absl/container/inlined_vector.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "absl/synchronization/notification.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/human_readable.h"
#include "tensorflow/stream_executor/lib/stacktrace.h"
#include "tensorflow/stream_executor/lib/static_threadlocal.h"
#include "tensorflow/stream_executor/lib/threadpool.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"

bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
bool FLAGS_gpuexec_rocm_device_0_only = false;

#define RETURN_IF_ROCM_ERROR(expr, ...)                                \
  do {                                                                 \
    hipError_t _res = (expr);                                          \
    if (TF_PREDICT_FALSE(_res != hipSuccess)) {                        \
      return port::InternalError(absl::StrCat(                         \
          __VA_ARGS__, ": ", ::stream_executor::gpu::ToString(_res))); \
    }                                                                  \
  } while (0)
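
// Illustrative use, mirroring the call sites below: wrap a HIP driver call so
// that a non-hipSuccess result becomes a port::InternalError carrying the
// caller's message plus the stringified hipError_t, e.g.
//
//   RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipStreamSynchronize(stream),
//                        "Could not synchronize on ROCM stream");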

// Debugging: on each push and pop of a rocm context, verify the current device
// matches the expected one.
constexpr bool kVerifyGpuContext = false;

namespace stream_executor {
namespace gpu {

// GpuContext wraps the device_ordinal.
// Only reason we need this wrapper class is to make the GpuDriver* API
class GpuContext {
 public:
  GpuContext(const int v) : device_ordinal_(v) {}

  int device_ordinal() const { return device_ordinal_; }

  // Disallow copying and moving.
  GpuContext(GpuContext&&) = delete;
  GpuContext(const GpuContext&) = delete;
  GpuContext& operator=(GpuContext&&) = delete;
  GpuContext& operator=(const GpuContext&) = delete;

 private:
  const int device_ordinal_;
};

namespace {

// Formats hipError_t to output prettified values into a log stream.
// Error summaries taken from:
string ToString(hipError_t result) {
#define OSTREAM_ROCM_ERROR(__name) \
  case hipError##__name:           \
    return "HIP_ERROR_" #__name;

  switch (result) {
    OSTREAM_ROCM_ERROR(InvalidValue)
    OSTREAM_ROCM_ERROR(OutOfMemory)
    OSTREAM_ROCM_ERROR(NotInitialized)
    OSTREAM_ROCM_ERROR(Deinitialized)
    OSTREAM_ROCM_ERROR(NoDevice)
    OSTREAM_ROCM_ERROR(InvalidDevice)
    OSTREAM_ROCM_ERROR(InvalidImage)
    OSTREAM_ROCM_ERROR(InvalidContext)
    OSTREAM_ROCM_ERROR(InvalidHandle)
    OSTREAM_ROCM_ERROR(NotFound)
    OSTREAM_ROCM_ERROR(NotReady)
    OSTREAM_ROCM_ERROR(NoBinaryForGpu)

    // Encountered an uncorrectable ECC error during execution.
    OSTREAM_ROCM_ERROR(ECCNotCorrectable)

    // Load/store on an invalid address. Must reboot all context.
    case 700:
      return "ROCM_ERROR_ILLEGAL_ADDRESS";
    // Passed too many / wrong arguments, too many threads for register count.
    case 701:
      return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";

    OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
    OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
    OSTREAM_ROCM_ERROR(Unknown)  // Unknown internal error to ROCM.
    default:
      return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
  }
}
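
// For example, the switch above maps hipErrorOutOfMemory to
// "HIP_ERROR_OutOfMemory", while an unrecognized error code falls through to
// the generic "hipError_t(<numeric value>)" form.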

// ROCM driver routines may require a large amount of stack (particularly
// hipModuleLoadDataEx, in our experience). To avoid stack overflow when using
// stack-limited threads (such as those spawned by a default-argument
// thread::ThreadPool on some platforms), we run certain routines in this pool
// and wait for completion.
port::ThreadPool* GetDriverExecutor() {
  static port::ThreadPool* thread_pool = new port::ThreadPool(
      port::Env::Default(), port::ThreadOptions(), "rocm_driver", 1);
  return thread_pool;
}
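
// Illustrative pattern (this is how LoadHsaco below uses the pool): schedule
// the stack-heavy driver call on this executor and block on an
// absl::Notification until it completes, e.g.
//
//   absl::Notification done;
//   GetDriverExecutor()->Schedule([&] { /* driver call */ done.Notify(); });
//   done.WaitForNotification();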

}  // namespace

string MemorySpaceString(MemorySpace memory_space) {
  switch (memory_space) {
    case MemorySpace::kHost:
      return "host";
    case MemorySpace::kDevice:
      return "device";
    default:
      LOG(FATAL) << "impossible memory space";
  }
}

// Returns the current device set in HIP. This is done by calling the
// HIP driver (e.g., this value is not our cached view of the current device).
static int CurrentDeviceOrDie() {
  int current = -1;
  hipError_t result = tensorflow::wrap::hipGetDevice(&current);
  if (result != hipSuccess) {
    LOG(FATAL) << "failed to query current device: " << ToString(result);
  }
  return current;
}

namespace {

// Call hipDeviceSynchronize and crash if it doesn't succeed.
void SynchronizeOrDie() {
  auto res = tensorflow::wrap::hipDeviceSynchronize();
  if (res != hipSuccess) {
    LOG(FATAL) << "Synchronize found " << ToString(res)
               << " :: " << port::CurrentStackTrace();
  }
}

struct ThreadLocalData {
  int current_device_ordinal;
  int depth;
};

SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);

}  // namespace

ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();
  if (tls->depth == 0) {
    tls->current_device_ordinal = CurrentDeviceOrDie();
  }

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth++;

  to_restore_ = context;

  if (context->device_ordinal() == tls->current_device_ordinal) {
    DCHECK_EQ(CurrentDeviceOrDie(), context->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to " << context->device_ordinal();

  // Set the device and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(context->device_ordinal()));
  tls->current_device_ordinal = context->device_ordinal();
}

ScopedActivateContext::~ScopedActivateContext() {
  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
    SynchronizeOrDie();
  }

  auto* tls = &tls_data.get();

  if (kVerifyGpuContext) {
    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
  }

  tls->depth--;
  DCHECK_GE(tls->depth, 0);

  if (to_restore_->device_ordinal() == tls->current_device_ordinal) {
    DCHECK_EQ(CurrentDeviceOrDie(), to_restore_->device_ordinal());
    return;
  }

  VLOG(3) << "ScopedActivateContext switching device from "
          << tls->current_device_ordinal << " to "
          << to_restore_->device_ordinal();

  // Set context and update thread local.
  CHECK_EQ(hipSuccess,
           tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
  tls->current_device_ordinal = to_restore_->device_ordinal();
}
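
// Illustrative usage, as in the GpuDriver methods below: activating a context
// for the lifetime of a scope switches the current HIP device if needed and
// restores the previous one (tracked in thread-local state) on scope exit.
//
//   {
//     ScopedActivateContext activation{context};
//     // ... HIP driver calls targeting context->device_ordinal() ...
//   }  // previous device restored here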

namespace {

// Returns a stringified device number associated with pointer, primarily for
// logging purposes. Returns "?" if the device could not be successfully
// queried.
string ROCMPointerToDeviceString(hipDeviceptr_t pointer) {
  auto value = GpuDriver::GetPointerDevice(pointer);
  if (value.ok()) {
    return absl::StrCat(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query device: " << value.status();
  return "?";
}

// Returns a stringified memory space associated with pointer, primarily for
// logging purposes. Returns "?" if the memory space could not be successfully
// queried.
string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
  auto value = GpuDriver::GetPointerMemorySpace(pointer);
  if (value.ok()) {
    return MemorySpaceString(value.ValueOrDie());
  }
  LOG(ERROR) << "could not query memory space: " << value.status();
  return "?";
}

// Returns a stringified representation of whether or not peer access is
// permitted between the "from" and "to" pointers' associated contexts,
// primarily for logging purposes. Returns "error" if an error is encountered
// in the process of querying.
string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
  hipPointerAttribute_t from_pointerAttributes;
  hipError_t result =
      tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
  if (result != hipSuccess) {
    LOG(ERROR) << "could not retrieve source pointer's device: "
               << ToString(result);
    return "error";
  }

  hipPointerAttribute_t to_pointerAttributes;
  result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
  if (result != hipSuccess) {
    LOG(ERROR) << "could not retrieve destination pointer's device: "
               << ToString(result);
    return "error";
  }

  GpuContext fromCtx(from_pointerAttributes.device);
  GpuContext toCtx(to_pointerAttributes.device);

  return GpuDriver::CanEnablePeerAccess(&fromCtx, &toCtx) ? "true" : "false";
}

// Actually performs the work of ROCM initialization. Wrapped up in one-time
// execution guard.
static port::Status InternalInit() {
  hipError_t res = hipErrorNoDevice;
  if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
    LOG(ERROR) << "injecting ROCM init error; initialization will fail";
  } else {
    res = tensorflow::wrap::hipInit(0 /* = flags */);
  }

  if (res == hipSuccess) {
    return port::Status::OK();
  }

  LOG(ERROR) << "failed call to hipInit: " << ToString(res);
  Diagnostician::LogDiagnosticInformation();
  return port::Status{port::error::ABORTED,
                      absl::StrCat("failed call to hipInit: ", ToString(res))};
}

}  // namespace

/* static */ port::Status GpuDriver::Init() {
  // Cached return value from calling InternalInit(), as hipInit need only be
  // called once, but GpuDriver::Init may be called many times.
  static port::Status* init_retval = [] {
    return new port::Status(InternalInit());
  }();
  return *init_retval;
}

/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                               hipDevice_t* device) {
  hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
  if (res == hipSuccess) {
    return port::Status::OK();
  }

  return port::Status{
      port::error::INTERNAL,
      absl::StrCat("failed call to hipDeviceGet: ", ToString(res))};
}

/* static */ port::Status GpuDriver::GetDeviceName(hipDevice_t device,
                                                   string* device_name) {
  static const size_t kCharLimit = 64;
  absl::InlinedVector<char, 4> chars(kCharLimit);
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device),
      "Failed to get device name");
  chars[kCharLimit - 1] = '\0';
  *device_name = chars.begin();
  return port::Status::OK();
}

bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
                                 int* flags) {
  static_assert(DeviceOptions::kMask == 0xf,
                "needs update for new device options");
  return true;
}

/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, hipDevice_t device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = new GpuContext(device_ordinal);
  return port::Status::OK();
}

/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
  if (context == nullptr) {
    return;
  }
  delete context;
}

/* static */ port::Status GpuDriver::FuncGetAttribute(
    hipDeviceAttribute_t attribute, hipFunction_t func, int* attribute_value) {
  // TODO(ROCm) properly implement this feature in HIP
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::FuncSetCacheConfig(
    hipFunction_t function, hipFuncCache_t cache_config) {
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config),
      "Failed to set ROCM kernel cache config.");
  return port::Status::OK();
}

/* static */ port::StatusOr<hipSharedMemConfig>
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  hipSharedMemConfig shared_mem_config;
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config),
      "Failed to get shared memory config");
  return shared_mem_config;
}

/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
    GpuContext* context, hipSharedMemConfig shared_mem_config) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config),
      "Failed to set ROCM device shared memory config");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::LaunchKernel(
    GpuContext* context, hipFunction_t function, unsigned int grid_dim_x,
    unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
    unsigned int block_dim_y, unsigned int block_dim_z,
    unsigned int shared_mem_bytes, GpuStreamHandle stream, void** kernel_params,
    void** extra) {
  ScopedActivateContext activation{context};
  VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
          << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipModuleLaunchKernel(
                           function, grid_dim_x, grid_dim_y, grid_dim_z,
                           block_dim_x, block_dim_y, block_dim_z,
                           shared_mem_bytes, stream, kernel_params, extra),
                       "Failed to launch ROCM kernel");
  VLOG(2) << "successfully launched kernel";
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::LoadPtx(GpuContext* context,
                                             const char* ptx_contents,
                                             hipModule_t* module) {
  LOG(ERROR) << "Feature not supported on ROCm platform (LoadPtx)";
  return port::InternalError("Not Implemented");
}

/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               hipModule_t* module) {
  return port::Status{port::error::INTERNAL,
                      "Feature not supported on ROCm platform (LoadCubin)"};
}

/* static */ port::Status GpuDriver::LoadHsaco(GpuContext* context,
                                               const char* hsaco_contents,
                                               hipModule_t* module) {
  absl::Notification notification;
  port::Status ret = port::Status::OK();
  GetDriverExecutor()->Schedule([context, hsaco_contents, module, &ret,
                                 &notification]() {
    ScopedActivateContext activation{context};
    void* hsaco_data = const_cast<char*>(hsaco_contents);

    hipError_t res = tensorflow::wrap::hipModuleLoadData(module, hsaco_data);

    if (res != hipSuccess) {
      ret = port::InternalError(
          absl::StrCat("Failed to load HSACO: ", ToString(res)));
      notification.Notify();
      // Return early: an absl::Notification may only be notified once.
      return;
    }

    CHECK(module != nullptr);
    notification.Notify();
  });
  notification.WaitForNotification();

  return ret;
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint8(
    GpuContext* context, hipDeviceptr_t location, uint8 value, size_t size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipMemsetD8(location, value, size),
                       "Failed to memset memory");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::SynchronousMemsetUint32(
    GpuContext* context, hipDeviceptr_t location, uint32 value,
    size_t uint32_count) {
  ScopedActivateContext activation{context};
  void* pointer = absl::bit_cast<void*>(location);
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetD32(pointer, value, uint32_count),
      "Failed to memset memory");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint8(
    GpuContext* context, hipDeviceptr_t location, uint8 value,
    size_t uint32_count, GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::AsynchronousMemsetUint32(
    GpuContext* context, hipDeviceptr_t location, uint32 value,
    size_t uint32_count, GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  void* pointer = absl::bit_cast<void*>(location);
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemsetD32Async(pointer, value, uint32_count, stream),
      "Failed to enqueue async memset operation");
  VLOG(2) << "successfully enqueued async memset operation";
  return port::Status::OK();
}

/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
                                               GpuStreamHandle stream,
                                               StreamCallback callback,
                                               void* data) {
  hipError_t res = tensorflow::wrap::hipStreamAddCallback(
      stream, (hipStreamCallback_t)callback, data, 0 /* = flags */);
  if (res != hipSuccess) {
    LOG(ERROR) << "unable to add host callback: " << ToString(res);
    return false;
  }
  return true;
}

/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
                                               hipModule_t module,
                                               const char* kernel_name,
                                               hipFunction_t* function) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && kernel_name != nullptr);
  hipError_t res =
      tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to get kernel \"" << kernel_name
               << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
                                             hipModule_t module,
                                             const char* symbol_name,
                                             hipDeviceptr_t* dptr,
                                             size_t* bytes) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
  hipError_t res =
      tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
  if (res != hipSuccess) {
    // symbol may not be found in the current module, but it may reside in
    // another module.
    VLOG(2) << "failed to get symbol \"" << symbol_name
            << "\" from module: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                          hipModule_t module) {
  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipModuleUnload(module);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to unload module " << module
               << "; leaking: " << ToString(res);
  }
}

/* static */ bool GpuDriver::CreateStream(GpuContext* context,
                                          GpuStreamHandle* stream,
                                          int priority) {
  ScopedActivateContext activated{context};
  hipError_t res;
  if (priority == 0) {
    res = tensorflow::wrap::hipStreamCreateWithFlags(
        stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
  } else {
    res = tensorflow::wrap::hipStreamCreateWithPriority(
        stream, hipStreamDefault, priority);  // switch to hipStreamNonBlocking?
  }
  if (res != hipSuccess) {
    LOG(ERROR) << "could not allocate ROCM stream for device "
               << context->device_ordinal() << ": " << ToString(res);
    return false;
  }

  VLOG(2) << "successfully created stream " << *stream << " for device "
          << context->device_ordinal() << " on thread";
  return true;
}

/* static */ void GpuDriver::DestroyStream(GpuContext* context,
                                           GpuStreamHandle* stream) {
  if (*stream == nullptr) {
    return;
  }

  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to destroy ROCM stream for device "
               << context->device_ordinal() << ": " << ToString(res);
  } else {
    VLOG(2) << "successfully destroyed stream " << *stream << " for device "
            << context->device_ordinal();
    *stream = nullptr;
  }
}

/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
                                             uint64 bytes) {
  ScopedActivateContext activated{context};
  hipDeviceptr_t result = 0;
  hipError_t res = tensorflow::wrap::hipMalloc(&result, bytes);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to allocate "
               << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
               << " bytes) from device: " << ToString(res);
    return nullptr;
  }
  void* ptr = reinterpret_cast<void*>(result);
  VLOG(2) << "allocated " << ptr << " for device " << context->device_ordinal()
          << " of " << bytes << " bytes";
  return ptr;
}

/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
                                              void* location) {
  ScopedActivateContext activation{context};
  hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
  hipError_t res = tensorflow::wrap::hipFree(pointer);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to free device memory at " << location
               << "; result: " << ToString(res);
  } else {
    VLOG(2) << "deallocated " << location << " for device "
            << context->device_ordinal();
  }
}

/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
                                                    uint64 bytes) {
  ScopedActivateContext activated{context};

  LOG(ERROR)
      << "Feature not supported on ROCm platform (UnifiedMemoryAllocate)";
  return nullptr;
}

/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
                                                     void* location) {
  LOG(ERROR)
      << "Feature not supported on ROCm platform (UnifiedMemoryDeallocate)";
}

/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
  ScopedActivateContext activation{context};
  void* host_mem = nullptr;
  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
  hipError_t res =
      tensorflow::wrap::hipHostMalloc(&host_mem, bytes, hipHostMallocPortable);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to alloc " << bytes
               << " bytes on host: " << ToString(res);
  }
  return host_mem;
}

/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation{context};
  hipError_t res = tensorflow::wrap::hipHostFree(location);
  if (res != hipSuccess) {
    LOG(ERROR) << "error deallocating host memory at " << location << ": "
               << ToString(res);
  }
}

/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
                                          uint64 bytes) {
  ScopedActivateContext activation{context};
  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
  hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes,
                                                     hipHostRegisterPortable);
  if (res != hipSuccess) {
    LOG(ERROR) << "error registering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}

/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
                                            void* location) {
  ScopedActivateContext activation{context};
  hipError_t res = tensorflow::wrap::hipHostUnregister(location);
  if (res != hipSuccess) {
    LOG(ERROR) << "error unregistering host memory at " << location << ": "
               << ToString(res);
    return false;
  }
  return true;
}

/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
                                                  GpuEventHandle* event) {
  if (*event == nullptr) {
    return port::Status{port::error::INVALID_ARGUMENT,
                        "input event cannot be null"};
  }

  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
  *event = nullptr;

  switch (res) {
    case hipSuccess:
      return port::Status::OK();
    case hipErrorDeinitialized:
    case hipErrorNotInitialized:
      return port::Status{
          port::error::FAILED_PRECONDITION,
          absl::StrFormat("error destroying ROCM event in device %d: %s",
                          context->device_ordinal(), ToString(res).c_str())};
    default:
      return port::Status{
          port::error::INTERNAL,
          absl::StrFormat("error destroying ROCM event in device %d: %s",
                          context->device_ordinal(), ToString(res).c_str())};
  }
}

/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
                                                 GpuEventHandle event,
                                                 GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
  switch (res) {
    case hipSuccess:
      return port::Status::OK();
    case hipErrorDeinitialized:
    case hipErrorNotInitialized:
      return port::Status{
          port::error::FAILED_PRECONDITION,
          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
                          ToString(res).c_str())};
    default:
      return port::Status{
          port::error::INVALID_ARGUMENT,
          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
                          ToString(res).c_str())};
  }
}

/* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
    GpuContext* context, GpuEventHandle event) {
  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventQuery(event);
  if (res != hipSuccess && res != hipErrorNotReady) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to query event: %s", ToString(res).c_str())};
  }

  return res;
}

/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
                                                 float* elapsed_milliseconds,
                                                 GpuEventHandle start,
                                                 GpuEventHandle stop) {
  ScopedActivateContext activated{context};
  // The stop event must have completed in order for hipEventElapsedTime to
  // work.
  hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
    return false;
  }
  res =
      tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to get elapsed time between events: "
               << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
                                               GpuStreamHandle stream,
                                               GpuEventHandle event) {
  ScopedActivateContext activation{context};
  hipError_t res =
      tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
  if (res != hipSuccess) {
    LOG(ERROR) << "could not wait stream on event: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
  ScopedActivateContext activation{context};
  hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
  if (res != hipSuccess) {
    LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
               << " :: " << port::CurrentStackTrace();
    return false;
  }

  return true;
}

/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
                                                       GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  RETURN_IF_ROCM_ERROR(tensorflow::wrap::hipStreamSynchronize(stream),
                       "Could not synchronize on ROCM stream");
  VLOG(2) << "successfully synchronized stream " << stream << " on device "
          << context->device_ordinal();
  return port::Status::OK();
}

/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
                                          GpuStreamHandle stream) {
  ScopedActivateContext activated{context};
  CHECK(stream != nullptr);
  hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
  if (res == hipSuccess) {
    return true;
  }

  if (res != hipErrorNotReady) {
    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
  }
  return false;
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
    GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size),
      absl::StrFormat("failed to synchronous memcpy from device to host: "
                      "host dst: %p; Gpu src: %p; size: %llu=0x%llx",
                      host_dst, absl::bit_cast<void*>(gpu_src), size, size));
  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
          << host_dst;
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(
    GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
    uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyHtoD(gpu_dst, const_cast<void*>(host_src),
                                      size),
      absl::StrFormat(
          "failed to synchronous memcpy from host to device: Gpu dst: %p;"
          " host src: %p; size: %llu=0x%llx",
          absl::bit_cast<void*>(gpu_dst), host_src, size, size));
  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
  return port::Status::OK();
}

/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(
    GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
    uint64 size) {
  ScopedActivateContext activation{context};
  RETURN_IF_ROCM_ERROR(
      tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size),
      absl::StrFormat(
          "failed to synchronous memcpy from device to device: Gpu dst: %p; "
          "Gpu src: %p; size: %llu=0x%llx",
          absl::bit_cast<void*>(gpu_dst), absl::bit_cast<void*>(gpu_src), size,
          size));
  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
  return port::Status::OK();
}

/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
                                                   void* host_dst,
                                                   hipDeviceptr_t gpu_src,
                                                   uint64 size,
                                                   GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  hipError_t res =
      tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
  if (res != hipSuccess) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
        "Gpu src: %p; size: %llu=0x%llx",
        ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
        size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
          << host_dst << " on stream " << stream;
  return true;
}

/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
                                                   hipDeviceptr_t gpu_dst,
                                                   const void* host_src,
                                                   uint64 size,
                                                   GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  hipError_t res = tensorflow::wrap::hipMemcpyHtoDAsync(
      gpu_dst, const_cast<void*>(host_src), size, stream);
  if (res != hipSuccess) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
        "host src: %p; size: %llu=0x%llx",
        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
        size);
    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
          << " on stream " << stream;
  return true;
}

/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
                                                   hipDeviceptr_t gpu_dst,
                                                   hipDeviceptr_t gpu_src,
                                                   uint64 size,
                                                   GpuStreamHandle stream) {
  ScopedActivateContext activation{context};
  hipError_t result =
      tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
  if (result != hipSuccess) {
    LOG(ERROR) << absl::StrFormat(
        "failed to enqueue async memcpy from device to device: %s"
        "; Gpu dst: %p on %s %s"
        "; Gpu src: %p on %s %s"
        "; can access? %s; size: %llu=0x%llx",
        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
        ROCMPointerToMemorySpaceString(gpu_dst).c_str(),
        ROCMPointerToDeviceString(gpu_dst).c_str(),
        absl::bit_cast<void*>(gpu_src),
        ROCMPointerToMemorySpaceString(gpu_src).c_str(),
        ROCMPointerToDeviceString(gpu_src).c_str(),
        ROCMPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);

    return false;
  }
  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
  return true;
}

/* static */ port::Status GpuDriver::InitEvent(GpuContext* context,
                                               GpuEventHandle* event,
                                               EventFlags flags) {
  int hipflags;
  switch (flags) {
    case EventFlags::kDefault:
      hipflags = hipEventDefault;
      break;
    case EventFlags::kDisableTiming:
      hipflags = hipEventDisableTiming | hipEventReleaseToSystem;
      break;
    default:
      // Report the unrecognized input flags; hipflags is not yet set here.
      LOG(FATAL) << "impossible event flags: " << static_cast<int>(flags);
  }

  ScopedActivateContext activated{context};
  hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);

  if (res == hipSuccess) {
    return port::Status::OK();
  } else if (res == hipErrorMemoryAllocation) {
    return port::Status{port::error::RESOURCE_EXHAUSTED,
                        "could not create ROCM event: out of device memory"};
  } else {
    return port::Status{
        port::error::FAILED_PRECONDITION,
        absl::StrCat("could not create ROCM event: ", ToString(res))};
  }
}

/* static */ int GpuDriver::GetDeviceCount() {
  int device_count = 0;
  hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
  if (res != hipSuccess) {
    LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
    return 0;
  }

  if (FLAGS_gpuexec_rocm_device_0_only && device_count > 1) {
    device_count = 1;
  }
  return device_count;
}

/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
                                                          int* cc_minor,
                                                          hipDevice_t device) {
  return port::Status(
      port::error::INTERNAL,
      absl::StrFormat("failed to get compute capability for device: %d "
                      "(unsupported API on AMD Gpus)",
                      device));
}

/* static */ port::Status GpuDriver::GetPointerAddressRange(
    hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
  hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
  if (result == hipSuccess) {
    return port::Status::OK();
  } else if (result == hipErrorNotFound) {
    // We differentiate between "this pointer is unknown" (return here) and
    // "there was an internal error while performing this operation" (return
    // below).
    return port::Status{port::error::NOT_FOUND,
                        absl::StrFormat("not a device pointer %p; %s",
                                        reinterpret_cast<void*>(dptr),
                                        ToString(result).c_str())};
  }

  return port::Status{
      port::error::INTERNAL,
      absl::StrFormat("failed to get pointer info for device pointer %p; %s",
                      reinterpret_cast<void*>(dptr), ToString(result).c_str())};
}

/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
    hipDeviceptr_t pointer) {
  // NOTE: no HIP query is issued here; "result" is hard-coded to hipSuccess
  // and "value" is never populated before the switch below inspects it.
  unsigned int value;
  hipError_t result = hipSuccess;
  if (result == hipSuccess) {
    switch (value) {
      case hipMemoryTypeDevice:
        return MemorySpace::kDevice;
      case hipMemoryTypeHost:
        return MemorySpace::kHost;
      default:
        return port::Status{
            port::error::INTERNAL,
            absl::StrCat("unknown memory space provided by ROCM API: ", value)};
    }
  }

  return port::Status{
      port::error::INTERNAL,
      absl::StrCat("failed to query device pointer for memory space: ",
                   ToString(result))};
}

/* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
    hipDeviceptr_t pointer) {
  hipPointerAttribute_t pointerAttributes;
  hipError_t result =
      tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
  if (result != hipSuccess) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrCat("failed to get device for pointer: ", ToString(result))};
  }

  hipDevice_t device;
  result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
  if (result != hipSuccess) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrCat("failed to get device for pointer: ", ToString(result))};
  }

  return device;
}

/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
                                                      hipDevice_t device) {
  hipDeviceProp_t props;
  hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
  if (result == hipSuccess) {
    *version = props.gcnArch;
    return port::Status::OK();
  }
  *version = 0;
  return port::Status{
      port::error::INTERNAL,
      absl::StrFormat("failed to determine AMDGpu ISA version for device %d",
                      device)};
}

/* static */ port::Status GpuDriver::GetGpuGCNArchName(
    hipDevice_t device, std::string* gcnArchName) {
  hipDeviceProp_t props;
  hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
  if (result == hipSuccess) {
    *gcnArchName = props.gcnArchName;
    return port::Status::OK();
  }
  *gcnArchName = "";
  return port::Status{
      port::error::INTERNAL,
      absl::StrFormat("failed to determine AMDGpu GCN Arch Name for device %d",
                      device)};
}

// Helper function that turns the integer output of hipDeviceGetAttribute to
// type T and wraps it in a StatusOr.
template <typename T>
static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
                                            hipDeviceAttribute_t attribute) {
  int value = -1;
  hipError_t result =
      tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
  if (result != hipSuccess) {
    return port::Status{
        port::error::NOT_FOUND,
        absl::StrCat("could not retrieve ROCM device attribute (", attribute,
                     "): ", ToString(result))};
  }
  T converted = value;
  return converted;
}
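
// For example, the accessors below instantiate this helper as
//   GetSimpleAttribute<int64>(device, hipDeviceAttributeWarpSize)
// to surface a single device attribute as a port::StatusOr<int64>.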

/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
    hipDevice_t device) {
  return GetSimpleAttribute<int>(device, hipDeviceAttributeMultiprocessorCount);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(
      device, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(device,
                                   hipDeviceAttributeMaxSharedMemoryPerBlock);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(
      device, hipDeviceAttributeMaxThreadsPerMultiProcessor);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(device,
                                   hipDeviceAttributeMaxThreadsPerBlock);
}

/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(device,
                                   hipDeviceAttributeMaxRegistersPerBlock);
}

/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
    hipDevice_t device) {
  return GetSimpleAttribute<int64>(device, hipDeviceAttributeWarpSize);
}

/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
                                           hipDevice_t device) {
  int value;
  hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
      &value, hipDeviceAttributeMaxGridDimX, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
    return false;
  }
  *x = value;

  res = tensorflow::wrap::hipDeviceGetAttribute(
      &value, hipDeviceAttributeMaxGridDimY, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
    return false;
  }
  *y = value;

  res = tensorflow::wrap::hipDeviceGetAttribute(
      &value, hipDeviceAttributeMaxGridDimZ, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
    return false;
  }
  *z = value;
  return true;
}

/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
  hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query driver version: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ bool GpuDriver::GetDeviceProperties(
    hipDeviceProp_t* device_properties, int device_ordinal) {
  hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties,
                                                            device_ordinal);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query device properties: " << ToString(res);
    return false;
  }

  return true;
}

/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
    hipDeviceAttribute_t attribute, hipDevice_t device) {
  return GetSimpleAttribute<int>(device, attribute);
}

/* static */ bool GpuDriver::IsEccEnabled(hipDevice_t device, bool* result) {
  int value = -1;
  hipError_t res = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}

/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
                                                 int64* free_out,
                                                 int64* total_out) {
  ScopedActivateContext activation{context};
  size_t free = 0;
  size_t total = 0;
  hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query device memory info: " << ToString(res);
    return false;
  }

  *free_out = free;
  *total_out = total;
  return true;
}

/* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
                                                  uint64* result) {
  size_t value = -1;
  hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query total available memory: " << ToString(res);
    return false;
  }

  *result = value;
  return true;
}

/* static */ string GpuDriver::GetPCIBusID(hipDevice_t device) {
  string pci_bus_id;
  static const int kBufferSize = 64;
  absl::InlinedVector<char, 4> chars(kBufferSize);
  chars[kBufferSize - 1] = '\0';
  hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(
      chars.begin(), kBufferSize - 1, device);
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
    return pci_bus_id;
  }
  pci_bus_id = chars.begin();
  return pci_bus_id;
}

/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
                                                 GpuContext* to) {
  if (from->device_ordinal() == to->device_ordinal()) {
    return true;  // A device can always access its own memory.
  }

  int can_access_peer = -1;
  hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
      &can_access_peer, from->device_ordinal(), to->device_ordinal());
  if (res != hipSuccess) {
    LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
    return false;
  }

  return can_access_peer;
}

/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
                                                      GpuContext* to) {
  if (from->device_ordinal() == to->device_ordinal()) {
    return port::Status::OK();  // A device can always access its own memory.
  }

  ScopedActivateContext activated{from};
  hipError_t result = tensorflow::wrap::hipDeviceEnablePeerAccess(
      to->device_ordinal(), 0 /* = flags */);
  if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to enable peer access from %d to %d: %s",
                        from->device_ordinal(), to->device_ordinal(),
                        ToString(result).c_str())};
  }

  return port::Status::OK();
}

/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
    GpuContext* context, hipFunction_t kernel, int threads_per_block,
    size_t dynamic_shared_memory_bytes) {
  ScopedActivateContext activation{context};

  int max_blocks = 0;
  hipError_t result = hipSuccess;
  // TODO(ROCm) implement this feature in HIP
  if (result != hipSuccess) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
                        kernel, ToString(result).c_str())};
  }

  return max_blocks;
}

}  // namespace gpu
}  // namespace stream_executor