PR #25011: [GPU][ROCm][CUDA] StreamExecutor logic for ROCm / CUDA platform (PR 20709 / 22669 / 24156 continued)

Please approve this CL. It will be submitted automatically, and its GitHub pull request will be marked as merged.

Imported from GitHub PR #25011

New PR to continue the efforts started by @deven-amd in #20709 / #22669 / #24156.

This PR aims to refactor the StreamExecutor GPU interfaces so they can be shared between CUDA and ROCm. It is the first in a series of PRs.

Based on @timshen91's input, I've refactored the logic in #24156 so that it:

- only contains changes in stream_executor/....
- does not remove any stream_executor/cuda/*.h, so that code outside of stream_executor doesn't break. All types and functions in namespace cuda are now aliases of their namespace gpu counterparts, for example namespace cuda { using CUDADriver = gpu::GpuDriver; } (a fuller sketch follows this list).
- restricts all stream_executor/gpu/BUILD targets to be visible only to //third_party/tensorflow/stream_executor:__subpackages__.
- ensures a target stream_executor/gpu:X is used only by stream_executor/cuda:cuda_X or stream_executor/rocm:rocm_X, never by some other cuda_Y. For example, cuda:cuda_platform should depend on cuda:cuda_driver, not on gpu:gpu_driver.
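As a concrete illustration of the aliasing pattern above, here is a minimal sketch of what a header such as stream_executor/cuda/cuda_driver.h looks like after the refactor. The CUDADriver alias is the example quoted above; the stream_executor/gpu/gpu_driver.h include and the shared gpu::GpuDriver class are the ones this PR introduces, and everything else is illustrative rather than an exact copy of the changed file.

    // cuda_driver.h (sketch): the CUDA-specific header now only forwards to
    // the shared GPU implementation instead of declaring its own types.
    #include "tensorflow/stream_executor/gpu/gpu_driver.h"

    namespace stream_executor {
    namespace cuda {

    // Keep the old CUDA-named type so callers outside stream_executor that
    // still spell cuda::CUDADriver continue to compile unchanged.
    using CUDADriver = gpu::GpuDriver;

    }  // namespace cuda
    }  // namespace stream_executor

The same pattern is visible further down in this diff for cuda_activation.h (ScopedActivateExecutorContext) and cuda_diagnostics.h (DriverVersion, Diagnostician).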

Copybara import of the project:

  - 267affbb73df9164baf4e62142fe7201e6a305ee [ROCm][CUDA] StreamExecutor logic for ROCm / CUDA platform by Wen-Heng (Jack) Chung <whchung@gmail.com>
  - 04fac5bf358059bdb2cd4a3e092e52dc982ea7b0 Merge 267affbb73df9164baf4e62142fe7201e6a305ee into 5f8ea... by Wen-Heng (Jack) Chung <whchung@gmail.com>

COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/tensorflow/pull/25011 from ROCmSoftwarePlatform:google-upstream-pr-stream-executor-alt 267affbb73df9164baf4e62142fe7201e6a305ee
PiperOrigin-RevId: 231250990
commit aba83497f5 (parent 56c3ac7d23)
Tim Shen, 2019-01-28 11:00:39 -08:00, committed by TensorFlower Gardener
71 changed files with 6790 additions and 2324 deletions


@ -343,6 +343,13 @@ config_setting(
},
)
config_setting(
name = "using_rocm_hipcc",
define_values = {
"using_rocm_hipcc": "true",
},
)
config_setting(
name = "with_mpi_support",
values = {"define": "with_mpi_support=true"},


@ -1964,6 +1964,14 @@ cc_library(
],
)
cc_library(
name = "rocm",
visibility = ["//visibility:public"],
deps = [
"//tensorflow/core/platform/default/build_config:rocm",
],
)
# -----------------------------------------------------------------------------
# Clif-related proto libraries.


@ -6,6 +6,7 @@ load("//tensorflow:tensorflow.bzl", "if_windows")
load("//tensorflow:tensorflow.bzl", "if_not_windows")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
load(
"//third_party/mkl:build_defs.bzl",
"if_mkl_ml",
@ -735,6 +736,11 @@ def tf_additional_binary_deps():
"//tensorflow/stream_executor:cuda_platform",
"//tensorflow/core/platform/default/build_config:cuda",
],
) + if_rocm(
[
"//tensorflow/stream_executor:rocm_platform",
"//tensorflow/core/platform/default/build_config:rocm",
],
) + [
# TODO(allenl): Split these out into their own shared objects (they are
# here because they are shared between contrib/ op shared objects and


@ -8,6 +8,7 @@ licenses(["notice"]) # Apache 2.0
exports_files(["LICENSE"])
load("//tensorflow:tensorflow.bzl", "if_cuda")
load("//tensorflow:tensorflow.bzl", "if_rocm")
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
@ -42,6 +43,7 @@ tf_cuda_library(
"//tensorflow/stream_executor/cuda:cuda_platform_id",
"//tensorflow/stream_executor/host:host_platform_id",
"//tensorflow/stream_executor/platform:dso_loader",
"//tensorflow/stream_executor/rocm:rocm_platform_id",
] + select({
"@local_config_cuda//cuda:darwin": ["IOKit"],
"//conditions:default": [],
@ -50,6 +52,7 @@ tf_cuda_library(
"//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor/cuda:all_runtime"],
"//tensorflow:using_cuda_clang_with_dynamic_build": [],
"//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
"//tensorflow:using_rocm_hipcc": ["//tensorflow/stream_executor/rocm:all_runtime"],
"//conditions:default": [],
}),
)
@ -67,6 +70,18 @@ cc_library(
}),
)
cc_library(
name = "stream_executor_rocm",
deps = [
":stream_executor_no_cuda",
":rocm",
] + if_static(
["//tensorflow/stream_executor/rocm:all_runtime"],
) + select({
"//conditions:default": [],
}),
)
cc_library(
name = "stream_executor_no_cuda",
deps = [
@ -79,6 +94,7 @@ cc_library(
"//tensorflow/stream_executor/host:host_platform",
"//tensorflow/stream_executor/host:host_platform_id",
"//tensorflow/stream_executor/platform:dso_loader",
"//tensorflow/stream_executor/rocm:rocm_platform_id",
],
)
@ -267,6 +283,17 @@ cc_library(
],
)
cc_library(
name = "rocm",
data = [],
linkopts = select({
"//conditions:default": [
"-Wl,-rpath,../local_config_rocm/rocm/rocm/lib",
],
}),
deps = [],
)
cc_library(
name = "sycl",
data = if_ccpp([


@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"


@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"


@ -654,3 +654,8 @@ alias(
name = "cuda_platform",
actual = "//tensorflow/stream_executor/cuda:all_runtime",
)
alias(
name = "rocm_platform",
actual = "//tensorflow/stream_executor/rocm:all_runtime",
)


@ -66,6 +66,7 @@ cc_library(
deps = if_cuda_is_configured([
"@com_google_absl//absl/container:inlined_vector",
"@com_google_absl//absl/strings",
"//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
@ -85,6 +86,7 @@ cc_library(
"@com_google_absl//absl/strings",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor:device_options",
"//tensorflow/stream_executor/gpu:gpu_driver_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
@ -97,18 +99,22 @@ cc_library(
name = "cuda_activation_header",
hdrs = ["cuda_activation.h"],
visibility = ["//visibility:public"],
deps = ["//tensorflow/stream_executor/platform"],
deps = [
"//tensorflow/stream_executor/gpu:gpu_activation_header",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "cuda_activation",
srcs = if_cuda_is_configured(["cuda_activation.cc"]),
srcs = [],
hdrs = if_cuda_is_configured(["cuda_activation.h"]),
deps = if_cuda_is_configured([
":cuda_driver",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor/gpu:gpu_activation",
"//tensorflow/stream_executor/platform",
]),
)
@ -120,6 +126,7 @@ cc_library(
deps = if_cuda_is_configured([
":cuda_kernel",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
@ -133,10 +140,10 @@ cc_library(
deps = if_cuda_is_configured([
":cuda_activation",
":cuda_gpu_executor",
":cuda_helpers",
":cuda_platform_id",
":cuda_stream",
":cuda_timer",
":cuda_helpers",
"@com_google_absl//absl/strings",
"//third_party/eigen3",
"@local_config_cuda//cuda:cuda_headers",
@ -147,6 +154,7 @@ cc_library(
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:scratch_allocator",
"//tensorflow/stream_executor:timer",
"//tensorflow/stream_executor/gpu:gpu_helpers_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
@ -162,14 +170,15 @@ cc_library(
deps = if_cuda_is_configured([
":cuda_activation_header",
":cuda_gpu_executor_header",
":cuda_helpers",
":cuda_platform_id",
":cuda_stream",
":cuda_helpers",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:fft",
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:scratch_allocator",
"//tensorflow/stream_executor/gpu:gpu_helpers_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
@ -223,13 +232,15 @@ cc_library(
deps = if_cuda_is_configured([
":cuda_activation",
":cuda_gpu_executor",
":cuda_helpers",
":cuda_platform_id",
":cuda_stream",
":cuda_helpers",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:rng",
"//tensorflow/stream_executor/gpu:gpu_helpers_header",
"//tensorflow/stream_executor/gpu:gpu_rng_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
@ -239,12 +250,14 @@ cc_library(
cc_library(
name = "cuda_kernel",
srcs = if_cuda_is_configured(["cuda_kernel.cc"]),
hdrs = if_cuda_is_configured(["cuda_kernel.h"]),
deps = if_cuda_is_configured([
":cuda_driver",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor/gpu:gpu_kernel_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
@ -254,6 +267,9 @@ cc_library(
cc_library(
name = "cuda_helpers",
textual_hdrs = if_cuda_is_configured(["cuda_helpers.h"]),
deps = if_cuda_is_configured([
"//tensorflow/stream_executor/gpu:gpu_helpers_header",
]),
)
cc_library(
@ -265,19 +281,22 @@ cc_library(
":cuda_gpu_executor_header",
":cuda_stream",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/gpu:gpu_event",
"//tensorflow/stream_executor/gpu:gpu_stream_header",
"//tensorflow/stream_executor/lib",
]),
)
cc_library(
name = "cuda_stream",
srcs = if_cuda_is_configured(["cuda_stream.cc"]),
srcs = [],
hdrs = if_cuda_is_configured(["cuda_stream.h"]),
deps = if_cuda_is_configured([
":cuda_driver",
":cuda_gpu_executor_header",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor:stream_header",
"//tensorflow/stream_executor/gpu:gpu_stream",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
@ -285,18 +304,18 @@ cc_library(
cc_library(
name = "cuda_timer",
srcs = if_cuda_is_configured(["cuda_timer.cc"]),
srcs = [],
hdrs = if_cuda_is_configured(["cuda_timer.h"]),
deps = if_cuda_is_configured([
":cuda_driver",
":cuda_gpu_executor_header",
":cuda_stream",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/gpu:gpu_timer",
"//tensorflow/stream_executor/lib",
]),
)
# It implements :cuda_gpu_executor_header
cc_library(
name = "cuda_gpu_executor",
srcs = if_cuda_is_configured(["cuda_gpu_executor.cc"]),
@ -316,6 +335,7 @@ cc_library(
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor:timer",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",


@ -17,13 +17,13 @@ limitations under the License.
// It reaches into the CUDA implementation to activate an underlying CUDA
// context.
//
// Having this file separate from cuda_gpu_executor.h means that dependent
// Having this file separate from cuda/cuda_gpu_executor.h means that dependent
// code does not also have to depend on cuda.h.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"
namespace stream_executor {
@ -31,29 +31,7 @@ class StreamExecutor;
namespace cuda {
class CUDAExecutor;
class ScopedActivateContext;
// Activates a CUDA context within an enclosing scope.
class ScopedActivateExecutorContext {
public:
// Form that takes a CUDA executor implementation.
explicit ScopedActivateExecutorContext(CUDAExecutor* cuda_exec);
// Form that takes a pImpl executor and extracts a CUDA implementation --
// fatal failure if it is not CUDA inside.
explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
~ScopedActivateExecutorContext();
private:
// The cuda.h-using datatype that we wrap.
ScopedActivateContext* driver_scoped_activate_context_;
SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
};
using ScopedActivateExecutorContext = gpu::ScopedActivateExecutorContext;
} // namespace cuda
} // namespace stream_executor

File diff suppressed because it is too large.


@ -33,26 +33,26 @@ namespace stream_executor {
class Stream;
namespace cuda {
namespace gpu {
// Opaque and unique identifier for the cuBLAS plugin.
extern const PluginId kCuBlasPlugin;
class CUDAExecutor;
class GpuExecutor;
// BLAS plugin for CUDA platform via cuBLAS library.
//
// This satisfies the platform-agnostic BlasSupport interface.
//
// Note that the cuBLAS handle that this encapsulates is implicitly tied to the
// context (and, as a result, the device) that the parent CUDAExecutor is tied
// context (and, as a result, the device) that the parent GpuExecutor is tied
// to. This simply happens as an artifact of creating the cuBLAS handle when a
// CUDA context is active.
//
// Thread-safe post-initialization.
class CUDABlas : public blas::BlasSupport {
public:
explicit CUDABlas(CUDAExecutor *parent);
explicit CUDABlas(GpuExecutor *parent);
// Allocates a cuBLAS handle.
bool Init();
@ -145,9 +145,9 @@ class CUDABlas : public blas::BlasSupport {
// mutex that guards the cuBLAS handle for this device.
mutex mu_;
// CUDAExecutor which instantiated this CUDABlas.
// GpuExecutor which instantiated this CUDABlas.
// Immutable post-initialization.
CUDAExecutor *parent_;
GpuExecutor *parent_;
// cuBLAS library handle on the device.
cublasHandle_t blas_ GUARDED_BY(mu_);
@ -155,7 +155,7 @@ class CUDABlas : public blas::BlasSupport {
SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas);
};
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_


@ -52,13 +52,6 @@ limitations under the License.
namespace stream_executor {
namespace cuda {
#ifdef __APPLE__
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
#elif !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#endif
string DriverVersionToString(DriverVersion version) {
return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
}
@ -112,6 +105,18 @@ port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
return result;
}
} // namespace cuda
} // namespace stream_executor
namespace stream_executor {
namespace gpu {
#ifdef __APPLE__
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
#elif !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#endif
// -- class Diagnostician
string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
@ -190,11 +195,11 @@ void Diagnostician::LogDiagnosticInformation() {
}
port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
LOG(INFO) << "libcuda reported version is: "
<< DriverVersionStatusToString(dso_version);
<< cuda::DriverVersionStatusToString(dso_version);
port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
LOG(INFO) << "kernel reported version is: "
<< DriverVersionStatusToString(kernel_version);
<< cuda::DriverVersionStatusToString(kernel_version);
#endif
// OS X kernel driver does not report version accurately
@ -232,7 +237,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
}
const size_t length = suffix_pos - start;
const string version = path.substr(start, length);
result = StringToDriverVersion(version);
result = cuda::StringToDriverVersion(version);
}
#else
#if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
@ -260,7 +265,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
// TODO(b/22689637): Eliminate the explicit namespace if possible.
auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
*result = StringToDriverVersion(stripped_dso_version);
*result = cuda::StringToDriverVersion(stripped_dso_version);
return 1;
}
return 0;
@ -292,7 +297,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
// TODO(b/22689637): Eliminate the explicit namespace if possible.
auto stripped_kernel_version =
port::StripSuffixString(kernel_version, ".ld64");
return StringToDriverVersion(stripped_kernel_version);
return cuda::StringToDriverVersion(stripped_kernel_version);
}
void Diagnostician::WarnOnDsoKernelMismatch(
@ -301,12 +306,12 @@ void Diagnostician::WarnOnDsoKernelMismatch(
if (kernel_version.ok() && dso_version.ok() &&
dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
LOG(INFO) << "kernel version seems to match DSO: "
<< DriverVersionToString(kernel_version.ValueOrDie());
<< cuda::DriverVersionToString(kernel_version.ValueOrDie());
} else {
LOG(ERROR) << "kernel version "
<< DriverVersionStatusToString(kernel_version)
<< cuda::DriverVersionStatusToString(kernel_version)
<< " does not match DSO version "
<< DriverVersionStatusToString(dso_version)
<< cuda::DriverVersionStatusToString(dso_version)
<< " -- cannot find working devices in this configuration";
}
}
@ -336,9 +341,9 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
// see
// https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
if (version == NULL) {
return StringToDriverVersion("");
return cuda::StringToDriverVersion("");
}
return StringToDriverVersion(version);
return cuda::StringToDriverVersion(version);
}
CFRelease(kext_infos);
auto status = port::Status(
@ -387,6 +392,5 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
#endif
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor


@ -16,17 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
#include "tensorflow/stream_executor/platform/port.h"
#include <tuple>
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
namespace stream_executor {
namespace cuda {
// e.g. DriverVersion{346, 3, 4}
using DriverVersion = std::tuple<int, int, int>;
using DriverVersion = gpu::DriverVersion;
// Converts a parsed driver version to string form.
string DriverVersionToString(DriverVersion version);
@ -35,61 +31,9 @@ string DriverVersionToString(DriverVersion version);
string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
port::StatusOr<DriverVersion> StringToDriverVersion(const string &value);
port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
class Diagnostician {
public:
// Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
// not initializing).
//
// Note: if we're running on a machine that has no GPUs, we don't want to
// produce very much log spew beyond saying, "looks like there's no CUDA
// kernel
// module running".
//
// Note: we use non-Google-File:: API here because we may be called before
// InitGoogle has completed.
static void LogDiagnosticInformation();
// Given the driver version file contents, finds the kernel module version and
// returns it as a string.
//
// This is solely used for more informative log messages when the user is
// running on a machine that happens to have a libcuda/kernel driver mismatch.
static port::StatusOr<DriverVersion> FindKernelModuleVersion(
const string &driver_version_file_contents);
// Extracts the kernel driver version from the current host.
static port::StatusOr<DriverVersion> FindKernelDriverVersion();
// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
// driver-interfacing DSO version number. Returns it as a string.
static port::StatusOr<DriverVersion> FindDsoVersion();
// Logs information about the kernel driver version and userspace driver
// library version.
static void LogDriverVersionInformation();
private:
// Given the DSO version number and the driver version file contents, extracts
// the driver version and compares, warning the user in the case of
// incompatibility.
//
// This is solely used for more informative log messages when the user is
// running on a machine that happens to have a libcuda/kernel driver mismatch.
static void WarnOnDsoKernelMismatch(
port::StatusOr<DriverVersion> dso_version,
port::StatusOr<DriverVersion> kernel_version);
// Logs information about the dev nodes present on this machine: their
// existence, permissions, accessibility from this uid/gid.
static void LogDevNodeDiagnosticInformation();
static string GetDevNodePath(int dev_node_ordinal);
SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
};
using Diagnostician = gpu::Diagnostician;
} // namespace cuda
} // namespace stream_executor


@ -58,7 +58,7 @@ limitations under the License.
#pragma clang diagnostic warning "-Wmismatched-tags"
namespace stream_executor {
namespace cuda {
namespace gpu {
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);
@ -137,7 +137,7 @@ class CudnnHandle {
public:
// Takes ownership of the executor context and the lock to access cuDNN
// using handle.
CudnnHandle(cuda::ScopedActivateExecutorContext context, mutex_lock lock,
CudnnHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
cudnnHandle_t handle)
: context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
@ -146,7 +146,7 @@ class CudnnHandle {
cudnnHandle_t handle() const { return handle_; }
private:
cuda::ScopedActivateExecutorContext context_;
gpu::ScopedActivateExecutorContext context_;
mutex_lock lock_;
cudnnHandle_t handle_; // Not owned.
};
@ -334,10 +334,10 @@ class CudnnAccess {
// The legacy default stream synchronizes with all other streams and it is
// therefore a bad idea (performance wise) to call any cuDNN APIs that
// enqueue work in the stream.
CudnnHandle GetHandle(CUDAExecutor* executor, Stream* stream) {
CudnnHandle GetHandle(GpuExecutor* executor, Stream* stream) {
mutex_lock lock(mutex_);
cuda::ScopedActivateExecutorContext context(executor);
CUstream cu_stream = stream ? AsCUDAStreamValue(stream) : cudaStreamLegacy;
gpu::ScopedActivateExecutorContext context(executor);
CUstream cu_stream = stream ? AsGpuStreamValue(stream) : cudaStreamLegacy;
auto status = cudnnSetStream(handle_, cu_stream);
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Failed to set cuDNN stream.";
return CudnnHandle(std::move(context), std::move(lock), handle_);
@ -448,7 +448,7 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
} // namespace
CudnnSupport::CudnnSupport(CUDAExecutor* parent) : parent_(parent) {}
CudnnSupport::CudnnSupport(GpuExecutor* parent) : parent_(parent) {}
port::Status CudnnSupport::Init() {
ScopedActivateExecutorContext context(parent_);
@ -481,14 +481,14 @@ port::Status CudnnSupport::Init() {
CHECK_EQ(cudnn_handle, nullptr);
LOG(ERROR) << "Could not create cudnn handle: " << ToString(status);
if (status == CUDNN_STATUS_NOT_INITIALIZED) {
auto result = cuda::Diagnostician::FindKernelDriverVersion();
auto result = gpu::Diagnostician::FindKernelDriverVersion();
if (!result.ok()) {
LOG(ERROR) << "Error retrieving driver version: "
<< DriverVersionStatusToString(result);
<< cuda::DriverVersionStatusToString(result);
} else {
const auto& version = result.ValueOrDie();
LOG(ERROR) << "Possibly insufficient driver version: "
<< DriverVersionToString(version);
<< cuda::DriverVersionToString(version);
}
}
@ -1151,7 +1151,7 @@ class CudnnRnnParamsDescriptor {
} // namespace
class CudnnRnnDescriptor : public dnn::RnnDescriptor {
CudnnRnnDescriptor(const CudnnHandle& cudnn, cuda::RnnDescriptor rnn_desc,
CudnnRnnDescriptor(const CudnnHandle& cudnn, gpu::RnnDescriptor rnn_desc,
PersistentRnnPlan rnn_plan, int num_layers,
int hidden_size, int input_size, int batch_size,
cudnnRNNInputMode_t input_mode,
@ -1191,7 +1191,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
CudnnDropoutDescriptor dropout_desc,
CudnnDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
cuda::RnnDescriptor rnn_desc = CreateRnnDescriptor();
gpu::RnnDescriptor rnn_desc = CreateRnnDescriptor();
cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm());
// TODO: allow the user to choose an algorithm.
@ -1282,7 +1282,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
}
private:
cuda::RnnDescriptor rnn_desc_;
gpu::RnnDescriptor rnn_desc_;
PersistentRnnPlan rnn_plan_;
int num_layers_;
int hidden_size_;
@ -1401,15 +1401,14 @@ port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
class CudnnRnnSequenceTensorDescriptor
: public dnn::RnnSequenceTensorDescriptor {
CudnnRnnSequenceTensorDescriptor(CUDAExecutor* parent, int max_seq_length,
CudnnRnnSequenceTensorDescriptor(GpuExecutor* parent, int max_seq_length,
int batch_size, int data_size,
cudnnDataType_t data_type,
#if CUDNN_VERSION >= 7201
RNNDataDescriptor data_handle,
#endif
TensorDescriptor handle)
: parent_(parent),
max_seq_length_(max_seq_length),
: max_seq_length_(max_seq_length),
batch_size_(batch_size),
data_size_(data_size),
data_type_(data_type),
@ -1425,7 +1424,7 @@ class CudnnRnnSequenceTensorDescriptor
default;
static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
cudnnDataType_t data_type) {
CHECK_GT(max_seq_length, 0);
int dims[] = {batch_size, data_size, 1};
@ -1444,7 +1443,7 @@ class CudnnRnnSequenceTensorDescriptor
}
static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
const absl::Span<const int>& seq_lengths, cudnnDataType_t data_type) {
#if CUDNN_VERSION >= 7201
CHECK_GT(max_seq_length, 0);
@ -1496,7 +1495,6 @@ class CudnnRnnSequenceTensorDescriptor
}
private:
CUDAExecutor* parent_;
int max_seq_length_;
int batch_size_;
int data_size_;
@ -1511,11 +1509,10 @@ class CudnnRnnSequenceTensorDescriptor
class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
public:
CudnnRnnStateTensorDescriptor(CUDAExecutor* parent, int num_layers,
CudnnRnnStateTensorDescriptor(GpuExecutor* parent, int num_layers,
int batch_size, int data_size,
cudnnDataType_t data_type)
: parent_(parent),
handle_(CreateTensorDescriptor()),
: handle_(CreateTensorDescriptor()),
num_layers_(num_layers),
batch_size_(batch_size),
data_size_(data_size),
@ -1535,7 +1532,6 @@ class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
int data_size() const { return data_size_; }
private:
CUDAExecutor* parent_;
TensorDescriptor handle_;
int num_layers_;
int batch_size_;
@ -1699,14 +1695,14 @@ port::Status CudnnSupport::DoRnnForwardImpl(
}
}
std::unique_ptr<CUDATimer, TimerDeleter> timer;
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
const bool is_profiling = output_profile_result != nullptr;
if (is_profiling) {
timer.reset(new CUDATimer(parent_));
timer.reset(new GpuTimer(parent_));
// The start and stop of the timer should be as close to the Cudnn call as
// possible. It is still possible for other threads to issue workload on
// to this stream. So it could take multiple profiling measurements.
if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to start timer");
}
}
@ -1791,7 +1787,7 @@ port::Status CudnnSupport::DoRnnForwardImpl(
}
if (is_profiling) {
if (!timer->Stop(AsCUDAStream(stream))) {
if (!timer->Stop(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to stop timer");
}
auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@ -1842,14 +1838,14 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
workspace_allocator));
std::unique_ptr<CUDATimer, TimerDeleter> timer;
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
const bool is_profiling = output_profile_result != nullptr;
if (is_profiling) {
timer.reset(new CUDATimer(parent_));
timer.reset(new GpuTimer(parent_));
// The start and stop of the timer should be as close to the Cudnn call as
// possible. It is still possible for other threads to issue workload on
// to this stream. So it could take multiple profiling measurements.
if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to start timer");
}
}
@ -1948,7 +1944,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
}
if (is_profiling) {
if (!timer->Stop(AsCUDAStream(stream))) {
if (!timer->Stop(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to stop timer");
}
auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@ -2915,13 +2911,13 @@ port::Status CudnnSupport::DoConvolve(
const bool is_profiling = output_profile_result != nullptr;
std::unique_ptr<CUDATimer, TimerDeleter> timer;
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
if (is_profiling) {
timer.reset(new CUDATimer(parent_)); // NOLINT
timer.reset(new GpuTimer(parent_)); // NOLINT
// The start and stop of the timer should be as close to the Cudnn call as
// possible. It is still possible for other threads to issue workload on
// to this stream. So it could take multiple profiling measurements.
if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to start timer");
}
}
@ -3110,7 +3106,7 @@ port::Status CudnnSupport::DoConvolve(
}
if (is_profiling) {
if (!timer->Stop(AsCUDAStream(stream))) {
if (!timer->Stop(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to stop timer");
}
output_profile_result->set_algorithm(algorithm_desc);
@ -3175,13 +3171,13 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
output_nd, scratch_allocator, &scratch));
std::unique_ptr<CUDATimer, TimerDeleter> timer;
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
if (is_profiling) {
timer.reset(new CUDATimer(parent_)); // NOLINT
timer.reset(new GpuTimer(parent_)); // NOLINT
// The start and stop of the timer should be as close to the Cudnn call as
// possible. It is still possible for other threads to issue workload on
// to this stream. So it could take multiple profiling measurements.
if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to start timer");
}
}
@ -3234,7 +3230,7 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
/*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
if (is_profiling) {
if (!timer->Stop(AsCUDAStream(stream))) {
if (!timer->Stop(AsGpuStream(stream))) {
return port::Status(port::error::INTERNAL, "Failed to stop timer");
}
output_profile_result->set_algorithm(algo_desc);
@ -4339,22 +4335,22 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
return IsStatusOk(status, /*report_error=*/true);
}
} // namespace cuda
} // namespace gpu
void initialize_cudnn() {
port::Status status =
PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
cuda::kCudaPlatformId, cuda::kCuDnnPlugin, "cuDNN",
cuda::kCudaPlatformId, gpu::kCuDnnPlugin, "cuDNN",
[](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
cuda::CUDAExecutor* cuda_executor =
dynamic_cast<cuda::CUDAExecutor*>(parent);
gpu::GpuExecutor* cuda_executor =
dynamic_cast<gpu::GpuExecutor*>(parent);
if (cuda_executor == nullptr) {
LOG(ERROR) << "Attempting to initialize an instance of the cuDNN "
<< "support library with a non-CUDA StreamExecutor";
return nullptr;
}
cuda::CudnnSupport* dnn = new cuda::CudnnSupport(cuda_executor);
gpu::CudnnSupport* dnn = new gpu::CudnnSupport(cuda_executor);
if (!dnn->Init().ok()) {
// Note: Init() will log a more specific error.
delete dnn;
@ -4369,7 +4365,7 @@ void initialize_cudnn() {
}
PluginRegistry::Instance()->SetDefaultFactory(
cuda::kCudaPlatformId, PluginKind::kDnn, cuda::kCuDnnPlugin);
cuda::kCudaPlatformId, PluginKind::kDnn, gpu::kCuDnnPlugin);
}
} // namespace stream_executor


@ -28,9 +28,9 @@ limitations under the License.
#include "tensorflow/stream_executor/temporary_device_memory.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
class CUDAExecutor;
class GpuExecutor;
class CudnnRnnDescriptor;
class CudnnRnnSequenceTensorDescriptor;
class CudnnRnnStateTensorDescriptor;
@ -42,7 +42,7 @@ extern const PluginId kCuDnnPlugin;
// functions, see dnn.h.
class CudnnSupport : public dnn::DnnSupport {
public:
explicit CudnnSupport(CUDAExecutor* parent);
explicit CudnnSupport(GpuExecutor* parent);
port::Status Init() override;
port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
@ -552,7 +552,7 @@ class CudnnSupport : public dnn::DnnSupport {
DeviceMemoryBase* output_data) override;
private:
CUDAExecutor* parent_; // Parent executor object. Not owned.
GpuExecutor* parent_; // Parent executor object. Not owned.
// Provides access to the cuDNN handle.
std::unique_ptr<class CudnnAccess> cudnn_;
@ -667,7 +667,7 @@ class CudnnSupport : public dnn::DnnSupport {
SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
};
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_


@ -45,21 +45,20 @@ bool FLAGS_gpuexec_cuda_device_0_only = false;
// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one.
constexpr bool kVerifyCudaContext = false;
constexpr bool kVerifyGpuContext = false;
namespace stream_executor {
namespace cuda {
namespace gpu {
namespace {
// Manages the singleton map of contexts that we've created, mapping
// from the CUcontext to the CudaContext* that we pass around internally.
// This also manages assignment of unique ids to CudaContexts, to allow
// from the CUcontext to the GpuContext* that we pass around internally.
// This also manages assignment of unique ids to GpuContexts, to allow
// for fast comparison of a context against the current context.
//
// CUDA-runtime-created contexts are avoided, if triple angle
// brace launches are required, by using the scoped activations in
// cuda_activation.h.
// gpu/gpu_activation.h.
class CreatedContexts {
public:
// Returns whether context is a member of the live set.
@ -69,14 +68,14 @@ class CreatedContexts {
}
// Adds context to the live set, or returns it if it's already present.
static CudaContext* Add(CUcontext context) {
static GpuContext* Add(CUcontext context) {
CHECK(context != nullptr);
mutex_lock lock(mu_);
auto insert_result = Live()->insert(std::make_pair(context, nullptr));
auto it = insert_result.first;
if (insert_result.second) {
// context was not present in the map. Add it.
it->second = MakeUnique<CudaContext>(context, next_id_++);
it->second = MakeUnique<GpuContext>(context, next_id_++);
}
return it->second.get();
}
@ -92,9 +91,9 @@ class CreatedContexts {
private:
// Returns the live map singleton.
static std::map<CUcontext, std::unique_ptr<CudaContext>> *Live() {
static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
static auto singleton =
new std::map<CUcontext, std::unique_ptr<CudaContext>>;
new std::map<CUcontext, std::unique_ptr<GpuContext>>;
return singleton;
}
@ -123,7 +122,7 @@ string ToString(CUresult result) {
// created by StreamExecutor (to ensure that the CUDA runtime didn't create a
// context behind our backs).
CUcontext CurrentContext() {
CUcontext current = CUDADriver::CurrentContextOrDie();
CUcontext current = cuda::CurrentContextOrDie();
if (current != nullptr && !CreatedContexts::Has(current)) {
LOG(FATAL) << "current context was not created by the StreamExecutor "
"cuda_driver API: "
@ -177,7 +176,7 @@ void SynchronizeOrDie() {
struct ThreadLocalData {
int64 id;
CudaContext* context; // Only valid if id == a known good context.
GpuContext* context; // Only valid if id == a known good context.
int depth;
};
@ -185,13 +184,13 @@ SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
} // namespace
ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();
auto* tls = &tls_data.get();
tls->depth++;
if (tls->id == cuda_context->id()) {
if (kVerifyCudaContext) {
if (kVerifyGpuContext) {
CHECK_EQ(CurrentContext(), cuda_context->context());
}
DCHECK_EQ(CurrentContext(), cuda_context->context());
@ -215,8 +214,8 @@ ScopedActivateContext::~ScopedActivateContext() {
auto* tls = &tls_data.get();
if (kVerifyCudaContext) {
// Note that if kVerifyCudaContext is used, and contexts are deleted, it's
if (kVerifyGpuContext) {
// Note that if kVerifyGpuContext is used, and contexts are deleted, it's
// possible this could fail in the CurrentContext() call.
CHECK_EQ(CurrentContext(),
tls->context == nullptr ? nullptr : tls->context->context());
@ -242,7 +241,7 @@ namespace {
// logging purposes. Returns "?" if the device could not be successfully
// queried.
string CUDAPointerToDeviceString(CUdeviceptr pointer) {
auto value = CUDADriver::GetPointerDevice(pointer);
auto value = GpuDriver::GetPointerDevice(pointer);
if (value.ok()) {
return absl::StrCat(value.ValueOrDie());
}
@ -254,7 +253,7 @@ string CUDAPointerToDeviceString(CUdeviceptr pointer) {
// logging purposes. Returns "?" if the memory space could not be successfully
// queried.
string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
auto value = CUDADriver::GetPointerMemorySpace(pointer);
auto value = GpuDriver::GetPointerMemorySpace(pointer);
if (value.ok()) {
return MemorySpaceString(value.ValueOrDie());
}
@ -267,20 +266,20 @@ string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
// primarily for logging purposes. Returns "error" if an error is encountered
// in the process of querying.
string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
auto from_context = CUDADriver::GetPointerContext(from);
auto from_context = GpuDriver::GetPointerContext(from);
if (!from_context.ok()) {
LOG(ERROR) << "could not retrieve source pointer's context: "
<< from_context.status();
return "error";
}
auto to_context = CUDADriver::GetPointerContext(to);
auto to_context = GpuDriver::GetPointerContext(to);
if (!to_context.ok()) {
LOG(ERROR) << "could not retrieve destination pointer's context: "
<< to_context.status();
return "error";
}
return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(),
to_context.ValueOrDie())
return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
to_context.ValueOrDie())
? "true"
: "false";
}
@ -308,9 +307,9 @@ static port::Status InternalInit() {
} // namespace
/* static */ port::Status CUDADriver::Init() {
/* static */ port::Status GpuDriver::Init() {
// Cached return value from calling InternalInit(), as cuInit need only be
// called once, but CUDADriver::Init may be called many times.
// called once, but GpuDriver::Init may be called many times.
static port::Status init_retval;
static bool set = false;
static mutex *init_mu = new mutex;
@ -324,8 +323,8 @@ static port::Status InternalInit() {
return init_retval;
}
/* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
CUdevice *device) {
/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
CUdevice* device) {
CUresult res = tensorflow::wrap::cuDeviceGet(device, device_ordinal);
if (res == CUDA_SUCCESS) {
return port::Status::OK();
@ -336,8 +335,8 @@ static port::Status InternalInit() {
absl::StrCat("failed call to cuDeviceGet: ", ToString(res)));
}
/* static */ bool CUDADriver::GetDeviceName(CUdevice device,
string *device_name) {
/* static */ bool GpuDriver::GetDeviceName(CUdevice device,
string* device_name) {
static const size_t kCharLimit = 64;
absl::InlinedVector<char, 4> chars(kCharLimit);
CUresult res =
@ -376,9 +375,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
return true;
}
/* static */ port::Status CUDADriver::CreateContext(
CUdevice device, const DeviceOptions &device_options,
CudaContext **context) {
/* static */ port::Status GpuDriver::CreateContext(
int device_ordinal, CUdevice device, const DeviceOptions& device_options,
GpuContext** context) {
*context = nullptr;
int flags = 0;
@ -407,7 +406,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
}
}
former_context = CUDADriver::CurrentContextOrDie();
former_context = cuda::CurrentContextOrDie();
res = tensorflow::wrap::cuDevicePrimaryCtxRetain(&new_context, device);
if (former_context != nullptr) {
CUdevice former_device;
@ -454,7 +453,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
return port::Status(port::error::INTERNAL, message);
}
/* static */ void CUDADriver::DestroyContext(CudaContext* context) {
/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
if (context == nullptr) {
return;
}
@ -473,9 +472,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
CreatedContexts::Remove(context->context());
}
/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
CUfunction func,
int *attribute_value) {
/* static */ bool GpuDriver::FuncGetAttribute(CUfunction_attribute attribute,
CUfunction func,
int* attribute_value) {
CUresult res =
tensorflow::wrap::cuFuncGetAttribute(attribute_value, attribute, func);
if (res != CUDA_SUCCESS) {
@ -486,8 +485,8 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
return true;
}
/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
CUfunc_cache cache_config) {
/* static */ bool GpuDriver::FuncSetCacheConfig(CUfunction function,
CUfunc_cache cache_config) {
CUresult res = tensorflow::wrap::cuFuncSetCacheConfig(function, cache_config);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
@ -499,7 +498,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
}
/* static */ port::StatusOr<CUsharedconfig>
CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
CUsharedconfig shared_mem_config;
ScopedActivateContext activation(context);
CUresult result =
@ -517,8 +516,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return shared_mem_config;
}
/* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
CudaContext* context, CUsharedconfig shared_mem_config) {
/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
GpuContext* context, CUsharedconfig shared_mem_config) {
ScopedActivateContext activation(context);
CUresult result =
tensorflow::wrap::cuCtxSetSharedMemConfig(shared_mem_config);
@ -536,12 +535,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ bool CUDADriver::LaunchKernel(
CudaContext* context, CUfunction function, unsigned int grid_dim_x,
/* static */ bool GpuDriver::LaunchKernel(
GpuContext* context, CUfunction function, unsigned int grid_dim_x,
unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
unsigned int block_dim_y, unsigned int block_dim_z,
unsigned int shared_mem_bytes, CUstream stream, void **kernel_params,
void **extra) {
unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
void** extra) {
ScopedActivateContext activation(context);
VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
<< " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
@ -559,9 +558,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ port::Status CUDADriver::LoadCubin(CudaContext* context,
const char *cubin_bytes,
CUmodule *module) {
/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
const char* cubin_bytes,
CUmodule* module) {
ScopedActivateContext activation(context);
CUresult result =
tensorflow::wrap::cuModuleLoadFatBinary(module, cubin_bytes);
@ -573,9 +572,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ bool CUDADriver::LoadPtx(CudaContext* context,
const char *ptx_contents,
CUmodule *module) {
/* static */ bool GpuDriver::LoadPtx(GpuContext* context,
const char* ptx_contents,
CUmodule* module) {
port::Notification notification;
bool ret = true;
GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
@ -643,9 +642,16 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return ret;
}
/* static */ bool CUDADriver::SynchronousMemsetUint8(CudaContext* context,
CUdeviceptr location,
uint8 value, size_t size) {
/* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
const char* hsaco_contents,
CUmodule* module) {
LOG(ERROR) << "Feature not supported on CUDA platform (LoadHsaco)";
return false;
}
/* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
CUdeviceptr location,
uint8 value, size_t size) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemsetD8(location, value, size);
if (res != CUDA_SUCCESS) {
@ -655,10 +661,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::SynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location,
uint32 value,
size_t uint32_count) {
/* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
CUdeviceptr location,
uint32 value,
size_t uint32_count) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemsetD32(location, value, uint32_count);
if (res != CUDA_SUCCESS) {
@ -668,11 +674,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AsynchronousMemsetUint8(CudaContext* context,
CUdeviceptr location,
uint8 value,
size_t uint32_count,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
CUdeviceptr location,
uint8 value,
size_t uint32_count,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuMemsetD8Async(location, value, uint32_count, stream);
@ -684,11 +690,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AsynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location,
uint32 value,
size_t uint32_count,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
CUdeviceptr location,
uint32 value,
size_t uint32_count,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuMemsetD32Async(location, value, uint32_count, stream);
@ -700,10 +706,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AddStreamCallback(CudaContext* context,
CUstream stream,
StreamCallback callback,
void *data) {
/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
CUstream stream,
StreamCallback callback,
void* data) {
// Note: flags param is required to be zero according to CUDA 6.0.
CUresult res = tensorflow::wrap::cuStreamAddCallback(stream, callback, data,
0 /* = flags */);
@ -714,10 +720,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::GetModuleFunction(CudaContext *context,
CUmodule module,
const char *kernel_name,
CUfunction *function) {
/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
CUmodule module,
const char* kernel_name,
CUfunction* function) {
ScopedActivateContext activated{context};
CHECK(module != nullptr && kernel_name != nullptr);
CUresult res =
@ -731,11 +737,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::GetModuleSymbol(CudaContext* context,
CUmodule module,
const char *symbol_name,
CUdeviceptr *dptr,
size_t *bytes) {
/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
CUmodule module,
const char* symbol_name,
CUdeviceptr* dptr, size_t* bytes) {
ScopedActivateContext activated{context};
CHECK(module != nullptr && symbol_name != nullptr &&
(dptr != nullptr || bytes != nullptr));
@ -752,8 +757,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ void CUDADriver::UnloadModule(CudaContext *context,
CUmodule module) {
/* static */ void GpuDriver::UnloadModule(GpuContext* context,
CUmodule module) {
ScopedActivateContext activated{context};
CUresult res = tensorflow::wrap::cuModuleUnload(module);
if (res != CUDA_SUCCESS) {
@ -762,8 +767,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext(
CudaContext* context) {
/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
GpuContext* context) {
ScopedActivateContext activated{context};
CUdevice device = -1;
CUresult result = tensorflow::wrap::cuCtxGetDevice(&device);
@ -776,26 +781,26 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
absl::StrCat("failed to get device for context: ", ToString(result)));
}
/* static */ bool CUDADriver::CreateStream(CudaContext *context,
CUstream *out) {
/* static */ bool GpuDriver::CreateStream(GpuContext* context,
CUstream* stream) {
// TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
// up synchronization with respect to memsets and any other things that have
// to occur on the default stream?
ScopedActivateContext activated{context};
CUresult res = tensorflow::wrap::cuStreamCreate(out, 0);
CUresult res = tensorflow::wrap::cuStreamCreate(stream, 0);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "could not allocate CUDA stream for context "
<< context->context() << ": " << ToString(res);
return false;
}
VLOG(2) << "successfully created stream " << *out << " for context "
VLOG(2) << "successfully created stream " << *stream << " for context "
<< context->context() << " on thread";
return true;
}
/* static */ void CUDADriver::DestroyStream(CudaContext* context,
CUstream *stream) {
/* static */ void GpuDriver::DestroyStream(GpuContext* context,
CUstream* stream) {
if (*stream == nullptr) {
return;
}
@ -812,8 +817,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ void *CUDADriver::DeviceAllocate(CudaContext *context,
uint64 bytes) {
/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
uint64 bytes) {
ScopedActivateContext activated{context};
CUdeviceptr result = 0;
CUresult res = tensorflow::wrap::cuMemAlloc(&result, bytes);
@ -829,8 +834,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return ptr;
}
/* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
void *location) {
/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
void* location) {
ScopedActivateContext activation(context);
CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
CUresult res = tensorflow::wrap::cuMemFree(pointer);
@ -843,8 +848,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
uint64 bytes) {
/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
uint64 bytes) {
ScopedActivateContext activation(context);
CUdeviceptr result = 0;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
@ -861,8 +866,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return ptr;
}
/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
void *location) {
/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
void* location) {
ScopedActivateContext activation(context);
CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
CUresult res = tensorflow::wrap::cuMemFree(pointer);
@ -875,8 +880,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
uint64 bytes) {
/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
ScopedActivateContext activation(context);
void *host_mem = nullptr;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
@ -889,8 +893,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return host_mem;
}
/* static */ void CUDADriver::HostDeallocate(CudaContext* context,
void *location) {
/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
void* location) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemFreeHost(location);
if (res != CUDA_SUCCESS) {
@ -899,8 +903,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ bool CUDADriver::HostRegister(CudaContext* context, void *location,
uint64 bytes) {
/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
uint64 bytes) {
ScopedActivateContext activation(context);
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
CUresult res = tensorflow::wrap::cuMemHostRegister(
@ -913,8 +917,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::HostUnregister(CudaContext* context,
void *location) {
/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
void* location) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemHostUnregister(location);
if (res != CUDA_SUCCESS) {
@ -925,8 +929,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ port::Status CUDADriver::DestroyEvent(CudaContext* context,
CUevent *event) {
/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
CUevent* event) {
if (*event == nullptr) {
return port::Status(port::error::INVALID_ARGUMENT,
"input event cannot be null");
@ -953,9 +957,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ port::Status CUDADriver::RecordEvent(CudaContext* context,
CUevent event,
CUstream stream) {
/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
CUevent event,
CUstream stream) {
ScopedActivateContext activated{context};
CUresult res = tensorflow::wrap::cuEventRecord(event, stream);
switch (res) {
@ -975,8 +979,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
CudaContext *context, CUevent event) {
/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
CUevent event) {
ScopedActivateContext activated{context};
CUresult res = tensorflow::wrap::cuEventQuery(event);
if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
@ -988,9 +992,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return res;
}
/* static */ bool CUDADriver::GetEventElapsedTime(CudaContext* context,
float *elapsed_milliseconds,
CUevent start, CUevent stop) {
/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
float* elapsed_milliseconds,
CUevent start, CUevent stop) {
ScopedActivateContext activated{context};
// The stop event must have completed in order for cuEventElapsedTime to
// work.
@ -1009,9 +1013,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::WaitStreamOnEvent(CudaContext* context,
CUstream stream,
CUevent event) {
/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
CUstream stream, CUevent event) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuStreamWaitEvent(stream, event, 0 /* = flags */);
@ -1023,7 +1026,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuCtxSynchronize();
if (res != CUDA_SUCCESS) {
@ -1035,8 +1038,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
CUstream stream) {
/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
CUstream stream) {
ScopedActivateContext activated{context};
CHECK(stream != nullptr);
CUresult res = tensorflow::wrap::cuStreamSynchronize(stream);
@ -1051,8 +1054,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
CUstream stream) {
/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
CUstream stream) {
ScopedActivateContext activated{context};
CHECK(stream != nullptr);
CUresult res = tensorflow::wrap::cuStreamQuery(stream);
@ -1066,10 +1069,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return false;
}
/* static */ port::Status CUDADriver::SynchronousMemcpyD2H(CudaContext *context,
void *host_dst,
CUdeviceptr gpu_src,
uint64 size) {
/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
void* host_dst,
CUdeviceptr gpu_src,
uint64 size) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemcpyDtoH(host_dst, gpu_src, size);
if (res != CUDA_SUCCESS) {
@ -1084,10 +1087,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ port::Status CUDADriver::SynchronousMemcpyH2D(CudaContext *context,
CUdeviceptr gpu_dst,
const void *host_src,
uint64 size) {
/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
CUdeviceptr gpu_dst,
const void* host_src,
uint64 size) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemcpyHtoD(gpu_dst, host_src, size);
if (res != CUDA_SUCCESS) {
@ -1101,10 +1104,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ port::Status CUDADriver::SynchronousMemcpyD2D(CudaContext *context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src,
uint64 size) {
/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src,
uint64 size) {
ScopedActivateContext activation(context);
CUresult res = tensorflow::wrap::cuMemcpyDtoD(gpu_dst, gpu_src, size);
if (res != CUDA_SUCCESS) {
@ -1118,11 +1121,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CudaContext* context,
void *host_dst,
CUdeviceptr gpu_src,
uint64 size,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
void* host_dst,
CUdeviceptr gpu_src,
uint64 size,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
@ -1140,11 +1143,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AsynchronousMemcpyH2D(CudaContext* context,
CUdeviceptr gpu_dst,
const void *host_src,
uint64 size,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
CUdeviceptr gpu_dst,
const void* host_src,
uint64 size,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult res =
tensorflow::wrap::cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
@ -1161,11 +1164,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CudaContext* context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src,
uint64 size,
CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src,
uint64 size,
CUstream stream) {
ScopedActivateContext activation(context);
CUresult result =
tensorflow::wrap::cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
@ -1189,9 +1192,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return true;
}
/* static */ port::Status CUDADriver::CreateEvent(CudaContext* context,
CUevent *result,
EventFlags flags) {
/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
CUevent* result,
EventFlags flags) {
int cuflags;
switch (flags) {
case EventFlags::kDefault:
@ -1219,7 +1222,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
/* static */ int CUDADriver::GetDeviceCount() {
/* static */ int GpuDriver::GetDeviceCount() {
int device_count = 0;
CUresult res = tensorflow::wrap::cuDeviceGetCount(&device_count);
if (res != CUDA_SUCCESS) {
@ -1233,9 +1236,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return device_count;
}
/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext(
/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
CUdeviceptr pointer) {
CudaContext* context = nullptr;
GpuContext* context = nullptr;
CUresult result = tensorflow::wrap::cuPointerGetAttribute(
&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
if (result == CUDA_SUCCESS) {
@ -1249,7 +1252,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ToString(result)));
}
/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
CUdeviceptr pointer) {
unsigned int value;
CUresult result = tensorflow::wrap::cuPointerGetAttribute(
@ -1273,9 +1276,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
ToString(result)));
}
/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
CUdeviceptr *base,
size_t *size) {
/* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
CUdeviceptr* base,
size_t* size) {
CUresult result = tensorflow::wrap::cuMemGetAddressRange(base, size, dptr);
if (result == CUDA_SUCCESS) {
return port::Status::OK();
@ -1295,7 +1298,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
reinterpret_cast<void *>(dptr), ToString(result).c_str()));
}
/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
/* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
CUdeviceptr pointer) {
auto result = GetPointerContext(pointer);
if (!result.ok()) {
@ -1305,9 +1308,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return DeviceFromContext(result.ValueOrDie());
}
/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major,
int *cc_minor,
CUdevice device) {
/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
int* cc_minor,
CUdevice device) {
*cc_major = 0;
*cc_minor = 0;
@ -1334,6 +1337,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
return port::Status::OK();
}
/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
CUdevice device) {
return port::Status{
port::error::INTERNAL,
"Feature not supported on CUDA platform (GetGpuISAVersion)"};
}
// Helper function that turns the integer output of cuDeviceGetAttribute to type
// T and wraps it in a StatusOr.
template <typename T>
@ -1352,49 +1362,49 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return converted;
}
/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount(
/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
CUdevice device) {
return GetSimpleAttribute<int>(device,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
CUdevice device) {
return GetSimpleAttribute<int64>(
device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
CUdevice device) {
return GetSimpleAttribute<int64>(
device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
CUdevice device) {
return GetSimpleAttribute<int64>(
device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
CUdevice device) {
return GetSimpleAttribute<int64>(device,
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
}
/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock(
/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
CUdevice device) {
return GetSimpleAttribute<int64>(device,
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
}
/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp(
/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
CUdevice device) {
return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
}
/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
CUdevice device) {
/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
CUdevice device) {
int value;
CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
@ -1422,7 +1432,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
CUresult res = tensorflow::wrap::cuDriverGetVersion(driver_version);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query driver version: " << ToString(res);
@ -1432,7 +1442,19 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
/* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
int device_ordinal) {
CUresult res = tensorflow::wrap::cuDeviceGetProperties(device_properties,
device_ordinal);
if (res != CUDA_SUCCESS) {
LOG(ERROR) << "failed to query device properties: " << ToString(res);
return false;
}
return true;
}
/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
CUdevice_attribute attribute, CUdevice device) {
int val;
CUresult res =
@ -1446,7 +1468,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return val;
}
/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
/* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
int value = -1;
CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
@ -1459,9 +1481,9 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ bool CUDADriver::GetDeviceMemoryInfo(CudaContext* context,
int64 *free_out,
int64 *total_out) {
/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
int64* free_out,
int64* total_out) {
ScopedActivateContext activation(context);
size_t free = 0;
size_t total = 0;
@ -1476,8 +1498,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
uint64 *result) {
/* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
uint64* result) {
size_t value = -1;
CUresult res = tensorflow::wrap::cuDeviceTotalMem(&value, device);
if (res != CUDA_SUCCESS) {
@ -1489,7 +1511,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return true;
}
/* static */ string CUDADriver::GetPCIBusID(CUdevice device) {
/* static */ string GpuDriver::GetPCIBusID(CUdevice device) {
string pci_bus_id;
static const int kBufferSize = 64;
absl::InlinedVector<char, 4> chars(kBufferSize);
@ -1504,8 +1526,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return pci_bus_id;
}
/* static */ bool CUDADriver::CanEnablePeerAccess(CudaContext* from,
CudaContext* to) {
/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
GpuContext* to) {
if (from == to) {
return true; // A context can always access its own memory.
}
@ -1533,8 +1555,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return can_access_peer;
}
/* static */ port::Status CUDADriver::EnablePeerAccess(CudaContext* from,
CudaContext* to) {
/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
GpuContext* to) {
if (from == to) {
return port::Status::OK(); // A context can always access its own memory.
}
@ -1553,8 +1575,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return port::Status::OK();
}
/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore(
CudaContext* context, CUfunction kernel, int threads_per_block,
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
GpuContext* context, CUfunction kernel, int threads_per_block,
size_t dynamic_shared_memory_bytes) {
ScopedActivateContext activation(context);
@ -1572,11 +1594,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
return max_blocks;
}
/* static */ CUcontext CUDADriver::CurrentContextOrDie() {
} // namespace gpu
namespace cuda {
CUcontext CurrentContextOrDie() {
CUcontext current = nullptr;
CUresult result = tensorflow::wrap::cuCtxGetCurrent(&current);
if (result != CUDA_SUCCESS) {
LOG(FATAL) << "failed to query current context: " << ToString(result);
LOG(FATAL) << "failed to query current context: " << gpu::ToString(result);
}
return current;
}
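The changes to cuda_driver.cc above are largely a mechanical rename of CUDADriver/CudaContext to GpuDriver/GpuContext, plus a new cuDeviceGetProperties wrapper and a GetGpuISAVersion stub that reports the feature as unsupported on the CUDA platform. As a minimal sketch of how a call site reads after the rename (the helper name CopyDeviceToHost and its error handling are illustrative assumptions, not part of the change):

#include "tensorflow/stream_executor/cuda/cuda_driver.h"

namespace stream_executor {

// Hypothetical helper, for illustration only: copy `size` bytes from a device
// pointer back to host memory through the renamed wrapper.
port::Status CopyDeviceToHost(gpu::GpuContext* context, CUdeviceptr gpu_src,
                              void* host_dst, uint64 size) {
  // Same behavior as the old CUDADriver::SynchronousMemcpyD2H; only the class
  // and context type names have changed.
  return gpu::GpuDriver::SynchronousMemcpyD2H(context, host_dst, gpu_src, size);
}

}  // namespace stream_executor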

View File

@ -18,495 +18,45 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
#include <stddef.h>
#include "tensorflow/stream_executor/platform/port.h"
#include "cuda/include/cuda.h"
#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
namespace stream_executor {
namespace cuda {
// Identifies the memory space where an allocation resides. See
// CUDADriver::GetPointerMemorySpace().
enum class MemorySpace { kHost, kDevice };
// Returns a casual string, such as "host" for the provided memory space.
string MemorySpaceString(MemorySpace memory_space);
class CudaContext;
// CUDADriver contains wrappers for calls to the userspace library driver. It's
// useful to isolate these calls and put basic wrappers around them to separate
// userspace library driver behaviors from the rest of the program.
//
// At the moment it's simply used as a namespace.
//
// The calls log any specific errors internally and return whether the operation
// was successful to the caller.
//
// The order of parameters is generally kept symmetric with the underlying CUDA
// driver API.
//
// Links on functions are to specific documentation under
// http://docs.nvidia.com/cuda/cuda-driver-api/
//
// Thread safety: these functions should not be used from signal handlers.
class CUDADriver {
public:
// Wraps a call to cuInit with logging to help indicate what has gone wrong in
// the case of failure. Safe to call multiple times; will be fast on all calls
// after the first.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
static port::Status Init();
// Returns the device associated with the given context.
// device is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
static port::StatusOr<CUdevice> DeviceFromContext(CudaContext* context);
// Creates a new CUDA stream associated with the given context via
// cuStreamCreate.
// stream is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
static bool CreateStream(CudaContext* context, CUstream* stream);
// Destroys a CUDA stream associated with the given context.
// stream is owned by the caller, must not be null, and *stream is set to null
// if the stream is successfully destroyed.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
static void DestroyStream(CudaContext* context, CUstream* stream);
// CUDA events can explicitly disable event TSC retrieval for some presumed
// performance improvement if timing is unnecessary.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
enum class EventFlags { kDefault, kDisableTiming };
// Creates a new event associated with the given context.
// result is an outparam owned by the caller and must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
static port::Status CreateEvent(CudaContext* context, CUevent* result,
EventFlags flags);
// Destroys *event and turns it into a nullptr. event may not be null, but
// *event may be, via cuEventDestroy
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
static port::Status DestroyEvent(CudaContext* context, CUevent* event);
// Allocates a GPU memory space of size bytes associated with the given
// context via cuMemAlloc.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
static void* DeviceAllocate(CudaContext* context, uint64 bytes);
// Deallocates a GPU memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void DeviceDeallocate(CudaContext* context, void* location);
// Allocates a unified memory space of size bytes associated with the given
// context via cuMemAllocManaged.
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);
// Deallocates a unified memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void UnifiedMemoryDeallocate(CudaContext* context, void* location);
// Allocates page-locked and CUDA-registered memory on the host via
// cuMemAllocHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
static void* HostAllocate(CudaContext* context, uint64 bytes);
// Deallocates a location created by HostAllocate, via cuMemFreeHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
static void HostDeallocate(CudaContext* context, void* location);
// Registers a memory region at location of size bytes via cuMemHostRegister.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
static bool HostRegister(CudaContext* context, void* location, uint64 bytes);
// Unregisters a memory region that was previously registered at location via
// cuMemHostUnregister.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
//
// TODO(leary) verify an error will be returned if the location wasn't
// previously registered.
static bool HostUnregister(CudaContext* context, void* location);
// Given a device ordinal, returns a device handle into the device outparam,
// which must not be null.
//
// N.B. these device handles do not have a corresponding destroy function in
// the CUDA driver API.
static port::Status GetDevice(int device_ordinal, CUdevice* device);
// Given a device handle, returns the name reported by the driver for the
// device.
static bool GetDeviceName(CUdevice device, string* name_out);
// Given a device to create a context for, returns a context handle into the
// context outparam, which must not be null.
//
// N.B. CUDA contexts are weird. They are implicitly associated with the
// calling thread. Current documentation on contexts and their influence on
// userspace processes is given here:
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
static port::Status CreateContext(CUdevice device,
const DeviceOptions& device_options,
CudaContext** context);
// Destroys the provided context via cuCtxDestroy.
// Don't do this while clients could still be using the context, per the docs
// bad things will happen.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
static void DestroyContext(CudaContext* context);
// Queries the runtime for the specified attribute of the specified function.
// cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
// in terms of integer-sized values, so there's no potential for overrun (as
// of CUDA 5.5).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
static bool FuncGetAttribute(CUfunction_attribute attribute,
CUfunction function, int* attribute_value);
// Sets the preferred cache configuration for the specified function.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
static bool FuncSetCacheConfig(CUfunction function,
CUfunc_cache cache_config);
// Gets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig(
CudaContext* context);
// Sets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
static port::Status ContextSetSharedMemConfig(
CudaContext* context, CUsharedconfig shared_mem_config);
// Launches a CUDA kernel via cuLaunchKernel.
// TODO(leary) describe the structure of kernel_params and extra in a readable
// way.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
static bool LaunchKernel(CudaContext* context, CUfunction function,
unsigned int grid_dim_x, unsigned int grid_dim_y,
unsigned int grid_dim_z, unsigned int block_dim_x,
unsigned int block_dim_y, unsigned int block_dim_z,
unsigned int shared_mem_bytes, CUstream stream,
void** kernel_params, void** extra);
// Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
// handle in "module". Any error logs that are produced are logged internally.
static bool LoadPtx(CudaContext* context, const char* ptx_contents,
CUmodule* module);
// Loads cubin_bytes with the CUDA driver's blob loading interface and stores
// the resulting handle in "module".
static port::Status LoadCubin(CudaContext* context, const char* cubin_bytes,
CUmodule* module);
// Retrieves a named kernel from a loaded module, and places the resulting
// handle into function (outparam) on success. Neither kernel_name nor
// function may be null. No ownership is taken of kernel_name.
static bool GetModuleFunction(CudaContext* context, CUmodule module,
const char* kernel_name, CUfunction* function);
// Retrieves a named global/constant symbol from a loaded module, and returns
// a device pointer and size of the symbol on success. symbol_name may not be
// null. At least one of dptr or bytes should not be null. No ownership is
// taken of symbol_name.
static bool GetModuleSymbol(CudaContext* context, CUmodule module,
const char* symbol_name, CUdeviceptr* dptr,
size_t* bytes);
// Unloads module from the current context via cuModuleUnload.
// TODO(leary) the documentation doesn't say what kind of disasters happen
// if you try to unload a module while its CUfunctions are in use.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
static void UnloadModule(CudaContext* context, CUmodule module);
// Performs a synchronous memset of the device memory segment via cuMemsetD8.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
static bool SynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
uint8 value, size_t size);
// Performs a synchronous memset of the device memory segment via cuMemsetD32.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
static bool SynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location, uint32 value,
size_t uint32_count);
// Performs an asynchronous memset of the device memory segment via
// cuMemsetD8Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
static bool AsynchronousMemsetUint8(CudaContext* context,
CUdeviceptr location, uint8 value,
size_t uint32_count, CUstream stream);
// Performs an asynchronous memset of the device memory segment via
// cuMemsetD32Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
static bool AsynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location, uint32 value,
size_t uint32_count, CUstream stream);
// -- Synchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
static port::Status SynchronousMemcpyD2H(CudaContext* context, void* host_dst,
CUdeviceptr gpu_src, uint64 size);
static port::Status SynchronousMemcpyH2D(CudaContext* context,
CUdeviceptr gpu_dst,
const void* host_src, uint64 size);
static port::Status SynchronousMemcpyD2D(CudaContext* context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src, uint64 size);
// -- Asynchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
static bool AsynchronousMemcpyD2H(CudaContext* context, void* host_dst,
CUdeviceptr gpu_src, uint64 size,
CUstream stream);
static bool AsynchronousMemcpyH2D(CudaContext* context, CUdeviceptr gpu_dst,
const void* host_src, uint64 size,
CUstream stream);
static bool AsynchronousMemcpyD2D(CudaContext* context, CUdeviceptr gpu_dst,
CUdeviceptr gpu_src, uint64 size,
CUstream stream);
// The CUDA stream callback type signature.
// The data passed to AddStreamCallback is subsequently passed to this
// callback when it fires.
//
// Some notable things:
// * Callbacks must not make any CUDA API calls.
// * Callbacks from independent streams execute in an undefined order and may
// be serialized.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
typedef void (*StreamCallback)(CUstream stream, CUresult status, void* data);
// Enqueues a callback operation into stream.
// See StreamCallback above and the NVIDIA documentation for additional
// details.
static bool AddStreamCallback(CudaContext* context, CUstream stream,
StreamCallback callback, void* data);
// Causes stream to wait for event to trigger before proceeding via
// cuStreamWaitEvent.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
static bool WaitStreamOnEvent(CudaContext* context, CUstream stream,
CUevent event);
// Blocks the calling thread until the operations enqueued onto stream have
// been completed, via cuStreamSynchronize.
//
// TODO(leary) if a pathological thread enqueues operations onto the stream
// while another thread blocks like this, can you wind up waiting an unbounded
// amount of time?
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
static port::Status SynchronizeStream(CudaContext* context, CUstream stream);
// Blocks the calling thread until the operations associated with the context
// have been completed, via cuCtxSynchronize.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
static bool SynchronizeContext(CudaContext* context);
// Returns true if all stream tasks have completed at time of the call. Note
// the potential for races around this call (if another thread adds work to
// the stream immediately after this returns).
static bool IsStreamIdle(CudaContext* context, CUstream stream);
// Returns whether code in the from context can access memory in the to
// context via cuDeviceCanAccessPeer.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
static bool CanEnablePeerAccess(CudaContext* from, CudaContext* to);
// Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
static port::Status EnablePeerAccess(CudaContext* from, CudaContext* to);
// Returns the elapsed milliseconds between start and stop via
// cuEventElapsedTime.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
static bool GetEventElapsedTime(CudaContext* context,
float* elapsed_milliseconds, CUevent start,
CUevent stop);
// Records that an event occurred when execution reaches the current point in
// the stream via cuEventRecord.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
static port::Status RecordEvent(CudaContext* context, CUevent event,
CUstream stream);
// Polls (without blocking) to determine the status of an event - pending or
// complete (or an error status).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
static port::StatusOr<CUresult> QueryEvent(CudaContext* context,
CUevent event);
// -- Pointer-specific calls.
// Returns the context in which pointer was allocated or registered.
static port::StatusOr<CudaContext*> GetPointerContext(CUdeviceptr pointer);
// Returns the device associated with the context from GetPointerContext().
static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer);
// Returns the memory space addressed by pointer.
static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer);
// Returns the base address and size of the device pointer dptr.
static port::Status GetPointerAddressRange(CUdeviceptr dptr,
CUdeviceptr* base, size_t* size);
// -- Device-specific calls.
// Returns the compute capability for the device, e.g. (3, 5).
// This is currently done via the deprecated device API.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
CUdevice device);
// Returns the number of multiprocessors on the device (note that the device
// may be multi-GPU-per-board).
static port::StatusOr<int> GetMultiprocessorCount(CUdevice device);
// Returns the limit on number of threads that can be resident in a single
// multiprocessor.
static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device);
// Returns the limit on number of threads which may be resident for a single
// block (cooperative thread array).
static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device);
// Returns the amount of shared memory available on a single GPU core (i.e.
// SM on NVIDIA devices).
static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device);
// Returns the amount of shared memory available for a single block
// (cooperative thread array).
static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device);
// Returns the maximum supported number of registers per block.
static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);
// Returns the number of threads per warp.
static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);
// Queries the grid limits for device with cuDeviceGetAttribute calls.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool GetGridLimits(int* x, int* y, int* z, CUdevice device);
// Gets a specific integer-valued property about the given device.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
CUdevice device);
// Returns whether ECC is enabled for the given CUdevice via
// cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool IsEccEnabled(CUdevice device, bool* result);
// Returns the total amount of memory available for allocation by the CUDA
// context, in bytes, via cuDeviceTotalMem.
static bool GetDeviceTotalMemory(CUdevice device, uint64* result);
// Returns the free amount of memory and total amount of memory, as reported
// by cuMemGetInfo.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
static bool GetDeviceMemoryInfo(CudaContext* context, int64* free,
int64* total);
// Returns a PCI bus id string for the device.
// [domain]:[bus]:[device].[function]
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
static string GetPCIBusID(CUdevice device);
// -- Context- and device-independent calls.
// Returns the number of visible CUDA device via cuDeviceGetCount.
// This should correspond to the set of device ordinals available.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
static int GetDeviceCount();
// Returns the driver version number via cuDriverGetVersion.
// This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
// instead, the CUDA toolkit release number that this driver is compatible
// with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
// compatible driver).
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
static bool GetDriverVersion(int* driver_version);
// -- Other calls
// Returns the maximum number of blocks (per multiprocessor) occupied by the
// specified kernel/CUfunction when launched with the specified parameters.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
CudaContext* context, CUfunction kernel, int threads_per_block,
size_t dynamic_shared_memory_bytes);
// Returns the current context set in CUDA. This is done by calling the cuda
// driver (e.g., this value is not our cached view of the current context).
static CUcontext CurrentContextOrDie();
// Seam for injecting an error at CUDA initialization time for testing
// purposes.
static bool driver_inject_init_error_;
};
// Ensures a context is activated within a scope.
class ScopedActivateContext {
public:
// Activates the context via cuCtxSetCurrent, if it is not the currently
// active context (a la cuCtxGetCurrent). Note the alternative push/pop
// mechanism is said by NVIDIA to be relatively slow and deprecated.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
explicit ScopedActivateContext(CudaContext* context);
// Checks that the context has remained activated for the duration of the
// scope.
~ScopedActivateContext();
private:
CudaContext* to_restore_ = nullptr;
};
// CudaContext wraps a cuda CUcontext handle, and includes a unique id. The
namespace gpu {
// GpuContext wraps a CUDA CUcontext handle, and includes a unique id. The
// unique id is positive, and ids are not repeated within the process.
class CudaContext {
class GpuContext {
public:
CudaContext(CUcontext context, int64 id) : context_(context), id_(id) {}
GpuContext(CUcontext context, int64 id) : context_(context), id_(id) {}
CUcontext context() const { return context_; }
int64 id() const { return id_; }
// Disallow copying and moving.
CudaContext(CudaContext&&) = delete;
CudaContext(const CudaContext&) = delete;
CudaContext& operator=(CudaContext&&) = delete;
CudaContext& operator=(const CudaContext&) = delete;
GpuContext(GpuContext&&) = delete;
GpuContext(const GpuContext&) = delete;
GpuContext& operator=(GpuContext&&) = delete;
GpuContext& operator=(const GpuContext&) = delete;
private:
CUcontext const context_;
const int64 id_;
};
inline CUcontext CurrentContextOrDie() {
return CUDADriver::CurrentContextOrDie();
}
} // namespace gpu
namespace cuda {
using MemorySpace = gpu::MemorySpace;
using CUDADriver = gpu::GpuDriver;
using ScopedActivateContext = gpu::ScopedActivateContext;
using CudaContext = gpu::GpuContext;
// Returns the current context set in CUDA. This is done by calling the cuda
// driver (e.g., this value is not our cached view of the current context).
CUcontext CurrentContextOrDie();
} // namespace cuda
} // namespace stream_executor
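The tail of cuda_driver.h keeps the old names alive: MemorySpace, CUDADriver, ScopedActivateContext, and CudaContext are now aliases into namespace gpu, so code outside stream_executor that still spells the cuda:: names keeps compiling. A minimal sketch of such a call site (the function SyncThroughLegacyNames is hypothetical):

#include "tensorflow/stream_executor/cuda/cuda_driver.h"

namespace stream_executor {

// Hypothetical pre-existing call site, untouched by this change: the cuda::
// spellings below resolve to gpu::GpuContext and gpu::GpuDriver through the
// using-declarations at the end of the header.
bool SyncThroughLegacyNames(cuda::CudaContext* context) {
  return cuda::CUDADriver::SynchronizeContext(context);
}

}  // namespace stream_executor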

View File

@ -78,6 +78,7 @@ namespace wrap {
__macro(cuDeviceGetCount) \
__macro(cuDeviceGetName) \
__macro(cuDeviceGetPCIBusId) \
__macro(cuDeviceGetProperties) \
__macro(cuDevicePrimaryCtxGetState) \
__macro(cuDevicePrimaryCtxRelease) \
__macro(cuDevicePrimaryCtxRetain) \

View File

@ -20,30 +20,11 @@ limitations under the License.
#include "tensorflow/stream_executor/lib/statusor.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
CUDAEvent::CUDAEvent(CUDAExecutor* parent)
: parent_(parent), cuda_event_(nullptr) {}
CUDAEvent::~CUDAEvent() {}
port::Status CUDAEvent::Init() {
return CUDADriver::CreateEvent(parent_->cuda_context(), &cuda_event_,
CUDADriver::EventFlags::kDisableTiming);
}
port::Status CUDAEvent::Destroy() {
return CUDADriver::DestroyEvent(parent_->cuda_context(), &cuda_event_);
}
port::Status CUDAEvent::Record(CUDAStream* stream) {
return CUDADriver::RecordEvent(parent_->cuda_context(), cuda_event_,
stream->cuda_stream());
}
Event::Status CUDAEvent::PollForStatus() {
Event::Status GpuEvent::PollForStatus() {
port::StatusOr<CUresult> status =
CUDADriver::QueryEvent(parent_->cuda_context(), cuda_event_);
GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
if (!status.ok()) {
LOG(ERROR) << "Error polling for event status: "
<< status.status().error_message();
@ -62,9 +43,5 @@ Event::Status CUDAEvent::PollForStatus() {
}
}
const CUevent& CUDAEvent::cuda_event() {
return cuda_event_;
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
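Only the CUDA-specific piece of the event implementation stays in this file: GpuEvent::PollForStatus, which translates the driver's event query into the platform-independent Event::Status. A sketch of the underlying driver call it relies on, assuming a recorded CUevent and its owning GpuContext (the helper EventIsComplete is hypothetical):

#include "tensorflow/stream_executor/cuda/cuda_driver.h"

namespace stream_executor {

// Hypothetical helper: true only once the event has fired. cuEventQuery
// reports CUDA_ERROR_NOT_READY while work is still pending, and QueryEvent
// passes that result through as a value rather than as an error status.
bool EventIsComplete(gpu::GpuContext* context, CUevent event) {
  port::StatusOr<CUresult> res = gpu::GpuDriver::QueryEvent(context, event);
  if (!res.ok()) {
    return false;  // the query itself failed
  }
  return res.ValueOrDie() == CUDA_SUCCESS;
}

}  // namespace stream_executor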

View File

@ -16,45 +16,12 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/gpu/gpu_event.h"
namespace stream_executor {
namespace cuda {
// CUDAEvent wraps a CUevent in the platform-independent EventInterface
// interface.
class CUDAEvent : public internal::EventInterface {
public:
explicit CUDAEvent(CUDAExecutor* parent);
~CUDAEvent() override;
// Populates the CUDA-platform-specific elements of this object.
port::Status Init();
// Deallocates any platform-specific elements of this object. This is broken
// out (not part of the destructor) to allow for error reporting.
port::Status Destroy();
// Inserts the event at the current position into the specified stream.
port::Status Record(CUDAStream* stream);
// Polls the CUDA platform for the event's current status.
Event::Status PollForStatus();
// The underlying CUDA event element.
const CUevent& cuda_event();
private:
// The Executor used to which this object and CUevent are bound.
CUDAExecutor* parent_;
// The underlying CUDA event element.
CUevent cuda_event_;
};
using CUDAEvent = gpu::GpuEvent;
} // namespace cuda
} // namespace stream_executor

View File

@ -33,7 +33,7 @@ limitations under the License.
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
@ -45,13 +45,13 @@ namespace wrap {
// manner on first use. This dynamic loading technique is used to avoid DSO
// dependencies on vendor libraries which may or may not be available in the
// deployed binary environment.
#define STREAM_EXECUTOR_CUFFT_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
cufftResult operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
#define STREAM_EXECUTOR_CUFFT_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
cufftResult operator()(GpuExecutor *parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
} __name;
#else
@ -77,8 +77,8 @@ namespace wrap {
return f; \
} \
template <typename... Args> \
cufftResult operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
cufftResult operator()(GpuExecutor *parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return DynLoad()(args...); \
} \
} __name; \
@ -145,8 +145,8 @@ cufftType CUDAFftType(fft::Type type) {
}
// Associates the given stream with the given cuFFT plan.
bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
auto ret = wrap::cufftSetStream(parent, plan, AsCUDAStreamValue(stream));
bool SetStream(GpuExecutor *parent, cufftHandle plan, Stream *stream) {
auto ret = wrap::cufftSetStream(parent, plan, AsGpuStreamValue(stream));
if (ret != CUFFT_SUCCESS) {
LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret;
return false;
@ -157,7 +157,7 @@ bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
} // namespace
port::Status CUDAFftPlan::Initialize(
CUDAExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
GpuExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
uint64 *input_embed, uint64 input_stride, uint64 input_distance,
uint64 *output_embed, uint64 output_stride, uint64 output_distance,
fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {
@ -317,7 +317,7 @@ port::Status CUDAFftPlan::Initialize(
return port::Status::OK();
}
port::Status CUDAFftPlan::Initialize(CUDAExecutor *parent, Stream *stream,
port::Status CUDAFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
int rank, uint64 *elem_count,
fft::Type type,
ScratchAllocator *scratch_allocator) {
@ -549,8 +549,8 @@ bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
}
auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
CUDAComplex(CUDAMemoryMutable(output)));
GpuComplex(const_cast<InputT *>(GpuMemory(input))),
GpuComplex(GpuMemoryMutable(output)));
if (ret != CUFFT_SUCCESS) {
LOG(ERROR) << "failed to run cuFFT routine: " << ret;
@ -576,8 +576,8 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
}
auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
CUDAComplex(CUDAMemoryMutable(output)),
GpuComplex(const_cast<InputT *>(GpuMemory(input))),
GpuComplex(GpuMemoryMutable(output)),
cuda_fft_plan->GetFftDirection());
if (ret != CUFFT_SUCCESS) {
@ -614,22 +614,22 @@ STREAM_EXECUTOR_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
#undef STREAM_EXECUTOR_CUDA_DEFINE_FFT
} // namespace cuda
} // namespace gpu
void initialize_cufft() {
port::Status status =
PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
cuda::kCudaPlatformId, cuda::kCuFftPlugin, "cuFFT",
cuda::kCudaPlatformId, gpu::kCuFftPlugin, "cuFFT",
[](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
cuda::CUDAExecutor *cuda_executor =
dynamic_cast<cuda::CUDAExecutor *>(parent);
gpu::GpuExecutor *cuda_executor =
dynamic_cast<gpu::GpuExecutor *>(parent);
if (cuda_executor == nullptr) {
LOG(ERROR) << "Attempting to initialize an instance of the cuFFT "
<< "support library with a non-CUDA StreamExecutor";
return nullptr;
}
return new cuda::CUDAFft(cuda_executor);
return new gpu::CUDAFft(cuda_executor);
});
if (!status.ok()) {
LOG(ERROR) << "Unable to register cuFFT factory: "
@ -637,7 +637,7 @@ void initialize_cufft() {
}
PluginRegistry::Instance()->SetDefaultFactory(
cuda::kCudaPlatformId, PluginKind::kFft, cuda::kCuFftPlugin);
cuda::kCudaPlatformId, PluginKind::kFft, gpu::kCuFftPlugin);
}
} // namespace stream_executor
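For reference, the cuFFT shim macro touched above now threads a GpuExecutor through instead of a CUDAExecutor; roughly what STREAM_EXECUTOR_CUFFT_WRAP(cufftSetStream) expands to in the statically linked branch (schematic only; in the real file this sits inside namespace wrap):

// Schematic expansion of STREAM_EXECUTOR_CUFFT_WRAP(cufftSetStream). The only
// difference from the previous revision is the GpuExecutor parameter and the
// gpu:: qualification of ScopedActivateExecutorContext.
struct WrapperShim__cufftSetStream {
  template <typename... Args>
  cufftResult operator()(gpu::GpuExecutor *parent, Args... args) {
    gpu::ScopedActivateExecutorContext sac{parent};  // make parent's context current
    return ::cufftSetStream(args...);  // forward to the real cuFFT entry point
  }
} cufftSetStream;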

View File

@ -30,9 +30,9 @@ namespace stream_executor {
class Stream;
namespace cuda {
namespace gpu {
class CUDAExecutor;
class GpuExecutor;
// Opaque and unique identifier for the cuFFT plugin.
extern const PluginId kCuFftPlugin;
@ -64,17 +64,17 @@ class CUDAFftPlan : public fft::Plan {
}
// Initialize function for batched plan
port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
uint64 *elem_count, uint64 *input_embed,
port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
uint64* elem_count, uint64* input_embed,
uint64 input_stride, uint64 input_distance,
uint64 *output_embed, uint64 output_stride,
uint64* output_embed, uint64 output_stride,
uint64 output_distance, fft::Type type,
int batch_count, ScratchAllocator *scratch_allocator);
int batch_count, ScratchAllocator* scratch_allocator);
// Initialize function for 1d, 2d, and 3d plans
port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
uint64 *elem_count, fft::Type type,
ScratchAllocator *scratch_allocator);
port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
uint64* elem_count, fft::Type type,
ScratchAllocator* scratch_allocator);
port::Status UpdateScratchAllocator(Stream *stream,
ScratchAllocator *scratch_allocator);
@ -83,7 +83,7 @@ class CUDAFftPlan : public fft::Plan {
bool IsInitialized() const { return is_initialized_; }
private:
CUDAExecutor *parent_;
GpuExecutor* parent_;
cufftHandle plan_;
fft::Type fft_type_;
DeviceMemory<uint8> scratch_;
@ -96,7 +96,7 @@ class CUDAFftPlan : public fft::Plan {
// This satisfies the platform-agnostic FftSupport interface.
//
// Note that the cuFFT handle that this encapsulates is implicitly tied to the
// context (and, as a result, the device) that the parent CUDAExecutor is tied
// context (and, as a result, the device) that the parent GpuExecutor is tied
// to. This simply happens as an artifact of creating the cuFFT handle when a
// CUDA context is active.
//
@ -104,13 +104,13 @@ class CUDAFftPlan : public fft::Plan {
// context of parent_, so all context is explicit.
class CUDAFft : public fft::FftSupport {
public:
explicit CUDAFft(CUDAExecutor *parent) : parent_(parent) {}
explicit CUDAFft(GpuExecutor* parent) : parent_(parent) {}
~CUDAFft() override {}
TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES
private:
CUDAExecutor *parent_;
GpuExecutor* parent_;
// Two helper functions that execute dynload::cufftExec?2?.
@ -131,7 +131,7 @@ class CUDAFft : public fft::FftSupport {
SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft);
};
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_

View File

@ -72,7 +72,7 @@ extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;
namespace stream_executor {
namespace cuda {
namespace gpu {
// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver library
@ -84,17 +84,16 @@ namespace cuda {
// variable with extern linkage and populate it from another translation unit.
std::function<string(const string &)> g_cubinate;
static CUDAEvent *AsCUDAEvent(Event *event) {
static GpuEvent* AsGpuEvent(Event* event) {
DCHECK(event != nullptr);
return static_cast<CUDAEvent *>(event->implementation());
return static_cast<GpuEvent*>(event->implementation());
}
// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static CUDATimer *AsCUDATimer(Timer *timer) {
static GpuTimer* AsGpuTimer(Timer* timer) {
DCHECK(timer != nullptr);
return static_cast<CUDATimer *>(timer->implementation());
return static_cast<GpuTimer*>(timer->implementation());
}
// Given const GPU memory, returns a libcuda device pointer datatype, suitable
@ -112,48 +111,49 @@ static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
return AsCudaDevicePtr(*gpu_mem);
}
CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec) {
GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
CHECK(cuda_exec != nullptr);
return cuda_exec->cuda_context();
return cuda_exec->gpu_context();
}
CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) {
return static_cast<CUDAExecutor *>(stream_exec->implementation());
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
return static_cast<GpuExecutor*>(stream_exec->implementation());
}
CUDAExecutor::~CUDAExecutor() {
CHECK(kernel_to_gpu_binary_.empty()) << "CUDAExecutor has live kernels.";
CHECK(gpu_binary_to_module_.empty()) << "CUDAExecutor has loaded modules.";
GpuExecutor::~GpuExecutor() {
CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
if (context_ != nullptr) {
CUDADriver::DestroyContext(context_);
GpuDriver::DestroyContext(context_);
}
}
port::Status CUDAExecutor::Init(int device_ordinal,
DeviceOptions device_options) {
port::Status GpuExecutor::Init(int device_ordinal,
DeviceOptions device_options) {
device_ordinal_ = device_ordinal;
auto status = CUDADriver::Init();
auto status = GpuDriver::Init();
if (!status.ok()) {
return status;
}
status = CUDADriver::GetDevice(device_ordinal_, &device_);
status = GpuDriver::GetDevice(device_ordinal_, &device_);
if (!status.ok()) {
return status;
}
status = CUDADriver::CreateContext(device_, device_options, &context_);
status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
&context_);
if (!status.ok()) {
return status;
}
return CUDADriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}
bool CUDAExecutor::FindOnDiskForComputeCapability(
bool GpuExecutor::FindOnDiskForComputeCapability(
absl::string_view filename, absl::string_view canonical_suffix,
string *found_filename) const {
string* found_filename) const {
if (cc_major_ == 0 && cc_minor_ == 0) {
return false;
}
@ -177,6 +177,13 @@ bool CUDAExecutor::FindOnDiskForComputeCapability(
return false;
}
bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
absl::string_view canonical_suffix,
string* found_filename) const {
LOG(ERROR)
<< "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
return false;
}
// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
@ -211,12 +218,12 @@ static string GetBinaryDir(bool strip_exe) {
return exe_path;
}
bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, CUmodule* module) {
uint64_t module_refcount;
std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
if (*module == nullptr) {
auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
auto load_status = GpuDriver::LoadCubin(context_, cubin, module);
if (!load_status.ok()) {
LOG(ERROR) << "failed to load CUBIN: " << load_status;
return false;
@ -233,12 +240,12 @@ bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
return true;
}
bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
bool GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
uint64_t module_refcount;
std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
if (*module == nullptr) {
if (!CUDADriver::LoadPtx(context_, ptx, module)) {
if (!GpuDriver::LoadPtx(context_, ptx, module)) {
return false;
}
VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
@ -253,9 +260,14 @@ bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
return true;
}
bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) {
CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, CUmodule* module) {
LOG(ERROR) << "Feature not supported on CUDA platform (LoadModuleFromHsaco)";
return false;
}
bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
KernelBase* kernel) {
GpuKernel* cuda_kernel = AsGpuKernel(kernel);
CUmodule module;
const string *kernelname;
@ -295,8 +307,8 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
return false;
}
VLOG(2) << "getting function " << *kernelname << " from module " << module;
if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(),
cuda_kernel->cuda_function_ptr())) {
if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
cuda_kernel->gpu_function_ptr())) {
return false;
}
@ -313,7 +325,7 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
return true;
}
bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
auto module_it = gpu_binary_to_module_.find(gpu_binary);
if (gpu_binary_to_module_.end() == module_it) {
VLOG(3) << "No loaded CUDA module for " << gpu_binary;
@ -324,13 +336,13 @@ bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
if (--refcount == 0) {
VLOG(3) << "Unloading CUDA module " << module;
CUDADriver::UnloadModule(context_, module);
GpuDriver::UnloadModule(context_, module);
gpu_binary_to_module_.erase(module_it);
}
return true;
}
void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
mutex_lock lock{in_memory_modules_mu_};
@ -346,9 +358,9 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
kernel_to_gpu_binary_.erase(gpu_binary_it);
}
bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
ModuleHandle *module_handle) {
// In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
ModuleHandle* module_handle) {
// In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
// ModuleHandle::id().
CUmodule cu_module;
if (spec.has_cuda_cubin_in_memory()) {
@ -382,25 +394,23 @@ bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
return false;
}
bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
mutex_lock lock{in_memory_modules_mu_};
return UnloadGpuBinary(gpu_binary);
}
bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
KernelMetadata *kernel_metadata) {
bool GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
KernelMetadata* kernel_metadata) {
int value;
if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
*cuda_kernel->cuda_function_ptr(),
&value)) {
if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
*cuda_kernel->gpu_function_ptr(), &value)) {
return false;
}
kernel_metadata->set_registers_per_thread(value);
if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
*cuda_kernel->cuda_function_ptr(),
&value)) {
if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
*cuda_kernel->gpu_function_ptr(), &value)) {
return false;
}
kernel_metadata->set_shared_memory_bytes(value);
@ -408,13 +418,13 @@ bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
return true;
}
bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &kernel,
const KernelArgsArrayBase &args) {
bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
const BlockDim& block_dims, const KernelBase& kernel,
const KernelArgsArrayBase& args) {
CHECK_EQ(kernel.Arity(), args.number_of_arguments());
CUstream custream = AsCUDAStreamValue(stream);
const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
CUstream custream = AsGpuStreamValue(stream);
const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
// Only perform/print the occupancy check once. Even just checking to see
// whether we've done an occupancy check on this kernel before isn't free
@ -431,16 +441,16 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
if (cuda_kernel->GetPreferredCacheConfig() !=
KernelCacheConfig::kNoPreference) {
CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig());
GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
}
void **kernel_params = const_cast<void **>(args.argument_addresses().data());
if (!CUDADriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
block_dims.z, thread_dims.x, thread_dims.y,
thread_dims.z, args.number_of_shared_bytes(),
custream, kernel_params,
nullptr /* = extra */)) {
if (!GpuDriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
block_dims.z, thread_dims.x, thread_dims.y,
thread_dims.z, args.number_of_shared_bytes(),
custream, kernel_params,
nullptr /* = extra */)) {
LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
<< args.number_of_arguments()
<< " args; thread dim: " << thread_dims.ToString()
@ -454,9 +464,9 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
const ThreadDim &thread_dims,
const BlockDim &block_dims) {
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
const ThreadDim& thread_dims,
const BlockDim& block_dims) {
VLOG(2) << "Computing kernel occupancy for kernel "
<< kernel.demangled_name();
VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
@ -475,8 +485,8 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
const DeviceDescription &device_description =
kernel.parent()->GetDeviceDescription();
const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
smem_per_block, thread_dims, cufunc);
@ -496,10 +506,11 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads per
// block. If unable to compute occupancy, zero is returned.
int CUDAExecutor::CalculateOccupancy(
const DeviceDescription &device_description, uint64 registers_per_thread,
uint64 shared_memory_per_block, const ThreadDim &thread_dims,
CUfunction func) {
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims,
CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
@ -511,12 +522,12 @@ int CUDAExecutor::CalculateOccupancy(
// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int CUDAExecutor::CompareOccupancy(int *initial_blocks,
const DeviceDescription &device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim &thread_dims,
CUfunction func) {
int GpuExecutor::CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims,
CUfunction func) {
int suggested_blocks = 0;
int suggested_threads = 0;
CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
@ -531,88 +542,87 @@ int CUDAExecutor::CompareOccupancy(int *initial_blocks,
}
}
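// Sketch, not part of this change: the raw CUDA driver call behind the two
// occupancy helpers above. Parameter names follow the driver API; 'func' is
// any loaded CUfunction and the shared-memory figure is illustrative.
static void OccupancySketch(CUfunction func, size_t dynamic_smem_bytes) {
  int min_grid_size = 0;      // blocks needed to saturate the device
  int suggested_threads = 0;  // block size the driver recommends
  CUresult res = cuOccupancyMaxPotentialBlockSize(
      &min_grid_size, &suggested_threads, func,
      /*blockSizeToDynamicSMemSize=*/nullptr,
      /*dynamicSMemSize=*/dynamic_smem_bytes,
      /*blockSizeLimit=*/0);
  if (res == CUDA_SUCCESS) {
    // CompareOccupancy reports headroom by comparing suggested_threads with
    // the thread dims the caller actually requested.
    VLOG(2) << "suggested threads/block: " << suggested_threads
            << ", min grid size: " << min_grid_size;
  }
}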
void *CUDAExecutor::Allocate(uint64 size) {
return CUDADriver::DeviceAllocate(context_, size);
void* GpuExecutor::Allocate(uint64 size) {
return GpuDriver::DeviceAllocate(context_, size);
}
void *CUDAExecutor::AllocateSubBuffer(DeviceMemoryBase *mem,
uint64 offset_bytes, uint64 size_bytes) {
void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
uint64 size_bytes) {
// offset and size are in bytes, so char* works as the pointer type.
return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
}
void CUDAExecutor::Deallocate(DeviceMemoryBase *mem) {
void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
// CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary.
if (!mem->is_sub_buffer()) {
CUDADriver::DeviceDeallocate(context_, mem->opaque());
GpuDriver::DeviceDeallocate(context_, mem->opaque());
}
}
bool CUDAExecutor::HostMemoryRegister(void *location, uint64 size) {
bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
if (location == nullptr || size == 0) {
LOG(WARNING) << "attempting to register null or zero-sized memory: "
<< location << "; size " << size;
}
VLOG(2) << "registering " << location << " size " << size;
return CUDADriver::HostRegister(context_, location, size);
return GpuDriver::HostRegister(context_, location, size);
}
bool CUDAExecutor::HostMemoryUnregister(void *location) {
bool GpuExecutor::HostMemoryUnregister(void* location) {
VLOG(2) << "unregistering " << location;
return CUDADriver::HostUnregister(context_, location);
return GpuDriver::HostUnregister(context_, location);
}
bool CUDAExecutor::SynchronizeAllActivity() {
return CUDADriver::SynchronizeContext(context_);
bool GpuExecutor::SynchronizeAllActivity() {
return GpuDriver::SynchronizeContext(context_);
}
bool CUDAExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) {
bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
return CUDADriver::SynchronousMemsetUint32(
return GpuDriver::SynchronousMemsetUint32(
context_, AsCudaDevicePtr(location), 0x0, size / 4);
}
return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
0x0, size);
return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
0x0, size);
}
bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value,
uint64 size) {
bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
// cudaMemset reinterprets "value" as a uint8.
uint8 byte_value = static_cast<uint8>(value);
uint32 pattern = (byte_value << 24) | (byte_value << 16) |
(byte_value << 8) | byte_value;
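// e.g. value 0xAB expands to pattern 0xABABABAB, so the 4-byte memset writes
// the same bytes the 1-byte memset would, just in word-sized chunks.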
return CUDADriver::SynchronousMemsetUint32(
return GpuDriver::SynchronousMemsetUint32(
context_, AsCudaDevicePtr(location), pattern, size / 4);
}
return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
value, size);
return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
value, size);
}
port::Status CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
const void *host_src,
uint64 size) {
return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
host_src, size);
port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) {
return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
host_src, size);
}
port::Status CUDAExecutor::SynchronousMemcpy(void *host_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) {
return CUDADriver::SynchronousMemcpyD2H(context_, host_dst,
AsCudaDevicePtr(gpu_src), size);
port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) {
return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
AsCudaDevicePtr(gpu_src), size);
}
port::Status CUDAExecutor::SynchronousMemcpyDeviceToDevice(
DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) {
return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
AsCudaDevicePtr(gpu_src), size);
port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
AsCudaDevicePtr(gpu_src), size);
}
bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
uint64 size) {
bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
return Memset32(stream, location, 0x0, size);
@ -621,88 +631,87 @@ bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
}
}
bool CUDAExecutor::Memset(Stream *stream, DeviceMemoryBase *location,
uint8 pattern, uint64 size) {
bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
uint8 pattern, uint64 size) {
VLOG(2) << "enqueueing memset8 operation onto stream " << stream
<< " at location " << location << " with size " << size
<< " and pattern " << std::hex << pattern;
return CUDADriver::AsynchronousMemsetUint8(
context_, AsCudaDevicePtr(location), pattern, size,
AsCUDAStreamValue(stream));
return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
pattern, size,
AsGpuStreamValue(stream));
}
bool CUDAExecutor::Memset32(Stream *stream, DeviceMemoryBase *location,
uint32 pattern, uint64 size) {
bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
uint32 pattern, uint64 size) {
VLOG(2) << "enqueueing memset32 operation onto stream " << stream
<< " at location " << location << " with size " << size
<< " and pattern " << std::hex << pattern;
CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0);
return CUDADriver::AsynchronousMemsetUint32(
return GpuDriver::AsynchronousMemsetUint32(
context_, AsCudaDevicePtr(location), pattern, size / 4,
AsCUDAStreamValue(stream));
AsGpuStreamValue(stream));
}
bool CUDAExecutor::Memcpy(Stream *stream, void *host_dst,
const DeviceMemoryBase &gpu_src, uint64 size) {
return CUDADriver::AsynchronousMemcpyD2H(context_, host_dst,
AsCudaDevicePtr(gpu_src), size,
AsCUDAStreamValue(stream));
bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
const DeviceMemoryBase& gpu_src, uint64 size) {
return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
AsCudaDevicePtr(gpu_src), size,
AsGpuStreamValue(stream));
}
bool CUDAExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst,
const void *host_src, uint64 size) {
return CUDADriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
host_src, size,
AsCUDAStreamValue(stream));
bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) {
return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
host_src, size,
AsGpuStreamValue(stream));
}
bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream,
DeviceMemoryBase *gpu_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) {
return CUDADriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
AsCudaDevicePtr(gpu_src), size,
AsCUDAStreamValue(stream));
bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
DeviceMemoryBase* gpu_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) {
return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
AsCudaDevicePtr(gpu_src), size,
AsGpuStreamValue(stream));
}
bool CUDAExecutor::HostCallback(Stream *stream,
std::function<port::Status()> callback) {
bool GpuExecutor::HostCallback(Stream* stream,
std::function<port::Status()> callback) {
auto callback_ptr = new std::function<void()>([callback]() {
port::Status s = callback();
if (!s.ok()) {
LOG(WARNING) << "Host callback failed: " << s;
}
});
return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream),
InternalHostCallback, callback_ptr);
return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
InternalHostCallback, callback_ptr);
}
/* static */ void CUDAExecutor::InternalHostCallback(CUstream stream,
CUresult status,
void *data) {
/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
CUresult status,
void* data) {
std::function<void()> *callback =
reinterpret_cast<std::function<void()> *>(data);
(*callback)();
delete callback;
}
port::Status CUDAExecutor::AllocateEvent(Event *event) {
return AsCUDAEvent(event)->Init();
port::Status GpuExecutor::AllocateEvent(Event* event) {
return AsGpuEvent(event)->Init();
}
port::Status CUDAExecutor::DeallocateEvent(Event *event) {
return AsCUDAEvent(event)->Destroy();
port::Status GpuExecutor::DeallocateEvent(Event* event) {
return AsGpuEvent(event)->Destroy();
}
port::Status CUDAExecutor::RecordEvent(Stream *stream, Event *event) {
return AsCUDAEvent(event)->Record(AsCUDAStream(stream));
port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
return AsGpuEvent(event)->Record(AsGpuStream(stream));
}
port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
if (CUDADriver::WaitStreamOnEvent(context_,
AsCUDAStream(stream)->cuda_stream(),
AsCUDAEvent(event)->cuda_event())) {
port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
AsGpuEvent(event)->gpu_event())) {
return port::Status::OK();
} else {
return port::Status(
@ -712,61 +721,61 @@ port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
}
}
Event::Status CUDAExecutor::PollForEventStatus(Event *event) {
return AsCUDAEvent(event)->PollForStatus();
Event::Status GpuExecutor::PollForEventStatus(Event* event) {
return AsGpuEvent(event)->PollForStatus();
}
bool CUDAExecutor::AllocateStream(Stream *stream) {
return AsCUDAStream(stream)->Init();
bool GpuExecutor::AllocateStream(Stream* stream) {
return AsGpuStream(stream)->Init();
}
void CUDAExecutor::DeallocateStream(Stream *stream) {
CUDAStream *cuda_stream = AsCUDAStream(stream);
void GpuExecutor::DeallocateStream(Stream* stream) {
GpuStream* cuda_stream = AsGpuStream(stream);
if (!cuda_stream->IsIdle()) {
LOG(ERROR) << "Deallocating stream with pending work";
}
cuda_stream->Destroy();
}
bool CUDAExecutor::AllocateTimer(Timer *timer) {
return AsCUDATimer(timer)->Init();
bool GpuExecutor::AllocateTimer(Timer* timer) {
return AsGpuTimer(timer)->Init();
}
void CUDAExecutor::DeallocateTimer(Timer *timer) {
AsCUDATimer(timer)->Destroy();
void GpuExecutor::DeallocateTimer(Timer* timer) {
AsGpuTimer(timer)->Destroy();
}
bool CUDAExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
CUevent other_completed_event = *AsCUDAStream(other)->completed_event();
bool ok = CUDADriver::RecordEvent(context_, other_completed_event,
AsCUDAStreamValue(other))
.ok();
bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
CUevent other_completed_event = *AsGpuStream(other)->completed_event();
bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
AsGpuStreamValue(other))
.ok();
if (!ok) {
LOG(ERROR) << "failed to record completion event; "
"therefore, failed to create inter-stream dependency";
return false;
}
return CUDADriver::WaitStreamOnEvent(context_, AsCUDAStreamValue(dependent),
other_completed_event);
return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
other_completed_event);
}
bool CUDAExecutor::StartTimer(Stream *stream, Timer *timer) {
return AsCUDATimer(timer)->Start(AsCUDAStream(stream));
bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}
bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}
port::Status CUDAExecutor::BlockHostUntilDone(Stream *stream) {
return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}
blas::BlasSupport *CUDAExecutor::CreateBlas() {
blas::BlasSupport* GpuExecutor::CreateBlas() {
PluginRegistry *registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::BlasFactory> status =
registry->GetFactory<PluginRegistry::BlasFactory>(kCudaPlatformId,
registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
plugin_config_.blas());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve BLAS factory: "
@ -777,10 +786,10 @@ blas::BlasSupport *CUDAExecutor::CreateBlas() {
return status.ValueOrDie()(this);
}
dnn::DnnSupport *CUDAExecutor::CreateDnn() {
dnn::DnnSupport* GpuExecutor::CreateDnn() {
PluginRegistry *registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::DnnFactory> status =
registry->GetFactory<PluginRegistry::DnnFactory>(kCudaPlatformId,
registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
plugin_config_.dnn());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve DNN factory: "
@ -791,10 +800,10 @@ dnn::DnnSupport *CUDAExecutor::CreateDnn() {
return status.ValueOrDie()(this);
}
fft::FftSupport *CUDAExecutor::CreateFft() {
fft::FftSupport* GpuExecutor::CreateFft() {
PluginRegistry *registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::FftFactory> status =
registry->GetFactory<PluginRegistry::FftFactory>(kCudaPlatformId,
registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
plugin_config_.fft());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve FFT factory: "
@ -805,10 +814,10 @@ fft::FftSupport *CUDAExecutor::CreateFft() {
return status.ValueOrDie()(this);
}
rng::RngSupport *CUDAExecutor::CreateRng() {
rng::RngSupport* GpuExecutor::CreateRng() {
PluginRegistry *registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::RngFactory> status =
registry->GetFactory<PluginRegistry::RngFactory>(kCudaPlatformId,
registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
plugin_config_.rng());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve RNG factory: "
@ -820,23 +829,21 @@ rng::RngSupport *CUDAExecutor::CreateRng() {
}
// TODO(rspringer): Remove in b/18544742.
bool CUDAExecutor::SupportsDnn() const {
return true;
bool GpuExecutor::SupportsDnn() const { return true; }
bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}
bool CUDAExecutor::CanEnablePeerAccessTo(StreamExecutorInterface *other) {
CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
return CUDADriver::CanEnablePeerAccess(context_, cuda_other->context_);
port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}
port::Status CUDAExecutor::EnablePeerAccessTo(StreamExecutorInterface *other) {
CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
return CUDADriver::EnablePeerAccess(context_, cuda_other->context_);
}
SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
port::StatusOr<CUsharedconfig> cuda_config =
CUDADriver::ContextGetSharedMemConfig(context_);
GpuDriver::ContextGetSharedMemConfig(context_);
if (!cuda_config.ok()) {
// Don't log; the failed call will log necessary output.
return SharedMemoryConfig::kDefault;
@ -855,7 +862,7 @@ SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
}
}
port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
SharedMemoryConfig config) {
CUsharedconfig cuda_config;
switch (config) {
@ -872,21 +879,21 @@ port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
LOG(FATAL) << "Invalid shared memory configuration specified: "
<< static_cast<int>(config);
}
return CUDADriver::ContextSetSharedMemConfig(context_, cuda_config);
return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
}
bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}
bool CUDAExecutor::GetSymbol(const string &symbol_name,
ModuleHandle module_handle, void **mem,
size_t *bytes) {
bool GpuExecutor::GetSymbol(const string& symbol_name,
ModuleHandle module_handle, void** mem,
size_t* bytes) {
auto lookup_in_module = [&](CUmodule module) {
CHECK(module != nullptr);
return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
reinterpret_cast<CUdeviceptr *>(mem),
bytes);
return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
reinterpret_cast<CUdeviceptr*>(mem),
bytes);
};
{ // give limited scope to mutex_lock
@ -908,13 +915,13 @@ bool CUDAExecutor::GetSymbol(const string &symbol_name,
return false;
}
bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
// The BlockDim name is a mismatch against these GRID_DIM_* queries because
// we use BlockDims to express the dimensions of blocks within a grid
// (as opposed to ThreadDim which expresses the dimensions of threads
// within a block).
int x, y, z;
if (!CUDADriver::GetGridLimits(&x, &y, &z, device_)) {
if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
return false;
}
@ -924,35 +931,35 @@ bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
return true;
}
bool CUDAExecutor::SupportsBlas() const { return true; }
bool GpuExecutor::SupportsBlas() const { return true; }
bool CUDAExecutor::SupportsFft() const { return true; }
bool GpuExecutor::SupportsFft() const { return true; }
bool CUDAExecutor::SupportsRng() const { return true; }
bool GpuExecutor::SupportsRng() const { return true; }
std::unique_ptr<internal::EventInterface>
CUDAExecutor::CreateEventImplementation() {
return std::unique_ptr<internal::EventInterface>(new CUDAEvent(this));
GpuExecutor::CreateEventImplementation() {
return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}
std::unique_ptr<internal::KernelInterface>
CUDAExecutor::CreateKernelImplementation() {
return std::unique_ptr<internal::KernelInterface>(new CUDAKernel());
GpuExecutor::CreateKernelImplementation() {
return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}
std::unique_ptr<internal::StreamInterface>
CUDAExecutor::GetStreamImplementation() {
return std::unique_ptr<internal::StreamInterface>(new CUDAStream(this));
GpuExecutor::GetStreamImplementation() {
return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}
std::unique_ptr<internal::TimerInterface>
CUDAExecutor::GetTimerImplementation() {
return std::unique_ptr<internal::TimerInterface>(new CUDATimer(this));
GpuExecutor::GetTimerImplementation() {
return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}
void *CUDAExecutor::GpuContextHack() { return context_; }
void* GpuExecutor::GpuContextHack() { return context_; }
CudaContext* CUDAExecutor::cuda_context() { return context_; }
GpuContext* GpuExecutor::gpu_context() { return context_; }
// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
@ -1019,21 +1026,21 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
#endif
}
DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
internal::DeviceDescriptionBuilder builder;
{
int driver_version = 0;
(void)CUDADriver::GetDriverVersion(&driver_version);
(void)GpuDriver::GetDriverVersion(&driver_version);
string augmented_driver_version = port::Printf(
"%d (%s)", driver_version,
DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
.c_str());
builder.set_driver_version(augmented_driver_version);
}
{
string pci_bus_id = CUDADriver::GetPCIBusID(device_);
string pci_bus_id = GpuDriver::GetPCIBusID(device_);
// Lower the hex characters to match sysfs.
pci_bus_id = port::Lowercase(pci_bus_id);
@ -1046,43 +1053,43 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
{
builder.set_threads_per_block_limit(
CUDADriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device_)
GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
device_)
.ValueOrDie());
ThreadDim thread_dim_limit;
thread_dim_limit.x = CUDADriver::GetDeviceAttribute(
thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device_)
.ValueOrDie();
thread_dim_limit.y = CUDADriver::GetDeviceAttribute(
thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device_)
.ValueOrDie();
thread_dim_limit.z = CUDADriver::GetDeviceAttribute(
thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device_)
.ValueOrDie();
builder.set_thread_dim_limit(thread_dim_limit);
int clock_rate =
CUDADriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
.ValueOrDie();
builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
}
{
bool ecc_enabled = false;
(void)CUDADriver::IsEccEnabled(device_, &ecc_enabled);
(void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
builder.set_ecc_enabled(ecc_enabled);
}
{
uint64 device_memory_size = -1;
(void)CUDADriver::GetDeviceTotalMemory(device_, &device_memory_size);
(void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
builder.set_device_memory_size(device_memory_size);
}
port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
// Times 2 because HBM is DDR memory; it gets two data bits per each data
@ -1100,7 +1107,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
{
string device_name;
(void)CUDADriver::GetDeviceName(device_, &device_name);
(void)GpuDriver::GetDeviceName(device_, &device_name);
builder.set_name(device_name);
}
@ -1114,19 +1121,19 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
builder.set_device_vendor("NVIDIA Corporation");
builder.set_cuda_compute_capability(cc_major_, cc_minor_);
builder.set_shared_memory_per_core(
CUDADriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
builder.set_shared_memory_per_block(
CUDADriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
builder.set_core_count(
CUDADriver::GetMultiprocessorCount(device_).ValueOrDie());
GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
builder.set_threads_per_core_limit(
CUDADriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
builder.set_registers_per_block_limit(
CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
builder.set_threads_per_warp(
CUDADriver::GetThreadsPerWarp(device_).ValueOrDie());
GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
builder.set_registers_per_core_limit(
CUDADriver::GetDeviceAttribute(
GpuDriver::GetDeviceAttribute(
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
.ValueOrDie());
@ -1134,11 +1141,11 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
return built.release();
}
} // namespace cuda
} // namespace gpu
void initialize_cuda_gpu_executor() {
*internal::MakeCUDAExecutorImplementation() = [](const PluginConfig &config) {
return new cuda::CUDAExecutor{config};
*internal::MakeCUDAExecutorImplementation() = [](const PluginConfig& config) {
return new gpu::GpuExecutor{config};
};
}

View File

@ -22,289 +22,12 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
#include <set>
#include <unordered_map>
#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
namespace stream_executor {
namespace cuda {
// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class CUDAExecutor : public internal::StreamExecutorInterface {
public:
// sub_platform indicates the subplatform used in this executor; it must
// be a CUDA type.
explicit CUDAExecutor(const PluginConfig &plugin_config)
: device_(0),
context_(nullptr),
device_ordinal_(0),
cc_major_(0),
cc_minor_(0),
plugin_config_(plugin_config) {}
// See the corresponding StreamExecutor methods for method comments on the
// following overrides.
~CUDAExecutor() override;
port::Status Init(int device_ordinal, DeviceOptions device_options) override;
bool GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) override;
void UnloadKernel(const KernelBase *kernel) override;
bool LoadModule(const MultiModuleLoaderSpec &spec,
ModuleHandle *module_handle) override;
bool UnloadModule(ModuleHandle module_handle) override;
bool Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) override;
int CalculateOccupancy(const DeviceDescription &device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim &thread_dims, CUfunction func);
int CompareOccupancy(int *initial_blocks,
const DeviceDescription &device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim &thread_dims, CUfunction func);
void *Allocate(uint64 size) override;
void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
uint64 size_bytes) override;
void Deallocate(DeviceMemoryBase *mem) override;
void *UnifiedMemoryAllocate(uint64 size) override {
return CUDADriver::UnifiedMemoryAllocate(context_, size);
}
void UnifiedMemoryDeallocate(void *location) override {
return CUDADriver::UnifiedMemoryDeallocate(context_, location);
}
// CUDA allocation/registration functions are necessary because the driver
// internally sets up buffers for DMA operations (and page locks them).
// There's no external interface for us to otherwise control these DMA
// settings.
void *HostMemoryAllocate(uint64 size) override {
return CUDADriver::HostAllocate(context_, size);
}
void HostMemoryDeallocate(void *location) override {
return CUDADriver::HostDeallocate(context_, location);
}
bool HostMemoryRegister(void *location, uint64 size) override;
bool HostMemoryUnregister(void *location) override;
bool SynchronizeAllActivity() override;
bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;
bool SynchronousMemSet(DeviceMemoryBase *location, int value,
uint64 size) override;
port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
const void *host_src, uint64 size) override;
port::Status SynchronousMemcpy(void *host_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) override;
port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) override;
bool MemZero(Stream *stream, DeviceMemoryBase *location,
uint64 size) override;
bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
uint64 size) override;
bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
uint64 size) override;
bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src,
uint64 size) override;
bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src,
uint64 size) override;
bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
const DeviceMemoryBase &gpu_src,
uint64 size) override;
bool HostCallback(Stream *stream,
std::function<port::Status()> callback) override;
bool AllocateStream(Stream *stream) override;
void DeallocateStream(Stream *stream) override;
bool CreateStreamDependency(Stream *dependent, Stream *other) override;
bool AllocateTimer(Timer *timer) override;
void DeallocateTimer(Timer *timer) override;
bool StartTimer(Stream *stream, Timer *timer) override;
bool StopTimer(Stream *stream, Timer *timer) override;
port::Status AllocateEvent(Event *event) override;
port::Status DeallocateEvent(Event *event) override;
port::Status RecordEvent(Stream *stream, Event *event) override;
port::Status WaitForEvent(Stream *stream, Event *event) override;
Event::Status PollForEventStatus(Event *event) override;
port::Status BlockHostUntilDone(Stream *stream) override;
int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override;
bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override;
SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
bool DeviceMemoryUsage(int64 *free, int64 *total) const override;
// Search for the symbol and returns a device pointer and size.
// Returns false if symbol does not exist.
bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
void **mem, size_t *bytes) override;
DeviceDescription *PopulateDeviceDescription() const override;
// Populates the block_dim_limit by querying the device driver API. If an
// error occurs at any point while asking the driver for block dim limits, it
// will be only partially populated as a result, and an error will be logged.
bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
bool SupportsBlas() const override;
blas::BlasSupport *CreateBlas() override;
bool SupportsFft() const override;
fft::FftSupport *CreateFft() override;
bool SupportsRng() const override;
rng::RngSupport *CreateRng() override;
bool SupportsDnn() const override;
dnn::DnnSupport *CreateDnn() override;
std::unique_ptr<internal::EventInterface> CreateEventImplementation()
override;
std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
override;
std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
void *GpuContextHack() override;
CudaContext* cuda_context();
private:
// Attempts to find a more specific version of the file indicated by
// filename by looking for compute-capability-specific suffixed versions; i.e.
// looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
// we're on a compute capability 3.0 machine.
bool FindOnDiskForComputeCapability(absl::string_view filename,
absl::string_view canonical_suffix,
string *found_filename) const;
// Host callback landing routine invoked by CUDA.
// data: User-provided callback provided to HostCallback() above, captured
// as a std::function<void()>. Allocated/initialized inside
// HostCallback() and owned and deleted by this call.
static void InternalHostCallback(CUstream stream, CUresult status,
void *data);
// Collects metadata for the specified kernel.
bool GetKernelMetadata(CUDAKernel *cuda_kernel,
KernelMetadata *kernel_metadata);
// Prints to VLOG(2) information about the kernel's occupancy and how it might
// be improved.
void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
const BlockDim &block_dims);
bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
bool UnloadGpuBinary(const void *gpu_binary)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// Guards the in-memory-module mapping.
mutex in_memory_modules_mu_;
// Kernel -> loaded GPU binary. Many kernels may load the same binary.
std::unordered_map<const KernelBase *, const void *> kernel_to_gpu_binary_
GUARDED_BY(in_memory_modules_mu_);
// GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
std::unordered_map<const void *, std::pair<CUmodule, uint64>>
gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
// Guards the launched kernel set.
mutex launched_kernels_mu_;
// Keeps track of the set of launched kernels. Currently used to suppress the
// occupancy check on subsequent launches.
std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_);
// Handle for the CUDA device being operated on. Immutable
// post-initialization.
CUdevice device_;
// Handle for session with the library/driver. Immutable post-initialization.
CudaContext* context_;
// The device ordinal value that this executor was initialized with; recorded
// for use in getting device metadata. Immutable post-initialization.
int device_ordinal_;
// The major version of the compute capability for device_.
int cc_major_;
// The minor version of the compute capability for device_.
int cc_minor_;
// The plugin configuration associated with this instance.
PluginConfig plugin_config_;
SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
};
using CUDAExecutor = gpu::GpuExecutor;
} // namespace cuda
} // namespace stream_executor
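The alias on the line above is what keeps pre-existing call sites compiling unchanged. A minimal sketch with a hypothetical call site (ConfigureExecutor is illustrative, not a TensorFlow function):
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
// Pre-existing code keeps using the old spelling...
void ConfigureExecutor(stream_executor::cuda::CUDAExecutor* exec);
// ...while new code can name the shared implementation directly; both
// declarations refer to the same function because the types are identical.
void ConfigureExecutor(stream_executor::gpu::GpuExecutor* exec);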

View File

@ -17,88 +17,9 @@ limitations under the License.
//
// These are typically placed here for use by multiple source components (for
// example, BLAS and executor components).
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
#include <stddef.h>
#include <complex>
#include "cuda/include/cuComplex.h"
namespace stream_executor {
template <typename ElemT>
class DeviceMemory;
namespace cuda {
// Converts a const DeviceMemory reference to its underlying typed pointer in
// CUDA
// device memory.
template <typename T>
const T *CUDAMemory(const DeviceMemory<T> &mem) {
return static_cast<const T *>(mem.opaque());
}
// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
// pointer in CUDA device memory.
template <typename T>
T *CUDAMemoryMutable(DeviceMemory<T> *mem) {
return static_cast<T *>(mem->opaque());
}
static_assert(sizeof(std::complex<float>) == sizeof(cuComplex),
"std::complex<float> and cuComplex should have the same size");
static_assert(offsetof(cuComplex, x) == 0,
"The real part of cuComplex should appear first.");
static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
"std::complex<double> and cuDoubleComplex should have the same "
"size");
static_assert(offsetof(cuDoubleComplex, x) == 0,
"The real part of cuDoubleComplex should appear first.");
// Type traits to get CUDA complex types from std::complex<>.
template <typename T>
struct CUDAComplexT {
typedef T type;
};
template <>
struct CUDAComplexT<std::complex<float>> {
typedef cuComplex type;
};
template <>
struct CUDAComplexT<std::complex<double>> {
typedef cuDoubleComplex type;
};
// Converts pointers of std::complex<> to pointers of
// cuComplex/cuDoubleComplex. No type conversion for non-complex types.
template <typename T>
inline const typename CUDAComplexT<T>::type *CUDAComplex(const T *p) {
return reinterpret_cast<const typename CUDAComplexT<T>::type *>(p);
}
template <typename T>
inline typename CUDAComplexT<T>::type *CUDAComplex(T *p) {
return reinterpret_cast<typename CUDAComplexT<T>::type *>(p);
}
// Converts values of std::complex<float/double> to values of
// cuComplex/cuDoubleComplex.
inline cuComplex CUDAComplexValue(std::complex<float> val) {
return {val.real(), val.imag()};
}
inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
return {val.real(), val.imag()};
}
} // namespace cuda
} // namespace stream_executor
#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
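The helpers that used to live here (CUDAMemory, CUDAMemoryMutable, CUDAComplex, CUDAComplexValue and the layout static_asserts) now come from gpu/gpu_helpers.h; the rest of the diff uses the gpu:: spellings (e.g. GpuMemoryMutable in cuda_rng.cc below). A short usage sketch based on the definitions removed above, assuming the cuda:: spellings stay reachable through this header after the move:
#include <complex>
#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
void ComplexHelperSketch() {
  std::complex<float> host_alpha(1.0f, -2.0f);
  // Value conversion: builds a cuComplex from the real/imaginary parts.
  cuComplex by_value = stream_executor::cuda::CUDAComplexValue(host_alpha);
  // Pointer reinterpret: valid because the static_asserts guarantee that
  // std::complex<float> and cuComplex share size and layout.
  const cuComplex* as_cu = stream_executor::cuda::CUDAComplex(&host_alpha);
  (void)by_value;
  (void)as_cu;
}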

View File

@ -0,0 +1,38 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
namespace stream_executor {
namespace gpu {
CUfunc_cache GpuKernel::GetGpuCacheConfig() const {
switch (preferred_cache_config_) {
case KernelCacheConfig::kNoPreference:
return CU_FUNC_CACHE_PREFER_NONE;
case KernelCacheConfig::kPreferShared:
return CU_FUNC_CACHE_PREFER_SHARED;
case KernelCacheConfig::kPreferL1:
return CU_FUNC_CACHE_PREFER_L1;
case KernelCacheConfig::kPreferEqual:
return CU_FUNC_CACHE_PREFER_EQUAL;
default:
LOG(FATAL) << "Unknown KernelCacheConfig"
<< static_cast<int32>(preferred_cache_config_);
}
}
} // namespace gpu
} // namespace stream_executor

View File

@ -22,104 +22,12 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "cuda/include/cuda.h"
#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
"No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif
#ifdef __CUDA_RUNTIME_H__
#error \
"CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
namespace stream_executor {
namespace cuda {
// Wraps a CUfunction to implement the platform-independent KernelInterface.
class CUDAKernel : public internal::KernelInterface {
public:
CUDAKernel() : cuda_function_(nullptr), arity_(0),
preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
// Note that the function is unloaded when the module is unloaded, and the
// module that the function is contained in is owned by the CUDAExecutor.
~CUDAKernel() override {}
// As arity cannot be reflected upon using the CUDA API, the arity is
// explicitly set during the CUDAExecutor::GetKernel initialization process.
void set_arity(unsigned arity) { arity_ = arity; }
unsigned Arity() const override { return arity_; }
// Returns the CUfunction value for passing to the CUDA API.
CUfunction AsCUDAFunctionValue() const {
DCHECK(cuda_function_ != nullptr);
return const_cast<CUfunction>(cuda_function_);
}
// Returns the slot that the CUfunction is stored within for this object,
// for the CUDA API which wants to load into a CUfunction*.
CUfunction *cuda_function_ptr() { return &cuda_function_; }
// CUDA supports setting the preferred cache configuration of a CUfunction
// (more-or-less equivalent to a CUDAKernel). We support this via the below
// functions; users can set a preference, and that is applied when the kernel
// is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to
// load the kernel & set the preference when the user calls the setter below;
// either approach is valid.
// Sets the current kernel cache configuration preference.
void SetPreferredCacheConfig(KernelCacheConfig config) override {
preferred_cache_config_ = config;
}
// Returns the current kernel cache configuration preference.
KernelCacheConfig GetPreferredCacheConfig() const override {
return preferred_cache_config_;
}
// Returns the current kernel cache configuration preference as a
// CUfunc_cache.
CUfunc_cache GetCUDACacheConfig() const {
switch (preferred_cache_config_) {
case KernelCacheConfig::kNoPreference:
return CU_FUNC_CACHE_PREFER_NONE;
case KernelCacheConfig::kPreferShared:
return CU_FUNC_CACHE_PREFER_SHARED;
case KernelCacheConfig::kPreferL1:
return CU_FUNC_CACHE_PREFER_L1;
case KernelCacheConfig::kPreferEqual:
return CU_FUNC_CACHE_PREFER_EQUAL;
default:
LOG(FATAL) << "Unknown KernelCacheConfig"
<< static_cast<int32>(preferred_cache_config_);
}
}
private:
CUfunction cuda_function_; // Wrapped CUDA kernel handle.
unsigned arity_; // Number of formal parameters the kernel takes.
// Preferred (but not required) cache configuration for this kernel.
KernelCacheConfig preferred_cache_config_;
};
// Given a platform-independent kernel datatype, returns the (const) internal
// CUDA platform implementation pointer.
inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
return static_cast<const CUDAKernel *>(kernel->implementation());
}
// Given a platform-independent kernel datatype, returns the (non-const)
// internal CUDA platform implementation pointer.
inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
return static_cast<CUDAKernel *>(kernel->implementation());
}
using CUDAKernel = gpu::GpuKernel;
} // namespace cuda
} // namespace stream_executor
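The cache-config mechanism described in the removed comment is unchanged in behavior: a caller records a preference on the kernel object, and GpuExecutor::Launch applies it lazily by translating it to a CUfunc_cache (GetGpuCacheConfig, now in cuda_kernel.cc) and calling GpuDriver::FuncSetCacheConfig. A minimal sketch of that flow; 'kernel' stands for any loaded KernelBase and the shared-memory preference is illustrative:
#include "tensorflow/stream_executor/kernel.h"
void PreferSharedMemory(stream_executor::KernelBase* kernel) {
  // Record a preference; nothing reaches the driver yet.
  kernel->SetPreferredCacheConfig(
      stream_executor::KernelCacheConfig::kPreferShared);
  // Later, inside GpuExecutor::Launch (earlier hunk), the preference is
  // translated via GetGpuCacheConfig() and pushed to the driver:
  //   GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
}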

View File

@ -25,7 +25,7 @@ limitations under the License.
#include "tensorflow/stream_executor/lib/stringprintf.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
namespace {
// Synchronize with spinlocks.
@ -129,16 +129,16 @@ port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus(
port::Printf("Executor for bus %d not found.", bus_ordinal));
}
Platform::Id CudaPlatform::id() const { return kCudaPlatformId; }
Platform::Id CudaPlatform::id() const { return cuda::kCudaPlatformId; }
int CudaPlatform::VisibleDeviceCount() const {
// Throw away the result - it logs internally, and this [containing] function
// isn't in the path of user control. It's safe to call this > 1x.
if (!cuda::CUDADriver::Init().ok()) {
if (!gpu::GpuDriver::Init().ok()) {
return -1;
}
return CUDADriver::GetDeviceCount();
return GpuDriver::GetDeviceCount();
}
const string& CudaPlatform::Name() const { return name_; }
@ -169,7 +169,7 @@ port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor(
port::StatusOr<std::unique_ptr<StreamExecutor>>
CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
auto executor = MakeUnique<StreamExecutor>(
this, MakeUnique<CUDAExecutor>(config.plugin_config));
this, MakeUnique<GpuExecutor>(config.plugin_config));
auto init_status = executor->Init(config.ordinal, config.device_options);
if (!init_status.ok()) {
return port::Status(
@ -191,13 +191,13 @@ void CudaPlatform::UnregisterTraceListener(TraceListener* listener) {
LOG(FATAL) << "not yet implemented: unregister CUDA trace listener";
}
} // namespace cuda
} // namespace gpu
static void InitializeCudaPlatform() {
// Disabling leak checking, MultiPlatformManager does not destroy its
// registered platforms.
std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform);
std::unique_ptr<gpu::CudaPlatform> platform(new gpu::CudaPlatform);
SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
}

View File

@ -32,7 +32,7 @@ limitations under the License.
#include "tensorflow/stream_executor/trace_listener.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
// Opaque and unique identifier for the CUDA platform plugin.
// This is needed so that plugins can refer to/identify this platform without
@ -102,6 +102,12 @@ class CudaPlatform : public Platform {
SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform);
};
} // namespace gpu
namespace cuda {
using CudaPlatform = gpu::CudaPlatform;
} // namespace cuda
} // namespace stream_executor

View File

@ -58,33 +58,33 @@ std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
}
namespace stream_executor {
namespace cuda {
namespace gpu {
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
namespace wrap {
#ifdef PLATFORM_GOOGLE
#define STREAM_EXECUTOR_CURAND_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
#define STREAM_EXECUTOR_CURAND_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
curandStatus_t operator()(GpuExecutor* parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
} __name;
#else
#define STREAM_EXECUTOR_CURAND_WRAP(__name) \
struct DynLoadShim__##__name { \
static const char *kName; \
static const char* kName; \
using FuncPtrT = std::add_pointer<decltype(::__name)>::type; \
static void *GetDsoHandle() { \
static void* GetDsoHandle() { \
auto s = internal::CachedDsoLoader::GetCurandDsoHandle(); \
return s.ValueOrDie(); \
} \
static FuncPtrT LoadOrDie() { \
void *f; \
void* f; \
auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
kName, &f); \
CHECK(s.ok()) << "could not find " << kName \
@ -96,12 +96,12 @@ namespace wrap {
return f; \
} \
template <typename... Args> \
curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
cuda::ScopedActivateExecutorContext sac{parent}; \
curandStatus_t operator()(GpuExecutor* parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return DynLoad()(args...); \
} \
} __name; \
const char *DynLoadShim__##__name::kName = #__name;
const char* DynLoadShim__##__name::kName = #__name;
#endif
STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
@ -116,38 +116,15 @@ STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormalDouble);
} // namespace wrap
template <typename T>
string TypeString();
GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
template <>
string TypeString<float>() {
return "float";
}
template <>
string TypeString<double>() {
return "double";
}
template <>
string TypeString<std::complex<float>>() {
return "std::complex<float>";
}
template <>
string TypeString<std::complex<double>>() {
return "std::complex<double>";
}
CUDARng::CUDARng(CUDAExecutor *parent) : parent_(parent), rng_(nullptr) {}
CUDARng::~CUDARng() {
GpuRng::~GpuRng() {
if (rng_ != nullptr) {
wrap::curandDestroyGenerator(parent_, rng_);
}
}
bool CUDARng::Init() {
bool GpuRng::Init() {
mutex_lock lock(mu_);
CHECK(rng_ == nullptr);
@ -162,9 +139,9 @@ bool CUDARng::Init() {
return true;
}
bool CUDARng::SetStream(Stream *stream) {
bool GpuRng::SetStream(Stream* stream) {
curandStatus_t ret =
wrap::curandSetStream(parent_, rng_, AsCUDAStreamValue(stream));
wrap::curandSetStream(parent_, rng_, AsGpuStreamValue(stream));
if (ret != CURAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set stream for random generation: " << ret;
return false;
@ -182,8 +159,7 @@ constexpr bool ComplexIsConsecutiveFloats() {
}
template <typename T>
bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
DeviceMemory<T> *v) {
bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
mutex_lock lock(mu_);
static_assert(ComplexIsConsecutiveFloats(),
"std::complex values are not stored as consecutive values");
@ -203,11 +179,11 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
if (std::is_same<T, float>::value ||
std::is_same<T, std::complex<float>>::value) {
ret = wrap::curandGenerateUniform(
parent_, rng_, reinterpret_cast<float *>(CUDAMemoryMutable(v)),
parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
element_count);
} else {
ret = wrap::curandGenerateUniformDouble(
parent_, rng_, reinterpret_cast<double *>(CUDAMemoryMutable(v)),
parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
element_count);
}
if (ret != CURAND_STATUS_SUCCESS) {
@ -220,29 +196,29 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
return true;
}
bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool CUDARng::DoPopulateRandUniform(Stream *stream,
DeviceMemory<std::complex<float>> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<float>>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool CUDARng::DoPopulateRandUniform(Stream *stream,
DeviceMemory<std::complex<double>> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<double>>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
template <typename ElemT, typename FuncT>
bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
ElemT stddev,
DeviceMemory<ElemT> *v,
FuncT func) {
bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
ElemT stddev,
DeviceMemory<ElemT>* v,
FuncT func) {
mutex_lock lock(mu_);
if (!SetStream(stream)) {
@ -251,7 +227,7 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
uint64 element_count = v->ElementCount();
curandStatus_t ret =
func(parent_, rng_, CUDAMemoryMutable(v), element_count, mean, stddev);
func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);
if (ret != CURAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
@ -262,19 +238,19 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
return true;
}
bool CUDARng::DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
DeviceMemory<float> *v) {
bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
DeviceMemory<float>* v) {
return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
wrap::curandGenerateNormal);
}
bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
DeviceMemory<double> *v) {
bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
DeviceMemory<double>* v) {
return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
wrap::curandGenerateNormalDouble);
}
bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
mutex_lock lock(mu_);
CHECK(rng_ != nullptr);
@ -303,15 +279,15 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
return true;
}
} // namespace cuda
} // namespace gpu
void initialize_curand() {
port::Status status =
PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
cuda::kCudaPlatformId, cuda::kCuRandPlugin, "cuRAND",
[](internal::StreamExecutorInterface *parent) -> rng::RngSupport * {
cuda::CUDAExecutor *cuda_executor =
dynamic_cast<cuda::CUDAExecutor *>(parent);
cuda::kCudaPlatformId, gpu::kGpuRandPlugin, "cuRAND",
[](internal::StreamExecutorInterface* parent) -> rng::RngSupport* {
gpu::GpuExecutor* cuda_executor =
dynamic_cast<gpu::GpuExecutor*>(parent);
if (cuda_executor == nullptr) {
LOG(ERROR)
<< "Attempting to initialize an instance of the cuRAND "
@ -319,7 +295,7 @@ void initialize_curand() {
return nullptr;
}
cuda::CUDARng *rng = new cuda::CUDARng(cuda_executor);
gpu::GpuRng* rng = new gpu::GpuRng(cuda_executor);
if (!rng->Init()) {
// Note: Init() will log a more specific error.
delete rng;
@ -334,7 +310,7 @@ void initialize_curand() {
}
PluginRegistry::Instance()->SetDefaultFactory(
cuda::kCudaPlatformId, PluginKind::kRng, cuda::kCuRandPlugin);
cuda::kCudaPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
}
} // namespace stream_executor
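
For orientation, a rough client-side sketch (not part of this change; it assumes the standard StreamExecutor headers, a CUDA build, and the StatusOr/ValueOrDie convention) of how the plugin registered above is reached: enqueuing uniform random generation on a Stream routes through the factory to gpu::GpuRng.

#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"

void FillUniformOnDevice() {
  namespace se = stream_executor;
  // Look up the CUDA platform and its first device.
  se::Platform* platform =
      se::MultiPlatformManager::PlatformWithName("CUDA").ValueOrDie();
  se::StreamExecutor* executor = platform->ExecutorForDevice(0).ValueOrDie();

  // Allocate device memory and enqueue RNG work; this ends up in
  // gpu::GpuRng::DoPopulateRandUniform via the factory registered in
  // initialize_curand().
  se::DeviceMemory<float> data = executor->AllocateArray<float>(1024);
  se::Stream stream(executor);
  stream.Init();
  stream.ThenPopulateRandUniform(&data);
  stream.BlockHostUntilDone();
  executor->Deallocate(&data);
}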


@ -16,85 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rng.h"
typedef struct curandGenerator_st *curandGenerator_t;
#include "tensorflow/stream_executor/gpu/gpu_rng.h"
namespace stream_executor {
class Stream;
template <typename ElemT>
class DeviceMemory;
namespace cuda {
// Opaque and unique identifier for the cuRAND plugin.
extern const PluginId kCuRandPlugin;
class CUDAExecutor;
// CUDA-platform implementation of the random number generation support
// interface.
//
// Thread-safe post-initialization.
class CUDARng : public rng::RngSupport {
public:
explicit CUDARng(CUDAExecutor *parent);
// Retrieves a curand library generator handle. This is necessary for
// enqueuing random number generation work onto the device.
// TODO(leary) provide a way for users to select the RNG algorithm.
bool Init();
// Releases a curand library generator handle, if one was acquired.
~CUDARng() override;
// See rng::RngSupport for details on the following overrides.
bool DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) override;
bool DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) override;
bool DoPopulateRandUniform(Stream *stream,
DeviceMemory<std::complex<float>> *v) override;
bool DoPopulateRandUniform(Stream *stream,
DeviceMemory<std::complex<double>> *v) override;
bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
DeviceMemory<float> *v) override;
bool DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
DeviceMemory<double> *v) override;
bool SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) override;
private:
// Actually performs the work of generating random numbers - the public
// methods are thin wrappers to this interface.
template <typename T>
bool DoPopulateRandUniformInternal(Stream *stream, DeviceMemory<T> *v);
template <typename ElemT, typename FuncT>
bool DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, ElemT stddev,
DeviceMemory<ElemT> *v, FuncT func);
// Sets the stream for the internal curand generator.
//
// This is a stateful operation, as the handle can only have one stream set at
// a given time, so it is usually performed right before enqueuing work to do
// with random number generation.
bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
// mutex that guards the cuRAND handle for this device.
mutex mu_;
// CUDAExecutor which instantiated this CUDARng.
// Immutable post-initialization.
CUDAExecutor *parent_;
// cuRAND library handle on the device.
curandGenerator_t rng_ GUARDED_BY(mu_);
SE_DISALLOW_COPY_AND_ASSIGN(CUDARng);
};
using CUDARng = gpu::GpuRng;
} // namespace cuda
} // namespace stream_executor
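
A minimal sketch of what the alias means for existing call sites (a hand-written example, assuming the usual gpu_executor.h header): cuda::CUDARng and gpu::GpuRng now name the same type, so code written against the CUDA spelling keeps compiling unchanged.

#include "tensorflow/stream_executor/cuda/cuda_rng.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"

namespace se = stream_executor;

// The returned pointer can be used wherever either spelling is expected.
se::cuda::CUDARng* MakeRng(se::gpu::GpuExecutor* executor) {
  return new se::gpu::GpuRng(executor);
}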


@ -13,79 +13,22 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Defines the CUDAStream type - the CUDA-specific implementation of the generic
// Defines the GpuStream type - the CUDA-specific implementation of the generic
// StreamExecutor Stream interface.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
namespace stream_executor {
namespace cuda {
class CUDAExecutor;
using CUDAStream = gpu::GpuStream;
// Wraps a CUstream in order to satisfy the platform-independent
// StreamInterface.
//
// Thread-safe post-initialization.
class CUDAStream : public internal::StreamInterface {
public:
explicit CUDAStream(CUDAExecutor *parent)
: parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {}
// Note: teardown is handled by a parent's call to DeallocateStream.
~CUDAStream() override {}
void *GpuStreamHack() override { return cuda_stream_; }
void **GpuStreamMemberHack() override {
return reinterpret_cast<void **>(&cuda_stream_);
}
// Explicitly initialize the CUDA resources associated with this stream, used
// by StreamExecutor::AllocateStream().
bool Init();
// Explicitly destroy the CUDA resources associated with this stream, used by
// StreamExecutor::DeallocateStream().
void Destroy();
// Returns true if no work is pending or executing on the stream.
bool IsIdle() const;
// Retrieves an event which indicates that all work enqueued into the stream
// has completed. Ownership of the event is not transferred to the caller, the
// event is owned by this stream.
CUevent* completed_event() { return &completed_event_; }
// Returns the CUstream value for passing to the CUDA API.
//
// Precond: this CUDAStream has been allocated (otherwise passing a nullptr
// into the NVIDIA library causes difficult-to-understand faults).
CUstream cuda_stream() const {
DCHECK(cuda_stream_ != nullptr);
return const_cast<CUstream>(cuda_stream_);
}
CUDAExecutor *parent() const { return parent_; }
private:
CUDAExecutor *parent_; // Executor that spawned this stream.
CUstream cuda_stream_; // Wrapped CUDA stream handle.
// Event that indicates this stream has completed.
CUevent completed_event_ = nullptr;
};
// Helper functions to simplify extremely common flows.
// Converts a Stream to the underlying CUDAStream implementation.
CUDAStream *AsCUDAStream(Stream *stream);
// Extracts a CUstream from a CUDAStream-backed Stream object.
CUstream AsCUDAStreamValue(Stream *stream);
inline CUDAStream* AsCUDAStream(Stream* stream) {
return gpu::AsGpuStream(stream);
}
} // namespace cuda
} // namespace stream_executor
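
A hedged interop sketch (assuming the AsGpuStream/gpu_stream() accessors from gpu_stream.h, as used elsewhere in this change): recovering the raw platform stream handle from a generic Stream.

#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/stream.h"

namespace se = stream_executor;

// Extract the underlying GpuStreamHandle (CUstream on the CUDA platform) so it
// can be handed to a library call that needs the raw handle.
se::gpu::GpuStreamHandle RawGpuStreamHandle(se::Stream* stream) {
  return se::gpu::AsGpuStream(stream)->gpu_stream();
}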


@ -13,76 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Defines the CUDATimer type - the CUDA-specific implementation of the generic
// Defines the GpuTimer type - the CUDA-specific implementation of the generic
// StreamExecutor Timer interface.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
namespace stream_executor {
namespace cuda {
class CUDAExecutor;
class CUDAStream;
// Wraps a pair of CUevents in order to satisfy the platform-independent
// TimerInterface -- both a start and a stop event are present which may be
// recorded in a stream.
class CUDATimer : public internal::TimerInterface {
public:
explicit CUDATimer(CUDAExecutor *parent)
: parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
// Note: teardown needs to be explicitly handled in this API by a call to
// StreamExecutor::DeallocateTimer(), which invokes Destroy().
// TODO(csigg): Change to RAII.
~CUDATimer() override {}
// Allocates the platform-specific pieces of the timer, called as part of
// StreamExecutor::AllocateTimer().
bool Init();
// Deallocates the platform-specific pieces of the timer, called as part of
// StreamExecutor::DeallocateTimer().
void Destroy();
// Records the "timer start" event at the current point in the stream.
bool Start(CUDAStream *stream);
// Records the "timer stop" event at the current point in the stream.
bool Stop(CUDAStream *stream);
// Returns the elapsed time, in milliseconds, between the start and stop
// events.
float GetElapsedMilliseconds() const;
// See Timer::Microseconds().
// TODO(leary) make this into an error code interface...
uint64 Microseconds() const override {
return GetElapsedMilliseconds() * 1e3;
}
// See Timer::Nanoseconds().
uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
private:
CUDAExecutor *parent_;
CUevent start_event_; // Event recorded to indicate the "start" timestamp
// executing in a stream.
CUevent stop_event_; // Event recorded to indicate the "stop" timestamp
// executing in a stream.
};
struct TimerDeleter {
void operator()(CUDATimer *t) {
t->Destroy();
delete t;
}
};
using CUDATimer = gpu::GpuTimer;
} // namespace cuda
} // namespace stream_executor


@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cudnn_version.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
CudnnVersion loaded_version) {
@ -36,5 +36,5 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
loaded_version.minor_version >= source_version.minor_version));
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor


@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/core/lib/strings/strcat.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
struct CudnnVersion {
CudnnVersion() = default;
@ -44,7 +44,7 @@ struct CudnnVersion {
bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
CudnnVersion loaded_version);
} // namespace cuda
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_


@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/core/platform/test.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
namespace {
TEST(CuDNNVersion, ToString) {
@ -68,5 +68,5 @@ TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
}
} // namespace
} // namespace cuda
} // namespace gpu
} // namespace stream_executor


@ -50,6 +50,7 @@ DeviceDescription::DeviceDescription()
clock_rate_ghz_(-1.0),
cuda_compute_capability_major_(-1),
cuda_compute_capability_minor_(-1),
rocm_amdgpu_isa_version_(-1),
numa_node_(-1),
core_count_(-1),
ecc_enabled_(false) {}
@ -112,6 +113,15 @@ bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
return cuda_compute_capability_major_ != 0;
}
bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
bool status = false;
if (rocm_amdgpu_isa_version_ > 0) {
*version = rocm_amdgpu_isa_version_;
status = true;
}
return status;
}
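
The new query mirrors cuda_compute_capability(): both fill an outparam and return false when the information is unavailable. A caller-side sketch (a hypothetical helper, assuming the usual logging header), showing how platform-specific code can branch on whichever query succeeds:

#include "tensorflow/stream_executor/device_description.h"
#include "tensorflow/stream_executor/platform/logging.h"

void LogGpuArchitecture(const stream_executor::DeviceDescription& desc) {
  int cc_major, cc_minor, isa_version;
  if (desc.cuda_compute_capability(&cc_major, &cc_minor)) {
    LOG(INFO) << "CUDA compute capability " << cc_major << "." << cc_minor;
  } else if (desc.rocm_amdgpu_isa_version(&isa_version)) {
    LOG(INFO) << "AMDGPU ISA version " << isa_version;
  } else {
    LOG(INFO) << "no GPU architecture information available";
  }
}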
bool ThreadDimOk(const DeviceDescription &device_description,
const ThreadDim &thread_dim) {
auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;


@ -133,6 +133,11 @@ class DeviceDescription {
// zero, and the return value will be false.
bool cuda_compute_capability(int *major, int *minor) const;
// Returns the AMDGPU ISA version if we're running on the ROCm platform.
// If the information is not available, the version is not modified,
// and the return value will be false.
bool rocm_amdgpu_isa_version(int *version) const;
// Returns the maximum amount of shared memory present on a single core
// (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
// devices). Note that some devices, such as NVIDIA's have a configurable
@ -195,6 +200,9 @@ class DeviceDescription {
int cuda_compute_capability_major_;
int cuda_compute_capability_minor_;
// ROCM AMDGPU ISA version, 0 if not available.
int rocm_amdgpu_isa_version_;
int numa_node_;
int core_count_;
bool ecc_enabled_;
@ -280,6 +288,10 @@ class DeviceDescriptionBuilder {
device_description_->cuda_compute_capability_minor_ = minor;
}
void set_rocm_amdgpu_isa_version(int version) {
device_description_->rocm_amdgpu_isa_version_ = version;
}
void set_numa_node(int value) { device_description_->numa_node_ = value; }
void set_core_count(int value) { device_description_->core_count_ = value; }
void set_ecc_enabled(bool value) {


@ -0,0 +1,209 @@
# Description:
# GPU-platform specific StreamExecutor support code.
licenses(["notice"]) # Apache 2.0
load(
"//tensorflow/stream_executor:build_defs.bzl",
"stream_executor_friends",
)
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
package_group(
name = "friends",
packages = stream_executor_friends(),
)
package(
default_visibility = [":friends"],
)
# Filegroup used to collect source files for the dependency check.
filegroup(
name = "c_srcs",
data = glob([
"**/*.cc",
"**/*.h",
]),
)
cc_library(
name = "gpu_activation_header",
hdrs = ["gpu_activation.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = ["//tensorflow/stream_executor/platform"],
)
cc_library(
name = "gpu_activation",
srcs = ["gpu_activation.cc"],
hdrs = ["gpu_activation.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_activation_header",
":gpu_driver_header",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_diagnostics_header",
hdrs = ["gpu_diagnostics.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_driver_header",
hdrs = ["gpu_driver.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_types_header",
"//tensorflow/stream_executor:device_options",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"@local_config_cuda//cuda:cuda_headers",
],
)
cc_library(
name = "gpu_event_header",
hdrs = ["gpu_event.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_stream_header",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor/lib",
],
)
cc_library(
name = "gpu_event",
srcs = ["gpu_event.cc"],
hdrs = ["gpu_event.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_executor_header",
":gpu_stream",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/lib",
],
)
cc_library(
name = "gpu_executor_header",
hdrs = ["gpu_executor.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_kernel_header",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:platform",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "gpu_helpers_header",
hdrs = ["gpu_helpers.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [":gpu_types_header"],
)
cc_library(
name = "gpu_kernel_header",
hdrs = ["gpu_kernel.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_rng_header",
hdrs = ["gpu_rng.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_types_header",
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:rng",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_stream_header",
hdrs = ["gpu_stream.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_stream",
srcs = ["gpu_stream.cc"],
hdrs = ["gpu_stream.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_executor_header",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor:stream_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
],
)
cc_library(
name = "gpu_timer_header",
hdrs = ["gpu_timer.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_executor_header",
"//tensorflow/stream_executor:stream_executor_internal",
],
)
cc_library(
name = "gpu_timer",
srcs = ["gpu_timer.cc"],
hdrs = ["gpu_timer.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
":gpu_driver_header",
":gpu_executor_header",
":gpu_stream",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/lib",
],
)
cc_library(
name = "gpu_types_header",
hdrs = ["gpu_types.h"],
visibility = ["//tensorflow/stream_executor:__subpackages__"],
deps = [
"//tensorflow/stream_executor/platform",
] + if_cuda_is_configured([
"@local_config_cuda//cuda:cuda_headers",
]) + if_rocm_is_configured([
"@local_config_rocm//rocm:rocm_headers",
]),
)


@ -1,4 +1,4 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,36 +13,36 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec);
CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec);
GpuContext* ExtractGpuContext(GpuExecutor* gpu_exec);
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec);
ScopedActivateExecutorContext::ScopedActivateExecutorContext(
CUDAExecutor *cuda_exec):
driver_scoped_activate_context_(
new ScopedActivateContext{ExtractCudaContext(cuda_exec)}) { }
GpuExecutor* gpu_exec)
: driver_scoped_activate_context_(
new ScopedActivateContext{ExtractGpuContext(gpu_exec)}) {}
ScopedActivateExecutorContext::ScopedActivateExecutorContext(
StreamExecutor *stream_exec)
: ScopedActivateExecutorContext(ExtractCudaExecutor(stream_exec)) {}
StreamExecutor* stream_exec)
: ScopedActivateExecutorContext(ExtractGpuExecutor(stream_exec)) {}
ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_);
delete static_cast<ScopedActivateContext*>(driver_scoped_activate_context_);
}
ScopedActivateExecutorContext::ScopedActivateExecutorContext(
ScopedActivateExecutorContext &&other)
ScopedActivateExecutorContext&& other)
: driver_scoped_activate_context_(other.driver_scoped_activate_context_) {
other.driver_scoped_activate_context_ = nullptr;
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor


@ -0,0 +1,61 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This file contains APIs that assume a StreamExecutor is backed by CUDA.
// It reaches into the CUDA implementation to activate an underlying CUDA
// context.
//
// Having this file separate from gpu/gpu_executor.h means that dependent
// code does not also have to depend on cuda.h.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
#include "tensorflow/stream_executor/platform/port.h"
namespace stream_executor {
class StreamExecutor;
namespace gpu {
class GpuExecutor;
class ScopedActivateContext;
// Activates a CUDA context within an enclosing scope.
class ScopedActivateExecutorContext {
public:
// Form that takes a CUDA executor implementation.
explicit ScopedActivateExecutorContext(GpuExecutor* gpu_exec);
// Form that takes a pImpl executor and extracts a CUDA implementation --
// fatal failure if it is not CUDA inside.
explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
~ScopedActivateExecutorContext();
private:
// The cuda.h-using datatype that we wrap.
ScopedActivateContext* driver_scoped_activate_context_;
SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
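
A brief usage sketch (hypothetical call site, using only the constructors declared above): the RAII wrapper makes the executor's context current for the scope and restores the previous one on destruction.

#include "tensorflow/stream_executor/gpu/gpu_activation.h"

namespace se = stream_executor;

void DoDriverWork(se::StreamExecutor* stream_exec) {
  // Activates the underlying GPU context; fatal if stream_exec is not backed
  // by the CUDA/ROCm implementation.
  se::gpu::ScopedActivateExecutorContext activation(stream_exec);
  // ... raw driver-level calls that require the context to be current ...
}  // Previous context is restored here.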


@ -0,0 +1,99 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
#include <tuple>
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
namespace stream_executor {
namespace gpu {
// e.g. DriverVersion{346, 3, 4}
using DriverVersion = std::tuple<int, int, int>;
// FIXME: These functions are in the stream_executor::cuda namespace for now.
// They will move to the stream_executor::gpu namespace in the near future.
//
//// Converts a parsed driver version to string form.
// string DriverVersionToString(DriverVersion version);
//
//// Converts a parsed driver version or status value to natural string form.
// string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
//
//// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
// port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
class Diagnostician {
public:
// Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
// not initializing).
//
// Note: if we're running on a machine that has no GPUs, we don't want to
// produce very much log spew beyond saying, "looks like there's no CUDA
// kernel module running".
//
// Note: we use non-Google-File:: API here because we may be called before
// InitGoogle has completed.
static void LogDiagnosticInformation();
// Given the driver version file contents, finds the kernel module version and
// returns it as a string.
//
// This is solely used for more informative log messages when the user is
// running on a machine that happens to have a libcuda/kernel driver mismatch.
static port::StatusOr<DriverVersion> FindKernelModuleVersion(
const string& driver_version_file_contents);
// Extracts the kernel driver version from the current host.
static port::StatusOr<DriverVersion> FindKernelDriverVersion();
// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
// driver-interfacing DSO version number. Returns it as a string.
static port::StatusOr<DriverVersion> FindDsoVersion();
// Logs information about the kernel driver version and userspace driver
// library version.
static void LogDriverVersionInformation();
private:
// Given the DSO version number and the driver version file contents, extracts
// the driver version and compares, warning the user in the case of
// incompatibility.
//
// This is solely used for more informative log messages when the user is
// running on a machine that happens to have a libcuda/kernel driver mismatch.
static void WarnOnDsoKernelMismatch(
port::StatusOr<DriverVersion> dso_version,
port::StatusOr<DriverVersion> kernel_version);
// Logs information about the dev nodes present on this machine: their
// existence, permissions, accessibility from this uid/gid.
static void LogDevNodeDiagnosticInformation();
static string GetDevNodePath(int dev_node_ordinal);
SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
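
A short, hypothetical sketch of how these hooks tend to be used when GPU initialization fails (DriverVersion is a std::tuple, so direct comparison works; the StatusOr accessors follow the ok()/ValueOrDie() convention):

#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/platform/logging.h"

namespace se = stream_executor;

void ReportDriverState() {
  // One-stop logging of DSO version, kernel module version and dev nodes.
  se::gpu::Diagnostician::LogDiagnosticInformation();

  // Or inspect the pieces individually.
  auto dso_version = se::gpu::Diagnostician::FindDsoVersion();
  auto kernel_version = se::gpu::Diagnostician::FindKernelDriverVersion();
  if (dso_version.ok() && kernel_version.ok() &&
      dso_version.ValueOrDie() != kernel_version.ValueOrDie()) {
    LOG(WARNING) << "userspace driver / kernel module version mismatch";
  }
}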


@ -0,0 +1,525 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// CUDA userspace driver library wrapper functionality.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
#include <stddef.h>
#include "tensorflow/stream_executor/platform/port.h"
#include "cuda/include/cuda.h"
#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_types.h"
namespace stream_executor {
namespace gpu {
// Identifies the memory space where an allocation resides. See
// GpuDriver::GetPointerMemorySpace().
enum class MemorySpace { kHost, kDevice };
// Returns a casual string, such as "host" for the provided memory space.
string MemorySpaceString(MemorySpace memory_space);
class GpuContext;
// GpuDriver contains wrappers for calls to the userspace library driver. It's
// useful to isolate these calls and put basic wrappers around them to separate
// userspace library driver behaviors from the rest of the program.
//
// At the moment it's simply used as a namespace.
//
// The calls log any specific errors internally and return whether the operation
// was successful to the caller.
//
// The order of parameters is generally kept symmetric with the underlying CUDA
// driver API.
//
// Links on functions are to specific documentation under
// http://docs.nvidia.com/cuda/cuda-driver-api/
//
// Thread safety: these functions should not be used from signal handlers.
class GpuDriver {
public:
// Wraps a call to cuInit with logging to help indicate what has gone wrong in
// the case of failure. Safe to call multiple times; will be fast on all calls
// after the first.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
static port::Status Init();
// Returns the device associated with the given context.
// device is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
static port::StatusOr<GpuDeviceHandle> DeviceFromContext(GpuContext* context);
// Creates a new CUDA stream associated with the given context via
// cuStreamCreate.
// stream is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
static bool CreateStream(GpuContext* context, GpuStreamHandle* stream);
// Destroys a CUDA stream associated with the given context.
// stream is owned by the caller, must not be null, and *stream is set to null
// if the stream is successfully destroyed.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
static void DestroyStream(GpuContext* context, GpuStreamHandle* stream);
// CUDA events can explicitly disable event TSC retrieval for some presumed
// performance improvement if timing is unnecessary.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
enum class EventFlags { kDefault, kDisableTiming };
// Creates a new event associated with the given context.
// result is an outparam owned by the caller and must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
static port::Status CreateEvent(GpuContext* context, GpuEventHandle* result,
EventFlags flags);
// Destroys *event and turns it into a nullptr. event may not be null, but
// *event may be, via cuEventDestroy
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
static port::Status DestroyEvent(GpuContext* context, GpuEventHandle* event);
// Allocates a GPU memory space of size bytes associated with the given
// context via cuMemAlloc.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
static void* DeviceAllocate(GpuContext* context, uint64 bytes);
// Deallocates a GPU memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void DeviceDeallocate(GpuContext* context, void* location);
// Allocates a unified memory space of size bytes associated with the given
// context via cuMemAllocManaged.
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
// (supported on CUDA only)
static void* UnifiedMemoryAllocate(GpuContext* context, uint64 bytes);
// Deallocates a unified memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
// (supported on CUDA only)
static void UnifiedMemoryDeallocate(GpuContext* context, void* location);
// Allocates page-locked and CUDA-registered memory on the host via
// cuMemAllocHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
static void* HostAllocate(GpuContext* context, uint64 bytes);
// Deallocates a location created by HostAllocate, via cuMemFreeHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
static void HostDeallocate(GpuContext* context, void* location);
// Registers a memory region at location of size bytes via cuMemHostRegister.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
static bool HostRegister(GpuContext* context, void* location, uint64 bytes);
// Unregisters a memory region that was previously registered at location via
// cuMemHostUnregister.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
//
// TODO(leary) verify an error will be returned if the location wasn't
// previously registered.
static bool HostUnregister(GpuContext* context, void* location);
// Given a device ordinal, returns a device handle into the device outparam,
// which must not be null.
//
// N.B. these device handles do not have a corresponding destroy function in
// the CUDA driver API.
static port::Status GetDevice(int device_ordinal, GpuDeviceHandle* device);
// Given a device handle, returns the name reported by the driver for the
// device.
static bool GetDeviceName(GpuDeviceHandle device, string* device_name);
// Given a device to create a context for, returns a context handle into the
// context outparam, which must not be null.
//
// N.B. CUDA contexts are weird. They are implicitly associated with the
// calling thread. Current documentation on contexts and their influence on
// userspace processes is given here:
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
static port::Status CreateContext(int device_ordinal, GpuDeviceHandle device,
const DeviceOptions& device_options,
GpuContext** context);
// Destroys the provided context via cuCtxDestroy.
// Don't do this while clients could still be using the context, per the docs
// bad things will happen.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
static void DestroyContext(GpuContext* context);
// Queries the runtime for the specified attribute of the specified function.
// cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
// in terms of integer-sized values, so there's no potential for overrun (as
// of CUDA 5.5).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
static bool FuncGetAttribute(GpuFunctionAttribute attribute,
GpuFunctionHandle function,
int* attribute_value);
// Sets the preferred cache configuration for the specified function.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
static bool FuncSetCacheConfig(GpuFunctionHandle function,
GpuFuncCachePreference cache_config);
// Gets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
static port::StatusOr<GpuSharedMemConfig> ContextGetSharedMemConfig(
GpuContext* context);
// Sets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
static port::Status ContextSetSharedMemConfig(
GpuContext* context, GpuSharedMemConfig shared_mem_config);
// Launches a CUDA kernel via cuLaunchKernel.
// TODO(leary) describe the structure of kernel_params and extra in a readable
// way.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
static bool LaunchKernel(GpuContext* context, GpuFunctionHandle function,
unsigned int grid_dim_x, unsigned int grid_dim_y,
unsigned int grid_dim_z, unsigned int block_dim_x,
unsigned int block_dim_y, unsigned int block_dim_z,
unsigned int shared_mem_bytes,
GpuStreamHandle stream, void** kernel_params,
void** extra);
// Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
// handle in "module". Any error logs that are produced are logged internally.
// (supported on CUDA only)
static bool LoadPtx(GpuContext* context, const char* ptx_contents,
GpuModuleHandle* module);
// Loads cubin_bytes with the CUDA driver's blob loading interface and stores
// the resulting handle in "module".
// (supported on CUDA only)
static port::Status LoadCubin(GpuContext* context, const char* cubin_bytes,
GpuModuleHandle* module);
// Loads HSACO with the ROCM runtime and stores the resulting handle in
// "module". Any error logs that are produced are logged internally.
// (supported on ROCm only)
static bool LoadHsaco(GpuContext* context, const char* hsaco_contents,
GpuModuleHandle* module);
// Retrieves a named kernel from a loaded module, and places the resulting
// handle into function (outparam) on success. Neither kernel_name nor
// function may be null. No ownership is taken of kernel_name.
static bool GetModuleFunction(GpuContext* context, GpuModuleHandle module,
const char* kernel_name,
GpuFunctionHandle* function);
// Retrieves a named global/constant symbol from a loaded module, and returns
// a device pointer and size of the symbol on success. symbol_name may not be
// null. At least one of dptr or bytes should not be null. No ownership is
// taken of symbol_name.
static bool GetModuleSymbol(GpuContext* context, GpuModuleHandle module,
const char* symbol_name, GpuDevicePtr* dptr,
size_t* bytes);
// Unloads module from the current context via cuModuleUnload.
// TODO(leary) the documentation doesn't say what kind of disasters happen
// if you try to unload a module while its GpuFunctionHandles are in use.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
static void UnloadModule(GpuContext* context, GpuModuleHandle module);
// Performs a synchronous memset of the device memory segment via cuMemsetD8.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
static bool SynchronousMemsetUint8(GpuContext* context, GpuDevicePtr location,
uint8 value, size_t size);
// Performs a synchronous memset of the device memory segment via cuMemsetD32.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
static bool SynchronousMemsetUint32(GpuContext* context,
GpuDevicePtr location, uint32 value,
size_t uint32_count);
// Performs an asynchronous memset of the device memory segment via
// cuMemsetD8Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
static bool AsynchronousMemsetUint8(GpuContext* context,
GpuDevicePtr location, uint8 value,
size_t uint32_count,
GpuStreamHandle stream);
// Performs an asynchronous memset of the device memory segment via
// cuMemsetD32Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
static bool AsynchronousMemsetUint32(GpuContext* context,
GpuDevicePtr location, uint32 value,
size_t uint32_count,
GpuStreamHandle stream);
// -- Synchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
static port::Status SynchronousMemcpyD2H(GpuContext* context, void* host_dst,
GpuDevicePtr gpu_src, uint64 size);
static port::Status SynchronousMemcpyH2D(GpuContext* context,
GpuDevicePtr gpu_dst,
const void* host_src, uint64 size);
static port::Status SynchronousMemcpyD2D(GpuContext* context,
GpuDevicePtr gpu_dst,
GpuDevicePtr gpu_src, uint64 size);
// -- Asynchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
static bool AsynchronousMemcpyD2H(GpuContext* context, void* host_dst,
GpuDevicePtr gpu_src, uint64 size,
GpuStreamHandle stream);
static bool AsynchronousMemcpyH2D(GpuContext* context, GpuDevicePtr gpu_dst,
const void* host_src, uint64 size,
GpuStreamHandle stream);
static bool AsynchronousMemcpyD2D(GpuContext* context, GpuDevicePtr gpu_dst,
GpuDevicePtr gpu_src, uint64 size,
GpuStreamHandle stream);
// The CUDA stream callback type signature.
// The data passed to AddStreamCallback is subsequently passed to this
// callback when it fires.
//
// Some notable things:
// * Callbacks must not make any CUDA API calls.
// * Callbacks from independent streams execute in an undefined order and may
// be serialized.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
typedef void (*StreamCallback)(GpuStreamHandle stream, GpuStatus status,
void* data);
// Enqueues a callback operation into stream.
// See StreamCallback above and the NVIDIA documentation for additional
// details.
static bool AddStreamCallback(GpuContext* context, GpuStreamHandle stream,
StreamCallback callback, void* data);
// Causes stream to wait for event to trigger before proceeding via
// cuStreamWaitEvent.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
static bool WaitStreamOnEvent(GpuContext* context, GpuStreamHandle stream,
GpuEventHandle event);
// Blocks the calling thread until the operations enqueued onto stream have
// been completed, via cuStreamSynchronize.
//
// TODO(leary) if a pathological thread enqueues operations onto the stream
// while another thread blocks like this, can you wind up waiting an unbounded
// amount of time?
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
static port::Status SynchronizeStream(GpuContext* context,
GpuStreamHandle stream);
// Blocks the calling thread until the operations associated with the context
// have been completed, via cuCtxSynchronize.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
static bool SynchronizeContext(GpuContext* context);
// Returns true if all stream tasks have completed at time of the call. Note
// the potential for races around this call (if another thread adds work to
// the stream immediately after this returns).
static bool IsStreamIdle(GpuContext* context, GpuStreamHandle stream);
// Returns whether code in the from context can access memory in the to
// context via cuDeviceCanAccessPeer.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);
// Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);
// Returns the elapsed milliseconds between start and stop via
// cuEventElapsedTime.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
static bool GetEventElapsedTime(GpuContext* context,
float* elapsed_milliseconds,
GpuEventHandle start, GpuEventHandle stop);
// Records that an event occurred when execution reaches the current point in
// the stream via cuEventRecord.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
static port::Status RecordEvent(GpuContext* context, GpuEventHandle event,
GpuStreamHandle stream);
// Polls (without blocking) to determine the status of an event - pending or
// complete (or an error status).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
static port::StatusOr<GpuStatus> QueryEvent(GpuContext* context,
GpuEventHandle event);
// -- Pointer-specific calls.
// Returns the context in which pointer was allocated or registered.
static port::StatusOr<GpuContext*> GetPointerContext(GpuDevicePtr pointer);
// Returns the device associated with the context from GetPointerContext().
static port::StatusOr<GpuDeviceHandle> GetPointerDevice(GpuDevicePtr pointer);
// Returns the memory space addressed by pointer.
static port::StatusOr<MemorySpace> GetPointerMemorySpace(
GpuDevicePtr pointer);
// Returns the base address and size of the device pointer dptr.
static port::Status GetPointerAddressRange(GpuDevicePtr dptr,
GpuDevicePtr* base, size_t* size);
// -- Device-specific calls.
// Returns the compute capability for the device, e.g. (3, 5).
// This is currently done via the deprecated device API.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
// (supported on CUDA only)
static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
GpuDeviceHandle device);
// Returns the GPU ISA version for the device, e.g. 803, 900.
// (supported on ROCm only)
static port::Status GetGpuISAVersion(int* version, GpuDeviceHandle device);
// Returns the number of multiprocessors on the device (note that the device
// may be multi-GPU-per-board).
static port::StatusOr<int> GetMultiprocessorCount(GpuDeviceHandle device);
// Returns the limit on number of threads that can be resident in a single
// multiprocessor.
static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(
GpuDeviceHandle device);
// Returns the limit on number of threads which may be resident for a single
// block (cooperative thread array).
static port::StatusOr<int64> GetMaxThreadsPerBlock(GpuDeviceHandle device);
// Returns the amount of shared memory available on a single GPU core (i.e.
// SM on NVIDIA devices).
static port::StatusOr<int64> GetMaxSharedMemoryPerCore(
GpuDeviceHandle device);
// Returns the amount of shared memory available for a single block
// (cooperative thread array).
static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(
GpuDeviceHandle device);
// Returns the maximum supported number of registers per block.
static port::StatusOr<int64> GetMaxRegistersPerBlock(GpuDeviceHandle device);
// Returns the number of threads per warp.
static port::StatusOr<int64> GetThreadsPerWarp(GpuDeviceHandle device);
// Queries the grid limits for device with cuDeviceGetAttribute calls.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool GetGridLimits(int* x, int* y, int* z, GpuDeviceHandle device);
// Returns a grab-bag of device properties in a caller-owned device_properties
// structure for device_ordinal via cuDeviceGetProperties.
//
// This call is deprecated in the NVIDIA driver API; its replacement is
// GetDeviceAttribute
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
static bool GetDeviceProperties(GpuDeviceProperty* device_properties,
int device_ordinal);
// Gets a specific integer-valued property about the given device.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static port::StatusOr<int> GetDeviceAttribute(GpuDeviceAttribute attribute,
GpuDeviceHandle device);
// Returns whether ECC is enabled for the given GpuDeviceHandle via
// cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool IsEccEnabled(GpuDeviceHandle device, bool* result);
// Returns the total amount of memory available for allocation by the CUDA
// context, in bytes, via cuDeviceTotalMem.
static bool GetDeviceTotalMemory(GpuDeviceHandle device, uint64* result);
// Returns the free amount of memory and total amount of memory, as reported
// by cuMemGetInfo.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
static bool GetDeviceMemoryInfo(GpuContext* context, int64* free,
int64* total);
// Returns a PCI bus id string for the device.
// [domain]:[bus]:[device].[function]
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
static string GetPCIBusID(GpuDeviceHandle device);
// -- Context- and device-independent calls.
// Returns the number of visible CUDA devices via cuDeviceGetCount.
// This should correspond to the set of device ordinals available.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
static int GetDeviceCount();
// Returns the driver version number via cuDriverGetVersion.
// This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
// instead, the CUDA toolkit release number that this driver is compatible
// with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
// compatible driver).
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
static bool GetDriverVersion(int* driver_version);
// -- Other calls
// Returns the maximum number of blocks (per multiprocessor) occupied by the
// specified kernel/GpuFunctionHandle when launched with the specified
// parameters.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
GpuContext* context, GpuFunctionHandle kernel, int threads_per_block,
size_t dynamic_shared_memory_bytes);
// Seam for injecting an error at CUDA initialization time for testing
// purposes.
static bool driver_inject_init_error_;
};
// Ensures a context is activated within a scope.
class ScopedActivateContext {
public:
// Activates the context via cuCtxSetCurrent, if it is not the currently
// active context (a la cuCtxGetCurrent). Note the alternative push/pop
// mechanism is said by NVIDIA to be relatively slow and deprecated.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
explicit ScopedActivateContext(GpuContext* context);
// Checks that the context has remained activated for the duration of the
// scope.
~ScopedActivateContext();
private:
GpuContext* to_restore_ = nullptr;
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
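
To make the wrapper's calling conventions concrete, a rough end-to-end sketch (a hand-written example, not from this change; it assumes DeviceOptions::Default() and the status-returning signatures declared above): copy a host buffer into freshly allocated device memory, then clean up.

#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"

namespace stream_executor {
namespace gpu {

port::Status CopyToNewDeviceBuffer(const void* host_src, size_t size) {
  port::Status status = GpuDriver::Init();
  if (!status.ok()) return status;

  GpuDeviceHandle device;
  status = GpuDriver::GetDevice(/*device_ordinal=*/0, &device);
  if (!status.ok()) return status;

  GpuContext* context = nullptr;
  status = GpuDriver::CreateContext(/*device_ordinal=*/0, device,
                                    DeviceOptions::Default(), &context);
  if (!status.ok()) return status;

  {
    ScopedActivateContext activation(context);  // make the context current
    // (A production version would also check the allocation for nullptr.)
    void* dst = GpuDriver::DeviceAllocate(context, size);
    status = GpuDriver::SynchronousMemcpyH2D(
        context, reinterpret_cast<GpuDevicePtr>(dst), host_src, size);
    GpuDriver::DeviceDeallocate(context, dst);
  }
  GpuDriver::DestroyContext(context);
  return status;
}

}  // namespace gpu
}  // namespace stream_executor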


@ -0,0 +1,47 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/statusor.h"
namespace stream_executor {
namespace gpu {
GpuEvent::GpuEvent(GpuExecutor* parent)
: parent_(parent), gpu_event_(nullptr) {}
GpuEvent::~GpuEvent() {}
port::Status GpuEvent::Init() {
return GpuDriver::CreateEvent(parent_->gpu_context(), &gpu_event_,
GpuDriver::EventFlags::kDisableTiming);
}
port::Status GpuEvent::Destroy() {
return GpuDriver::DestroyEvent(parent_->gpu_context(), &gpu_event_);
}
port::Status GpuEvent::Record(GpuStream* stream) {
return GpuDriver::RecordEvent(parent_->gpu_context(), gpu_event_,
stream->gpu_stream());
}
GpuEventHandle GpuEvent::gpu_event() { return gpu_event_; }
} // namespace gpu
} // namespace stream_executor


@ -0,0 +1,62 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/status.h"
namespace stream_executor {
namespace gpu {
// GpuEvent wraps a GpuEventHandle in the platform-independent EventInterface
// interface.
class GpuEvent : public internal::EventInterface {
public:
explicit GpuEvent(GpuExecutor* parent);
~GpuEvent() override;
// Populates the CUDA-platform-specific elements of this object.
port::Status Init();
// Deallocates any platform-specific elements of this object. This is broken
// out (not part of the destructor) to allow for error reporting.
port::Status Destroy();
// Inserts the event at the current position into the specified stream.
port::Status Record(GpuStream* stream);
// Polls the CUDA platform for the event's current status.
Event::Status PollForStatus();
// The underlying CUDA event element.
GpuEventHandle gpu_event();
private:
// The Executor to which this object and GpuEventHandle are bound.
GpuExecutor* parent_;
// The underlying CUDA event element.
GpuEventHandle gpu_event_;
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
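
A hypothetical lifecycle sketch tying the pieces together (it assumes a live GpuExecutor/GpuStream pair; Init/Record/Destroy all report failures through port::Status):

#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"

namespace stream_executor {
namespace gpu {

// Records an event at the stream's current position; callers can later poll it
// (PollForStatus) to learn whether that position has been reached.
bool MarkStreamPosition(GpuExecutor* executor, GpuStream* stream) {
  GpuEvent event(executor);
  if (!event.Init().ok()) return false;
  bool recorded = event.Record(stream).ok();
  port::Status destroy_status = event.Destroy();
  return recorded && destroy_status.ok();
}

}  // namespace gpu
}  // namespace stream_executor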


@ -0,0 +1,347 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#include <set>
#include <unordered_map>
#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace gpu {
// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class GpuExecutor : public internal::StreamExecutorInterface {
public:
// sub_platform indicates the subplatform used in this executor; it must
// be a CUDA type.
explicit GpuExecutor(const PluginConfig& plugin_config)
: device_(0),
context_(nullptr),
device_ordinal_(0),
cc_major_(0),
cc_minor_(0),
version_(0),
plugin_config_(plugin_config) {}
// See the corresponding StreamExecutor methods for method comments on the
// following overrides.
~GpuExecutor() override;
port::Status Init(int device_ordinal, DeviceOptions device_options) override;
bool GetKernel(const MultiKernelLoaderSpec& spec,
KernelBase* kernel) override;
// (supported on CUDA only)
void UnloadKernel(const KernelBase* kernel) override;
bool LoadModule(const MultiModuleLoaderSpec& spec,
ModuleHandle* module_handle) override;
bool UnloadModule(ModuleHandle module_handle) override;
bool Launch(Stream* stream, const ThreadDim& thread_dims,
const BlockDim& block_dims, const KernelBase& k,
const KernelArgsArrayBase& args) override;
// (supported on CUDA only)
int CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, GpuFunctionHandle func);
// (supported on CUDA only)
int CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims, GpuFunctionHandle func);
void* Allocate(uint64 size) override;
void* AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
uint64 size_bytes) override;
void Deallocate(DeviceMemoryBase* mem) override;
void* UnifiedMemoryAllocate(uint64 size) override {
return GpuDriver::UnifiedMemoryAllocate(context_, size);
}
void UnifiedMemoryDeallocate(void* location) override {
return GpuDriver::UnifiedMemoryDeallocate(context_, location);
}
// CUDA allocation/registration functions are necessary because the driver
// internally sets up buffers for DMA operations (and page locks them).
// There's no external interface for us to otherwise control these DMA
// settings.
void* HostMemoryAllocate(uint64 size) override {
return GpuDriver::HostAllocate(context_, size);
}
void HostMemoryDeallocate(void* location) override {
return GpuDriver::HostDeallocate(context_, location);
}
bool HostMemoryRegister(void* location, uint64 size) override;
bool HostMemoryUnregister(void* location) override;
bool SynchronizeAllActivity() override;
bool SynchronousMemZero(DeviceMemoryBase* location, uint64 size) override;
bool SynchronousMemSet(DeviceMemoryBase* location, int value,
uint64 size) override;
port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) override;
port::Status SynchronousMemcpy(void* host_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) override;
port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) override;
bool MemZero(Stream* stream, DeviceMemoryBase* location,
uint64 size) override;
bool Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
uint64 size) override;
bool Memset32(Stream* stream, DeviceMemoryBase* location, uint32 pattern,
uint64 size) override;
bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
uint64 size) override;
bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
uint64 size) override;
bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) override;
bool HostCallback(Stream* stream,
std::function<port::Status()> callback) override;
bool AllocateStream(Stream* stream) override;
void DeallocateStream(Stream* stream) override;
bool CreateStreamDependency(Stream* dependent, Stream* other) override;
bool AllocateTimer(Timer* timer) override;
void DeallocateTimer(Timer* timer) override;
bool StartTimer(Stream* stream, Timer* timer) override;
bool StopTimer(Stream* stream, Timer* timer) override;
port::Status AllocateEvent(Event* event) override;
port::Status DeallocateEvent(Event* event) override;
port::Status RecordEvent(Stream* stream, Event* event) override;
port::Status WaitForEvent(Stream* stream, Event* event) override;
Event::Status PollForEventStatus(Event* event) override;
port::Status BlockHostUntilDone(Stream* stream) override;
int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }
port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;
bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
bool DeviceMemoryUsage(int64* free, int64* total) const override;
// Search for the symbol and returns a device pointer and size.
// Returns false if symbol does not exist.
bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
void** mem, size_t* bytes) override;
DeviceDescription* PopulateDeviceDescription() const override;
// Populates the block_dim_limit by querying the device driver API. If an
// error occurs at any point while asking the driver for block dim limits, it
// will be only partially populated as a result, and an error will be logged.
bool FillBlockDimLimit(BlockDim* block_dim_limit) const;
bool SupportsBlas() const override;
blas::BlasSupport* CreateBlas() override;
bool SupportsFft() const override;
fft::FftSupport* CreateFft() override;
bool SupportsRng() const override;
rng::RngSupport* CreateRng() override;
bool SupportsDnn() const override;
dnn::DnnSupport* CreateDnn() override;
std::unique_ptr<internal::EventInterface> CreateEventImplementation()
override;
std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
override;
std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
void* GpuContextHack() override;
GpuContext* gpu_context();
private:
// Attempts to find a more specific version of the file indicated by
// filename by looking for compute-capability-specific suffixed versions; e.g.
// looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
// we're on a compute capability 3.0 machine.
// (supported on CUDA only)
bool FindOnDiskForComputeCapability(absl::string_view filename,
absl::string_view canonical_suffix,
string* found_filename) const;
// Attempts to find a more specific version of the file indicated by
// filename by looking for AMDGPU ISA-specific suffixed versions.
// (supported on ROCm only)
bool FindOnDiskForISAVersion(absl::string_view filename,
absl::string_view canonical_suffix,
string* found_filename) const;
// Host callback landing routine invoked by CUDA.
// data: User-provided callback provided to HostCallback() above, captured
// as a std::function<void()>. Allocated/initialized inside
// HostCallback() and owned and deleted by this call.
static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
void* data);
// Collects metadata for the specified kernel.
bool GetKernelMetadata(GpuKernel* cuda_kernel,
KernelMetadata* kernel_metadata);
// Prints to VLOG(2) information about the kernel's occupancy and how it might
// be improved.
void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
const BlockDim& block_dims);
// (supported on CUDA only)
bool LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
// (supported on CUDA only)
bool LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// (supported on ROCm only)
bool LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
bool UnloadGpuBinary(const void* gpu_binary)
EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
// Guards the on-disk-module mapping.
mutex disk_modules_mu_;
// Mapping from filename to GpuModuleHandle, if it was already retrieved.
// Multiple GpuFunctionHandles are usually obtained from a single
// GpuModuleHandle, so we attempt to hit this mapping first before
// retrieving it.
std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);
// Guards the in-memory-module mapping.
mutex in_memory_modules_mu_;
std::map<const char*, GpuModuleHandle> in_memory_modules_
GUARDED_BY(in_memory_modules_mu_);
// Kernel -> loaded GPU binary. Many kernels may load the same binary.
std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
GUARDED_BY(in_memory_modules_mu_);
// GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
// Guards the launched kernel set.
mutex launched_kernels_mu_;
// Keeps track of the set of launched kernels. Currently used to suppress the
// occupancy check on subsequent launches.
std::set<GpuFunctionHandle> launched_kernels_
GUARDED_BY(launched_kernels_mu_);
// Handle for the CUDA device being operated on. Immutable
// post-initialization.
GpuDeviceHandle device_;
// Handle for session with the library/driver. Immutable post-initialization.
GpuContext* context_;
// The device ordinal value that this executor was initialized with; recorded
// for use in getting device metadata. Immutable post-initialization.
int device_ordinal_;
// The major version of the compute capability for device_.
int cc_major_;
// The minor version of the compute capability for device_.
int cc_minor_;
// GPU ISA version for device_.
int version_;
// The plugin configuration associated with this instance.
PluginConfig plugin_config_;
SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
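
A hedged sketch of the synchronous allocate/copy path declared above (Allocate, SynchronousMemcpy, Deallocate). Production code reaches these overrides through the public StreamExecutor wrapper rather than the interface class directly; the helper name below is hypothetical.

// Hypothetical sketch; assumes a GpuExecutor that has already passed Init().
#include <vector>
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/lib/error.h"
namespace stream_executor {
namespace gpu {
port::Status RoundTripCopy(GpuExecutor* executor) {
  std::vector<float> host(1024, 1.0f);
  const uint64 bytes = host.size() * sizeof(float);
  void* opaque = executor->Allocate(bytes);
  if (opaque == nullptr) {
    return port::Status{port::error::RESOURCE_EXHAUSTED,
                        "device allocation failed"};
  }
  DeviceMemoryBase device(opaque, bytes);
  // Blocking host->device copy followed by a blocking device->host copy.
  port::Status status = executor->SynchronousMemcpy(&device, host.data(), bytes);
  if (status.ok()) {
    status = executor->SynchronousMemcpy(host.data(), device, bytes);
  }
  executor->Deallocate(&device);
  return status;
}
}  // namespace gpu
}  // namespace stream_executor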

View File

@ -0,0 +1,107 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Common helper functions used for dealing with CUDA API datatypes.
//
// These are typically placed here for use by multiple source components (for
// example, BLAS and executor components).
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
#include <stddef.h>
#include <complex>
#include "tensorflow/stream_executor/gpu/gpu_types.h"
namespace stream_executor {
template <typename ElemT>
class DeviceMemory;
namespace gpu {
// Converts a const DeviceMemory reference to its underlying typed pointer in
// CUDA device memory.
template <typename T>
const T* GpuMemory(const DeviceMemory<T>& mem) {
return static_cast<const T*>(mem.opaque());
}
// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
// pointer in CUDA device memory.
template <typename T>
T* GpuMemoryMutable(DeviceMemory<T>* mem) {
return static_cast<T*>(mem->opaque());
}
static_assert(
sizeof(std::complex<float>) == sizeof(GpuComplexType),
"std::complex<float> and GpuComplexType should have the same size");
static_assert(offsetof(GpuComplexType, x) == 0,
"The real part of GpuComplexType should appear first.");
static_assert(
sizeof(std::complex<double>) == sizeof(GpuDoubleComplexType),
"std::complex<double> and GpuDoubleComplexType should have the same "
"size");
static_assert(offsetof(GpuDoubleComplexType, x) == 0,
"The real part of GpuDoubleComplexType should appear first.");
// Type traits to get CUDA complex types from std::complex<>.
template <typename T>
struct GpuComplexT {
typedef T type;
};
template <>
struct GpuComplexT<std::complex<float>> {
typedef GpuComplexType type;
};
template <>
struct GpuComplexT<std::complex<double>> {
typedef GpuDoubleComplexType type;
};
// Converts pointers of std::complex<> to pointers of
// GpuComplexType/GpuDoubleComplexType. No type conversion for non-complex
// types.
template <typename T>
inline const typename GpuComplexT<T>::type* GpuComplex(const T* p) {
return reinterpret_cast<const typename GpuComplexT<T>::type*>(p);
}
template <typename T>
inline typename GpuComplexT<T>::type* GpuComplex(T* p) {
return reinterpret_cast<typename GpuComplexT<T>::type*>(p);
}
// Converts values of std::complex<float/double> to values of
// GpuComplexType/GpuDoubleComplexType.
inline GpuComplexType GpuComplexValue(std::complex<float> val) {
return {val.real(), val.imag()};
}
inline GpuDoubleComplexType GpuComplexValue(std::complex<double> val) {
return {val.real(), val.imag()};
}
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
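
A short illustrative sketch of how these helpers are combined when forwarding complex DeviceMemory to a vendor BLAS-style entry point; vendor_caxpy below is a placeholder, not a real API.

// Hypothetical sketch; vendor_caxpy stands in for a cuBLAS/rocBLAS-style call.
#include <complex>
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
namespace stream_executor {
namespace gpu {
void CallVendorAxpy(int n, std::complex<float> alpha,
                    const DeviceMemory<std::complex<float>>& x,
                    DeviceMemory<std::complex<float>>* y) {
  // Scalars are converted by value; device buffers by pointer reinterpretation.
  GpuComplexType gpu_alpha = GpuComplexValue(alpha);
  const GpuComplexType* gpu_x = GpuComplex(GpuMemory(x));
  GpuComplexType* gpu_y = GpuComplex(GpuMemoryMutable(y));
  // vendor_caxpy(n, &gpu_alpha, gpu_x, /*incx=*/1, gpu_y, /*incy=*/1);
  (void)n;
  (void)gpu_alpha;
  (void)gpu_x;
  (void)gpu_y;
}
}  // namespace gpu
}  // namespace stream_executor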

View File

@ -0,0 +1,105 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace gpu {
// Wraps a GpuFunctionHandle to implement the platform-independent
// KernelInterface.
class GpuKernel : public internal::KernelInterface {
public:
GpuKernel()
: gpu_function_(nullptr),
arity_(0),
preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
// Note that the function is unloaded when the module is unloaded, and the
// module that the function is contained in is owned by the GpuExecutor.
~GpuKernel() override {}
// As arity cannot be reflected upon using the CUDA API, the arity is
// explicitly set during the GpuExecutor::GetKernel initialization process.
void set_arity(unsigned arity) { arity_ = arity; }
unsigned Arity() const override { return arity_; }
// Returns the GpuFunctionHandle value for passing to the CUDA API.
GpuFunctionHandle AsGpuFunctionHandle() const {
DCHECK(gpu_function_ != nullptr);
return const_cast<GpuFunctionHandle>(gpu_function_);
}
// Returns the slot that the GpuFunctionHandle is stored within for this
// object, for the CUDA API which wants to load into a GpuFunctionHandle*.
GpuFunctionHandle* gpu_function_ptr() { return &gpu_function_; }
// CUDA supports setting the preferred cache configuration of a
// GpuFunctionHandle (more-or-less equivalent to a GpuKernel). We support this
// via the below functions; users can set a preference, and that is applied
// when the kernel is [lazy-]loaded (in GpuExecutor::Launch). The alternative
// would be to load the kernel & set the preference when the user calls the
// setter below; either approach is valid.
// Sets the current kernel cache configuration preference.
void SetPreferredCacheConfig(KernelCacheConfig config) override {
preferred_cache_config_ = config;
}
// Returns the current kernel cache configuration preference.
KernelCacheConfig GetPreferredCacheConfig() const override {
return preferred_cache_config_;
}
// Returns the current kernel cache configuration preference as a
// CUfunc_cache.
GpuFuncCachePreference GetGpuCacheConfig() const;
private:
GpuFunctionHandle gpu_function_; // Wrapped CUDA kernel handle.
unsigned arity_; // Number of formal parameters the kernel takes.
// Preferred (but not required) cache configuration for this kernel.
KernelCacheConfig preferred_cache_config_;
};
// Given a platform-independent kernel datatype, returns the (const) internal
// CUDA platform implementation pointer.
inline const GpuKernel* AsGpuKernel(const KernelBase* kernel) {
return static_cast<const GpuKernel*>(kernel->implementation());
}
// Given a platform-independent kernel datatype, returns the (non-const)
// internal CUDA platform implementation pointer.
inline GpuKernel* AsGpuKernel(KernelBase* kernel) {
return static_cast<GpuKernel*>(kernel->implementation());
}
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
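
An illustrative sketch of the lazy cache-config flow described above: the preference is recorded on the platform-independent KernelBase and only applied to the wrapped handle when GpuExecutor launches the kernel. The helper name is hypothetical.

// Hypothetical sketch; assumes `kernel` was loaded via GpuExecutor::GetKernel().
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/kernel.h"
namespace stream_executor {
namespace gpu {
GpuFuncCachePreference PreferSharedMemory(KernelBase* kernel) {
  // Record the preference; it is applied lazily at launch time, not here.
  kernel->SetPreferredCacheConfig(KernelCacheConfig::kPreferShared);
  // Inside the CUDA/ROCm launch path the implementation is recovered like this
  // and its cache preference handed to the driver API.
  return AsGpuKernel(kernel)->GetGpuCacheConfig();
}
}  // namespace gpu
}  // namespace stream_executor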

View File

@ -0,0 +1,125 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rng.h"
#include "tensorflow/stream_executor/gpu/gpu_types.h"
namespace stream_executor {
class Stream;
template <typename ElemT>
class DeviceMemory;
namespace gpu {
// Opaque and unique identifier for the GPU RNG plugin.
extern const PluginId kGpuRandPlugin;
class GpuExecutor;
// GPU-platform implementation of the random number generation support
// interface.
//
// Thread-safe post-initialization.
class GpuRng : public rng::RngSupport {
public:
explicit GpuRng(GpuExecutor* parent);
// Retrieves a gpu rng library generator handle. This is necessary for
// enqueuing random number generation work onto the device.
// TODO(leary) provide a way for users to select the RNG algorithm.
bool Init();
// Releases a gpu rng library generator handle, if one was acquired.
~GpuRng() override;
// See rng::RngSupport for details on the following overrides.
bool DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) override;
bool DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) override;
bool DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<float>>* v) override;
bool DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<double>>* v) override;
bool DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
DeviceMemory<float>* v) override;
bool DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
DeviceMemory<double>* v) override;
bool SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) override;
private:
// Actually performs the work of generating random numbers - the public
// methods are thin wrappers to this interface.
template <typename T>
bool DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v);
template <typename ElemT, typename FuncT>
bool DoPopulateRandGaussianInternal(Stream* stream, ElemT mean, ElemT stddev,
DeviceMemory<ElemT>* v, FuncT func);
// Sets the stream for the internal gpu rng generator.
//
// This is a stateful operation, as the handle can only have one stream set at
// a given time, so it is usually performed right before enqueuing work to do
// with random number generation.
bool SetStream(Stream* stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
// mutex that guards the gpu rng library handle for this device.
mutex mu_;
// GpuExecutor which instantiated this GpuRng.
// Immutable post-initialization.
GpuExecutor* parent_;
// gpu rng library handle on the device.
GpuRngHandle rng_ GUARDED_BY(mu_);
SE_DISALLOW_COPY_AND_ASSIGN(GpuRng);
};
template <typename T>
string TypeString();
template <>
string TypeString<float>() {
return "float";
}
template <>
string TypeString<double>() {
return "double";
}
template <>
string TypeString<std::complex<float>>() {
return "std::complex<float>";
}
template <>
string TypeString<std::complex<double>>() {
return "std::complex<double>";
}
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
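
An illustrative sketch of the flow above: Init() acquires the library generator handle, SetSeed seeds it, and the population calls bind the stream internally before enqueuing generation. The helper and its seed value are hypothetical.

// Hypothetical sketch; assumes an initialized GpuExecutor, a valid Stream, and
// an allocated DeviceMemory<float> destination buffer.
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/gpu/gpu_rng.h"
namespace stream_executor {
namespace gpu {
bool FillUniform(GpuExecutor* executor, Stream* stream,
                 DeviceMemory<float>* out) {
  GpuRng rng(executor);
  if (!rng.Init()) {
    return false;  // could not acquire a curand/hiprand generator handle
  }
  uint64 seed = 42;  // illustrative; callers should pick a real seed source
  if (!rng.SetSeed(stream, reinterpret_cast<const uint8*>(&seed),
                   sizeof(seed))) {
    return false;
  }
  return rng.DoPopulateRandUniform(stream, out);
}
}  // namespace gpu
}  // namespace stream_executor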

View File

@ -1,4 +1,4 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,49 +13,49 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/stream.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
bool CUDAStream::Init() {
if (!CUDADriver::CreateStream(parent_->cuda_context(), &cuda_stream_)) {
bool GpuStream::Init() {
if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) {
return false;
}
return CUDADriver::CreateEvent(parent_->cuda_context(), &completed_event_,
CUDADriver::EventFlags::kDisableTiming)
return GpuDriver::CreateEvent(parent_->gpu_context(), &completed_event_,
GpuDriver::EventFlags::kDisableTiming)
.ok();
}
void CUDAStream::Destroy() {
void GpuStream::Destroy() {
if (completed_event_ != nullptr) {
port::Status status =
CUDADriver::DestroyEvent(parent_->cuda_context(), &completed_event_);
GpuDriver::DestroyEvent(parent_->gpu_context(), &completed_event_);
if (!status.ok()) {
LOG(ERROR) << status.error_message();
}
}
CUDADriver::DestroyStream(parent_->cuda_context(), &cuda_stream_);
GpuDriver::DestroyStream(parent_->gpu_context(), &gpu_stream_);
}
bool CUDAStream::IsIdle() const {
return CUDADriver::IsStreamIdle(parent_->cuda_context(), cuda_stream_);
bool GpuStream::IsIdle() const {
return GpuDriver::IsStreamIdle(parent_->gpu_context(), gpu_stream_);
}
CUDAStream *AsCUDAStream(Stream *stream) {
GpuStream* AsGpuStream(Stream* stream) {
DCHECK(stream != nullptr);
return static_cast<CUDAStream *>(stream->implementation());
return static_cast<GpuStream*>(stream->implementation());
}
CUstream AsCUDAStreamValue(Stream *stream) {
GpuStreamHandle AsGpuStreamValue(Stream* stream) {
DCHECK(stream != nullptr);
return AsCUDAStream(stream)->cuda_stream();
return AsGpuStream(stream)->gpu_stream();
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,96 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Defines the GpuStream type - the CUDA-specific implementation of the generic
// StreamExecutor Stream interface.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace gpu {
class GpuExecutor;
// Wraps a GpuStreamHandle in order to satisfy the platform-independent
// StreamInterface.
//
// Thread-safe post-initialization.
class GpuStream : public internal::StreamInterface {
public:
explicit GpuStream(GpuExecutor* parent)
: parent_(parent), gpu_stream_(nullptr), completed_event_(nullptr) {}
// Note: teardown is handled by a parent's call to DeallocateStream.
~GpuStream() override {}
void* GpuStreamHack() override { return gpu_stream_; }
void** GpuStreamMemberHack() override {
return reinterpret_cast<void**>(&gpu_stream_);
}
// Explicitly initialize the CUDA resources associated with this stream, used
// by StreamExecutor::AllocateStream().
bool Init();
// Explicitly destroy the CUDA resources associated with this stream, used by
// StreamExecutor::DeallocateStream().
void Destroy();
// Returns true if no work is pending or executing on the stream.
bool IsIdle() const;
// Retrieves an event which indicates that all work enqueued into the stream
// has completed. Ownership of the event is not transferred to the caller, the
// event is owned by this stream.
GpuEventHandle* completed_event() { return &completed_event_; }
// Returns the GpuStreamHandle value for passing to the CUDA API.
//
// Precond: this GpuStream has been allocated (otherwise passing a nullptr
// into the NVIDIA library causes difficult-to-understand faults).
GpuStreamHandle gpu_stream() const {
DCHECK(gpu_stream_ != nullptr);
return const_cast<GpuStreamHandle>(gpu_stream_);
}
// TODO(timshen): Migrate away and remove this function.
GpuStreamHandle cuda_stream() const { return gpu_stream(); }
GpuExecutor* parent() const { return parent_; }
private:
GpuExecutor* parent_; // Executor that spawned this stream.
GpuStreamHandle gpu_stream_; // Wrapped CUDA stream handle.
// Event that indicates this stream has completed.
GpuEventHandle completed_event_ = nullptr;
};
// Helper functions to simplify extremely common flows.
// Converts a Stream to the underlying GpuStream implementation.
GpuStream* AsGpuStream(Stream* stream);
// Extracts a GpuStreamHandle from a GpuStream-backed Stream object.
GpuStreamHandle AsGpuStreamValue(Stream* stream);
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
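
An illustrative sketch of the helper pair declared above: a platform-independent Stream is narrowed with AsGpuStream, and AsGpuStreamValue yields the native handle expected by driver or vendor-library calls. The callee is a placeholder.

// Hypothetical sketch; enqueue_vendor_work stands in for a library call that
// takes the raw GpuStreamHandle (e.g. a BLAS SetStream-style entry point).
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/stream.h"
namespace stream_executor {
namespace gpu {
bool EnqueueVendorWork(Stream* stream) {
  if (!AsGpuStream(stream)->IsIdle()) {
    VLOG(1) << "enqueueing behind work already pending on this stream";
  }
  // Precondition: the stream has been allocated; see gpu_stream() above.
  GpuStreamHandle handle = AsGpuStreamValue(stream);
  // return enqueue_vendor_work(handle) == 0;  // placeholder vendor call
  (void)handle;
  return true;
}
}  // namespace gpu
}  // namespace stream_executor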

View File

@ -1,4 +1,4 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,31 +13,31 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/status.h"
namespace stream_executor {
namespace cuda {
namespace gpu {
bool CUDATimer::Init() {
bool GpuTimer::Init() {
CHECK(start_event_ == nullptr && stop_event_ == nullptr);
CudaContext* context = parent_->cuda_context();
port::Status status = CUDADriver::CreateEvent(
context, &start_event_, CUDADriver::EventFlags::kDefault);
GpuContext* context = parent_->gpu_context();
port::Status status = GpuDriver::CreateEvent(context, &start_event_,
GpuDriver::EventFlags::kDefault);
if (!status.ok()) {
LOG(ERROR) << status;
return false;
}
status = CUDADriver::CreateEvent(context, &stop_event_,
CUDADriver::EventFlags::kDefault);
status = GpuDriver::CreateEvent(context, &stop_event_,
GpuDriver::EventFlags::kDefault);
if (!status.ok()) {
LOG(ERROR) << status;
status = CUDADriver::DestroyEvent(context, &start_event_);
status = GpuDriver::DestroyEvent(context, &start_event_);
if (!status.ok()) {
LOG(ERROR) << status;
}
@ -48,47 +48,46 @@ bool CUDATimer::Init() {
return true;
}
void CUDATimer::Destroy() {
CudaContext* context = parent_->cuda_context();
port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
void GpuTimer::Destroy() {
GpuContext* context = parent_->gpu_context();
port::Status status = GpuDriver::DestroyEvent(context, &start_event_);
if (!status.ok()) {
LOG(ERROR) << status;
}
status = CUDADriver::DestroyEvent(context, &stop_event_);
status = GpuDriver::DestroyEvent(context, &stop_event_);
if (!status.ok()) {
LOG(ERROR) << status;
}
}
float CUDATimer::GetElapsedMilliseconds() const {
float GpuTimer::GetElapsedMilliseconds() const {
CHECK(start_event_ != nullptr && stop_event_ != nullptr);
// TODO(leary) provide a way to query timer resolution?
// CUDA docs say a resolution of about 0.5us
float elapsed_milliseconds = NAN;
(void)CUDADriver::GetEventElapsedTime(parent_->cuda_context(),
&elapsed_milliseconds, start_event_,
stop_event_);
(void)GpuDriver::GetEventElapsedTime(
parent_->gpu_context(), &elapsed_milliseconds, start_event_, stop_event_);
return elapsed_milliseconds;
}
bool CUDATimer::Start(CUDAStream* stream) {
port::Status status = CUDADriver::RecordEvent(
parent_->cuda_context(), start_event_, stream->cuda_stream());
bool GpuTimer::Start(GpuStream* stream) {
port::Status status = GpuDriver::RecordEvent(
parent_->gpu_context(), start_event_, stream->gpu_stream());
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
bool CUDATimer::Stop(CUDAStream* stream) {
port::Status status = CUDADriver::RecordEvent(
parent_->cuda_context(), stop_event_, stream->cuda_stream());
bool GpuTimer::Stop(GpuStream* stream) {
port::Status status = GpuDriver::RecordEvent(
parent_->gpu_context(), stop_event_, stream->gpu_stream());
if (!status.ok()) {
LOG(ERROR) << status;
}
return status.ok();
}
} // namespace cuda
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,90 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Defines the GpuTimer type - the CUDA-specific implementation of the generic
// StreamExecutor Timer interface.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace stream_executor {
namespace gpu {
class GpuExecutor;
class GpuStream;
// Wraps a pair of GpuEventHandles in order to satisfy the platform-independent
// TimerInterface -- both a start and a stop event are present which may be
// recorded in a stream.
class GpuTimer : public internal::TimerInterface {
public:
explicit GpuTimer(GpuExecutor* parent)
: parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
// Note: teardown needs to be explicitly handled in this API by a call to
// StreamExecutor::DeallocateTimer(), which invokes Destroy().
// TODO(csigg): Change to RAII.
~GpuTimer() override {}
// Allocates the platform-specific pieces of the timer, called as part of
// StreamExecutor::AllocateTimer().
bool Init();
// Deallocates the platform-specific pieces of the timer, called as part of
// StreamExecutor::DeallocateTimer().
void Destroy();
// Records the "timer start" event at the current point in the stream.
bool Start(GpuStream* stream);
// Records the "timer stop" event at the current point in the stream.
bool Stop(GpuStream* stream);
// Returns the elapsed time, in milliseconds, between the start and stop
// events.
float GetElapsedMilliseconds() const;
// See Timer::Microseconds().
// TODO(leary) make this into an error code interface...
uint64 Microseconds() const override {
return GetElapsedMilliseconds() * 1e3;
}
// See Timer::Nanoseconds().
uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
private:
GpuExecutor* parent_;
GpuEventHandle start_event_; // Event recorded to indicate the "start"
// timestamp executing in a stream.
GpuEventHandle stop_event_; // Event recorded to indicate the "stop"
// timestamp executing in a stream.
};
struct GpuTimerDeleter {
void operator()(GpuTimer* t) {
t->Destroy();
delete t;
}
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
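
An illustrative sketch of GpuTimerDeleter in use: the timer is created and initialized explicitly, Start/Stop bracket the enqueued work, and Destroy() runs through the deleter. It assumes the caller synchronizes the stream before reading the elapsed time; the helper is hypothetical.

// Hypothetical sketch; assumes an initialized GpuExecutor and GpuStream, and
// that the caller blocks on the stream before trusting the returned value.
#include <memory>
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
namespace stream_executor {
namespace gpu {
float TimeEnqueuedWork(GpuExecutor* executor, GpuStream* stream) {
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer(new GpuTimer(executor));
  if (!timer->Init() || !timer->Start(stream)) {
    return -1.0f;
  }
  // ... enqueue the work to be measured on `stream` here ...
  if (!timer->Stop(stream)) {
    return -1.0f;
  }
  return timer->GetElapsedMilliseconds();  // meaningful once the stop event completes
}
}  // namespace gpu
}  // namespace stream_executor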

View File

@ -0,0 +1,84 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// GPU (ROCm / CUDA) specific type handle resolution
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
#if TENSORFLOW_USE_ROCM
#include "rocm/include/hip/hip_complex.h"
#include "rocm/include/hip/hip_runtime.h"
#include "rocm/include/hiprand/hiprand.h"
#else // CUDA
#include "cuda/include/cuComplex.h"
#include "cuda/include/cuda.h"
// cannot include curand.h here
// because it triggers the #error in cuda/cuda_gpu_executor.cc
// (because curand.h includes cuda_runtime.h)
// so explicitly adding the lone typedef we need from that file
typedef struct curandGenerator_st* curandGenerator_t;
#endif
namespace stream_executor {
namespace gpu {
#if TENSORFLOW_USE_ROCM
using GpuStreamHandle = hipStream_t;
using GpuEventHandle = hipEvent_t;
using GpuFunctionHandle = hipFunction_t;
using GpuFunctionAttribute = hipDeviceAttribute_t; // not a typo!
using GpuDeviceHandle = hipDevice_t;
using GpuDevicePtr = hipDeviceptr_t;
using GpuDeviceAttribute = hipDeviceAttribute_t;
using GpuDeviceProperty = hipDeviceProp_t;
using GpuModuleHandle = hipModule_t;
using GpuStatus = hipError_t;
using GpuFuncCachePreference = hipFuncCache_t;
using GpuSharedMemConfig = hipSharedMemConfig;
using GpuComplexType = hipComplex;
using GpuDoubleComplexType = hipDoubleComplex;
using GpuRngHandle = hiprandGenerator_t;
#else // CUDA
using GpuStreamHandle = CUstream;
using GpuEventHandle = CUevent;
using GpuFunctionHandle = CUfunction;
using GpuFunctionAttribute = CUfunction_attribute;
using GpuDeviceHandle = CUdevice;
using GpuDevicePtr = CUdeviceptr;
using GpuDeviceAttribute = CUdevice_attribute;
using GpuDeviceProperty = CUdevprop;
using GpuModuleHandle = CUmodule;
using GpuStatus = CUresult;
using GpuFuncCachePreference = CUfunc_cache;
using GpuSharedMemConfig = CUsharedconfig;
using GpuComplexType = cuComplex;
using GpuDoubleComplexType = cuDoubleComplex;
using GpuRngHandle = curandGenerator_t;
#endif
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
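
The aliases above are what let the shared gpu/ sources compile unchanged for either backend; a small illustrative helper (not in this change) showing the pattern, where only the literal status value differs per platform.

// Hypothetical sketch: code written against the Gpu* aliases stays
// platform-neutral, with backend divergence confined to #if blocks like the
// ones in this header.
#include "tensorflow/stream_executor/gpu/gpu_types.h"
namespace stream_executor {
namespace gpu {
inline bool IsGpuSuccess(GpuStatus status) {
#if TENSORFLOW_USE_ROCM
  return status == hipSuccess;    // GpuStatus is hipError_t
#else
  return status == CUDA_SUCCESS;  // GpuStatus is CUresult
#endif
}
}  // namespace gpu
}  // namespace stream_executor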

View File

@ -28,6 +28,8 @@ string PlatformKindString(PlatformKind kind) {
switch (kind) {
case PlatformKind::kCuda:
return "CUDA";
case PlatformKind::kROCm:
return "ROCm";
case PlatformKind::kOpenCL:
return "OpenCL";
case PlatformKind::kHost:
@ -52,6 +54,7 @@ PlatformKind PlatformKindFromString(string kind) {
bool PlatformIsRunnable(PlatformKind kind) {
switch (kind) {
case PlatformKind::kCuda:
case PlatformKind::kROCm:
case PlatformKind::kOpenCL:
case PlatformKind::kHost:
return true;
@ -63,6 +66,7 @@ bool PlatformIsRunnable(PlatformKind kind) {
bool PlatformIsRunnableOnDevice(PlatformKind kind) {
switch (kind) {
case PlatformKind::kCuda:
case PlatformKind::kROCm:
case PlatformKind::kOpenCL:
return true;
default:

View File

@ -40,6 +40,7 @@ class StreamExecutor;
enum class PlatformKind {
kInvalid,
kCuda,
kROCm,
kOpenCL,
kHost,
kMock,

View File

@ -0,0 +1,267 @@
# Description:
# ROCm-platform specific StreamExecutor support code.
licenses(["notice"]) # Apache 2.0
load("//tensorflow:tensorflow.bzl", "tf_cc_test")
load(
"//tensorflow/stream_executor:build_defs.bzl",
"stream_executor_friends",
)
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
package_group(
name = "friends",
packages = stream_executor_friends(),
)
package(
default_visibility = [":friends"],
)
# Filegroup used to collect source files for the dependency check.
filegroup(
name = "c_srcs",
data = glob([
"**/*.cc",
"**/*.h",
]),
)
cc_library(
name = "rocm_diagnostics",
srcs = if_rocm_is_configured(["rocm_diagnostics.cc"]),
hdrs = [],
deps = if_rocm_is_configured([
"@com_google_absl//absl/container:inlined_vector",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
)
cc_library(
name = "rocm_driver",
srcs = if_rocm_is_configured(["rocm_driver.cc"]),
hdrs = [],
deps = if_rocm_is_configured([
":rocm_diagnostics",
"@com_google_absl//absl/base",
"@com_google_absl//absl/container:inlined_vector",
"@com_google_absl//absl/strings",
"//tensorflow/stream_executor:device_options",
"//tensorflow/stream_executor/gpu:gpu_driver_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"@local_config_rocm//rocm:rocm_headers",
]),
)
cc_library(
name = "rocm_event",
srcs = if_rocm_is_configured(["rocm_event.cc"]),
hdrs = [],
deps = if_rocm_is_configured([
":rocm_driver",
"//tensorflow/stream_executor:stream_executor_headers",
"//tensorflow/stream_executor/gpu:gpu_event_header",
"//tensorflow/stream_executor/gpu:gpu_executor_header",
"//tensorflow/stream_executor/gpu:gpu_stream_header",
"//tensorflow/stream_executor/lib",
]),
)
cc_library(
name = "rocm_gpu_executor",
srcs = if_rocm_is_configured(["rocm_gpu_executor.cc"]),
hdrs = [],
deps = if_rocm_is_configured([
":rocm_diagnostics",
":rocm_driver",
":rocm_event",
":rocm_kernel",
":rocm_platform_id",
"@com_google_absl//absl/strings",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:plugin_registry",
"//tensorflow/stream_executor:stream_executor_internal",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor:timer",
"//tensorflow/stream_executor/gpu:gpu_activation_header",
"//tensorflow/stream_executor/gpu:gpu_event",
"//tensorflow/stream_executor/gpu:gpu_kernel_header",
"//tensorflow/stream_executor/gpu:gpu_stream",
"//tensorflow/stream_executor/gpu:gpu_timer",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
"//tensorflow/stream_executor/platform:dso_loader",
]),
alwayslink = True,
)
cc_library(
name = "rocm_kernel",
srcs = if_rocm_is_configured(["rocm_kernel.cc"]),
hdrs = [],
visibility = ["//visibility:public"],
deps = if_rocm_is_configured([
"//tensorflow/stream_executor/gpu:gpu_kernel_header",
]),
alwayslink = True,
)
cc_library(
name = "rocm_platform",
srcs = if_rocm_is_configured(["rocm_platform.cc"]),
hdrs = if_rocm_is_configured(["rocm_platform.h"]),
visibility = ["//visibility:public"],
deps = if_rocm_is_configured([
":rocm_driver",
":rocm_gpu_executor",
":rocm_platform_id",
"//tensorflow/stream_executor", # buildcleaner: keep
"//tensorflow/stream_executor:executor_cache",
"//tensorflow/stream_executor:multi_platform_manager",
"//tensorflow/stream_executor:stream_executor_pimpl_header",
"//tensorflow/stream_executor/lib",
"//tensorflow/stream_executor/platform",
]),
alwayslink = True, # Registers itself with the MultiPlatformManager.
)
cc_library(
name = "rocm_platform_id",
srcs = ["rocm_platform_id.cc"],
hdrs = ["rocm_platform_id.h"],
deps = ["//tensorflow/stream_executor:platform"],
)
# FIXME: enable in future PRs
#cc_library(
# name = "rocblas_plugin",
# srcs = ["rocm_blas.cc"],
# hdrs = ["rocm_blas.h"],
# visibility = ["//visibility:public"],
# deps = [
# ":rocm_gpu_executor",
# ":rocm_platform_id",
# "//third_party/eigen3",
# "//tensorflow/core:lib_internal",
# "//tensorflow/stream_executor",
# "//tensorflow/stream_executor:event",
# "//tensorflow/stream_executor:host_or_device_scalar",
# "//tensorflow/stream_executor:plugin_registry",
# "//tensorflow/stream_executor:scratch_allocator",
# "//tensorflow/stream_executor:timer",
# "//tenosrflow/stream_executor/gpu:gpu_activation_header",
# "//tenosrflow/stream_executor/gpu:gpu_stream_header",
# "//tenosrflow/stream_executor/gpu:gpu_timer_header",
# "//tensorflow/stream_executor/lib",
# "//tensorflow/stream_executor/platform",
# "//tensorflow/stream_executor/platform:dso_loader",
# "@com_google_absl//absl/strings",
# "@local_config_rocm//rocm:rocm_headers",
# ] + if_static(["@local_config_rocm//rocm:rocblas"]),
# alwayslink = True,
#)
# FIXME: enable in future PRs
#cc_library(
# name = "rocfft_plugin",
# srcs = ["rocm_fft.cc"],
# hdrs = [],
# visibility = ["//visibility:public"],
# deps = [
# ":rocm_platform_id",
# "//tensorflow/stream_executor:event",
# "//tensorflow/stream_executor:fft",
# "//tensorflow/stream_executor:plugin_registry",
# "//tensorflow/stream_executor:scratch_allocator",
# "//tenosrflow/stream_executor/gpu:gpu_stream_header",
# "//tensorflow/stream_executor/lib",
# "//tensorflow/stream_executor/platform",
# "//tensorflow/stream_executor/platform:dso_loader",
# "@local_config_rocm//rocm:rocm_headers",
# ] + if_static(["@local_config_rocm//rocm:rocfft"]),
# alwayslink = True,
#)
# FIXME: enable in future PRs
#cc_library(
# name = "miopen_plugin",
# srcs = ["rocm_dnn.cc"],
# hdrs = [],
# copts = [
# # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
# # setting of template depth 256
# "-ftemplate-depth-512",
# ],
# visibility = ["//visibility:public"],
# deps = [
# ":rocm_diagnostics",
# ":rocm_driver",
# ":rocm_gpu_executor",
# ":rocm_platform_id",
# "//third_party/eigen3",
# "//tensorflow/core:lib",
# "//tensorflow/core:lib_internal",
# "//tensorflow/core:logger",
# "//tensorflow/stream_executor:dnn",
# "//tensorflow/stream_executor:event",
# "//tensorflow/stream_executor:logging_proto_cc",
# "//tensorflow/stream_executor:plugin_registry",
# "//tensorflow/stream_executor:scratch_allocator",
# "//tensorflow/stream_executor:stream_executor_pimpl_header",
# "//tensorflow/stream_executor:temporary_device_memory",
# "//tenosrflow/stream_executor/gpu:gpu_activation_header",
# "//tenosrflow/stream_executor/gpu:gpu_stream_header",
# "//tenosrflow/stream_executor/gpu:gpu_timer_header",
# "//tensorflow/stream_executor/lib",
# "//tensorflow/stream_executor/platform",
# "//tensorflow/stream_executor/platform:dso_loader",
# "@com_google_absl//absl/strings",
# "@local_config_rocm//rocm:rocm_headers",
# ] + tf_additional_miopen_plugin_deps() + if_static(["@local_config_rocm//rocm:miopen"]),
# alwayslink = True,
#)
# FIXME: enable in future PRs
#cc_library(
# name = "rocrand_plugin",
# srcs = ["rocm_rng.cc"],
# hdrs = [],
# deps = [
# ":rocm_gpu_executor",
# ":rocm_platform_id",
# "@local_config_rocm//rocm:rocm_headers",
# "//tensorflow/stream_executor:event",
# "//tensorflow/stream_executor:plugin_registry",
# "//tensorflow/stream_executor:rng",
# "//tenosrflow/stream_executor/gpu:gpu_activation_header",
# "//tenosrflow/stream_executor/gpu:gpu_stream_header",
# "//tensorflow/stream_executor/lib",
# "//tensorflow/stream_executor/platform",
# "//tensorflow/stream_executor/platform:dso_loader",
# ] + if_static(["@local_config_rocm//rocm:curand"]),
# alwayslink = True,
#)
cc_library(
name = "all_runtime",
copts = tf_copts(),
visibility = ["//visibility:public"],
deps = if_rocm_is_configured([
# FIXME: enable in future PRs
#":miopen_plugin",
#":rocfft_plugin",
#":rocblas_plugin",
#":rocrand_plugin",
":rocm_driver",
":rocm_platform",
]),
alwayslink = 1,
)
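
Since :rocm_platform is alwayslink and registers itself with the MultiPlatformManager, a binary that depends on it (or on :all_runtime) can look the platform up at run time. A hedged C++ sketch; the platform name string "ROCM" is an assumption here, not taken from this change.

// Hypothetical sketch; assumes the binary links :rocm_platform so that static
// registration with MultiPlatformManager has run, and that the platform is
// registered under the name "ROCM".
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
stream_executor::port::StatusOr<stream_executor::StreamExecutor*>
GetRocmExecutor(int device_ordinal) {
  auto platform = stream_executor::MultiPlatformManager::PlatformWithName("ROCM");
  if (!platform.ok()) {
    return platform.status();
  }
  return platform.ValueOrDie()->ExecutorForDevice(device_ordinal);
}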

View File

@ -0,0 +1,234 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <dirent.h>
#include <limits.h>
#include <link.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysmacros.h>
#include <unistd.h>
#include <algorithm>
#include <memory>
#include <vector>
#include "absl/container/inlined_vector.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/str_util.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/platform/logging.h"
namespace stream_executor {
namespace gpu {
string DriverVersionToString(DriverVersion version) {
return absl::StrFormat("%d.%d.%d", std::get<0>(version), std::get<1>(version),
std::get<2>(version));
}
string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
if (!version.ok()) {
return version.status().ToString();
}
return DriverVersionToString(version.ValueOrDie());
}
port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
std::vector<string> pieces = port::Split(value, '.');
if (pieces.size() != 2 && pieces.size() != 3) {
return port::Status{port::error::INVALID_ARGUMENT,
absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
"for driver version; got \"%s\"",
value.c_str())};
}
int major;
int minor;
int patch = 0;
if (!port::safe_strto32(pieces[0], &major)) {
return port::Status{
port::error::INVALID_ARGUMENT,
absl::StrFormat("could not parse major version number \"%s\" as an "
"integer from string \"%s\"",
pieces[0].c_str(), value.c_str())};
}
if (!port::safe_strto32(pieces[1], &minor)) {
return port::Status{
port::error::INVALID_ARGUMENT,
absl::StrFormat("could not parse minor version number \"%s\" as an "
"integer from string \"%s\"",
pieces[1].c_str(), value.c_str())};
}
if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
return port::Status{
port::error::INVALID_ARGUMENT,
absl::StrFormat("could not parse patch version number \"%s\" as an "
"integer from string \"%s\"",
pieces[2].c_str(), value.c_str())};
}
DriverVersion result{major, minor, patch};
VLOG(2) << "version string \"" << value << "\" made value "
<< DriverVersionToString(result);
return result;
}
// -- class Diagnostician
string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
return absl::StrCat("/dev/kfd", dev_node_ordinal);
}
void Diagnostician::LogDiagnosticInformation() {
LOG(INFO) << "retrieving ROCM diagnostic information for host: "
<< port::Hostname();
LogDriverVersionInformation();
}
/* static */ void Diagnostician::LogDriverVersionInformation() {
LOG(INFO) << "hostname: " << port::Hostname();
if (VLOG_IS_ON(1)) {
const char* value = getenv("LD_LIBRARY_PATH");
string library_path = value == nullptr ? "" : value;
VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
std::vector<string> pieces = port::Split(library_path, ':');
for (const auto& piece : pieces) {
if (piece.empty()) {
continue;
}
DIR* dir = opendir(piece.c_str());
if (dir == nullptr) {
VLOG(1) << "could not open \"" << piece << "\"";
continue;
}
while (dirent* entity = readdir(dir)) {
VLOG(1) << piece << " :: " << entity->d_name;
}
closedir(dir);
}
}
port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
LOG(INFO) << "librocm reported version is: "
<< DriverVersionStatusToString(dso_version);
port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
LOG(INFO) << "kernel reported version is: "
<< DriverVersionStatusToString(kernel_version);
if (kernel_version.ok() && dso_version.ok()) {
WarnOnDsoKernelMismatch(dso_version, kernel_version);
}
}
// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
// driver-interfacing DSO version number. Returns it as a DriverVersion.
port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
port::StatusOr<DriverVersion> result{port::Status{
port::error::NOT_FOUND,
"was unable to find librocm.so DSO loaded into this program"}};
// Callback used when iterating through DSOs. Looks for the driver-interfacing
// DSO and yields its version number into the callback data, when found.
auto iterate_phdr = [](struct dl_phdr_info* info, size_t size,
void* data) -> int {
if (strstr(info->dlpi_name, "librocm.so.1")) {
VLOG(1) << "found DLL info with name: " << info->dlpi_name;
char resolved_path[PATH_MAX] = {0};
if (realpath(info->dlpi_name, resolved_path) == nullptr) {
return 0;
}
VLOG(1) << "found DLL info with resolved path: " << resolved_path;
const char* slash = rindex(resolved_path, '/');
if (slash == nullptr) {
return 0;
}
const char* so_suffix = ".so.";
const char* dot = strstr(slash, so_suffix);
if (dot == nullptr) {
return 0;
}
string dso_version = dot + strlen(so_suffix);
// TODO(b/22689637): Eliminate the explicit namespace if possible.
auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
*result = StringToDriverVersion(stripped_dso_version);
return 1;
}
return 0;
};
dl_iterate_phdr(iterate_phdr, &result);
return result;
}
port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
const string& driver_version_file_contents) {
static const char* kDriverFilePrelude = "Kernel Module ";
size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
if (offset == string::npos) {
return port::Status{
port::error::NOT_FOUND,
absl::StrCat("could not find kernel module information in "
"driver version file contents: \"",
driver_version_file_contents, "\"")};
}
string version_and_rest = driver_version_file_contents.substr(
offset + strlen(kDriverFilePrelude), string::npos);
size_t space_index = version_and_rest.find(" ");
auto kernel_version = version_and_rest.substr(0, space_index);
// TODO(b/22689637): Eliminate the explicit namespace if possible.
auto stripped_kernel_version =
port::StripSuffixString(kernel_version, ".ld64");
return StringToDriverVersion(stripped_kernel_version);
}
void Diagnostician::WarnOnDsoKernelMismatch(
port::StatusOr<DriverVersion> dso_version,
port::StatusOr<DriverVersion> kernel_version) {
if (kernel_version.ok() && dso_version.ok() &&
dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
LOG(INFO) << "kernel version seems to match DSO: "
<< DriverVersionToString(kernel_version.ValueOrDie());
} else {
LOG(ERROR) << "kernel version "
<< DriverVersionStatusToString(kernel_version)
<< " does not match DSO version "
<< DriverVersionStatusToString(dso_version)
<< " -- cannot find working devices in this configuration";
}
}
port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
auto status = port::Status{port::error::UNIMPLEMENTED,
"kernel reported driver version not implemented"};
return status;
}
} // namespace gpu
} // namespace stream_executor
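
An illustrative round trip through the parsing helpers above; the version string is made up.

// Hypothetical sketch of the version helpers defined in this file.
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/platform/logging.h"
void LogParsedDriverVersion() {
  namespace gpu = stream_executor::gpu;
  // Accepts "major.minor" or "major.minor.patch"; a missing patch defaults to 0.
  auto version = gpu::StringToDriverVersion("2.6.0");
  if (version.ok()) {
    LOG(INFO) << "parsed: " << gpu::DriverVersionToString(version.ValueOrDie());
  } else {
    LOG(ERROR) << version.status();
  }
}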

File diff suppressed because it is too large

View File

@ -0,0 +1,46 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/statusor.h"
namespace stream_executor {
namespace gpu {
Event::Status GpuEvent::PollForStatus() {
port::StatusOr<hipError_t> status =
GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
if (!status.ok()) {
LOG(ERROR) << "Error polling for event status: "
<< status.status().error_message();
return Event::Status::kError;
}
switch (status.ValueOrDie()) {
case hipSuccess:
return Event::Status::kComplete;
case hipErrorNotReady:
return Event::Status::kPending;
default:
LOG(INFO) << "Error condition returned for event status: "
<< status.ValueOrDie();
return Event::Status::kError;
}
}
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,976 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <unistd.h>
#include "absl/base/casts.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/ptr_util.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/lib/str_util.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"
#ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
#error \
"No driver calls in this file, wrap driver functionality in rocm_driver.cc."
#endif
#ifdef __ROCM_RUNTIME_H__
#error \
"ROCM runtime being included into ROCM GPU executor; should be driver only."
#endif
namespace stream_executor {
namespace gpu {
static GpuEvent* AsGpuEvent(Event* event) {
DCHECK(event != nullptr);
return static_cast<GpuEvent*>(event->implementation());
}
// Given a platform-independent timer datatype, returns the internal ROCM
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
DCHECK(timer != nullptr);
return static_cast<GpuTimer*>(timer->implementation());
}
// Given const GPU memory, returns a librocm device pointer datatype, suitable
// for passing directly to librocm APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// librocm APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to librocm functions which will honor constness.
static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
}
// See description on const version above.
static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
return AsROCmDevicePtr(*gpu_mem);
}
static GpuContext* GetGpuContext(Stream* stream) {
return static_cast<GpuExecutor*>(stream->parent()->implementation())
->gpu_context();
}
GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
CHECK(rocm_exec != nullptr);
return rocm_exec->gpu_context();
}
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
return static_cast<GpuExecutor*>(stream_exec->implementation());
}
GpuExecutor::~GpuExecutor() {
for (auto& it : disk_modules_) {
GpuDriver::UnloadModule(context_, it.second);
}
for (auto& it : in_memory_modules_) {
GpuDriver::UnloadModule(context_, it.second);
}
if (context_ != nullptr) {
GpuDriver::DestroyContext(context_);
}
CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
}
bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
mutex_lock lock{in_memory_modules_mu_};
return UnloadGpuBinary(gpu_binary);
}
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
auto module_it = gpu_binary_to_module_.find(gpu_binary);
if (gpu_binary_to_module_.end() == module_it) {
VLOG(3) << "No loaded HSACO module for " << gpu_binary;
return false;
}
auto& module = module_it->second.first;
auto& refcount = module_it->second.second;
VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
if (--refcount == 0) {
VLOG(3) << "Unloading HSACO module " << module;
GpuDriver::UnloadModule(context_, module);
gpu_binary_to_module_.erase(module_it);
}
return true;
}
void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
LOG(FATAL) << "Feature not supported on ROCM platform (UnloadKernel)";
}
port::Status GpuExecutor::Init(int device_ordinal,
DeviceOptions device_options) {
device_ordinal_ = device_ordinal;
auto status = GpuDriver::Init();
if (!status.ok()) {
return status;
}
status = GpuDriver::GetDevice(device_ordinal_, &device_);
if (!status.ok()) {
return status;
}
status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
&context_);
if (!status.ok()) {
return status;
}
return GpuDriver::GetGpuISAVersion(&version_, device_);
}
bool GpuExecutor::FindOnDiskForComputeCapability(
absl::string_view filename, absl::string_view canonical_suffix,
string* found_filename) const {
LOG(FATAL) << "Feature not supported on ROCM platform "
"(FindOnDiskForComputeCapability)";
return false;
}
bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
absl::string_view canonical_suffix,
string* found_filename) const {
if (version_ == 0) {
return false;
}
string cc_specific =
absl::StrCat(filename, ".cc", version_, canonical_suffix);
if (port::FileExists(cc_specific).ok()) {
VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
<< cc_specific;
*found_filename = cc_specific;
return true;
}
VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
<< cc_specific;
if (port::FileExists(string(filename)).ok()) {
*found_filename = string(filename);
return true;
}
return false;
}
// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
// returned string. Example: calling this from /usr/bin/foo
// would return /usr/bin.
static string GetBinaryDir(bool strip_exe) {
char exe_path[PATH_MAX] = {0};
CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
// Make sure it's null-terminated:
exe_path[sizeof(exe_path) - 1] = 0;
if (strip_exe) {
// The exe is the last component of the path, so remove one component.
string ret = exe_path;
std::vector<string> components = port::Split(exe_path, '/');
components.pop_back();
return port::Join(components, "/");
}
return exe_path;
}
bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
KernelBase* kernel) {
GpuKernel* rocm_kernel = AsGpuKernel(kernel);
hipModule_t module = nullptr;
const string* kernelname;
const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
bool has_cubin = spec.has_cuda_cubin_on_disk();
if (has_cubin) {
on_disk_spec = &spec.cuda_cubin_on_disk();
}
if (on_disk_spec != nullptr) {
LOG(WARNING) << "loading ROCM kernel from disk is not supported";
return false;
} else if (spec.has_cuda_cubin_in_memory()) {
kernelname = &spec.cuda_cubin_in_memory().kernelname();
const char* hsaco = spec.cuda_cubin_in_memory().bytes();
mutex_lock lock{in_memory_modules_mu_};
module = in_memory_modules_[hsaco];
if (module == nullptr) {
if (!GpuDriver::LoadHsaco(context_, hsaco, &module)) {
LOG(ERROR) << "failed to load HSACO\n";
return false;
}
in_memory_modules_[hsaco] = module;
}
} else {
LOG(WARNING) << "no method of loading ROCM kernel provided";
return false;
}
VLOG(2) << "getting function " << *kernelname << " from module " << module;
if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
rocm_kernel->gpu_function_ptr())) {
return false;
}
// We have to trust the kernel loader spec arity because there doesn't appear
// to be a way to reflect on the number of expected arguments w/the ROCM API.
rocm_kernel->set_arity(spec.arity());
KernelMetadata kernel_metadata;
if (!GetKernelMetadata(rocm_kernel, &kernel_metadata)) {
LOG(WARNING) << "Unable to get metadata for kernel " << kernelname;
}
kernel->set_metadata(kernel_metadata);
kernel->set_name(*kernelname);
return true;
}
bool GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
KernelMetadata* kernel_metadata) {
int value = 0;
// TODO(ROCm) implement this feature in HIP
kernel_metadata->set_registers_per_thread(value);
// TODO(ROCm) implement this feature in HIP
kernel_metadata->set_shared_memory_bytes(value);
return true;
}
bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
const BlockDim& block_dims, const KernelBase& kernel,
const KernelArgsArrayBase& args) {
CHECK_EQ(kernel.Arity(), args.number_of_arguments());
GpuStreamHandle hipstream = AsGpuStreamValue(stream);
const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();
// Only perform/print the occupancy check once. Even just checking to see
// whether we've done an occupancy check on this kernel before isn't free
// (because we have to synchronize), so we only do this at -v 2+.
if (VLOG_IS_ON(2)) {
mutex_lock lock(launched_kernels_mu_);
if (!launched_kernels_.count(hipfunc)) {
VlogOccupancyInfo(kernel, thread_dims, block_dims);
// TODO(rspringer): Remove elements from launched_kernels_...if we ever
// expose a kernel/module deallocation method.
launched_kernels_.insert(hipfunc);
}
}
if (rocm_kernel->GetPreferredCacheConfig() !=
KernelCacheConfig::kNoPreference) {
GpuDriver::FuncSetCacheConfig(hipfunc, rocm_kernel->GetGpuCacheConfig());
}
  // Prepare kernargs.
  // KernelArgsArrayBase keeps pointers to the arguments; dereference them here.
std::vector<void*> kernargs;
KernelArgIterator iter = args.arg_iterator();
while (iter.has_next()) {
KernelArg arg = iter.next();
VLOG(2) << "*(arg.address): "
<< reinterpret_cast<void*>(
*static_cast<const uint64_t*>(arg.address));
kernargs.push_back(
reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
}
size_t size = sizeof(void*) * kernargs.size();
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
if (!GpuDriver::LaunchKernel(
GetGpuContext(stream), hipfunc, block_dims.x, block_dims.y,
block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config)) {
LOG(ERROR) << "failed to launch ROCM kernel with args: "
<< args.number_of_arguments()
<< "; thread dim: " << thread_dims.ToString()
<< "; block dim: " << block_dims.ToString();
return false;
}
return true;
}
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims,
GpuFunctionHandle func) {
LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
return 0;
}
int GpuExecutor::CompareOccupancy(int* initial_blocks,
const DeviceDescription& device_description,
uint64 registers_per_thread,
uint64 shared_memory_per_block,
const ThreadDim& thread_dims,
GpuFunctionHandle func) {
LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
return 0;
}
bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
ModuleHandle* module_handle) {
// In GpuExecutor we store the pointer to the HSACO binary as
// ModuleHandle::id().
hipModule_t hip_module = nullptr;
// TODO(ROCm): Need generic term instead of cubin/cuda/ptx
if (spec.has_cuda_cubin_in_memory()) {
mutex_lock lock{in_memory_modules_mu_};
if (!LoadModuleFromHsaco(
reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
&hip_module)) {
return false;
}
*module_handle = ModuleHandle(const_cast<void*>(
static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
return true;
} else {
LOG(ERROR) << "No HSACO binary found \n";
return false;
}
}
bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, hipModule_t* module) {
LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
return false;
}
bool GpuExecutor::LoadModuleFromPtx(const char* ptx, hipModule_t* module) {
LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
return false;
}
bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, hipModule_t* module) {
uint64_t module_refcount;
std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];
if (*module == nullptr) {
if (!GpuDriver::LoadHsaco(context_, hsaco, module)) {
LOG(ERROR) << "failed to load : HSACO \n";
return false;
}
module_refcount = 1;
VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
<< " as module " << *module;
} else {
++module_refcount;
VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
<< " is already loaded as module " << *module;
}
gpu_binary_to_module_[hsaco] = {*module, module_refcount};
return true;
}
// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
const ThreadDim& thread_dims,
const BlockDim& block_dims) {
// TODO(ROCm) implement this feature in HIP
}
void* GpuExecutor::Allocate(uint64 size) {
return GpuDriver::DeviceAllocate(context_, size);
}
void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
uint64 size_bytes) {
// offset and size are in bytes, so char* works as the pointer type.
return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}
void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
// ROCM "sub-buffers" are just pointer + offset, so no dealloc is necessary.
if (!mem->is_sub_buffer()) {
GpuDriver::DeviceDeallocate(context_, mem->opaque());
}
}
bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
if (location == nullptr || size == 0) {
LOG(WARNING) << "attempting to register null or zero-sized memory: "
<< location << "; size " << size;
}
VLOG(2) << "registering " << location << " size " << size;
return GpuDriver::HostRegister(context_, location, size);
}
bool GpuExecutor::HostMemoryUnregister(void* location) {
VLOG(2) << "unregistering " << location;
return GpuDriver::HostUnregister(context_, location);
}
bool GpuExecutor::SynchronizeAllActivity() {
return GpuDriver::SynchronizeContext(context_);
}
bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
return GpuDriver::SynchronousMemsetUint32(
context_, AsROCmDevicePtr(location), 0x0, size / 4);
}
return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
0x0, size);
}
bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
// hipMemset reinterprets "value" as a uint8.
uint8 byte_value = static_cast<uint8>(value);
uint32 pattern = (byte_value << 24) | (byte_value << 16) |
(byte_value << 8) | byte_value;
return GpuDriver::SynchronousMemsetUint32(
context_, AsROCmDevicePtr(location), pattern, size / 4);
}
return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
value, size);
}
port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) {
return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
host_src, size);
}
port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) {
return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
AsROCmDevicePtr(gpu_src), size);
}
port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
AsROCmDevicePtr(gpu_src), size);
}
bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
uint64 size) {
if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0) {
return Memset32(stream, location, 0x0, size);
} else {
return Memset(stream, location, 0x0, size);
}
}
bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
uint8 pattern, uint64 size) {
VLOG(2) << "enqueueing memset8 operation onto stream " << stream
<< " at location " << location << " with size " << size
<< " and pattern " << std::hex << pattern;
return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
pattern, size,
AsGpuStreamValue(stream));
}
bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
uint32 pattern, uint64 size) {
VLOG(2) << "enqueueing memset32 operation onto stream " << stream
<< " at location " << location << " with size " << size
<< " and pattern " << std::hex << pattern;
CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
size % 4 == 0);
return GpuDriver::AsynchronousMemsetUint32(
context_, AsROCmDevicePtr(location), pattern, size / 4,
AsGpuStreamValue(stream));
}
bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
const DeviceMemoryBase& gpu_src, uint64 size) {
return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
AsROCmDevicePtr(gpu_src), size,
AsGpuStreamValue(stream));
}
bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
const void* host_src, uint64 size) {
return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
host_src, size,
AsGpuStreamValue(stream));
}
bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
DeviceMemoryBase* gpu_dst,
const DeviceMemoryBase& gpu_src,
uint64 size) {
return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
AsROCmDevicePtr(gpu_src), size,
AsGpuStreamValue(stream));
}
bool GpuExecutor::HostCallback(Stream* stream,
std::function<port::Status()> callback) {
auto callback_ptr = new std::function<void()>([callback]() {
port::Status s = callback();
if (!s.ok()) {
LOG(WARNING) << "Host callback failed: " << s;
}
});
return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
InternalHostCallback, callback_ptr);
}
/* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
hipError_t status,
void* data) {
std::function<void()>* callback =
reinterpret_cast<std::function<void()>*>(data);
(*callback)();
delete callback;
}
port::Status GpuExecutor::AllocateEvent(Event* event) {
return AsGpuEvent(event)->Init();
}
port::Status GpuExecutor::DeallocateEvent(Event* event) {
return AsGpuEvent(event)->Destroy();
}
port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
return AsGpuEvent(event)->Record(AsGpuStream(stream));
}
port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
AsGpuEvent(event)->gpu_event())) {
return port::Status::OK();
} else {
return port::Status{
port::error::INTERNAL,
absl::StrFormat("error recording waiting for ROCM event on stream %p",
stream)};
}
}
Event::Status GpuExecutor::PollForEventStatus(Event* event) {
return AsGpuEvent(event)->PollForStatus();
}
bool GpuExecutor::AllocateStream(Stream* stream) {
return AsGpuStream(stream)->Init();
}
void GpuExecutor::DeallocateStream(Stream* stream) {
GpuStream* rocm_stream = AsGpuStream(stream);
if (!rocm_stream->IsIdle()) {
LOG(ERROR) << "Deallocating stream with pending work";
}
rocm_stream->Destroy();
}
bool GpuExecutor::AllocateTimer(Timer* timer) {
return AsGpuTimer(timer)->Init();
}
void GpuExecutor::DeallocateTimer(Timer* timer) {
AsGpuTimer(timer)->Destroy();
}
bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
AsGpuStreamValue(other))
.ok();
if (!ok) {
LOG(ERROR) << "failed to record completion event; "
"therefore, failed to create inter-stream dependency";
return false;
}
return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
other_completed_event);
}
bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}
bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}
port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}
blas::BlasSupport* GpuExecutor::CreateBlas() {
PluginRegistry* registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::BlasFactory> status =
registry->GetFactory<PluginRegistry::BlasFactory>(kROCmPlatformId,
plugin_config_.blas());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve BLAS factory: "
<< status.status().error_message();
return nullptr;
}
return status.ValueOrDie()(this);
}
dnn::DnnSupport* GpuExecutor::CreateDnn() {
PluginRegistry* registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::DnnFactory> status =
registry->GetFactory<PluginRegistry::DnnFactory>(kROCmPlatformId,
plugin_config_.dnn());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve DNN factory: "
<< status.status().error_message();
return nullptr;
}
return status.ValueOrDie()(this);
}
fft::FftSupport* GpuExecutor::CreateFft() {
PluginRegistry* registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::FftFactory> status =
registry->GetFactory<PluginRegistry::FftFactory>(kROCmPlatformId,
plugin_config_.fft());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve FFT factory: "
<< status.status().error_message();
return nullptr;
}
return status.ValueOrDie()(this);
}
rng::RngSupport* GpuExecutor::CreateRng() {
PluginRegistry* registry = PluginRegistry::Instance();
port::StatusOr<PluginRegistry::RngFactory> status =
registry->GetFactory<PluginRegistry::RngFactory>(kROCmPlatformId,
plugin_config_.rng());
if (!status.ok()) {
LOG(ERROR) << "Unable to retrieve RNG factory: "
<< status.status().error_message();
return nullptr;
}
return status.ValueOrDie()(this);
}
// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }
bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
}
port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
}
SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
port::StatusOr<hipSharedMemConfig> rocm_config =
GpuDriver::ContextGetSharedMemConfig(context_);
if (!rocm_config.ok()) {
// Don't log; the failed call will log necessary output.
return SharedMemoryConfig::kDefault;
}
switch (rocm_config.ValueOrDie()) {
case hipSharedMemBankSizeDefault:
return SharedMemoryConfig::kDefault;
case hipSharedMemBankSizeFourByte:
return SharedMemoryConfig::kFourByte;
case hipSharedMemBankSizeEightByte:
return SharedMemoryConfig::kEightByte;
default:
LOG(FATAL) << "Invalid shared memory configuration returned: "
<< rocm_config.ValueOrDie();
}
}
port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
SharedMemoryConfig config) {
hipSharedMemConfig rocm_config;
switch (config) {
case SharedMemoryConfig::kDefault:
rocm_config = hipSharedMemBankSizeDefault;
break;
case SharedMemoryConfig::kFourByte:
rocm_config = hipSharedMemBankSizeFourByte;
break;
case SharedMemoryConfig::kEightByte:
rocm_config = hipSharedMemBankSizeEightByte;
break;
default:
LOG(FATAL) << "Invalid shared memory configuration specified: "
<< static_cast<int>(config);
}
return GpuDriver::ContextSetSharedMemConfig(context_, rocm_config);
}
bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}
bool GpuExecutor::GetSymbol(const string& symbol_name,
ModuleHandle module_handle, void** mem,
size_t* bytes) {
{ // give limited scope to mutex_lock
mutex_lock lock{disk_modules_mu_};
for (auto& it : disk_modules_) {
if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
reinterpret_cast<hipDeviceptr_t*>(mem),
bytes)) {
return true;
}
}
}
{ // give limited scope to mutex_lock
mutex_lock lock{in_memory_modules_mu_};
for (auto& it : in_memory_modules_) {
if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
reinterpret_cast<hipDeviceptr_t*>(mem),
bytes)) {
return true;
}
}
}
{ // give limited scope to mutex_lock
mutex_lock lock{in_memory_modules_mu_};
if (static_cast<bool>(module_handle)) {
auto it = gpu_binary_to_module_.find(module_handle.id());
CHECK(it != gpu_binary_to_module_.end());
if (GpuDriver::GetModuleSymbol(
context_, it->second.first, symbol_name.c_str(),
reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
return true;
}
}
for (auto& it : gpu_binary_to_module_) {
if (GpuDriver::GetModuleSymbol(
context_, it.second.first, symbol_name.c_str(),
reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
return true;
}
}
}
LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
return false;
}
bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
// The BlockDim name is a mismatch against these GRID_DIM_* queries because
// we use BlockDims to express the dimensions of blocks within a grid
// (as opposed to ThreadDim which expresses the dimensions of threads
// within a block).
int x, y, z;
if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
return false;
}
block_dim_limit->x = x;
block_dim_limit->y = y;
block_dim_limit->z = z;
return true;
}
bool GpuExecutor::SupportsBlas() const { return true; }
bool GpuExecutor::SupportsFft() const { return true; }
bool GpuExecutor::SupportsRng() const { return true; }
std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}
std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}
std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}
std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}
void* GpuExecutor::GpuContextHack() { return context_; }
GpuContext* GpuExecutor::gpu_context() { return context_; }
// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
// TODO(ROCm) implement this feature in HIP
return 1;
}
DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
internal::DeviceDescriptionBuilder builder;
{
int driver_version = 0;
(void)GpuDriver::GetDriverVersion(&driver_version);
string augmented_driver_version = absl::StrFormat(
"%d (%s)", driver_version,
DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
builder.set_driver_version(augmented_driver_version);
}
{
string pci_bus_id = GpuDriver::GetPCIBusID(device_);
// Lower the hex characters to match sysfs.
pci_bus_id = port::Lowercase(pci_bus_id);
builder.set_pci_bus_id(pci_bus_id);
// Read the NUMA node corresponding to the PCI bus ID out of sysfs.
int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
builder.set_numa_node(numa_node);
}
hipDeviceProp_t prop;
if (GpuDriver::GetDeviceProperties(&prop, device_ordinal_)) {
builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);
ThreadDim thread_dim_limit;
thread_dim_limit.x = prop.maxThreadsDim[0];
thread_dim_limit.y = prop.maxThreadsDim[1];
thread_dim_limit.z = prop.maxThreadsDim[2];
builder.set_thread_dim_limit(thread_dim_limit);
float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
builder.set_clock_rate_ghz(clock_rate_ghz);
}
{
bool ecc_enabled = false;
(void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
builder.set_ecc_enabled(ecc_enabled);
}
{
uint64 device_memory_size = -1;
(void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
builder.set_device_memory_size(device_memory_size);
}
{
BlockDim block_dim_limit;
FillBlockDimLimit(&block_dim_limit);
builder.set_block_dim_limit(block_dim_limit);
}
{
string device_name;
(void)GpuDriver::GetDeviceName(device_, &device_name);
builder.set_name(device_name);
}
builder.set_platform_version(
absl::StrCat("AMDGPU ISA version: gfx", version_));
// TODO(leary) should be a way to query this from the driver, but this is
// unlikely to change for us any time soon.
builder.set_device_address_bits(64);
builder.set_device_vendor("Advanced Micro Devices, Inc");
builder.set_rocm_amdgpu_isa_version(version_);
builder.set_shared_memory_per_core(
GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
builder.set_shared_memory_per_block(
GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
builder.set_core_count(
GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
builder.set_threads_per_core_limit(
GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
builder.set_registers_per_block_limit(
GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
builder.set_threads_per_warp(
GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
builder.set_registers_per_core_limit(64 * 1024);
auto built = builder.Build();
return built.release();
}
} // namespace gpu
void initialize_rocm_gpu_executor() {
*internal::MakeROCMExecutorImplementation() = [](const PluginConfig& config) {
return new gpu::GpuExecutor{config};
};
}
} // namespace stream_executor
REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {
stream_executor::initialize_rocm_gpu_executor();
});
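For context (illustrative, not part of this change): the kernarg packing in GpuExecutor::Launch relies on HIP's "extra" launch-parameter convention, which GpuDriver::LaunchKernel is expected to forward to hipModuleLaunchKernel roughly as sketched below. The helper name and parameters are assumptions for illustration.
// Illustrative sketch of launching a module-loaded kernel with a packed
// argument buffer instead of per-argument kernelParams.
#include "rocm/include/hip/hip_runtime.h"
hipError_t LaunchWithPackedArgs(hipFunction_t func, hipStream_t stream,
                                void* arg_buffer, size_t arg_buffer_size,
                                dim3 grid, dim3 block,
                                unsigned int shared_bytes) {
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, arg_buffer,
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &arg_buffer_size,
                    HIP_LAUNCH_PARAM_END};
  // kernelParams is null; all arguments travel through the packed buffer.
  return hipModuleLaunchKernel(func, grid.x, grid.y, grid.z, block.x, block.y,
                               block.z, shared_bytes, stream,
                               /*kernelParams=*/nullptr, config);
}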

View File

@ -0,0 +1,38 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
namespace stream_executor {
namespace gpu {
hipFuncCache_t GpuKernel::GetGpuCacheConfig() const {
switch (preferred_cache_config_) {
case KernelCacheConfig::kNoPreference:
return hipFuncCachePreferNone;
case KernelCacheConfig::kPreferShared:
return hipFuncCachePreferShared;
case KernelCacheConfig::kPreferL1:
return hipFuncCachePreferL1;
case KernelCacheConfig::kPreferEqual:
return hipFuncCachePreferEqual;
default:
LOG(FATAL) << "Unknown KernelCacheConfig"
<< static_cast<int32>(preferred_cache_config_);
}
}
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,180 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/rocm/rocm_platform.h"
#include "absl/strings/str_format.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/ptr_util.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
namespace stream_executor {
namespace gpu {
ROCmPlatform::ROCmPlatform()
: name_("ROCM"), min_numa_node_(0), limit_numa_node_(0) {}
ROCmPlatform::~ROCmPlatform() {}
// Due to legacy issues in user code, we can't currently call InspectNumaNodes
// at module initialization time, because non-GPU programs still include this
// plugin via various methods, so instead it has to be init-on-reference.
void ROCmPlatform::InspectNumaNodes() {
// To get NUMA node information, we need to create all executors, so we can
// examine their device descriptions to see their bus assignments.
static bool initialized = false;
static mutex numa_mutex(LINKER_INITIALIZED);
mutex_lock lock(numa_mutex);
if (initialized) {
return;
}
StreamExecutorConfig config;
for (int i = 0; i < VisibleDeviceCount(); i++) {
config.ordinal = i;
StreamExecutor* exec = GetExecutor(config).ValueOrDie();
if (i == 0) {
// NUMA nodes may not start at 0, so set the minimum node based on the
// first executor we see.
min_numa_node_ = exec->GetDeviceDescription().numa_node();
limit_numa_node_ = min_numa_node_ + 1;
} else {
min_numa_node_ =
std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
limit_numa_node_ = std::max(limit_numa_node_,
exec->GetDeviceDescription().numa_node() + 1);
}
}
initialized = true;
}
int ROCmPlatform::BusCount() {
InspectNumaNodes();
return limit_numa_node_ - min_numa_node_;
}
int ROCmPlatform::DeviceToBus(int device_ordinal) {
StreamExecutorConfig config;
config.ordinal = device_ordinal;
StreamExecutor* exec = GetExecutor(config).ValueOrDie();
return exec->GetDeviceDescription().numa_node() - min_numa_node_;
}
port::StatusOr<StreamExecutor*> ROCmPlatform::FirstExecutorForBus(
int bus_ordinal) {
InspectNumaNodes();
CHECK_LT(bus_ordinal, BusCount()) << "bus ordinal out of available range";
for (int i = 0; i < VisibleDeviceCount(); i++) {
if (DeviceToBus(i) == bus_ordinal) {
StreamExecutorConfig config;
config.ordinal = i;
return GetExecutor(config).ValueOrDie();
}
}
return port::Status{
port::error::NOT_FOUND,
absl::StrFormat("Executor for bus %d not found.", bus_ordinal)};
}
Platform::Id ROCmPlatform::id() const { return kROCmPlatformId; }
int ROCmPlatform::VisibleDeviceCount() const {
// Throw away the result - it logs internally, and this [containing] function
// isn't in the path of user control. It's safe to call this > 1x.
if (!gpu::GpuDriver::Init().ok()) {
return -1;
}
return GpuDriver::GetDeviceCount();
}
const string& ROCmPlatform::Name() const { return name_; }
port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDevice(int ordinal) {
StreamExecutorConfig config;
config.ordinal = ordinal;
config.plugin_config = PluginConfig();
config.device_options = DeviceOptions::Default();
return GetExecutor(config);
}
port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDeviceWithPluginConfig(
int device_ordinal, const PluginConfig& plugin_config) {
StreamExecutorConfig config;
config.ordinal = device_ordinal;
config.plugin_config = plugin_config;
config.device_options = DeviceOptions::Default();
return GetExecutor(config);
}
port::StatusOr<StreamExecutor*> ROCmPlatform::GetExecutor(
const StreamExecutorConfig& config) {
return executor_cache_.GetOrCreate(
config, [&]() { return GetUncachedExecutor(config); });
}
port::StatusOr<std::unique_ptr<StreamExecutor>>
ROCmPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
auto executor = MakeUnique<StreamExecutor>(
this, MakeUnique<GpuExecutor>(config.plugin_config));
auto init_status = executor->Init(config.ordinal, config.device_options);
if (!init_status.ok()) {
return port::Status{
port::error::INTERNAL,
absl::StrFormat(
"failed initializing StreamExecutor for ROCM device ordinal %d: %s",
config.ordinal, init_status.ToString().c_str())};
}
return std::move(executor);
}
void ROCmPlatform::RegisterTraceListener(
std::unique_ptr<TraceListener> listener) {
LOG(FATAL) << "not yet implemented: register ROCM trace listener";
}
void ROCmPlatform::UnregisterTraceListener(TraceListener* listener) {
LOG(FATAL) << "not yet implemented: unregister ROCM trace listener";
}
} // namespace gpu
static void InitializeROCmPlatform() {
// Disabling leak checking, MultiPlatformManager does not destroy its
// registered platforms.
auto status = MultiPlatformManager::PlatformWithName("ROCM");
if (!status.ok()) {
std::unique_ptr<gpu::ROCmPlatform> platform(new gpu::ROCmPlatform);
SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
}
}
} // namespace stream_executor
REGISTER_MODULE_INITIALIZER(rocm_platform,
stream_executor::InitializeROCmPlatform());
DECLARE_MODULE_INITIALIZER(multi_platform_manager);
// Note that module initialization sequencing is not supported in the
// open-source project, so this will be a no-op there.
REGISTER_MODULE_INITIALIZER_SEQUENCE(rocm_platform, multi_platform_manager);
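For orientation (illustrative only, not part of this diff): once the module initializer above has registered the platform, client code can typically reach a ROCm StreamExecutor through MultiPlatformManager as sketched below; the helper name is an assumption and error handling is trimmed.
// Illustrative lookup of the ROCm platform and an executor for one device.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
stream_executor::StreamExecutor* GetRocmExecutorOrDie(int ordinal) {
  namespace se = stream_executor;
  // "ROCM" matches the name_ set in ROCmPlatform's constructor above.
  se::Platform* platform =
      se::MultiPlatformManager::PlatformWithName("ROCM").ValueOrDie();
  return platform->ExecutorForDevice(ordinal).ValueOrDie();
}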

View File

@ -0,0 +1,110 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
#include <memory>
#include <vector>
#include "tensorflow/stream_executor/executor_cache.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/trace_listener.h"
namespace stream_executor {
namespace gpu {
// Opaque and unique identifier for the ROCM platform plugin.
// This is needed so that plugins can refer to/identify this platform without
// instantiating a ROCmPlatform object.
extern const Platform::Id kROCmPlatformId;
// ROCm-specific platform plugin, registered as a singleton value via module
// initializer.
class ROCmPlatform : public Platform {
public:
ROCmPlatform();
~ROCmPlatform() override;
// ROCmPlatform-specific functionality
// Returns the number of distinct buses / NUMA nodes on the machine.
int BusCount();
// Returns the bus/NUMA node for the specified device ordinal.
int DeviceToBus(int device_ordinal);
// Returns the lowest-ordinal-number StreamExecutor on the specified bus.
port::StatusOr<StreamExecutor*> FirstExecutorForBus(int bus_ordinal);
// Platform interface implementation:
  // Returns the same value as kROCmPlatformId above.
Platform::Id id() const override;
// Returns -1 as a sentinel on internal failure (and logs the error).
int VisibleDeviceCount() const override;
const string& Name() const override;
port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
int ordinal, const PluginConfig& config) override;
port::StatusOr<StreamExecutor*> GetExecutor(
const StreamExecutorConfig& config) override;
port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
const StreamExecutorConfig& config) override;
void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;
void UnregisterTraceListener(TraceListener* listener) override;
private:
  // Determines the number of NUMA nodes and the assignment of executors to each.
void InspectNumaNodes();
// This platform's name.
string name_;
// mutex that guards internal state.
mutable mutex mu_;
// Cache of created executors.
ExecutorCache executor_cache_;
// The smallest NUMA node value for any device managed by this machine
// manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus
  // ordinals. The NUMA node space occupied by GPUs is assumed to be dense.
int min_numa_node_;
// Larger than the NUMA node value for any device managed by this machine
// manager.
int limit_numa_node_;
SE_DISALLOW_COPY_AND_ASSIGN(ROCmPlatform);
};
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_

View File

@ -0,0 +1,24 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
namespace stream_executor {
namespace gpu {
PLATFORM_DEFINE_ID(kROCmPlatformId);
} // namespace gpu
} // namespace stream_executor

View File

@ -0,0 +1,34 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
#include "tensorflow/stream_executor/platform.h"
namespace stream_executor {
namespace gpu {
// Opaque and unique identifier for the ROCm platform.
// This is needed so that plugins can refer to/identify this platform without
// instantiating a ROCmPlatform object.
// This is broken out here to avoid a circular dependency between ROCmPlatform
// and GpuExecutor.
extern const Platform::Id kROCmPlatformId;
} // namespace gpu
} // namespace stream_executor
#endif // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_

View File

@ -0,0 +1,284 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "rocm/include/hiprand/hiprand.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
#include "tensorflow/stream_executor/gpu/gpu_rng.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/rng.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
// Formats hiprandStatus_t to output prettified values into a log stream.
std::ostream& operator<<(std::ostream& in, const hiprandStatus_t& status) {
#define OSTREAM_HIPRAND_STATUS(__name) \
case HIPRAND_STATUS_##__name: \
in << "HIPRAND_STATUS_" #__name; \
return in;
switch (status) {
OSTREAM_HIPRAND_STATUS(SUCCESS)
OSTREAM_HIPRAND_STATUS(VERSION_MISMATCH)
OSTREAM_HIPRAND_STATUS(NOT_INITIALIZED)
OSTREAM_HIPRAND_STATUS(ALLOCATION_FAILED)
OSTREAM_HIPRAND_STATUS(TYPE_ERROR)
OSTREAM_HIPRAND_STATUS(OUT_OF_RANGE)
OSTREAM_HIPRAND_STATUS(LENGTH_NOT_MULTIPLE)
OSTREAM_HIPRAND_STATUS(LAUNCH_FAILURE)
OSTREAM_HIPRAND_STATUS(PREEXISTING_FAILURE)
OSTREAM_HIPRAND_STATUS(INITIALIZATION_FAILED)
OSTREAM_HIPRAND_STATUS(ARCH_MISMATCH)
OSTREAM_HIPRAND_STATUS(INTERNAL_ERROR)
default:
in << "hiprandStatus_t(" << static_cast<int>(status) << ")";
return in;
}
}
namespace stream_executor {
namespace gpu {
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
namespace wrap {
#define PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(__name) \
struct WrapperShim__##__name { \
template <typename... Args> \
hiprandStatus_t operator()(GpuExecutor* parent, Args... args) { \
gpu::ScopedActivateExecutorContext sac{parent}; \
return ::__name(args...); \
} \
} __name;
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandCreateGenerator);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandDestroyGenerator);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetStream);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniform);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniformDouble);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetPseudoRandomGeneratorSeed);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetGeneratorOffset);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormal);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormalDouble);
} // namespace wrap
GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
GpuRng::~GpuRng() {
if (rng_ != nullptr) {
wrap::hiprandDestroyGenerator(parent_, rng_);
}
}
bool GpuRng::Init() {
mutex_lock lock{mu_};
CHECK(rng_ == nullptr);
hiprandStatus_t ret =
wrap::hiprandCreateGenerator(parent_, &rng_, HIPRAND_RNG_PSEUDO_DEFAULT);
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to create random number generator: " << ret;
return false;
}
CHECK(rng_ != nullptr);
return true;
}
bool GpuRng::SetStream(Stream* stream) {
hiprandStatus_t ret =
wrap::hiprandSetStream(parent_, rng_, AsGpuStreamValue(stream));
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set stream for random generation: " << ret;
return false;
}
return true;
}
// Returns true if std::complex stores its contents as two consecutive
// elements. Tests int, float and double, as the last two are independent
// specializations.
constexpr bool ComplexIsConsecutiveFloats() {
return sizeof(std::complex<int>) == 8 && sizeof(std::complex<float>) == 8 &&
sizeof(std::complex<double>) == 16;
}
template <typename T>
bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
mutex_lock lock{mu_};
static_assert(ComplexIsConsecutiveFloats(),
"std::complex values are not stored as consecutive values");
if (!SetStream(stream)) {
return false;
}
// std::complex<T> is currently implemented as two consecutive T variables.
uint64 element_count = v->ElementCount();
if (std::is_same<T, std::complex<float>>::value ||
std::is_same<T, std::complex<double>>::value) {
element_count *= 2;
}
hiprandStatus_t ret;
if (std::is_same<T, float>::value ||
std::is_same<T, std::complex<float>>::value) {
ret = wrap::hiprandGenerateUniform(
parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
element_count);
} else {
ret = wrap::hiprandGenerateUniformDouble(
parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
element_count);
}
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount()
<< " " << TypeString<T>() << "s at " << v->opaque() << ": "
<< ret;
return false;
}
return true;
}
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool GpuRng::DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<float>>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
bool GpuRng::DoPopulateRandUniform(Stream* stream,
DeviceMemory<std::complex<double>>* v) {
return DoPopulateRandUniformInternal(stream, v);
}
template <typename ElemT, typename FuncT>
bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
ElemT stddev,
DeviceMemory<ElemT>* v,
FuncT func) {
mutex_lock lock{mu_};
if (!SetStream(stream)) {
return false;
}
uint64 element_count = v->ElementCount();
hiprandStatus_t ret =
func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
<< " floats at " << v->opaque() << ": " << ret;
return false;
}
return true;
}
bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
DeviceMemory<float>* v) {
return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
wrap::hiprandGenerateNormal);
}
bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
DeviceMemory<double>* v) {
return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
wrap::hiprandGenerateNormalDouble);
}
bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
mutex_lock lock{mu_};
CHECK(rng_ != nullptr);
if (!CheckSeed(seed, seed_bytes)) {
return false;
}
if (!SetStream(stream)) {
return false;
}
// Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above)
// (which itself requires 16 for API consistency with host RNG fallbacks).
hiprandStatus_t ret = wrap::hiprandSetPseudoRandomGeneratorSeed(
parent_, rng_, *(reinterpret_cast<const uint64*>(seed)));
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set rng seed: " << ret;
return false;
}
ret = wrap::hiprandSetGeneratorOffset(parent_, rng_, 0);
if (ret != HIPRAND_STATUS_SUCCESS) {
LOG(ERROR) << "failed to reset rng position: " << ret;
return false;
}
return true;
}
} // namespace gpu
} // namespace stream_executor
namespace se = ::stream_executor;
REGISTER_MODULE_INITIALIZER(register_hiprand, {
se::port::Status status =
se::PluginRegistry::Instance()
->RegisterFactory<se::PluginRegistry::RngFactory>(
se::gpu::kROCmPlatformId, se::gpu::kGpuRandPlugin, "hipRAND",
[](se::internal::StreamExecutorInterface* parent)
-> se::rng::RngSupport* {
se::gpu::GpuExecutor* rocm_executor =
dynamic_cast<se::gpu::GpuExecutor*>(parent);
if (rocm_executor == nullptr) {
LOG(ERROR)
<< "Attempting to initialize an instance of the hipRAND "
<< "support library with a non-ROCM StreamExecutor";
return nullptr;
}
se::gpu::GpuRng* rng = new se::gpu::GpuRng(rocm_executor);
if (!rng->Init()) {
// Note: Init() will log a more specific error.
delete rng;
return nullptr;
}
return rng;
});
if (!status.ok()) {
LOG(ERROR) << "Unable to register hipRAND factory: "
<< status.error_message();
}
se::PluginRegistry::Instance()->SetDefaultFactory(
se::gpu::kROCmPlatformId, se::PluginKind::kRng, se::gpu::kGpuRandPlugin);
});
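For reference (illustrative, not part of this change): the wrap:: shims above forward to the plain hipRAND C API; in isolation the sequence used by GpuRng::Init, SetStream, and DoPopulateRandUniform looks roughly like this. The helper name, device pointer, and count are placeholders.
// Illustrative direct hipRAND usage: create a generator, bind it to a
// stream, fill a device buffer with uniform floats, destroy the generator.
#include "rocm/include/hip/hip_runtime.h"
#include "rocm/include/hiprand/hiprand.h"
bool FillUniform(hipStream_t stream, float* device_ptr, size_t count) {
  hiprandGenerator_t gen;
  if (hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT) !=
      HIPRAND_STATUS_SUCCESS) {
    return false;
  }
  bool ok = hiprandSetStream(gen, stream) == HIPRAND_STATUS_SUCCESS &&
            hiprandGenerateUniform(gen, device_ptr, count) ==
                HIPRAND_STATUS_SUCCESS;
  hiprandDestroyGenerator(gen);
  return ok;
}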

View File

@ -25,6 +25,13 @@ StreamExecutorFactory* MakeCUDAExecutorImplementation() {
return &instance;
}
// -- ROCm
StreamExecutorFactory* MakeROCMExecutorImplementation() {
static StreamExecutorFactory instance;
return &instance;
}
// -- OpenCL
StreamExecutorFactory* MakeOpenCLExecutorImplementation() {

View File

@ -374,9 +374,11 @@ using StreamFactory = std::function<StreamInterface *(StreamExecutor *)>;
using TimerFactory = std::function<TimerInterface *(StreamExecutor *)>;
using KernelFactory = std::function<KernelInterface*()>;
StreamExecutorFactory* MakeCUDAExecutorImplementation();
StreamExecutorFactory *MakeCUDAExecutorImplementation();
StreamExecutorFactory* MakeOpenCLExecutorImplementation();
StreamExecutorFactory *MakeROCMExecutorImplementation();
StreamExecutorFactory *MakeOpenCLExecutorImplementation();
extern StreamExecutorFactory MakeHostExecutorImplementation;

View File

@ -71,6 +71,9 @@ internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind(
case PlatformKind::kCuda:
factory = *internal::MakeCUDAExecutorImplementation();
break;
case PlatformKind::kROCm:
factory = *internal::MakeROCMExecutorImplementation();
break;
case PlatformKind::kOpenCL:
factory = *internal::MakeOpenCLExecutorImplementation();
break;
@ -188,6 +191,8 @@ StreamExecutor::StreamExecutor(
memory_limit_bytes_(GetMemoryLimitBytes()) {
if (port::Lowercase(platform_->Name()) == "cuda") {
platform_kind_ = PlatformKind::kCuda;
} else if (port::Lowercase(platform_->Name()) == "rocm") {
platform_kind_ = PlatformKind::kROCm;
} else if (port::Lowercase(platform_->Name()) == "opencl") {
platform_kind_ = PlatformKind::kOpenCL;
} else if (port::Lowercase(platform_->Name()) == "host") {

View File

@ -18,6 +18,7 @@ cc_library(
includes = [
".",
"rocm/include",
"rocm/include/rocrand",
],
visibility = ["//visibility:public"],
)