PR #25011: [GPU][ROCm][CUDA] StreamExecutor logic for ROCm / CUDA platform (PR 20709 / 22669 / 24156 continued)
Please approve this CL. It will be submitted automatically, and its GitHub pull request will be marked as merged.

Imported from GitHub PR #25011

New PR to continue the efforts started by @deven-amd in #20709 / #22669 / #24156. This PR refactors the StreamExecutor GPU interfaces so they can be shared between CUDA and ROCm. It is the first in a series of PRs. Based on @timshen91's input, the logic from #24156 has been reworked so that:

- it only contains changes under stream_executor/...;
- it does not remove any stream_executor/cuda/*.h header, so code outside of stream_executor does not break. All types and functions in namespace cuda are now aliases of their namespace gpu counterparts; for example, namespace cuda { using CUDADriver = gpu::GpuDriver; };
- all stream_executor/gpu/BUILD targets are visible only to //third_party/tensorflow/stream_executor:__subpackages__;
- a target stream_executor/gpu:X is used only by stream_executor/cuda:cuda_X or stream_executor/rocm:rocm_X, never by some other cuda_Y. For example, cuda:cuda_platform should depend on cuda:cuda_driver, not on gpu:gpu_driver.

Copybara import of the project:

- 267affbb73df9164baf4e62142fe7201e6a305ee [ROCm][CUDA] StreamExecutor logic for ROCm / CUDA platform by Wen-Heng (Jack) Chung <whchung@gmail.com>
- 04fac5bf358059bdb2cd4a3e092e52dc982ea7b0 Merge 267affbb73df9164baf4e62142fe7201e6a305ee into 5f8ea... by Wen-Heng (Jack) Chung <whchung@gmail.com>

COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/tensorflow/pull/25011 from ROCmSoftwarePlatform:google-upstream-pr-stream-executor-alt 267affbb73df9164baf4e62142fe7201e6a305ee
PiperOrigin-RevId: 231250990
This commit is contained in: parent 56c3ac7d23 / commit aba83497f5
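For illustration, the compatibility-shim pattern described in the message above works as in the following minimal sketch (not the verbatim header; it only shows the aliasing idea using names the PR itself introduces):

    // cuda/cuda_driver.h (sketch): the CUDA-specific names survive as
    // aliases into the shared gpu namespace, so existing includes of the
    // stream_executor/cuda/*.h headers keep compiling unchanged.
    #include "tensorflow/stream_executor/gpu/gpu_driver.h"

    namespace stream_executor {
    namespace cuda {

    // Old spelling, new implementation: callers that still name
    // cuda::CUDADriver transparently get gpu::GpuDriver.
    using CUDADriver = gpu::GpuDriver;

    }  // namespace cuda
    }  // namespace stream_executor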
@@ -343,6 +343,13 @@ config_setting(
    },
)

config_setting(
    name = "using_rocm_hipcc",
    define_values = {
        "using_rocm_hipcc": "true",
    },
)

config_setting(
    name = "with_mpi_support",
    values = {"define": "with_mpi_support=true"},
@@ -1964,6 +1964,14 @@ cc_library(
    ],
)

cc_library(
    name = "rocm",
    visibility = ["//visibility:public"],
    deps = [
        "//tensorflow/core/platform/default/build_config:rocm",
    ],
)

# -----------------------------------------------------------------------------
# Clif-related proto libraries.

@@ -6,6 +6,7 @@ load("//tensorflow:tensorflow.bzl", "if_windows")
load("//tensorflow:tensorflow.bzl", "if_not_windows")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
load(
    "//third_party/mkl:build_defs.bzl",
    "if_mkl_ml",
@@ -735,6 +736,11 @@ def tf_additional_binary_deps():
            "//tensorflow/stream_executor:cuda_platform",
            "//tensorflow/core/platform/default/build_config:cuda",
        ],
    ) + if_rocm(
        [
            "//tensorflow/stream_executor:rocm_platform",
            "//tensorflow/core/platform/default/build_config:rocm",
        ],
    ) + [
        # TODO(allenl): Split these out into their own shared objects (they are
        # here because they are shared between contrib/ op shared objects and
@@ -8,6 +8,7 @@ licenses(["notice"])  # Apache 2.0
exports_files(["LICENSE"])

load("//tensorflow:tensorflow.bzl", "if_cuda")
load("//tensorflow:tensorflow.bzl", "if_rocm")
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
@@ -42,6 +43,7 @@ tf_cuda_library(
        "//tensorflow/stream_executor/cuda:cuda_platform_id",
        "//tensorflow/stream_executor/host:host_platform_id",
        "//tensorflow/stream_executor/platform:dso_loader",
        "//tensorflow/stream_executor/rocm:rocm_platform_id",
    ] + select({
        "@local_config_cuda//cuda:darwin": ["IOKit"],
        "//conditions:default": [],
@@ -50,6 +52,7 @@ tf_cuda_library(
        "//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor/cuda:all_runtime"],
        "//tensorflow:using_cuda_clang_with_dynamic_build": [],
        "//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
        "//tensorflow:using_rocm_hipcc": ["//tensorflow/stream_executor/rocm:all_runtime"],
        "//conditions:default": [],
    }),
)
@@ -67,6 +70,18 @@ cc_library(
    }),
)

cc_library(
    name = "stream_executor_rocm",
    deps = [
        ":stream_executor_no_cuda",
        ":rocm",
    ] + if_static(
        ["//tensorflow/stream_executor/rocm:all_runtime"],
    ) + select({
        "//conditions:default": [],
    }),
)

cc_library(
    name = "stream_executor_no_cuda",
    deps = [
@@ -79,6 +94,7 @@ cc_library(
        "//tensorflow/stream_executor/host:host_platform",
        "//tensorflow/stream_executor/host:host_platform_id",
        "//tensorflow/stream_executor/platform:dso_loader",
        "//tensorflow/stream_executor/rocm:rocm_platform_id",
    ],
)

@@ -267,6 +283,17 @@ cc_library(
    ],
)

cc_library(
    name = "rocm",
    data = [],
    linkopts = select({
        "//conditions:default": [
            "-Wl,-rpath,../local_config_rocm/rocm/rocm/lib",
        ],
    }),
    deps = [],
)

cc_library(
    name = "sycl",
    data = if_ccpp([
@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"

@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/scratch_allocator.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"
@@ -654,3 +654,8 @@ alias(
    name = "cuda_platform",
    actual = "//tensorflow/stream_executor/cuda:all_runtime",
)

alias(
    name = "rocm_platform",
    actual = "//tensorflow/stream_executor/rocm:all_runtime",
)
@@ -66,6 +66,7 @@ cc_library(
    deps = if_cuda_is_configured([
        "@com_google_absl//absl/container:inlined_vector",
        "@com_google_absl//absl/strings",
        "//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
    ]),
@@ -85,6 +86,7 @@ cc_library(
        "@com_google_absl//absl/strings",
        "@local_config_cuda//cuda:cuda_headers",
        "//tensorflow/stream_executor:device_options",
        "//tensorflow/stream_executor/gpu:gpu_driver_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
        "//tensorflow/stream_executor/platform:dso_loader",
@@ -97,18 +99,22 @@ cc_library(
    name = "cuda_activation_header",
    hdrs = ["cuda_activation.h"],
    visibility = ["//visibility:public"],
    deps = ["//tensorflow/stream_executor/platform"],
    deps = [
        "//tensorflow/stream_executor/gpu:gpu_activation_header",
        "//tensorflow/stream_executor/platform",
    ],
)

cc_library(
    name = "cuda_activation",
    srcs = if_cuda_is_configured(["cuda_activation.cc"]),
    srcs = [],
    hdrs = if_cuda_is_configured(["cuda_activation.h"]),
    deps = if_cuda_is_configured([
        ":cuda_driver",
        "@local_config_cuda//cuda:cuda_headers",
        "//tensorflow/stream_executor",
        "//tensorflow/stream_executor:stream_executor_internal",
        "//tensorflow/stream_executor/gpu:gpu_activation",
        "//tensorflow/stream_executor/platform",
    ]),
)
@@ -120,6 +126,7 @@ cc_library(
    deps = if_cuda_is_configured([
        ":cuda_kernel",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor/gpu:gpu_executor_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
    ]),
@@ -133,10 +140,10 @@ cc_library(
    deps = if_cuda_is_configured([
        ":cuda_activation",
        ":cuda_gpu_executor",
        ":cuda_helpers",
        ":cuda_platform_id",
        ":cuda_stream",
        ":cuda_timer",
        ":cuda_helpers",
        "@com_google_absl//absl/strings",
        "//third_party/eigen3",
        "@local_config_cuda//cuda:cuda_headers",
@@ -147,6 +154,7 @@ cc_library(
        "//tensorflow/stream_executor:plugin_registry",
        "//tensorflow/stream_executor:scratch_allocator",
        "//tensorflow/stream_executor:timer",
        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
        "//tensorflow/stream_executor/platform:dso_loader",
@@ -162,14 +170,15 @@ cc_library(
    deps = if_cuda_is_configured([
        ":cuda_activation_header",
        ":cuda_gpu_executor_header",
        ":cuda_helpers",
        ":cuda_platform_id",
        ":cuda_stream",
        ":cuda_helpers",
        "@local_config_cuda//cuda:cuda_headers",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:fft",
        "//tensorflow/stream_executor:plugin_registry",
        "//tensorflow/stream_executor:scratch_allocator",
        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
        "//tensorflow/stream_executor/platform:dso_loader",
@@ -223,13 +232,15 @@ cc_library(
    deps = if_cuda_is_configured([
        ":cuda_activation",
        ":cuda_gpu_executor",
        ":cuda_helpers",
        ":cuda_platform_id",
        ":cuda_stream",
        ":cuda_helpers",
        "@local_config_cuda//cuda:cuda_headers",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:plugin_registry",
        "//tensorflow/stream_executor:rng",
        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
        "//tensorflow/stream_executor/gpu:gpu_rng_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
        "//tensorflow/stream_executor/platform:dso_loader",
@@ -239,12 +250,14 @@ cc_library(

cc_library(
    name = "cuda_kernel",
    srcs = if_cuda_is_configured(["cuda_kernel.cc"]),
    hdrs = if_cuda_is_configured(["cuda_kernel.h"]),
    deps = if_cuda_is_configured([
        ":cuda_driver",
        "@local_config_cuda//cuda:cuda_headers",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:stream_executor_pimpl_header",
        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
    ]),
@@ -254,6 +267,9 @@ cc_library(
cc_library(
    name = "cuda_helpers",
    textual_hdrs = if_cuda_is_configured(["cuda_helpers.h"]),
    deps = if_cuda_is_configured([
        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
    ]),
)

cc_library(
@@ -265,19 +281,22 @@ cc_library(
        ":cuda_gpu_executor_header",
        ":cuda_stream",
        "//tensorflow/stream_executor:stream_executor_headers",
        "//tensorflow/stream_executor/gpu:gpu_event",
        "//tensorflow/stream_executor/gpu:gpu_stream_header",
        "//tensorflow/stream_executor/lib",
    ]),
)

cc_library(
    name = "cuda_stream",
    srcs = if_cuda_is_configured(["cuda_stream.cc"]),
    srcs = [],
    hdrs = if_cuda_is_configured(["cuda_stream.h"]),
    deps = if_cuda_is_configured([
        ":cuda_driver",
        ":cuda_gpu_executor_header",
        "//tensorflow/stream_executor:stream_executor_headers",
        "//tensorflow/stream_executor:stream_header",
        "//tensorflow/stream_executor/gpu:gpu_stream",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
    ]),
@@ -285,18 +304,18 @@ cc_library(

cc_library(
    name = "cuda_timer",
    srcs = if_cuda_is_configured(["cuda_timer.cc"]),
    srcs = [],
    hdrs = if_cuda_is_configured(["cuda_timer.h"]),
    deps = if_cuda_is_configured([
        ":cuda_driver",
        ":cuda_gpu_executor_header",
        ":cuda_stream",
        "//tensorflow/stream_executor:stream_executor_headers",
        "//tensorflow/stream_executor/gpu:gpu_timer",
        "//tensorflow/stream_executor/lib",
    ]),
)

# It implements :cuda_gpu_executor_header
cc_library(
    name = "cuda_gpu_executor",
    srcs = if_cuda_is_configured(["cuda_gpu_executor.cc"]),
@@ -316,6 +335,7 @@ cc_library(
        "//tensorflow/stream_executor:stream_executor_internal",
        "//tensorflow/stream_executor:stream_executor_pimpl_header",
        "//tensorflow/stream_executor:timer",
        "//tensorflow/stream_executor/gpu:gpu_executor_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
        "//tensorflow/stream_executor/platform:dso_loader",
@@ -17,13 +17,13 @@ limitations under the License.
// It reaches into the CUDA implementation to activate an underlying CUDA
// context.
//
// Having this file separate from cuda_gpu_executor.h means that dependent
// Having this file separate from cuda/cuda_gpu_executor.h means that dependent
// code does not also have to depend on cuda.h.

#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_

#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"

namespace stream_executor {

@@ -31,29 +31,7 @@ class StreamExecutor;

namespace cuda {

class CUDAExecutor;
class ScopedActivateContext;

// Activates a CUDA context within an enclosing scope.
class ScopedActivateExecutorContext {
 public:
  // Form that takes a CUDA executor implementation.
  explicit ScopedActivateExecutorContext(CUDAExecutor* cuda_exec);

  // Form that takes a pImpl executor and extracts a CUDA implementation --
  // fatal failure if it is not CUDA inside.
  explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);

  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);

  ~ScopedActivateExecutorContext();

 private:
  // The cuda.h-using datatype that we wrap.
  ScopedActivateContext* driver_scoped_activate_context_;

  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
};
using ScopedActivateExecutorContext = gpu::ScopedActivateExecutorContext;

} // namespace cuda
} // namespace stream_executor
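The hunk above is the pattern repeated across these headers: the concrete class body moves to stream_executor/gpu/ and the cuda namespace keeps only an alias. A minimal sketch of a hypothetical call site (the function name is illustrative, not from the patch) shows why callers outside stream_executor need no changes:

    // Hypothetical caller; unaffected by this refactoring.
    void RunWithActiveContext(stream_executor::StreamExecutor* stream_exec) {
      // Still spelled cuda::..., but it now resolves to
      // gpu::ScopedActivateExecutorContext through the alias above.
      stream_executor::cuda::ScopedActivateExecutorContext activation(
          stream_exec);
      // ... issue driver calls while the executor's context is active ...
    }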
(File diff suppressed because it is too large.)
@@ -33,26 +33,26 @@ namespace stream_executor {

class Stream;

namespace cuda {
namespace gpu {

// Opaque and unique identifier for the cuBLAS plugin.
extern const PluginId kCuBlasPlugin;

class CUDAExecutor;
class GpuExecutor;

// BLAS plugin for CUDA platform via cuBLAS library.
//
// This satisfies the platform-agnostic BlasSupport interface.
//
// Note that the cuBLAS handle that this encapsulates is implicitly tied to the
// context (and, as a result, the device) that the parent CUDAExecutor is tied
// context (and, as a result, the device) that the parent GpuExecutor is tied
// to. This simply happens as an artifact of creating the cuBLAS handle when a
// CUDA context is active.
//
// Thread-safe post-initialization.
class CUDABlas : public blas::BlasSupport {
 public:
  explicit CUDABlas(CUDAExecutor *parent);
  explicit CUDABlas(GpuExecutor *parent);

  // Allocates a cuBLAS handle.
  bool Init();
@@ -145,9 +145,9 @@ class CUDABlas : public blas::BlasSupport {
  // mutex that guards the cuBLAS handle for this device.
  mutex mu_;

  // CUDAExecutor which instantiated this CUDABlas.
  // GpuExecutor which instantiated this CUDABlas.
  // Immutable post-initialization.
  CUDAExecutor *parent_;
  GpuExecutor *parent_;

  // cuBLAS library handle on the device.
  cublasHandle_t blas_ GUARDED_BY(mu_);
@@ -155,7 +155,7 @@ class CUDABlas : public blas::BlasSupport {
  SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas);
};

} // namespace cuda
} // namespace gpu
} // namespace stream_executor

#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
@@ -52,13 +52,6 @@ limitations under the License.
namespace stream_executor {
namespace cuda {

#ifdef __APPLE__
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
#elif !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#endif


string DriverVersionToString(DriverVersion version) {
  return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
}
@@ -112,6 +105,18 @@ port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
  return result;
}

} // namespace cuda
} // namespace stream_executor

namespace stream_executor {
namespace gpu {

#ifdef __APPLE__
static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
#elif !defined(PLATFORM_WINDOWS)
static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
#endif

// -- class Diagnostician

string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
@@ -190,11 +195,11 @@ void Diagnostician::LogDiagnosticInformation() {
  }
  port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
  LOG(INFO) << "libcuda reported version is: "
            << DriverVersionStatusToString(dso_version);
            << cuda::DriverVersionStatusToString(dso_version);

  port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
  LOG(INFO) << "kernel reported version is: "
            << DriverVersionStatusToString(kernel_version);
            << cuda::DriverVersionStatusToString(kernel_version);
#endif

  // OS X kernel driver does not report version accurately
@@ -232,7 +237,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
    }
    const size_t length = suffix_pos - start;
    const string version = path.substr(start, length);
    result = StringToDriverVersion(version);
    result = cuda::StringToDriverVersion(version);
  }
#else
#if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
@@ -260,7 +265,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
      // TODO(b/22689637): Eliminate the explicit namespace if possible.
      auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
      auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
      *result = StringToDriverVersion(stripped_dso_version);
      *result = cuda::StringToDriverVersion(stripped_dso_version);
      return 1;
    }
    return 0;
@@ -292,7 +297,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
  // TODO(b/22689637): Eliminate the explicit namespace if possible.
  auto stripped_kernel_version =
      port::StripSuffixString(kernel_version, ".ld64");
  return StringToDriverVersion(stripped_kernel_version);
  return cuda::StringToDriverVersion(stripped_kernel_version);
}

void Diagnostician::WarnOnDsoKernelMismatch(
@@ -301,12 +306,12 @@ void Diagnostician::WarnOnDsoKernelMismatch(
  if (kernel_version.ok() && dso_version.ok() &&
      dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
    LOG(INFO) << "kernel version seems to match DSO: "
              << DriverVersionToString(kernel_version.ValueOrDie());
              << cuda::DriverVersionToString(kernel_version.ValueOrDie());
  } else {
    LOG(ERROR) << "kernel version "
               << DriverVersionStatusToString(kernel_version)
               << cuda::DriverVersionStatusToString(kernel_version)
               << " does not match DSO version "
               << DriverVersionStatusToString(dso_version)
               << cuda::DriverVersionStatusToString(dso_version)
               << " -- cannot find working devices in this configuration";
  }
}
@@ -336,9 +341,9 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
    // see
    // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
    if (version == NULL) {
      return StringToDriverVersion("");
      return cuda::StringToDriverVersion("");
    }
    return StringToDriverVersion(version);
    return cuda::StringToDriverVersion(version);
  }
  CFRelease(kext_infos);
  auto status = port::Status(
@@ -387,6 +392,5 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
#endif
}


} // namespace cuda
} // namespace gpu
} // namespace stream_executor
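Note the split performed in this file: the NVIDIA-specific version-string helpers (DriverVersionToString, StringToDriverVersion, ...) stay in namespace cuda, while the shared Diagnostician code moves to namespace gpu and calls back into them with an explicit cuda:: qualifier. A condensed sketch of the resulting layout (declarations only, for orientation):

    namespace stream_executor {
    namespace cuda {
    // CUDA-only helpers keep their original namespace and signatures.
    string DriverVersionToString(DriverVersion version);
    port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
    }  // namespace cuda

    namespace gpu {
    // Shared diagnostics code moved into gpu:: must now qualify those
    // helpers explicitly, exactly as the hunks above do:
    //   result = cuda::StringToDriverVersion(version);
    }  // namespace gpu
    }  // namespace stream_executor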
@@ -16,17 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_

#include "tensorflow/stream_executor/platform/port.h"
#include <tuple>

#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"

namespace stream_executor {
namespace cuda {

// e.g. DriverVersion{346, 3, 4}
using DriverVersion = std::tuple<int, int, int>;
using DriverVersion = gpu::DriverVersion;

// Converts a parsed driver version to string form.
string DriverVersionToString(DriverVersion version);
@@ -35,61 +31,9 @@ string DriverVersionToString(DriverVersion version);
string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);

// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
port::StatusOr<DriverVersion> StringToDriverVersion(const string &value);
port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);

class Diagnostician {
 public:
  // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
  // not initializing).
  //
  // Note: if we're running on a machine that has no GPUs, we don't want to
  // produce very much log spew beyond saying, "looks like there's no CUDA
  // kernel
  // module running".
  //
  // Note: we use non-Google-File:: API here because we may be called before
  // InitGoogle has completed.
  static void LogDiagnosticInformation();

  // Given the driver version file contents, finds the kernel module version and
  // returns it as a string.
  //
  // This is solely used for more informative log messages when the user is
  // running on a machine that happens to have a libcuda/kernel driver mismatch.
  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
      const string &driver_version_file_contents);

  // Extracts the kernel driver version from the current host.
  static port::StatusOr<DriverVersion> FindKernelDriverVersion();

  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
  // driver-interfacing DSO version number. Returns it as a string.
  static port::StatusOr<DriverVersion> FindDsoVersion();

  // Logs information about the kernel driver version and userspace driver
  // library version.
  static void LogDriverVersionInformation();

 private:

  // Given the DSO version number and the driver version file contents, extracts
  // the driver version and compares, warning the user in the case of
  // incompatibility.
  //
  // This is solely used for more informative log messages when the user is
  // running on a machine that happens to have a libcuda/kernel driver mismatch.
  static void WarnOnDsoKernelMismatch(
      port::StatusOr<DriverVersion> dso_version,
      port::StatusOr<DriverVersion> kernel_version);

  // Logs information about the dev nodes present on this machine: their
  // existence, permissions, accessibility from this uid/gid.
  static void LogDevNodeDiagnosticInformation();

  static string GetDevNodePath(int dev_node_ordinal);

  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
};
using Diagnostician = gpu::Diagnostician;

} // namespace cuda
} // namespace stream_executor
@@ -58,7 +58,7 @@ limitations under the License.
#pragma clang diagnostic warning "-Wmismatched-tags"

namespace stream_executor {
namespace cuda {
namespace gpu {

PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);

@@ -137,7 +137,7 @@ class CudnnHandle {
 public:
  // Takes ownership of the executor context and the lock to access cuDNN
  // using handle.
  CudnnHandle(cuda::ScopedActivateExecutorContext context, mutex_lock lock,
  CudnnHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
              cudnnHandle_t handle)
      : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}

@@ -146,7 +146,7 @@ class CudnnHandle {
  cudnnHandle_t handle() const { return handle_; }

 private:
  cuda::ScopedActivateExecutorContext context_;
  gpu::ScopedActivateExecutorContext context_;
  mutex_lock lock_;
  cudnnHandle_t handle_;  // Not owned.
};
@@ -334,10 +334,10 @@ class CudnnAccess {
  // The legacy default stream synchronizes with all other streams and it is
  // therefore a bad idea (performance wise) to call any cuDNN APIs that
  // enqueue work in the stream.
  CudnnHandle GetHandle(CUDAExecutor* executor, Stream* stream) {
  CudnnHandle GetHandle(GpuExecutor* executor, Stream* stream) {
    mutex_lock lock(mutex_);
    cuda::ScopedActivateExecutorContext context(executor);
    CUstream cu_stream = stream ? AsCUDAStreamValue(stream) : cudaStreamLegacy;
    gpu::ScopedActivateExecutorContext context(executor);
    CUstream cu_stream = stream ? AsGpuStreamValue(stream) : cudaStreamLegacy;
    auto status = cudnnSetStream(handle_, cu_stream);
    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Failed to set cuDNN stream.";
    return CudnnHandle(std::move(context), std::move(lock), handle_);
@@ -448,7 +448,7 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {

}  // namespace

CudnnSupport::CudnnSupport(CUDAExecutor* parent) : parent_(parent) {}
CudnnSupport::CudnnSupport(GpuExecutor* parent) : parent_(parent) {}

port::Status CudnnSupport::Init() {
  ScopedActivateExecutorContext context(parent_);
@@ -481,14 +481,14 @@ port::Status CudnnSupport::Init() {
  CHECK_EQ(cudnn_handle, nullptr);
  LOG(ERROR) << "Could not create cudnn handle: " << ToString(status);
  if (status == CUDNN_STATUS_NOT_INITIALIZED) {
    auto result = cuda::Diagnostician::FindKernelDriverVersion();
    auto result = gpu::Diagnostician::FindKernelDriverVersion();
    if (!result.ok()) {
      LOG(ERROR) << "Error retrieving driver version: "
                 << DriverVersionStatusToString(result);
                 << cuda::DriverVersionStatusToString(result);
    } else {
      const auto& version = result.ValueOrDie();
      LOG(ERROR) << "Possibly insufficient driver version: "
                 << DriverVersionToString(version);
                 << cuda::DriverVersionToString(version);
    }
  }

@@ -1151,7 +1151,7 @@ class CudnnRnnParamsDescriptor {
}  // namespace

class CudnnRnnDescriptor : public dnn::RnnDescriptor {
  CudnnRnnDescriptor(const CudnnHandle& cudnn, cuda::RnnDescriptor rnn_desc,
  CudnnRnnDescriptor(const CudnnHandle& cudnn, gpu::RnnDescriptor rnn_desc,
                     PersistentRnnPlan rnn_plan, int num_layers,
                     int hidden_size, int input_size, int batch_size,
                     cudnnRNNInputMode_t input_mode,
@@ -1191,7 +1191,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
        CudnnDropoutDescriptor dropout_desc,
        CudnnDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));

    cuda::RnnDescriptor rnn_desc = CreateRnnDescriptor();
    gpu::RnnDescriptor rnn_desc = CreateRnnDescriptor();
    cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm());

    // TODO: allow the user to choose an algorithm.
@@ -1282,7 +1282,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
  }

 private:
  cuda::RnnDescriptor rnn_desc_;
  gpu::RnnDescriptor rnn_desc_;
  PersistentRnnPlan rnn_plan_;
  int num_layers_;
  int hidden_size_;
@@ -1401,15 +1401,14 @@ port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(

class CudnnRnnSequenceTensorDescriptor
    : public dnn::RnnSequenceTensorDescriptor {
  CudnnRnnSequenceTensorDescriptor(CUDAExecutor* parent, int max_seq_length,
  CudnnRnnSequenceTensorDescriptor(GpuExecutor* parent, int max_seq_length,
                                   int batch_size, int data_size,
                                   cudnnDataType_t data_type,
#if CUDNN_VERSION >= 7201
                                   RNNDataDescriptor data_handle,
#endif
                                   TensorDescriptor handle)
      : parent_(parent),
        max_seq_length_(max_seq_length),
      : max_seq_length_(max_seq_length),
        batch_size_(batch_size),
        data_size_(data_size),
        data_type_(data_type),
@@ -1425,7 +1424,7 @@ class CudnnRnnSequenceTensorDescriptor
      default;

  static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
      CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
      GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
      cudnnDataType_t data_type) {
    CHECK_GT(max_seq_length, 0);
    int dims[] = {batch_size, data_size, 1};
@@ -1444,7 +1443,7 @@ class CudnnRnnSequenceTensorDescriptor
  }

  static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
      CUDAExecutor* parent, int max_seq_length, int batch_size, int data_size,
      GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
      const absl::Span<const int>& seq_lengths, cudnnDataType_t data_type) {
#if CUDNN_VERSION >= 7201
    CHECK_GT(max_seq_length, 0);
@@ -1496,7 +1495,6 @@ class CudnnRnnSequenceTensorDescriptor
  }

 private:
  CUDAExecutor* parent_;
  int max_seq_length_;
  int batch_size_;
  int data_size_;
@@ -1511,11 +1509,10 @@ class CudnnRnnSequenceTensorDescriptor

class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
 public:
  CudnnRnnStateTensorDescriptor(CUDAExecutor* parent, int num_layers,
  CudnnRnnStateTensorDescriptor(GpuExecutor* parent, int num_layers,
                                int batch_size, int data_size,
                                cudnnDataType_t data_type)
      : parent_(parent),
        handle_(CreateTensorDescriptor()),
      : handle_(CreateTensorDescriptor()),
        num_layers_(num_layers),
        batch_size_(batch_size),
        data_size_(data_size),
@@ -1535,7 +1532,6 @@ class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
  int data_size() const { return data_size_; }

 private:
  CUDAExecutor* parent_;
  TensorDescriptor handle_;
  int num_layers_;
  int batch_size_;
@@ -1699,14 +1695,14 @@ port::Status CudnnSupport::DoRnnForwardImpl(
    }
  }

  std::unique_ptr<CUDATimer, TimerDeleter> timer;
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  const bool is_profiling = output_profile_result != nullptr;
  if (is_profiling) {
    timer.reset(new CUDATimer(parent_));
    timer.reset(new GpuTimer(parent_));
    // The start and stop of the timer should be as close to the Cudnn call as
    // possible. It is still possible for other threads to issue workload on
    // to this stream. So it could take multiple profiling measurements.
    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return port::Status(port::error::INTERNAL, "Failed to start timer");
    }
  }
@@ -1791,7 +1787,7 @@ port::Status CudnnSupport::DoRnnForwardImpl(
  }

  if (is_profiling) {
    if (!timer->Stop(AsCUDAStream(stream))) {
    if (!timer->Stop(AsGpuStream(stream))) {
      return port::Status(port::error::INTERNAL, "Failed to stop timer");
    }
    auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@@ -1842,14 +1838,14 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
      CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
                         workspace_allocator));

  std::unique_ptr<CUDATimer, TimerDeleter> timer;
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  const bool is_profiling = output_profile_result != nullptr;
  if (is_profiling) {
    timer.reset(new CUDATimer(parent_));
    timer.reset(new GpuTimer(parent_));
    // The start and stop of the timer should be as close to the Cudnn call as
    // possible. It is still possible for other threads to issue workload on
    // to this stream. So it could take multiple profiling measurements.
    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return port::Status(port::error::INTERNAL, "Failed to start timer");
    }
  }
@@ -1948,7 +1944,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
  }

  if (is_profiling) {
    if (!timer->Stop(AsCUDAStream(stream))) {
    if (!timer->Stop(AsGpuStream(stream))) {
      return port::Status(port::error::INTERNAL, "Failed to stop timer");
    }
    auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@@ -2915,13 +2911,13 @@ port::Status CudnnSupport::DoConvolve(

  const bool is_profiling = output_profile_result != nullptr;

  std::unique_ptr<CUDATimer, TimerDeleter> timer;
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (is_profiling) {
    timer.reset(new CUDATimer(parent_));  // NOLINT
    timer.reset(new GpuTimer(parent_));  // NOLINT
    // The start and stop of the timer should be as close to the Cudnn call as
    // possible. It is still possible for other threads to issue workload on
    // to this stream. So it could take multiple profiling measurements.
    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return port::Status(port::error::INTERNAL, "Failed to start timer");
    }
  }
@@ -3110,7 +3106,7 @@ port::Status CudnnSupport::DoConvolve(
  }

  if (is_profiling) {
    if (!timer->Stop(AsCUDAStream(stream))) {
    if (!timer->Stop(AsGpuStream(stream))) {
      return port::Status(port::error::INTERNAL, "Failed to stop timer");
    }
    output_profile_result->set_algorithm(algorithm_desc);
@@ -3175,13 +3171,13 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
      stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
      output_nd, scratch_allocator, &scratch));

  std::unique_ptr<CUDATimer, TimerDeleter> timer;
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (is_profiling) {
    timer.reset(new CUDATimer(parent_));  // NOLINT
    timer.reset(new GpuTimer(parent_));  // NOLINT
    // The start and stop of the timer should be as close to the Cudnn call as
    // possible. It is still possible for other threads to issue workload on
    // to this stream. So it could take multiple profiling measurements.
    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
      return port::Status(port::error::INTERNAL, "Failed to start timer");
    }
  }
@@ -3234,7 +3230,7 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
      /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));

  if (is_profiling) {
    if (!timer->Stop(AsCUDAStream(stream))) {
    if (!timer->Stop(AsGpuStream(stream))) {
      return port::Status(port::error::INTERNAL, "Failed to stop timer");
    }
    output_profile_result->set_algorithm(algo_desc);
@@ -4339,22 +4335,22 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
  return IsStatusOk(status, /*report_error=*/true);
}

} // namespace cuda
} // namespace gpu

void initialize_cudnn() {
  port::Status status =
      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
          cuda::kCudaPlatformId, cuda::kCuDnnPlugin, "cuDNN",
          cuda::kCudaPlatformId, gpu::kCuDnnPlugin, "cuDNN",
          [](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
            cuda::CUDAExecutor* cuda_executor =
                dynamic_cast<cuda::CUDAExecutor*>(parent);
            gpu::GpuExecutor* cuda_executor =
                dynamic_cast<gpu::GpuExecutor*>(parent);
            if (cuda_executor == nullptr) {
              LOG(ERROR) << "Attempting to initialize an instance of the cuDNN "
                         << "support library with a non-CUDA StreamExecutor";
              return nullptr;
            }

            cuda::CudnnSupport* dnn = new cuda::CudnnSupport(cuda_executor);
            gpu::CudnnSupport* dnn = new gpu::CudnnSupport(cuda_executor);
            if (!dnn->Init().ok()) {
              // Note: Init() will log a more specific error.
              delete dnn;
@@ -4369,7 +4365,7 @@ void initialize_cudnn() {
  }

  PluginRegistry::Instance()->SetDefaultFactory(
      cuda::kCudaPlatformId, PluginKind::kDnn, cuda::kCuDnnPlugin);
      cuda::kCudaPlatformId, PluginKind::kDnn, gpu::kCuDnnPlugin);
}

} // namespace stream_executor
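The initialize_cudnn() hunk above illustrates the guard every plugin registration in this PR follows: dynamic_cast the generic executor interface down to gpu::GpuExecutor and bail out cleanly when the executor is not CUDA-backed. A minimal sketch of that guard rendered as a hypothetical free function (MakeCudnnSupport is illustrative; the real code is the lambda shown in the diff):

    // Sketch of the executor guard used inside the factory lambda above.
    dnn::DnnSupport* MakeCudnnSupport(
        internal::StreamExecutorInterface* parent) {
      // After this PR the CUDA executor type is spelled gpu::GpuExecutor;
      // a null cast means `parent` is not backed by it.
      gpu::GpuExecutor* cuda_executor =
          dynamic_cast<gpu::GpuExecutor*>(parent);
      if (cuda_executor == nullptr) {
        LOG(ERROR) << "non-CUDA StreamExecutor passed to the cuDNN factory";
        return nullptr;
      }
      auto* dnn = new gpu::CudnnSupport(cuda_executor);
      if (!dnn->Init().ok()) {
        delete dnn;  // Init() logs the specific error.
        return nullptr;
      }
      return dnn;
    }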
@@ -28,9 +28,9 @@ limitations under the License.
#include "tensorflow/stream_executor/temporary_device_memory.h"

namespace stream_executor {
namespace cuda {
namespace gpu {

class CUDAExecutor;
class GpuExecutor;
class CudnnRnnDescriptor;
class CudnnRnnSequenceTensorDescriptor;
class CudnnRnnStateTensorDescriptor;
@@ -42,7 +42,7 @@ extern const PluginId kCuDnnPlugin;
// functions, see dnn.h.
class CudnnSupport : public dnn::DnnSupport {
 public:
  explicit CudnnSupport(CUDAExecutor* parent);
  explicit CudnnSupport(GpuExecutor* parent);

  port::Status Init() override;
  port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
@@ -552,7 +552,7 @@ class CudnnSupport : public dnn::DnnSupport {
      DeviceMemoryBase* output_data) override;

 private:
  CUDAExecutor* parent_;  // Parent executor object. Not owned.
  GpuExecutor* parent_;  // Parent executor object. Not owned.

  // Provides access to the cuDNN handle.
  std::unique_ptr<class CudnnAccess> cudnn_;
@@ -667,7 +667,7 @@ class CudnnSupport : public dnn::DnnSupport {
  SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
};

} // namespace cuda
} // namespace gpu
} // namespace stream_executor

#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
@@ -45,21 +45,20 @@ bool FLAGS_gpuexec_cuda_device_0_only = false;

// Debugging: on each push and pop of a cuda context, verify the current context
// matches the expected one.
constexpr bool kVerifyCudaContext = false;
constexpr bool kVerifyGpuContext = false;

namespace stream_executor {
namespace cuda {

namespace gpu {
namespace {

// Manages the singleton map of contexts that we've created, mapping
// from the CUcontext to the CudaContext* that we pass around internally.
// This also manages assignment of unique ids to CudaContexts, to allow
// from the CUcontext to the GpuContext* that we pass around internally.
// This also manages assignment of unique ids to GpuContexts, to allow
// for fast comparison of a context against the current context.
//
// CUDA-runtime-created contexts are avoided, if triple angle
// brace launches are required, by using the scoped activations in
// cuda_activation.h.
// gpu/gpu_activation.h.
class CreatedContexts {
 public:
  // Returns whether context is a member of the live set.
@@ -69,14 +68,14 @@ class CreatedContexts {
  }

  // Adds context to the live set, or returns it if it's already present.
  static CudaContext* Add(CUcontext context) {
  static GpuContext* Add(CUcontext context) {
    CHECK(context != nullptr);
    mutex_lock lock(mu_);
    auto insert_result = Live()->insert(std::make_pair(context, nullptr));
    auto it = insert_result.first;
    if (insert_result.second) {
      // context was not present in the map. Add it.
      it->second = MakeUnique<CudaContext>(context, next_id_++);
      it->second = MakeUnique<GpuContext>(context, next_id_++);
    }
    return it->second.get();
  }
@@ -92,9 +91,9 @@ class CreatedContexts {

 private:
  // Returns the live map singleton.
  static std::map<CUcontext, std::unique_ptr<CudaContext>> *Live() {
  static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
    static auto singleton =
        new std::map<CUcontext, std::unique_ptr<CudaContext>>;
        new std::map<CUcontext, std::unique_ptr<GpuContext>>;
    return singleton;
  }

@@ -123,7 +122,7 @@ string ToString(CUresult result) {
// created by StreamExecutor (to ensure that the CUDA runtime didn't create a
// context behind our backs).
CUcontext CurrentContext() {
  CUcontext current = CUDADriver::CurrentContextOrDie();
  CUcontext current = cuda::CurrentContextOrDie();
  if (current != nullptr && !CreatedContexts::Has(current)) {
    LOG(FATAL) << "current context was not created by the StreamExecutor "
                  "cuda_driver API: "
@@ -177,7 +176,7 @@ void SynchronizeOrDie() {

struct ThreadLocalData {
  int64 id;
  CudaContext* context;  // Only valid if id == a known good context.
  GpuContext* context;  // Only valid if id == a known good context.
  int depth;
};

@@ -185,13 +184,13 @@ SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);

}  // namespace

ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
  if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();

  auto* tls = &tls_data.get();
  tls->depth++;
  if (tls->id == cuda_context->id()) {
    if (kVerifyCudaContext) {
    if (kVerifyGpuContext) {
      CHECK_EQ(CurrentContext(), cuda_context->context());
    }
    DCHECK_EQ(CurrentContext(), cuda_context->context());
@@ -215,8 +214,8 @@ ScopedActivateContext::~ScopedActivateContext() {

  auto* tls = &tls_data.get();

  if (kVerifyCudaContext) {
    // Note that if kVerifyCudaContext is used, and contexts are deleted, it's
  if (kVerifyGpuContext) {
    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
    // possible this could fail in the CurrentContext() call.
    CHECK_EQ(CurrentContext(),
             tls->context == nullptr ? nullptr : tls->context->context());
@@ -242,7 +241,7 @@ namespace {
// logging purposes. Returns "?" if the device could not be successfully
// queried.
string CUDAPointerToDeviceString(CUdeviceptr pointer) {
  auto value = CUDADriver::GetPointerDevice(pointer);
  auto value = GpuDriver::GetPointerDevice(pointer);
  if (value.ok()) {
    return absl::StrCat(value.ValueOrDie());
  }
@@ -254,7 +253,7 @@ string CUDAPointerToDeviceString(CUdeviceptr pointer) {
// logging purposes. Returns "?" if the memory space could not be successfully
// queried.
string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
  auto value = CUDADriver::GetPointerMemorySpace(pointer);
  auto value = GpuDriver::GetPointerMemorySpace(pointer);
  if (value.ok()) {
    return MemorySpaceString(value.ValueOrDie());
  }
@@ -267,20 +266,20 @@ string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
// primarily for logging purposes. Returns "error" if an error is encountered
// in the process of querying.
string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
  auto from_context = CUDADriver::GetPointerContext(from);
  auto from_context = GpuDriver::GetPointerContext(from);
  if (!from_context.ok()) {
    LOG(ERROR) << "could not retrieve source pointer's context: "
               << from_context.status();
    return "error";
  }
  auto to_context = CUDADriver::GetPointerContext(to);
  auto to_context = GpuDriver::GetPointerContext(to);
  if (!to_context.ok()) {
    LOG(ERROR) << "could not retrieve destination pointer's context: "
               << to_context.status();
    return "error";
  }
  return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(),
                                         to_context.ValueOrDie())
  return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
                                        to_context.ValueOrDie())
             ? "true"
             : "false";
}
@@ -308,9 +307,9 @@ static port::Status InternalInit() {

}  // namespace

/* static */ port::Status CUDADriver::Init() {
/* static */ port::Status GpuDriver::Init() {
  // Cached return value from calling InternalInit(), as cuInit need only be
  // called once, but CUDADriver::Init may be called many times.
  // called once, but GpuDriver::Init may be called many times.
  static port::Status init_retval;
  static bool set = false;
  static mutex *init_mu = new mutex;
@@ -324,8 +323,8 @@
  return init_retval;
}

/* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
                                                CUdevice *device) {
/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
                                               CUdevice* device) {
  CUresult res = tensorflow::wrap::cuDeviceGet(device, device_ordinal);
  if (res == CUDA_SUCCESS) {
    return port::Status::OK();
@@ -336,8 +335,8 @@
      absl::StrCat("failed call to cuDeviceGet: ", ToString(res)));
}

/* static */ bool CUDADriver::GetDeviceName(CUdevice device,
                                            string *device_name) {
/* static */ bool GpuDriver::GetDeviceName(CUdevice device,
                                           string* device_name) {
  static const size_t kCharLimit = 64;
  absl::InlinedVector<char, 4> chars(kCharLimit);
  CUresult res =
@@ -376,9 +375,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
  return true;
}

/* static */ port::Status CUDADriver::CreateContext(
    CUdevice device, const DeviceOptions &device_options,
    CudaContext **context) {
/* static */ port::Status GpuDriver::CreateContext(
    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
    GpuContext** context) {
  *context = nullptr;

  int flags = 0;
@@ -407,7 +406,7 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
    }
  }

  former_context = CUDADriver::CurrentContextOrDie();
  former_context = cuda::CurrentContextOrDie();
  res = tensorflow::wrap::cuDevicePrimaryCtxRetain(&new_context, device);
  if (former_context != nullptr) {
    CUdevice former_device;
@@ -454,7 +453,7 @@
  return port::Status(port::error::INTERNAL, message);
}

/* static */ void CUDADriver::DestroyContext(CudaContext* context) {
/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
  if (context == nullptr) {
    return;
  }
@@ -473,9 +472,9 @@
  CreatedContexts::Remove(context->context());
}

/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
                                               CUfunction func,
                                               int *attribute_value) {
/* static */ bool GpuDriver::FuncGetAttribute(CUfunction_attribute attribute,
                                              CUfunction func,
                                              int* attribute_value) {
  CUresult res =
      tensorflow::wrap::cuFuncGetAttribute(attribute_value, attribute, func);
  if (res != CUDA_SUCCESS) {
@@ -486,8 +485,8 @@
  return true;
}

/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
                                                 CUfunc_cache cache_config) {
/* static */ bool GpuDriver::FuncSetCacheConfig(CUfunction function,
                                                CUfunc_cache cache_config) {
  CUresult res = tensorflow::wrap::cuFuncSetCacheConfig(function, cache_config);
  if (res != CUDA_SUCCESS) {
    LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
@@ -499,7 +498,7 @@
}

/* static */ port::StatusOr<CUsharedconfig>
CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
  CUsharedconfig shared_mem_config;
  ScopedActivateContext activation(context);
  CUresult result =
@@ -517,8 +516,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return shared_mem_config;
}

/* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
    CudaContext* context, CUsharedconfig shared_mem_config) {
/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
    GpuContext* context, CUsharedconfig shared_mem_config) {
  ScopedActivateContext activation(context);
  CUresult result =
      tensorflow::wrap::cuCtxSetSharedMemConfig(shared_mem_config);
@@ -536,12 +535,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return port::Status::OK();
}

/* static */ bool CUDADriver::LaunchKernel(
    CudaContext* context, CUfunction function, unsigned int grid_dim_x,
/* static */ bool GpuDriver::LaunchKernel(
    GpuContext* context, CUfunction function, unsigned int grid_dim_x,
    unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
    unsigned int block_dim_y, unsigned int block_dim_z,
    unsigned int shared_mem_bytes, CUstream stream, void **kernel_params,
    void **extra) {
    unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
    void** extra) {
  ScopedActivateContext activation(context);
  VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
@@ -559,9 +558,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return true;
}

/* static */ port::Status CUDADriver::LoadCubin(CudaContext* context,
                                                const char *cubin_bytes,
                                                CUmodule *module) {
/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
                                               const char* cubin_bytes,
                                               CUmodule* module) {
  ScopedActivateContext activation(context);
  CUresult result =
      tensorflow::wrap::cuModuleLoadFatBinary(module, cubin_bytes);
@@ -573,9 +572,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return port::Status::OK();
}

/* static */ bool CUDADriver::LoadPtx(CudaContext* context,
                                      const char *ptx_contents,
                                      CUmodule *module) {
/* static */ bool GpuDriver::LoadPtx(GpuContext* context,
                                     const char* ptx_contents,
                                     CUmodule* module) {
  port::Notification notification;
  bool ret = true;
  GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
@@ -643,9 +642,16 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return ret;
}

/* static */ bool CUDADriver::SynchronousMemsetUint8(CudaContext* context,
                                                     CUdeviceptr location,
                                                     uint8 value, size_t size) {
/* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
                                       const char* hsaco_contents,
                                       CUmodule* module) {
  LOG(ERROR) << "Feature not supported on CUDA platform (LoadHsaco)";
  return false;
}

/* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
                                                    CUdeviceptr location,
                                                    uint8 value, size_t size) {
  ScopedActivateContext activation(context);
  CUresult res = tensorflow::wrap::cuMemsetD8(location, value, size);
  if (res != CUDA_SUCCESS) {
@@ -655,10 +661,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return true;
}

/* static */ bool CUDADriver::SynchronousMemsetUint32(CudaContext* context,
                                                      CUdeviceptr location,
                                                      uint32 value,
                                                      size_t uint32_count) {
/* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
                                                     CUdeviceptr location,
                                                     uint32 value,
                                                     size_t uint32_count) {
  ScopedActivateContext activation(context);
  CUresult res = tensorflow::wrap::cuMemsetD32(location, value, uint32_count);
  if (res != CUDA_SUCCESS) {
@@ -668,11 +674,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return true;
}

/* static */ bool CUDADriver::AsynchronousMemsetUint8(CudaContext* context,
                                                      CUdeviceptr location,
                                                      uint8 value,
                                                      size_t uint32_count,
                                                      CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
                                                     CUdeviceptr location,
                                                     uint8 value,
                                                     size_t uint32_count,
                                                     CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res =
      tensorflow::wrap::cuMemsetD8Async(location, value, uint32_count, stream);
@@ -684,11 +690,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return true;
}

/* static */ bool CUDADriver::AsynchronousMemsetUint32(CudaContext* context,
                                                       CUdeviceptr location,
                                                       uint32 value,
                                                       size_t uint32_count,
                                                       CUstream stream) {
/* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
                                                      CUdeviceptr location,
                                                      uint32 value,
                                                      size_t uint32_count,
                                                      CUstream stream) {
  ScopedActivateContext activation(context);
  CUresult res =
      tensorflow::wrap::cuMemsetD32Async(location, value, uint32_count, stream);
@@ -700,10 +706,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return true;
}

/* static */ bool CUDADriver::AddStreamCallback(CudaContext* context,
                                                CUstream stream,
                                                StreamCallback callback,
                                                void *data) {
/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
                                               CUstream stream,
                                               StreamCallback callback,
                                               void* data) {
  // Note: flags param is required to be zero according to CUDA 6.0.
  CUresult res = tensorflow::wrap::cuStreamAddCallback(stream, callback, data,
                                                       0 /* = flags */);
@@ -714,10 +720,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return true;
}

/* static */ bool CUDADriver::GetModuleFunction(CudaContext *context,
                                                CUmodule module,
                                                const char *kernel_name,
                                                CUfunction *function) {
/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
                                               CUmodule module,
                                               const char* kernel_name,
                                               CUfunction* function) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && kernel_name != nullptr);
  CUresult res =
@@ -731,11 +737,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return true;
}

/* static */ bool CUDADriver::GetModuleSymbol(CudaContext* context,
                                              CUmodule module,
                                              const char *symbol_name,
                                              CUdeviceptr *dptr,
                                              size_t *bytes) {
/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
                                             CUmodule module,
                                             const char* symbol_name,
                                             CUdeviceptr* dptr, size_t* bytes) {
  ScopedActivateContext activated{context};
  CHECK(module != nullptr && symbol_name != nullptr &&
        (dptr != nullptr || bytes != nullptr));
@@ -752,8 +757,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  return true;
}

/* static */ void CUDADriver::UnloadModule(CudaContext *context,
                                           CUmodule module) {
/* static */ void GpuDriver::UnloadModule(GpuContext* context,
                                          CUmodule module) {
  ScopedActivateContext activated{context};
  CUresult res = tensorflow::wrap::cuModuleUnload(module);
  if (res != CUDA_SUCCESS) {
@@ -762,8 +767,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
  }
}

/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext(
    CudaContext* context) {
/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
    GpuContext* context) {
  ScopedActivateContext activated{context};
  CUdevice device = -1;
  CUresult result = tensorflow::wrap::cuCtxGetDevice(&device);
@@ -776,26 +781,26 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
      absl::StrCat("failed to get device for context: ", ToString(result)));
}

/* static */ bool CUDADriver::CreateStream(CudaContext *context,
                                           CUstream *out) {
/* static */ bool GpuDriver::CreateStream(GpuContext* context,
|
||||
CUstream* stream) {
|
||||
// TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
|
||||
// up synchronization with respect to memsets and any other things that have
|
||||
// to occur on the default stream?
|
||||
ScopedActivateContext activated{context};
|
||||
CUresult res = tensorflow::wrap::cuStreamCreate(out, 0);
|
||||
CUresult res = tensorflow::wrap::cuStreamCreate(stream, 0);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
LOG(ERROR) << "could not allocate CUDA stream for context "
|
||||
<< context->context() << ": " << ToString(res);
|
||||
return false;
|
||||
}
|
||||
|
||||
VLOG(2) << "successfully created stream " << *out << " for context "
|
||||
VLOG(2) << "successfully created stream " << *stream << " for context "
|
||||
<< context->context() << " on thread";
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ void CUDADriver::DestroyStream(CudaContext* context,
|
||||
CUstream *stream) {
|
||||
/* static */ void GpuDriver::DestroyStream(GpuContext* context,
|
||||
CUstream* stream) {
|
||||
if (*stream == nullptr) {
|
||||
return;
|
||||
}
|
||||
@ -812,8 +817,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ void *CUDADriver::DeviceAllocate(CudaContext *context,
|
||||
uint64 bytes) {
|
||||
/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
|
||||
uint64 bytes) {
|
||||
ScopedActivateContext activated{context};
|
||||
CUdeviceptr result = 0;
|
||||
CUresult res = tensorflow::wrap::cuMemAlloc(&result, bytes);
|
||||
@ -829,8 +834,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
|
||||
void *location) {
|
||||
/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
|
||||
void* location) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
|
||||
CUresult res = tensorflow::wrap::cuMemFree(pointer);
|
||||
@ -843,8 +848,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
|
||||
uint64 bytes) {
|
||||
/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
|
||||
uint64 bytes) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUdeviceptr result = 0;
|
||||
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
|
||||
@ -861,8 +866,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
|
||||
void *location) {
|
||||
/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
|
||||
void* location) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
|
||||
CUresult res = tensorflow::wrap::cuMemFree(pointer);
|
||||
@ -875,8 +880,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
|
||||
uint64 bytes) {
|
||||
/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
|
||||
ScopedActivateContext activation(context);
|
||||
void *host_mem = nullptr;
|
||||
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
|
||||
@ -889,8 +893,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return host_mem;
|
||||
}
|
||||
|
||||
/* static */ void CUDADriver::HostDeallocate(CudaContext* context,
|
||||
void *location) {
|
||||
/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
|
||||
void* location) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult res = tensorflow::wrap::cuMemFreeHost(location);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
@ -899,8 +903,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::HostRegister(CudaContext* context, void *location,
|
||||
uint64 bytes) {
|
||||
/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
|
||||
uint64 bytes) {
|
||||
ScopedActivateContext activation(context);
|
||||
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
|
||||
CUresult res = tensorflow::wrap::cuMemHostRegister(
|
||||
@ -913,8 +917,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::HostUnregister(CudaContext* context,
|
||||
void *location) {
|
||||
/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
|
||||
void* location) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult res = tensorflow::wrap::cuMemHostUnregister(location);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
@ -925,8 +929,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::DestroyEvent(CudaContext* context,
|
||||
CUevent *event) {
|
||||
/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
|
||||
CUevent* event) {
|
||||
if (*event == nullptr) {
|
||||
return port::Status(port::error::INVALID_ARGUMENT,
|
||||
"input event cannot be null");
|
||||
@ -953,9 +957,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::RecordEvent(CudaContext* context,
|
||||
CUevent event,
|
||||
CUstream stream) {
|
||||
/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
|
||||
CUevent event,
|
||||
CUstream stream) {
|
||||
ScopedActivateContext activated{context};
|
||||
CUresult res = tensorflow::wrap::cuEventRecord(event, stream);
|
||||
switch (res) {
|
||||
@ -975,8 +979,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
|
||||
CudaContext *context, CUevent event) {
|
||||
/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
|
||||
CUevent event) {
|
||||
ScopedActivateContext activated{context};
|
||||
CUresult res = tensorflow::wrap::cuEventQuery(event);
|
||||
if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
|
||||
@ -988,9 +992,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return res;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::GetEventElapsedTime(CudaContext* context,
|
||||
float *elapsed_milliseconds,
|
||||
CUevent start, CUevent stop) {
|
||||
/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
|
||||
float* elapsed_milliseconds,
|
||||
CUevent start, CUevent stop) {
|
||||
ScopedActivateContext activated{context};
|
||||
// The stop event must have completed in order for cuEventElapsedTime to
|
||||
// work.
|
||||
@ -1009,9 +1013,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::WaitStreamOnEvent(CudaContext* context,
|
||||
CUstream stream,
|
||||
CUevent event) {
|
||||
/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
|
||||
CUstream stream, CUevent event) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult res =
|
||||
tensorflow::wrap::cuStreamWaitEvent(stream, event, 0 /* = flags */);
|
||||
@ -1023,7 +1026,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
|
||||
/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult res = tensorflow::wrap::cuCtxSynchronize();
|
||||
if (res != CUDA_SUCCESS) {
|
||||
@ -1035,8 +1038,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
|
||||
CUstream stream) {
|
||||
/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
|
||||
CUstream stream) {
|
||||
ScopedActivateContext activated{context};
|
||||
CHECK(stream != nullptr);
|
||||
CUresult res = tensorflow::wrap::cuStreamSynchronize(stream);
|
||||
@ -1051,8 +1054,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return port::Status::OK();
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
|
||||
CUstream stream) {
|
||||
/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
|
||||
CUstream stream) {
|
||||
ScopedActivateContext activated{context};
|
||||
CHECK(stream != nullptr);
|
||||
CUresult res = tensorflow::wrap::cuStreamQuery(stream);
|
||||
@ -1066,10 +1069,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::SynchronousMemcpyD2H(CudaContext *context,
|
||||
void *host_dst,
|
||||
CUdeviceptr gpu_src,
|
||||
uint64 size) {
|
||||
/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
|
||||
void* host_dst,
|
||||
CUdeviceptr gpu_src,
|
||||
uint64 size) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult res = tensorflow::wrap::cuMemcpyDtoH(host_dst, gpu_src, size);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
@ -1084,10 +1087,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return port::Status::OK();
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::SynchronousMemcpyH2D(CudaContext *context,
|
||||
CUdeviceptr gpu_dst,
|
||||
const void *host_src,
|
||||
uint64 size) {
|
||||
/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
|
||||
CUdeviceptr gpu_dst,
|
||||
const void* host_src,
|
||||
uint64 size) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult res = tensorflow::wrap::cuMemcpyHtoD(gpu_dst, host_src, size);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
@ -1101,10 +1104,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return port::Status::OK();
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::SynchronousMemcpyD2D(CudaContext *context,
|
||||
CUdeviceptr gpu_dst,
|
||||
CUdeviceptr gpu_src,
|
||||
uint64 size) {
|
||||
/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
|
||||
CUdeviceptr gpu_dst,
|
||||
CUdeviceptr gpu_src,
|
||||
uint64 size) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult res = tensorflow::wrap::cuMemcpyDtoD(gpu_dst, gpu_src, size);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
@ -1118,11 +1121,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return port::Status::OK();
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CudaContext* context,
|
||||
void *host_dst,
|
||||
CUdeviceptr gpu_src,
|
||||
uint64 size,
|
||||
CUstream stream) {
|
||||
/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
|
||||
void* host_dst,
|
||||
CUdeviceptr gpu_src,
|
||||
uint64 size,
|
||||
CUstream stream) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult res =
|
||||
tensorflow::wrap::cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
|
||||
@ -1140,11 +1143,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::AsynchronousMemcpyH2D(CudaContext* context,
|
||||
CUdeviceptr gpu_dst,
|
||||
const void *host_src,
|
||||
uint64 size,
|
||||
CUstream stream) {
|
||||
/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
|
||||
CUdeviceptr gpu_dst,
|
||||
const void* host_src,
|
||||
uint64 size,
|
||||
CUstream stream) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult res =
|
||||
tensorflow::wrap::cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
|
||||
@ -1161,11 +1164,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CudaContext* context,
|
||||
CUdeviceptr gpu_dst,
|
||||
CUdeviceptr gpu_src,
|
||||
uint64 size,
|
||||
CUstream stream) {
|
||||
/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
|
||||
CUdeviceptr gpu_dst,
|
||||
CUdeviceptr gpu_src,
|
||||
uint64 size,
|
||||
CUstream stream) {
|
||||
ScopedActivateContext activation(context);
|
||||
CUresult result =
|
||||
tensorflow::wrap::cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
|
||||
@ -1189,9 +1192,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::CreateEvent(CudaContext* context,
|
||||
CUevent *result,
|
||||
EventFlags flags) {
|
||||
/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
|
||||
CUevent* result,
|
||||
EventFlags flags) {
|
||||
int cuflags;
|
||||
switch (flags) {
|
||||
case EventFlags::kDefault:
|
||||
@ -1219,7 +1222,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ int CUDADriver::GetDeviceCount() {
|
||||
/* static */ int GpuDriver::GetDeviceCount() {
|
||||
int device_count = 0;
|
||||
CUresult res = tensorflow::wrap::cuDeviceGetCount(&device_count);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
@ -1233,9 +1236,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return device_count;
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext(
|
||||
/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
|
||||
CUdeviceptr pointer) {
|
||||
CudaContext* context = nullptr;
|
||||
GpuContext* context = nullptr;
|
||||
CUresult result = tensorflow::wrap::cuPointerGetAttribute(
|
||||
&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
|
||||
if (result == CUDA_SUCCESS) {
|
||||
@ -1249,7 +1252,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
ToString(result)));
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
|
||||
/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
|
||||
CUdeviceptr pointer) {
|
||||
unsigned int value;
|
||||
CUresult result = tensorflow::wrap::cuPointerGetAttribute(
|
||||
@ -1273,9 +1276,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
ToString(result)));
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
|
||||
CUdeviceptr *base,
|
||||
size_t *size) {
|
||||
/* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
|
||||
CUdeviceptr* base,
|
||||
size_t* size) {
|
||||
CUresult result = tensorflow::wrap::cuMemGetAddressRange(base, size, dptr);
|
||||
if (result == CUDA_SUCCESS) {
|
||||
return port::Status::OK();
|
||||
@ -1295,7 +1298,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
reinterpret_cast<void *>(dptr), ToString(result).c_str()));
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
|
||||
/* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
|
||||
CUdeviceptr pointer) {
|
||||
auto result = GetPointerContext(pointer);
|
||||
if (!result.ok()) {
|
||||
@ -1305,9 +1308,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return DeviceFromContext(result.ValueOrDie());
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major,
|
||||
int *cc_minor,
|
||||
CUdevice device) {
|
||||
/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
|
||||
int* cc_minor,
|
||||
CUdevice device) {
|
||||
*cc_major = 0;
|
||||
*cc_minor = 0;
|
||||
|
||||
@ -1334,6 +1337,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
|
||||
return port::Status::OK();
|
||||
}
|
||||
|
||||
/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
|
||||
CUdevice device) {
|
||||
return port::Status{
|
||||
port::error::INTERNAL,
|
||||
"Feature not supported on CUDA platform (GetGpuISAVersion)"};
|
||||
}
|
||||
|
||||
// Helper function that turns the integer output of cuDeviceGetAttribute to type
|
||||
// T and wraps it in a StatusOr.
|
||||
template <typename T>
|
||||
@ -1352,49 +1362,49 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return converted;
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount(
|
||||
/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
|
||||
CUdevice device) {
|
||||
return GetSimpleAttribute<int>(device,
|
||||
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore(
|
||||
/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
|
||||
CUdevice device) {
|
||||
return GetSimpleAttribute<int64>(
|
||||
device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock(
|
||||
/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
|
||||
CUdevice device) {
|
||||
return GetSimpleAttribute<int64>(
|
||||
device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor(
|
||||
/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
|
||||
CUdevice device) {
|
||||
return GetSimpleAttribute<int64>(
|
||||
device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock(
|
||||
/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
|
||||
CUdevice device) {
|
||||
return GetSimpleAttribute<int64>(device,
|
||||
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock(
|
||||
/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
|
||||
CUdevice device) {
|
||||
return GetSimpleAttribute<int64>(device,
|
||||
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp(
|
||||
/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
|
||||
CUdevice device) {
|
||||
return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
|
||||
CUdevice device) {
|
||||
/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
|
||||
CUdevice device) {
|
||||
int value;
|
||||
CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
|
||||
&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
|
||||
@ -1422,7 +1432,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
|
||||
/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
|
||||
CUresult res = tensorflow::wrap::cuDriverGetVersion(driver_version);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
LOG(ERROR) << "failed to query driver version: " << ToString(res);
|
||||
@ -1432,7 +1442,19 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
|
||||
/* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
|
||||
int device_ordinal) {
|
||||
CUresult res = tensorflow::wrap::cuDeviceGetProperties(device_properties,
|
||||
device_ordinal);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
LOG(ERROR) << "failed to query device properties: " << ToString(res);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
|
||||
CUdevice_attribute attribute, CUdevice device) {
|
||||
int val;
|
||||
CUresult res =
|
||||
@ -1446,7 +1468,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return val;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
|
||||
/* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
|
||||
int value = -1;
|
||||
CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
|
||||
&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
|
||||
@ -1459,9 +1481,9 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::GetDeviceMemoryInfo(CudaContext* context,
|
||||
int64 *free_out,
|
||||
int64 *total_out) {
|
||||
/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
|
||||
int64* free_out,
|
||||
int64* total_out) {
|
||||
ScopedActivateContext activation(context);
|
||||
size_t free = 0;
|
||||
size_t total = 0;
|
||||
@ -1476,8 +1498,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
|
||||
uint64 *result) {
|
||||
/* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
|
||||
uint64* result) {
|
||||
size_t value = -1;
|
||||
CUresult res = tensorflow::wrap::cuDeviceTotalMem(&value, device);
|
||||
if (res != CUDA_SUCCESS) {
|
||||
@ -1489,7 +1511,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */ string CUDADriver::GetPCIBusID(CUdevice device) {
|
||||
/* static */ string GpuDriver::GetPCIBusID(CUdevice device) {
|
||||
string pci_bus_id;
|
||||
static const int kBufferSize = 64;
|
||||
absl::InlinedVector<char, 4> chars(kBufferSize);
|
||||
@ -1504,8 +1526,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return pci_bus_id;
|
||||
}
|
||||
|
||||
/* static */ bool CUDADriver::CanEnablePeerAccess(CudaContext* from,
|
||||
CudaContext* to) {
|
||||
/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
|
||||
GpuContext* to) {
|
||||
if (from == to) {
|
||||
return true; // A context can always access its own memory.
|
||||
}
|
||||
@ -1533,8 +1555,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return can_access_peer;
|
||||
}
|
||||
|
||||
/* static */ port::Status CUDADriver::EnablePeerAccess(CudaContext* from,
|
||||
CudaContext* to) {
|
||||
/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
|
||||
GpuContext* to) {
|
||||
if (from == to) {
|
||||
return port::Status::OK(); // A context can always access its own memory.
|
||||
}
|
||||
@ -1553,8 +1575,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return port::Status::OK();
|
||||
}
|
||||
|
||||
/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore(
|
||||
CudaContext* context, CUfunction kernel, int threads_per_block,
|
||||
/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
|
||||
GpuContext* context, CUfunction kernel, int threads_per_block,
|
||||
size_t dynamic_shared_memory_bytes) {
|
||||
ScopedActivateContext activation(context);
|
||||
|
||||
@ -1572,11 +1594,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
|
||||
return max_blocks;
|
||||
}
|
||||
|
||||
/* static */ CUcontext CUDADriver::CurrentContextOrDie() {
|
||||
} // namespace gpu
|
||||
|
||||
namespace cuda {
|
||||
|
||||
CUcontext CurrentContextOrDie() {
|
||||
CUcontext current = nullptr;
|
||||
CUresult result = tensorflow::wrap::cuCtxGetCurrent(¤t);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
LOG(FATAL) << "failed to query current context: " << ToString(result);
|
||||
LOG(FATAL) << "failed to query current context: " << gpu::ToString(result);
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
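
For orientation, a minimal caller-side sketch (not part of this diff) of how the renamed wrappers above are consumed; it assumes a GpuContext* obtained elsewhere and the stream_executor headers in this PR:

// Hypothetical usage sketch -- illustration only, not part of this change.
void FillAndCopy(stream_executor::gpu::GpuContext* ctx, CUdeviceptr dst,
                 const void* src, uint64 size) {
  using stream_executor::gpu::GpuDriver;
  // bool-returning wrappers log failures internally and only report success.
  if (!GpuDriver::SynchronousMemsetUint8(ctx, dst, 0, size)) {
    LOG(ERROR) << "memset failed";
    return;
  }
  // Status-returning wrappers surface the error to the caller instead.
  stream_executor::port::Status status =
      GpuDriver::SynchronousMemcpyH2D(ctx, dst, src, size);
  if (!status.ok()) {
    LOG(ERROR) << "H2D copy failed: " << status.ToString();
  }
}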
@ -18,495 +18,45 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_

#include <stddef.h>
#include "tensorflow/stream_executor/platform/port.h"

#include "cuda/include/cuda.h"
#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"

namespace stream_executor {
namespace cuda {

// Identifies the memory space where an allocation resides. See
// CUDADriver::GetPointerMemorySpace().
enum class MemorySpace { kHost, kDevice };

// Returns a casual string, such as "host" for the provided memory space.
string MemorySpaceString(MemorySpace memory_space);

class CudaContext;

// CUDADriver contains wrappers for calls to the userspace library driver. It's
// useful to isolate these calls and put basic wrappers around them to separate
// userspace library driver behaviors from the rest of the program.
//
// At the moment it's simply used as a namespace.
//
// The calls log any specific errors internally and return whether the operation
// was successful to the caller.
//
// The order of parameters is generally kept symmetric with the underlying CUDA
// driver API.
//
// Links on functions are to specific documentation under
// http://docs.nvidia.com/cuda/cuda-driver-api/
//
// Thread safety: these functions should not be used from signal handlers.
class CUDADriver {
public:
// Wraps a call to cuInit with logging to help indicate what has gone wrong in
// the case of failure. Safe to call multiple times; will be fast on all calls
// after the first.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
static port::Status Init();

// Returns the device associated with the given context.
// device is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
static port::StatusOr<CUdevice> DeviceFromContext(CudaContext* context);

// Creates a new CUDA stream associated with the given context via
// cuStreamCreate.
// stream is an outparam owned by the caller, must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
static bool CreateStream(CudaContext* context, CUstream* stream);

// Destroys a CUDA stream associated with the given context.
// stream is owned by the caller, must not be null, and *stream is set to null
// if the stream is successfully destroyed.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
static void DestroyStream(CudaContext* context, CUstream* stream);

// CUDA events can explicitly disable event TSC retrieval for some presumed
// performance improvement if timing is unnecessary.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
enum class EventFlags { kDefault, kDisableTiming };

// Creates a new event associated with the given context.
// result is an outparam owned by the caller and must not be null.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
static port::Status CreateEvent(CudaContext* context, CUevent* result,
EventFlags flags);

// Destroys *event and turns it into a nullptr. event may not be null, but
// *event may be, via cuEventDestroy
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
static port::Status DestroyEvent(CudaContext* context, CUevent* event);

// Allocates a GPU memory space of size bytes associated with the given
// context via cuMemAlloc.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
static void* DeviceAllocate(CudaContext* context, uint64 bytes);

// Deallocates a GPU memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void DeviceDeallocate(CudaContext* context, void* location);

// Allocates a unified memory space of size bytes associated with the given
// context via cuMemAllocManaged.
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);

// Deallocates a unified memory space of size bytes associated with the given
// context via cuMemFree.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void UnifiedMemoryDeallocate(CudaContext* context, void* location);

// Allocates page-locked and CUDA-registered memory on the host via
// cuMemAllocHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
static void* HostAllocate(CudaContext* context, uint64 bytes);

// Deallocates a location created by HostAllocate, via cuMemFreeHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
static void HostDeallocate(CudaContext* context, void* location);

// Registers a memory region at location of size bytes via cuMemHostRegister.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
static bool HostRegister(CudaContext* context, void* location, uint64 bytes);

// Unregisters a memory region that was previously registered at location via
// cuMemHostUnregister.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
//
// TODO(leary) verify an error will be returned if the location wasn't
// previously registered.
static bool HostUnregister(CudaContext* context, void* location);

// Given a device ordinal, returns a device handle into the device outparam,
// which must not be null.
//
// N.B. these device handles do not have a corresponding destroy function in
// the CUDA driver API.
static port::Status GetDevice(int device_ordinal, CUdevice* device);

// Given a device handle, returns the name reported by the driver for the
// device.
static bool GetDeviceName(CUdevice device, string* name_out);

// Given a device to create a context for, returns a context handle into the
// context outparam, which must not be null.
//
// N.B. CUDA contexts are weird. They are implicitly associated with the
// calling thread. Current documentation on contexts and their influence on
// userspace processes is given here:
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
static port::Status CreateContext(CUdevice device,
const DeviceOptions& device_options,
CudaContext** context);

// Destroys the provided context via cuCtxDestroy.
// Don't do this while clients could still be using the context, per the docs
// bad things will happen.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
static void DestroyContext(CudaContext* context);

// Queries the runtime for the specified attribute of the specified function.
// cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
// in terms of integer-sized values, so there's no potential for overrun (as
// of CUDA 5.5).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
static bool FuncGetAttribute(CUfunction_attribute attribute,
CUfunction function, int* attribute_value);

// Sets the preferred cache configuration for the specified function.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
static bool FuncSetCacheConfig(CUfunction function,
CUfunc_cache cache_config);

// Gets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig(
CudaContext* context);

// Sets the preferred shared memory bank configuration for the specified
// CONTEXT (not function!), either default or four- or eight-byte bank size.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
static port::Status ContextSetSharedMemConfig(
CudaContext* context, CUsharedconfig shared_mem_config);

// Launches a CUDA kernel via cuLaunchKernel.
// TODO(leary) describe the structure of kernel_params and extra in a readable
// way.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
static bool LaunchKernel(CudaContext* context, CUfunction function,
unsigned int grid_dim_x, unsigned int grid_dim_y,
unsigned int grid_dim_z, unsigned int block_dim_x,
unsigned int block_dim_y, unsigned int block_dim_z,
unsigned int shared_mem_bytes, CUstream stream,
void** kernel_params, void** extra);

// Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
// handle in "module". Any error logs that are produced are logged internally.
static bool LoadPtx(CudaContext* context, const char* ptx_contents,
CUmodule* module);

// Loads cubin_bytes with the CUDA driver's blob loading interface and stores
// the resulting handle in "module".
static port::Status LoadCubin(CudaContext* context, const char* cubin_bytes,
CUmodule* module);

// Retrieves a named kernel from a loaded module, and places the resulting
// handle into function (outparam) on success. Neither kernel_name nor
// function may be null. No ownership is taken of kernel_name.
static bool GetModuleFunction(CudaContext* context, CUmodule module,
const char* kernel_name, CUfunction* function);

// Retrieves a named global/constant symbol from a loaded module, and returns
// a device pointer and size of the symbol on success. symbol_name may not be
// null. At least one of dptr or bytes should not be null. No ownership is
// taken of symbol_name.
static bool GetModuleSymbol(CudaContext* context, CUmodule module,
const char* symbol_name, CUdeviceptr* dptr,
size_t* bytes);

// Unloads module from the current context via cuModuleUnload.
// TODO(leary) the documentation doesn't say what kind of disasters happen
// if you try to unload a module while its CUfunctions are in use.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
static void UnloadModule(CudaContext* context, CUmodule module);

// Performs a synchronous memset of the device memory segment via cuMemsetD8.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
static bool SynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
uint8 value, size_t size);

// Performs a synchronous memset of the device memory segment via cuMemsetD32.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
static bool SynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location, uint32 value,
size_t uint32_count);

// Performs an asynchronous memset of the device memory segment via
// cuMemsetD8Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
static bool AsynchronousMemsetUint8(CudaContext* context,
CUdeviceptr location, uint8 value,
size_t uint32_count, CUstream stream);

// Performs an asynchronous memset of the device memory segment via
// cuMemsetD32Async.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
static bool AsynchronousMemsetUint32(CudaContext* context,
CUdeviceptr location, uint32 value,
size_t uint32_count, CUstream stream);

// -- Synchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169

static port::Status SynchronousMemcpyD2H(CudaContext* context, void* host_dst,
CUdeviceptr gpu_src, uint64 size);
static port::Status SynchronousMemcpyH2D(CudaContext* context,
CUdeviceptr gpu_dst,
const void* host_src, uint64 size);
static port::Status SynchronousMemcpyD2D(CudaContext* context,
CUdeviceptr gpu_dst,
CUdeviceptr gpu_src, uint64 size);

// -- Asynchronous memcopies.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362

static bool AsynchronousMemcpyD2H(CudaContext* context, void* host_dst,
CUdeviceptr gpu_src, uint64 size,
CUstream stream);
static bool AsynchronousMemcpyH2D(CudaContext* context, CUdeviceptr gpu_dst,
const void* host_src, uint64 size,
CUstream stream);
static bool AsynchronousMemcpyD2D(CudaContext* context, CUdeviceptr gpu_dst,
CUdeviceptr gpu_src, uint64 size,
CUstream stream);

// The CUDA stream callback type signature.
// The data passed to AddStreamCallback is subsequently passed to this
// callback when it fires.
//
// Some notable things:
// * Callbacks must not make any CUDA API calls.
// * Callbacks from independent streams execute in an undefined order and may
//   be serialized.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
typedef void (*StreamCallback)(CUstream stream, CUresult status, void* data);

// Enqueues a callback operation into stream.
// See StreamCallback above and the NVIDIA documentation for additional
// details.
static bool AddStreamCallback(CudaContext* context, CUstream stream,
StreamCallback callback, void* data);

// Causes stream to wait for event to trigger before proceeding via
// cuStreamWaitEvent.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
static bool WaitStreamOnEvent(CudaContext* context, CUstream stream,
CUevent event);

// Blocks the calling thread until the operations enqueued onto stream have
// been completed, via cuStreamSynchronize.
//
// TODO(leary) if a pathological thread enqueues operations onto the stream
// while another thread blocks like this, can you wind up waiting an unbounded
// amount of time?
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
static port::Status SynchronizeStream(CudaContext* context, CUstream stream);

// Blocks the calling thread until the operations associated with the context
// have been completed, via cuCtxSynchronize.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
static bool SynchronizeContext(CudaContext* context);

// Returns true if all stream tasks have completed at time of the call. Note
// the potential for races around this call (if another thread adds work to
// the stream immediately after this returns).
static bool IsStreamIdle(CudaContext* context, CUstream stream);

// Returns whether code in the from context can access memory in the to
// context via cuDeviceCanAccessPeer.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
static bool CanEnablePeerAccess(CudaContext* from, CudaContext* to);

// Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
static port::Status EnablePeerAccess(CudaContext* from, CudaContext* to);

// Returns the elapsed milliseconds between start and stop via
// cuEventElapsedTime.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
static bool GetEventElapsedTime(CudaContext* context,
float* elapsed_milliseconds, CUevent start,
CUevent stop);

// Records that an event occurred when execution reaches the current point in
// thestream via cuEventRecord.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
static port::Status RecordEvent(CudaContext* context, CUevent event,
CUstream stream);

// Polls (without blocking) to determine the status of an event - pending or
// complete (or an error status).
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
static port::StatusOr<CUresult> QueryEvent(CudaContext* context,
CUevent event);

// -- Pointer-specific calls.

// Returns the context in which pointer was allocated or registered.
static port::StatusOr<CudaContext*> GetPointerContext(CUdeviceptr pointer);

// Returns the device associated with the context from GetPointerContext().
static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer);

// Returns the memory space addressed by pointer.
static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer);

// Returns the base address and size of the device pointer dptr.
static port::Status GetPointerAddressRange(CUdeviceptr dptr,
CUdeviceptr* base, size_t* size);

// -- Device-specific calls.

// Returns the compute capability for the device; i.e (3, 5).
// This is currently done via the deprecated device API.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
CUdevice device);

// Returns the number of multiprocessors on the device (note that the device
// may be multi-GPU-per-board).
static port::StatusOr<int> GetMultiprocessorCount(CUdevice device);

// Returns the limit on number of threads that can be resident in a single
// multiprocessor.
static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device);

// Returns the limit on number of threads which may be resident for a single
// block (cooperative thread array).
static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device);

// Returns the amount of shared memory available on a single GPU core (i.e.
// SM on NVIDIA devices).
static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device);

// Returns the amount of shared memory available for a single block
// (cooperative thread array).
static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device);

// Returns the maximum supported number of registers per block.
static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);

// Returns the number of threads per warp.
static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);

// Queries the grid limits for device with cuDeviceGetAttribute calls.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool GetGridLimits(int* x, int* y, int* z, CUdevice device);

// Gets a specific integer-valued property about the given device.
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
CUdevice device);

// Returns whether ECC is enabled for the given CUdevice via
// cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
static bool IsEccEnabled(CUdevice device, bool* result);

// Returns the total amount of memory available for allocation by the CUDA
// context, in bytes, via cuDeviceTotalMem.
static bool GetDeviceTotalMemory(CUdevice device, uint64* result);

// Returns the free amount of memory and total amount of memory, as reported
// by cuMemGetInfo.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
static bool GetDeviceMemoryInfo(CudaContext* context, int64* free,
int64* total);

// Returns a PCI bus id string for the device.
// [domain]:[bus]:[device].[function]
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
static string GetPCIBusID(CUdevice device);

// -- Context- and device-independent calls.

// Returns the number of visible CUDA device via cuDeviceGetCount.
// This should correspond to the set of device ordinals available.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
static int GetDeviceCount();

// Returns the driver version number via cuDriverGetVersion.
// This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
// instead, the CUDA toolkit release number that this driver is compatible
// with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
// compatible driver).
//
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
static bool GetDriverVersion(int* driver_version);

// -- Other calls

// Returns the maximum number of blocks (per multiprocessor) occupied by the
// specified kernel/CUfunction when launched with the specified parameters.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
CudaContext* context, CUfunction kernel, int threads_per_block,
size_t dynamic_shared_memory_bytes);

// Returns the current context set in CUDA. This is done by calling the cuda
// driver (e.g., this value is not our cached view of the current context).
static CUcontext CurrentContextOrDie();

// Seam for injecting an error at CUDA initialization time for testing
// purposes.
static bool driver_inject_init_error_;
};

// Ensures a context is activated within a scope.
class ScopedActivateContext {
public:
// Activates the context via cuCtxSetCurrent, if it is not the currently
// active context (a la cuCtxGetCurrent). Note the alternative push/pop
// mechanism is said by NVIDIA to be relatively slow and deprecated.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
explicit ScopedActivateContext(CudaContext* context);

// Checks that the context has remained activated for the duration of the
// scope.
~ScopedActivateContext();

private:
CudaContext* to_restore_ = nullptr;
};

// CudaContext wraps a cuda CUcontext handle, and includes a unique id. The
namespace gpu {
// CUDAContext wraps a cuda CUcontext handle, and includes a unique id. The
// unique id is positive, and ids are not repeated within the process.
class CudaContext {
class GpuContext {
public:
CudaContext(CUcontext context, int64 id) : context_(context), id_(id) {}
GpuContext(CUcontext context, int64 id) : context_(context), id_(id) {}

CUcontext context() const { return context_; }
int64 id() const { return id_; }

// Disallow copying and moving.
CudaContext(CudaContext&&) = delete;
CudaContext(const CudaContext&) = delete;
CudaContext& operator=(CudaContext&&) = delete;
CudaContext& operator=(const CudaContext&) = delete;
GpuContext(GpuContext&&) = delete;
GpuContext(const GpuContext&) = delete;
GpuContext& operator=(GpuContext&&) = delete;
GpuContext& operator=(const GpuContext&) = delete;

private:
CUcontext const context_;
const int64 id_;
};

inline CUcontext CurrentContextOrDie() {
return CUDADriver::CurrentContextOrDie();
}
} // namespace gpu

namespace cuda {

using MemorySpace = gpu::MemorySpace;

using CUDADriver = gpu::GpuDriver;

using ScopedActivateContext = gpu::ScopedActivateContext;

using CudaContext = gpu::GpuContext;

// Returns the current context set in CUDA. This is done by calling the cuda
// driver (e.g., this value is not our cached view of the current context).
CUcontext CurrentContextOrDie();

} // namespace cuda
} // namespace stream_executor
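
The cuda namespace block above is the whole source-compatibility story; a minimal sketch (not part of this diff) of why existing call sites keep compiling unchanged:

// Hypothetical caller -- illustration only.
#include "tensorflow/stream_executor/cuda/cuda_driver.h"

int CountVisibleDevices() {
  // cuda::CUDADriver is a using-alias for gpu::GpuDriver, so this resolves
  // to gpu::GpuDriver::GetDeviceCount() with no source changes required.
  return stream_executor::cuda::CUDADriver::GetDeviceCount();
}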
@ -78,6 +78,7 @@ namespace wrap {
__macro(cuDeviceGetCount) \
__macro(cuDeviceGetName) \
__macro(cuDeviceGetPCIBusId) \
__macro(cuDeviceGetProperties) \
__macro(cuDevicePrimaryCtxGetState) \
__macro(cuDevicePrimaryCtxRelease) \
__macro(cuDevicePrimaryCtxRetain) \
@ -20,30 +20,11 @@ limitations under the License.
#include "tensorflow/stream_executor/lib/statusor.h"

namespace stream_executor {
namespace cuda {
namespace gpu {

CUDAEvent::CUDAEvent(CUDAExecutor* parent)
: parent_(parent), cuda_event_(nullptr) {}

CUDAEvent::~CUDAEvent() {}

port::Status CUDAEvent::Init() {
return CUDADriver::CreateEvent(parent_->cuda_context(), &cuda_event_,
CUDADriver::EventFlags::kDisableTiming);
}

port::Status CUDAEvent::Destroy() {
return CUDADriver::DestroyEvent(parent_->cuda_context(), &cuda_event_);
}

port::Status CUDAEvent::Record(CUDAStream* stream) {
return CUDADriver::RecordEvent(parent_->cuda_context(), cuda_event_,
stream->cuda_stream());
}

Event::Status CUDAEvent::PollForStatus() {
Event::Status GpuEvent::PollForStatus() {
port::StatusOr<CUresult> status =
CUDADriver::QueryEvent(parent_->cuda_context(), cuda_event_);
GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
if (!status.ok()) {
LOG(ERROR) << "Error polling for event status: "
<< status.status().error_message();
@ -62,9 +43,5 @@ Event::Status CUDAEvent::PollForStatus() {
}
}

const CUevent& CUDAEvent::cuda_event() {
return cuda_event_;
}

} // namespace cuda
} // namespace gpu
} // namespace stream_executor
@ -16,45 +16,12 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_

#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/gpu/gpu_event.h"

namespace stream_executor {
namespace cuda {

// CUDAEvent wraps a CUevent in the platform-independent EventInterface
// interface.
class CUDAEvent : public internal::EventInterface {
 public:
  explicit CUDAEvent(CUDAExecutor* parent);

  ~CUDAEvent() override;

  // Populates the CUDA-platform-specific elements of this object.
  port::Status Init();

  // Deallocates any platform-specific elements of this object. This is broken
  // out (not part of the destructor) to allow for error reporting.
  port::Status Destroy();

  // Inserts the event at the current position into the specified stream.
  port::Status Record(CUDAStream* stream);

  // Polls the CUDA platform for the event's current status.
  Event::Status PollForStatus();

  // The underlying CUDA event element.
  const CUevent& cuda_event();

 private:
  // The Executor to which this object and CUevent are bound.
  CUDAExecutor* parent_;

  // The underlying CUDA event element.
  CUevent cuda_event_;
};
using CUDAEvent = gpu::GpuEvent;

}  // namespace cuda
}  // namespace stream_executor

@ -33,7 +33,7 @@ limitations under the License.
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace stream_executor {
namespace cuda {
namespace gpu {

PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);

@ -45,13 +45,13 @@ namespace wrap {
// manner on first use. This dynamic loading technique is used to avoid DSO
// dependencies on vendor libraries which may or may not be available in the
// deployed binary environment.
#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                        \
  struct WrapperShim__##__name {                                  \
    template <typename... Args>                                   \
    cufftResult operator()(CUDAExecutor *parent, Args... args) {  \
      cuda::ScopedActivateExecutorContext sac{parent};            \
      return ::__name(args...);                                   \
    }                                                             \
#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                        \
  struct WrapperShim__##__name {                                  \
    template <typename... Args>                                   \
    cufftResult operator()(GpuExecutor *parent, Args... args) {   \
      gpu::ScopedActivateExecutorContext sac{parent};             \
      return ::__name(args...);                                   \
    }                                                             \
  } __name;

#else

@ -77,8 +77,8 @@ namespace wrap {
      return f;                                                   \
    }                                                             \
    template <typename... Args>                                   \
    cufftResult operator()(CUDAExecutor *parent, Args... args) {  \
      cuda::ScopedActivateExecutorContext sac{parent};            \
    cufftResult operator()(GpuExecutor *parent, Args... args) {   \
      gpu::ScopedActivateExecutorContext sac{parent};             \
      return DynLoad()(args...);                                  \
    }                                                             \
  } __name;                                                       \
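Expanding one of these wrappers by hand shows what the shim buys: a callable object named after the wrapped symbol, whose operator() takes the executor as an extra first argument, activates that executor's context, and only then forwards to the vendor entry point. A simplified, self-contained expansion under assumed toy types (fakeSetStream is a made-up function, not a real cuFFT API):

    #include <cstdio>

    // Toy stand-ins; the point is the shim's shape, not cuFFT itself.
    struct GpuExecutor {};
    struct ScopedActivateExecutorContext {
      explicit ScopedActivateExecutorContext(GpuExecutor*) {}
    };
    using cufftResult = int;

    cufftResult fakeSetStream(int plan, int stream) {
      std::printf("fakeSetStream(%d, %d)\n", plan, stream);
      return 0;
    }

    #define WRAP(__name)                                            \
      struct WrapperShim__##__name {                                \
        template <typename... Args>                                 \
        cufftResult operator()(GpuExecutor* parent, Args... args) { \
          ScopedActivateExecutorContext sac{parent};                \
          return ::__name(args...);                                 \
        }                                                           \
      } __name;

    namespace wrap {
    WRAP(fakeSetStream)  // defines wrap::fakeSetStream, a shim object
    }  // namespace wrap

    int main() {
      GpuExecutor exec;
      // The extra first argument picks which executor's context to activate.
      return wrap::fakeSetStream(&exec, /*plan=*/1, /*stream=*/2);
    }
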
@ -145,8 +145,8 @@ cufftType CUDAFftType(fft::Type type) {
}

// Associates the given stream with the given cuFFT plan.
bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
  auto ret = wrap::cufftSetStream(parent, plan, AsCUDAStreamValue(stream));
bool SetStream(GpuExecutor *parent, cufftHandle plan, Stream *stream) {
  auto ret = wrap::cufftSetStream(parent, plan, AsGpuStreamValue(stream));
  if (ret != CUFFT_SUCCESS) {
    LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret;
    return false;

@ -157,7 +157,7 @@ bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
}  // namespace

port::Status CUDAFftPlan::Initialize(
    CUDAExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
    GpuExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
    uint64 *input_embed, uint64 input_stride, uint64 input_distance,
    uint64 *output_embed, uint64 output_stride, uint64 output_distance,
    fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {

@ -317,7 +317,7 @@ port::Status CUDAFftPlan::Initialize(
  return port::Status::OK();
}

port::Status CUDAFftPlan::Initialize(CUDAExecutor *parent, Stream *stream,
port::Status CUDAFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
                                     int rank, uint64 *elem_count,
                                     fft::Type type,
                                     ScratchAllocator *scratch_allocator) {

@ -549,8 +549,8 @@ bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
  }

  auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
                       CUDAComplex(CUDAMemoryMutable(output)));
                       GpuComplex(const_cast<InputT *>(GpuMemory(input))),
                       GpuComplex(GpuMemoryMutable(output)));

  if (ret != CUFFT_SUCCESS) {
    LOG(ERROR) << "failed to run cuFFT routine: " << ret;

@ -576,8 +576,8 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
  }

  auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
                       CUDAComplex(CUDAMemoryMutable(output)),
                       GpuComplex(const_cast<InputT *>(GpuMemory(input))),
                       GpuComplex(GpuMemoryMutable(output)),
                       cuda_fft_plan->GetFftDirection());

  if (ret != CUFFT_SUCCESS) {
@ -614,22 +614,22 @@ STREAM_EXECUTOR_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)

#undef STREAM_EXECUTOR_CUDA_DEFINE_FFT

}  // namespace cuda
}  // namespace gpu

void initialize_cufft() {
  port::Status status =
      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
          cuda::kCudaPlatformId, cuda::kCuFftPlugin, "cuFFT",
          cuda::kCudaPlatformId, gpu::kCuFftPlugin, "cuFFT",
          [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
            cuda::CUDAExecutor *cuda_executor =
                dynamic_cast<cuda::CUDAExecutor *>(parent);
            gpu::GpuExecutor *cuda_executor =
                dynamic_cast<gpu::GpuExecutor *>(parent);
            if (cuda_executor == nullptr) {
              LOG(ERROR) << "Attempting to initialize an instance of the cuFFT "
                         << "support library with a non-CUDA StreamExecutor";
              return nullptr;
            }

            return new cuda::CUDAFft(cuda_executor);
            return new gpu::CUDAFft(cuda_executor);
          });
  if (!status.ok()) {
    LOG(ERROR) << "Unable to register cuFFT factory: "

@ -637,7 +637,7 @@ void initialize_cufft() {
  }

  PluginRegistry::Instance()->SetDefaultFactory(
      cuda::kCudaPlatformId, PluginKind::kFft, cuda::kCuFftPlugin);
      cuda::kCudaPlatformId, PluginKind::kFft, gpu::kCuFftPlugin);
}

}  // namespace stream_executor

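The registration above is the standard StreamExecutor plugin shape: a factory keyed by platform id and plugin id, which downcasts the generic executor and refuses to build support on the wrong platform. A standalone sketch of that factory contract with a toy registry (the real PluginRegistry API takes more parameters and returns port::Status):

    #include <functional>
    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>

    // Toy stand-ins for the executor hierarchy and the FFT interface.
    struct StreamExecutorInterface {
      virtual ~StreamExecutorInterface() = default;
    };
    struct GpuExecutor : StreamExecutorInterface {};
    struct FftSupport {
      virtual ~FftSupport() = default;
    };
    struct CudaFft : FftSupport {};

    using FftFactory = std::function<FftSupport*(StreamExecutorInterface*)>;
    std::map<std::string, FftFactory> g_registry;  // keyed by platform id

    int main() {
      // The factory refuses to construct on a non-CUDA executor, mirroring
      // the dynamic_cast check in initialize_cufft above.
      g_registry["cuda"] = [](StreamExecutorInterface* parent) -> FftSupport* {
        auto* exec = dynamic_cast<GpuExecutor*>(parent);
        if (exec == nullptr) {
          std::cerr << "non-CUDA StreamExecutor\n";
          return nullptr;
        }
        return new CudaFft;
      };

      GpuExecutor exec;
      std::unique_ptr<FftSupport> fft(g_registry["cuda"](&exec));
      std::cout << (fft ? "built cuFFT support\n" : "factory refused\n");
    }
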
@ -30,9 +30,9 @@ namespace stream_executor {

class Stream;

namespace cuda {
namespace gpu {

class CUDAExecutor;
class GpuExecutor;

// Opaque and unique identifier for the cuFFT plugin.
extern const PluginId kCuFftPlugin;

@ -64,17 +64,17 @@ class CUDAFftPlan : public fft::Plan {
  }

  // Initialize function for batched plan
  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
                          uint64 *elem_count, uint64 *input_embed,
  port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
                          uint64* elem_count, uint64* input_embed,
                          uint64 input_stride, uint64 input_distance,
                          uint64 *output_embed, uint64 output_stride,
                          uint64* output_embed, uint64 output_stride,
                          uint64 output_distance, fft::Type type,
                          int batch_count, ScratchAllocator *scratch_allocator);
                          int batch_count, ScratchAllocator* scratch_allocator);

  // Initialize function for 1d, 2d, and 3d plans
  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
                          uint64 *elem_count, fft::Type type,
                          ScratchAllocator *scratch_allocator);
  port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
                          uint64* elem_count, fft::Type type,
                          ScratchAllocator* scratch_allocator);

  port::Status UpdateScratchAllocator(Stream *stream,
                                      ScratchAllocator *scratch_allocator);

@ -83,7 +83,7 @@ class CUDAFftPlan : public fft::Plan {
  bool IsInitialized() const { return is_initialized_; }

 private:
  CUDAExecutor *parent_;
  GpuExecutor* parent_;
  cufftHandle plan_;
  fft::Type fft_type_;
  DeviceMemory<uint8> scratch_;

@ -96,7 +96,7 @@ class CUDAFftPlan : public fft::Plan {
// This satisfies the platform-agnostic FftSupport interface.
//
// Note that the cuFFT handle that this encapsulates is implicitly tied to the
// context (and, as a result, the device) that the parent CUDAExecutor is tied
// context (and, as a result, the device) that the parent GpuExecutor is tied
// to. This simply happens as an artifact of creating the cuFFT handle when a
// CUDA context is active.
//

@ -104,13 +104,13 @@ class CUDAFftPlan : public fft::Plan {
// context of parent_, so all context is explicit.
class CUDAFft : public fft::FftSupport {
 public:
  explicit CUDAFft(CUDAExecutor *parent) : parent_(parent) {}
  explicit CUDAFft(GpuExecutor* parent) : parent_(parent) {}
  ~CUDAFft() override {}

  TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES

 private:
  CUDAExecutor *parent_;
  GpuExecutor* parent_;

  // Two helper functions that execute dynload::cufftExec?2?.

@ -131,7 +131,7 @@ class CUDAFft : public fft::FftSupport {
  SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft);
};

}  // namespace cuda
}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_

@ -72,7 +72,7 @@ extern bool FLAGS_check_gpu_leaks;
bool FLAGS_prefer_cubin_to_ptx = true;

namespace stream_executor {
namespace cuda {
namespace gpu {

// Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
// It has been observed that loading both PTX and cubins into the driver library

@ -84,17 +84,16 @@ namespace cuda {
// variable with extern linkage and populate it from another translation unit.
std::function<string(const string &)> g_cubinate;

static CUDAEvent *AsCUDAEvent(Event *event) {
static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<CUDAEvent *>(event->implementation());
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal CUDA
// platform implementation pointer.
static CUDATimer *AsCUDATimer(Timer *timer) {
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<CUDATimer *>(timer->implementation());
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a libcuda device pointer datatype, suitable

@ -112,48 +111,49 @@ static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
  return AsCudaDevicePtr(*gpu_mem);
}

CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec) {
GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
  CHECK(cuda_exec != nullptr);
  return cuda_exec->cuda_context();
  return cuda_exec->gpu_context();
}

CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) {
  return static_cast<CUDAExecutor *>(stream_exec->implementation());
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
  return static_cast<GpuExecutor*>(stream_exec->implementation());
}

CUDAExecutor::~CUDAExecutor() {
  CHECK(kernel_to_gpu_binary_.empty()) << "CUDAExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "CUDAExecutor has loaded modules.";
GpuExecutor::~GpuExecutor() {
  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
  if (context_ != nullptr) {
    CUDADriver::DestroyContext(context_);
    GpuDriver::DestroyContext(context_);
  }
}

port::Status CUDAExecutor::Init(int device_ordinal,
                                DeviceOptions device_options) {
port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = CUDADriver::Init();
  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = CUDADriver::GetDevice(device_ordinal_, &device_);
  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = CUDADriver::CreateContext(device_, device_options, &context_);
  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return CUDADriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
}

bool CUDAExecutor::FindOnDiskForComputeCapability(
bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    string *found_filename) const {
    string* found_filename) const {
  if (cc_major_ == 0 && cc_minor_ == 0) {
    return false;
  }

@ -177,6 +177,13 @@ bool CUDAExecutor::FindOnDiskForComputeCapability(
  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          string* found_filename) const {
  LOG(ERROR)
      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
  return false;
}
// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
@ -211,12 +218,12 @@ static string GetBinaryDir(bool strip_exe) {
  return exe_path;
}

bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];

  if (*module == nullptr) {
    auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
    auto load_status = GpuDriver::LoadCubin(context_, cubin, module);
    if (!load_status.ok()) {
      LOG(ERROR) << "failed to load CUBIN: " << load_status;
      return false;

@ -233,12 +240,12 @@ bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
  return true;
}

bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
bool GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];

  if (*module == nullptr) {
    if (!CUDADriver::LoadPtx(context_, ptx, module)) {
    if (!GpuDriver::LoadPtx(context_, ptx, module)) {
      return false;
    }
    VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "

@ -253,9 +260,14 @@ bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
  return true;
}

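Both loaders share one idea that is easy to lose in the rename: gpu_binary_to_module_ maps a binary image to a (module, refcount) pair, so the same CUBIN or PTX is pushed into the driver only once and unloaded only when its last user goes away. A standalone sketch of that refcounted cache, with toy Load/Unload functions in place of the GpuDriver calls:

    #include <cstdio>
    #include <map>
    #include <utility>

    using Module = int;
    static Module LoadIntoDriver(const char* /*binary*/) { return 42; }
    static void UnloadFromDriver(Module m) { std::printf("unload %d\n", m); }

    // binary image -> (driver module, refcount)
    static std::map<const void*, std::pair<Module, unsigned>> binary_to_module;

    Module LoadModule(const char* binary) {
      auto& entry = binary_to_module[binary];
      if (entry.second == 0) {
        entry.first = LoadIntoDriver(binary);  // first use: really load
      }
      ++entry.second;  // every additional use only bumps the refcount
      return entry.first;
    }

    void UnloadModule(const char* binary) {
      auto it = binary_to_module.find(binary);
      if (it == binary_to_module.end()) return;
      if (--it->second.second == 0) {  // last user: really unload
        UnloadFromDriver(it->second.first);
        binary_to_module.erase(it);
      }
    }

    int main() {
      static const char kCubin[] = "<binary image>";
      LoadModule(kCubin);
      LoadModule(kCubin);    // cache hit: no second driver load
      UnloadModule(kCubin);  // refcount 2 -> 1, module stays loaded
      UnloadModule(kCubin);  // refcount 1 -> 0, module unloaded
    }
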
bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
                             KernelBase *kernel) {
  CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, CUmodule* module) {
  LOG(ERROR) << "Feature not supported on CUDA platform (LoadModuleFromHsaco)";
  return false;
}

bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                            KernelBase* kernel) {
  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
  CUmodule module;
  const string *kernelname;

@ -295,8 +307,8 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
    return false;
  }
  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                     cuda_kernel->cuda_function_ptr())) {
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    cuda_kernel->gpu_function_ptr())) {
    return false;
  }

@ -313,7 +325,7 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
  return true;
}

bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded CUDA module for " << gpu_binary;

@ -324,13 +336,13 @@ bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading CUDA module " << module;
    CUDADriver::UnloadModule(context_, module);
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
  }
  return true;
}

void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();

  mutex_lock lock{in_memory_modules_mu_};

@ -346,9 +358,9 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
  kernel_to_gpu_binary_.erase(gpu_binary_it);
}

bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
                              ModuleHandle *module_handle) {
  // In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                             ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
  // ModuleHandle::id().
  CUmodule cu_module;
  if (spec.has_cuda_cubin_in_memory()) {

@ -382,25 +394,23 @@ bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
  return false;
}

bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
  mutex_lock lock{in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
                                     KernelMetadata *kernel_metadata) {
bool GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
                                    KernelMetadata* kernel_metadata) {
  int value;
  if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
                                    *cuda_kernel->cuda_function_ptr(),
                                    &value)) {
  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
                                   *cuda_kernel->gpu_function_ptr(), &value)) {
    return false;
  }
  kernel_metadata->set_registers_per_thread(value);

  if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
                                    *cuda_kernel->cuda_function_ptr(),
                                    &value)) {
  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
                                   *cuda_kernel->gpu_function_ptr(), &value)) {
    return false;
  }
  kernel_metadata->set_shared_memory_bytes(value);

@ -408,13 +418,13 @@ bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
  return true;
}

bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
                          const BlockDim &block_dims, const KernelBase &kernel,
                          const KernelArgsArrayBase &args) {
bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                         const BlockDim& block_dims, const KernelBase& kernel,
                         const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  CUstream custream = AsCUDAStreamValue(stream);
  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
  CUstream custream = AsGpuStreamValue(stream);
  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once. Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free

@ -431,16 +441,16 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,

  if (cuda_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig());
    GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
  }

  void **kernel_params = const_cast<void **>(args.argument_addresses().data());

  if (!CUDADriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
                                block_dims.z, thread_dims.x, thread_dims.y,
                                thread_dims.z, args.number_of_shared_bytes(),
                                custream, kernel_params,
                                nullptr /* = extra */)) {
  if (!GpuDriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
                               block_dims.z, thread_dims.x, thread_dims.y,
                               thread_dims.z, args.number_of_shared_bytes(),
                               custream, kernel_params,
                               nullptr /* = extra */)) {
    LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
               << args.number_of_arguments()
               << " args; thread dim: " << thread_dims.ToString()
@ -454,9 +464,9 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
                                     const ThreadDim &thread_dims,
                                     const BlockDim &block_dims) {
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  VLOG(2) << "Computing kernel occupancy for kernel "
          << kernel.demangled_name();
  VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y

@ -475,8 +485,8 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
  const DeviceDescription &device_description =
      kernel.parent()->GetDeviceDescription();

  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();

  int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                         smem_per_block, thread_dims, cufunc);

@ -496,10 +506,11 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
// Compute and return maximum blocks per core (occupancy) based on the
// device description, some kernel characteristics and the number of threads per
// block. If unable to compute occupancy, zero is returned.
int CUDAExecutor::CalculateOccupancy(
    const DeviceDescription &device_description, uint64 registers_per_thread,
    uint64 shared_memory_per_block, const ThreadDim &thread_dims,
    CUfunction func) {
int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64 registers_per_thread,
                                    uint64 shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(

@ -511,12 +522,12 @@ int CUDAExecutor::CalculateOccupancy(

// Compute and return the suggested thread count to achieve ideal occupancy.
// If the provided thread dimensions match this number, zero is returned.
int CUDAExecutor::CompareOccupancy(int *initial_blocks,
                                   const DeviceDescription &device_description,
                                   uint64 registers_per_thread,
                                   uint64 shared_memory_per_block,
                                   const ThreadDim &thread_dims,
                                   CUfunction func) {
int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64 registers_per_thread,
                                  uint64 shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  CUfunction func) {
  int suggested_blocks = 0;
  int suggested_threads = 0;
  CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(

@ -531,88 +542,87 @@ int CUDAExecutor::CompareOccupancy(int *initial_blocks,
  }
}

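CompareOccupancy's contract is stated only in its comment: it returns the suggested thread count when the caller's configured thread count differs from the optimum reported by the driver, and zero when the launch already matches. A standalone sketch of that decision logic, with a fake occupancy oracle standing in for cuOccupancyMaxPotentialBlockSize:

    #include <cstdio>

    // Stand-in for the driver's occupancy calculator: pretend the ideal
    // launch for this kernel is 8 blocks of 256 threads.
    static void FakeMaxPotentialBlockSize(int* blocks, int* threads) {
      *blocks = 8;
      *threads = 256;
    }

    // Mirrors the documented behavior: report the suggested thread count only
    // if it differs from what the caller is already using, else return 0.
    static int CompareOccupancy(int* initial_blocks, int current_threads) {
      int suggested_blocks = 0;
      int suggested_threads = 0;
      FakeMaxPotentialBlockSize(&suggested_blocks, &suggested_threads);
      if (suggested_threads != current_threads) {
        *initial_blocks = suggested_blocks;
        return suggested_threads;
      }
      return 0;  // the launch already matches the optimum
    }

    int main() {
      int blocks = 0;
      std::printf("hint for 128 threads: %d\n", CompareOccupancy(&blocks, 128));
      std::printf("hint for 256 threads: %d\n", CompareOccupancy(&blocks, 256));
    }
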
void *CUDAExecutor::Allocate(uint64 size) {
  return CUDADriver::DeviceAllocate(context_, size);
void* GpuExecutor::Allocate(uint64 size) {
  return GpuDriver::DeviceAllocate(context_, size);
}

void *CUDAExecutor::AllocateSubBuffer(DeviceMemoryBase *mem,
                                      uint64 offset_bytes, uint64 size_bytes) {
void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                                     uint64 size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
}

void CUDAExecutor::Deallocate(DeviceMemoryBase *mem) {
void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  // CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary.
  if (!mem->is_sub_buffer()) {
    CUDADriver::DeviceDeallocate(context_, mem->opaque());
    GpuDriver::DeviceDeallocate(context_, mem->opaque());
  }
}

bool CUDAExecutor::HostMemoryRegister(void *location, uint64 size) {
bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return CUDADriver::HostRegister(context_, location, size);
  return GpuDriver::HostRegister(context_, location, size);
}

bool CUDAExecutor::HostMemoryUnregister(void *location) {
bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return CUDADriver::HostUnregister(context_, location);
  return GpuDriver::HostUnregister(context_, location);
}

bool CUDAExecutor::SynchronizeAllActivity() {
  return CUDADriver::SynchronizeContext(context_);
bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

bool CUDAExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) {
bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return CUDADriver::SynchronousMemsetUint32(
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), 0x0, size / 4);
  }
  return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            0x0, size);
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           0x0, size);
}

bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value,
                                     uint64 size) {
bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
                                    uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // cudaMemset reinterprets "value" as a uint8.
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
    return CUDADriver::SynchronousMemsetUint32(
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsCudaDevicePtr(location), pattern, size / 4);
  }
  return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            value, size);
  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                           value, size);
}

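The 32-bit fast path in SynchronousMemSet works because replicating the byte into all four lanes makes a word-wide memset byte-for-byte identical to a byte-wide one whenever the pointer and size are 4-byte aligned. A self-contained check of that pattern construction:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Replicate a byte into all four lanes of a 32-bit word, exactly as the
    // pattern is built above: 0xAB -> 0xABABABAB.
    static uint32_t WidenByte(uint8_t b) {
      return (uint32_t{b} << 24) | (uint32_t{b} << 16) | (uint32_t{b} << 8) | b;
    }

    int main() {
      assert(WidenByte(0xAB) == 0xABABABABu);

      // Filling aligned memory with the widened word matches plain memset.
      uint32_t words[4];
      uint8_t bytes[16];
      for (auto& w : words) w = WidenByte(0x5C);
      std::memset(bytes, 0x5C, sizeof(bytes));
      assert(std::memcmp(words, bytes, sizeof(bytes)) == 0);
    }
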
port::Status CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
                                             const void *host_src,
                                             uint64 size) {
  return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                          host_src, size);
port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status CUDAExecutor::SynchronousMemcpy(void *host_dst,
                                             const DeviceMemoryBase &gpu_src,
                                             uint64 size) {
  return CUDADriver::SynchronousMemcpyD2H(context_, host_dst,
                                          AsCudaDevicePtr(gpu_src), size);
port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64 size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsCudaDevicePtr(gpu_src), size);
}

port::Status CUDAExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) {
  return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                          AsCudaDevicePtr(gpu_src), size);
port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                         AsCudaDevicePtr(gpu_src), size);
}

bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
                           uint64 size) {
bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                          uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);

@ -621,88 +631,87 @@ bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
  }
}

bool CUDAExecutor::Memset(Stream *stream, DeviceMemoryBase *location,
                          uint8 pattern, uint64 size) {
bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                         uint8 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  return CUDADriver::AsynchronousMemsetUint8(
      context_, AsCudaDevicePtr(location), pattern, size,
      AsCUDAStreamValue(stream));
  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

bool CUDAExecutor::Memset32(Stream *stream, DeviceMemoryBase *location,
                            uint32 pattern, uint64 size) {
bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                           uint32 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return CUDADriver::AsynchronousMemsetUint32(
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsCudaDevicePtr(location), pattern, size / 4,
      AsCUDAStreamValue(stream));
      AsGpuStreamValue(stream));
}

bool CUDAExecutor::Memcpy(Stream *stream, void *host_dst,
                          const DeviceMemoryBase &gpu_src, uint64 size) {
  return CUDADriver::AsynchronousMemcpyD2H(context_, host_dst,
                                           AsCudaDevicePtr(gpu_src), size,
                                           AsCUDAStreamValue(stream));
bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool CUDAExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst,
                          const void *host_src, uint64 size) {
  return CUDADriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                           host_src, size,
                                           AsCUDAStreamValue(stream));
bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream,
                                        DeviceMemoryBase *gpu_dst,
                                        const DeviceMemoryBase &gpu_src,
                                        uint64 size) {
  return CUDADriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                           AsCudaDevicePtr(gpu_src), size,
                                           AsCUDAStreamValue(stream));
bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
                                          AsCudaDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool CUDAExecutor::HostCallback(Stream *stream,
                                std::function<port::Status()> callback) {
bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream),
                                       InternalHostCallback, callback_ptr);
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

/* static */ void CUDAExecutor::InternalHostCallback(CUstream stream,
                                                     CUresult status,
                                                     void *data) {
/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
                                                    CUresult status,
                                                    void* data) {
  std::function<void()> *callback =
      reinterpret_cast<std::function<void()> *>(data);
  (*callback)();
  delete callback;
}

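HostCallback and InternalHostCallback together form the classic C-callback trampoline: the std::function is copied onto the heap, smuggled through the driver's void* user-data slot, then invoked and deleted exactly once when the stream reaches the callback. A standalone sketch of the same pattern (FakeAddStreamCallback is a synchronous stand-in for GpuDriver::AddStreamCallback):

    #include <cstdio>
    #include <functional>
    #include <utility>

    // A C-style callback slot, shaped like the one the CUDA driver exposes.
    using CCallback = void (*)(void* data);
    static void FakeAddStreamCallback(CCallback fn, void* data) { fn(data); }

    // Trampoline matching the C signature: recover the std::function, run
    // it, and delete it so each enqueued callback fires exactly once.
    static void Trampoline(void* data) {
      auto* callback = static_cast<std::function<void()>*>(data);
      (*callback)();
      delete callback;
    }

    static void HostCallback(std::function<void()> callback) {
      auto* heap_copy = new std::function<void()>(std::move(callback));
      FakeAddStreamCallback(Trampoline, heap_copy);
    }

    int main() {
      HostCallback([] { std::printf("ran at 'stream completion'\n"); });
    }
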
port::Status CUDAExecutor::AllocateEvent(Event *event) {
  return AsCUDAEvent(event)->Init();
port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status CUDAExecutor::DeallocateEvent(Event *event) {
  return AsCUDAEvent(event)->Destroy();
port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status CUDAExecutor::RecordEvent(Stream *stream, Event *event) {
  return AsCUDAEvent(event)->Record(AsCUDAStream(stream));
port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
  if (CUDADriver::WaitStreamOnEvent(context_,
                                    AsCUDAStream(stream)->cuda_stream(),
                                    AsCUDAEvent(event)->cuda_event())) {
port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return port::Status::OK();
  } else {
    return port::Status(

@ -712,61 +721,61 @@ port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
  }
}

Event::Status CUDAExecutor::PollForEventStatus(Event *event) {
  return AsCUDAEvent(event)->PollForStatus();
Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool CUDAExecutor::AllocateStream(Stream *stream) {
  return AsCUDAStream(stream)->Init();
bool GpuExecutor::AllocateStream(Stream* stream) {
  return AsGpuStream(stream)->Init();
}

void CUDAExecutor::DeallocateStream(Stream *stream) {
  CUDAStream *cuda_stream = AsCUDAStream(stream);
void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* cuda_stream = AsGpuStream(stream);
  if (!cuda_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  cuda_stream->Destroy();
}

bool CUDAExecutor::AllocateTimer(Timer *timer) {
  return AsCUDATimer(timer)->Init();
bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void CUDAExecutor::DeallocateTimer(Timer *timer) {
  AsCUDATimer(timer)->Destroy();
void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

bool CUDAExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
  CUevent other_completed_event = *AsCUDAStream(other)->completed_event();
  bool ok = CUDADriver::RecordEvent(context_, other_completed_event,
                                    AsCUDAStreamValue(other))
                .ok();
bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return CUDADriver::WaitStreamOnEvent(context_, AsCUDAStreamValue(dependent),
                                       other_completed_event);
  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool CUDAExecutor::StartTimer(Stream *stream, Timer *timer) {
  return AsCUDATimer(timer)->Start(AsCUDAStream(stream));
bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
  return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status CUDAExecutor::BlockHostUntilDone(Stream *stream) {
  return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport *CUDAExecutor::CreateBlas() {
blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry *registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(kCudaPlatformId,
      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "

@ -777,10 +786,10 @@ blas::BlasSupport *CUDAExecutor::CreateBlas() {
  return status.ValueOrDie()(this);
}

dnn::DnnSupport *CUDAExecutor::CreateDnn() {
dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry *registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(kCudaPlatformId,
      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "

@ -791,10 +800,10 @@ dnn::DnnSupport *CUDAExecutor::CreateDnn() {
  return status.ValueOrDie()(this);
}

fft::FftSupport *CUDAExecutor::CreateFft() {
fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry *registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(kCudaPlatformId,
      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "

@ -805,10 +814,10 @@ fft::FftSupport *CUDAExecutor::CreateFft() {
  return status.ValueOrDie()(this);
}

rng::RngSupport *CUDAExecutor::CreateRng() {
rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry *registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(kCudaPlatformId,
      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "

@ -820,23 +829,21 @@ rng::RngSupport *CUDAExecutor::CreateRng() {
}

// TODO(rspringer): Remove in b/18544742.
bool CUDAExecutor::SupportsDnn() const {
  return true;
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
}

bool CUDAExecutor::CanEnablePeerAccessTo(StreamExecutorInterface *other) {
  CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
  return CUDADriver::CanEnablePeerAccess(context_, cuda_other->context_);
port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
}

port::Status CUDAExecutor::EnablePeerAccessTo(StreamExecutorInterface *other) {
  CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
  return CUDADriver::EnablePeerAccess(context_, cuda_other->context_);
}

SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
  port::StatusOr<CUsharedconfig> cuda_config =
      CUDADriver::ContextGetSharedMemConfig(context_);
      GpuDriver::ContextGetSharedMemConfig(context_);
  if (!cuda_config.ok()) {
    // Don't log; the failed call will log necessary output.
    return SharedMemoryConfig::kDefault;

@ -855,7 +862,7 @@ SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
  }
}

port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
    SharedMemoryConfig config) {
  CUsharedconfig cuda_config;
  switch (config) {

@ -872,21 +879,21 @@ port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
      LOG(FATAL) << "Invalid shared memory configuration specified: "
                 << static_cast<int>(config);
  }
  return CUDADriver::ContextSetSharedMemConfig(context_, cuda_config);
  return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
}

bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
  return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

bool CUDAExecutor::GetSymbol(const string &symbol_name,
                             ModuleHandle module_handle, void **mem,
                             size_t *bytes) {
bool GpuExecutor::GetSymbol(const string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  auto lookup_in_module = [&](CUmodule module) {
    CHECK(module != nullptr);
    return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                       reinterpret_cast<CUdeviceptr *>(mem),
                                       bytes);
    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                      reinterpret_cast<CUdeviceptr*>(mem),
                                      bytes);
  };

  {  // give limited scope to mutex_lock

@ -908,13 +915,13 @@ bool CUDAExecutor::GetSymbol(const string &symbol_name,
  return false;
}

bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!CUDADriver::GetGridLimits(&x, &y, &z, device_)) {
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
    return false;
  }

@ -924,35 +931,35 @@ bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
  return true;
}

bool CUDAExecutor::SupportsBlas() const { return true; }
bool GpuExecutor::SupportsBlas() const { return true; }

bool CUDAExecutor::SupportsFft() const { return true; }
bool GpuExecutor::SupportsFft() const { return true; }

bool CUDAExecutor::SupportsRng() const { return true; }
bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
CUDAExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new CUDAEvent(this));
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
CUDAExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new CUDAKernel());
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
CUDAExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new CUDAStream(this));
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
CUDAExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new CUDATimer(this));
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void *CUDAExecutor::GpuContextHack() { return context_; }
void* GpuExecutor::GpuContextHack() { return context_; }

CudaContext* CUDAExecutor::cuda_context() { return context_; }
GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.

@ -1019,21 +1026,21 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
#endif
}


DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)CUDADriver::GetDriverVersion(&driver_version);
    (void)GpuDriver::GetDriverVersion(&driver_version);
    string augmented_driver_version = port::Printf(
        "%d (%s)", driver_version,
        DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
            .c_str());
    builder.set_driver_version(augmented_driver_version);
  }

  {
    string pci_bus_id = CUDADriver::GetPCIBusID(device_);
    string pci_bus_id = GpuDriver::GetPCIBusID(device_);

    // Lower the hex characters to match sysfs.
    pci_bus_id = port::Lowercase(pci_bus_id);

@ -1046,43 +1053,43 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {

  {
    builder.set_threads_per_block_limit(
        CUDADriver::GetDeviceAttribute(
            CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device_)
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                      device_)
            .ValueOrDie());

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = CUDADriver::GetDeviceAttribute(
    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device_)
                             .ValueOrDie();
    thread_dim_limit.y = CUDADriver::GetDeviceAttribute(
    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device_)
                             .ValueOrDie();
    thread_dim_limit.z = CUDADriver::GetDeviceAttribute(
    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device_)
                             .ValueOrDie();
    builder.set_thread_dim_limit(thread_dim_limit);

    int clock_rate =
        CUDADriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
            .ValueOrDie();
    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
  }

  {
    bool ecc_enabled = false;
    (void)CUDADriver::IsEccEnabled(device_, &ecc_enabled);
    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64 device_memory_size = -1;
    (void)CUDADriver::GetDeviceTotalMemory(device_, &device_memory_size);
    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
  port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
      CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
  if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
    // Times 2 because HBM is DDR memory; it gets two data bits per each data
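The comment is cut off at the hunk boundary, but the computation it introduces is the usual DDR formula: bytes per second = 2 x memory clock (Hz) x bus width (bits) / 8, the factor of two coming from transfers on both clock edges. A quick standalone arithmetic check with plausible, purely illustrative device values:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Hypothetical attributes, in the units the CUDA driver reports them.
      int64_t mem_clock_khz = 877000;  // ~877 MHz HBM2 memory clock
      int64_t bus_width_bits = 4096;   // a 4096-bit HBM2 bus

      // Times 2 because DDR memory transfers on both clock edges.
      int64_t bytes_per_sec = 2 * (mem_clock_khz * 1000) * (bus_width_bits / 8);
      std::printf("%.1f GB/s\n", bytes_per_sec / 1e9);  // prints ~898.0 GB/s
    }
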
@ -1100,7 +1107,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {

  {
    string device_name;
    (void)CUDADriver::GetDeviceName(device_, &device_name);
    (void)GpuDriver::GetDeviceName(device_, &device_name);
    builder.set_name(device_name);
  }

@ -1114,19 +1121,19 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
  builder.set_device_vendor("NVIDIA Corporation");
  builder.set_cuda_compute_capability(cc_major_, cc_minor_);
  builder.set_shared_memory_per_core(
      CUDADriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
  builder.set_shared_memory_per_block(
      CUDADriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
  builder.set_core_count(
      CUDADriver::GetMultiprocessorCount(device_).ValueOrDie());
      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
  builder.set_threads_per_core_limit(
      CUDADriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
  builder.set_registers_per_block_limit(
      CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
  builder.set_threads_per_warp(
      CUDADriver::GetThreadsPerWarp(device_).ValueOrDie());
      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
  builder.set_registers_per_core_limit(
      CUDADriver::GetDeviceAttribute(
      GpuDriver::GetDeviceAttribute(
          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
          .ValueOrDie());

@ -1134,11 +1141,11 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
  return built.release();
}

}  // namespace cuda
}  // namespace gpu

void initialize_cuda_gpu_executor() {
  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig &config) {
    return new cuda::CUDAExecutor{config};
  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig& config) {
    return new gpu::GpuExecutor{config};
  };
}

@ -22,289 +22,12 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_

#include <set>
#include <unordered_map>

#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"

namespace stream_executor {
namespace cuda {

// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class CUDAExecutor : public internal::StreamExecutorInterface {
 public:
  // sub_platform indicates the subplatform used in this executor; it must
  // be a CUDA type.
  explicit CUDAExecutor(const PluginConfig &plugin_config)
      : device_(0),
        context_(nullptr),
        device_ordinal_(0),
        cc_major_(0),
        cc_minor_(0),
        plugin_config_(plugin_config) {}

  // See the corresponding StreamExecutor methods for method comments on the
  // following overrides.

  ~CUDAExecutor() override;

  port::Status Init(int device_ordinal, DeviceOptions device_options) override;

  bool GetKernel(const MultiKernelLoaderSpec &spec,
                 KernelBase *kernel) override;
  void UnloadKernel(const KernelBase *kernel) override;
  bool LoadModule(const MultiModuleLoaderSpec &spec,
                  ModuleHandle *module_handle) override;
  bool UnloadModule(ModuleHandle module_handle) override;

  bool Launch(Stream *stream, const ThreadDim &thread_dims,
              const BlockDim &block_dims, const KernelBase &k,
              const KernelArgsArrayBase &args) override;

  int CalculateOccupancy(const DeviceDescription &device_description,
                         uint64 registers_per_thread,
                         uint64 shared_memory_per_block,
                         const ThreadDim &thread_dims, CUfunction func);

  int CompareOccupancy(int *initial_blocks,
                       const DeviceDescription &device_description,
                       uint64 registers_per_thread,
                       uint64 shared_memory_per_block,
                       const ThreadDim &thread_dims, CUfunction func);

  void *Allocate(uint64 size) override;

  void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
                          uint64 size_bytes) override;

  void Deallocate(DeviceMemoryBase *mem) override;

  void *UnifiedMemoryAllocate(uint64 size) override {
    return CUDADriver::UnifiedMemoryAllocate(context_, size);
  }

  void UnifiedMemoryDeallocate(void *location) override {
    return CUDADriver::UnifiedMemoryDeallocate(context_, location);
  }

  // CUDA allocation/registration functions are necessary because the driver
  // internally sets up buffers for DMA operations (and page locks them).
  // There's no external interface for us to otherwise control these DMA
  // settings.
  void *HostMemoryAllocate(uint64 size) override {
    return CUDADriver::HostAllocate(context_, size);
  }

  void HostMemoryDeallocate(void *location) override {
    return CUDADriver::HostDeallocate(context_, location);
  }

  bool HostMemoryRegister(void *location, uint64 size) override;

  bool HostMemoryUnregister(void *location) override;

  bool SynchronizeAllActivity() override;

  bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;

  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
                         uint64 size) override;

  port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
                                 const void *host_src, uint64 size) override;

  port::Status SynchronousMemcpy(void *host_dst,
                                 const DeviceMemoryBase &gpu_src,
                                 uint64 size) override;

  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
                                               const DeviceMemoryBase &gpu_src,
                                               uint64 size) override;

  bool MemZero(Stream *stream, DeviceMemoryBase *location,
               uint64 size) override;
  bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
              uint64 size) override;
  bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
                uint64 size) override;

  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src,
              uint64 size) override;

  bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src,
              uint64 size) override;

  bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
                            const DeviceMemoryBase &gpu_src,
                            uint64 size) override;

  bool HostCallback(Stream *stream,
                    std::function<port::Status()> callback) override;

  bool AllocateStream(Stream *stream) override;

  void DeallocateStream(Stream *stream) override;

  bool CreateStreamDependency(Stream *dependent, Stream *other) override;

  bool AllocateTimer(Timer *timer) override;

  void DeallocateTimer(Timer *timer) override;

  bool StartTimer(Stream *stream, Timer *timer) override;

  bool StopTimer(Stream *stream, Timer *timer) override;

  port::Status AllocateEvent(Event *event) override;

  port::Status DeallocateEvent(Event *event) override;

  port::Status RecordEvent(Stream *stream, Event *event) override;

  port::Status WaitForEvent(Stream *stream, Event *event) override;

  Event::Status PollForEventStatus(Event *event) override;

  port::Status BlockHostUntilDone(Stream *stream) override;

  int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }

  port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override;

  bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override;

  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;

  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;

  bool DeviceMemoryUsage(int64 *free, int64 *total) const override;

  // Search for the symbol and returns a device pointer and size.
  // Returns false if symbol does not exist.
  bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
                 void **mem, size_t *bytes) override;

  DeviceDescription *PopulateDeviceDescription() const override;

  // Populates the block_dim_limit by querying the device driver API. If an
  // error occurs at any point while asking the driver for block dim limits, it
  // will be only partially populated as a result, and an error will be logged.
  bool FillBlockDimLimit(BlockDim *block_dim_limit) const;

  bool SupportsBlas() const override;

  blas::BlasSupport *CreateBlas() override;

  bool SupportsFft() const override;

  fft::FftSupport *CreateFft() override;

  bool SupportsRng() const override;

  rng::RngSupport *CreateRng() override;

  bool SupportsDnn() const override;

  dnn::DnnSupport *CreateDnn() override;

  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override;

  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override;

  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;

  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;

  void *GpuContextHack() override;

  CudaContext* cuda_context();

 private:
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for compute-capability-specific suffixed versions; i.e.
  // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
  // we're on a compute capability 3.0 machine.
  bool FindOnDiskForComputeCapability(absl::string_view filename,
                                      absl::string_view canonical_suffix,
                                      string *found_filename) const;

  // Host callback landing routine invoked by CUDA.
  // data: User-provided callback provided to HostCallback() above, captured
  //     as a std::function<void()>. Allocated/initialized inside
  //     HostCallback() and owned and deleted by this call.
  static void InternalHostCallback(CUstream stream, CUresult status,
                                   void *data);

  // Collects metadata for the specified kernel.
  bool GetKernelMetadata(CUDAKernel *cuda_kernel,
                         KernelMetadata *kernel_metadata);

  // Prints to VLOG(2) information about the kernel's occupancy and how it might
  // be improved.
  void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
                         const BlockDim &block_dims);

  bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
  bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  bool UnloadGpuBinary(const void *gpu_binary)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Guards the in-memory-module mapping.
  mutex in_memory_modules_mu_;

  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
  std::unordered_map<const KernelBase *, const void *> kernel_to_gpu_binary_
      GUARDED_BY(in_memory_modules_mu_);
  // GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
  std::unordered_map<const void *, std::pair<CUmodule, uint64>>
      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);

  // Guards the launched kernel set.
  mutex launched_kernels_mu_;

  // Keeps track of the set of launched kernels. Currently used to suppress the
  // occupancy check on subsequent launches.
  std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_);

  // Handle for the CUDA device being operated on. Immutable
  // post-initialization.
  CUdevice device_;

  // Handle for session with the library/driver. Immutable post-initialization.
  CudaContext* context_;

  // The device ordinal value that this executor was initialized with; recorded
  // for use in getting device metadata. Immutable post-initialization.
  int device_ordinal_;

  // The major version of the compute capability for device_.
  int cc_major_;

  // The minor version of the compute capability for device_.
  int cc_minor_;

  // The plugin configuration associated with this instance.
  PluginConfig plugin_config_;

  SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
};
using CUDAExecutor = gpu::GpuExecutor;

}  // namespace cuda
}  // namespace stream_executor
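The replacement above is the heart of the compatibility strategy: the class body moves to gpu::GpuExecutor and the old name survives as an alias. A minimal sketch (hypothetical translation unit, assuming only that the alias header is included) of why code outside stream_executor/ keeps compiling unchanged:

#include <type_traits>

#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"

namespace stream_executor {

// The old cuda:: spelling and the new gpu:: spelling denote one type, so
// pre-existing call sites need no source changes.
static_assert(std::is_same<cuda::CUDAExecutor, gpu::GpuExecutor>::value,
              "alias must preserve type identity");

}  // namespace stream_executor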
@ -17,88 +17,9 @@ limitations under the License.
//
// These are typically placed here for use by multiple source components (for
// example, BLAS and executor components).

#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_

#include <stddef.h>
#include <complex>

#include "cuda/include/cuComplex.h"

namespace stream_executor {

template <typename ElemT>
class DeviceMemory;

namespace cuda {

// Converts a const DeviceMemory reference to its underlying typed pointer in
// CUDA
// device memory.
template <typename T>
const T *CUDAMemory(const DeviceMemory<T> &mem) {
  return static_cast<const T *>(mem.opaque());
}

// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
// pointer in CUDA device memory.
template <typename T>
T *CUDAMemoryMutable(DeviceMemory<T> *mem) {
  return static_cast<T *>(mem->opaque());
}

static_assert(sizeof(std::complex<float>) == sizeof(cuComplex),
              "std::complex<float> and cuComplex should have the same size");
static_assert(offsetof(cuComplex, x) == 0,
              "The real part of cuComplex should appear first.");
static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
              "std::complex<double> and cuDoubleComplex should have the same "
              "size");
static_assert(offsetof(cuDoubleComplex, x) == 0,
              "The real part of cuDoubleComplex should appear first.");

// Type traits to get CUDA complex types from std::complex<>.

template <typename T>
struct CUDAComplexT {
  typedef T type;
};

template <>
struct CUDAComplexT<std::complex<float>> {
  typedef cuComplex type;
};

template <>
struct CUDAComplexT<std::complex<double>> {
  typedef cuDoubleComplex type;
};

// Converts pointers of std::complex<> to pointers of
// cuComplex/cuDoubleComplex. No type conversion for non-complex types.

template <typename T>
inline const typename CUDAComplexT<T>::type *CUDAComplex(const T *p) {
  return reinterpret_cast<const typename CUDAComplexT<T>::type *>(p);
}

template <typename T>
inline typename CUDAComplexT<T>::type *CUDAComplex(T *p) {
  return reinterpret_cast<typename CUDAComplexT<T>::type *>(p);
}

// Converts values of std::complex<float/double> to values of
// cuComplex/cuDoubleComplex.
inline cuComplex CUDAComplexValue(std::complex<float> val) {
  return {val.real(), val.imag()};
}

inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
  return {val.real(), val.imag()};
}

}  // namespace cuda
}  // namespace stream_executor
#include "tensorflow/stream_executor/gpu/gpu_helpers.h"

#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
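As a reminder of what these (now relocated) helpers buy: the static_asserts make the pointer reinterpretation between std::complex and the driver's complex types safe in practice. A hedged usage sketch, with PassToCuLibrary standing in for any cu* library call:

#include <complex>
#include <vector>

#include "cuda/include/cuComplex.h"

void PassToCuLibrary(const cuComplex* data, size_t n);  // hypothetical

void Bridge(const std::vector<std::complex<float>>& host_values) {
  // Same size and layout (real part first), so no element-wise copy is
  // needed before handing the buffer to a CUDA library.
  PassToCuLibrary(reinterpret_cast<const cuComplex*>(host_values.data()),
                  host_values.size());
}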
38 tensorflow/stream_executor/cuda/cuda_kernel.cc Normal file
@ -0,0 +1,38 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_kernel.h"

namespace stream_executor {
namespace gpu {

CUfunc_cache GpuKernel::GetGpuCacheConfig() const {
  switch (preferred_cache_config_) {
    case KernelCacheConfig::kNoPreference:
      return CU_FUNC_CACHE_PREFER_NONE;
    case KernelCacheConfig::kPreferShared:
      return CU_FUNC_CACHE_PREFER_SHARED;
    case KernelCacheConfig::kPreferL1:
      return CU_FUNC_CACHE_PREFER_L1;
    case KernelCacheConfig::kPreferEqual:
      return CU_FUNC_CACHE_PREFER_EQUAL;
    default:
      LOG(FATAL) << "Unknown KernelCacheConfig"
                 << static_cast<int32>(preferred_cache_config_);
  }
}

}  // namespace gpu
}  // namespace stream_executor
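A hedged sketch of how the mapping above is typically consumed: the executor reads the platform-neutral preference and forwards the CUfunc_cache value to the driver. cuFuncSetCacheConfig is the real CUDA driver entry point; the helper wrapping it is hypothetical:

#include "cuda/include/cuda.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"

namespace stream_executor {
namespace gpu {

// Applies the preferred cache configuration to an already-loaded function.
void ApplyCachePreference(const GpuKernel& kernel, CUfunction function) {
  if (kernel.GetPreferredCacheConfig() != KernelCacheConfig::kNoPreference) {
    cuFuncSetCacheConfig(function, kernel.GetGpuCacheConfig());
  }
}

}  // namespace gpu
}  // namespace stream_executor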
@ -22,104 +22,12 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_

#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "cuda/include/cuda.h"

#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif

#ifdef __CUDA_RUNTIME_H__
#error \
    "CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"

namespace stream_executor {
namespace cuda {

// Wraps a CUfunction to implement the platform-independent KernelInterface.
class CUDAKernel : public internal::KernelInterface {
 public:
  CUDAKernel() : cuda_function_(nullptr), arity_(0),
                 preferred_cache_config_(KernelCacheConfig::kNoPreference) {}

  // Note that the function is unloaded when the module is unloaded, and the
  // module that the function is contained in is owned by the CUDAExecutor.
  ~CUDAKernel() override {}

  // As arity cannot be reflected upon using the CUDA API, the arity is
  // explicitly set during the CUDAExecutor::GetKernel initialization process.
  void set_arity(unsigned arity) { arity_ = arity; }
  unsigned Arity() const override { return arity_; }

  // Returns the CUfunction value for passing to the CUDA API.
  CUfunction AsCUDAFunctionValue() const {
    DCHECK(cuda_function_ != nullptr);
    return const_cast<CUfunction>(cuda_function_);
  }

  // Returns the slot that the CUfunction is stored within for this object,
  // for the CUDA API which wants to load into a CUfunction*.
  CUfunction *cuda_function_ptr() { return &cuda_function_; }

  // CUDA supports setting the preferred cache configuration of a CUfunction
  // (more-or-less equivalent to a CUDAKernel). We support this via the below
  // functions; users can set a preference, and that is applied when the kernel
  // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to
  // load the kernel & set the preference when the user calls the setter below;
  // either approach is valid.
  // Sets the current kernel cache configuration preference.
  void SetPreferredCacheConfig(KernelCacheConfig config) override {
    preferred_cache_config_ = config;
  }

  // Returns the current kernel cache configuration preference.
  KernelCacheConfig GetPreferredCacheConfig() const override {
    return preferred_cache_config_;
  }

  // Returns the current kernel cache configuration preference as a
  // CUfunc_cache.
  CUfunc_cache GetCUDACacheConfig() const {
    switch (preferred_cache_config_) {
      case KernelCacheConfig::kNoPreference:
        return CU_FUNC_CACHE_PREFER_NONE;
      case KernelCacheConfig::kPreferShared:
        return CU_FUNC_CACHE_PREFER_SHARED;
      case KernelCacheConfig::kPreferL1:
        return CU_FUNC_CACHE_PREFER_L1;
      case KernelCacheConfig::kPreferEqual:
        return CU_FUNC_CACHE_PREFER_EQUAL;
      default:
        LOG(FATAL) << "Unknown KernelCacheConfig"
                   << static_cast<int32>(preferred_cache_config_);
    }
  }

 private:
  CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
  unsigned arity_;  // Number of formal parameters the kernel takes.

  // Preferred (but not required) cache configuration for this kernel.
  KernelCacheConfig preferred_cache_config_;
};

// Given a platform-independent kernel datatype, returns the (const) internal
// CUDA platform implementation pointer.
inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
  return static_cast<const CUDAKernel *>(kernel->implementation());
}

// Given a platform-independent kernel datatype, returns the (non-const)
// internal CUDA platform implementation pointer.
inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
  return static_cast<CUDAKernel *>(kernel->implementation());
}
using CUDAKernel = gpu::GpuKernel;

}  // namespace cuda
}  // namespace stream_executor
@ -25,7 +25,7 @@ limitations under the License.
#include "tensorflow/stream_executor/lib/stringprintf.h"

namespace stream_executor {
namespace cuda {
namespace gpu {
namespace {

// Synchronize with spinlocks.
@ -129,16 +129,16 @@ port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus(
      port::Printf("Executor for bus %d not found.", bus_ordinal));
}

Platform::Id CudaPlatform::id() const { return kCudaPlatformId; }
Platform::Id CudaPlatform::id() const { return cuda::kCudaPlatformId; }

int CudaPlatform::VisibleDeviceCount() const {
  // Throw away the result - it logs internally, and this [containing] function
  // isn't in the path of user control. It's safe to call this > 1x.
  if (!cuda::CUDADriver::Init().ok()) {
  if (!gpu::GpuDriver::Init().ok()) {
    return -1;
  }

  return CUDADriver::GetDeviceCount();
  return GpuDriver::GetDeviceCount();
}

const string& CudaPlatform::Name() const { return name_; }
@ -169,7 +169,7 @@ port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor(
port::StatusOr<std::unique_ptr<StreamExecutor>>
CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
  auto executor = MakeUnique<StreamExecutor>(
      this, MakeUnique<CUDAExecutor>(config.plugin_config));
      this, MakeUnique<GpuExecutor>(config.plugin_config));
  auto init_status = executor->Init(config.ordinal, config.device_options);
  if (!init_status.ok()) {
    return port::Status(
@ -191,13 +191,13 @@ void CudaPlatform::UnregisterTraceListener(TraceListener* listener) {
  LOG(FATAL) << "not yet implemented: unregister CUDA trace listener";
}

}  // namespace cuda
}  // namespace gpu

static void InitializeCudaPlatform() {
  // Disabling leak checking, MultiPlatformManager does not destroy its
  // registered platforms.

  std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform);
  std::unique_ptr<gpu::CudaPlatform> platform(new gpu::CudaPlatform);
  SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
}
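For orientation, a sketch of the usual path by which user code reaches the platform registered above, assuming the CUDA plugin is linked in (the helper name is hypothetical):

#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/stream_executor.h"

namespace se = stream_executor;

se::StreamExecutor* GetFirstCudaExecutor() {
  // Looks up the platform that InitializeCudaPlatform() registered, then
  // asks it for (and caches) an executor for device ordinal 0.
  se::Platform* platform =
      se::MultiPlatformManager::PlatformWithName("CUDA").ValueOrDie();
  return platform->ExecutorForDevice(0).ValueOrDie();
}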
@ -32,7 +32,7 @@ limitations under the License.
#include "tensorflow/stream_executor/trace_listener.h"

namespace stream_executor {
namespace cuda {
namespace gpu {

// Opaque and unique identifier for the CUDA platform plugin.
// This is needed so that plugins can refer to/identify this platform without
@ -102,6 +102,12 @@ class CudaPlatform : public Platform {
  SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform);
};

}  // namespace gpu

namespace cuda {

using CudaPlatform = gpu::CudaPlatform;

}  // namespace cuda
}  // namespace stream_executor
@ -58,33 +58,33 @@ std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
}

namespace stream_executor {
namespace cuda {
namespace gpu {

PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);

namespace wrap {

#ifdef PLATFORM_GOOGLE
#define STREAM_EXECUTOR_CURAND_WRAP(__name)                          \
  struct WrapperShim__##__name {                                     \
    template <typename... Args>                                      \
    curandStatus_t operator()(CUDAExecutor *parent, Args... args) {  \
      cuda::ScopedActivateExecutorContext sac{parent};               \
      return ::__name(args...);                                      \
    }                                                                \
#define STREAM_EXECUTOR_CURAND_WRAP(__name)                          \
  struct WrapperShim__##__name {                                     \
    template <typename... Args>                                      \
    curandStatus_t operator()(GpuExecutor* parent, Args... args) {   \
      gpu::ScopedActivateExecutorContext sac{parent};                \
      return ::__name(args...);                                      \
    }                                                                \
  } __name;

#else
#define STREAM_EXECUTOR_CURAND_WRAP(__name)                              \
  struct DynLoadShim__##__name {                                         \
    static const char *kName;                                            \
    static const char* kName;                                            \
    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;         \
    static void *GetDsoHandle() {                                        \
    static void* GetDsoHandle() {                                        \
      auto s = internal::CachedDsoLoader::GetCurandDsoHandle();          \
      return s.ValueOrDie();                                             \
    }                                                                    \
    static FuncPtrT LoadOrDie() {                                        \
      void *f;                                                           \
      void* f;                                                           \
      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
                                                          kName, &f);    \
      CHECK(s.ok()) << "could not find " << kName                        \
@ -96,12 +96,12 @@ namespace wrap {
      return f;                                                          \
    }                                                                    \
    template <typename... Args>                                          \
    curandStatus_t operator()(CUDAExecutor *parent, Args... args) {      \
      cuda::ScopedActivateExecutorContext sac{parent};                   \
    curandStatus_t operator()(GpuExecutor* parent, Args... args) {       \
      gpu::ScopedActivateExecutorContext sac{parent};                    \
      return DynLoad()(args...);                                         \
    }                                                                    \
  } __name;                                                              \
  const char *DynLoadShim__##__name::kName = #__name;
  const char* DynLoadShim__##__name::kName = #__name;
#endif

STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
@ -116,38 +116,15 @@ STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormalDouble);

}  // namespace wrap

template <typename T>
string TypeString();
GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}

template <>
string TypeString<float>() {
  return "float";
}

template <>
string TypeString<double>() {
  return "double";
}

template <>
string TypeString<std::complex<float>>() {
  return "std::complex<float>";
}

template <>
string TypeString<std::complex<double>>() {
  return "std::complex<double>";
}

CUDARng::CUDARng(CUDAExecutor *parent) : parent_(parent), rng_(nullptr) {}

CUDARng::~CUDARng() {
GpuRng::~GpuRng() {
  if (rng_ != nullptr) {
    wrap::curandDestroyGenerator(parent_, rng_);
  }
}

bool CUDARng::Init() {
bool GpuRng::Init() {
  mutex_lock lock(mu_);
  CHECK(rng_ == nullptr);

@ -162,9 +139,9 @@ bool CUDARng::Init() {
  return true;
}

bool CUDARng::SetStream(Stream *stream) {
bool GpuRng::SetStream(Stream* stream) {
  curandStatus_t ret =
      wrap::curandSetStream(parent_, rng_, AsCUDAStreamValue(stream));
      wrap::curandSetStream(parent_, rng_, AsGpuStreamValue(stream));
  if (ret != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to set stream for random generation: " << ret;
    return false;
@ -182,8 +159,7 @@ constexpr bool ComplexIsConsecutiveFloats() {
}

template <typename T>
bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
                                            DeviceMemory<T> *v) {
bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
  mutex_lock lock(mu_);
  static_assert(ComplexIsConsecutiveFloats(),
                "std::complex values are not stored as consecutive values");
@ -203,11 +179,11 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
  if (std::is_same<T, float>::value ||
      std::is_same<T, std::complex<float>>::value) {
    ret = wrap::curandGenerateUniform(
        parent_, rng_, reinterpret_cast<float *>(CUDAMemoryMutable(v)),
        parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
        element_count);
  } else {
    ret = wrap::curandGenerateUniformDouble(
        parent_, rng_, reinterpret_cast<double *>(CUDAMemoryMutable(v)),
        parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
        element_count);
  }
  if (ret != CURAND_STATUS_SUCCESS) {
@ -220,29 +196,29 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
  return true;
}

bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
  return DoPopulateRandUniformInternal(stream, v);
}

bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
  return DoPopulateRandUniformInternal(stream, v);
}

bool CUDARng::DoPopulateRandUniform(Stream *stream,
                                    DeviceMemory<std::complex<float>> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream,
                                   DeviceMemory<std::complex<float>>* v) {
  return DoPopulateRandUniformInternal(stream, v);
}

bool CUDARng::DoPopulateRandUniform(Stream *stream,
                                    DeviceMemory<std::complex<double>> *v) {
bool GpuRng::DoPopulateRandUniform(Stream* stream,
                                   DeviceMemory<std::complex<double>>* v) {
  return DoPopulateRandUniformInternal(stream, v);
}

template <typename ElemT, typename FuncT>
bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
                                             ElemT stddev,
                                             DeviceMemory<ElemT> *v,
                                             FuncT func) {
bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
                                            ElemT stddev,
                                            DeviceMemory<ElemT>* v,
                                            FuncT func) {
  mutex_lock lock(mu_);

  if (!SetStream(stream)) {
@ -251,7 +227,7 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,

  uint64 element_count = v->ElementCount();
  curandStatus_t ret =
      func(parent_, rng_, CUDAMemoryMutable(v), element_count, mean, stddev);
      func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);

  if (ret != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
@ -262,19 +238,19 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
  return true;
}

bool CUDARng::DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
                                     DeviceMemory<float> *v) {
bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
                                    DeviceMemory<float>* v) {
  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
                                        wrap::curandGenerateNormal);
}

bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
                                     DeviceMemory<double> *v) {
bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
                                    DeviceMemory<double>* v) {
  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
                                        wrap::curandGenerateNormalDouble);
}

bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
  mutex_lock lock(mu_);
  CHECK(rng_ != nullptr);

@ -303,15 +279,15 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
  return true;
}

}  // namespace cuda
}  // namespace gpu

void initialize_curand() {
  port::Status status =
      PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
          cuda::kCudaPlatformId, cuda::kCuRandPlugin, "cuRAND",
          [](internal::StreamExecutorInterface *parent) -> rng::RngSupport * {
            cuda::CUDAExecutor *cuda_executor =
                dynamic_cast<cuda::CUDAExecutor *>(parent);
          cuda::kCudaPlatformId, gpu::kGpuRandPlugin, "cuRAND",
          [](internal::StreamExecutorInterface* parent) -> rng::RngSupport* {
            gpu::GpuExecutor* cuda_executor =
                dynamic_cast<gpu::GpuExecutor*>(parent);
            if (cuda_executor == nullptr) {
              LOG(ERROR)
                  << "Attempting to initialize an instance of the cuRAND "
@ -319,7 +295,7 @@ void initialize_curand() {
              return nullptr;
            }

            cuda::CUDARng *rng = new cuda::CUDARng(cuda_executor);
            gpu::GpuRng* rng = new gpu::GpuRng(cuda_executor);
            if (!rng->Init()) {
              // Note: Init() will log a more specific error.
              delete rng;
@ -334,7 +310,7 @@ void initialize_curand() {
}

  PluginRegistry::Instance()->SetDefaultFactory(
      cuda::kCudaPlatformId, PluginKind::kRng, cuda::kCuRandPlugin);
      cuda::kCudaPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
}

}  // namespace stream_executor
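A sketch of the user-level flow that lands in GpuRng above: Stream forwards the request to whatever RngSupport plugin is registered for the platform (cuRAND here). FillUniform is a hypothetical helper:

#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"

namespace se = stream_executor;

void FillUniform(se::StreamExecutor* executor) {
  se::Stream stream(executor);
  stream.Init();
  se::DeviceMemory<float> data = executor->AllocateArray<float>(1024);
  stream.ThenPopulateRandUniform(&data);  // dispatches into GpuRng
  stream.BlockHostUntilDone();
  executor->Deallocate(&data);
}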
@ -16,85 +16,13 @@ limitations under the License.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_

#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rng.h"

typedef struct curandGenerator_st *curandGenerator_t;
#include "tensorflow/stream_executor/gpu/gpu_rng.h"

namespace stream_executor {

class Stream;
template <typename ElemT>
class DeviceMemory;

namespace cuda {

// Opaque and unique identifier for the cuRAND plugin.
extern const PluginId kCuRandPlugin;

class CUDAExecutor;

// CUDA-platform implementation of the random number generation support
// interface.
//
// Thread-safe post-initialization.
class CUDARng : public rng::RngSupport {
 public:
  explicit CUDARng(CUDAExecutor *parent);

  // Retrieves a curand library generator handle. This is necessary for
  // enqueuing random number generation work onto the device.
  // TODO(leary) provide a way for users to select the RNG algorithm.
  bool Init();

  // Releases a curand library generator handle, if one was acquired.
  ~CUDARng() override;

  // See rng::RngSupport for details on the following overrides.
  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) override;
  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) override;
  bool DoPopulateRandUniform(Stream *stream,
                             DeviceMemory<std::complex<float>> *v) override;
  bool DoPopulateRandUniform(Stream *stream,
                             DeviceMemory<std::complex<double>> *v) override;
  bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
                              DeviceMemory<float> *v) override;
  bool DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
                              DeviceMemory<double> *v) override;

  bool SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) override;

 private:
  // Actually performs the work of generating random numbers - the public
  // methods are thin wrappers to this interface.
  template <typename T>
  bool DoPopulateRandUniformInternal(Stream *stream, DeviceMemory<T> *v);
  template <typename ElemT, typename FuncT>
  bool DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, ElemT stddev,
                                      DeviceMemory<ElemT> *v, FuncT func);

  // Sets the stream for the internal curand generator.
  //
  // This is a stateful operation, as the handle can only have one stream set at
  // a given time, so it is usually performed right before enqueuing work to do
  // with random number generation.
  bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);

  // mutex that guards the cuRAND handle for this device.
  mutex mu_;

  // CUDAExecutor which instantiated this CUDARng.
  // Immutable post-initialization.
  CUDAExecutor *parent_;

  // cuRAND library handle on the device.
  curandGenerator_t rng_ GUARDED_BY(mu_);

  SE_DISALLOW_COPY_AND_ASSIGN(CUDARng);
};
using CUDARng = gpu::GpuRng;

}  // namespace cuda
}  // namespace stream_executor
@ -13,79 +13,22 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Defines the CUDAStream type - the CUDA-specific implementation of the generic
// Defines the GpuStream type - the CUDA-specific implementation of the generic
// StreamExecutor Stream interface.

#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_

#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"

namespace stream_executor {
namespace cuda {

class CUDAExecutor;
using CUDAStream = gpu::GpuStream;

// Wraps a CUstream in order to satisfy the platform-independent
// StreamInterface.
//
// Thread-safe post-initialization.
class CUDAStream : public internal::StreamInterface {
 public:
  explicit CUDAStream(CUDAExecutor *parent)
      : parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {}

  // Note: teardown is handled by a parent's call to DeallocateStream.
  ~CUDAStream() override {}

  void *GpuStreamHack() override { return cuda_stream_; }
  void **GpuStreamMemberHack() override {
    return reinterpret_cast<void **>(&cuda_stream_);
  }

  // Explicitly initialize the CUDA resources associated with this stream, used
  // by StreamExecutor::AllocateStream().
  bool Init();

  // Explicitly destroy the CUDA resources associated with this stream, used by
  // StreamExecutor::DeallocateStream().
  void Destroy();

  // Returns true if no work is pending or executing on the stream.
  bool IsIdle() const;

  // Retrieves an event which indicates that all work enqueued into the stream
  // has completed. Ownership of the event is not transferred to the caller, the
  // event is owned by this stream.
  CUevent* completed_event() { return &completed_event_; }

  // Returns the CUstream value for passing to the CUDA API.
  //
  // Precond: this CUDAStream has been allocated (otherwise passing a nullptr
  // into the NVIDIA library causes difficult-to-understand faults).
  CUstream cuda_stream() const {
    DCHECK(cuda_stream_ != nullptr);
    return const_cast<CUstream>(cuda_stream_);
  }

  CUDAExecutor *parent() const { return parent_; }

 private:
  CUDAExecutor *parent_;  // Executor that spawned this stream.
  CUstream cuda_stream_;  // Wrapped CUDA stream handle.

  // Event that indicates this stream has completed.
  CUevent completed_event_ = nullptr;
};

// Helper functions to simplify extremely common flows.
// Converts a Stream to the underlying CUDAStream implementation.
CUDAStream *AsCUDAStream(Stream *stream);

// Extracts a CUstream from a CUDAStream-backed Stream object.
CUstream AsCUDAStreamValue(Stream *stream);
inline CUDAStream* AsCUDAStream(Stream* stream) {
  return gpu::AsGpuStream(stream);
}

}  // namespace cuda
}  // namespace stream_executor
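The inline forwarder above preserves the common interop pattern. A hedged sketch of a shim that needs the raw handle; AsGpuStreamValue is the replacement for the old AsCUDAStreamValue, as already seen in the cuRAND changes above:

#include "tensorflow/stream_executor/cuda/cuda_stream.h"

namespace stream_executor {

// Extracts the driver-level stream for passing to cu* library calls.
CUstream RawStreamHandle(Stream* stream) {
  return gpu::AsGpuStreamValue(stream);
}

}  // namespace stream_executor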
@ -13,76 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Defines the CUDATimer type - the CUDA-specific implementation of the generic
// Defines the GpuTimer type - the CUDA-specific implementation of the generic
// StreamExecutor Timer interface.

#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_

#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"

namespace stream_executor {
namespace cuda {

class CUDAExecutor;
class CUDAStream;

// Wraps a pair of CUevents in order to satisfy the platform-independent
// TimerInterface -- both a start and a stop event are present which may be
// recorded in a stream.
class CUDATimer : public internal::TimerInterface {
 public:
  explicit CUDATimer(CUDAExecutor *parent)
      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}

  // Note: teardown needs to be explicitly handled in this API by a call to
  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
  // TODO(csigg): Change to RAII.
  ~CUDATimer() override {}

  // Allocates the platform-specific pieces of the timer, called as part of
  // StreamExecutor::AllocateTimer().
  bool Init();

  // Deallocates the platform-specific pieces of the timer, called as part of
  // StreamExecutor::DeallocateTimer().
  void Destroy();

  // Records the "timer start" event at the current point in the stream.
  bool Start(CUDAStream *stream);

  // Records the "timer stop" event at the current point in the stream.
  bool Stop(CUDAStream *stream);

  // Returns the elapsed time, in milliseconds, between the start and stop
  // events.
  float GetElapsedMilliseconds() const;

  // See Timer::Microseconds().
  // TODO(leary) make this into an error code interface...
  uint64 Microseconds() const override {
    return GetElapsedMilliseconds() * 1e3;
  }

  // See Timer::Nanoseconds().
  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }

 private:
  CUDAExecutor *parent_;
  CUevent start_event_;  // Event recorded to indicate the "start" timestamp
                         // executing in a stream.
  CUevent stop_event_;   // Event recorded to indicate the "stop" timestamp
                         // executing in a stream.
};

struct TimerDeleter {
  void operator()(CUDATimer *t) {
    t->Destroy();
    delete t;
  }
};
using CUDATimer = gpu::GpuTimer;

}  // namespace cuda
}  // namespace stream_executor
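For reference, a sketch of the platform-neutral timing flow that GpuTimer now backs on both CUDA and ROCm; the helper is hypothetical and assumes `stream` was initialized against a GPU executor:

#include <cstdint>

#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/timer.h"

namespace se = stream_executor;

// Times an enqueued region of work with the event-pair mechanism above.
uint64_t TimeRegionUsecs(se::StreamExecutor* executor, se::Stream* stream) {
  se::Timer timer(executor);
  stream->InitTimer(&timer).ThenStartTimer(&timer);
  // ... enqueue the work being measured here ...
  stream->ThenStopTimer(&timer);
  stream->BlockHostUntilDone();
  return timer.Microseconds();
}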
@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/stream_executor/cuda/cudnn_version.h"

namespace stream_executor {
namespace cuda {
namespace gpu {

bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                        CudnnVersion loaded_version) {
@ -36,5 +36,5 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
          loaded_version.minor_version >= source_version.minor_version));
}

}  // namespace cuda
}  // namespace gpu
}  // namespace stream_executor
@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/core/lib/strings/strcat.h"

namespace stream_executor {
namespace cuda {
namespace gpu {

struct CudnnVersion {
  CudnnVersion() = default;
@ -44,7 +44,7 @@ struct CudnnVersion {
bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                        CudnnVersion loaded_version);

}  // namespace cuda
}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/core/platform/test.h"

namespace stream_executor {
namespace cuda {
namespace gpu {
namespace {

TEST(CuDNNVersion, ToString) {
@ -68,5 +68,5 @@ TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
}

}  // namespace
}  // namespace cuda
}  // namespace gpu
}  // namespace stream_executor
@ -50,6 +50,7 @@ DeviceDescription::DeviceDescription()
      clock_rate_ghz_(-1.0),
      cuda_compute_capability_major_(-1),
      cuda_compute_capability_minor_(-1),
      rocm_amdgpu_isa_version_(-1),
      numa_node_(-1),
      core_count_(-1),
      ecc_enabled_(false) {}
@ -112,6 +113,15 @@ bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
  return cuda_compute_capability_major_ != 0;
}

bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
  bool status = false;
  if (rocm_amdgpu_isa_version_ > 0) {
    *version = rocm_amdgpu_isa_version_;
    status = true;
  }
  return status;
}

bool ThreadDimOk(const DeviceDescription &device_description,
                 const ThreadDim &thread_dim) {
  auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;

@ -133,6 +133,11 @@ class DeviceDescription {
  // zero, and the return value will be false.
  bool cuda_compute_capability(int *major, int *minor) const;

  // Returns the AMDGPU ISA version if we're running on the ROCm platform.
  // If the information is not available, the version is not modified,
  // and the return value will be false.
  bool rocm_amdgpu_isa_version(int *version) const;

  // Returns the maximum amount of shared memory present on a single core
  // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
  // devices). Note that some devices, such as NVIDIA's have a configurable
@ -195,6 +200,9 @@ class DeviceDescription {
  int cuda_compute_capability_major_;
  int cuda_compute_capability_minor_;

  // ROCM AMDGPU ISA version, 0 if not available.
  int rocm_amdgpu_isa_version_;

  int numa_node_;
  int core_count_;
  bool ecc_enabled_;
@ -280,6 +288,10 @@ class DeviceDescriptionBuilder {
    device_description_->cuda_compute_capability_minor_ = minor;
  }

  void set_rocm_amdgpu_isa_version(int version) {
    device_description_->rocm_amdgpu_isa_version_ = version;
  }

  void set_numa_node(int value) { device_description_->numa_node_ = value; }
  void set_core_count(int value) { device_description_->core_count_ = value; }
  void set_ecc_enabled(bool value) {
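With the new getter in place, platform-sensitive code can branch on whichever capability field is populated; a hedged sketch (the logging helper is hypothetical):

#include "tensorflow/core/platform/logging.h"
#include "tensorflow/stream_executor/device_description.h"

namespace se = stream_executor;

void DescribeGpuIsa(const se::DeviceDescription& desc) {
  int cc_major, cc_minor, isa_version;
  if (desc.cuda_compute_capability(&cc_major, &cc_minor)) {
    LOG(INFO) << "CUDA device, sm_" << cc_major << cc_minor;
  } else if (desc.rocm_amdgpu_isa_version(&isa_version)) {
    LOG(INFO) << "ROCm device, AMDGPU ISA version " << isa_version;
  }
}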
209 tensorflow/stream_executor/gpu/BUILD Normal file
@ -0,0 +1,209 @@
# Description:
#   GPU-platform specific StreamExecutor support code.

licenses(["notice"])  # Apache 2.0

load(
    "//tensorflow/stream_executor:build_defs.bzl",
    "stream_executor_friends",
)
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")

package_group(
    name = "friends",
    packages = stream_executor_friends(),
)

package(
    default_visibility = [":friends"],
)

# Filegroup used to collect source files for the dependency check.
filegroup(
    name = "c_srcs",
    data = glob([
        "**/*.cc",
        "**/*.h",
    ]),
)

cc_library(
    name = "gpu_activation_header",
    hdrs = ["gpu_activation.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = ["//tensorflow/stream_executor/platform"],
)

cc_library(
    name = "gpu_activation",
    srcs = ["gpu_activation.cc"],
    hdrs = ["gpu_activation.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_activation_header",
        ":gpu_driver_header",
        "//tensorflow/stream_executor",
        "//tensorflow/stream_executor:stream_executor_internal",
        "//tensorflow/stream_executor/platform",
    ],
)

cc_library(
    name = "gpu_diagnostics_header",
    hdrs = ["gpu_diagnostics.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
    ],
)

cc_library(
    name = "gpu_driver_header",
    hdrs = ["gpu_driver.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_types_header",
        "//tensorflow/stream_executor:device_options",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
        "@local_config_cuda//cuda:cuda_headers",
    ],
)

cc_library(
    name = "gpu_event_header",
    hdrs = ["gpu_event.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_driver_header",
        ":gpu_stream_header",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor/lib",
    ],
)

cc_library(
    name = "gpu_event",
    srcs = ["gpu_event.cc"],
    hdrs = ["gpu_event.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_driver_header",
        ":gpu_executor_header",
        ":gpu_stream",
        "//tensorflow/stream_executor:stream_executor_headers",
        "//tensorflow/stream_executor/lib",
    ],
)

cc_library(
    name = "gpu_executor_header",
    hdrs = ["gpu_executor.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_kernel_header",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:platform",
        "//tensorflow/stream_executor:stream_executor_internal",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
        "@com_google_absl//absl/strings",
    ],
)

cc_library(
    name = "gpu_helpers_header",
    hdrs = ["gpu_helpers.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [":gpu_types_header"],
)

cc_library(
    name = "gpu_kernel_header",
    hdrs = ["gpu_kernel.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_driver_header",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:stream_executor_pimpl_header",
        "//tensorflow/stream_executor/platform",
    ],
)

cc_library(
    name = "gpu_rng_header",
    hdrs = ["gpu_rng.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_types_header",
        "//tensorflow/stream_executor:plugin_registry",
        "//tensorflow/stream_executor:rng",
        "//tensorflow/stream_executor/platform",
    ],
)

cc_library(
    name = "gpu_stream_header",
    hdrs = ["gpu_stream.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_driver_header",
        "//tensorflow/stream_executor:stream_executor_internal",
        "//tensorflow/stream_executor/platform",
    ],
)

cc_library(
    name = "gpu_stream",
    srcs = ["gpu_stream.cc"],
    hdrs = ["gpu_stream.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_driver_header",
        ":gpu_executor_header",
        "//tensorflow/stream_executor:stream_executor_headers",
        "//tensorflow/stream_executor:stream_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
    ],
)

cc_library(
    name = "gpu_timer_header",
    hdrs = ["gpu_timer.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_driver_header",
        ":gpu_executor_header",
        "//tensorflow/stream_executor:stream_executor_internal",
    ],
)

cc_library(
    name = "gpu_timer",
    srcs = ["gpu_timer.cc"],
    hdrs = ["gpu_timer.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        ":gpu_driver_header",
        ":gpu_executor_header",
        ":gpu_stream",
        "//tensorflow/stream_executor:stream_executor_headers",
        "//tensorflow/stream_executor/lib",
    ],
)

cc_library(
    name = "gpu_types_header",
    hdrs = ["gpu_types.h"],
    visibility = ["//tensorflow/stream_executor:__subpackages__"],
    deps = [
        "//tensorflow/stream_executor/platform",
    ] + if_cuda_is_configured([
        "@local_config_cuda//cuda:cuda_headers",
    ]) + if_rocm_is_configured([
        "@local_config_rocm//rocm:rocm_headers",
    ]),
)
@ -1,4 +1,4 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,36 +13,36 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"

#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace stream_executor {
namespace cuda {
namespace gpu {

CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec);
CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec);
GpuContext* ExtractGpuContext(GpuExecutor* gpu_exec);
GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec);

ScopedActivateExecutorContext::ScopedActivateExecutorContext(
    CUDAExecutor *cuda_exec):
    driver_scoped_activate_context_(
      new ScopedActivateContext{ExtractCudaContext(cuda_exec)}) { }
    GpuExecutor* gpu_exec)
    : driver_scoped_activate_context_(
          new ScopedActivateContext{ExtractGpuContext(gpu_exec)}) {}

ScopedActivateExecutorContext::ScopedActivateExecutorContext(
    StreamExecutor *stream_exec)
    : ScopedActivateExecutorContext(ExtractCudaExecutor(stream_exec)) {}
    StreamExecutor* stream_exec)
    : ScopedActivateExecutorContext(ExtractGpuExecutor(stream_exec)) {}

ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
  delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_);
  delete static_cast<ScopedActivateContext*>(driver_scoped_activate_context_);
}

ScopedActivateExecutorContext::ScopedActivateExecutorContext(
    ScopedActivateExecutorContext &&other)
    ScopedActivateExecutorContext&& other)
    : driver_scoped_activate_context_(other.driver_scoped_activate_context_) {
  other.driver_scoped_activate_context_ = nullptr;
}

}  // namespace cuda
}  // namespace gpu
}  // namespace stream_executor
tensorflow/stream_executor/gpu/gpu_activation.h (new file, 61 lines)
@ -0,0 +1,61 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// This file contains APIs that assume a StreamExecutor is backed by CUDA.
// It reaches into the CUDA implementation to activate an underlying CUDA
// context.
//
// Having this file separate from gpu/gpu_executor.h means that dependent
// code does not also have to depend on cuda.h.

#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_

#include "tensorflow/stream_executor/platform/port.h"

namespace stream_executor {

class StreamExecutor;

namespace gpu {

class GpuExecutor;
class ScopedActivateContext;

// Activates a CUDA context within an enclosing scope.
class ScopedActivateExecutorContext {
 public:
  // Form that takes a CUDA executor implementation.
  explicit ScopedActivateExecutorContext(GpuExecutor* gpu_exec);

  // Form that takes a pImpl executor and extracts a CUDA implementation --
  // fatal failure if it is not CUDA inside.
  explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);

  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);

  ~ScopedActivateExecutorContext();

 private:
  // The cuda.h-using datatype that we wrap.
  ScopedActivateContext* driver_scoped_activate_context_;

  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
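To make the intended use concrete, a minimal call-site sketch; the function and its setup are hypothetical, and only ScopedActivateExecutorContext comes from the header above.

#include "tensorflow/stream_executor/gpu/gpu_activation.h"

// Sketch only: `stream_exec` is assumed to come from the usual platform
// lookup; it must be GPU-backed or construction fails fatally (see above).
void RunWithActivatedContext(stream_executor::StreamExecutor* stream_exec) {
  stream_executor::gpu::ScopedActivateExecutorContext activated(stream_exec);
  // ... driver calls issued in this scope target stream_exec's context ...
}  // leaving the scope deletes the wrapped ScopedActivateContext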
tensorflow/stream_executor/gpu/gpu_diagnostics.h (new file, 99 lines)
@ -0,0 +1,99 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_

#include <tuple>

#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace stream_executor {
namespace gpu {

// e.g. DriverVersion{346, 3, 4}
using DriverVersion = std::tuple<int, int, int>;

// FIXME: These functions are in the stream_executor::cuda namespace for now.
// Will move to the stream_executor::gpu namespace in the near future.
//
//// Converts a parsed driver version to string form.
// string DriverVersionToString(DriverVersion version);
//
//// Converts a parsed driver version or status value to natural string form.
// string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
//
//// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
// port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);

class Diagnostician {
 public:
  // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
  // not initializing).
  //
  // Note: if we're running on a machine that has no GPUs, we don't want to
  // produce very much log spew beyond saying, "looks like there's no CUDA
  // kernel module running".
  //
  // Note: we use non-Google-File:: API here because we may be called before
  // InitGoogle has completed.
  static void LogDiagnosticInformation();

  // Given the driver version file contents, finds the kernel module version
  // and returns it as a string.
  //
  // This is solely used for more informative log messages when the user is
  // running on a machine that happens to have a libcuda/kernel driver mismatch.
  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
      const string& driver_version_file_contents);

  // Extracts the kernel driver version from the current host.
  static port::StatusOr<DriverVersion> FindKernelDriverVersion();

  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
  // driver-interfacing DSO version number. Returns it as a string.
  static port::StatusOr<DriverVersion> FindDsoVersion();

  // Logs information about the kernel driver version and userspace driver
  // library version.
  static void LogDriverVersionInformation();

 private:
  // Given the DSO version number and the driver version file contents,
  // extracts the driver version and compares, warning the user in the case of
  // incompatibility.
  //
  // This is solely used for more informative log messages when the user is
  // running on a machine that happens to have a libcuda/kernel driver mismatch.
  static void WarnOnDsoKernelMismatch(
      port::StatusOr<DriverVersion> dso_version,
      port::StatusOr<DriverVersion> kernel_version);

  // Logs information about the dev nodes present on this machine: their
  // existence, permissions, accessibility from this uid/gid.
  static void LogDevNodeDiagnosticInformation();

  static string GetDevNodePath(int dev_node_ordinal);

  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
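A hedged sketch of how a caller might combine these entry points; the wrapper function is illustrative, not part of the PR.

#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"

// Sketch only: a hypothetical hook a platform might call when GPU
// initialization fails, built from the public Diagnostician entry points.
void ReportGpuInitFailureSketch() {
  namespace gpu = stream_executor::gpu;
  // Logs kernel-module vs. userspace-DSO versions and dev-node state.
  gpu::Diagnostician::LogDiagnosticInformation();
  auto dso_version = gpu::Diagnostician::FindDsoVersion();
  auto kernel_version = gpu::Diagnostician::FindKernelDriverVersion();
  // A mismatch between the two is the most common misconfiguration.
  (void)dso_version; (void)kernel_version;
}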
tensorflow/stream_executor/gpu/gpu_driver.h (new file, 525 lines)
@ -0,0 +1,525 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// CUDA userspace driver library wrapper functionality.

#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_

#include <stddef.h>

#include "cuda/include/cuda.h"
#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"

#include "tensorflow/stream_executor/gpu/gpu_types.h"

namespace stream_executor {
namespace gpu {

// Identifies the memory space where an allocation resides. See
// GpuDriver::GetPointerMemorySpace().
enum class MemorySpace { kHost, kDevice };

// Returns a casual string, such as "host", for the provided memory space.
string MemorySpaceString(MemorySpace memory_space);

class GpuContext;

// GpuDriver contains wrappers for calls to the userspace library driver. It's
// useful to isolate these calls and put basic wrappers around them to separate
// userspace library driver behaviors from the rest of the program.
//
// At the moment it's simply used as a namespace.
//
// The calls log any specific errors internally and return whether the
// operation was successful to the caller.
//
// The order of parameters is generally kept symmetric with the underlying CUDA
// driver API.
//
// Links on functions are to specific documentation under
// http://docs.nvidia.com/cuda/cuda-driver-api/
//
// Thread safety: these functions should not be used from signal handlers.
class GpuDriver {
 public:
  // Wraps a call to cuInit with logging to help indicate what has gone wrong
  // in the case of failure. Safe to call multiple times; will be fast on all
  // calls after the first.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
  static port::Status Init();

  // Returns the device associated with the given context.
  // device is an outparam owned by the caller, must not be null.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
  static port::StatusOr<GpuDeviceHandle> DeviceFromContext(GpuContext* context);

  // Creates a new CUDA stream associated with the given context via
  // cuStreamCreate.
  // stream is an outparam owned by the caller, must not be null.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
  static bool CreateStream(GpuContext* context, GpuStreamHandle* stream);

  // Destroys a CUDA stream associated with the given context.
  // stream is owned by the caller, must not be null, and *stream is set to
  // null if the stream is successfully destroyed.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
  static void DestroyStream(GpuContext* context, GpuStreamHandle* stream);

  // CUDA events can explicitly disable event TSC retrieval for some presumed
  // performance improvement if timing is unnecessary.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
  enum class EventFlags { kDefault, kDisableTiming };

  // Creates a new event associated with the given context.
  // result is an outparam owned by the caller and must not be null.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
  static port::Status CreateEvent(GpuContext* context, GpuEventHandle* result,
                                  EventFlags flags);

  // Destroys *event and turns it into a nullptr. event may not be null, but
  // *event may be, via cuEventDestroy.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
  static port::Status DestroyEvent(GpuContext* context, GpuEventHandle* event);

  // Allocates a GPU memory space of size bytes associated with the given
  // context via cuMemAlloc.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
  static void* DeviceAllocate(GpuContext* context, uint64 bytes);

  // Deallocates a GPU memory space of size bytes associated with the given
  // context via cuMemFree.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
  static void DeviceDeallocate(GpuContext* context, void* location);

  // Allocates a unified memory space of size bytes associated with the given
  // context via cuMemAllocManaged.
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
  // (supported on CUDA only)
  static void* UnifiedMemoryAllocate(GpuContext* context, uint64 bytes);

  // Deallocates a unified memory space of size bytes associated with the given
  // context via cuMemFree.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
  // (supported on CUDA only)
  static void UnifiedMemoryDeallocate(GpuContext* context, void* location);

  // Allocates page-locked and CUDA-registered memory on the host via
  // cuMemAllocHost.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
  static void* HostAllocate(GpuContext* context, uint64 bytes);

  // Deallocates a location created by HostAllocate, via cuMemFreeHost.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
  static void HostDeallocate(GpuContext* context, void* location);

  // Registers a memory region at location of size bytes via cuMemHostRegister.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
  static bool HostRegister(GpuContext* context, void* location, uint64 bytes);

  // Unregisters a memory region that was previously registered at location via
  // cuMemHostUnregister.
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
  //
  // TODO(leary) verify an error will be returned if the location wasn't
  // previously registered.
  static bool HostUnregister(GpuContext* context, void* location);

  // Given a device ordinal, returns a device handle into the device outparam,
  // which must not be null.
  //
  // N.B. these device handles do not have a corresponding destroy function in
  // the CUDA driver API.
  static port::Status GetDevice(int device_ordinal, GpuDeviceHandle* device);

  // Given a device handle, returns the name reported by the driver for the
  // device.
  static bool GetDeviceName(GpuDeviceHandle device, string* device_name);

  // Given a device to create a context for, returns a context handle into the
  // context outparam, which must not be null.
  //
  // N.B. CUDA contexts are weird. They are implicitly associated with the
  // calling thread. Current documentation on contexts and their influence on
  // userspace processes is given here:
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
  static port::Status CreateContext(int device_ordinal, GpuDeviceHandle device,
                                    const DeviceOptions& device_options,
                                    GpuContext** context);

  // Destroys the provided context via cuCtxDestroy.
  // Don't do this while clients could still be using the context; per the
  // docs, bad things will happen.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
  static void DestroyContext(GpuContext* context);

  // Queries the runtime for the specified attribute of the specified function.
  // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
  // in terms of integer-sized values, so there's no potential for overrun (as
  // of CUDA 5.5).
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
  static bool FuncGetAttribute(GpuFunctionAttribute attribute,
                               GpuFunctionHandle function,
                               int* attribute_value);

  // Sets the preferred cache configuration for the specified function.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
  static bool FuncSetCacheConfig(GpuFunctionHandle function,
                                 GpuFuncCachePreference cache_config);

  // Gets the preferred shared memory bank configuration for the specified
  // CONTEXT (not function!), either default or four- or eight-byte bank size.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
  static port::StatusOr<GpuSharedMemConfig> ContextGetSharedMemConfig(
      GpuContext* context);

  // Sets the preferred shared memory bank configuration for the specified
  // CONTEXT (not function!), either default or four- or eight-byte bank size.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
  static port::Status ContextSetSharedMemConfig(
      GpuContext* context, GpuSharedMemConfig shared_mem_config);

  // Launches a CUDA kernel via cuLaunchKernel.
  // TODO(leary) describe the structure of kernel_params and extra in a
  // readable way.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
  static bool LaunchKernel(GpuContext* context, GpuFunctionHandle function,
                           unsigned int grid_dim_x, unsigned int grid_dim_y,
                           unsigned int grid_dim_z, unsigned int block_dim_x,
                           unsigned int block_dim_y, unsigned int block_dim_z,
                           unsigned int shared_mem_bytes,
                           GpuStreamHandle stream, void** kernel_params,
                           void** extra);

  // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
  // handle in "module". Any error logs that are produced are logged internally.
  // (supported on CUDA only)
  static bool LoadPtx(GpuContext* context, const char* ptx_contents,
                      GpuModuleHandle* module);

  // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
  // the resulting handle in "module".
  // (supported on CUDA only)
  static port::Status LoadCubin(GpuContext* context, const char* cubin_bytes,
                                GpuModuleHandle* module);

  // Loads HSACO with the ROCm runtime and stores the resulting handle in
  // "module". Any error logs that are produced are logged internally.
  // (supported on ROCm only)
  static bool LoadHsaco(GpuContext* context, const char* hsaco_contents,
                        GpuModuleHandle* module);

  // Retrieves a named kernel from a loaded module, and places the resulting
  // handle into function (outparam) on success. Neither kernel_name nor
  // function may be null. No ownership is taken of kernel_name.
  static bool GetModuleFunction(GpuContext* context, GpuModuleHandle module,
                                const char* kernel_name,
                                GpuFunctionHandle* function);

  // Retrieves a named global/constant symbol from a loaded module, and returns
  // a device pointer and size of the symbol on success. symbol_name may not be
  // null. At least one of dptr or bytes should not be null. No ownership is
  // taken of symbol_name.
  static bool GetModuleSymbol(GpuContext* context, GpuModuleHandle module,
                              const char* symbol_name, GpuDevicePtr* dptr,
                              size_t* bytes);

  // Unloads module from the current context via cuModuleUnload.
  // TODO(leary) the documentation doesn't say what kind of disasters happen
  // if you try to unload a module while its GpuFunctionHandles are in use.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
  static void UnloadModule(GpuContext* context, GpuModuleHandle module);

  // Performs a synchronous memset of the device memory segment via cuMemsetD8.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
  static bool SynchronousMemsetUint8(GpuContext* context, GpuDevicePtr location,
                                     uint8 value, size_t size);

  // Performs a synchronous memset of the device memory segment via
  // cuMemsetD32.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
  static bool SynchronousMemsetUint32(GpuContext* context,
                                      GpuDevicePtr location, uint32 value,
                                      size_t uint32_count);

  // Performs an asynchronous memset of the device memory segment via
  // cuMemsetD8Async.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
  static bool AsynchronousMemsetUint8(GpuContext* context,
                                      GpuDevicePtr location, uint8 value,
                                      size_t uint32_count,
                                      GpuStreamHandle stream);

  // Performs an asynchronous memset of the device memory segment via
  // cuMemsetD32Async.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
  static bool AsynchronousMemsetUint32(GpuContext* context,
                                       GpuDevicePtr location, uint32 value,
                                       size_t uint32_count,
                                       GpuStreamHandle stream);

  // -- Synchronous memcopies.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169

  static port::Status SynchronousMemcpyD2H(GpuContext* context, void* host_dst,
                                           GpuDevicePtr gpu_src, uint64 size);
  static port::Status SynchronousMemcpyH2D(GpuContext* context,
                                           GpuDevicePtr gpu_dst,
                                           const void* host_src, uint64 size);
  static port::Status SynchronousMemcpyD2D(GpuContext* context,
                                           GpuDevicePtr gpu_dst,
                                           GpuDevicePtr gpu_src, uint64 size);

  // -- Asynchronous memcopies.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362

  static bool AsynchronousMemcpyD2H(GpuContext* context, void* host_dst,
                                    GpuDevicePtr gpu_src, uint64 size,
                                    GpuStreamHandle stream);
  static bool AsynchronousMemcpyH2D(GpuContext* context, GpuDevicePtr gpu_dst,
                                    const void* host_src, uint64 size,
                                    GpuStreamHandle stream);
  static bool AsynchronousMemcpyD2D(GpuContext* context, GpuDevicePtr gpu_dst,
                                    GpuDevicePtr gpu_src, uint64 size,
                                    GpuStreamHandle stream);

  // The CUDA stream callback type signature.
  // The data passed to AddStreamCallback is subsequently passed to this
  // callback when it fires.
  //
  // Some notable things:
  // * Callbacks must not make any CUDA API calls.
  // * Callbacks from independent streams execute in an undefined order and may
  //   be serialized.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
  typedef void (*StreamCallback)(GpuStreamHandle stream, GpuStatus status,
                                 void* data);

  // Enqueues a callback operation into stream.
  // See StreamCallback above and the NVIDIA documentation for additional
  // details.
  static bool AddStreamCallback(GpuContext* context, GpuStreamHandle stream,
                                StreamCallback callback, void* data);

  // Causes stream to wait for event to trigger before proceeding via
  // cuStreamWaitEvent.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
  static bool WaitStreamOnEvent(GpuContext* context, GpuStreamHandle stream,
                                GpuEventHandle event);

  // Blocks the calling thread until the operations enqueued onto stream have
  // been completed, via cuStreamSynchronize.
  //
  // TODO(leary) if a pathological thread enqueues operations onto the stream
  // while another thread blocks like this, can you wind up waiting an
  // unbounded amount of time?
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
  static port::Status SynchronizeStream(GpuContext* context,
                                        GpuStreamHandle stream);

  // Blocks the calling thread until the operations associated with the context
  // have been completed, via cuCtxSynchronize.
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
  static bool SynchronizeContext(GpuContext* context);

  // Returns true if all stream tasks have completed at time of the call. Note
  // the potential for races around this call (if another thread adds work to
  // the stream immediately after this returns).
  static bool IsStreamIdle(GpuContext* context, GpuStreamHandle stream);

  // Returns whether code in the from context can access memory in the to
  // context via cuDeviceCanAccessPeer.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
  static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);

  // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
  static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);

  // Returns the elapsed milliseconds between start and stop via
  // cuEventElapsedTime.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
  static bool GetEventElapsedTime(GpuContext* context,
                                  float* elapsed_milliseconds,
                                  GpuEventHandle start, GpuEventHandle stop);

  // Records that an event occurred when execution reaches the current point in
  // the stream via cuEventRecord.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
  static port::Status RecordEvent(GpuContext* context, GpuEventHandle event,
                                  GpuStreamHandle stream);

  // Polls (without blocking) to determine the status of an event - pending or
  // complete (or an error status).
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
  static port::StatusOr<GpuStatus> QueryEvent(GpuContext* context,
                                              GpuEventHandle event);

  // -- Pointer-specific calls.

  // Returns the context in which pointer was allocated or registered.
  static port::StatusOr<GpuContext*> GetPointerContext(GpuDevicePtr pointer);

  // Returns the device associated with the context from GetPointerContext().
  static port::StatusOr<GpuDeviceHandle> GetPointerDevice(GpuDevicePtr pointer);

  // Returns the memory space addressed by pointer.
  static port::StatusOr<MemorySpace> GetPointerMemorySpace(
      GpuDevicePtr pointer);

  // Returns the base address and size of the device pointer dptr.
  static port::Status GetPointerAddressRange(GpuDevicePtr dptr,
                                             GpuDevicePtr* base, size_t* size);

  // -- Device-specific calls.

  // Returns the compute capability for the device; e.g. (3, 5).
  // This is currently done via the deprecated device API.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
  // (supported on CUDA only)
  static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
                                           GpuDeviceHandle device);

  // Returns the GPU ISA version for the device; e.g. 803, 900.
  // (supported on ROCm only)
  static port::Status GetGpuISAVersion(int* version, GpuDeviceHandle device);

  // Returns the number of multiprocessors on the device (note that the device
  // may be multi-GPU-per-board).
  static port::StatusOr<int> GetMultiprocessorCount(GpuDeviceHandle device);

  // Returns the limit on number of threads that can be resident in a single
  // multiprocessor.
  static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(
      GpuDeviceHandle device);

  // Returns the limit on number of threads which may be resident for a single
  // block (cooperative thread array).
  static port::StatusOr<int64> GetMaxThreadsPerBlock(GpuDeviceHandle device);

  // Returns the amount of shared memory available on a single GPU core (i.e.
  // SM on NVIDIA devices).
  static port::StatusOr<int64> GetMaxSharedMemoryPerCore(
      GpuDeviceHandle device);

  // Returns the amount of shared memory available for a single block
  // (cooperative thread array).
  static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(
      GpuDeviceHandle device);

  // Returns the maximum supported number of registers per block.
  static port::StatusOr<int64> GetMaxRegistersPerBlock(GpuDeviceHandle device);

  // Returns the number of threads per warp.
  static port::StatusOr<int64> GetThreadsPerWarp(GpuDeviceHandle device);

  // Queries the grid limits for device with cuDeviceGetAttribute calls.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
  static bool GetGridLimits(int* x, int* y, int* z, GpuDeviceHandle device);

  // Returns a grab-bag of device properties in a caller-owned
  // device_properties structure for device_ordinal via cuDeviceGetProperties.
  //
  // This call is deprecated in the NVIDIA driver API; its replacement is
  // GetDeviceAttribute.
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
  static bool GetDeviceProperties(GpuDeviceProperty* device_properties,
                                  int device_ordinal);

  // Gets a specific integer-valued property about the given device.
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
  static port::StatusOr<int> GetDeviceAttribute(GpuDeviceAttribute attribute,
                                                GpuDeviceHandle device);

  // Returns whether ECC is enabled for the given GpuDeviceHandle via
  // cuDeviceGetAttribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
  static bool IsEccEnabled(GpuDeviceHandle device, bool* result);

  // Returns the total amount of memory available for allocation by the CUDA
  // context, in bytes, via cuDeviceTotalMem.
  static bool GetDeviceTotalMemory(GpuDeviceHandle device, uint64* result);

  // Returns the free amount of memory and total amount of memory, as reported
  // by cuMemGetInfo.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
  static bool GetDeviceMemoryInfo(GpuContext* context, int64* free,
                                  int64* total);

  // Returns a PCI bus id string for the device.
  // [domain]:[bus]:[device].[function]
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
  static string GetPCIBusID(GpuDeviceHandle device);

  // -- Context- and device-independent calls.

  // Returns the number of visible CUDA devices via cuDeviceGetCount.
  // This should correspond to the set of device ordinals available.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
  static int GetDeviceCount();

  // Returns the driver version number via cuDriverGetVersion.
  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
  // instead, the CUDA toolkit release number that this driver is compatible
  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
  // compatible driver).
  //
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
  static bool GetDriverVersion(int* driver_version);

  // -- Other calls

  // Returns the maximum number of blocks (per multiprocessor) occupied by the
  // specified kernel/GpuFunctionHandle when launched with the specified
  // parameters.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
      GpuContext* context, GpuFunctionHandle kernel, int threads_per_block,
      size_t dynamic_shared_memory_bytes);

  // Seam for injecting an error at CUDA initialization time for testing
  // purposes.
  static bool driver_inject_init_error_;
};

// Ensures a context is activated within a scope.
class ScopedActivateContext {
 public:
  // Activates the context via cuCtxSetCurrent, if it is not the currently
  // active context (a la cuCtxGetCurrent). Note the alternative push/pop
  // mechanism is said by NVIDIA to be relatively slow and deprecated.
  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
  explicit ScopedActivateContext(GpuContext* context);

  // Checks that the context has remained activated for the duration of the
  // scope.
  ~ScopedActivateContext();

 private:
  GpuContext* to_restore_ = nullptr;
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
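A minimal sketch of a typical GpuDriver sequence, with error handling elided; the wrapper function is hypothetical, and obtaining the GpuContext* is assumed to have happened via CreateContext() during executor initialization.

#include <cstdint>

#include "tensorflow/stream_executor/gpu/gpu_driver.h"

// Sketch only: illustrates the create/copy/synchronize/destroy shape of the
// API above; a real caller checks every return value.
void CopyToDeviceSketch(stream_executor::gpu::GpuContext* ctx,
                        const void* host_src,
                        stream_executor::gpu::GpuDevicePtr gpu_dst,
                        std::uint64_t size) {
  namespace gpu = stream_executor::gpu;
  gpu::GpuStreamHandle stream;
  gpu::GpuDriver::CreateStream(ctx, &stream);  // returns bool
  gpu::GpuDriver::AsynchronousMemcpyH2D(ctx, gpu_dst, host_src, size, stream);
  auto status = gpu::GpuDriver::SynchronizeStream(ctx, stream);
  (void)status;  // a real caller would check this port::Status
  gpu::GpuDriver::DestroyStream(ctx, &stream);
}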
tensorflow/stream_executor/gpu/gpu_event.cc (new file, 47 lines)
@ -0,0 +1,47 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/gpu/gpu_event.h"

#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/statusor.h"

namespace stream_executor {
namespace gpu {

GpuEvent::GpuEvent(GpuExecutor* parent)
    : parent_(parent), gpu_event_(nullptr) {}

GpuEvent::~GpuEvent() {}

port::Status GpuEvent::Init() {
  return GpuDriver::CreateEvent(parent_->gpu_context(), &gpu_event_,
                                GpuDriver::EventFlags::kDisableTiming);
}

port::Status GpuEvent::Destroy() {
  return GpuDriver::DestroyEvent(parent_->gpu_context(), &gpu_event_);
}

port::Status GpuEvent::Record(GpuStream* stream) {
  return GpuDriver::RecordEvent(parent_->gpu_context(), gpu_event_,
                                stream->gpu_stream());
}

GpuEventHandle GpuEvent::gpu_event() { return gpu_event_; }

}  // namespace gpu
}  // namespace stream_executor
tensorflow/stream_executor/gpu/gpu_event.h (new file, 62 lines)
@ -0,0 +1,62 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_

#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/status.h"

namespace stream_executor {
namespace gpu {

// GpuEvent wraps a GpuEventHandle in the platform-independent EventInterface
// interface.
class GpuEvent : public internal::EventInterface {
 public:
  explicit GpuEvent(GpuExecutor* parent);

  ~GpuEvent() override;

  // Populates the CUDA-platform-specific elements of this object.
  port::Status Init();

  // Deallocates any platform-specific elements of this object. This is broken
  // out (not part of the destructor) to allow for error reporting.
  port::Status Destroy();

  // Inserts the event at the current position into the specified stream.
  port::Status Record(GpuStream* stream);

  // Polls the CUDA platform for the event's current status.
  Event::Status PollForStatus();

  // The underlying CUDA event element.
  GpuEventHandle gpu_event();

 private:
  // The Executor to which this object and its GpuEventHandle are bound.
  GpuExecutor* parent_;

  // The underlying CUDA event element.
  GpuEventHandle gpu_event_;
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
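A hedged sketch of the lifecycle this interface implies; the helper and its arguments are illustrative, not part of the PR.

#include "tensorflow/stream_executor/gpu/gpu_event.h"

// Sketch only: `exec` and `stream` are assumed to come from the surrounding
// executor machinery; statuses are captured but not otherwise handled here.
void RecordMarkerSketch(stream_executor::gpu::GpuExecutor* exec,
                        stream_executor::gpu::GpuStream* stream) {
  stream_executor::gpu::GpuEvent event(exec);
  auto init_status = event.Init();            // creates the driver-level event
  auto record_status = event.Record(stream);  // enqueues it on the stream
  // ... later: event.PollForStatus() reports pending/complete/error ...
  auto destroy_status = event.Destroy();      // explicit teardown for error reporting
  (void)init_status; (void)record_status; (void)destroy_status;
}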
tensorflow/stream_executor/gpu/gpu_executor.h (new file, 347 lines)
@ -0,0 +1,347 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <set>
#include <unordered_map>

#include "absl/strings/string_view.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace stream_executor {
namespace gpu {

// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class GpuExecutor : public internal::StreamExecutorInterface {
 public:
  // sub_platform indicates the subplatform used in this executor; it must
  // be a CUDA type.
  explicit GpuExecutor(const PluginConfig& plugin_config)
      : device_(0),
        context_(nullptr),
        device_ordinal_(0),
        cc_major_(0),
        cc_minor_(0),
        version_(0),
        plugin_config_(plugin_config) {}

  // See the corresponding StreamExecutor methods for method comments on the
  // following overrides.

  ~GpuExecutor() override;

  port::Status Init(int device_ordinal, DeviceOptions device_options) override;

  bool GetKernel(const MultiKernelLoaderSpec& spec,
                 KernelBase* kernel) override;
  // (supported on CUDA only)
  void UnloadKernel(const KernelBase* kernel) override;
  bool LoadModule(const MultiModuleLoaderSpec& spec,
                  ModuleHandle* module_handle) override;
  bool UnloadModule(ModuleHandle module_handle) override;

  bool Launch(Stream* stream, const ThreadDim& thread_dims,
              const BlockDim& block_dims, const KernelBase& k,
              const KernelArgsArrayBase& args) override;

  // (supported on CUDA only)
  int CalculateOccupancy(const DeviceDescription& device_description,
                         uint64 registers_per_thread,
                         uint64 shared_memory_per_block,
                         const ThreadDim& thread_dims, GpuFunctionHandle func);

  // (supported on CUDA only)
  int CompareOccupancy(int* initial_blocks,
                       const DeviceDescription& device_description,
                       uint64 registers_per_thread,
                       uint64 shared_memory_per_block,
                       const ThreadDim& thread_dims, GpuFunctionHandle func);

  void* Allocate(uint64 size) override;

  void* AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                          uint64 size_bytes) override;

  void Deallocate(DeviceMemoryBase* mem) override;

  void* UnifiedMemoryAllocate(uint64 size) override {
    return GpuDriver::UnifiedMemoryAllocate(context_, size);
  }

  void UnifiedMemoryDeallocate(void* location) override {
    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
  }

  // CUDA allocation/registration functions are necessary because the driver
  // internally sets up buffers for DMA operations (and page locks them).
  // There's no external interface for us to otherwise control these DMA
  // settings.
  void* HostMemoryAllocate(uint64 size) override {
    return GpuDriver::HostAllocate(context_, size);
  }

  void HostMemoryDeallocate(void* location) override {
    return GpuDriver::HostDeallocate(context_, location);
  }

  bool HostMemoryRegister(void* location, uint64 size) override;

  bool HostMemoryUnregister(void* location) override;

  bool SynchronizeAllActivity() override;

  bool SynchronousMemZero(DeviceMemoryBase* location, uint64 size) override;

  bool SynchronousMemSet(DeviceMemoryBase* location, int value,
                         uint64 size) override;

  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                 const void* host_src, uint64 size) override;

  port::Status SynchronousMemcpy(void* host_dst,
                                 const DeviceMemoryBase& gpu_src,
                                 uint64 size) override;

  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
                                               const DeviceMemoryBase& gpu_src,
                                               uint64 size) override;

  bool MemZero(Stream* stream, DeviceMemoryBase* location,
               uint64 size) override;
  bool Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
              uint64 size) override;
  bool Memset32(Stream* stream, DeviceMemoryBase* location, uint32 pattern,
                uint64 size) override;

  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
              uint64 size) override;

  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
              uint64 size) override;

  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
                            const DeviceMemoryBase& gpu_src,
                            uint64 size) override;

  bool HostCallback(Stream* stream,
                    std::function<port::Status()> callback) override;

  bool AllocateStream(Stream* stream) override;

  void DeallocateStream(Stream* stream) override;

  bool CreateStreamDependency(Stream* dependent, Stream* other) override;

  bool AllocateTimer(Timer* timer) override;

  void DeallocateTimer(Timer* timer) override;

  bool StartTimer(Stream* stream, Timer* timer) override;

  bool StopTimer(Stream* stream, Timer* timer) override;

  port::Status AllocateEvent(Event* event) override;

  port::Status DeallocateEvent(Event* event) override;

  port::Status RecordEvent(Stream* stream, Event* event) override;

  port::Status WaitForEvent(Stream* stream, Event* event) override;

  Event::Status PollForEventStatus(Event* event) override;

  port::Status BlockHostUntilDone(Stream* stream) override;

  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }

  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;

  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;

  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;

  bool DeviceMemoryUsage(int64* free, int64* total) const override;

  // Searches for the symbol and returns a device pointer and size.
  // Returns false if symbol does not exist.
  bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
                 void** mem, size_t* bytes) override;

  DeviceDescription* PopulateDeviceDescription() const override;

  // Populates the block_dim_limit by querying the device driver API. If an
  // error occurs at any point while asking the driver for block dim limits, it
  // will be only partially populated as a result, and an error will be logged.
  bool FillBlockDimLimit(BlockDim* block_dim_limit) const;

  bool SupportsBlas() const override;

  blas::BlasSupport* CreateBlas() override;

  bool SupportsFft() const override;

  fft::FftSupport* CreateFft() override;

  bool SupportsRng() const override;

  rng::RngSupport* CreateRng() override;

  bool SupportsDnn() const override;

  dnn::DnnSupport* CreateDnn() override;

  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override;

  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override;

  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;

  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;

  void* GpuContextHack() override;

  GpuContext* gpu_context();

 private:
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for compute-capability-specific suffixed versions;
  // i.e. looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is
  // present if we're on a compute capability 3.0 machine.
  // (supported on CUDA only)
  bool FindOnDiskForComputeCapability(absl::string_view filename,
                                      absl::string_view canonical_suffix,
                                      string* found_filename) const;

  // Attempts to find a more specific version of the file indicated by
  // filename by looking for AMDGPU ISA-specific suffixed versions.
  // (supported on ROCm only)
  bool FindOnDiskForISAVersion(absl::string_view filename,
                               absl::string_view canonical_suffix,
                               string* found_filename) const;

  // Host callback landing routine invoked by CUDA.
  // data: User-provided callback provided to HostCallback() above, captured
  //       as a std::function<void()>. Allocated/initialized inside
  //       HostCallback() and owned and deleted by this call.
  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
                                   void* data);

  // Collects metadata for the specified kernel.
  bool GetKernelMetadata(GpuKernel* cuda_kernel,
                         KernelMetadata* kernel_metadata);

  // Prints to VLOG(2) information about the kernel's occupancy and how it
  // might be improved.
  void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
                         const BlockDim& block_dims);

  // (supported on CUDA only)
  bool LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
  // (supported on CUDA only)
  bool LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // (supported on ROCm only)
  bool LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  bool UnloadGpuBinary(const void* gpu_binary)
      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Guards the on-disk-module mapping.
  mutex disk_modules_mu_;

  // Mapping from filename to GpuModuleHandle, if it was already retrieved.
  // Multiple GpuFunctionHandles are usually obtained from a single
  // GpuModuleHandle, so we attempt to hit in this mapping first, before
  // retrieving it.
  std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);

  // Guards the in-memory-module mapping.
  mutex in_memory_modules_mu_;

  std::map<const char*, GpuModuleHandle> in_memory_modules_
      GUARDED_BY(in_memory_modules_mu_);

  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
      GUARDED_BY(in_memory_modules_mu_);
  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);

  // Guards the launched kernel set.
  mutex launched_kernels_mu_;

  // Keeps track of the set of launched kernels. Currently used to suppress the
  // occupancy check on subsequent launches.
  std::set<GpuFunctionHandle> launched_kernels_
      GUARDED_BY(launched_kernels_mu_);

  // Handle for the CUDA device being operated on. Immutable
  // post-initialization.
  GpuDeviceHandle device_;

  // Handle for session with the library/driver. Immutable post-initialization.
  GpuContext* context_;

  // The device ordinal value that this executor was initialized with; recorded
  // for use in getting device metadata. Immutable post-initialization.
  int device_ordinal_;

  // The major version of the compute capability for device_.
  int cc_major_;

  // The minor version of the compute capability for device_.
  int cc_minor_;

  // GPU ISA version for device_.
  int version_;

  // The plugin configuration associated with this instance.
  PluginConfig plugin_config_;

  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
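For orientation, a small sketch of the pImpl layering; real call sites reach GpuExecutor through the platform-agnostic StreamExecutor wrapper, so the helper below is purely illustrative.

#include <cstdint>

#include "tensorflow/stream_executor/gpu/gpu_executor.h"

// Sketch only: the inline overrides above forward straight to GpuDriver with
// the executor's GpuContext.
void* AllocateUnifiedSketch(stream_executor::gpu::GpuExecutor* exec,
                            std::uint64_t bytes) {
  return exec->UnifiedMemoryAllocate(bytes);  // (supported on CUDA only)
}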
tensorflow/stream_executor/gpu/gpu_helpers.h (new file, 107 lines)
@ -0,0 +1,107 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Common helper functions used for dealing with CUDA API datatypes.
//
// These are typically placed here for use by multiple source components (for
// example, BLAS and executor components).

#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_

#include <stddef.h>
#include <complex>

#include "tensorflow/stream_executor/gpu/gpu_types.h"

namespace stream_executor {

template <typename ElemT>
class DeviceMemory;

namespace gpu {

// Converts a const DeviceMemory reference to its underlying typed pointer in
// CUDA device memory.
template <typename T>
const T* GpuMemory(const DeviceMemory<T>& mem) {
  return static_cast<const T*>(mem.opaque());
}

// Converts a (non-const) DeviceMemory pointer reference to its underlying
// typed pointer in CUDA device memory.
template <typename T>
T* GpuMemoryMutable(DeviceMemory<T>* mem) {
  return static_cast<T*>(mem->opaque());
}

static_assert(
    sizeof(std::complex<float>) == sizeof(GpuComplexType),
    "std::complex<float> and GpuComplexType should have the same size");
static_assert(offsetof(GpuComplexType, x) == 0,
              "The real part of GpuComplexType should appear first.");
static_assert(
    sizeof(std::complex<double>) == sizeof(GpuDoubleComplexType),
    "std::complex<double> and GpuDoubleComplexType should have the same "
    "size");
static_assert(offsetof(GpuDoubleComplexType, x) == 0,
              "The real part of GpuDoubleComplexType should appear first.");

// Type traits to get CUDA complex types from std::complex<>.

template <typename T>
struct GpuComplexT {
  typedef T type;
};

template <>
struct GpuComplexT<std::complex<float>> {
  typedef GpuComplexType type;
};

template <>
struct GpuComplexT<std::complex<double>> {
  typedef GpuDoubleComplexType type;
};

// Converts pointers of std::complex<> to pointers of
// GpuComplexType/GpuDoubleComplexType. No type conversion for non-complex
// types.

template <typename T>
inline const typename GpuComplexT<T>::type* GpuComplex(const T* p) {
  return reinterpret_cast<const typename GpuComplexT<T>::type*>(p);
}

template <typename T>
inline typename GpuComplexT<T>::type* GpuComplex(T* p) {
  return reinterpret_cast<typename GpuComplexT<T>::type*>(p);
}

// Converts values of std::complex<float/double> to values of
// GpuComplexType/GpuDoubleComplexType.
inline GpuComplexType GpuComplexValue(std::complex<float> val) {
  return {val.real(), val.imag()};
}

inline GpuDoubleComplexType GpuComplexValue(std::complex<double> val) {
  return {val.real(), val.imag()};
}

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
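For context, a minimal sketch (not part of this commit) of how a BLAS-style component might combine these helpers. `DoSomeBlasCall` is a hypothetical stand-in for a real library entry point, and error handling is omitted:

bool ScaleVector(Stream* stream, std::complex<float> alpha,
                 DeviceMemory<std::complex<float>>* x) {
  // Convert the host-side scalar to GpuComplexType (cuComplex on CUDA
  // builds, hipComplex on ROCm builds).
  GpuComplexType gpu_alpha = GpuComplexValue(alpha);
  // Reinterpret the typed device pointer as the matching GPU complex type;
  // the static_asserts above guarantee the layouts agree.
  GpuComplexType* device_x = GpuComplex(GpuMemoryMutable(x));
  // DoSomeBlasCall is hypothetical, shown only to illustrate the call shape.
  return DoSomeBlasCall(AsGpuStreamValue(stream), &gpu_alpha, device_x,
                        x->ElementCount());
}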
105 tensorflow/stream_executor/gpu/gpu_kernel.h Normal file
@ -0,0 +1,105 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_

#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace stream_executor {
namespace gpu {

// Wraps a GpuFunctionHandle to implement the platform-independent
// KernelInterface.
class GpuKernel : public internal::KernelInterface {
 public:
  GpuKernel()
      : gpu_function_(nullptr),
        arity_(0),
        preferred_cache_config_(KernelCacheConfig::kNoPreference) {}

  // Note that the function is unloaded when the module is unloaded, and the
  // module that the function is contained in is owned by the GpuExecutor.
  ~GpuKernel() override {}

  // As arity cannot be reflected upon using the CUDA API, the arity is
  // explicitly set during the GpuExecutor::GetKernel initialization process.
  void set_arity(unsigned arity) { arity_ = arity; }
  unsigned Arity() const override { return arity_; }

  // Returns the GpuFunctionHandle value for passing to the CUDA API.
  GpuFunctionHandle AsGpuFunctionHandle() const {
    DCHECK(gpu_function_ != nullptr);
    return const_cast<GpuFunctionHandle>(gpu_function_);
  }

  // Returns the slot that the GpuFunctionHandle is stored within for this
  // object, for the CUDA API which wants to load into a GpuFunctionHandle*.
  GpuFunctionHandle* gpu_function_ptr() { return &gpu_function_; }

  // CUDA supports setting the preferred cache configuration of a
  // GpuFunctionHandle (more-or-less equivalent to a GpuKernel). We support
  // this via the below functions; users can set a preference, and that is
  // applied when the kernel is [lazy-]loaded (in GpuExecutor::Launch). The
  // alternative would be to load the kernel & set the preference when the
  // user calls the setter below; either approach is valid.
  //
  // Sets the current kernel cache configuration preference.
  void SetPreferredCacheConfig(KernelCacheConfig config) override {
    preferred_cache_config_ = config;
  }

  // Returns the current kernel cache configuration preference.
  KernelCacheConfig GetPreferredCacheConfig() const override {
    return preferred_cache_config_;
  }

  // Returns the current kernel cache configuration preference as a
  // CUfunc_cache.
  GpuFuncCachePreference GetGpuCacheConfig() const;

 private:
  GpuFunctionHandle gpu_function_;  // Wrapped CUDA kernel handle.
  unsigned arity_;  // Number of formal parameters the kernel takes.

  // Preferred (but not required) cache configuration for this kernel.
  KernelCacheConfig preferred_cache_config_;
};

// Given a platform-independent kernel datatype, returns the (const) internal
// CUDA platform implementation pointer.
inline const GpuKernel* AsGpuKernel(const KernelBase* kernel) {
  return static_cast<const GpuKernel*>(kernel->implementation());
}

// Given a platform-independent kernel datatype, returns the (non-const)
// internal CUDA platform implementation pointer.
inline GpuKernel* AsGpuKernel(KernelBase* kernel) {
  return static_cast<GpuKernel*>(kernel->implementation());
}

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
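For context, a minimal sketch (not part of this commit) of the launch path these accessors support. The GpuDriver::LaunchKernel call appears only in a comment because its full parameter list (grid/block dimensions, shared memory size, argument array) is elided here:

void LaunchSketch(KernelBase* kernel, Stream* stream) {
  // Unwrap the platform-independent kernel to its GPU implementation.
  GpuKernel* gpu_kernel = AsGpuKernel(kernel);
  // The raw handle is what the CUDA/ROCm driver API actually launches.
  GpuFunctionHandle func = gpu_kernel->AsGpuFunctionHandle();
  GpuStreamHandle gpu_stream = AsGpuStreamValue(stream);
  // ... hand `func` and `gpu_stream` to the driver wrapper, e.g.:
  // GpuDriver::LaunchKernel(context, func, grid_x, grid_y, grid_z,
  //                         block_x, block_y, block_z, shared_mem_bytes,
  //                         gpu_stream, kernel_params, /*extra=*/nullptr);
}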
125 tensorflow/stream_executor/gpu/gpu_rng.h Normal file
@ -0,0 +1,125 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_

#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rng.h"

#include "tensorflow/stream_executor/gpu/gpu_types.h"

namespace stream_executor {

class Stream;
template <typename ElemT>
class DeviceMemory;

namespace gpu {

// Opaque and unique identifier for the GPU RNG plugin.
extern const PluginId kGpuRandPlugin;

class GpuExecutor;

// GPU-platform implementation of the random number generation support
// interface.
//
// Thread-safe post-initialization.
class GpuRng : public rng::RngSupport {
 public:
  explicit GpuRng(GpuExecutor* parent);

  // Retrieves a gpu rng library generator handle. This is necessary for
  // enqueuing random number generation work onto the device.
  // TODO(leary) provide a way for users to select the RNG algorithm.
  bool Init();

  // Releases a gpu rng library generator handle, if one was acquired.
  ~GpuRng() override;

  // See rng::RngSupport for details on the following overrides.
  bool DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) override;
  bool DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) override;
  bool DoPopulateRandUniform(Stream* stream,
                             DeviceMemory<std::complex<float>>* v) override;
  bool DoPopulateRandUniform(Stream* stream,
                             DeviceMemory<std::complex<double>>* v) override;
  bool DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
                              DeviceMemory<float>* v) override;
  bool DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
                              DeviceMemory<double>* v) override;

  bool SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) override;

 private:
  // Actually performs the work of generating random numbers - the public
  // methods are thin wrappers to this interface.
  template <typename T>
  bool DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v);
  template <typename ElemT, typename FuncT>
  bool DoPopulateRandGaussianInternal(Stream* stream, ElemT mean, ElemT stddev,
                                      DeviceMemory<ElemT>* v, FuncT func);

  // Sets the stream for the internal gpu rng generator.
  //
  // This is a stateful operation, as the handle can only have one stream set
  // at a given time, so it is usually performed right before enqueuing work
  // to do with random number generation.
  bool SetStream(Stream* stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);

  // Mutex that guards the gpu rng library handle for this device.
  mutex mu_;

  // GpuExecutor which instantiated this GpuRng.
  // Immutable post-initialization.
  GpuExecutor* parent_;

  // gpu rng library handle on the device.
  GpuRngHandle rng_ GUARDED_BY(mu_);

  SE_DISALLOW_COPY_AND_ASSIGN(GpuRng);
};

template <typename T>
string TypeString();

template <>
string TypeString<float>() {
  return "float";
}

template <>
string TypeString<double>() {
  return "double";
}

template <>
string TypeString<std::complex<float>>() {
  return "std::complex<float>";
}

template <>
string TypeString<std::complex<double>>() {
  return "std::complex<double>";
}

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
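For context, a minimal sketch (not part of this commit) of how a caller might drive this interface; it assumes `v` points at device memory the caller allocated, and ignores the TODO about algorithm selection:

bool FillUniform(GpuRng* rng, Stream* stream, DeviceMemory<float>* v) {
  // Init() creates the underlying curand/hiprand generator handle.
  if (!rng->Init()) {
    return false;
  }
  // Enqueues generation work onto the stream; false on library failure.
  return rng->DoPopulateRandUniform(stream, v);
}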
@ -1,4 +1,4 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,49 +13,49 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"

#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/stream.h"

namespace stream_executor {
namespace cuda {
namespace gpu {

bool CUDAStream::Init() {
  if (!CUDADriver::CreateStream(parent_->cuda_context(), &cuda_stream_)) {
bool GpuStream::Init() {
  if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) {
    return false;
  }
  return CUDADriver::CreateEvent(parent_->cuda_context(), &completed_event_,
                                 CUDADriver::EventFlags::kDisableTiming)
  return GpuDriver::CreateEvent(parent_->gpu_context(), &completed_event_,
                                GpuDriver::EventFlags::kDisableTiming)
      .ok();
}

void CUDAStream::Destroy() {
void GpuStream::Destroy() {
  if (completed_event_ != nullptr) {
    port::Status status =
        CUDADriver::DestroyEvent(parent_->cuda_context(), &completed_event_);
        GpuDriver::DestroyEvent(parent_->gpu_context(), &completed_event_);
    if (!status.ok()) {
      LOG(ERROR) << status.error_message();
    }
  }

  CUDADriver::DestroyStream(parent_->cuda_context(), &cuda_stream_);
  GpuDriver::DestroyStream(parent_->gpu_context(), &gpu_stream_);
}

bool CUDAStream::IsIdle() const {
  return CUDADriver::IsStreamIdle(parent_->cuda_context(), cuda_stream_);
bool GpuStream::IsIdle() const {
  return GpuDriver::IsStreamIdle(parent_->gpu_context(), gpu_stream_);
}

CUDAStream *AsCUDAStream(Stream *stream) {
GpuStream* AsGpuStream(Stream* stream) {
  DCHECK(stream != nullptr);
  return static_cast<CUDAStream *>(stream->implementation());
  return static_cast<GpuStream*>(stream->implementation());
}

CUstream AsCUDAStreamValue(Stream *stream) {
GpuStreamHandle AsGpuStreamValue(Stream* stream) {
  DCHECK(stream != nullptr);
  return AsCUDAStream(stream)->cuda_stream();
  return AsGpuStream(stream)->gpu_stream();
}

}  // namespace cuda
}  // namespace gpu
}  // namespace stream_executor
96 tensorflow/stream_executor/gpu/gpu_stream.h Normal file
@ -0,0 +1,96 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Defines the GpuStream type - the CUDA-specific implementation of the generic
// StreamExecutor Stream interface.

#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_

#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace stream_executor {
namespace gpu {

class GpuExecutor;

// Wraps a GpuStreamHandle in order to satisfy the platform-independent
// StreamInterface.
//
// Thread-safe post-initialization.
class GpuStream : public internal::StreamInterface {
 public:
  explicit GpuStream(GpuExecutor* parent)
      : parent_(parent), gpu_stream_(nullptr), completed_event_(nullptr) {}

  // Note: teardown is handled by a parent's call to DeallocateStream.
  ~GpuStream() override {}

  void* GpuStreamHack() override { return gpu_stream_; }
  void** GpuStreamMemberHack() override {
    return reinterpret_cast<void**>(&gpu_stream_);
  }

  // Explicitly initialize the CUDA resources associated with this stream, used
  // by StreamExecutor::AllocateStream().
  bool Init();

  // Explicitly destroy the CUDA resources associated with this stream, used by
  // StreamExecutor::DeallocateStream().
  void Destroy();

  // Returns true if no work is pending or executing on the stream.
  bool IsIdle() const;

  // Retrieves an event which indicates that all work enqueued into the stream
  // has completed. Ownership of the event is not transferred to the caller;
  // the event is owned by this stream.
  GpuEventHandle* completed_event() { return &completed_event_; }

  // Returns the GpuStreamHandle value for passing to the CUDA API.
  //
  // Precond: this GpuStream has been allocated (otherwise passing a nullptr
  // into the NVIDIA library causes difficult-to-understand faults).
  GpuStreamHandle gpu_stream() const {
    DCHECK(gpu_stream_ != nullptr);
    return const_cast<GpuStreamHandle>(gpu_stream_);
  }

  // TODO(timshen): Migrate away and remove this function.
  GpuStreamHandle cuda_stream() const { return gpu_stream(); }

  GpuExecutor* parent() const { return parent_; }

 private:
  GpuExecutor* parent_;         // Executor that spawned this stream.
  GpuStreamHandle gpu_stream_;  // Wrapped CUDA stream handle.

  // Event that indicates this stream has completed.
  GpuEventHandle completed_event_ = nullptr;
};

// Helper functions to simplify extremely common flows.
// Converts a Stream to the underlying GpuStream implementation.
GpuStream* AsGpuStream(Stream* stream);

// Extracts a GpuStreamHandle from a GpuStream-backed Stream object.
GpuStreamHandle AsGpuStreamValue(Stream* stream);

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
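For context, a minimal sketch (not part of this commit) of the allocate/use/deallocate lifecycle this class supports, as it would be driven from a GpuExecutor:

bool StreamLifecycleSketch(GpuExecutor* parent) {
  GpuStream stream(parent);
  if (!stream.Init()) {  // Creates the stream plus its completion event.
    return false;
  }
  // ... enqueue work against stream.gpu_stream() via GpuDriver calls ...
  stream.Destroy();  // Releases the event, then the stream itself.
  return true;
}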
@ -1,4 +1,4 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -13,31 +13,31 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/cuda/cuda_timer.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"

#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/status.h"

namespace stream_executor {
namespace cuda {
namespace gpu {

bool CUDATimer::Init() {
bool GpuTimer::Init() {
  CHECK(start_event_ == nullptr && stop_event_ == nullptr);
  CudaContext* context = parent_->cuda_context();
  port::Status status = CUDADriver::CreateEvent(
      context, &start_event_, CUDADriver::EventFlags::kDefault);
  GpuContext* context = parent_->gpu_context();
  port::Status status = GpuDriver::CreateEvent(
      context, &start_event_, GpuDriver::EventFlags::kDefault);
  if (!status.ok()) {
    LOG(ERROR) << status;
    return false;
  }

  status = CUDADriver::CreateEvent(context, &stop_event_,
                                   CUDADriver::EventFlags::kDefault);
  status = GpuDriver::CreateEvent(context, &stop_event_,
                                  GpuDriver::EventFlags::kDefault);
  if (!status.ok()) {
    LOG(ERROR) << status;
    status = CUDADriver::DestroyEvent(context, &start_event_);
    status = GpuDriver::DestroyEvent(context, &start_event_);
    if (!status.ok()) {
      LOG(ERROR) << status;
    }
@ -48,47 +48,46 @@ bool CUDATimer::Init() {
  return true;
}

void CUDATimer::Destroy() {
  CudaContext* context = parent_->cuda_context();
  port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
void GpuTimer::Destroy() {
  GpuContext* context = parent_->gpu_context();
  port::Status status = GpuDriver::DestroyEvent(context, &start_event_);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }

  status = CUDADriver::DestroyEvent(context, &stop_event_);
  status = GpuDriver::DestroyEvent(context, &stop_event_);
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
}

float CUDATimer::GetElapsedMilliseconds() const {
float GpuTimer::GetElapsedMilliseconds() const {
  CHECK(start_event_ != nullptr && stop_event_ != nullptr);
  // TODO(leary) provide a way to query timer resolution?
  // CUDA docs say a resolution of about 0.5us
  float elapsed_milliseconds = NAN;
  (void)CUDADriver::GetEventElapsedTime(parent_->cuda_context(),
                                        &elapsed_milliseconds, start_event_,
                                        stop_event_);
  (void)GpuDriver::GetEventElapsedTime(
      parent_->gpu_context(), &elapsed_milliseconds, start_event_, stop_event_);
  return elapsed_milliseconds;
}

bool CUDATimer::Start(CUDAStream* stream) {
  port::Status status = CUDADriver::RecordEvent(
      parent_->cuda_context(), start_event_, stream->cuda_stream());
bool GpuTimer::Start(GpuStream* stream) {
  port::Status status = GpuDriver::RecordEvent(
      parent_->gpu_context(), start_event_, stream->gpu_stream());
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}

bool CUDATimer::Stop(CUDAStream* stream) {
  port::Status status = CUDADriver::RecordEvent(
      parent_->cuda_context(), stop_event_, stream->cuda_stream());
bool GpuTimer::Stop(GpuStream* stream) {
  port::Status status = GpuDriver::RecordEvent(
      parent_->gpu_context(), stop_event_, stream->gpu_stream());
  if (!status.ok()) {
    LOG(ERROR) << status;
  }
  return status.ok();
}

}  // namespace cuda
}  // namespace gpu
}  // namespace stream_executor
90 tensorflow/stream_executor/gpu/gpu_timer.h Normal file
@ -0,0 +1,90 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Defines the GpuTimer type - the CUDA-specific implementation of the generic
// StreamExecutor Timer interface.

#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_

#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

namespace stream_executor {
namespace gpu {

class GpuExecutor;
class GpuStream;

// Wraps a pair of GpuEventHandles in order to satisfy the platform-independent
// TimerInterface -- both a start and a stop event are present which may be
// recorded in a stream.
class GpuTimer : public internal::TimerInterface {
 public:
  explicit GpuTimer(GpuExecutor* parent)
      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}

  // Note: teardown needs to be explicitly handled in this API by a call to
  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
  // TODO(csigg): Change to RAII.
  ~GpuTimer() override {}

  // Allocates the platform-specific pieces of the timer, called as part of
  // StreamExecutor::AllocateTimer().
  bool Init();

  // Deallocates the platform-specific pieces of the timer, called as part of
  // StreamExecutor::DeallocateTimer().
  void Destroy();

  // Records the "timer start" event at the current point in the stream.
  bool Start(GpuStream* stream);

  // Records the "timer stop" event at the current point in the stream.
  bool Stop(GpuStream* stream);

  // Returns the elapsed time, in milliseconds, between the start and stop
  // events.
  float GetElapsedMilliseconds() const;

  // See Timer::Microseconds().
  // TODO(leary) make this into an error code interface...
  uint64 Microseconds() const override {
    return GetElapsedMilliseconds() * 1e3;
  }

  // See Timer::Nanoseconds().
  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }

 private:
  GpuExecutor* parent_;
  GpuEventHandle start_event_;  // Event recorded to indicate the "start"
                                // timestamp executing in a stream.
  GpuEventHandle stop_event_;   // Event recorded to indicate the "stop"
                                // timestamp executing in a stream.
};

struct GpuTimerDeleter {
  void operator()(GpuTimer* t) {
    t->Destroy();
    delete t;
  }
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
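For context, a minimal sketch (not part of this commit, and assuming <memory> is included) of timing a stretch of stream work with this class; note that real code must synchronize the stream before the elapsed time is read:

bool TimeRegionSketch(GpuExecutor* parent, GpuStream* stream) {
  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer(new GpuTimer(parent));
  if (!timer->Init() || !timer->Start(stream)) {
    return false;
  }
  // ... enqueue kernels on `stream` ...
  timer->Stop(stream);
  // Synchronize `stream` here before reading the elapsed time.
  VLOG(1) << "elapsed: " << timer->GetElapsedMilliseconds() << " ms";
  return true;
}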
84 tensorflow/stream_executor/gpu/gpu_types.h Normal file
@ -0,0 +1,84 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// GPU (ROCm / CUDA) specific type handle resolution

#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_

#if TENSORFLOW_USE_ROCM

#include "rocm/include/hip/hip_complex.h"
#include "rocm/include/hip/hip_runtime.h"
#include "rocm/include/hiprand/hiprand.h"

#else  // CUDA

#include "cuda/include/cuComplex.h"
#include "cuda/include/cuda.h"

// We cannot include curand.h here, because it triggers the #error in
// cuda/cuda_gpu_executor.cc (curand.h includes cuda_runtime.h), so we
// explicitly add the lone typedef we need from that file.
typedef struct curandGenerator_st* curandGenerator_t;

#endif

namespace stream_executor {
namespace gpu {

#if TENSORFLOW_USE_ROCM

using GpuStreamHandle = hipStream_t;
using GpuEventHandle = hipEvent_t;
using GpuFunctionHandle = hipFunction_t;
using GpuFunctionAttribute = hipDeviceAttribute_t;  // not a typo!
using GpuDeviceHandle = hipDevice_t;
using GpuDevicePtr = hipDeviceptr_t;
using GpuDeviceAttribute = hipDeviceAttribute_t;
using GpuDeviceProperty = hipDeviceProp_t;
using GpuModuleHandle = hipModule_t;
using GpuStatus = hipError_t;
using GpuFuncCachePreference = hipFuncCache_t;
using GpuSharedMemConfig = hipSharedMemConfig;
using GpuComplexType = hipComplex;
using GpuDoubleComplexType = hipDoubleComplex;
using GpuRngHandle = hiprandGenerator_t;

#else  // CUDA

using GpuStreamHandle = CUstream;
using GpuEventHandle = CUevent;
using GpuFunctionHandle = CUfunction;
using GpuFunctionAttribute = CUfunction_attribute;
using GpuDeviceHandle = CUdevice;
using GpuDevicePtr = CUdeviceptr;
using GpuDeviceAttribute = CUdevice_attribute;
using GpuDeviceProperty = CUdevprop;
using GpuModuleHandle = CUmodule;
using GpuStatus = CUresult;
using GpuFuncCachePreference = CUfunc_cache;
using GpuSharedMemConfig = CUsharedconfig;
using GpuComplexType = cuComplex;
using GpuDoubleComplexType = cuDoubleComplex;
using GpuRngHandle = curandGenerator_t;

#endif

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
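For context, a minimal sketch (not part of this commit) of how shared code can program against one alias while still branching on the underlying API where the libraries differ:

string GpuStatusToString(GpuStatus status) {
#if TENSORFLOW_USE_ROCM
  // GpuStatus is hipError_t here.
  return hipGetErrorString(status);
#else
  // GpuStatus is CUresult here.
  const char* msg = nullptr;
  cuGetErrorString(status, &msg);
  return msg != nullptr ? msg : "unknown CUDA error";
#endif
}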
@ -28,6 +28,8 @@ string PlatformKindString(PlatformKind kind) {
  switch (kind) {
    case PlatformKind::kCuda:
      return "CUDA";
    case PlatformKind::kROCm:
      return "ROCm";
    case PlatformKind::kOpenCL:
      return "OpenCL";
    case PlatformKind::kHost:
@ -52,6 +54,7 @@ PlatformKind PlatformKindFromString(string kind) {
bool PlatformIsRunnable(PlatformKind kind) {
  switch (kind) {
    case PlatformKind::kCuda:
    case PlatformKind::kROCm:
    case PlatformKind::kOpenCL:
    case PlatformKind::kHost:
      return true;
@ -63,6 +66,7 @@ bool PlatformIsRunnable(PlatformKind kind) {
bool PlatformIsRunnableOnDevice(PlatformKind kind) {
  switch (kind) {
    case PlatformKind::kCuda:
    case PlatformKind::kROCm:
    case PlatformKind::kOpenCL:
      return true;
    default:

@ -40,6 +40,7 @@ class StreamExecutor;
enum class PlatformKind {
  kInvalid,
  kCuda,
  kROCm,
  kOpenCL,
  kHost,
  kMock,
267 tensorflow/stream_executor/rocm/BUILD Normal file
@ -0,0 +1,267 @@
# Description:
#   ROCm-platform specific StreamExecutor support code.

licenses(["notice"])  # Apache 2.0

load("//tensorflow:tensorflow.bzl", "tf_cc_test")
load(
    "//tensorflow/stream_executor:build_defs.bzl",
    "stream_executor_friends",
)
load("//tensorflow:tensorflow.bzl", "tf_copts")
load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")

package_group(
    name = "friends",
    packages = stream_executor_friends(),
)

package(
    default_visibility = [":friends"],
)

# Filegroup used to collect source files for the dependency check.
filegroup(
    name = "c_srcs",
    data = glob([
        "**/*.cc",
        "**/*.h",
    ]),
)

cc_library(
    name = "rocm_diagnostics",
    srcs = if_rocm_is_configured(["rocm_diagnostics.cc"]),
    hdrs = [],
    deps = if_rocm_is_configured([
        "@com_google_absl//absl/container:inlined_vector",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
        "//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
    ]),
)

cc_library(
    name = "rocm_driver",
    srcs = if_rocm_is_configured(["rocm_driver.cc"]),
    hdrs = [],
    deps = if_rocm_is_configured([
        ":rocm_diagnostics",
        "@com_google_absl//absl/base",
        "@com_google_absl//absl/container:inlined_vector",
        "@com_google_absl//absl/strings",
        "//tensorflow/stream_executor:device_options",
        "//tensorflow/stream_executor/gpu:gpu_driver_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
        "@local_config_rocm//rocm:rocm_headers",
    ]),
)

cc_library(
    name = "rocm_event",
    srcs = if_rocm_is_configured(["rocm_event.cc"]),
    hdrs = [],
    deps = if_rocm_is_configured([
        ":rocm_driver",
        "//tensorflow/stream_executor:stream_executor_headers",
        "//tensorflow/stream_executor/gpu:gpu_event_header",
        "//tensorflow/stream_executor/gpu:gpu_executor_header",
        "//tensorflow/stream_executor/gpu:gpu_stream_header",
        "//tensorflow/stream_executor/lib",
    ]),
)

cc_library(
    name = "rocm_gpu_executor",
    srcs = if_rocm_is_configured(["rocm_gpu_executor.cc"]),
    hdrs = [],
    deps = if_rocm_is_configured([
        ":rocm_diagnostics",
        ":rocm_driver",
        ":rocm_event",
        ":rocm_kernel",
        ":rocm_platform_id",
        "@com_google_absl//absl/strings",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:plugin_registry",
        "//tensorflow/stream_executor:stream_executor_internal",
        "//tensorflow/stream_executor:stream_executor_pimpl_header",
        "//tensorflow/stream_executor:timer",
        "//tensorflow/stream_executor/gpu:gpu_activation_header",
        "//tensorflow/stream_executor/gpu:gpu_event",
        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
        "//tensorflow/stream_executor/gpu:gpu_stream",
        "//tensorflow/stream_executor/gpu:gpu_timer",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
        "//tensorflow/stream_executor/platform:dso_loader",
    ]),
    alwayslink = True,
)

cc_library(
    name = "rocm_kernel",
    srcs = if_rocm_is_configured(["rocm_kernel.cc"]),
    hdrs = [],
    visibility = ["//visibility:public"],
    deps = if_rocm_is_configured([
        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
    ]),
    alwayslink = True,
)

cc_library(
    name = "rocm_platform",
    srcs = if_rocm_is_configured(["rocm_platform.cc"]),
    hdrs = if_rocm_is_configured(["rocm_platform.h"]),
    visibility = ["//visibility:public"],
    deps = if_rocm_is_configured([
        ":rocm_driver",
        ":rocm_gpu_executor",
        ":rocm_platform_id",
        "//tensorflow/stream_executor",  # buildcleaner: keep
        "//tensorflow/stream_executor:executor_cache",
        "//tensorflow/stream_executor:multi_platform_manager",
        "//tensorflow/stream_executor:stream_executor_pimpl_header",
        "//tensorflow/stream_executor/lib",
        "//tensorflow/stream_executor/platform",
    ]),
    alwayslink = True,  # Registers itself with the MultiPlatformManager.
)

cc_library(
    name = "rocm_platform_id",
    srcs = ["rocm_platform_id.cc"],
    hdrs = ["rocm_platform_id.h"],
    deps = ["//tensorflow/stream_executor:platform"],
)
# FIXME: enable in future PRs
#cc_library(
#    name = "rocblas_plugin",
#    srcs = ["rocm_blas.cc"],
#    hdrs = ["rocm_blas.h"],
#    visibility = ["//visibility:public"],
#    deps = [
#        ":rocm_gpu_executor",
#        ":rocm_platform_id",
#        "//third_party/eigen3",
#        "//tensorflow/core:lib_internal",
#        "//tensorflow/stream_executor",
#        "//tensorflow/stream_executor:event",
#        "//tensorflow/stream_executor:host_or_device_scalar",
#        "//tensorflow/stream_executor:plugin_registry",
#        "//tensorflow/stream_executor:scratch_allocator",
#        "//tensorflow/stream_executor:timer",
#        "//tensorflow/stream_executor/gpu:gpu_activation_header",
#        "//tensorflow/stream_executor/gpu:gpu_stream_header",
#        "//tensorflow/stream_executor/gpu:gpu_timer_header",
#        "//tensorflow/stream_executor/lib",
#        "//tensorflow/stream_executor/platform",
#        "//tensorflow/stream_executor/platform:dso_loader",
#        "@com_google_absl//absl/strings",
#        "@local_config_rocm//rocm:rocm_headers",
#    ] + if_static(["@local_config_rocm//rocm:rocblas"]),
#    alwayslink = True,
#)

# FIXME: enable in future PRs
#cc_library(
#    name = "rocfft_plugin",
#    srcs = ["rocm_fft.cc"],
#    hdrs = [],
#    visibility = ["//visibility:public"],
#    deps = [
#        ":rocm_platform_id",
#        "//tensorflow/stream_executor:event",
#        "//tensorflow/stream_executor:fft",
#        "//tensorflow/stream_executor:plugin_registry",
#        "//tensorflow/stream_executor:scratch_allocator",
#        "//tensorflow/stream_executor/gpu:gpu_stream_header",
#        "//tensorflow/stream_executor/lib",
#        "//tensorflow/stream_executor/platform",
#        "//tensorflow/stream_executor/platform:dso_loader",
#        "@local_config_rocm//rocm:rocm_headers",
#    ] + if_static(["@local_config_rocm//rocm:rocfft"]),
#    alwayslink = True,
#)

# FIXME: enable in future PRs
#cc_library(
#    name = "miopen_plugin",
#    srcs = ["rocm_dnn.cc"],
#    hdrs = [],
#    copts = [
#        # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
#        # setting of template depth 256
#        "-ftemplate-depth-512",
#    ],
#    visibility = ["//visibility:public"],
#    deps = [
#        ":rocm_diagnostics",
#        ":rocm_driver",
#        ":rocm_gpu_executor",
#        ":rocm_platform_id",
#        "//third_party/eigen3",
#        "//tensorflow/core:lib",
#        "//tensorflow/core:lib_internal",
#        "//tensorflow/core:logger",
#        "//tensorflow/stream_executor:dnn",
#        "//tensorflow/stream_executor:event",
#        "//tensorflow/stream_executor:logging_proto_cc",
#        "//tensorflow/stream_executor:plugin_registry",
#        "//tensorflow/stream_executor:scratch_allocator",
#        "//tensorflow/stream_executor:stream_executor_pimpl_header",
#        "//tensorflow/stream_executor:temporary_device_memory",
#        "//tensorflow/stream_executor/gpu:gpu_activation_header",
#        "//tensorflow/stream_executor/gpu:gpu_stream_header",
#        "//tensorflow/stream_executor/gpu:gpu_timer_header",
#        "//tensorflow/stream_executor/lib",
#        "//tensorflow/stream_executor/platform",
#        "//tensorflow/stream_executor/platform:dso_loader",
#        "@com_google_absl//absl/strings",
#        "@local_config_rocm//rocm:rocm_headers",
#    ] + tf_additional_miopen_plugin_deps() + if_static(["@local_config_rocm//rocm:miopen"]),
#    alwayslink = True,
#)

# FIXME: enable in future PRs
#cc_library(
#    name = "rocrand_plugin",
#    srcs = ["rocm_rng.cc"],
#    hdrs = [],
#    deps = [
#        ":rocm_gpu_executor",
#        ":rocm_platform_id",
#        "@local_config_rocm//rocm:rocm_headers",
#        "//tensorflow/stream_executor:event",
#        "//tensorflow/stream_executor:plugin_registry",
#        "//tensorflow/stream_executor:rng",
#        "//tensorflow/stream_executor/gpu:gpu_activation_header",
#        "//tensorflow/stream_executor/gpu:gpu_stream_header",
#        "//tensorflow/stream_executor/lib",
#        "//tensorflow/stream_executor/platform",
#        "//tensorflow/stream_executor/platform:dso_loader",
#    ] + if_static(["@local_config_rocm//rocm:curand"]),
#    alwayslink = True,
#)

cc_library(
    name = "all_runtime",
    copts = tf_copts(),
    visibility = ["//visibility:public"],
    deps = if_rocm_is_configured([
        # FIXME: enable in future PRs
        #":miopen_plugin",
        #":rocfft_plugin",
        #":rocblas_plugin",
        #":rocrand_plugin",
        ":rocm_driver",
        ":rocm_platform",
    ]),
    alwayslink = 1,
)
234 tensorflow/stream_executor/rocm/rocm_diagnostics.cc Normal file
@ -0,0 +1,234 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <dirent.h>

#include <limits.h>
#include <link.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/sysmacros.h>
#include <unistd.h>
#include <algorithm>
#include <memory>
#include <vector>

#include "absl/container/inlined_vector.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/str_util.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/platform/logging.h"

namespace stream_executor {
namespace gpu {

string DriverVersionToString(DriverVersion version) {
  return absl::StrFormat("%d.%d.%d", std::get<0>(version),
                         std::get<1>(version), std::get<2>(version));
}

string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
  if (!version.ok()) {
    return version.status().ToString();
  }

  return DriverVersionToString(version.ValueOrDie());
}

port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
  std::vector<string> pieces = port::Split(value, '.');
  if (pieces.size() != 2 && pieces.size() != 3) {
    return port::Status{port::error::INVALID_ARGUMENT,
                        absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
                                        "for driver version; got \"%s\"",
                                        value.c_str())};
  }

  int major;
  int minor;
  int patch = 0;
  if (!port::safe_strto32(pieces[0], &major)) {
    return port::Status{
        port::error::INVALID_ARGUMENT,
        absl::StrFormat("could not parse major version number \"%s\" as an "
                        "integer from string \"%s\"",
                        pieces[0].c_str(), value.c_str())};
  }
  if (!port::safe_strto32(pieces[1], &minor)) {
    return port::Status{
        port::error::INVALID_ARGUMENT,
        absl::StrFormat("could not parse minor version number \"%s\" as an "
                        "integer from string \"%s\"",
                        pieces[1].c_str(), value.c_str())};
  }
  if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
    return port::Status{
        port::error::INVALID_ARGUMENT,
        absl::StrFormat("could not parse patch version number \"%s\" as an "
                        "integer from string \"%s\"",
                        pieces[2].c_str(), value.c_str())};
  }

  DriverVersion result{major, minor, patch};
  VLOG(2) << "version string \"" << value << "\" made value "
          << DriverVersionToString(result);
  return result;
}

// -- class Diagnostician

string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
  return absl::StrCat("/dev/kfd", dev_node_ordinal);
}

void Diagnostician::LogDiagnosticInformation() {
  LOG(INFO) << "retrieving ROCM diagnostic information for host: "
            << port::Hostname();

  LogDriverVersionInformation();
}

/* static */ void Diagnostician::LogDriverVersionInformation() {
  LOG(INFO) << "hostname: " << port::Hostname();
  if (VLOG_IS_ON(1)) {
    const char* value = getenv("LD_LIBRARY_PATH");
    string library_path = value == nullptr ? "" : value;
    VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";

    std::vector<string> pieces = port::Split(library_path, ':');
    for (const auto& piece : pieces) {
      if (piece.empty()) {
        continue;
      }
      DIR* dir = opendir(piece.c_str());
      if (dir == nullptr) {
        VLOG(1) << "could not open \"" << piece << "\"";
        continue;
      }
      while (dirent* entity = readdir(dir)) {
        VLOG(1) << piece << " :: " << entity->d_name;
      }
      closedir(dir);
    }
  }
  port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
  LOG(INFO) << "librocm reported version is: "
            << DriverVersionStatusToString(dso_version);

  port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
  LOG(INFO) << "kernel reported version is: "
            << DriverVersionStatusToString(kernel_version);

  if (kernel_version.ok() && dso_version.ok()) {
    WarnOnDsoKernelMismatch(dso_version, kernel_version);
  }
}

// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
// driver-interfacing DSO version number. Returns it as a string.
port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
  port::StatusOr<DriverVersion> result{port::Status{
      port::error::NOT_FOUND,
      "was unable to find librocm.so DSO loaded into this program"}};

  // Callback used when iterating through DSOs. Looks for the
  // driver-interfacing DSO and yields its version number into the callback
  // data, when found.
  auto iterate_phdr = [](struct dl_phdr_info* info, size_t size,
                         void* data) -> int {
    if (strstr(info->dlpi_name, "librocm.so.1")) {
      VLOG(1) << "found DLL info with name: " << info->dlpi_name;
      char resolved_path[PATH_MAX] = {0};
      if (realpath(info->dlpi_name, resolved_path) == nullptr) {
        return 0;
      }
      VLOG(1) << "found DLL info with resolved path: " << resolved_path;
      const char* slash = rindex(resolved_path, '/');
      if (slash == nullptr) {
        return 0;
      }
      const char* so_suffix = ".so.";
      const char* dot = strstr(slash, so_suffix);
      if (dot == nullptr) {
        return 0;
      }
      string dso_version = dot + strlen(so_suffix);
      // TODO(b/22689637): Eliminate the explicit namespace if possible.
      auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
      auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
      *result = StringToDriverVersion(stripped_dso_version);
      return 1;
    }
    return 0;
  };

  dl_iterate_phdr(iterate_phdr, &result);

  return result;
}

port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
    const string& driver_version_file_contents) {
  static const char* kDriverFilePrelude = "Kernel Module ";
  size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
  if (offset == string::npos) {
    return port::Status{
        port::error::NOT_FOUND,
        absl::StrCat("could not find kernel module information in "
                     "driver version file contents: \"",
                     driver_version_file_contents, "\"")};
  }

  string version_and_rest = driver_version_file_contents.substr(
      offset + strlen(kDriverFilePrelude), string::npos);
  size_t space_index = version_and_rest.find(" ");
  auto kernel_version = version_and_rest.substr(0, space_index);
  // TODO(b/22689637): Eliminate the explicit namespace if possible.
  auto stripped_kernel_version =
      port::StripSuffixString(kernel_version, ".ld64");
  return StringToDriverVersion(stripped_kernel_version);
}

void Diagnostician::WarnOnDsoKernelMismatch(
    port::StatusOr<DriverVersion> dso_version,
    port::StatusOr<DriverVersion> kernel_version) {
  if (kernel_version.ok() && dso_version.ok() &&
      dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
    LOG(INFO) << "kernel version seems to match DSO: "
              << DriverVersionToString(kernel_version.ValueOrDie());
  } else {
    LOG(ERROR) << "kernel version "
               << DriverVersionStatusToString(kernel_version)
               << " does not match DSO version "
               << DriverVersionStatusToString(dso_version)
               << " -- cannot find working devices in this configuration";
  }
}

port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
  auto status = port::Status{port::error::UNIMPLEMENTED,
                             "kernel reported driver version not implemented"};
  return status;
}

}  // namespace gpu
}  // namespace stream_executor
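For context, the round trip these helpers provide (illustrative only, not part of this commit); DriverVersion is the tuple type from gpu_diagnostics.h, and a two-part version string such as "2.6" gets a zero patch level:

port::StatusOr<DriverVersion> v = StringToDriverVersion("2.6");
if (v.ok()) {
  // The missing patch component defaults to 0, so the string form is "2.6.0".
  CHECK_EQ("2.6.0", DriverVersionToString(v.ValueOrDie()));
}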
1365 tensorflow/stream_executor/rocm/rocm_driver.cc Normal file
File diff suppressed because it is too large.
46 tensorflow/stream_executor/rocm/rocm_event.cc Normal file
@ -0,0 +1,46 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/statusor.h"

namespace stream_executor {
namespace gpu {

Event::Status GpuEvent::PollForStatus() {
  port::StatusOr<hipError_t> status =
      GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
  if (!status.ok()) {
    LOG(ERROR) << "Error polling for event status: "
               << status.status().error_message();
    return Event::Status::kError;
  }

  switch (status.ValueOrDie()) {
    case hipSuccess:
      return Event::Status::kComplete;
    case hipErrorNotReady:
      return Event::Status::kPending;
    default:
      LOG(INFO) << "Error condition returned for event status: "
                << status.ValueOrDie();
      return Event::Status::kError;
  }
}

}  // namespace gpu
}  // namespace stream_executor
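For context, a minimal caller-side sketch (not part of this commit) of the status mapping above, with `event` assumed to be a GpuEvent* owned elsewhere; a busy-wait is shown only for brevity:

while (event->PollForStatus() == Event::Status::kPending) {
  // hipErrorNotReady maps to kPending; real code should back off or use a
  // host callback instead of spinning.
}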
976 tensorflow/stream_executor/rocm/rocm_gpu_executor.cc Normal file
@ -0,0 +1,976 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <unistd.h>

#include "absl/base/casts.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_event.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/gpu/gpu_timer.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/numbers.h"
#include "tensorflow/stream_executor/lib/path.h"
#include "tensorflow/stream_executor/lib/process_state.h"
#include "tensorflow/stream_executor/lib/ptr_util.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/lib/str_util.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/dso_loader.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/timer.h"

#ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
#error \
    "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
#endif

#ifdef __ROCM_RUNTIME_H__
#error \
    "ROCM runtime being included into ROCM GPU executor; should be driver only."
#endif

namespace stream_executor {
namespace gpu {

static GpuEvent* AsGpuEvent(Event* event) {
  DCHECK(event != nullptr);
  return static_cast<GpuEvent*>(event->implementation());
}

// Given a platform-independent timer datatype, returns the internal ROCM
// platform implementation pointer.
static GpuTimer* AsGpuTimer(Timer* timer) {
  DCHECK(timer != nullptr);
  return static_cast<GpuTimer*>(timer->implementation());
}

// Given const GPU memory, returns a librocm device pointer datatype, suitable
// for passing directly to librocm APIs.
//
// N.B. we must lose constness in order to pass a suitable type to the existing
// librocm APIs, so the caller should take care to only pass the result of const
// GPU memory conversions to librocm functions which will honor constness.
static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
  return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
}

// See description on const version above.
static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
  return AsROCmDevicePtr(*gpu_mem);
}

static GpuContext* GetGpuContext(Stream* stream) {
  return static_cast<GpuExecutor*>(stream->parent()->implementation())
      ->gpu_context();
}

GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
  CHECK(rocm_exec != nullptr);
  return rocm_exec->gpu_context();
}

GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
  return static_cast<GpuExecutor*>(stream_exec->implementation());
}

GpuExecutor::~GpuExecutor() {
  for (auto& it : disk_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  for (auto& it : in_memory_modules_) {
    GpuDriver::UnloadModule(context_, it.second);
  }
  if (context_ != nullptr) {
    GpuDriver::DestroyContext(context_);
  }
  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
}

bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
  mutex_lock lock{in_memory_modules_mu_};
  return UnloadGpuBinary(gpu_binary);
}

bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
  auto module_it = gpu_binary_to_module_.find(gpu_binary);
  if (gpu_binary_to_module_.end() == module_it) {
    VLOG(3) << "No loaded HSACO module for " << gpu_binary;
    return false;
  }
  auto& module = module_it->second.first;
  auto& refcount = module_it->second.second;
  VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
  if (--refcount == 0) {
    VLOG(3) << "Unloading HSACO module " << module;
    GpuDriver::UnloadModule(context_, module);
    gpu_binary_to_module_.erase(module_it);
  }
  return true;
}

void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
  LOG(FATAL) << "Feature not supported on ROCM platform (UnloadKernel)";
}

port::Status GpuExecutor::Init(int device_ordinal,
                               DeviceOptions device_options) {
  device_ordinal_ = device_ordinal;

  auto status = GpuDriver::Init();
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::GetDevice(device_ordinal_, &device_);
  if (!status.ok()) {
    return status;
  }

  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
                                    &context_);
  if (!status.ok()) {
    return status;
  }

  return GpuDriver::GetGpuISAVersion(&version_, device_);
}

bool GpuExecutor::FindOnDiskForComputeCapability(
    absl::string_view filename, absl::string_view canonical_suffix,
    string* found_filename) const {
  LOG(FATAL) << "Feature not supported on ROCM platform "
                "(FindOnDiskForComputeCapability)";
  return false;
}

bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
                                          absl::string_view canonical_suffix,
                                          string* found_filename) const {
  if (version_ == 0) {
    return false;
  }

  string cc_specific =
      absl::StrCat(filename, ".cc", version_, canonical_suffix);
  if (port::FileExists(cc_specific).ok()) {
    VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
            << cc_specific;
    *found_filename = cc_specific;
    return true;
  }

  VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
          << cc_specific;
  if (port::FileExists(string(filename)).ok()) {
    *found_filename = string(filename);
    return true;
  }

  return false;
}

// Returns the path to the running executable.
// N.B. Derived from //knowledge/smalltalk/background_kb.cc
// Arg: strip_exe: if true, remove the name of the executable itself from the
//                 returned string. Example: calling this from /usr/bin/foo
//                 would return /usr/bin.
static string GetBinaryDir(bool strip_exe) {
  char exe_path[PATH_MAX] = {0};
  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
  // Make sure it's null-terminated:
  exe_path[sizeof(exe_path) - 1] = 0;

  if (strip_exe) {
    // The exe is the last component of the path, so remove one component.
    string ret = exe_path;
    std::vector<string> components = port::Split(exe_path, '/');
    components.pop_back();
    return port::Join(components, "/");
  }
  return exe_path;
}

bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
                            KernelBase* kernel) {
  GpuKernel* rocm_kernel = AsGpuKernel(kernel);
  hipModule_t module = nullptr;
  const string* kernelname;

  const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
  bool has_cubin = spec.has_cuda_cubin_on_disk();
  if (has_cubin) {
    on_disk_spec = &spec.cuda_cubin_on_disk();
  }

  if (on_disk_spec != nullptr) {
    LOG(WARNING) << "loading ROCM kernel from disk is not supported";
    return false;
  } else if (spec.has_cuda_cubin_in_memory()) {
    kernelname = &spec.cuda_cubin_in_memory().kernelname();

    const char* hsaco = spec.cuda_cubin_in_memory().bytes();
    mutex_lock lock{in_memory_modules_mu_};
    module = in_memory_modules_[hsaco];

    if (module == nullptr) {
      if (!GpuDriver::LoadHsaco(context_, hsaco, &module)) {
        LOG(ERROR) << "failed to load HSACO\n";
        return false;
      }
      in_memory_modules_[hsaco] = module;
    }
  } else {
    LOG(WARNING) << "no method of loading ROCM kernel provided";
    return false;
  }

  VLOG(2) << "getting function " << *kernelname << " from module " << module;
  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                    rocm_kernel->gpu_function_ptr())) {
    return false;
  }

  // We have to trust the kernel loader spec arity because there doesn't appear
  // to be a way to reflect on the number of expected arguments w/the ROCM API.
  rocm_kernel->set_arity(spec.arity());

  KernelMetadata kernel_metadata;
  if (!GetKernelMetadata(rocm_kernel, &kernel_metadata)) {
    LOG(WARNING) << "Unable to get metadata for kernel " << *kernelname;
  }
  kernel->set_metadata(kernel_metadata);
  kernel->set_name(*kernelname);
  return true;
}

bool GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
                                    KernelMetadata* kernel_metadata) {
  int value = 0;
  // TODO(ROCm) implement this feature in HIP
  kernel_metadata->set_registers_per_thread(value);

  // TODO(ROCm) implement this feature in HIP
  kernel_metadata->set_shared_memory_bytes(value);

  return true;
}

bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
                         const BlockDim& block_dims, const KernelBase& kernel,
                         const KernelArgsArrayBase& args) {
  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
  GpuStreamHandle hipstream = AsGpuStreamValue(stream);
  const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
  hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();

  // Only perform/print the occupancy check once. Even just checking to see
  // whether we've done an occupancy check on this kernel before isn't free
  // (because we have to synchronize), so we only do this at -v 2+.
  if (VLOG_IS_ON(2)) {
    mutex_lock lock(launched_kernels_mu_);
    if (!launched_kernels_.count(hipfunc)) {
      VlogOccupancyInfo(kernel, thread_dims, block_dims);
      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
      // expose a kernel/module deallocation method.
      launched_kernels_.insert(hipfunc);
    }
  }

  if (rocm_kernel->GetPreferredCacheConfig() !=
      KernelCacheConfig::kNoPreference) {
    GpuDriver::FuncSetCacheConfig(hipfunc, rocm_kernel->GetGpuCacheConfig());
  }

  // Prepare kernargs: KernelArgsArrayBase holds pointers to the argument
  // values, so dereference them here before packing.
  std::vector<void*> kernargs;
  KernelArgIterator iter = args.arg_iterator();
  while (iter.has_next()) {
    KernelArg arg = iter.next();
    VLOG(2) << "*(arg.address): "
            << reinterpret_cast<void*>(
                   *static_cast<const uint64_t*>(arg.address));
    kernargs.push_back(
        reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
  }

  size_t size = sizeof(void*) * kernargs.size();
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};

  if (!GpuDriver::LaunchKernel(
          GetGpuContext(stream), hipfunc, block_dims.x, block_dims.y,
          block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
          args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config)) {
    LOG(ERROR) << "failed to launch ROCM kernel with args: "
               << args.number_of_arguments()
               << "; thread dim: " << thread_dims.ToString()
               << "; block dim: " << block_dims.ToString();
    return false;
  }

  return true;
}
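
The launch above relies on HIP's "extra" launch-config mechanism rather than the per-argument kernelParams array: every dereferenced argument value is packed into one contiguous buffer, and hipModuleLaunchKernel is told where that buffer is and how large it is. A minimal standalone sketch of the same mechanism, assuming a hypothetical kernel axpy(float a, float* x, float* y, int n) already loaded as a hipFunction_t (the kernel name and struct padding here are illustrative, not from this PR):

#include <hip/hip_runtime.h>

hipError_t LaunchAxpy(hipFunction_t axpy, hipStream_t stream, float a,
                      float* x, float* y, int n) {
  // All arguments packed into one buffer, in the kernel's argument order.
  struct {
    float a;
    float* x;
    float* y;
    int n;
  } args{a, x, y, n};
  size_t size = sizeof(args);
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size,
                    HIP_LAUNCH_PARAM_END};
  return hipModuleLaunchKernel(axpy, (n + 255) / 256, 1, 1,  // grid dims
                               256, 1, 1,                    // block dims
                               /*sharedMemBytes=*/0, stream,
                               /*kernelParams=*/nullptr, config);
}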

int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
                                    uint64 registers_per_thread,
                                    uint64 shared_memory_per_block,
                                    const ThreadDim& thread_dims,
                                    GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
  return 0;
}

int GpuExecutor::CompareOccupancy(int* initial_blocks,
                                  const DeviceDescription& device_description,
                                  uint64 registers_per_thread,
                                  uint64 shared_memory_per_block,
                                  const ThreadDim& thread_dims,
                                  GpuFunctionHandle func) {
  LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
  return 0;
}

bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
                             ModuleHandle* module_handle) {
  // In GpuExecutor we store the pointer to the HSACO binary as
  // ModuleHandle::id().
  hipModule_t hip_module = nullptr;
  // TODO(ROCm): Need generic term instead of cubin/cuda/ptx
  if (spec.has_cuda_cubin_in_memory()) {
    mutex_lock lock{in_memory_modules_mu_};
    if (!LoadModuleFromHsaco(
            reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
            &hip_module)) {
      return false;
    }
    *module_handle = ModuleHandle(const_cast<void*>(
        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
    return true;
  } else {
    LOG(ERROR) << "No HSACO binary found\n";
    return false;
  }
}

bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
  return false;
}

bool GpuExecutor::LoadModuleFromPtx(const char* ptx, hipModule_t* module) {
  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
  return false;
}

bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, hipModule_t* module) {
  uint64_t module_refcount;
  std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];

  if (*module == nullptr) {
    if (!GpuDriver::LoadHsaco(context_, hsaco, module)) {
      LOG(ERROR) << "failed to load HSACO\n";
      return false;
    }
    module_refcount = 1;
    VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
            << " as module " << *module;
  } else {
    ++module_refcount;
    VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
            << " is already loaded as module " << *module;
  }
  gpu_binary_to_module_[hsaco] = {*module, module_refcount};
  return true;
}
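
LoadModuleFromHsaco and UnloadGpuBinary together implement per-binary reference counting: loading the same HSACO image twice yields a single hipModule_t with a count of two, and the driver unload only happens when the count returns to zero. A self-contained sketch of that contract (Module and the map value are stand-ins for this sketch; the real code calls GpuDriver::LoadHsaco / GpuDriver::UnloadModule):

#include <cassert>
#include <cstdint>
#include <map>
#include <utility>

using Module = int;  // stand-in for hipModule_t in this sketch

std::map<const void*, std::pair<Module, uint64_t>> gpu_binary_to_module;

void Load(const void* hsaco) {
  auto& entry = gpu_binary_to_module[hsaco];
  if (entry.second++ == 0) entry.first = 42;  // real code: load via the driver
}

bool Unload(const void* hsaco) {
  auto it = gpu_binary_to_module.find(hsaco);
  if (it == gpu_binary_to_module.end()) return false;
  if (--it->second.second == 0) gpu_binary_to_module.erase(it);  // real unload
  return true;
}

int main() {
  const char hsaco[] = "binary";
  Load(hsaco);
  Load(hsaco);    // second load reuses the cached module
  Unload(hsaco);  // still resident, refcount drops to 1
  Unload(hsaco);  // refcount hits 0: module actually unloaded
  assert(gpu_binary_to_module.empty());
}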

// This is a non-essential operation; if there's a failure, proceed without
// logging an error. It's nearly certain that in case of failures, we'd never
// get here in the first place; these are very low-impact routines.
void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
                                    const ThreadDim& thread_dims,
                                    const BlockDim& block_dims) {
  // TODO(ROCm) implement this feature in HIP
}

void* GpuExecutor::Allocate(uint64 size) {
  return GpuDriver::DeviceAllocate(context_, size);
}

void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                                     uint64 size_bytes) {
  // offset and size are in bytes, so char* works as the pointer type.
  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
}

void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
  // ROCM "sub-buffers" are just pointer + offset, so no dealloc is necessary.
  if (!mem->is_sub_buffer()) {
    GpuDriver::DeviceDeallocate(context_, mem->opaque());
  }
}

bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
  if (location == nullptr || size == 0) {
    LOG(WARNING) << "attempting to register null or zero-sized memory: "
                 << location << "; size " << size;
  }
  VLOG(2) << "registering " << location << " size " << size;
  return GpuDriver::HostRegister(context_, location, size);
}

bool GpuExecutor::HostMemoryUnregister(void* location) {
  VLOG(2) << "unregistering " << location;
  return GpuDriver::HostUnregister(context_, location);
}

bool GpuExecutor::SynchronizeAllActivity() {
  return GpuDriver::SynchronizeContext(context_);
}

bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsROCmDevicePtr(location), 0x0, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
                                           0x0, size);
}

bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
                                    uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    // hipMemset reinterprets "value" as a uint8.
    uint8 byte_value = static_cast<uint8>(value);
    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                     (byte_value << 8) | byte_value;
    return GpuDriver::SynchronousMemsetUint32(
        context_, AsROCmDevicePtr(location), pattern, size / 4);
  }
  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
                                           value, size);
}
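
The 32-bit fast path above widens the memset byte into a word so that an aligned region can be written four bytes at a time. A worked example of the widening (illustrative only, unsigned casts added to keep the shifts well defined):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t byte_value = 0xAB;  // low byte of the requested memset value
  uint32_t pattern = (uint32_t{byte_value} << 24) |
                     (uint32_t{byte_value} << 16) |
                     (uint32_t{byte_value} << 8) | uint32_t{byte_value};
  assert(pattern == 0xABABABABu);
  // A 4-byte-aligned, 16-byte region is then covered by 16 / 4 = 4 word
  // stores instead of 16 byte stores.
}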

port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                            const void* host_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
                                         host_src, size);
}

port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
                                            const DeviceMemoryBase& gpu_src,
                                            uint64 size) {
  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
                                         AsROCmDevicePtr(gpu_src), size);
}

port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
                                         AsROCmDevicePtr(gpu_src), size);
}

bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
                          uint64 size) {
  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
      size % 4 == 0) {
    return Memset32(stream, location, 0x0, size);
  } else {
    return Memset(stream, location, 0x0, size);
  }
}

bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
                         uint8 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
                                            pattern, size,
                                            AsGpuStreamValue(stream));
}

bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
                           uint32 pattern, uint64 size) {
  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
          << " at location " << location << " with size " << size
          << " and pattern " << std::hex << pattern;
  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
        size % 4 == 0);
  return GpuDriver::AsynchronousMemsetUint32(
      context_, AsROCmDevicePtr(location), pattern, size / 4,
      AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
                         const DeviceMemoryBase& gpu_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
                                          AsROCmDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
                         const void* host_src, uint64 size) {
  return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
                                          host_src, size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
                                       DeviceMemoryBase* gpu_dst,
                                       const DeviceMemoryBase& gpu_src,
                                       uint64 size) {
  return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
                                          AsROCmDevicePtr(gpu_src), size,
                                          AsGpuStreamValue(stream));
}

bool GpuExecutor::HostCallback(Stream* stream,
                               std::function<port::Status()> callback) {
  auto callback_ptr = new std::function<void()>([callback]() {
    port::Status s = callback();
    if (!s.ok()) {
      LOG(WARNING) << "Host callback failed: " << s;
    }
  });
  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
                                      InternalHostCallback, callback_ptr);
}

/* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
                                                    hipError_t status,
                                                    void* data) {
  std::function<void()>* callback =
      reinterpret_cast<std::function<void()>*>(data);
  (*callback)();
  delete callback;
}
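
HostCallback and InternalHostCallback form the usual C-callback trampoline: a heap-allocated std::function is smuggled through the driver's void* user-data slot and deleted after its single invocation. The generic shape of the pattern, with RegisterWith standing in for the driver registration call (hypothetical name, sketch only):

#include <functional>
#include <utility>

using CCallback = void (*)(void* user_data);

void Trampoline(void* user_data) {
  auto* fn = static_cast<std::function<void()>*>(user_data);
  (*fn)();    // run the captured C++ callable
  delete fn;  // one-shot semantics: freed after the single call
}

void RegisterWith(CCallback cb, void* user_data);  // hypothetical C API

void Enqueue(std::function<void()> f) {
  RegisterWith(&Trampoline, new std::function<void()>(std::move(f)));
}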

port::Status GpuExecutor::AllocateEvent(Event* event) {
  return AsGpuEvent(event)->Init();
}

port::Status GpuExecutor::DeallocateEvent(Event* event) {
  return AsGpuEvent(event)->Destroy();
}

port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
  return AsGpuEvent(event)->Record(AsGpuStream(stream));
}

port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
                                   AsGpuEvent(event)->gpu_event())) {
    return port::Status::OK();
  } else {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat("error recording waiting for ROCM event on stream %p",
                        stream)};
  }
}

Event::Status GpuExecutor::PollForEventStatus(Event* event) {
  return AsGpuEvent(event)->PollForStatus();
}

bool GpuExecutor::AllocateStream(Stream* stream) {
  return AsGpuStream(stream)->Init();
}

void GpuExecutor::DeallocateStream(Stream* stream) {
  GpuStream* rocm_stream = AsGpuStream(stream);
  if (!rocm_stream->IsIdle()) {
    LOG(ERROR) << "Deallocating stream with pending work";
  }
  rocm_stream->Destroy();
}

bool GpuExecutor::AllocateTimer(Timer* timer) {
  return AsGpuTimer(timer)->Init();
}

void GpuExecutor::DeallocateTimer(Timer* timer) {
  AsGpuTimer(timer)->Destroy();
}

bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
  GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
                                   AsGpuStreamValue(other))
                .ok();
  if (!ok) {
    LOG(ERROR) << "failed to record completion event; "
                  "therefore, failed to create inter-stream dependency";
    return false;
  }

  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
                                      other_completed_event);
}

bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
}

bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
}

port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* GpuExecutor::CreateBlas() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::BlasFactory> status =
      registry->GetFactory<PluginRegistry::BlasFactory>(kROCmPlatformId,
                                                        plugin_config_.blas());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve BLAS factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

dnn::DnnSupport* GpuExecutor::CreateDnn() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::DnnFactory> status =
      registry->GetFactory<PluginRegistry::DnnFactory>(kROCmPlatformId,
                                                       plugin_config_.dnn());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve DNN factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

fft::FftSupport* GpuExecutor::CreateFft() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::FftFactory> status =
      registry->GetFactory<PluginRegistry::FftFactory>(kROCmPlatformId,
                                                       plugin_config_.fft());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve FFT factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

rng::RngSupport* GpuExecutor::CreateRng() {
  PluginRegistry* registry = PluginRegistry::Instance();
  port::StatusOr<PluginRegistry::RngFactory> status =
      registry->GetFactory<PluginRegistry::RngFactory>(kROCmPlatformId,
                                                       plugin_config_.rng());
  if (!status.ok()) {
    LOG(ERROR) << "Unable to retrieve RNG factory: "
               << status.status().error_message();
    return nullptr;
  }

  return status.ValueOrDie()(this);
}

// TODO(rspringer): Remove in b/18544742.
bool GpuExecutor::SupportsDnn() const { return true; }

bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
}

port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
  return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
}

SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
  port::StatusOr<hipSharedMemConfig> rocm_config =
      GpuDriver::ContextGetSharedMemConfig(context_);
  if (!rocm_config.ok()) {
    // Don't log; the failed call will log necessary output.
    return SharedMemoryConfig::kDefault;
  }

  switch (rocm_config.ValueOrDie()) {
    case hipSharedMemBankSizeDefault:
      return SharedMemoryConfig::kDefault;
    case hipSharedMemBankSizeFourByte:
      return SharedMemoryConfig::kFourByte;
    case hipSharedMemBankSizeEightByte:
      return SharedMemoryConfig::kEightByte;
    default:
      LOG(FATAL) << "Invalid shared memory configuration returned: "
                 << rocm_config.ValueOrDie();
  }
}

port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
    SharedMemoryConfig config) {
  hipSharedMemConfig rocm_config;
  switch (config) {
    case SharedMemoryConfig::kDefault:
      rocm_config = hipSharedMemBankSizeDefault;
      break;
    case SharedMemoryConfig::kFourByte:
      rocm_config = hipSharedMemBankSizeFourByte;
      break;
    case SharedMemoryConfig::kEightByte:
      rocm_config = hipSharedMemBankSizeEightByte;
      break;
    default:
      LOG(FATAL) << "Invalid shared memory configuration specified: "
                 << static_cast<int>(config);
  }
  return GpuDriver::ContextSetSharedMemConfig(context_, rocm_config);
}

bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
}

bool GpuExecutor::GetSymbol(const string& symbol_name,
                            ModuleHandle module_handle, void** mem,
                            size_t* bytes) {
  {  // give limited scope to mutex_lock
    mutex_lock lock{disk_modules_mu_};
    for (auto& it : disk_modules_) {
      if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
                                     reinterpret_cast<hipDeviceptr_t*>(mem),
                                     bytes)) {
        return true;
      }
    }
  }

  {  // give limited scope to mutex_lock
    mutex_lock lock{in_memory_modules_mu_};
    for (auto& it : in_memory_modules_) {
      if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
                                     reinterpret_cast<hipDeviceptr_t*>(mem),
                                     bytes)) {
        return true;
      }
    }
  }

  {  // give limited scope to mutex_lock
    mutex_lock lock{in_memory_modules_mu_};
    if (static_cast<bool>(module_handle)) {
      auto it = gpu_binary_to_module_.find(module_handle.id());
      CHECK(it != gpu_binary_to_module_.end());
      if (GpuDriver::GetModuleSymbol(
              context_, it->second.first, symbol_name.c_str(),
              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
        return true;
      }
    }

    for (auto& it : gpu_binary_to_module_) {
      if (GpuDriver::GetModuleSymbol(
              context_, it.second.first, symbol_name.c_str(),
              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
        return true;
      }
    }
  }

  LOG(INFO) << "Failed to find symbol in any modules: " << symbol_name;
  return false;
}

bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
  // we use BlockDims to express the dimensions of blocks within a grid
  // (as opposed to ThreadDim which expresses the dimensions of threads
  // within a block).
  int x, y, z;
  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
    return false;
  }

  block_dim_limit->x = x;
  block_dim_limit->y = y;
  block_dim_limit->z = z;
  return true;
}

bool GpuExecutor::SupportsBlas() const { return true; }

bool GpuExecutor::SupportsFft() const { return true; }

bool GpuExecutor::SupportsRng() const { return true; }

std::unique_ptr<internal::EventInterface>
GpuExecutor::CreateEventImplementation() {
  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
}

std::unique_ptr<internal::KernelInterface>
GpuExecutor::CreateKernelImplementation() {
  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
}

std::unique_ptr<internal::StreamInterface>
GpuExecutor::GetStreamImplementation() {
  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
}

std::unique_ptr<internal::TimerInterface>
GpuExecutor::GetTimerImplementation() {
  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
}

void* GpuExecutor::GpuContextHack() { return context_; }

GpuContext* GpuExecutor::gpu_context() { return context_; }

// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
// of SysFS. Returns -1 if it cannot.
//
// For anything more complicated/prod-focused than this, you'll likely want to
// turn to gsys' topology modeling.
static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
  // TODO(ROCm) implement this feature in HIP
  return 1;
}
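
The ROCm stub above still returns a fixed value; on the CUDA side the equivalent routine reads the node out of sysfs. A hedged sketch of what a Linux implementation might look like, assuming the usual /sys/bus/pci/devices/<pci_bus_id>/numa_node layout (not part of this PR):

#include <cstdio>
#include <string>

static int ReadNumaNodeFromSysfs(const std::string& pci_bus_id) {
  const std::string path =
      "/sys/bus/pci/devices/" + pci_bus_id + "/numa_node";
  FILE* file = fopen(path.c_str(), "r");
  if (file == nullptr) return -1;  // cannot determine the node
  int numa_node = -1;
  if (fscanf(file, "%d", &numa_node) != 1) numa_node = -1;
  fclose(file);
  return numa_node;  // sysfs itself reports -1 on single-node machines
}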

DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
  internal::DeviceDescriptionBuilder builder;

  {
    int driver_version = 0;
    (void)GpuDriver::GetDriverVersion(&driver_version);
    string augmented_driver_version = absl::StrFormat(
        "%d (%s)", driver_version,
        DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
    builder.set_driver_version(augmented_driver_version);
  }

  {
    string pci_bus_id = GpuDriver::GetPCIBusID(device_);

    // Lower the hex characters to match sysfs.
    pci_bus_id = port::Lowercase(pci_bus_id);
    builder.set_pci_bus_id(pci_bus_id);

    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
    builder.set_numa_node(numa_node);
  }

  hipDeviceProp_t prop;
  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal_)) {
    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);

    ThreadDim thread_dim_limit;
    thread_dim_limit.x = prop.maxThreadsDim[0];
    thread_dim_limit.y = prop.maxThreadsDim[1];
    thread_dim_limit.z = prop.maxThreadsDim[2];
    builder.set_thread_dim_limit(thread_dim_limit);

    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
    builder.set_clock_rate_ghz(clock_rate_ghz);
  }

  {
    bool ecc_enabled = false;
    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
    builder.set_ecc_enabled(ecc_enabled);
  }

  {
    uint64 device_memory_size = -1;
    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
    builder.set_device_memory_size(device_memory_size);
  }

  {
    BlockDim block_dim_limit;
    FillBlockDimLimit(&block_dim_limit);
    builder.set_block_dim_limit(block_dim_limit);
  }

  {
    string device_name;
    (void)GpuDriver::GetDeviceName(device_, &device_name);
    builder.set_name(device_name);
  }

  builder.set_platform_version(
      absl::StrCat("AMDGPU ISA version: gfx", version_));

  // TODO(leary) should be a way to query this from the driver, but this is
  // unlikely to change for us any time soon.
  builder.set_device_address_bits(64);

  builder.set_device_vendor("Advanced Micro Devices, Inc");
  builder.set_rocm_amdgpu_isa_version(version_);
  builder.set_shared_memory_per_core(
      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
  builder.set_shared_memory_per_block(
      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
  builder.set_core_count(
      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
  builder.set_threads_per_core_limit(
      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
  builder.set_registers_per_block_limit(
      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
  builder.set_threads_per_warp(
      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
  builder.set_registers_per_core_limit(64 * 1024);

  auto built = builder.Build();
  return built.release();
}

}  // namespace gpu

void initialize_rocm_gpu_executor() {
  *internal::MakeROCMExecutorImplementation() = [](const PluginConfig& config) {
    return new gpu::GpuExecutor{config};
  };
}

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {
  stream_executor::initialize_rocm_gpu_executor();
});

tensorflow/stream_executor/rocm/rocm_kernel.cc (new file, 38 lines)
@ -0,0 +1,38 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/gpu/gpu_kernel.h"

namespace stream_executor {
namespace gpu {

hipFuncCache_t GpuKernel::GetGpuCacheConfig() const {
  switch (preferred_cache_config_) {
    case KernelCacheConfig::kNoPreference:
      return hipFuncCachePreferNone;
    case KernelCacheConfig::kPreferShared:
      return hipFuncCachePreferShared;
    case KernelCacheConfig::kPreferL1:
      return hipFuncCachePreferL1;
    case KernelCacheConfig::kPreferEqual:
      return hipFuncCachePreferEqual;
    default:
      LOG(FATAL) << "Unknown KernelCacheConfig "
                 << static_cast<int32>(preferred_cache_config_);
  }
}

}  // namespace gpu
}  // namespace stream_executor

tensorflow/stream_executor/rocm/rocm_platform.cc (new file, 180 lines)
@ -0,0 +1,180 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/rocm/rocm_platform.h"

#include "absl/strings/str_format.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/ptr_util.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"

namespace stream_executor {
namespace gpu {

ROCmPlatform::ROCmPlatform()
    : name_("ROCM"), min_numa_node_(0), limit_numa_node_(0) {}

ROCmPlatform::~ROCmPlatform() {}

// Due to legacy issues in user code, we can't currently call InspectNumaNodes
// at module initialization time, because non-GPU programs still include this
// plugin via various methods, so instead, it has to be init-on-reference.
void ROCmPlatform::InspectNumaNodes() {
  // To get NUMA node information, we need to create all executors, so we can
  // examine their device descriptions to see their bus assignments.
  static bool initialized = false;
  static mutex numa_mutex(LINKER_INITIALIZED);
  mutex_lock lock(numa_mutex);
  if (initialized) {
    return;
  }

  StreamExecutorConfig config;
  for (int i = 0; i < VisibleDeviceCount(); i++) {
    config.ordinal = i;
    StreamExecutor* exec = GetExecutor(config).ValueOrDie();
    if (i == 0) {
      // NUMA nodes may not start at 0, so set the minimum node based on the
      // first executor we see.
      min_numa_node_ = exec->GetDeviceDescription().numa_node();
      limit_numa_node_ = min_numa_node_ + 1;
    } else {
      min_numa_node_ =
          std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
      limit_numa_node_ = std::max(limit_numa_node_,
                                  exec->GetDeviceDescription().numa_node() + 1);
    }
  }
  initialized = true;
}

int ROCmPlatform::BusCount() {
  InspectNumaNodes();
  return limit_numa_node_ - min_numa_node_;
}

int ROCmPlatform::DeviceToBus(int device_ordinal) {
  StreamExecutorConfig config;
  config.ordinal = device_ordinal;
  StreamExecutor* exec = GetExecutor(config).ValueOrDie();
  return exec->GetDeviceDescription().numa_node() - min_numa_node_;
}
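
A worked example of the node-to-bus mapping above, for a hypothetical machine with three GPUs sitting on NUMA nodes {1, 1, 2}:

// After InspectNumaNodes: min_numa_node_ == 1, limit_numa_node_ == 3, so
//   BusCount()     -> 3 - 1 == 2 distinct buses
//   DeviceToBus(0) -> 1 - 1 == 0   (node 1 maps to bus 0)
//   DeviceToBus(2) -> 2 - 1 == 1   (node 2 maps to bus 1)
// The mapping assumes the NUMA node space occupied by GPUs is dense.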

port::StatusOr<StreamExecutor*> ROCmPlatform::FirstExecutorForBus(
    int bus_ordinal) {
  InspectNumaNodes();
  CHECK_LT(bus_ordinal, BusCount()) << "bus ordinal out of available range";
  for (int i = 0; i < VisibleDeviceCount(); i++) {
    if (DeviceToBus(i) == bus_ordinal) {
      StreamExecutorConfig config;
      config.ordinal = i;
      return GetExecutor(config).ValueOrDie();
    }
  }

  return port::Status{
      port::error::NOT_FOUND,
      absl::StrFormat("Executor for bus %d not found.", bus_ordinal)};
}

Platform::Id ROCmPlatform::id() const { return kROCmPlatformId; }

int ROCmPlatform::VisibleDeviceCount() const {
  // Throw away the result - it logs internally, and this [containing] function
  // isn't in the path of user control. It's safe to call this > 1x.
  if (!gpu::GpuDriver::Init().ok()) {
    return -1;
  }

  return GpuDriver::GetDeviceCount();
}

const string& ROCmPlatform::Name() const { return name_; }

port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDevice(int ordinal) {
  StreamExecutorConfig config;
  config.ordinal = ordinal;
  config.plugin_config = PluginConfig();
  config.device_options = DeviceOptions::Default();
  return GetExecutor(config);
}

port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDeviceWithPluginConfig(
    int device_ordinal, const PluginConfig& plugin_config) {
  StreamExecutorConfig config;
  config.ordinal = device_ordinal;
  config.plugin_config = plugin_config;
  config.device_options = DeviceOptions::Default();
  return GetExecutor(config);
}

port::StatusOr<StreamExecutor*> ROCmPlatform::GetExecutor(
    const StreamExecutorConfig& config) {
  return executor_cache_.GetOrCreate(
      config, [&]() { return GetUncachedExecutor(config); });
}

port::StatusOr<std::unique_ptr<StreamExecutor>>
ROCmPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
  auto executor = MakeUnique<StreamExecutor>(
      this, MakeUnique<GpuExecutor>(config.plugin_config));
  auto init_status = executor->Init(config.ordinal, config.device_options);
  if (!init_status.ok()) {
    return port::Status{
        port::error::INTERNAL,
        absl::StrFormat(
            "failed initializing StreamExecutor for ROCM device ordinal %d: %s",
            config.ordinal, init_status.ToString().c_str())};
  }

  return std::move(executor);
}

void ROCmPlatform::RegisterTraceListener(
    std::unique_ptr<TraceListener> listener) {
  LOG(FATAL) << "not yet implemented: register ROCM trace listener";
}

void ROCmPlatform::UnregisterTraceListener(TraceListener* listener) {
  LOG(FATAL) << "not yet implemented: unregister ROCM trace listener";
}

}  // namespace gpu

static void InitializeROCmPlatform() {
  // Disabling leak checking, MultiPlatformManager does not destroy its
  // registered platforms.
  auto status = MultiPlatformManager::PlatformWithName("ROCM");
  if (!status.ok()) {
    std::unique_ptr<gpu::ROCmPlatform> platform(new gpu::ROCmPlatform);
    SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
  }
}

}  // namespace stream_executor

REGISTER_MODULE_INITIALIZER(rocm_platform,
                            stream_executor::InitializeROCmPlatform());

DECLARE_MODULE_INITIALIZER(multi_platform_manager);
// Note that module initialization sequencing is not supported in the
// open-source project, so this will be a no-op there.
REGISTER_MODULE_INITIALIZER_SEQUENCE(rocm_platform, multi_platform_manager);

tensorflow/stream_executor/rocm/rocm_platform.h (new file, 110 lines)
@ -0,0 +1,110 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_

#include <memory>
#include <vector>

#include "tensorflow/stream_executor/executor_cache.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/trace_listener.h"

namespace stream_executor {
namespace gpu {

// Opaque and unique identifier for the ROCM platform plugin.
// This is needed so that plugins can refer to/identify this platform without
// instantiating a ROCmPlatform object.
extern const Platform::Id kROCmPlatformId;

// ROCm-specific platform plugin, registered as a singleton value via module
// initializer.
class ROCmPlatform : public Platform {
 public:
  ROCmPlatform();
  ~ROCmPlatform() override;

  // ROCmPlatform-specific functionality
  // Returns the number of distinct buses / NUMA nodes on the machine.
  int BusCount();

  // Returns the bus/NUMA node for the specified device ordinal.
  int DeviceToBus(int device_ordinal);

  // Returns the lowest-ordinal-number StreamExecutor on the specified bus.
  port::StatusOr<StreamExecutor*> FirstExecutorForBus(int bus_ordinal);

  // Platform interface implementation:
  // Returns the same value as kROCmPlatformId above.
  Platform::Id id() const override;

  // Returns -1 as a sentinel on internal failure (and logs the error).
  int VisibleDeviceCount() const override;

  const string& Name() const override;

  port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;

  port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
      int ordinal, const PluginConfig& config) override;

  port::StatusOr<StreamExecutor*> GetExecutor(
      const StreamExecutorConfig& config) override;

  port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
      const StreamExecutorConfig& config) override;

  void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;

  void UnregisterTraceListener(TraceListener* listener) override;

 private:
  // Determines the number of NUMA nodes and the assignment of executors to
  // each.
  void InspectNumaNodes();

  // This platform's name.
  string name_;

  // Mutex that guards internal state.
  mutable mutex mu_;

  // Cache of created executors.
  ExecutorCache executor_cache_;

  // The smallest NUMA node value for any device managed by this machine
  // manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus
  // ordinals. The NUMA node space occupied by GPUs is assumed to be dense.
  int min_numa_node_;

  // Larger than the NUMA node value for any device managed by this machine
  // manager.
  int limit_numa_node_;

  SE_DISALLOW_COPY_AND_ASSIGN(ROCmPlatform);
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_

tensorflow/stream_executor/rocm/rocm_platform_id.cc (new file, 24 lines)
@ -0,0 +1,24 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"

namespace stream_executor {
namespace gpu {

PLATFORM_DEFINE_ID(kROCmPlatformId);

}  // namespace gpu
}  // namespace stream_executor

tensorflow/stream_executor/rocm/rocm_platform_id.h (new file, 34 lines)
@ -0,0 +1,34 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_

#include "tensorflow/stream_executor/platform.h"

namespace stream_executor {
namespace gpu {

// Opaque and unique identifier for the ROCm platform.
// This is needed so that plugins can refer to/identify this platform without
// instantiating a ROCmPlatform object.
// This is broken out here to avoid a circular dependency between ROCmPlatform
// and GpuExecutor.
extern const Platform::Id kROCmPlatformId;

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_

tensorflow/stream_executor/rocm/rocm_rng.cc (new file, 284 lines)
@ -0,0 +1,284 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "rocm/include/hiprand/hiprand.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/gpu/gpu_activation.h"
#include "tensorflow/stream_executor/gpu/gpu_executor.h"
#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
#include "tensorflow/stream_executor/gpu/gpu_rng.h"
#include "tensorflow/stream_executor/gpu/gpu_stream.h"
#include "tensorflow/stream_executor/lib/env.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/rng.h"
#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"

// Formats hiprandStatus_t to output prettified values into a log stream.
std::ostream& operator<<(std::ostream& in, const hiprandStatus_t& status) {
#define OSTREAM_HIPRAND_STATUS(__name) \
  case HIPRAND_STATUS_##__name:        \
    in << "HIPRAND_STATUS_" #__name;   \
    return in;

  switch (status) {
    OSTREAM_HIPRAND_STATUS(SUCCESS)
    OSTREAM_HIPRAND_STATUS(VERSION_MISMATCH)
    OSTREAM_HIPRAND_STATUS(NOT_INITIALIZED)
    OSTREAM_HIPRAND_STATUS(ALLOCATION_FAILED)
    OSTREAM_HIPRAND_STATUS(TYPE_ERROR)
    OSTREAM_HIPRAND_STATUS(OUT_OF_RANGE)
    OSTREAM_HIPRAND_STATUS(LENGTH_NOT_MULTIPLE)
    OSTREAM_HIPRAND_STATUS(LAUNCH_FAILURE)
    OSTREAM_HIPRAND_STATUS(PREEXISTING_FAILURE)
    OSTREAM_HIPRAND_STATUS(INITIALIZATION_FAILED)
    OSTREAM_HIPRAND_STATUS(ARCH_MISMATCH)
    OSTREAM_HIPRAND_STATUS(INTERNAL_ERROR)
    default:
      in << "hiprandStatus_t(" << static_cast<int>(status) << ")";
      return in;
  }
}

namespace stream_executor {
namespace gpu {

PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);

namespace wrap {

#define PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(__name)                     \
  struct WrapperShim__##__name {                                    \
    template <typename... Args>                                     \
    hiprandStatus_t operator()(GpuExecutor* parent, Args... args) { \
      gpu::ScopedActivateExecutorContext sac{parent};               \
      return ::__name(args...);                                     \
    }                                                               \
  } __name;

PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandCreateGenerator);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandDestroyGenerator);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetStream);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniform);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateUniformDouble);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetPseudoRandomGeneratorSeed);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetGeneratorOffset);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormal);
PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandGenerateNormalDouble);

}  // namespace wrap
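
For reference, here is what one of those wrappers expands to, written out by hand: calling wrap::hiprandSetStream(parent, rng, stream) activates parent's GPU context for the scope of the call, then forwards the remaining arguments to the real hipRAND entry point.

// Hand-written expansion of PERFTOOLS_GPUTOOLS_HIPRAND_WRAP(hiprandSetStream),
// shown for illustration only.
struct WrapperShim__hiprandSetStream {
  template <typename... Args>
  hiprandStatus_t operator()(GpuExecutor* parent, Args... args) {
    gpu::ScopedActivateExecutorContext sac{parent};  // bind parent's context
    return ::hiprandSetStream(args...);              // forward remaining args
  }
} hiprandSetStream;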
|
||||
|
||||
GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
|
||||
|
||||
GpuRng::~GpuRng() {
|
||||
if (rng_ != nullptr) {
|
||||
wrap::hiprandDestroyGenerator(parent_, rng_);
|
||||
}
|
||||
}
|
||||
|
||||
bool GpuRng::Init() {
|
||||
mutex_lock lock{mu_};
|
||||
CHECK(rng_ == nullptr);
|
||||
|
||||
hiprandStatus_t ret =
|
||||
wrap::hiprandCreateGenerator(parent_, &rng_, HIPRAND_RNG_PSEUDO_DEFAULT);
|
||||
if (ret != HIPRAND_STATUS_SUCCESS) {
|
||||
LOG(ERROR) << "failed to create random number generator: " << ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
CHECK(rng_ != nullptr);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool GpuRng::SetStream(Stream* stream) {
|
||||
hiprandStatus_t ret =
|
||||
wrap::hiprandSetStream(parent_, rng_, AsGpuStreamValue(stream));
|
||||
if (ret != HIPRAND_STATUS_SUCCESS) {
|
||||
LOG(ERROR) << "failed to set stream for random generation: " << ret;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns true if std::complex stores its contents as two consecutive
// elements. Tests int, float and double, as the last two are independent
// specializations.
constexpr bool ComplexIsConsecutiveFloats() {
  return sizeof(std::complex<int>) == 8 && sizeof(std::complex<float>) == 8 &&
         sizeof(std::complex<double>) == 16;
}

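// Illustrative consequence (a sketch, not part of this change): because
// std::complex<T> is laid out as two consecutive Ts, a buffer of N
// std::complex<float> values can be filled by generating 2*N uniform floats
// through the same hiprandGenerateUniform call used for plain float data
// (see element_count *= 2 below).
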
template <typename T>
bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
  mutex_lock lock{mu_};
  static_assert(ComplexIsConsecutiveFloats(),
                "std::complex values are not stored as consecutive values");

  if (!SetStream(stream)) {
    return false;
  }

  // std::complex<T> is currently implemented as two consecutive T variables.
  uint64 element_count = v->ElementCount();
  if (std::is_same<T, std::complex<float>>::value ||
      std::is_same<T, std::complex<double>>::value) {
    element_count *= 2;
  }

  hiprandStatus_t ret;
  if (std::is_same<T, float>::value ||
      std::is_same<T, std::complex<float>>::value) {
    ret = wrap::hiprandGenerateUniform(
        parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
        element_count);
  } else {
    ret = wrap::hiprandGenerateUniformDouble(
        parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
        element_count);
  }
  if (ret != HIPRAND_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount()
               << " " << TypeString<T>() << "s at " << v->opaque() << ": "
               << ret;
    return false;
  }

  return true;
}

bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
  return DoPopulateRandUniformInternal(stream, v);
}

bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
  return DoPopulateRandUniformInternal(stream, v);
}

bool GpuRng::DoPopulateRandUniform(Stream* stream,
                                   DeviceMemory<std::complex<float>>* v) {
  return DoPopulateRandUniformInternal(stream, v);
}

bool GpuRng::DoPopulateRandUniform(Stream* stream,
                                   DeviceMemory<std::complex<double>>* v) {
  return DoPopulateRandUniformInternal(stream, v);
}

template <typename ElemT, typename FuncT>
bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
                                            ElemT stddev,
                                            DeviceMemory<ElemT>* v,
                                            FuncT func) {
  mutex_lock lock{mu_};

  if (!SetStream(stream)) {
    return false;
  }

  uint64 element_count = v->ElementCount();
  hiprandStatus_t ret =
      func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);

  if (ret != HIPRAND_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
               << " " << TypeString<ElemT>() << "s at " << v->opaque() << ": "
               << ret;
    return false;
  }

  return true;
}

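// Illustrative dispatch (a sketch, not part of this change): the FuncT
// parameter lets the overloads below select the hipRAND kernel at the call
// site, e.g. for float data:
//   DoPopulateRandGaussianInternal(stream, mean, stddev, v,
//                                  wrap::hiprandGenerateNormal);
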
bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
                                    DeviceMemory<float>* v) {
  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
                                        wrap::hiprandGenerateNormal);
}

bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
                                    DeviceMemory<double>* v) {
  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
                                        wrap::hiprandGenerateNormalDouble);
}

bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
  mutex_lock lock{mu_};
  CHECK(rng_ != nullptr);

  if (!CheckSeed(seed, seed_bytes)) {
    return false;
  }

  if (!SetStream(stream)) {
    return false;
  }

  // Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above)
  // (which itself requires 16 for API consistency with host RNG fallbacks).
  hiprandStatus_t ret = wrap::hiprandSetPseudoRandomGeneratorSeed(
      parent_, rng_, *(reinterpret_cast<const uint64*>(seed)));
  if (ret != HIPRAND_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to set rng seed: " << ret;
    return false;
  }

  ret = wrap::hiprandSetGeneratorOffset(parent_, rng_, 0);
  if (ret != HIPRAND_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to reset rng position: " << ret;
    return false;
  }
  return true;
}

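// Illustrative seed handling (a sketch, not part of this change): only the
// first 8 bytes of the caller's seed reach hipRAND, reinterpreted as a
// host-endian uint64:
//   uint8 seed[16] = {/* hypothetical seed bytes */};
//   rng->SetSeed(stream, seed, sizeof(seed));  // uses bytes [0, 8) only
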
}  // namespace gpu
}  // namespace stream_executor

namespace se = ::stream_executor;

REGISTER_MODULE_INITIALIZER(register_hiprand, {
  se::port::Status status =
      se::PluginRegistry::Instance()
          ->RegisterFactory<se::PluginRegistry::RngFactory>(
              se::gpu::kROCmPlatformId, se::gpu::kGpuRandPlugin, "hipRAND",
              [](se::internal::StreamExecutorInterface* parent)
                  -> se::rng::RngSupport* {
                se::gpu::GpuExecutor* rocm_executor =
                    dynamic_cast<se::gpu::GpuExecutor*>(parent);
                if (rocm_executor == nullptr) {
                  LOG(ERROR)
                      << "Attempting to initialize an instance of the hipRAND "
                      << "support library with a non-ROCM StreamExecutor";
                  return nullptr;
                }

                se::gpu::GpuRng* rng = new se::gpu::GpuRng(rocm_executor);
                if (!rng->Init()) {
                  // Note: Init() will log a more specific error.
                  delete rng;
                  return nullptr;
                }
                return rng;
              });

  if (!status.ok()) {
    LOG(ERROR) << "Unable to register hipRAND factory: "
               << status.error_message();
  }

  se::PluginRegistry::Instance()->SetDefaultFactory(
      se::gpu::kROCmPlatformId, se::PluginKind::kRng, se::gpu::kGpuRandPlugin);
});

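// Illustrative effect (a sketch, not part of this change): after this
// initializer runs, requesting the default RNG plugin for kROCmPlatformId
// resolves to the factory registered above; conceptually,
//   se::rng::RngSupport* rng = factory(executor);  // nullptr on failure
// where the dynamic_cast guards against a StreamExecutor from another
// platform being handed to the hipRAND plugin.
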
@ -25,6 +25,13 @@ StreamExecutorFactory* MakeCUDAExecutorImplementation() {
  return &instance;
}

// -- ROCm

StreamExecutorFactory* MakeROCMExecutorImplementation() {
  static StreamExecutorFactory instance;
  return &instance;
}

// -- OpenCL

StreamExecutorFactory* MakeOpenCLExecutorImplementation() {

@ -374,9 +374,11 @@ using StreamFactory = std::function<StreamInterface *(StreamExecutor *)>;
using TimerFactory = std::function<TimerInterface *(StreamExecutor *)>;
using KernelFactory = std::function<KernelInterface*()>;

StreamExecutorFactory *MakeCUDAExecutorImplementation();

StreamExecutorFactory *MakeROCMExecutorImplementation();

StreamExecutorFactory *MakeOpenCLExecutorImplementation();

extern StreamExecutorFactory MakeHostExecutorImplementation;

@ -71,6 +71,9 @@ internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind(
    case PlatformKind::kCuda:
      factory = *internal::MakeCUDAExecutorImplementation();
      break;
    case PlatformKind::kROCm:
      factory = *internal::MakeROCMExecutorImplementation();
      break;
    case PlatformKind::kOpenCL:
      factory = *internal::MakeOpenCLExecutorImplementation();
      break;

@ -188,6 +191,8 @@ StreamExecutor::StreamExecutor(
      memory_limit_bytes_(GetMemoryLimitBytes()) {
  if (port::Lowercase(platform_->Name()) == "cuda") {
    platform_kind_ = PlatformKind::kCuda;
  } else if (port::Lowercase(platform_->Name()) == "rocm") {
    platform_kind_ = PlatformKind::kROCm;
  } else if (port::Lowercase(platform_->Name()) == "opencl") {
    platform_kind_ = PlatformKind::kOpenCL;
  } else if (port::Lowercase(platform_->Name()) == "host") {

third_party/gpus/rocm/BUILD.tpl
@ -18,6 +18,7 @@ cc_library(
    includes = [
        ".",
        "rocm/include",
        "rocm/include/rocrand",
    ],
    visibility = ["//visibility:public"],
)