Change 109695551 Update FAQ Change 109694725 Add a gradient for resize_bilinear op. Change 109694505 Don't mention variables module in docs variables.Variable should be tf.Variable. Change 109658848 Adding an option to create a new thread-pool for each session. Change 109640570 Take the snapshot of stream-executor. + Expose an interface for scratch space allocation in the interface. Change 109638559 Let image_summary accept uint8 input This allows users to do their own normalization / scaling if the default (very weird) behavior of image_summary is undesired. This required a slight tweak to fake_input.cc to make polymorphically typed fake inputs infer if their type attr is not set but has a default. Unfortunately, adding a second valid type to image_summary *disables* automatic implicit conversion from np.float64 to tf.float32, so this change is slightly backwards incompatible. Change 109636969 Add serialization operations for SparseTensor. Change 109636644 Update generated Op docs. Change 109634899 TensorFlow: add a markdown file for producing release notes for our releases. Seed with 0.5.0 with a boring but accurate description. Change 109634502 Let histogram_summary take any realnumbertype It used to take only floats, not it understands ints. Change 109634434 TensorFlow: update locations where we mention python 3 support, update them to current truth. Change 109632108 Move HSV <> RGB conversions, grayscale conversions, and adjust_* ops back to tensorflow - make GPU-capable version of RGBToHSV and HSVToRGB, allows only float input/output - change docs to reflect new size constraints - change HSV format to be [0,1] for all components - add automatic dtype conversion for all adjust_* and grayscale conversion ops - fix up docs Change 109631077 Improve optimizer exceptions 1. grads_and_vars is now a tuple, so must be wrapped when passed to format. 2. Use '%r' instead of '%s' for dtype formatting Base CL: 109697989
2201 lines
106 KiB
C++
2201 lines
106 KiB
C++
/* Copyright 2015 Google Inc. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==============================================================================*/
|
|
|
|
#include "tensorflow/stream_executor/cuda/cuda_blas.h"
|
|
|
|
#include <dlfcn.h>
|
|
|
|
#include <complex>
|
|
|
|
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
|
|
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
|
|
#include "tensorflow/stream_executor/cuda/cuda_helpers.h"
|
|
#include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
|
|
#include "tensorflow/stream_executor/cuda/cuda_stream.h"
|
|
#include "tensorflow/stream_executor/device_memory.h"
|
|
#include "tensorflow/stream_executor/dso_loader.h"
|
|
#include "tensorflow/stream_executor/lib/initialize.h"
|
|
#include "tensorflow/stream_executor/lib/status.h"
|
|
#include "tensorflow/stream_executor/lib/status_macros.h"
|
|
#include "tensorflow/stream_executor/lib/strcat.h"
|
|
#include "tensorflow/stream_executor/lib/stringprintf.h"
|
|
#include "tensorflow/stream_executor/platform/logging.h"
|
|
#include "tensorflow/stream_executor/platform/port.h"
|
|
#include "tensorflow/stream_executor/plugin_registry.h"
|
|
#include "tensorflow/stream_executor/stream_executor.h"
|
|
#include "third_party/gpus/cuda/include/cublas_v2.h"
|
|
|
|
namespace perftools {
|
|
namespace gputools {
|
|
namespace cuda {
|
|
|
|
PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
|
|
|
|
namespace dynload {
|
|
|
|
#define PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name) \
|
|
struct DynLoadShim__##__name { \
|
|
static const char *kName; \
|
|
using FuncPointerT = std::add_pointer<decltype(::__name)>::type; \
|
|
static void *GetDsoHandle() { \
|
|
static auto status = internal::CachedDsoLoader::GetCublasDsoHandle(); \
|
|
return status.ValueOrDie(); \
|
|
} \
|
|
static FuncPointerT DynLoad() { \
|
|
static void *f = dlsym(GetDsoHandle(), kName); \
|
|
CHECK(f != nullptr) << "could not find " << kName \
|
|
<< " in cuBLAS DSO; dlerror: " << dlerror(); \
|
|
return reinterpret_cast<FuncPointerT>(f); \
|
|
} \
|
|
template <typename... Args> \
|
|
cublasStatus_t operator()(CUDAExecutor * parent, Args... args) { \
|
|
cuda::ScopedActivateExecutorContext sac{parent}; \
|
|
return DynLoad()(args...); \
|
|
} \
|
|
} __name; \
|
|
const char *DynLoadShim__##__name::kName = #__name;
|
|
|
|
#define PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(__name) \
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(__name)
|
|
|
|
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
|
|
__macro(cublasSnrm2) \
|
|
__macro(cublasDnrm2) \
|
|
__macro(cublasScnrm2) \
|
|
__macro(cublasDznrm2) \
|
|
__macro(cublasSdot) \
|
|
__macro(cublasDdot) \
|
|
__macro(cublasCdotu) \
|
|
__macro(cublasCdotc) \
|
|
__macro(cublasZdotu) \
|
|
__macro(cublasZdotc) \
|
|
__macro(cublasSscal) \
|
|
__macro(cublasDscal) \
|
|
__macro(cublasCscal) \
|
|
__macro(cublasCsscal) \
|
|
__macro(cublasZscal) \
|
|
__macro(cublasZdscal) \
|
|
__macro(cublasSaxpy) \
|
|
__macro(cublasDaxpy) \
|
|
__macro(cublasCaxpy) \
|
|
__macro(cublasZaxpy) \
|
|
__macro(cublasScopy) \
|
|
__macro(cublasDcopy) \
|
|
__macro(cublasCcopy) \
|
|
__macro(cublasZcopy) \
|
|
__macro(cublasSswap) \
|
|
__macro(cublasDswap) \
|
|
__macro(cublasCswap) \
|
|
__macro(cublasZswap) \
|
|
__macro(cublasIsamax) \
|
|
__macro(cublasIdamax) \
|
|
__macro(cublasIcamax) \
|
|
__macro(cublasIzamax) \
|
|
__macro(cublasIsamin) \
|
|
__macro(cublasIdamin) \
|
|
__macro(cublasIcamin) \
|
|
__macro(cublasIzamin) \
|
|
__macro(cublasSasum) \
|
|
__macro(cublasDasum) \
|
|
__macro(cublasScasum) \
|
|
__macro(cublasDzasum) \
|
|
__macro(cublasSrot) \
|
|
__macro(cublasDrot) \
|
|
__macro(cublasCrot) \
|
|
__macro(cublasCsrot) \
|
|
__macro(cublasZrot) \
|
|
__macro(cublasZdrot) \
|
|
__macro(cublasSrotg) \
|
|
__macro(cublasDrotg) \
|
|
__macro(cublasCrotg) \
|
|
__macro(cublasZrotg) \
|
|
__macro(cublasSrotm) \
|
|
__macro(cublasDrotm) \
|
|
__macro(cublasSrotmg) \
|
|
__macro(cublasDrotmg) \
|
|
__macro(cublasSgemv) \
|
|
__macro(cublasDgemv) \
|
|
__macro(cublasCgemv) \
|
|
__macro(cublasZgemv) \
|
|
__macro(cublasSgbmv) \
|
|
__macro(cublasDgbmv) \
|
|
__macro(cublasCgbmv) \
|
|
__macro(cublasZgbmv) \
|
|
__macro(cublasStrmv) \
|
|
__macro(cublasDtrmv) \
|
|
__macro(cublasCtrmv) \
|
|
__macro(cublasZtrmv) \
|
|
__macro(cublasStbmv) \
|
|
__macro(cublasDtbmv) \
|
|
__macro(cublasCtbmv) \
|
|
__macro(cublasZtbmv) \
|
|
__macro(cublasStpmv) \
|
|
__macro(cublasDtpmv) \
|
|
__macro(cublasCtpmv) \
|
|
__macro(cublasZtpmv) \
|
|
__macro(cublasStrsv) \
|
|
__macro(cublasDtrsv) \
|
|
__macro(cublasCtrsv) \
|
|
__macro(cublasZtrsv) \
|
|
__macro(cublasStpsv) \
|
|
__macro(cublasDtpsv) \
|
|
__macro(cublasCtpsv) \
|
|
__macro(cublasZtpsv) \
|
|
__macro(cublasStbsv) \
|
|
__macro(cublasDtbsv) \
|
|
__macro(cublasCtbsv) \
|
|
__macro(cublasZtbsv) \
|
|
__macro(cublasSsymv) \
|
|
__macro(cublasDsymv) \
|
|
__macro(cublasCsymv) \
|
|
__macro(cublasZsymv) \
|
|
__macro(cublasChemv) \
|
|
__macro(cublasZhemv) \
|
|
__macro(cublasSsbmv) \
|
|
__macro(cublasDsbmv) \
|
|
__macro(cublasChbmv) \
|
|
__macro(cublasZhbmv) \
|
|
__macro(cublasSspmv) \
|
|
__macro(cublasDspmv) \
|
|
__macro(cublasChpmv) \
|
|
__macro(cublasZhpmv) \
|
|
__macro(cublasSger) \
|
|
__macro(cublasDger) \
|
|
__macro(cublasCgeru) \
|
|
__macro(cublasCgerc) \
|
|
__macro(cublasZgeru) \
|
|
__macro(cublasZgerc) \
|
|
__macro(cublasSsyr) \
|
|
__macro(cublasDsyr) \
|
|
__macro(cublasCsyr) \
|
|
__macro(cublasZsyr) \
|
|
__macro(cublasCher) \
|
|
__macro(cublasZher) \
|
|
__macro(cublasSspr) \
|
|
__macro(cublasDspr) \
|
|
__macro(cublasChpr) \
|
|
__macro(cublasZhpr) \
|
|
__macro(cublasSsyr2) \
|
|
__macro(cublasDsyr2) \
|
|
__macro(cublasCsyr2) \
|
|
__macro(cublasZsyr2) \
|
|
__macro(cublasCher2) \
|
|
__macro(cublasZher2) \
|
|
__macro(cublasSspr2) \
|
|
__macro(cublasDspr2) \
|
|
__macro(cublasChpr2) \
|
|
__macro(cublasZhpr2) \
|
|
__macro(cublasSgemm) \
|
|
__macro(cublasDgemm) \
|
|
__macro(cublasCgemm) \
|
|
__macro(cublasZgemm) \
|
|
__macro(cublasSsyrk) \
|
|
__macro(cublasDsyrk) \
|
|
__macro(cublasCsyrk) \
|
|
__macro(cublasZsyrk) \
|
|
__macro(cublasCherk) \
|
|
__macro(cublasZherk) \
|
|
__macro(cublasSsyr2k) \
|
|
__macro(cublasDsyr2k) \
|
|
__macro(cublasCsyr2k) \
|
|
__macro(cublasZsyr2k) \
|
|
__macro(cublasCher2k) \
|
|
__macro(cublasZher2k) \
|
|
__macro(cublasSsyrkx) \
|
|
__macro(cublasDsyrkx) \
|
|
__macro(cublasCsyrkx) \
|
|
__macro(cublasZsyrkx) \
|
|
__macro(cublasCherkx) \
|
|
__macro(cublasZherkx) \
|
|
__macro(cublasSsymm) \
|
|
__macro(cublasDsymm) \
|
|
__macro(cublasCsymm) \
|
|
__macro(cublasZsymm) \
|
|
__macro(cublasChemm) \
|
|
__macro(cublasZhemm) \
|
|
__macro(cublasStrsm) \
|
|
__macro(cublasDtrsm) \
|
|
__macro(cublasCtrsm) \
|
|
__macro(cublasZtrsm) \
|
|
__macro(cublasStrmm) \
|
|
__macro(cublasDtrmm) \
|
|
__macro(cublasCtrmm) \
|
|
__macro(cublasZtrmm) \
|
|
__macro(cublasSgeam) \
|
|
__macro(cublasDgeam) \
|
|
__macro(cublasCgeam) \
|
|
__macro(cublasZgeam) \
|
|
__macro(cublasSdgmm) \
|
|
__macro(cublasDdgmm) \
|
|
__macro(cublasCdgmm) \
|
|
__macro(cublasZdgmm)
|
|
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasCreate)
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasDestroy)
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetStream)
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasSetPointerMode)
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP(cublasGetPointerMode)
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasSgemmBatched)
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasDgemmBatched)
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasCgemmBatched)
|
|
PERFTOOLS_GPUTOOLS_CUBLAS_WRAP(cublasZgemmBatched)
|
|
CUBLAS_BLAS_ROUTINE_EACH(PERFTOOLS_GPUTOOLS_CUBLAS_V2_WRAP)
|
|
|
|
} // namespace dynload
|
|
|
|
static string ToString(cublasStatus_t status) {
|
|
switch (status) {
|
|
case CUBLAS_STATUS_SUCCESS:
|
|
return "CUBLAS_STATUS_SUCCESS";
|
|
case CUBLAS_STATUS_NOT_INITIALIZED:
|
|
return "CUBLAS_STATUS_NOT_INITIALIZED";
|
|
case CUBLAS_STATUS_ALLOC_FAILED:
|
|
return "CUBLAS_STATUS_ALLOC_FAILED";
|
|
case CUBLAS_STATUS_INVALID_VALUE:
|
|
return "CUBLAS_STATUS_INVALID_VALUE";
|
|
case CUBLAS_STATUS_ARCH_MISMATCH:
|
|
return "CUBLAS_STATUS_ARCH_MISMATCH";
|
|
case CUBLAS_STATUS_MAPPING_ERROR:
|
|
return "CUBLAS_STATUS_MAPPING_ERROR";
|
|
case CUBLAS_STATUS_EXECUTION_FAILED:
|
|
return "CUBLAS_STATUS_EXECUTION_FAILED";
|
|
case CUBLAS_STATUS_INTERNAL_ERROR:
|
|
return "CUBLAS_STATUS_INTERNAL_ERROR";
|
|
default:
|
|
return port::StrCat("<invalid cublas status: ", status, ">");
|
|
}
|
|
}
|
|
|
|
// cuBLAS has interfaces that permit pointers to be passed from either the host
|
|
// memory space or the device memory space; however, you must instruct it as to
|
|
// which address space those pointers are in with cublasSetPointerMode.
|
|
//
|
|
// This helper sets the cuBLAS pointer mode to a desired value for a cuBLAS call
|
|
// you are about to perform in a given scope.
|
|
//
|
|
// The prior cuBLAS pointer mode is retained and restored when this object goes
|
|
// out of scope.
|
|
class ScopedCublasPointerMode {
|
|
public:
|
|
// Note that, because the setting of the cublas pointer mode is fallible,
|
|
// construction of this scoped datatype must be paired with a call to
|
|
// Init().
|
|
//
|
|
// Parameters:
|
|
// handle: The cublas library handle to act upon in setting the pointer mode.
|
|
explicit ScopedCublasPointerMode(CUDAExecutor *parent, cublasHandle_t handle)
|
|
: parent_(parent), handle_(handle), ok_(false) {}
|
|
|
|
// Attempts the switch to the requested scoped pointer mode, new_mode.
|
|
//
|
|
// Note that when false is returned, an appropriate error has already been
|
|
// logged.
|
|
bool Init(cublasPointerMode_t new_mode) {
|
|
cublasStatus_t ret =
|
|
dynload::cublasGetPointerMode_v2(parent_, handle_, &old_mode_);
|
|
if (ret != CUBLAS_STATUS_SUCCESS) {
|
|
LOG(ERROR) << "failed to get old cublas pointer mode: " << ToString(ret);
|
|
return ok_ = false;
|
|
}
|
|
|
|
ret = dynload::cublasSetPointerMode_v2(parent_, handle_, new_mode);
|
|
if (ret != CUBLAS_STATUS_SUCCESS) {
|
|
LOG(ERROR) << "failed to set new cublas pointer mode: " << ToString(ret);
|
|
return ok_ = false;
|
|
}
|
|
|
|
return ok_ = true;
|
|
}
|
|
|
|
// Switches back to the prior pointer mode, if the switch operation was
|
|
// successful in the first place.
|
|
~ScopedCublasPointerMode() {
|
|
if (ok_) {
|
|
cublasStatus_t ret =
|
|
dynload::cublasSetPointerMode_v2(parent_, handle_, old_mode_);
|
|
if (ret != CUBLAS_STATUS_SUCCESS) {
|
|
LOG(ERROR) << "failed to set former cublas pointer mode: "
|
|
<< ToString(ret);
|
|
}
|
|
}
|
|
}
|
|
|
|
private:
|
|
CUDAExecutor *parent_; // Executor establishing this pointer mode for.
|
|
cublasHandle_t handle_; // Handle to the cuBLAS instance of interest.
|
|
cublasPointerMode_t old_mode_; // Prior cuBLAS pointer mode, to be restored.
|
|
bool ok_; // Whether the change was successful.
|
|
};
|
|
|
|
bool CUDABlas::Init() {
|
|
cublasStatus_t ret = dynload::cublasCreate_v2(parent_, &blas_);
|
|
if (ret != CUBLAS_STATUS_SUCCESS) {
|
|
LOG(ERROR) << "failed to create cublas handle: " << ToString(ret);
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
CUDABlas::CUDABlas(cuda::CUDAExecutor *parent)
|
|
: parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}
|
|
|
|
CUDABlas::~CUDABlas() {
|
|
if (blas_ != nullptr) {
|
|
dynload::cublasDestroy_v2(parent_, blas_);
|
|
}
|
|
}
|
|
|
|
bool CUDABlas::SetStream(Stream *stream) {
|
|
CHECK(stream != nullptr);
|
|
CHECK(AsCUDAStreamValue(stream) != nullptr);
|
|
CHECK(blas_ != nullptr);
|
|
cublasStatus_t ret =
|
|
dynload::cublasSetStream_v2(parent_, blas_, AsCUDAStreamValue(stream));
|
|
if (ret != CUBLAS_STATUS_SUCCESS) {
|
|
LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret);
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
namespace {
|
|
|
|
// Helper functions transforming blas arguments into cuBLAS arguments.
|
|
|
|
cublasOperation_t CUDABlasTranspose(blas::Transpose trans) {
|
|
switch (trans) {
|
|
case blas::Transpose::kNoTranspose:
|
|
return CUBLAS_OP_N;
|
|
case blas::Transpose::kTranspose:
|
|
return CUBLAS_OP_T;
|
|
case blas::Transpose::kConjugateTranspose:
|
|
return CUBLAS_OP_C;
|
|
default:
|
|
LOG(FATAL) << "Invalid value of blas::Transpose.";
|
|
}
|
|
}
|
|
|
|
cublasFillMode_t CUDABlasUpperLower(blas::UpperLower uplo) {
|
|
switch (uplo) {
|
|
case blas::UpperLower::kUpper:
|
|
return CUBLAS_FILL_MODE_UPPER;
|
|
case blas::UpperLower::kLower:
|
|
return CUBLAS_FILL_MODE_LOWER;
|
|
default:
|
|
LOG(FATAL) << "Invalid value of blas::UpperLower.";
|
|
}
|
|
}
|
|
|
|
cublasDiagType_t CUDABlasDiagonal(blas::Diagonal diag) {
|
|
switch (diag) {
|
|
case blas::Diagonal::kUnit:
|
|
return CUBLAS_DIAG_UNIT;
|
|
case blas::Diagonal::kNonUnit:
|
|
return CUBLAS_DIAG_NON_UNIT;
|
|
default:
|
|
LOG(FATAL) << "Invalid value of blas::Diagonal.";
|
|
}
|
|
}
|
|
|
|
cublasSideMode_t CUDABlasSide(blas::Side side) {
|
|
switch (side) {
|
|
case blas::Side::kLeft:
|
|
return CUBLAS_SIDE_LEFT;
|
|
case blas::Side::kRight:
|
|
return CUBLAS_SIDE_RIGHT;
|
|
default:
|
|
LOG(FATAL) << "Invalid value of blas::Side.";
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
template <typename FuncT, typename... Args>
|
|
bool CUDABlas::DoBlasInternal(FuncT cublas_func, Stream *stream,
|
|
bool pointer_mode_host, Args... args) {
|
|
mutex_lock lock{mu_};
|
|
|
|
CHECK(blas_ != nullptr);
|
|
if (!SetStream(stream)) {
|
|
return false;
|
|
}
|
|
|
|
ScopedCublasPointerMode pointer_mode{parent_, blas_};
|
|
if (!pointer_mode.Init(pointer_mode_host ? CUBLAS_POINTER_MODE_HOST
|
|
: CUBLAS_POINTER_MODE_DEVICE)) {
|
|
return false;
|
|
}
|
|
|
|
cublasStatus_t ret = cublas_func(parent_, blas_, args...);
|
|
if (ret != CUBLAS_STATUS_SUCCESS) {
|
|
LOG(ERROR) << "failed to run cuBLAS routine " << cublas_func.kName << ": "
|
|
<< ToString(ret);
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<float> &x, int incx,
|
|
DeviceMemory<float> *result) {
|
|
return DoBlasInternal(dynload::cublasSasum, stream,
|
|
false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<double> &x, int incx,
|
|
DeviceMemory<double> *result) {
|
|
return DoBlasInternal(dynload::cublasDasum, stream,
|
|
false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
DeviceMemory<float> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasScasum, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
DeviceMemory<double> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDzasum, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
|
|
const DeviceMemory<float> &x, int incx,
|
|
DeviceMemory<float> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasSaxpy, stream,
|
|
true /* = pointer_mode_host */, elem_count, &alpha,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
|
|
const DeviceMemory<double> &x, int incx,
|
|
DeviceMemory<double> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasDaxpy, stream,
|
|
true /* = pointer_mode_host */, elem_count, &alpha,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
DeviceMemory<std::complex<float>> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasCaxpy, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
DeviceMemory<std::complex<double>> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasZaxpy, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<float> &x, int incx,
|
|
DeviceMemory<float> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasScopy, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<double> &x, int incx,
|
|
DeviceMemory<double> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasDcopy, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
DeviceMemory<std::complex<float>> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasCcopy, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
DeviceMemory<std::complex<double>> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasZcopy, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<float> &x, int incx,
|
|
const DeviceMemory<float> &y, int incy,
|
|
DeviceMemory<float> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasSdot, stream, false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<double> &x, int incx,
|
|
const DeviceMemory<double> &y, int incy,
|
|
DeviceMemory<double> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDdot, stream, false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
const DeviceMemory<std::complex<float>> &y, int incy,
|
|
DeviceMemory<std::complex<float>> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCdotc, stream, false /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
|
|
CUDAComplex(CUDAMemoryMutable(result)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
const DeviceMemory<std::complex<double>> &y, int incy,
|
|
DeviceMemory<std::complex<double>> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZdotc, stream, false /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
|
|
CUDAComplex(CUDAMemoryMutable(result)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
const DeviceMemory<std::complex<float>> &y, int incy,
|
|
DeviceMemory<std::complex<float>> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCdotu, stream, false /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
|
|
CUDAComplex(CUDAMemoryMutable(result)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
const DeviceMemory<std::complex<double>> &y, int incy,
|
|
DeviceMemory<std::complex<double>> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZdotu, stream, false /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
|
|
CUDAComplex(CUDAMemoryMutable(result)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<float> &x, int incx,
|
|
DeviceMemory<float> *result) {
|
|
return DoBlasInternal(dynload::cublasSnrm2, stream,
|
|
false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<double> &x, int incx,
|
|
DeviceMemory<double> *result) {
|
|
return DoBlasInternal(dynload::cublasDnrm2, stream,
|
|
false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
DeviceMemory<float> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasScnrm2, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
DeviceMemory<double> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDznrm2, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<float> *x, int incx,
|
|
DeviceMemory<float> *y, int incy, float c, float s) {
|
|
return DoBlasInternal(
|
|
dynload::cublasSrot, stream, true /* = pointer_mode_host */, elem_count,
|
|
CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<double> *x, int incx,
|
|
DeviceMemory<double> *y, int incy, double c,
|
|
double s) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDrot, stream, true /* = pointer_mode_host */, elem_count,
|
|
CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<std::complex<float>> *x, int incx,
|
|
DeviceMemory<std::complex<float>> *y, int incy,
|
|
float c, float s) {
|
|
return DoBlasInternal(dynload::cublasCsrot, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<std::complex<double>> *x, int incx,
|
|
DeviceMemory<std::complex<double>> *y, int incy,
|
|
double c, double s) {
|
|
return DoBlasInternal(dynload::cublasZdrot, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
|
|
DeviceMemory<float> *b, DeviceMemory<float> *c,
|
|
DeviceMemory<float> *s) {
|
|
return DoBlasInternal(dynload::cublasSrotg, stream,
|
|
false /* = pointer_mode_host */, CUDAMemoryMutable(a),
|
|
CUDAMemoryMutable(b), CUDAMemoryMutable(c),
|
|
CUDAMemoryMutable(s));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
|
|
DeviceMemory<double> *b, DeviceMemory<double> *c,
|
|
DeviceMemory<double> *s) {
|
|
return DoBlasInternal(dynload::cublasDrotg, stream,
|
|
false /* = pointer_mode_host */,
|
|
CUDAComplex(CUDAMemoryMutable(a)), CUDAMemoryMutable(b),
|
|
CUDAMemoryMutable(c), CUDAMemoryMutable(s));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
|
|
DeviceMemory<std::complex<float>> *b,
|
|
DeviceMemory<float> *c,
|
|
DeviceMemory<std::complex<float>> *s) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCrotg, stream, false /* = pointer_mode_host */,
|
|
CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)),
|
|
CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
|
|
DeviceMemory<std::complex<double>> *b,
|
|
DeviceMemory<double> *c,
|
|
DeviceMemory<std::complex<double>> *s) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZrotg, stream, false /* = pointer_mode_host */,
|
|
CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)),
|
|
CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<float> *x, int incx,
|
|
DeviceMemory<float> *y, int incy,
|
|
const DeviceMemory<float> ¶m) {
|
|
return DoBlasInternal(dynload::cublasSrotm, stream,
|
|
false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy,
|
|
CUDAMemory(param));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<double> *x, int incx,
|
|
DeviceMemory<double> *y, int incy,
|
|
const DeviceMemory<double> ¶m) {
|
|
return DoBlasInternal(dynload::cublasDrotm, stream,
|
|
false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy,
|
|
CUDAMemory(param));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
|
|
DeviceMemory<float> *d2, DeviceMemory<float> *x1,
|
|
const DeviceMemory<float> &y1,
|
|
DeviceMemory<float> *param) {
|
|
return DoBlasInternal(dynload::cublasSrotmg, stream,
|
|
false /* = pointer_mode_host */, CUDAMemoryMutable(d1),
|
|
CUDAMemoryMutable(d2), CUDAMemoryMutable(x1),
|
|
CUDAMemory(y1), CUDAMemoryMutable(param));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
|
|
DeviceMemory<double> *d2, DeviceMemory<double> *x1,
|
|
const DeviceMemory<double> &y1,
|
|
DeviceMemory<double> *param) {
|
|
return DoBlasInternal(dynload::cublasDrotmg, stream,
|
|
false /* = pointer_mode_host */, CUDAMemoryMutable(d1),
|
|
CUDAMemoryMutable(d2), CUDAMemoryMutable(x1),
|
|
CUDAMemory(y1), CUDAMemoryMutable(param));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
|
|
DeviceMemory<float> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasSscal, stream,
|
|
true /* = pointer_mode_host */, elem_count, &alpha,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
|
|
DeviceMemory<double> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasDscal, stream,
|
|
true /* = pointer_mode_host */, elem_count, &alpha,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
|
|
DeviceMemory<std::complex<float>> *x, int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCsscal, stream, true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
|
|
DeviceMemory<std::complex<double>> *x, int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZdscal, stream, true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
|
|
std::complex<float> alpha,
|
|
DeviceMemory<std::complex<float>> *x, int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCscal, stream, true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
|
|
std::complex<double> alpha,
|
|
DeviceMemory<std::complex<double>> *x, int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZscal, stream, true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<float> *x, int incx,
|
|
DeviceMemory<float> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasSswap, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<double> *x, int incx,
|
|
DeviceMemory<double> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasDswap, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<std::complex<float>> *x, int incx,
|
|
DeviceMemory<std::complex<float>> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasCswap, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
|
|
DeviceMemory<std::complex<double>> *x, int incx,
|
|
DeviceMemory<std::complex<double>> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasZswap, stream,
|
|
true /* = pointer_mode_host */, elem_count,
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<float> &x, int incx,
|
|
DeviceMemory<int> *result) {
|
|
return DoBlasInternal(dynload::cublasIsamax, stream,
|
|
false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<double> &x, int incx,
|
|
DeviceMemory<int> *result) {
|
|
return DoBlasInternal(dynload::cublasIdamax, stream,
|
|
false /* = pointer_mode_host */, elem_count,
|
|
CUDAMemory(x), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
DeviceMemory<int> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasIcamax, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<double>> &x,
|
|
int incx, DeviceMemory<int> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasIzamax, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<float> &x, int incx,
|
|
DeviceMemory<int> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasIsamin, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<double> &x, int incx,
|
|
DeviceMemory<int> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasIdamin, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
DeviceMemory<int> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasIcamin, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
|
|
const DeviceMemory<std::complex<double>> &x,
|
|
int incx, DeviceMemory<int> *result) {
|
|
return DoBlasInternal(
|
|
dynload::cublasIzamin, stream, false /* = pointer_mode_host */,
|
|
elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
|
|
uint64 n, uint64 kl, uint64 ku, float alpha,
|
|
const DeviceMemory<float> &a, int lda,
|
|
const DeviceMemory<float> &x, int incx, float beta,
|
|
DeviceMemory<float> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasSgbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda,
|
|
CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
|
|
uint64 n, uint64 kl, uint64 ku, double alpha,
|
|
const DeviceMemory<double> &a, int lda,
|
|
const DeviceMemory<double> &x, int incx, double beta,
|
|
DeviceMemory<double> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDgbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda,
|
|
CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
|
|
uint64 n, uint64 kl, uint64 ku,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCgbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
|
|
uint64 n, uint64 kl, uint64 ku,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZgbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
|
|
uint64 n, float alpha, const DeviceMemory<float> &a,
|
|
int lda, const DeviceMemory<float> &x, int incx,
|
|
float beta, DeviceMemory<float> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasSgemv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
|
|
incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
|
|
uint64 n, double alpha, const DeviceMemory<double> &a,
|
|
int lda, const DeviceMemory<double> &x, int incx,
|
|
double beta, DeviceMemory<double> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDgemv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
|
|
incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
|
|
uint64 n, std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCgemv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
|
|
uint64 n, std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZgemv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
|
|
const DeviceMemory<float> &x, int incx,
|
|
const DeviceMemory<float> &y, int incy,
|
|
DeviceMemory<float> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasSger, stream, true /* = pointer_mode_host */, m, n, &alpha,
|
|
CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
|
|
const DeviceMemory<double> &x, int incx,
|
|
const DeviceMemory<double> &y, int incy,
|
|
DeviceMemory<double> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDger, stream, true /* = pointer_mode_host */, m, n, &alpha,
|
|
CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
const DeviceMemory<std::complex<float>> &y, int incy,
|
|
DeviceMemory<std::complex<float>> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCgerc, stream, true /* = pointer_mode_host */, m, n,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
const DeviceMemory<std::complex<double>> &y, int incy,
|
|
DeviceMemory<std::complex<double>> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZgerc, stream, true /* = pointer_mode_host */, m, n,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
const DeviceMemory<std::complex<float>> &y, int incy,
|
|
DeviceMemory<std::complex<float>> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCgeru, stream, true /* = pointer_mode_host */, m, n,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
const DeviceMemory<std::complex<double>> &y, int incy,
|
|
DeviceMemory<std::complex<double>> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZgeru, stream, true /* = pointer_mode_host */, m, n,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
uint64 k, std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasChbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
uint64 k, std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZhbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasChemv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZhemv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
float alpha,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
DeviceMemory<std::complex<float>> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCher, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(a)), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
double alpha,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
DeviceMemory<std::complex<double>> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZher, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(CUDAMemoryMutable(a)), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
const DeviceMemory<std::complex<float>> &y, int incy,
|
|
DeviceMemory<std::complex<float>> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCher2, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
|
|
CUDAComplex(CUDAMemoryMutable(a)), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
const DeviceMemory<std::complex<double>> &y, int incy,
|
|
DeviceMemory<std::complex<double>> *a, int lda) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZher2, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
|
|
CUDAComplex(CUDAMemoryMutable(a)), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &ap,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasChpmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &ap,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZhpmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
float alpha,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
DeviceMemory<std::complex<float>> *ap) {
|
|
return DoBlasInternal(
|
|
dynload::cublasChpr, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
double alpha,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
DeviceMemory<std::complex<double>> *ap) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZhpr, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &x, int incx,
|
|
const DeviceMemory<std::complex<float>> &y, int incy,
|
|
DeviceMemory<std::complex<float>> *ap) {
|
|
return DoBlasInternal(
|
|
dynload::cublasChpr2, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
|
|
CUDAComplex(CUDAMemoryMutable(ap)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &x, int incx,
|
|
const DeviceMemory<std::complex<double>> &y, int incy,
|
|
DeviceMemory<std::complex<double>> *ap) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZhpr2, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
|
|
CUDAComplex(CUDAMemoryMutable(ap)));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
uint64 k, float alpha, const DeviceMemory<float> &a,
|
|
int lda, const DeviceMemory<float> &x, int incx,
|
|
float beta, DeviceMemory<float> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasSsbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
|
|
incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
uint64 k, double alpha, const DeviceMemory<double> &a,
|
|
int lda, const DeviceMemory<double> &x, int incx,
|
|
double beta, DeviceMemory<double> *y, int incy) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDsbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
|
|
incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
float alpha, const DeviceMemory<float> &ap,
|
|
const DeviceMemory<float> &x, int incx, float beta,
|
|
DeviceMemory<float> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasSspmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap),
|
|
CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
double alpha, const DeviceMemory<double> &ap,
|
|
const DeviceMemory<double> &x, int incx, double beta,
|
|
DeviceMemory<double> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasDspmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap),
|
|
CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
float alpha, const DeviceMemory<float> &x, int incx,
|
|
DeviceMemory<float> *ap) {
|
|
return DoBlasInternal(dynload::cublasSspr, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
|
|
incx, CUDAMemoryMutable(ap));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
double alpha, const DeviceMemory<double> &x, int incx,
|
|
DeviceMemory<double> *ap) {
|
|
return DoBlasInternal(dynload::cublasDspr, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
|
|
incx, CUDAMemoryMutable(ap));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
float alpha, const DeviceMemory<float> &x, int incx,
|
|
const DeviceMemory<float> &y, int incy,
|
|
DeviceMemory<float> *ap) {
|
|
return DoBlasInternal(dynload::cublasSspr2, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
|
|
incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
double alpha, const DeviceMemory<double> &x, int incx,
|
|
const DeviceMemory<double> &y, int incy,
|
|
DeviceMemory<double> *ap) {
|
|
return DoBlasInternal(dynload::cublasDspr2, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
|
|
incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
float alpha, const DeviceMemory<float> &a, int lda,
|
|
const DeviceMemory<float> &x, int incx, float beta,
|
|
DeviceMemory<float> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasSsymv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda,
|
|
CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
double alpha, const DeviceMemory<double> &a, int lda,
|
|
const DeviceMemory<double> &x, int incx, double beta,
|
|
DeviceMemory<double> *y, int incy) {
|
|
return DoBlasInternal(dynload::cublasDsymv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda,
|
|
CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
float alpha, const DeviceMemory<float> &x, int incx,
|
|
DeviceMemory<float> *a, int lda) {
|
|
return DoBlasInternal(dynload::cublasSsyr, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
|
|
incx, CUDAMemoryMutable(a), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
double alpha, const DeviceMemory<double> &x, int incx,
|
|
DeviceMemory<double> *a, int lda) {
|
|
return DoBlasInternal(dynload::cublasDsyr, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
|
|
incx, CUDAMemoryMutable(a), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
float alpha, const DeviceMemory<float> &x, int incx,
|
|
const DeviceMemory<float> &y, int incy,
|
|
DeviceMemory<float> *a, int lda) {
|
|
return DoBlasInternal(dynload::cublasSsyr2, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
|
|
incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
|
|
double alpha, const DeviceMemory<double> &x, int incx,
|
|
const DeviceMemory<double> &y, int incy,
|
|
DeviceMemory<double> *a, int lda) {
|
|
return DoBlasInternal(dynload::cublasDsyr2, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
|
|
incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
uint64 k, const DeviceMemory<float> &a, int lda,
|
|
DeviceMemory<float> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasStbmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
uint64 k, const DeviceMemory<double> &a, int lda,
|
|
DeviceMemory<double> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasDtbmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
uint64 k, const DeviceMemory<std::complex<float>> &a,
|
|
int lda, DeviceMemory<std::complex<float>> *x,
|
|
int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCtbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
uint64 k, const DeviceMemory<std::complex<double>> &a,
|
|
int lda, DeviceMemory<std::complex<double>> *x,
|
|
int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZtbmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
uint64 k, const DeviceMemory<float> &a, int lda,
|
|
DeviceMemory<float> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasStbsv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
uint64 k, const DeviceMemory<double> &a, int lda,
|
|
DeviceMemory<double> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasDtbsv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
uint64 k, const DeviceMemory<std::complex<float>> &a,
|
|
int lda, DeviceMemory<std::complex<float>> *x,
|
|
int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCtbsv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
uint64 k, const DeviceMemory<std::complex<double>> &a,
|
|
int lda, DeviceMemory<std::complex<double>> *x,
|
|
int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZtbsv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<float> &ap, DeviceMemory<float> *x,
|
|
int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasStpmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<double> &ap,
|
|
DeviceMemory<double> *x, int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDtpmv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<std::complex<float>> &ap,
|
|
DeviceMemory<std::complex<float>> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasCtpmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<std::complex<double>> &ap,
|
|
DeviceMemory<std::complex<double>> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasZtpmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<float> &ap, DeviceMemory<float> *x,
|
|
int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasStpsv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<double> &ap,
|
|
DeviceMemory<double> *x, int incx) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDtpsv, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<std::complex<float>> &ap,
|
|
DeviceMemory<std::complex<float>> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasCtpsv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<std::complex<double>> &ap,
|
|
DeviceMemory<std::complex<double>> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasZtpsv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
|
|
CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<float> &a, int lda,
|
|
DeviceMemory<float> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasStrmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<double> &a, int lda,
|
|
DeviceMemory<double> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasDtrmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
DeviceMemory<std::complex<float>> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasCtrmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
|
|
lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
DeviceMemory<std::complex<double>> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasZtrmv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
|
|
lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<float> &a, int lda,
|
|
DeviceMemory<float> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasStrsv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<double> &a, int lda,
|
|
DeviceMemory<double> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasDtrsv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(x), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
DeviceMemory<std::complex<float>> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasCtrsv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
|
|
lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, blas::Diagonal diag, uint64 n,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
DeviceMemory<std::complex<double>> *x, int incx) {
|
|
return DoBlasInternal(dynload::cublasZtrsv, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
|
|
CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
|
|
lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
|
|
blas::Transpose transb, uint64 m, uint64 n, uint64 k,
|
|
float alpha, const DeviceMemory<float> &a, int lda,
|
|
const DeviceMemory<float> &b, int ldb, float beta,
|
|
DeviceMemory<float> *c, int ldc) {
|
|
VLOG(1) << port::Printf(
|
|
"doing cuBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
|
|
"k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
|
|
"c=%p ldc=%d",
|
|
static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
|
|
a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
|
|
if (transa == blas::Transpose::kNoTranspose) {
|
|
if (lda < static_cast<int64>(m)) {
|
|
LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
|
|
"precondition violation";
|
|
}
|
|
} else {
|
|
if (lda < static_cast<int64>(k)) {
|
|
LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
|
|
<< ") (transpose case); precondition violation";
|
|
}
|
|
}
|
|
if (transb == blas::Transpose::kNoTranspose) {
|
|
if (ldb < static_cast<int64>(k)) {
|
|
LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
|
|
<< ") (no transpose case); precondition violation";
|
|
}
|
|
} else {
|
|
if (ldb < static_cast<int64>(n)) {
|
|
LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
|
|
"precondition violation";
|
|
}
|
|
}
|
|
return DoBlasInternal(
|
|
dynload::cublasSgemm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
|
|
CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
|
|
blas::Transpose transb, uint64 m, uint64 n, uint64 k,
|
|
double alpha, const DeviceMemory<double> &a, int lda,
|
|
const DeviceMemory<double> &b, int ldb, double beta,
|
|
DeviceMemory<double> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDgemm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
|
|
CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
|
|
blas::Transpose transb, uint64 m, uint64 n, uint64 k,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
const DeviceMemory<std::complex<float>> &b, int ldb,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCgemm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
|
|
CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
|
|
blas::Transpose transb, uint64 m, uint64 n, uint64 k,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
const DeviceMemory<std::complex<double>> &b, int ldb,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZgemm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
|
|
CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
template <typename T, typename FuncT>
|
|
port::Status CUDABlas::DoBlasGemmBatchedInternal(
|
|
FuncT cublas_func, Stream *stream, blas::Transpose transa,
|
|
blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
|
|
const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
|
|
const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
|
|
const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
|
|
int batch_count) {
|
|
std::vector<T *> a_ptr_vec, b_ptr_vec, c_ptr_vec;
|
|
for (int i = 0; i < batch_count; ++i) {
|
|
a_ptr_vec.push_back(static_cast<T *>(a_array[i]->opaque()));
|
|
b_ptr_vec.push_back(static_cast<T *>(b_array[i]->opaque()));
|
|
c_ptr_vec.push_back(static_cast<T *>(c_array[i]->opaque()));
|
|
}
|
|
|
|
typedef typename CUDAComplexT<T>::type CUDA_T;
|
|
SE_ASSIGN_OR_RETURN(
|
|
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> a_ptr_array,
|
|
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
|
|
SE_ASSIGN_OR_RETURN(
|
|
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> b_ptr_array,
|
|
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
|
|
SE_ASSIGN_OR_RETURN(
|
|
std::unique_ptr<TemporaryDeviceMemory<CUDA_T *>> c_ptr_array,
|
|
stream->AllocateTemporaryArray<CUDA_T *>(batch_count));
|
|
|
|
if (!stream->ThenMemcpy(a_ptr_array->mutable_device_memory(),
|
|
a_ptr_vec.data(), batch_count * sizeof(T *))
|
|
.ok() ||
|
|
!stream->ThenMemcpy(b_ptr_array->mutable_device_memory(),
|
|
b_ptr_vec.data(), batch_count * sizeof(T *))
|
|
.ok() ||
|
|
!stream->ThenMemcpy(c_ptr_array->mutable_device_memory(),
|
|
c_ptr_vec.data(), batch_count * sizeof(T *))
|
|
.ok()) {
|
|
return port::Status(port::error::INTERNAL,
|
|
"failed to copy memory from host to device in "
|
|
"CUDABlas::DoBlasGemmBatched");
|
|
}
|
|
|
|
bool ok = DoBlasInternal(
|
|
cublas_func, stream, true /* = pointer_mode_host */,
|
|
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
|
|
CUDAComplex(&alpha),
|
|
const_cast<const CUDA_T **>(CUDAMemory(a_ptr_array->device_memory())),
|
|
lda,
|
|
const_cast<const CUDA_T **>(CUDAMemory(b_ptr_array->device_memory())),
|
|
ldb, CUDAComplex(&beta),
|
|
const_cast<CUDA_T **>(CUDAMemory(c_ptr_array->device_memory())), ldc,
|
|
batch_count);
|
|
|
|
if (ok) {
|
|
return port::Status::OK();
|
|
}
|
|
return port::Status(port::error::INTERNAL,
|
|
"failed BLAS call, see log for details");
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemmBatched(
|
|
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
|
|
uint64 n, uint64 k, float alpha,
|
|
const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
|
|
const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta,
|
|
const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
|
|
int batch_count) {
|
|
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
|
|
dynload::cublasSgemmBatched, stream, transa, transb, m, n, k, alpha,
|
|
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemmBatched(
|
|
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
|
|
uint64 n, uint64 k, double alpha,
|
|
const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
|
|
const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
|
|
double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
|
|
int ldc, int batch_count) {
|
|
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
|
|
dynload::cublasDgemmBatched, stream, transa, transb, m, n, k, alpha,
|
|
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemmBatched(
|
|
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
|
|
uint64 n, uint64 k, std::complex<float> alpha,
|
|
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array,
|
|
int lda,
|
|
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
|
|
int ldb, std::complex<float> beta,
|
|
const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
|
|
int ldc, int batch_count) {
|
|
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
|
|
dynload::cublasCgemmBatched, stream, transa, transb, m, n, k, alpha,
|
|
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasGemmBatched(
|
|
Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
|
|
uint64 n, uint64 k, std::complex<double> alpha,
|
|
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array,
|
|
int lda,
|
|
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
|
|
int ldb, std::complex<double> beta,
|
|
const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
|
|
int ldc, int batch_count) {
|
|
SE_RETURN_STATUS_AS_BOOL(DoBlasGemmBatchedInternal(
|
|
dynload::cublasZgemmBatched, stream, transa, transb, m, n, k, alpha,
|
|
a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count));
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, uint64 m, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
const DeviceMemory<std::complex<float>> &b, int ldb,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasChemm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, uint64 m, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
const DeviceMemory<std::complex<double>> &b, int ldb,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZhemm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
float alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
float beta, DeviceMemory<std::complex<float>> *c,
|
|
int ldc) {
|
|
return DoBlasInternal(dynload::cublasCherk, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
|
|
k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
|
|
&beta, CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
double alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
double beta, DeviceMemory<std::complex<double>> *c,
|
|
int ldc) {
|
|
return DoBlasInternal(dynload::cublasZherk, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
|
|
k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
|
|
&beta, CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
const DeviceMemory<std::complex<float>> &b, int ldb,
|
|
float beta, DeviceMemory<std::complex<float>> *c,
|
|
int ldc) {
|
|
return DoBlasInternal(dynload::cublasCher2k, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
|
|
k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemory(b)), ldb, &beta,
|
|
CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
const DeviceMemory<std::complex<double>> &b, int ldb,
|
|
double beta, DeviceMemory<std::complex<double>> *c,
|
|
int ldc) {
|
|
return DoBlasInternal(dynload::cublasZher2k, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
|
|
k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemory(b)), ldb, &beta,
|
|
CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, uint64 m, uint64 n,
|
|
float alpha, const DeviceMemory<float> &a, int lda,
|
|
const DeviceMemory<float> &b, int ldb, float beta,
|
|
DeviceMemory<float> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasSsymm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a),
|
|
lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, uint64 m, uint64 n,
|
|
double alpha, const DeviceMemory<double> &a, int lda,
|
|
const DeviceMemory<double> &b, int ldb, double beta,
|
|
DeviceMemory<double> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDsymm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a),
|
|
lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, uint64 m, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
const DeviceMemory<std::complex<float>> &b, int ldb,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCsymm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, uint64 m, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
const DeviceMemory<std::complex<double>> &b, int ldb,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZsymm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
|
|
CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
float alpha, const DeviceMemory<float> &a, int lda,
|
|
float beta, DeviceMemory<float> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasSsyrk, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
|
|
CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
double alpha, const DeviceMemory<double> &a, int lda,
|
|
double beta, DeviceMemory<double> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDsyrk, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
|
|
CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCsyrk, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta),
|
|
CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZsyrk, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k,
|
|
CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta),
|
|
CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
float alpha, const DeviceMemory<float> &a, int lda,
|
|
const DeviceMemory<float> &b, int ldb, float beta,
|
|
DeviceMemory<float> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasSsyr2k, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
|
|
CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
double alpha, const DeviceMemory<double> &a, int lda,
|
|
const DeviceMemory<double> &b, int ldb, double beta,
|
|
DeviceMemory<double> *c, int ldc) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDsyr2k, stream, true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
|
|
CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
const DeviceMemory<std::complex<float>> &b, int ldb,
|
|
std::complex<float> beta,
|
|
DeviceMemory<std::complex<float>> *c, int ldc) {
|
|
return DoBlasInternal(dynload::cublasCsyr2k, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
|
|
k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
|
|
CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
|
|
blas::Transpose trans, uint64 n, uint64 k,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
const DeviceMemory<std::complex<double>> &b, int ldb,
|
|
std::complex<double> beta,
|
|
DeviceMemory<std::complex<double>> *c, int ldc) {
|
|
return DoBlasInternal(dynload::cublasZsyr2k, stream,
|
|
true /* = pointer_mode_host */,
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
|
|
k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
|
|
CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
|
|
CUDAComplex(CUDAMemoryMutable(c)), ldc);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, blas::Transpose transa,
|
|
blas::Diagonal diag, uint64 m, uint64 n, float alpha,
|
|
const DeviceMemory<float> &a, int lda,
|
|
DeviceMemory<float> *b, int ldb) {
|
|
return DoBlasInternal(
|
|
dynload::cublasStrmm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
|
|
CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, blas::Transpose transa,
|
|
blas::Diagonal diag, uint64 m, uint64 n, double alpha,
|
|
const DeviceMemory<double> &a, int lda,
|
|
DeviceMemory<double> *b, int ldb) {
|
|
return DoBlasInternal(
|
|
dynload::cublasDtrmm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
|
|
CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda,
|
|
CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, blas::Transpose transa,
|
|
blas::Diagonal diag, uint64 m, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
DeviceMemory<std::complex<float>> *b, int ldb) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCtrmm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
|
|
CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb,
|
|
CUDAComplex(CUDAMemoryMutable(b)), ldb);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, blas::Transpose transa,
|
|
blas::Diagonal diag, uint64 m, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
DeviceMemory<std::complex<double>> *b, int ldb) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZtrmm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
|
|
CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb,
|
|
CUDAComplex(CUDAMemoryMutable(b)), ldb);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, blas::Transpose transa,
|
|
blas::Diagonal diag, uint64 m, uint64 n, float alpha,
|
|
const DeviceMemory<float> &a, int lda,
|
|
DeviceMemory<float> *b, int ldb) {
|
|
return DoBlasInternal(dynload::cublasStrsm, stream,
|
|
true /* = pointer_mode_host */, CUDABlasSide(side),
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
|
|
CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a),
|
|
lda, CUDAMemoryMutable(b), ldb);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, blas::Transpose transa,
|
|
blas::Diagonal diag, uint64 m, uint64 n, double alpha,
|
|
const DeviceMemory<double> &a, int lda,
|
|
DeviceMemory<double> *b, int ldb) {
|
|
return DoBlasInternal(dynload::cublasDtrsm, stream,
|
|
true /* = pointer_mode_host */, CUDABlasSide(side),
|
|
CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
|
|
CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a),
|
|
lda, CUDAMemoryMutable(b), ldb);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, blas::Transpose transa,
|
|
blas::Diagonal diag, uint64 m, uint64 n,
|
|
std::complex<float> alpha,
|
|
const DeviceMemory<std::complex<float>> &a, int lda,
|
|
DeviceMemory<std::complex<float>> *b, int ldb) {
|
|
return DoBlasInternal(
|
|
dynload::cublasCtrsm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
|
|
CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb);
|
|
}
|
|
|
|
bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
|
|
blas::UpperLower uplo, blas::Transpose transa,
|
|
blas::Diagonal diag, uint64 m, uint64 n,
|
|
std::complex<double> alpha,
|
|
const DeviceMemory<std::complex<double>> &a, int lda,
|
|
DeviceMemory<std::complex<double>> *b, int ldb) {
|
|
return DoBlasInternal(
|
|
dynload::cublasZtrsm, stream, true /* = pointer_mode_host */,
|
|
CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
|
|
CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
|
|
CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb);
|
|
}
|
|
|
|
} // namespace cuda
|
|
|
|
namespace gpu = ::perftools::gputools;
|
|
|
|
void initialize_cublas() {
|
|
gpu::port::Status status =
|
|
gpu::PluginRegistry::Instance()
|
|
->RegisterFactory<gpu::PluginRegistry::BlasFactory>(
|
|
gpu::cuda::kCudaPlatformId, gpu::cuda::kCuBlasPlugin, "cuBLAS",
|
|
[](gpu::internal::StreamExecutorInterface
|
|
*parent) -> gpu::blas::BlasSupport * {
|
|
gpu::cuda::CUDAExecutor *cuda_executor =
|
|
dynamic_cast<gpu::cuda::CUDAExecutor *>(parent);
|
|
if (cuda_executor == nullptr) {
|
|
LOG(ERROR)
|
|
<< "Attempting to initialize an instance of the cuBLAS "
|
|
<< "support library with a non-CUDA StreamExecutor";
|
|
return nullptr;
|
|
}
|
|
|
|
gpu::cuda::CUDABlas *blas =
|
|
new gpu::cuda::CUDABlas(cuda_executor);
|
|
if (!blas->Init()) {
|
|
// Note: Init() will log a more specific error.
|
|
delete blas;
|
|
return nullptr;
|
|
}
|
|
return blas;
|
|
});
|
|
|
|
if (!status.ok()) {
|
|
LOG(ERROR) << "Unable to register cuBLAS factory: "
|
|
<< status.error_message();
|
|
}
|
|
|
|
// Prime the cuBLAS DSO. The loader will log more information.
|
|
auto statusor = gpu::internal::CachedDsoLoader::GetCublasDsoHandle();
|
|
if (!statusor.ok()) {
|
|
LOG(INFO) << "Unable to load cuBLAS DSO.";
|
|
}
|
|
|
|
gpu::PluginRegistry::Instance()->SetDefaultFactory(gpu::cuda::kCudaPlatformId,
|
|
gpu::PluginKind::kBlas,
|
|
gpu::cuda::kCuBlasPlugin);
|
|
}
|
|
|
|
} // namespace gputools
|
|
} // namespace perftools
|
|
|
|
REGISTER_MODULE_INITIALIZER(register_cublas,
|
|
{ perftools::gputools::initialize_cublas(); });
|