Internal change
PiperOrigin-RevId: 317230321
Change-Id: I043dae37768f6e9cf946d4db2a8c36123ed2d6d9

parent 4a14e778d6
commit 9c4b749b09
@@ -938,13 +938,6 @@ cc_library(
    alwayslink = 1,
)

cc_library(
    name = "tf32_utils",
    srcs = ["tf32_utils.cc"],
    hdrs = ["tf32_utils.h"],
    copts = tf_copts(),
)

tf_cc_tests(
    name = "low_level_library_tests",
    size = "small",
@@ -1,30 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/platform/tf32_utils.h"

#include <atomic>

namespace tensorflow {

// Whether TensorFloat-32 should be used where supported.
// TODO(nluehr): Maybe enable by default after TF32 Ampere testing.
static std::atomic<bool> tf32_allowed{false};

void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; }

bool tf32_execution_allowed() { return tf32_allowed; }

}  // namespace tensorflow
@@ -1,27 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
#define TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_

namespace tensorflow {

void allow_tf32_execution(bool allowed);

bool tf32_execution_allowed();

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
@@ -788,16 +788,6 @@ tf_python_pybind_extension(
    ],
)

tf_python_pybind_extension(
    name = "_pywrap_tf32_execution",
    srcs = ["util/tf32.cc"],
    module_name = "_pywrap_tf32_execution",
    deps = [
        "//tensorflow/core/platform:tf32_utils",
        "@pybind11",
    ],
)

tf_python_pybind_extension(
    name = "_pywrap_util_port",
    srcs = ["util/port_wrapper.cc"],

@@ -5688,7 +5678,6 @@ py_library(
        "//tensorflow:composite_tensor_whitelist",
    ],
    deps = [
        ":_pywrap_tf32_execution",
        ":tf_decorator",
        ":tf_export",
        ":tf_stack",
@@ -18,42 +18,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python import _pywrap_tf32_execution
from tensorflow.python.eager import context
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export


# No tf_export until TF is built against CUDA11 which is required for TF32.
def tensor_float_32_execution_allowed():
  """Get if TensorFloat-32 operations are enabled on supported hardware.

  Returns:
    True if TensorFloat-32 execution is enabled and False otherwise.
  """
  return _pywrap_tf32_execution.is_allowed()


# No tf_export until TF is built against CUDA 11 which is required for TF32.
def allow_tensor_float_32_execution(allowed):
  """Allow use of TensorFloat-32 with float32 ops on supported hardware.

  TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture.
  TensorFloat-32 kernels take float32 inputs and produce float32 outputs.
  Internally, the inputs are cast to a custom representation with 10-bit
  mantissa (similar to float16) and 8-bit exponent (similar to float32) and are
  executed using TensorCores with float32 accumulation. For more information,
  see https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/.

  TensorFloat-32 execution is disabled by default, but this may change in a
  future version.

  Args:
    allowed: whether to allow TensorFloat-32 execution
  """
  _pywrap_tf32_execution.allow(allowed)


@tf_export('config.threading.get_intra_op_parallelism_threads')
def get_intra_op_parallelism_threads():
  """Get number of threads used within an individual op for parallelism.
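The docstring removed above describes TensorFloat-32 as float32 data carried with a 10-bit mantissa and an 8-bit exponent. As a rough illustration of that precision (not part of this commit, and simplified to truncation rather than the rounding the hardware performs), the following standalone C++ sketch drops the 13 low mantissa bits of a float32 value:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Illustrative only: keep the 10 highest of float32's 23 mantissa bits,
// mimicking the precision of a TensorFloat-32 input. Real TF32 rounding
// happens inside the GPU's TensorCores.
static float TruncateToTf32Precision(float x) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= ~((1u << 13) - 1);  // clear the low 13 mantissa bits
  std::memcpy(&x, &bits, sizeof(x));
  return x;
}

int main() {
  float a = 1.0f + 1.0f / 1024;  // 1 + 2^-10 survives at 10-bit precision
  float b = 1.0f + 1.0f / 4096;  // 1 + 2^-12 is dropped
  std::printf("%.8f -> %.8f\n", a, TruncateToTf32Precision(a));
  std::printf("%.8f -> %.8f\n", b, TruncateToTf32Precision(b));
  return 0;
}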
@@ -1,22 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "pybind11/pybind11.h"
#include "tensorflow/core/platform/tf32_utils.h"

PYBIND11_MODULE(_pywrap_tf32_execution, m) {
  m.def("allow", &tensorflow::allow_tf32_execution);
  m.def("is_allowed", &tensorflow::tf32_execution_allowed);
}
@@ -251,7 +251,6 @@ cc_library(
        "@local_config_cuda//cuda:cuda_headers",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core/platform:tf32_utils",
        "//tensorflow/stream_executor",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:host_or_device_scalar",

@@ -357,7 +356,6 @@ cc_library(
        "@local_config_cuda//cuda:cudnn_header",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core/platform:tf32_utils",
        "//tensorflow/stream_executor:dnn",
        "//tensorflow/stream_executor:event",
        "//tensorflow/stream_executor:plugin_registry",
@@ -49,7 +49,6 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/platform/tf32_utils.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"

@@ -102,6 +101,18 @@ static std::string ToString(cublasStatus_t status) {
  }
}

// Decide whether to enable TENSOR_OP_MATH
static bool TensorOpMathEnabled() {
  static bool is_enabled = [] {
    bool is_disabled;
    TF_CHECK_OK(
        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
                                       /*default_val=*/false, &is_disabled));
    return !is_disabled;
  }();
  return is_enabled;
}

// cuBLAS has interfaces that permit pointers to be passed from either the host
// memory space or the device memory space; however, you must instruct it as to
// which address space those pointers are in with cublasSetPointerMode.
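TensorOpMathEnabled above reads TF_DISABLE_CUBLAS_TENSOR_OP_MATH once and caches the result in a function-local static. A minimal standalone equivalent of that pattern, using plain std::getenv instead of TensorFlow's ReadBoolFromEnvVar and with deliberately simplified true/false parsing (both simplifications are assumptions of this sketch), could look like:

#include <cstdlib>
#include <cstring>

// Compute the flag once, on first call, from the environment; later calls
// reuse the cached value. Parsing is simplified: anything other than unset,
// "0", or "false" counts as "disabled".
static bool CublasTensorOpMathEnabledSketch() {
  static const bool is_enabled = [] {
    const char* value = std::getenv("TF_DISABLE_CUBLAS_TENSOR_OP_MATH");
    const bool is_disabled = value != nullptr &&
                             std::strcmp(value, "0") != 0 &&
                             std::strcmp(value, "false") != 0;
    return !is_disabled;
  }();
  return is_enabled;
}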
@@ -226,19 +237,6 @@ bool CUDABlas::Init() {
    return false;
  }

  absl::MutexLock lock(&mu_);
#if CUDA_VERSION >= 9000
#if CUBLAS_VER_MAJOR >= 11
  ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH);
#else
  ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH);
#endif
  if (ret != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret);
    return false;
  }
#endif

  return true;
}

@@ -401,7 +399,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
template <typename FuncT, typename... Args>
bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                  bool pointer_mode_host, bool err_on_failure,
                                  Args... args) {
                                  bool use_tensor_op_math, Args... args) {
  absl::MutexLock lock(&mu_);

  CHECK(blas_ != nullptr);

@@ -415,10 +413,10 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                             : CUBLAS_POINTER_MODE_DEVICE)) {
    return false;
  }
#if CUBLAS_VER_MAJOR >= 11
#if CUDA_VERSION >= 9000
  ScopedCublasMathMode math_mode{blas_};
  if (!tensorflow::tf32_execution_allowed()) {
    if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) {
  if (use_tensor_op_math) {
    if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
      return false;
    }
  }
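The ScopedCublasMathMode object used above follows the usual RAII shape: switch the cuBLAS handle to a new math mode, then restore the previous mode when the scope ends. A condensed sketch of that pattern (not the actual stream_executor class, with error handling reduced to a bool) might look like:

#include <cublas_v2.h>

// RAII guard: Init() records the current math mode and switches to new_mode;
// the destructor restores whatever mode was set before.
class ScopedMathModeSketch {
 public:
  explicit ScopedMathModeSketch(cublasHandle_t handle) : handle_(handle) {}

  bool Init(cublasMath_t new_mode) {
    if (cublasGetMathMode(handle_, &old_mode_) != CUBLAS_STATUS_SUCCESS) {
      return false;
    }
    ok_ = cublasSetMathMode(handle_, new_mode) == CUBLAS_STATUS_SUCCESS;
    return ok_;
  }

  ~ScopedMathModeSketch() {
    if (ok_) cublasSetMathMode(handle_, old_mode_);  // restore previous mode
  }

 private:
  cublasHandle_t handle_;
  cublasMath_t old_mode_ = CUBLAS_DEFAULT_MATH;
  bool ok_ = false;
};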
@@ -1635,9 +1633,21 @@ bool CUDABlas::DoBlasGemm(
    }
  }

  bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
  int cc_major, cc_minor;
  stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                   &cc_minor);

  // GPUs < sm_70 don't support tensor ops.
  if (cc_major >= 7 && TensorOpMathEnabled()) {
    use_tensor_ops = true;
  }
#endif

  return DoBlasInternalImpl(
      cublasSgemmEx, stream, true /* = pointer_mode_host */,
      true /* = err_on_failure= */, CUDABlasTranspose(transa),
      true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
      CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
      SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
      GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
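The compute-capability check above is what keeps tensor ops off on pre-Volta GPUs. A standalone version of the same check via the CUDA runtime API (the diff itself goes through stream_executor's GetDeviceDescription() instead) could be written as:

#include <cuda_runtime.h>

// Returns true when the device's compute capability is at least sm_70
// (Volta), the first architecture that exposes TensorCores.
bool DeviceSupportsTensorOpsSketch(int device_ordinal) {
  cudaDeviceProp properties{};
  if (cudaGetDeviceProperties(&properties, device_ordinal) != cudaSuccess) {
    return false;
  }
  return properties.major >= 7;
}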
@@ -1911,7 +1921,8 @@ static bool TensorOpsAvailable(int cc_major) {
  // strictly correct. We can't simply enable it, though, as that would change
  // clients' behavior significantly: Using tensor ops on fp32 inputs cause them
  // to be rounded to fp16.
  if (cc_major >= 7 && std::is_same<InType, Eigen::half>::value) {
  if (cc_major >= 7 && TensorOpMathEnabled() &&
      std::is_same<InType, Eigen::half>::value) {
    return true;
  }
#endif

@@ -2259,8 +2270,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor) &&
      cc_major >= 5) {
    bool use_tensor_ops =
        data_type == CUDA_R_16F || tensorflow::tf32_execution_allowed();
    bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
    cublasGemmAlgo_t algo =
        (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
    cudaDataType_t compute_type =

@@ -2274,7 +2284,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
    bool ok;
    ok = DoBlasInternalImpl(
        AS_LAMBDA(cublasGemmBatchedEx), stream, true /* = pointer_mode_host */,
        true /* = err_on_failure */, CUDABlasTranspose(transa),
        true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
        CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
        b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
        batch_count, compute_type, algo);
@@ -2409,25 +2419,33 @@ bool CUDABlas::DoBlasGemmStridedBatched(
    int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
    int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
    int64 stride_c, int batch_count) {
#if CUDA_VERSION >= 9010
  bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
  int cc_major, cc_minor;
  if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
          &cc_major, &cc_minor) &&
      cc_major >= 5) {
    cublasGemmAlgo_t algo =
        (cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
    bool ok = DoBlasInternalImpl(
        AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
        true /* = pointer_mode_host */, true /* = err_on_failure */,
        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
        GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb,
        stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c,
        batch_count, CUDA_R_32F, algo);
    if (ok) {
      return true;
          &cc_major, &cc_minor)) {
    // GPUs < sm_70 don't support tensor ops.
    if (cc_major >= 7 && TensorOpMathEnabled()) {
      use_tensor_ops = true;
    }
    LOG(ERROR) << "failed BLAS call, see log for details";
    return false;
#if CUDA_VERSION >= 9010
  if (cc_major >= 5) {
    cublasGemmAlgo_t algo =
        (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
    bool ok = DoBlasInternalImpl(
        AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
        true /* = pointer_mode_host */, true /* = err_on_failure */,
        use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb),
        m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a,
        GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c),
        CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo);
    if (ok) {
      return true;
    }
    LOG(ERROR) << "failed BLAS call, see log for details";
    return false;
  }
#endif
  }
#endif
  // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.

@@ -2440,7 +2458,7 @@ bool CUDABlas::DoBlasGemmStridedBatched(
        reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
    bool ok = DoBlasInternalImpl(
        cublasSgemmEx, stream, true /* = pointer_mode_host */,
        true /* = err_on_failure= */, CUDABlasTranspose(transa),
        true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
        CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF,
        lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix,
        SE_CUDA_DATA_HALF, ldc);
@@ -83,7 +83,7 @@ class CUDABlas : public blas::BlasSupport {
  template <typename FuncT, typename... Args>
  bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                          bool pointer_mode_host, bool err_on_failure,
                          Args... args);
                          bool use_tensor_op_math, Args... args);

  // Convenience functions that call DoBlasInternalImpl with different values
  // for err_on_failure.

@@ -91,7 +91,8 @@ class CUDABlas : public blas::BlasSupport {
  bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host,
                      Args... args) {
    return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
                              /*err_on_failure=*/true, args...);
                              /*err_on_failure=*/true, /*use_tensor_ops=*/false,
                              args...);
  }
  template <typename FuncT, typename... Args>
  bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream,

@@ -99,7 +100,8 @@ class CUDABlas : public blas::BlasSupport {
    // Tensor ops are hard-coded off in this path, but can still be enabled with
    // a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl().
    return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
                              /*err_on_failure=*/false, args...);
                              /*err_on_failure=*/false,
                              /*use_tensor_ops=*/false, args...);
  }

  // A helper function to implement DoBlasGemmBatched interfaces for generic
@@ -22,7 +22,6 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/tf32_utils.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"

@@ -602,6 +601,31 @@ class CudnnFilterDescriptor {
  SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor);
};

// A helper function to decide whether to enable the TENSOR_OP_MATH math type
bool TensorOpMathEnabled() {
  static bool is_enabled = [] {
    bool is_disabled = false;
    TF_CHECK_OK(
        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH",
                                       /*default_val=*/false, &is_disabled));
    return !is_disabled;
  }();
  return is_enabled;
}

// A helper function to decide whether to enable the TENSOR_OP_MATH math type
// for RNNs.
bool RnnTensorOpMathEnabled() {
  static bool is_enabled = [] {
    bool is_disabled = false;
    TF_CHECK_OK(
        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH",
                                       /*default_val=*/false, &is_disabled));
    return !is_disabled;
  }();
  return is_enabled;
}

// A helper function to decide whether to use
// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
@@ -706,6 +730,10 @@ class CudnnConvolutionDescriptor {
                               : CUDNN_CROSS_CORRELATION,
        data_type));

    // NOTE(benbarsdell): This only applies if tensor op math is enabled
    // and algo selection is set to Default.
    this->set_use_tensor_op_math(true);

#if CUDNN_MAJOR >= 7
    VLOG(2) << "Requesting grouped convolution: "
            << convolution_descriptor.group_count();

@@ -717,15 +745,13 @@ class CudnnConvolutionDescriptor {
#endif
  }

  void set_use_tensor_op_math(bool use_tensor_op_math) {
  void set_use_tensor_op_math(bool use_tensor_op_math) const {
#if CUDNN_VERSION >= 7000
    cudnnMathType_t math_type =
#if CUDNN_VERSION >= 8000
        (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH);
#else
        (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
#endif
    CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
    if (TensorOpMathEnabled()) {
      CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
    }
#endif
  }
@@ -737,40 +763,6 @@ class CudnnConvolutionDescriptor {
  SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor);
};

// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
// set
static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
  cudnnMathType_t math_type;
  CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
#if CUDNN_VERSION >= 8000
  return math_type != CUDNN_FMA_MATH;
#else
  return math_type == CUDNN_TENSOR_OP_MATH;
#endif
}

static bool TensorOpMathAvailable(int cc_major) {
  return cc_major >= 7 && CUDNN_VERSION >= 7000;
}

static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) {
  int cc_major, cc_minor;
  std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
  if (!TensorOpMathAvailable(cc_major)) {
    return false;
  }
  if (input_type == dnn::DataType::kFloat) {
#if CUDNN_VERSION < 8000
    return false;
#else
    if (!tensorflow::tf32_execution_allowed()) {
      return false;
    }
#endif
  }
  return true;
}

// Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle
// within a scope.
class CudnnPoolingDescriptor {
@@ -1163,31 +1155,21 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
    // in profile mode, which is run with algorithms returned from
    // GetRnnAlgorithms() (which are non-default and explicitly set whether to
    // use tensor ops). CuDNN 7.2.1 fixed this issue
    bool allow_tensor_ops =
        data_type != CUDNN_DATA_FLOAT || tensorflow::tf32_execution_allowed();
    bool use_tensor_ops;
    if (algorithm_config.algorithm().has_value()) {
      use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled();
    } else {
      use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops;
    }

    if (use_tensor_ops && !allow_tensor_ops) {
      return port::Status(port::error::INVALID_ARGUMENT,
                          "Algo requests disallowed tensor op evaluation.");
    }

    cudnnMathType_t math_type;
    if (use_tensor_ops) {
      math_type = CUDNN_TENSOR_OP_MATH;
    } else {
#if CUDNN_VERSION >= 8000
      math_type = CUDNN_FMA_MATH;
    if (RnnTensorOpMathEnabled()) {
      cudnnMathType_t math_type;
      if (algorithm_config.algorithm().has_value()) {
        math_type = algorithm_config.algorithm()->tensor_ops_enabled()
                        ? CUDNN_TENSOR_OP_MATH
                        : CUDNN_DEFAULT_MATH;
      } else {
#if CUDNN_VERSION >= 7201
        math_type = CUDNN_TENSOR_OP_MATH;
#else
        math_type = CUDNN_DEFAULT_MATH;
#endif  // CUDNN_VERSION >= 8000
      math_type = CUDNN_DEFAULT_MATH;
#endif  // CUDNN_VERSION >= 7201
      }
      CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
    }
    CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
#endif  // CUDNN_VERSION >= 7000

    return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan),
@@ -2578,11 +2560,10 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
    const CudnnTensorDescriptor& output_nd,
    const dnn::AlgorithmDesc& algorithm_desc,
    ScratchAllocator* scratch_allocator) {
  if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
    return port::Status(
        port::error::INTERNAL,
        "Mismatch between cudnn conv and algorithm descriptors.");
  }
  // TODO(csigg): This has side effects on the convolution descriptor. It is
  // functionally correct because the convolution is run with the algorithm of
  // the last call to this function, but should be fixed anyway.
  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());

  // Query the size of the workspace and allocate it.
  size_t size_in_bytes;

@@ -2622,11 +2603,10 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
    const CudnnTensorDescriptor& output_nd,
    const dnn::AlgorithmDesc& algorithm_desc,
    ScratchAllocator* scratch_allocator) {
  if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
    return port::Status(
        port::error::INTERNAL,
        "Mismatch between cudnn conv and algorithm descriptors.");
  }
  // TODO(csigg): This has side effects on the convolution descriptor. It is
  // functionally correct because the convolution is run with the algorithm of
  // the last call to this function, but should be fixed anyway.
  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());

  // Query the size of the workspace and allocate it.
  size_t size_in_bytes;

@@ -2668,11 +2648,10 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
    const CudnnTensorDescriptor& output_nd,
    const dnn::AlgorithmDesc& algorithm_desc,
    ScratchAllocator* scratch_allocator) {
  if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
    return port::Status(
        port::error::INTERNAL,
        "Mismatch between cudnn conv and algorithm descriptors.");
  }
  // TODO(csigg): This has side effects on the convolution descriptor. It is
  // functionally correct because the convolution is run with the algorithm of
  // the last call to this function, but should be fixed anyway.
  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());

  // Query the size of the workspace and allocate it.
  size_t size_in_bytes;
@@ -2706,42 +2685,18 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
  return scratch_allocator->AllocateBytes(size_in_bytes);
}

port::StatusOr<bool> UseTensorOps(Stream* stream, dnn::DataType type,
                                  absl::optional<dnn::AlgorithmDesc> desc) {
  bool use_tensor_ops;
  if (desc.has_value()) {
    use_tensor_ops = desc->tensor_ops_enabled();
    if (use_tensor_ops && !IsTensorMathAllowed(stream, type)) {
      return port::Status(port::error::INVALID_ARGUMENT,
                          "Algo requests disallowed tensor op evaluation.");
    }
  } else {
    use_tensor_ops = IsTensorMathAllowed(stream, type);
  }
  return use_tensor_ops;
static bool TensorOpMathAvailable(int cc_major) {
  return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled();
}

cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
dnn::DataType GetConvAccumulatorType(dnn::DataType data_type);

port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
    Stream* stream, const CudnnHandle& cudnn,
    const dnn::AlgorithmConfig& algorithm_config,
    const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
    dnn::DataType element_type,
    const dnn::ConvolutionDescriptor& convolution_descriptor,
    const CudnnConvolutionDescriptor& conv,
    const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
    DeviceMemory<uint8>* scratch) {
  absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();

  CudnnConvolutionDescriptor conv(
      convolution_descriptor,
      ToCudnnDataType(GetConvAccumulatorType(element_type)));
  bool use_tensor_ops;
  SE_ASSIGN_OR_RETURN(use_tensor_ops,
                      UseTensorOps(stream, element_type, algo_desc));
  conv.set_use_tensor_op_math(use_tensor_ops);

  if (!algo_desc.has_value()) {
    // Pick fastest algorithm within memory limit according to cuDNN's
    // heuristics.
|
|||
GetCudnnConvolutionForwardAlgo(
|
||||
cudnn, input_nd, filter, conv, output_nd,
|
||||
specify_workspace_limit, memory_limit_bytes));
|
||||
algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
|
||||
int cc_major, cc_minor;
|
||||
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
|
||||
algo_desc = dnn::AlgorithmDesc(
|
||||
algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
|
||||
}
|
||||
|
||||
const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
|
||||
|
@ -2778,9 +2736,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
|
|||
"Returned status: ", scratch_or.status().ToString()));
|
||||
}
|
||||
|
||||
SE_ASSIGN_OR_RETURN(use_tensor_ops,
|
||||
UseTensorOps(stream, element_type, algo_desc));
|
||||
conv.set_use_tensor_op_math(use_tensor_ops);
|
||||
SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace(
|
||||
stream, cudnn, input_nd, filter, conv,
|
||||
output_nd, *algo_desc, scratch_allocator));
|
||||
|
@ -2791,19 +2746,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
|
|||
Stream* stream, const CudnnHandle& cudnn,
|
||||
const dnn::AlgorithmConfig& algorithm_config,
|
||||
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
|
||||
dnn::DataType element_type,
|
||||
const dnn::ConvolutionDescriptor& convolution_descriptor,
|
||||
const CudnnConvolutionDescriptor& conv,
|
||||
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
|
||||
DeviceMemory<uint8>* scratch) {
|
||||
absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
|
||||
CudnnConvolutionDescriptor conv(
|
||||
convolution_descriptor,
|
||||
ToCudnnDataType(GetConvAccumulatorType(element_type)));
|
||||
bool use_tensor_ops;
|
||||
SE_ASSIGN_OR_RETURN(use_tensor_ops,
|
||||
UseTensorOps(stream, element_type, algo_desc));
|
||||
conv.set_use_tensor_op_math(use_tensor_ops);
|
||||
|
||||
if (!algo_desc.has_value()) {
|
||||
// Pick fastest algorithm within memory limit according to cuDNN's
|
||||
// heuristics.
|
||||
|
@ -2816,7 +2762,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
|
|||
GetCudnnConvolutionBackwardDataAlgo(
|
||||
cudnn, input_nd, filter, conv, output_nd,
|
||||
specify_workspace_limit, memory_limit_bytes));
|
||||
algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
|
||||
int cc_major, cc_minor;
|
||||
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
|
||||
algo_desc = dnn::AlgorithmDesc(
|
||||
algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
|
||||
}
|
||||
|
||||
const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
|
||||
|
@ -2839,9 +2788,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
|
|||
"while a secondary algorithm is not provided.");
|
||||
}
|
||||
|
||||
SE_ASSIGN_OR_RETURN(use_tensor_ops,
|
||||
UseTensorOps(stream, element_type, algo_desc));
|
||||
conv.set_use_tensor_op_math(use_tensor_ops);
|
||||
SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
|
||||
stream, cudnn, input_nd, filter, conv,
|
||||
output_nd, *algo_desc, scratch_allocator));
|
||||
|
@@ -2852,19 +2798,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
    Stream* stream, const CudnnHandle& cudnn,
    const dnn::AlgorithmConfig& algorithm_config,
    const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
    dnn::DataType element_type,
    const dnn::ConvolutionDescriptor& convolution_descriptor,
    const CudnnConvolutionDescriptor& conv,
    const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
    DeviceMemory<uint8>* scratch) {
  absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
  CudnnConvolutionDescriptor conv(
      convolution_descriptor,
      ToCudnnDataType(GetConvAccumulatorType(element_type)));
  bool use_tensor_ops;
  SE_ASSIGN_OR_RETURN(use_tensor_ops,
                      UseTensorOps(stream, element_type, algo_desc));
  conv.set_use_tensor_op_math(use_tensor_ops);

  if (!algo_desc.has_value()) {
    // Pick fastest algorithm within memory limit according to cuDNN's
    // heuristics.

@@ -2877,7 +2814,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
                        GetCudnnConvolutionBackwardFilterAlgo(
                            cudnn, input_nd, filter, conv, output_nd,
                            specify_workspace_limit, memory_limit_bytes));
    algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
    int cc_major, cc_minor;
    std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
    algo_desc = dnn::AlgorithmDesc(
        algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
  }

  auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(

@@ -2900,9 +2840,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
        "while a secondary algorithm is not provided.");
  }

  SE_ASSIGN_OR_RETURN(use_tensor_ops,
                      UseTensorOps(stream, element_type, algo_desc));
  conv.set_use_tensor_op_math(use_tensor_ops);
  SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace(
                                    stream, cudnn, input_nd, filter, conv,
                                    output_nd, *algo_desc, scratch_allocator));
@@ -3067,32 +3004,35 @@ port::Status CudnnSupport::DoPrepareForConvolution(
  CudnnTensorDescriptor output_nd(
      output_descriptor,
      ToCudnnDataType(element_type, output_descriptor.layout()));
  CudnnConvolutionDescriptor conv(
      convolution_descriptor,
      ToCudnnDataType(GetConvAccumulatorType(element_type)));

  auto cudnn = cudnn_->GetHandle(parent_, stream);

  switch (kind) {
    case dnn::ConvolutionKind::FORWARD: {
      SE_ASSIGN_OR_RETURN(*algorithm_desc,
                          GetCudnnConvolutionForwardAlgorithm(
                              stream, cudnn, algorithm_config, input_nd,
                              filter_nd, element_type, convolution_descriptor,
                              output_nd, scratch_allocator, scratch_memory));
      SE_ASSIGN_OR_RETURN(
          *algorithm_desc,
          GetCudnnConvolutionForwardAlgorithm(
              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
              output_nd, scratch_allocator, scratch_memory));
      break;
    }
    case dnn::ConvolutionKind::BACKWARD_DATA: {
      SE_ASSIGN_OR_RETURN(*algorithm_desc,
                          GetCudnnConvolutionBackwardDataAlgorithm(
                              stream, cudnn, algorithm_config, input_nd,
                              filter_nd, element_type, convolution_descriptor,
                              output_nd, scratch_allocator, scratch_memory));
      SE_ASSIGN_OR_RETURN(
          *algorithm_desc,
          GetCudnnConvolutionBackwardDataAlgorithm(
              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
              output_nd, scratch_allocator, scratch_memory));
      break;
    }
    case dnn::ConvolutionKind::BACKWARD_FILTER: {
      SE_ASSIGN_OR_RETURN(*algorithm_desc,
                          GetCudnnConvolutionBackwardFilterAlgorithm(
                              stream, cudnn, algorithm_config, input_nd,
                              filter_nd, element_type, convolution_descriptor,
                              output_nd, scratch_allocator, scratch_memory));
      SE_ASSIGN_OR_RETURN(
          *algorithm_desc,
          GetCudnnConvolutionBackwardFilterAlgorithm(
              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
              output_nd, scratch_allocator, scratch_memory));
      break;
    }
    default:
@@ -3121,9 +3061,8 @@ port::Status CudnnSupport::DoConvolve(
  auto accumulator_type = GetConvAccumulatorType(element_type);
  CudnnConvolutionDescriptor conv(convolution_descriptor,
                                  ToCudnnDataType(accumulator_type));
  SE_ASSIGN_OR_RETURN(bool use_tensor_ops,
                      UseTensorOps(stream, element_type, algorithm_desc));
  conv.set_use_tensor_op_math(use_tensor_ops);
  // Set use_tensor_math param to correct value
  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());

  auto cudnn = cudnn_->GetHandle(parent_, stream);
  // Alpha is the scaling factor for input.

@@ -3356,6 +3295,14 @@ port::Status CudnnSupport::DoConvolve(
  return port::Status::OK();
}

// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
// set
static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
  cudnnMathType_t math_type;
  CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
  return math_type == CUDNN_TENSOR_OP_MATH;
}

template <typename ElementType, typename BiasType, typename ScaleType,
          typename OutputType>
port::Status CudnnSupport::DoFusedConvolveImpl(
@@ -3389,6 +3336,8 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
      filter_descriptor,
      GetCudnnDataType<ElementType>(conv_input_descriptor.layout()));
  CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType<BiasType>());
  CudnnConvolutionDescriptor conv(convolution_descriptor,
                                  ToCudnnDataType(accumulator_type));

  auto cudnn = cudnn_->GetHandle(parent_, stream);

@@ -3398,14 +3347,9 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
  SE_ASSIGN_OR_RETURN(
      dnn::AlgorithmDesc algo_desc,
      GetCudnnConvolutionForwardAlgorithm(
          stream, cudnn, algorithm_config, conv_input_nd, filter,
          dnn::ToDataType<ElementType>::value, convolution_descriptor,
          stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
          output_nd, scratch_allocator, &scratch));

  CudnnConvolutionDescriptor conv(convolution_descriptor,
                                  ToCudnnDataType(accumulator_type));
  conv.set_use_tensor_op_math(algo_desc.tensor_ops_enabled());

  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
  if (is_profiling) {
    timer.reset(new GpuTimer(parent_));  // NOLINT
@@ -3536,7 +3480,9 @@ bool CudnnSupport::GetRnnAlgorithms(
  for (auto i : algo_types) {
    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
#if CUDNN_VERSION >= 7100
    out_algorithms->push_back({i, /*use_tensor_ops=*/true});
    if (RnnTensorOpMathEnabled()) {
      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
    }
#endif
  }
  return true;