Internal change

PiperOrigin-RevId: 317230321
Change-Id: I043dae37768f6e9cf946d4db2a8c36123ed2d6d9
A. Unique TensorFlower 2020-06-18 19:17:40 -07:00 committed by TensorFlower Gardener
parent 4a14e778d6
commit 9c4b749b09
10 changed files with 172 additions and 336 deletions

View File

@@ -938,13 +938,6 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "tf32_utils",
srcs = ["tf32_utils.cc"],
hdrs = ["tf32_utils.h"],
copts = tf_copts(),
)
tf_cc_tests(
name = "low_level_library_tests",
size = "small",

View File

@@ -1,30 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/platform/tf32_utils.h"
#include <atomic>
namespace tensorflow {
// Whether TensorFloat-32 should be used where supported.
// TODO(nluehr): Maybe enable by default after TF32 Ampere testing.
static std::atomic<bool> tf32_allowed{false};
void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; }
bool tf32_execution_allowed() { return tf32_allowed; }
} // namespace tensorflow
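The deleted tensorflow/core/platform/tf32_utils.cc is essentially a process-wide atomic toggle that call sites query before picking a math mode. A minimal standalone sketch of that pattern follows; RunGemm is a hypothetical call site for illustration, not part of the TensorFlow sources.

#include <atomic>
#include <cstdio>

namespace demo {

// Process-wide flag, default "TF32 off", mirroring the deleted file.
static std::atomic<bool> tf32_allowed{false};

void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; }
bool tf32_execution_allowed() { return tf32_allowed; }

// Hypothetical call site: consult the flag when choosing between a plain
// FP32 kernel and a TF32 tensor-op kernel.
void RunGemm() {
  if (tf32_execution_allowed()) {
    std::puts("dispatching TF32 tensor-op kernel");
  } else {
    std::puts("dispatching plain FP32 kernel");
  }
}

}  // namespace demo

int main() {
  demo::RunGemm();                   // default: FP32 path
  demo::allow_tf32_execution(true);
  demo::RunGemm();                   // TF32 path
}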

View File

@@ -1,27 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
#define TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
namespace tensorflow {
void allow_tf32_execution(bool allowed);
bool tf32_execution_allowed();
} // namespace tensorflow
#endif // TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_

View File

@@ -788,16 +788,6 @@ tf_python_pybind_extension(
],
)
tf_python_pybind_extension(
name = "_pywrap_tf32_execution",
srcs = ["util/tf32.cc"],
module_name = "_pywrap_tf32_execution",
deps = [
"//tensorflow/core/platform:tf32_utils",
"@pybind11",
],
)
tf_python_pybind_extension(
name = "_pywrap_util_port",
srcs = ["util/port_wrapper.cc"],
@@ -5688,7 +5678,6 @@ py_library(
"//tensorflow:composite_tensor_whitelist",
],
deps = [
":_pywrap_tf32_execution",
":tf_decorator",
":tf_export",
":tf_stack",

View File

@@ -18,42 +18,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python import _pywrap_tf32_execution
from tensorflow.python.eager import context
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export
# No tf_export until TF is built against CUDA 11 which is required for TF32.
def tensor_float_32_execution_allowed():
"""Get if TensorFloat-32 operations are enabled on supported hardware.
Returns:
True if TensorFloat-32 execution is enabled and False otherwise.
"""
return _pywrap_tf32_execution.is_allowed()
# No tf_export until TF is built against CUDA 11 which is required for TF32.
def allow_tensor_float_32_execution(allowed):
"""Allow use of TensorFloat-32 with float32 ops on supported hardware.
TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture.
TensorFloat-32 kernels take float32 inputs and produce float32 outputs.
Internally, the inputs are cast to a custom representation with 10-bit
mantissa (similar to float16) and 8-bit exponent (similar to float32) and are
executed using TensorCores with float32 accumulation. For more information,
see https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/.
TensorFloat-32 execution is disabled by default, but this may change in a
future version.
Args:
allowed: whether to allow TensorFloat-32 execution
"""
_pywrap_tf32_execution.allow(allowed)
@tf_export('config.threading.get_intra_op_parallelism_threads')
def get_intra_op_parallelism_threads():
"""Get number of threads used within an individual op for parallelism.

View File

@@ -1,22 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "pybind11/pybind11.h"
#include "tensorflow/core/platform/tf32_utils.h"
PYBIND11_MODULE(_pywrap_tf32_execution, m) {
m.def("allow", &tensorflow::allow_tf32_execution);
m.def("is_allowed", &tensorflow::tf32_execution_allowed);
}

View File

@@ -251,7 +251,6 @@ cc_library(
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core/platform:tf32_utils",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:host_or_device_scalar",
@@ -357,7 +356,6 @@ cc_library(
"@local_config_cuda//cuda:cudnn_header",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core/platform:tf32_utils",
"//tensorflow/stream_executor:dnn",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:plugin_registry",

View File

@@ -49,7 +49,6 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/platform/tf32_utils.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
@@ -102,6 +101,18 @@ static std::string ToString(cublasStatus_t status) {
}
}
// Decide whether to enable TENSOR_OP_MATH
static bool TensorOpMathEnabled() {
static bool is_enabled = [] {
bool is_disabled;
TF_CHECK_OK(
tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
/*default_val=*/false, &is_disabled));
return !is_disabled;
}();
return is_enabled;
}
// cuBLAS has interfaces that permit pointers to be passed from either the host
// memory space or the device memory space; however, you must instruct it as to
// which address space those pointers are in with cublasSetPointerMode.
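TensorOpMathEnabled above reads TF_DISABLE_CUBLAS_TENSOR_OP_MATH once and caches the result in a function-local static. A standalone sketch of the same init-once gate follows, with std::getenv standing in for tensorflow::ReadBoolFromEnvVar so it compiles outside the TensorFlow tree; the accepted spellings of "true" here are an assumption.

#include <cstdlib>
#include <cstring>

static bool CublasTensorOpMathEnabled() {
  // The lambda runs exactly once per process; C++11 guarantees thread-safe
  // initialization of the function-local static.
  static const bool is_enabled = [] {
    const char* v = std::getenv("TF_DISABLE_CUBLAS_TENSOR_OP_MATH");
    const bool is_disabled =
        v != nullptr && (std::strcmp(v, "1") == 0 || std::strcmp(v, "true") == 0);
    return !is_disabled;
  }();
  return is_enabled;
}

int main() { return CublasTensorOpMathEnabled() ? 0 : 1; }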
@@ -226,19 +237,6 @@ bool CUDABlas::Init() {
return false;
}
absl::MutexLock lock(&mu_);
#if CUDA_VERSION >= 9000
#if CUBLAS_VER_MAJOR >= 11
ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH);
#else
ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH);
#endif
if (ret != CUBLAS_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret);
return false;
}
#endif
return true;
}
@@ -401,7 +399,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
template <typename FuncT, typename... Args>
bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
bool pointer_mode_host, bool err_on_failure,
Args... args) {
bool use_tensor_op_math, Args... args) {
absl::MutexLock lock(&mu_);
CHECK(blas_ != nullptr);
@@ -415,10 +413,10 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
: CUBLAS_POINTER_MODE_DEVICE)) {
return false;
}
#if CUBLAS_VER_MAJOR >= 11
#if CUDA_VERSION >= 9000
ScopedCublasMathMode math_mode{blas_};
if (!tensorflow::tf32_execution_allowed()) {
if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) {
if (use_tensor_op_math) {
if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
return false;
}
}
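ScopedCublasMathMode applies the requested math mode only for the duration of one DoBlasInternalImpl call and restores the previous mode when it leaves scope. A generic sketch of that RAII pattern with stand-in types (MathMode and g_math_mode are illustrative, not cuBLAS state):

#include <cstdio>

enum class MathMode { kDefault, kTensorOp };

static MathMode g_math_mode = MathMode::kDefault;  // stands in for the handle's mode

class ScopedMathMode {
 public:
  explicit ScopedMathMode(MathMode new_mode) : old_mode_(g_math_mode) {
    g_math_mode = new_mode;
  }
  ~ScopedMathMode() { g_math_mode = old_mode_; }  // restore on scope exit

 private:
  MathMode old_mode_;
};

void DoGemm(bool use_tensor_op_math) {
  // Switch the mode only for this call, as DoBlasInternalImpl does.
  ScopedMathMode scoped(use_tensor_op_math ? MathMode::kTensorOp
                                           : MathMode::kDefault);
  std::printf("running with %s math\n",
              g_math_mode == MathMode::kTensorOp ? "tensor-op" : "default");
}  // previous mode restored here

int main() {
  DoGemm(true);
  DoGemm(false);
}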
@@ -1635,9 +1633,21 @@ bool CUDABlas::DoBlasGemm(
}
}
bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
int cc_major, cc_minor;
stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
&cc_minor);
// GPUs < sm_70 don't support tensor ops.
if (cc_major >= 7 && TensorOpMathEnabled()) {
use_tensor_ops = true;
}
#endif
return DoBlasInternalImpl(
cublasSgemmEx, stream, true /* = pointer_mode_host */,
true /* = err_on_failure= */, CUDABlasTranspose(transa),
true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
@@ -1911,7 +1921,8 @@ static bool TensorOpsAvailable(int cc_major) {
// strictly correct. We can't simply enable it, though, as that would change
// clients' behavior significantly: Using tensor ops on fp32 inputs cause them
// to be rounded to fp16.
if (cc_major >= 7 && std::is_same<InType, Eigen::half>::value) {
if (cc_major >= 7 && TensorOpMathEnabled() &&
std::is_same<InType, Eigen::half>::value) {
return true;
}
#endif
@@ -2259,8 +2270,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
&cc_major, &cc_minor) &&
cc_major >= 5) {
bool use_tensor_ops =
data_type == CUDA_R_16F || tensorflow::tf32_execution_allowed();
bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
cublasGemmAlgo_t algo =
(use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
cudaDataType_t compute_type =
@@ -2274,7 +2284,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
bool ok;
ok = DoBlasInternalImpl(
AS_LAMBDA(cublasGemmBatchedEx), stream, true /* = pointer_mode_host */,
true /* = err_on_failure */, CUDABlasTranspose(transa),
true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
batch_count, compute_type, algo);
@@ -2409,25 +2419,33 @@ bool CUDABlas::DoBlasGemmStridedBatched(
int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
int64 stride_c, int batch_count) {
#if CUDA_VERSION >= 9010
bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
int cc_major, cc_minor;
if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
&cc_major, &cc_minor) &&
cc_major >= 5) {
cublasGemmAlgo_t algo =
(cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
bool ok = DoBlasInternalImpl(
AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
true /* = pointer_mode_host */, true /* = err_on_failure */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb,
stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c,
batch_count, CUDA_R_32F, algo);
if (ok) {
return true;
&cc_major, &cc_minor)) {
// GPUs < sm_70 don't support tensor ops.
if (cc_major >= 7 && TensorOpMathEnabled()) {
use_tensor_ops = true;
}
LOG(ERROR) << "failed BLAS call, see log for details";
return false;
#if CUDA_VERSION >= 9010
if (cc_major >= 5) {
cublasGemmAlgo_t algo =
(use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
bool ok = DoBlasInternalImpl(
AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
true /* = pointer_mode_host */, true /* = err_on_failure */,
use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb),
m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a,
GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c),
CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo);
if (ok) {
return true;
}
LOG(ERROR) << "failed BLAS call, see log for details";
return false;
}
#endif
}
#endif
// Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
@@ -2440,7 +2458,7 @@ bool CUDABlas::DoBlasGemmStridedBatched(
reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
bool ok = DoBlasInternalImpl(
cublasSgemmEx, stream, true /* = pointer_mode_host */,
true /* = err_on_failure= */, CUDABlasTranspose(transa),
true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF,
lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix,
SE_CUDA_DATA_HALF, ldc);
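Each cuBLAS call site in this file re-derives use_tensor_ops from the env-var gate plus a compute-capability or data-type check. A hypothetical helper that condenses that decision is sketched below; ShouldUseTensorOps and DataType are illustrative names, and the exact conditions vary slightly between call sites in the patch.

#include <cstdlib>

enum class DataType { kHalf, kFloat };

// Stand-in for the TF_DISABLE_CUBLAS_TENSOR_OP_MATH gate shown earlier.
static bool TensorOpMathEnabled() {
  return std::getenv("TF_DISABLE_CUBLAS_TENSOR_OP_MATH") == nullptr;
}

// Tensor ops require the env-var gate, an sm_70+ GPU, and (absent an
// explicit algorithm choice) fp16 data.
static bool ShouldUseTensorOps(int cc_major, DataType type) {
  return TensorOpMathEnabled() && cc_major >= 7 && type == DataType::kHalf;
}

int main() {
  return ShouldUseTensorOps(/*cc_major=*/7, DataType::kHalf) ? 0 : 1;
}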

View File

@@ -83,7 +83,7 @@ class CUDABlas : public blas::BlasSupport {
template <typename FuncT, typename... Args>
bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
bool pointer_mode_host, bool err_on_failure,
Args... args);
bool use_tensor_op_math, Args... args);
// Convenience functions that call DoBlasInternalImpl with different values
// for err_on_failure.
@@ -91,7 +91,8 @@ class CUDABlas : public blas::BlasSupport {
bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host,
Args... args) {
return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
/*err_on_failure=*/true, args...);
/*err_on_failure=*/true, /*use_tensor_ops=*/false,
args...);
}
template <typename FuncT, typename... Args>
bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream,
@@ -99,7 +100,8 @@ class CUDABlas : public blas::BlasSupport {
// Tensor ops are hard-coded off in this path, but can still be enabled with
// a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl().
return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
/*err_on_failure=*/false, args...);
/*err_on_failure=*/false,
/*use_tensor_ops=*/false, args...);
}
// A helper function to implement DoBlasGemmBatched interfaces for generic
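DoBlasInternal and DoBlasInternalFailureOK are thin variadic wrappers: they forward their arguments to DoBlasInternalImpl and pin err_on_failure and the new use_tensor_ops flag to fixed values. A generic sketch of that forwarding pattern (Call, CallImpl, and the fake GEMM are stand-ins, not the StreamExecutor API):

#include <cstdio>
#include <utility>

template <typename Func, typename... Args>
bool CallImpl(Func func, bool err_on_failure, bool use_tensor_ops,
              Args&&... args) {
  if (use_tensor_ops) {
    // A real implementation would switch the math mode for this call.
  }
  const bool ok = func(std::forward<Args>(args)...);
  if (!ok && err_on_failure) std::fputs("call failed\n", stderr);
  return ok;
}

template <typename Func, typename... Args>
bool Call(Func func, Args&&... args) {
  // Defaults injected here, as DoBlasInternal does for DoBlasInternalImpl.
  return CallImpl(func, /*err_on_failure=*/true, /*use_tensor_ops=*/false,
                  std::forward<Args>(args)...);
}

int main() {
  auto fake_gemm = [](int m, int n) { return m > 0 && n > 0; };
  return Call(fake_gemm, 128, 256) ? 0 : 1;
}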

View File

@@ -22,7 +22,6 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/tf32_utils.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -602,6 +601,31 @@ class CudnnFilterDescriptor {
SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor);
};
// A helper function to decide whether to enable the TENSOR_OP_MATH math type
bool TensorOpMathEnabled() {
static bool is_enabled = [] {
bool is_disabled = false;
TF_CHECK_OK(
tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH",
/*default_val=*/false, &is_disabled));
return !is_disabled;
}();
return is_enabled;
}
// A helper function to decide whether to enable the TENSOR_OP_MATH math type
// for RNNs.
bool RnnTensorOpMathEnabled() {
static bool is_enabled = [] {
bool is_disabled = false;
TF_CHECK_OK(
tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH",
/*default_val=*/false, &is_disabled));
return !is_disabled;
}();
return is_enabled;
}
// A helper function to decide whether to use
// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
@@ -706,6 +730,10 @@ class CudnnConvolutionDescriptor {
: CUDNN_CROSS_CORRELATION,
data_type));
// NOTE(benbarsdell): This only applies if tensor op math is enabled
// and algo selection is set to Default.
this->set_use_tensor_op_math(true);
#if CUDNN_MAJOR >= 7
VLOG(2) << "Requesting grouped convolution: "
<< convolution_descriptor.group_count();
@@ -717,15 +745,13 @@
#endif
}
void set_use_tensor_op_math(bool use_tensor_op_math) {
void set_use_tensor_op_math(bool use_tensor_op_math) const {
#if CUDNN_VERSION >= 7000
cudnnMathType_t math_type =
#if CUDNN_VERSION >= 8000
(use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH);
#else
(use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
#endif
CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
if (TensorOpMathEnabled()) {
CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
}
#endif
}
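Condensed, the choice made in set_use_tensor_op_math above is: request CUDNN_TENSOR_OP_MATH when tensor ops are wanted, otherwise the "no tensor ops" mode (renamed to FMA math in cuDNN 8), and only apply the setting when the TF_DISABLE_CUDNN_TENSOR_OP_MATH gate allows it. A stand-alone sketch with a placeholder enum and version constant instead of the cuDNN headers:

#include <cstdlib>

enum MathType { kDefaultMath, kFmaMath, kTensorOpMath };
constexpr int kCudnnVersion = 8000;  // placeholder for CUDNN_VERSION

static bool CudnnTensorOpMathEnabled() {
  return std::getenv("TF_DISABLE_CUDNN_TENSOR_OP_MATH") == nullptr;
}

// Returns the math type to request; *should_apply says whether the env-var
// gate permits applying it to the convolution descriptor.
MathType ChooseConvMathType(bool use_tensor_op_math, bool* should_apply) {
  *should_apply = CudnnTensorOpMathEnabled();
  if (use_tensor_op_math) return kTensorOpMath;
  return kCudnnVersion >= 8000 ? kFmaMath : kDefaultMath;
}

int main() {
  bool apply = false;
  MathType mt = ChooseConvMathType(/*use_tensor_op_math=*/true, &apply);
  return (apply && mt == kTensorOpMath) ? 0 : 1;
}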
@@ -737,40 +763,6 @@
SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor);
};
// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
// set
static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
cudnnMathType_t math_type;
CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
#if CUDNN_VERSION >= 8000
return math_type != CUDNN_FMA_MATH;
#else
return math_type == CUDNN_TENSOR_OP_MATH;
#endif
}
static bool TensorOpMathAvailable(int cc_major) {
return cc_major >= 7 && CUDNN_VERSION >= 7000;
}
static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) {
int cc_major, cc_minor;
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
if (!TensorOpMathAvailable(cc_major)) {
return false;
}
if (input_type == dnn::DataType::kFloat) {
#if CUDNN_VERSION < 8000
return false;
#else
if (!tensorflow::tf32_execution_allowed()) {
return false;
}
#endif
}
return true;
}
// Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle
// within a scope.
class CudnnPoolingDescriptor {
@@ -1163,31 +1155,21 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
// in profile mode, which is run with algorithms returned from
// GetRnnAlgorithms() (which are non-default and explicitly set whether to
// use tensor ops). CuDNN 7.2.1 fixed this issue
bool allow_tensor_ops =
data_type != CUDNN_DATA_FLOAT || tensorflow::tf32_execution_allowed();
bool use_tensor_ops;
if (algorithm_config.algorithm().has_value()) {
use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled();
} else {
use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops;
}
if (use_tensor_ops && !allow_tensor_ops) {
return port::Status(port::error::INVALID_ARGUMENT,
"Algo requests disallowed tensor op evaluation.");
}
cudnnMathType_t math_type;
if (use_tensor_ops) {
math_type = CUDNN_TENSOR_OP_MATH;
} else {
#if CUDNN_VERSION >= 8000
math_type = CUDNN_FMA_MATH;
if (RnnTensorOpMathEnabled()) {
cudnnMathType_t math_type;
if (algorithm_config.algorithm().has_value()) {
math_type = algorithm_config.algorithm()->tensor_ops_enabled()
? CUDNN_TENSOR_OP_MATH
: CUDNN_DEFAULT_MATH;
} else {
#if CUDNN_VERSION >= 7201
math_type = CUDNN_TENSOR_OP_MATH;
#else
math_type = CUDNN_DEFAULT_MATH;
#endif // CUDNN_VERSION >= 8000
math_type = CUDNN_DEFAULT_MATH;
#endif // CUDNN_VERSION >= 7201
}
CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
}
CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
#endif // CUDNN_VERSION >= 7000
return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan),
@@ -2578,11 +2560,10 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
const CudnnTensorDescriptor& output_nd,
const dnn::AlgorithmDesc& algorithm_desc,
ScratchAllocator* scratch_allocator) {
if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
return port::Status(
port::error::INTERNAL,
"Mismatch between cudnn conv and algorithm descriptors.");
}
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2622,11 +2603,10 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
const CudnnTensorDescriptor& output_nd,
const dnn::AlgorithmDesc& algorithm_desc,
ScratchAllocator* scratch_allocator) {
if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
return port::Status(
port::error::INTERNAL,
"Mismatch between cudnn conv and algorithm descriptors.");
}
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2668,11 +2648,10 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
const CudnnTensorDescriptor& output_nd,
const dnn::AlgorithmDesc& algorithm_desc,
ScratchAllocator* scratch_allocator) {
if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
return port::Status(
port::error::INTERNAL,
"Mismatch between cudnn conv and algorithm descriptors.");
}
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2706,42 +2685,18 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
return scratch_allocator->AllocateBytes(size_in_bytes);
}
port::StatusOr<bool> UseTensorOps(Stream* stream, dnn::DataType type,
absl::optional<dnn::AlgorithmDesc> desc) {
bool use_tensor_ops;
if (desc.has_value()) {
use_tensor_ops = desc->tensor_ops_enabled();
if (use_tensor_ops && !IsTensorMathAllowed(stream, type)) {
return port::Status(port::error::INVALID_ARGUMENT,
"Algo requests disallowed tensor op evaluation.");
}
} else {
use_tensor_ops = IsTensorMathAllowed(stream, type);
}
return use_tensor_ops;
static bool TensorOpMathAvailable(int cc_major) {
return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled();
}
cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
dnn::DataType GetConvAccumulatorType(dnn::DataType data_type);
port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmConfig& algorithm_config,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
dnn::DataType element_type,
const dnn::ConvolutionDescriptor& convolution_descriptor,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
DeviceMemory<uint8>* scratch) {
absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
CudnnConvolutionDescriptor conv(
convolution_descriptor,
ToCudnnDataType(GetConvAccumulatorType(element_type)));
bool use_tensor_ops;
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
if (!algo_desc.has_value()) {
// Pick fastest algorithm within memory limit according to cuDNN's
// heuristics.
@@ -2754,7 +2709,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
GetCudnnConvolutionForwardAlgo(
cudnn, input_nd, filter, conv, output_nd,
specify_workspace_limit, memory_limit_bytes));
algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
int cc_major, cc_minor;
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
algo_desc = dnn::AlgorithmDesc(
algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
}
const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
@@ -2778,9 +2736,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
"Returned status: ", scratch_or.status().ToString()));
}
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace(
stream, cudnn, input_nd, filter, conv,
output_nd, *algo_desc, scratch_allocator));
@@ -2791,19 +2746,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmConfig& algorithm_config,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
dnn::DataType element_type,
const dnn::ConvolutionDescriptor& convolution_descriptor,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
DeviceMemory<uint8>* scratch) {
absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
CudnnConvolutionDescriptor conv(
convolution_descriptor,
ToCudnnDataType(GetConvAccumulatorType(element_type)));
bool use_tensor_ops;
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
if (!algo_desc.has_value()) {
// Pick fastest algorithm within memory limit according to cuDNN's
// heuristics.
@@ -2816,7 +2762,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
GetCudnnConvolutionBackwardDataAlgo(
cudnn, input_nd, filter, conv, output_nd,
specify_workspace_limit, memory_limit_bytes));
algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
int cc_major, cc_minor;
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
algo_desc = dnn::AlgorithmDesc(
algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
}
const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
@@ -2839,9 +2788,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
"while a secondary algorithm is not provided.");
}
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
stream, cudnn, input_nd, filter, conv,
output_nd, *algo_desc, scratch_allocator));
@@ -2852,19 +2798,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmConfig& algorithm_config,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
dnn::DataType element_type,
const dnn::ConvolutionDescriptor& convolution_descriptor,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
DeviceMemory<uint8>* scratch) {
absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
CudnnConvolutionDescriptor conv(
convolution_descriptor,
ToCudnnDataType(GetConvAccumulatorType(element_type)));
bool use_tensor_ops;
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
if (!algo_desc.has_value()) {
// Pick fastest algorithm within memory limit according to cuDNN's
// heuristics.
@@ -2877,7 +2814,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
GetCudnnConvolutionBackwardFilterAlgo(
cudnn, input_nd, filter, conv, output_nd,
specify_workspace_limit, memory_limit_bytes));
algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
int cc_major, cc_minor;
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
algo_desc = dnn::AlgorithmDesc(
algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
}
auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(
@@ -2900,9 +2840,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
"while a secondary algorithm is not provided.");
}
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace(
stream, cudnn, input_nd, filter, conv,
output_nd, *algo_desc, scratch_allocator));
@@ -3067,32 +3004,35 @@ port::Status CudnnSupport::DoPrepareForConvolution(
CudnnTensorDescriptor output_nd(
output_descriptor,
ToCudnnDataType(element_type, output_descriptor.layout()));
CudnnConvolutionDescriptor conv(
convolution_descriptor,
ToCudnnDataType(GetConvAccumulatorType(element_type)));
auto cudnn = cudnn_->GetHandle(parent_, stream);
switch (kind) {
case dnn::ConvolutionKind::FORWARD: {
SE_ASSIGN_OR_RETURN(*algorithm_desc,
GetCudnnConvolutionForwardAlgorithm(
stream, cudnn, algorithm_config, input_nd,
filter_nd, element_type, convolution_descriptor,
output_nd, scratch_allocator, scratch_memory));
SE_ASSIGN_OR_RETURN(
*algorithm_desc,
GetCudnnConvolutionForwardAlgorithm(
stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
output_nd, scratch_allocator, scratch_memory));
break;
}
case dnn::ConvolutionKind::BACKWARD_DATA: {
SE_ASSIGN_OR_RETURN(*algorithm_desc,
GetCudnnConvolutionBackwardDataAlgorithm(
stream, cudnn, algorithm_config, input_nd,
filter_nd, element_type, convolution_descriptor,
output_nd, scratch_allocator, scratch_memory));
SE_ASSIGN_OR_RETURN(
*algorithm_desc,
GetCudnnConvolutionBackwardDataAlgorithm(
stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
output_nd, scratch_allocator, scratch_memory));
break;
}
case dnn::ConvolutionKind::BACKWARD_FILTER: {
SE_ASSIGN_OR_RETURN(*algorithm_desc,
GetCudnnConvolutionBackwardFilterAlgorithm(
stream, cudnn, algorithm_config, input_nd,
filter_nd, element_type, convolution_descriptor,
output_nd, scratch_allocator, scratch_memory));
SE_ASSIGN_OR_RETURN(
*algorithm_desc,
GetCudnnConvolutionBackwardFilterAlgorithm(
stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
output_nd, scratch_allocator, scratch_memory));
break;
}
default:
@@ -3121,9 +3061,8 @@ port::Status CudnnSupport::DoConvolve(
auto accumulator_type = GetConvAccumulatorType(element_type);
CudnnConvolutionDescriptor conv(convolution_descriptor,
ToCudnnDataType(accumulator_type));
SE_ASSIGN_OR_RETURN(bool use_tensor_ops,
UseTensorOps(stream, element_type, algorithm_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
// Set use_tensor_math param to correct value
conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
auto cudnn = cudnn_->GetHandle(parent_, stream);
// Alpha is the scaling factor for input.
@@ -3356,6 +3295,14 @@ port::Status CudnnSupport::DoConvolve(
return port::Status::OK();
}
// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
// set
static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
cudnnMathType_t math_type;
CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
return math_type == CUDNN_TENSOR_OP_MATH;
}
template <typename ElementType, typename BiasType, typename ScaleType,
typename OutputType>
port::Status CudnnSupport::DoFusedConvolveImpl(
@@ -3389,6 +3336,8 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
filter_descriptor,
GetCudnnDataType<ElementType>(conv_input_descriptor.layout()));
CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType<BiasType>());
CudnnConvolutionDescriptor conv(convolution_descriptor,
ToCudnnDataType(accumulator_type));
auto cudnn = cudnn_->GetHandle(parent_, stream);
@@ -3398,14 +3347,9 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
SE_ASSIGN_OR_RETURN(
dnn::AlgorithmDesc algo_desc,
GetCudnnConvolutionForwardAlgorithm(
stream, cudnn, algorithm_config, conv_input_nd, filter,
dnn::ToDataType<ElementType>::value, convolution_descriptor,
stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
output_nd, scratch_allocator, &scratch));
CudnnConvolutionDescriptor conv(convolution_descriptor,
ToCudnnDataType(accumulator_type));
conv.set_use_tensor_op_math(algo_desc.tensor_ops_enabled());
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
if (is_profiling) {
timer.reset(new GpuTimer(parent_)); // NOLINT
@@ -3536,7 +3480,9 @@ bool CudnnSupport::GetRnnAlgorithms(
for (auto i : algo_types) {
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
#if CUDNN_VERSION >= 7100
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
if (RnnTensorOpMathEnabled()) {
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
}
#endif
}
return true;
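The GetRnnAlgorithms change above advertises each RNN algorithm without tensor ops and, only when the TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH gate allows, a second variant with tensor ops enabled. A stand-alone sketch of that enumeration (AlgorithmDesc is a stand-in pair, and the CUDNN_VERSION >= 7100 guard is omitted):

#include <cstdlib>
#include <utility>
#include <vector>

// Stand-in for dnn::AlgorithmDesc: algorithm id plus tensor-op flag.
using AlgorithmDesc = std::pair<int, bool>;

static bool RnnTensorOpMathEnabled() {
  return std::getenv("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH") == nullptr;
}

std::vector<AlgorithmDesc> EnumerateRnnAlgorithms(
    const std::vector<int>& algo_types) {
  std::vector<AlgorithmDesc> out;
  for (int algo : algo_types) {
    out.push_back({algo, /*use_tensor_ops=*/false});
    if (RnnTensorOpMathEnabled()) {
      out.push_back({algo, /*use_tensor_ops=*/true});
    }
  }
  return out;
}

int main() {
  const auto algos = EnumerateRnnAlgorithms({0, 1, 2});
  // 6 entries when the env var is unset, 3 when tensor ops are disabled.
  return algos.size() >= 3 ? 0 : 1;
}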