From 9c4b749b09b958c436e0681a4276b47fc9316a8a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Jun 2020 19:17:40 -0700
Subject: [PATCH] Internal change

PiperOrigin-RevId: 317230321
Change-Id: I043dae37768f6e9cf946d4db2a8c36123ed2d6d9
---
 tensorflow/core/platform/BUILD               |   7 -
 tensorflow/core/platform/tf32_utils.cc       |  30 --
 tensorflow/core/platform/tf32_utils.h        |  27 --
 tensorflow/python/BUILD                      |  11 -
 tensorflow/python/framework/config.py        |  31 ---
 tensorflow/python/util/tf32.cc               |  22 --
 tensorflow/stream_executor/cuda/BUILD        |   2 -
 tensorflow/stream_executor/cuda/cuda_blas.cc |  98 ++++---
 tensorflow/stream_executor/cuda/cuda_blas.h  |   8 +-
 tensorflow/stream_executor/cuda/cuda_dnn.cc  | 272 ++++++++-----------
 10 files changed, 172 insertions(+), 336 deletions(-)
 delete mode 100644 tensorflow/core/platform/tf32_utils.cc
 delete mode 100644 tensorflow/core/platform/tf32_utils.h
 delete mode 100644 tensorflow/python/util/tf32.cc

diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD
index 33a1e7cfe0a..70bb8a89417 100644
--- a/tensorflow/core/platform/BUILD
+++ b/tensorflow/core/platform/BUILD
@@ -938,13 +938,6 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
-    name = "tf32_utils",
-    srcs = ["tf32_utils.cc"],
-    hdrs = ["tf32_utils.h"],
-    copts = tf_copts(),
-)
-
 tf_cc_tests(
     name = "low_level_library_tests",
     size = "small",
diff --git a/tensorflow/core/platform/tf32_utils.cc b/tensorflow/core/platform/tf32_utils.cc
deleted file mode 100644
index d2f40ea161a..00000000000
--- a/tensorflow/core/platform/tf32_utils.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/platform/tf32_utils.h"
-
-#include <atomic>
-
-namespace tensorflow {
-
-// Whether TensorFloat-32 should be used where supported.
-// TODO(nluehr): Maybe enable by default after TF32 Ampere testing.
-static std::atomic<bool> tf32_allowed{false};
-
-void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; }
-
-bool tf32_execution_allowed() { return tf32_allowed; }
-
-}  // namespace tensorflow
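The deleted tf32_utils.cc is just a process-wide atomic flag with a setter and a getter. A minimal standalone sketch of the same pattern, with illustrative names that are not part of any TensorFlow API:

```cpp
// Process-wide feature flag: lock-free, safe to flip from any thread.
#include <atomic>
#include <iostream>

namespace demo {

static std::atomic<bool> feature_allowed{false};

void allow_feature(bool allowed) { feature_allowed.store(allowed); }
bool feature_allowed_now() { return feature_allowed.load(); }

}  // namespace demo

int main() {
  demo::allow_feature(true);
  std::cout << std::boolalpha << demo::feature_allowed_now() << "\n";  // true
}
```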
diff --git a/tensorflow/core/platform/tf32_utils.h b/tensorflow/core/platform/tf32_utils.h
deleted file mode 100644
index 7a158d00ad3..00000000000
--- a/tensorflow/core/platform/tf32_utils.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
-#define TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
-
-namespace tensorflow {
-
-void allow_tf32_execution(bool allowed);
-
-bool tf32_execution_allowed();
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 5f9e2dfb1ff..de9cf9a24c7 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -788,16 +788,6 @@ tf_python_pybind_extension(
     ],
 )
 
-tf_python_pybind_extension(
-    name = "_pywrap_tf32_execution",
-    srcs = ["util/tf32.cc"],
-    module_name = "_pywrap_tf32_execution",
-    deps = [
-        "//tensorflow/core/platform:tf32_utils",
-        "@pybind11",
-    ],
-)
-
 tf_python_pybind_extension(
     name = "_pywrap_util_port",
     srcs = ["util/port_wrapper.cc"],
@@ -5688,7 +5678,6 @@ py_library(
         "//tensorflow:composite_tensor_whitelist",
     ],
     deps = [
-        ":_pywrap_tf32_execution",
         ":tf_decorator",
         ":tf_export",
         ":tf_stack",
diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py
index 544b6882618..9ff16f2a327 100644
--- a/tensorflow/python/framework/config.py
+++ b/tensorflow/python/framework/config.py
@@ -18,42 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python import _pywrap_tf32_execution
 from tensorflow.python.eager import context
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
-# No tf_export until TF is built against CUDA11 which is required for TF32.
-def tensor_float_32_execution_allowed():
-  """Get if TensorFloat-32 operations are enabled on supported hardware.
-
-  Returns:
-    True if TensorFloat-32 execution is enabled and False otherwise.
-  """
-  return _pywrap_tf32_execution.is_allowed()
-
-
-# No tf_export until TF is built against CUDA 11 which is required for TF32.
-def allow_tensor_float_32_execution(allowed):
-  """Allow use of TensorFloat-32 with float32 ops on supported hardware.
-
-  TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture.
-  TensorFloat-32 kernels take float32 inputs and produce float32 outputs.
-  Internally, the inputs are cast to a custom representation with 10-bit
-  mantissa (similar to float16) and 8-bit exponent (similar to float32) and are
-  executed using TensorCores with float32 accumulation. For more information,
-  see https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/.
-
-  TensorFloat-32 execution is disabled by default, but this may change in a
-  future version.
-
-  Args:
-    allowed: whether to allow TensorFloat-32 execution
-  """
-  _pywrap_tf32_execution.allow(allowed)
-
-
 @tf_export('config.threading.get_intra_op_parallelism_threads')
 def get_intra_op_parallelism_threads():
   """Get number of threads used within an individual op for parallelism.
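The removed docstring describes the TF32 format: float32 inputs keep their 8-bit exponent but are rounded to a 10-bit mantissa before tensor-core execution. A standalone sketch of that rounding using the usual bias-and-mask trick (illustrative only, not TensorFlow code; NaN/inf handling is omitted):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Emulate TF32 precision: round a float's 23-bit mantissa to 10 bits.
float RoundToTf32(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits += 1u << 12;               // add half an ULP of the kept 10-bit field
  bits &= ~((1u << 13) - 1);      // clear the 13 discarded mantissa bits
  float y;
  std::memcpy(&y, &bits, sizeof(y));
  return y;
}

int main() {
  printf("%.9g -> %.9g\n", 0.1f, RoundToTf32(0.1f));  // 0.1 -> ~0.0999755859
}
```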
diff --git a/tensorflow/python/util/tf32.cc b/tensorflow/python/util/tf32.cc
deleted file mode 100644
index 7dece6ccdae..00000000000
--- a/tensorflow/python/util/tf32.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "pybind11/pybind11.h"
-#include "tensorflow/core/platform/tf32_utils.h"
-
-PYBIND11_MODULE(_pywrap_tf32_execution, m) {
-  m.def("allow", &tensorflow::allow_tf32_execution);
-  m.def("is_allowed", &tensorflow::tf32_execution_allowed);
-}
diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
index 3a14be9ad50..c3cf9f5db15 100644
--- a/tensorflow/stream_executor/cuda/BUILD
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -251,7 +251,6 @@ cc_library(
         "@local_config_cuda//cuda:cuda_headers",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/platform:tf32_utils",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:host_or_device_scalar",
@@ -357,7 +356,6 @@ cc_library(
         "@local_config_cuda//cuda:cudnn_header",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/platform:tf32_utils",
         "//tensorflow/stream_executor:dnn",
         "//tensorflow/stream_executor:event",
         "//tensorflow/stream_executor:plugin_registry",
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index fcd0e7b16fb..c9f0fc462c9 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -49,7 +49,6 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "third_party/eigen3/Eigen/Core"
-#include "tensorflow/core/platform/tf32_utils.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
@@ -102,6 +101,18 @@ static std::string ToString(cublasStatus_t status) {
   }
 }
 
+// Decide whether to enable TENSOR_OP_MATH
+static bool TensorOpMathEnabled() {
+  static bool is_enabled = [] {
+    bool is_disabled;
+    TF_CHECK_OK(
+        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
+                                       /*default_val=*/false, &is_disabled));
+    return !is_disabled;
+  }();
+  return is_enabled;
+}
+
 // cuBLAS has interfaces that permit pointers to be passed from either the host
 // memory space or the device memory space; however, you must instruct it as to
 // which address space those pointers are in with cublasSetPointerMode.
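TensorOpMathEnabled() caches its answer in a function-local static that is initialized exactly once, so the environment is read on first use and never again. A standalone sketch of the same gate with std::getenv standing in for tensorflow::ReadBoolFromEnvVar (the variable name here is hypothetical):

```cpp
#include <cstdlib>
#include <cstring>

static bool FeatureEnabledFromEnv() {
  static const bool enabled = [] {
    // Feature is on unless explicitly disabled via the environment.
    const char* v = std::getenv("MY_DISABLE_FEATURE");  // hypothetical name
    const bool disabled = v != nullptr && std::strcmp(v, "1") == 0;
    return !disabled;
  }();  // the lambda runs once; later calls reuse the cached value
  return enabled;
}
```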
@@ -226,19 +237,6 @@ bool CUDABlas::Init() {
     return false;
   }
 
-  absl::MutexLock lock(&mu_);
-#if CUDA_VERSION >= 9000
-#if CUBLAS_VER_MAJOR >= 11
-  ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH);
-#else
-  ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH);
-#endif
-  if (ret != CUBLAS_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret);
-    return false;
-  }
-#endif
-
   return true;
 }
@@ -401,7 +399,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
 template <typename FuncT, typename... Args>
 bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                                   bool pointer_mode_host, bool err_on_failure,
-                                  Args... args) {
+                                  bool use_tensor_op_math, Args... args) {
   absl::MutexLock lock(&mu_);
 
   CHECK(blas_ != nullptr);
@@ -415,10 +413,10 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                           : CUBLAS_POINTER_MODE_DEVICE)) {
     return false;
   }
-#if CUBLAS_VER_MAJOR >= 11
+#if CUDA_VERSION >= 9000
   ScopedCublasMathMode math_mode{blas_};
-  if (!tensorflow::tf32_execution_allowed()) {
-    if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) {
+  if (use_tensor_op_math) {
+    if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
       return false;
     }
   }
@@ -1635,9 +1633,21 @@ bool CUDABlas::DoBlasGemm(
     }
   }
 
+  bool use_tensor_ops = false;
+#if CUDA_VERSION >= 9000
+  int cc_major, cc_minor;
+  stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                   &cc_minor);
+
+  // GPUs < sm_70 don't support tensor ops.
+  if (cc_major >= 7 && TensorOpMathEnabled()) {
+    use_tensor_ops = true;
+  }
+#endif
+
   return DoBlasInternalImpl(
       cublasSgemmEx, stream, true /* = pointer_mode_host */,
-      true /* = err_on_failure= */, CUDABlasTranspose(transa),
+      true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
       CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
       SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
       GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
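The hunk above gates tensor ops on compute capability through StreamExecutor's device description. The same check written directly against the CUDA runtime API looks roughly like this (error checks omitted for brevity):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int major = 0, minor = 0, device = 0;
  cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device);
  cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device);
  // Tensor cores (TENSOR_OP_MATH) require sm_70 (Volta) or newer.
  const bool tensor_ops_supported = major >= 7;
  printf("sm_%d%d, tensor ops %s\n", major, minor,
         tensor_ops_supported ? "supported" : "unsupported");
}
```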
@@ -1911,7 +1921,8 @@ static bool TensorOpsAvailable(int cc_major) {
   // strictly correct.  We can't simply enable it, though, as that would change
   // clients' behavior significantly: Using tensor ops on fp32 inputs cause them
   // to be rounded to fp16.
-  if (cc_major >= 7 && std::is_same<InType, Eigen::half>::value) {
+  if (cc_major >= 7 && TensorOpMathEnabled() &&
+      std::is_same<InType, Eigen::half>::value) {
     return true;
   }
 #endif
@@ -2259,8 +2270,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
           &cc_major, &cc_minor) &&
       cc_major >= 5) {
-    bool use_tensor_ops =
-        data_type == CUDA_R_16F || tensorflow::tf32_execution_allowed();
+    bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
     cublasGemmAlgo_t algo =
         (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
     cudaDataType_t compute_type =
@@ -2274,7 +2284,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     bool ok;
     ok = DoBlasInternalImpl(
         AS_LAMBDA(cublasGemmBatchedEx), stream, true /* = pointer_mode_host */,
-        true /* = err_on_failure */, CUDABlasTranspose(transa),
+        true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
         CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type,
         lda, b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
         batch_count, compute_type, algo);
@@ -2409,25 +2419,33 @@ bool CUDABlas::DoBlasGemmStridedBatched(
     int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
     int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
     int64 stride_c, int batch_count) {
-#if CUDA_VERSION >= 9010
+  bool use_tensor_ops = false;
+#if CUDA_VERSION >= 9000
   int cc_major, cc_minor;
   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
-          &cc_major, &cc_minor) &&
-      cc_major >= 5) {
-    cublasGemmAlgo_t algo =
-        (cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
-    bool ok = DoBlasInternalImpl(
-        AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
-        true /* = pointer_mode_host */, true /* = err_on_failure */,
-        CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-        GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb,
-        stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c,
-        batch_count, CUDA_R_32F, algo);
-    if (ok) {
-      return true;
+          &cc_major, &cc_minor)) {
+    // GPUs < sm_70 don't support tensor ops.
+    if (cc_major >= 7 && TensorOpMathEnabled()) {
+      use_tensor_ops = true;
     }
-    LOG(ERROR) << "failed BLAS call, see log for details";
-    return false;
+#if CUDA_VERSION >= 9010
+    if (cc_major >= 5) {
+      cublasGemmAlgo_t algo =
+          (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
+      bool ok = DoBlasInternalImpl(
+          AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
+          true /* = pointer_mode_host */, true /* = err_on_failure */,
+          use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb),
+          m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a,
+          GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c),
+          CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo);
+      if (ok) {
+        return true;
+      }
+      LOG(ERROR) << "failed BLAS call, see log for details";
+      return false;
+    }
+#endif
   }
 #endif
   // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
@@ -2440,7 +2458,7 @@ bool CUDABlas::DoBlasGemmStridedBatched(
         reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
     bool ok = DoBlasInternalImpl(
         cublasSgemmEx, stream, true /* = pointer_mode_host */,
-        true /* = err_on_failure= */, CUDABlasTranspose(transa),
+        true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
         CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF,
         lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix,
         SE_CUDA_DATA_HALF, ldc);
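DoBlasInternalImpl relies on ScopedCublasMathMode to set CUBLAS_TENSOR_OP_MATH for a single call and restore the previous mode afterwards. A sketch of that RAII shape (an assumed reconstruction, not the actual TF class, which also logs failures):

```cpp
#include <cublas_v2.h>

class ScopedMathMode {
 public:
  ScopedMathMode(cublasHandle_t handle, cublasMath_t new_mode)
      : handle_(handle) {
    // Save the current mode, then override it for the scope's duration.
    ok_ = cublasGetMathMode(handle_, &old_mode_) == CUBLAS_STATUS_SUCCESS &&
          cublasSetMathMode(handle_, new_mode) == CUBLAS_STATUS_SUCCESS;
  }
  ~ScopedMathMode() {
    if (ok_) cublasSetMathMode(handle_, old_mode_);  // restore on scope exit
  }
  bool ok() const { return ok_; }

 private:
  cublasHandle_t handle_;
  cublasMath_t old_mode_;
  bool ok_;
};
```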
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 556456c83db..817bdb72777 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -83,7 +83,7 @@ class CUDABlas : public blas::BlasSupport {
   template <typename FuncT, typename... Args>
   bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
                           bool pointer_mode_host, bool err_on_failure,
-                          Args... args);
+                          bool use_tensor_op_math, Args... args);
 
   // Convenience functions that call DoBlasInternalImpl with different values
   // for err_on_failure.
@@ -91,7 +91,8 @@ class CUDABlas : public blas::BlasSupport {
   bool DoBlasInternal(FuncT cublas_func, Stream *stream,
                       bool pointer_mode_host, Args... args) {
     return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
-                              /*err_on_failure=*/true, args...);
+                              /*err_on_failure=*/true, /*use_tensor_ops=*/false,
+                              args...);
   }
   template <typename FuncT, typename... Args>
   bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream,
@@ -99,7 +100,8 @@ class CUDABlas : public blas::BlasSupport {
                                bool pointer_mode_host, Args... args) {
     // Tensor ops are hard-coded off in this path, but can still be enabled with
     // a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl().
     return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
-                              /*err_on_failure=*/false, args...);
+                              /*err_on_failure=*/false,
+                              /*use_tensor_ops=*/false, args...);
   }
 
   // A helper function to implement DoBlasGemmBatched interfaces for generic
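The batched paths select between CUBLAS_GEMM_DFALT and CUBLAS_GEMM_DFALT_TENSOR_OP when calling the *GemmEx entry points. A hedged sketch of a single cublasGemmEx call with that knob, assuming a CUDA 9/10 toolchain where the compute type is a cudaDataType_t (CUDA 11 changed it to cublasComputeType_t):

```cpp
#include <cublas_v2.h>
#include <cuda_fp16.h>

// n x n GEMM with fp16 inputs/outputs and fp32 accumulation; a, b, c are
// device pointers. Illustrative helper, not a TensorFlow function.
bool HalfGemm(cublasHandle_t handle, const __half* a, const __half* b,
              __half* c, int n, bool use_tensor_ops) {
  const float alpha = 1.0f, beta = 0.0f;
  const cublasGemmAlgo_t algo =
      use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT;
  return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, a,
                      CUDA_R_16F, n, b, CUDA_R_16F, n, &beta, c, CUDA_R_16F, n,
                      CUDA_R_32F, algo) == CUBLAS_STATUS_SUCCESS;
}
```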
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 192bae91572..be18c989861 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/tf32_utils.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -602,6 +601,31 @@ class CudnnFilterDescriptor {
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor);
 };
 
+// A helper function to decide whether to enable the TENSOR_OP_MATH math type
+bool TensorOpMathEnabled() {
+  static bool is_enabled = [] {
+    bool is_disabled = false;
+    TF_CHECK_OK(
+        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH",
+                                       /*default_val=*/false, &is_disabled));
+    return !is_disabled;
+  }();
+  return is_enabled;
+}
+
+// A helper function to decide whether to enable the TENSOR_OP_MATH math type
+// for RNNs.
+bool RnnTensorOpMathEnabled() {
+  static bool is_enabled = [] {
+    bool is_disabled = false;
+    TF_CHECK_OK(
+        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH",
+                                       /*default_val=*/false, &is_disabled));
+    return !is_disabled;
+  }();
+  return is_enabled;
+}
+
 // A helper function to decide whether to use
 // CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
 // some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
@@ -706,6 +730,10 @@ class CudnnConvolutionDescriptor {
                                          : CUDNN_CROSS_CORRELATION,
         data_type));
 
+    // NOTE(benbarsdell): This only applies if tensor op math is enabled
+    //                      and algo selection is set to Default.
+    this->set_use_tensor_op_math(true);
+
 #if CUDNN_MAJOR >= 7
     VLOG(2) << "Requesting grouped convolution: "
             << convolution_descriptor.group_count();
@@ -717,15 +745,13 @@ class CudnnConvolutionDescriptor {
 #endif
   }
 
-  void set_use_tensor_op_math(bool use_tensor_op_math) {
+  void set_use_tensor_op_math(bool use_tensor_op_math) const {
 #if CUDNN_VERSION >= 7000
     cudnnMathType_t math_type =
-#if CUDNN_VERSION >= 8000
-        (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH);
-#else
         (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
-#endif
-    CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
+    if (TensorOpMathEnabled()) {
+      CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
+    }
 #endif
   }
 
   cudnnConvolutionDescriptor_t handle() const { return handle_.get(); }
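set_use_tensor_op_math ultimately boils down to one cuDNN call on the descriptor handle. In isolation (cuDNN 7.x API, with minimal error handling):

```cpp
#include <cassert>
#include <cudnn.h>

// Request tensor-op math (or the default FMA path) on a raw cuDNN
// convolution descriptor; mirrors what the wrapper above does.
void SetConvMathType(cudnnConvolutionDescriptor_t conv, bool use_tensor_ops) {
  const cudnnMathType_t math_type =
      use_tensor_ops ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
  assert(cudnnSetConvolutionMathType(conv, math_type) == CUDNN_STATUS_SUCCESS);
}
```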
@@ -737,40 +763,6 @@ class CudnnConvolutionDescriptor {
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor);
 };
 
-// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
-// set
-static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
-  cudnnMathType_t math_type;
-  CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
-#if CUDNN_VERSION >= 8000
-  return math_type != CUDNN_FMA_MATH;
-#else
-  return math_type == CUDNN_TENSOR_OP_MATH;
-#endif
-}
-
-static bool TensorOpMathAvailable(int cc_major) {
-  return cc_major >= 7 && CUDNN_VERSION >= 7000;
-}
-
-static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) {
-  int cc_major, cc_minor;
-  std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
-  if (!TensorOpMathAvailable(cc_major)) {
-    return false;
-  }
-  if (input_type == dnn::DataType::kFloat) {
-#if CUDNN_VERSION < 8000
-    return false;
-#else
-    if (!tensorflow::tf32_execution_allowed()) {
-      return false;
-    }
-#endif
-  }
-  return true;
-}
-
 // Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle
 // within a scope.
 class CudnnPoolingDescriptor {
@@ -1163,31 +1155,21 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
     // in profile mode, which is run with algorithms returned from
     // GetRnnAlgorithms() (which are non-default and explicitly set whether to
     // use tensor ops). CuDNN 7.2.1 fixed this issue
-    bool allow_tensor_ops =
-        data_type != CUDNN_DATA_FLOAT || tensorflow::tf32_execution_allowed();
-    bool use_tensor_ops;
-    if (algorithm_config.algorithm().has_value()) {
-      use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled();
-    } else {
-      use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops;
-    }
-
-    if (use_tensor_ops && !allow_tensor_ops) {
-      return port::Status(port::error::INVALID_ARGUMENT,
-                          "Algo requests disallowed tensor op evaluation.");
-    }
-
-    cudnnMathType_t math_type;
-    if (use_tensor_ops) {
-      math_type = CUDNN_TENSOR_OP_MATH;
-    } else {
-#if CUDNN_VERSION >= 8000
-      math_type = CUDNN_FMA_MATH;
+    if (RnnTensorOpMathEnabled()) {
+      cudnnMathType_t math_type;
+      if (algorithm_config.algorithm().has_value()) {
+        math_type = algorithm_config.algorithm()->tensor_ops_enabled()
+                        ? CUDNN_TENSOR_OP_MATH
+                        : CUDNN_DEFAULT_MATH;
+      } else {
+#if CUDNN_VERSION >= 7201
+        math_type = CUDNN_TENSOR_OP_MATH;
 #else
-      math_type = CUDNN_DEFAULT_MATH;
-#endif  // CUDNN_VERSION >= 8000
+        math_type = CUDNN_DEFAULT_MATH;
+#endif  // CUDNN_VERSION >= 7201
+      }
+      CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
     }
-    CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
 #endif  // CUDNN_VERSION >= 7000
 
     return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan),
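The RNN hunk reduces to: an explicitly configured algorithm dictates the math type, otherwise tensor ops are requested only from cuDNN 7.2.1 on. The same decision in plain C++ with simplified stand-in types (not the TF code itself):

```cpp
#include <optional>

enum class MathType { kDefault, kTensorOp };

MathType ChooseRnnMathType(std::optional<bool> configured_tensor_ops,
                           int cudnn_version) {
  if (configured_tensor_ops.has_value()) {
    // An explicit algorithm choice wins outright.
    return *configured_tensor_ops ? MathType::kTensorOp : MathType::kDefault;
  }
  // cuDNN 7.2.1 fixed the profiling issue noted in the patch; default to
  // tensor ops only from that version on.
  return cudnn_version >= 7201 ? MathType::kTensorOp : MathType::kDefault;
}
```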
@@ -2578,11 +2560,10 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
     const CudnnTensorDescriptor& output_nd,
     const dnn::AlgorithmDesc& algorithm_desc,
     ScratchAllocator* scratch_allocator) {
-  if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
-    return port::Status(
-        port::error::INTERNAL,
-        "Mismatch between cudnn conv and algorithm descriptors.");
-  }
+  // TODO(csigg): This has side effects on the convolution descriptor. It is
+  // functionally correct because the convolution is run with the algorithm of
+  // the last call to this function, but should be fixed anyway.
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2622,11 +2603,10 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
     const CudnnTensorDescriptor& output_nd,
     const dnn::AlgorithmDesc& algorithm_desc,
     ScratchAllocator* scratch_allocator) {
-  if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
-    return port::Status(
-        port::error::INTERNAL,
-        "Mismatch between cudnn conv and algorithm descriptors.");
-  }
+  // TODO(csigg): This has side effects on the convolution descriptor. It is
+  // functionally correct because the convolution is run with the algorithm of
+  // the last call to this function, but should be fixed anyway.
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2668,11 +2648,10 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
     const CudnnTensorDescriptor& output_nd,
     const dnn::AlgorithmDesc& algorithm_desc,
     ScratchAllocator* scratch_allocator) {
-  if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
-    return port::Status(
-        port::error::INTERNAL,
-        "Mismatch between cudnn conv and algorithm descriptors.");
-  }
+  // TODO(csigg): This has side effects on the convolution descriptor. It is
+  // functionally correct because the convolution is run with the algorithm of
+  // the last call to this function, but should be fixed anyway.
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
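Each allocator above follows cuDNN's query-then-allocate protocol. A self-contained sketch of the query step for the forward direction (assumed shapes; real code would check every status and then allocate the reported byte count):

```cpp
#include <cstdio>
#include <cudnn.h>

int main() {
  cudnnHandle_t handle;
  cudnnCreate(&handle);

  cudnnTensorDescriptor_t x, y;
  cudnnFilterDescriptor_t w;
  cudnnConvolutionDescriptor_t conv;
  cudnnCreateTensorDescriptor(&x);
  cudnnCreateTensorDescriptor(&y);
  cudnnCreateFilterDescriptor(&w);
  cudnnCreateConvolutionDescriptor(&conv);

  // NCHW fp32: 1x8x32x32 input, 16 3x3 filters, pad 1 -> 1x16x32x32 output.
  cudnnSetTensor4dDescriptor(x, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 8, 32, 32);
  cudnnSetFilter4dDescriptor(w, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 16, 8, 3, 3);
  cudnnSetConvolution2dDescriptor(conv, 1, 1, 1, 1, 1, 1,
                                  CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
  cudnnSetTensor4dDescriptor(y, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 16, 32, 32);

  size_t bytes = 0;
  cudnnGetConvolutionForwardWorkspaceSize(
      handle, x, w, conv, y, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
      &bytes);
  printf("workspace bytes: %zu\n", bytes);  // then cudaMalloc(bytes) if > 0

  cudnnDestroyConvolutionDescriptor(conv);
  cudnnDestroyFilterDescriptor(w);
  cudnnDestroyTensorDescriptor(y);
  cudnnDestroyTensorDescriptor(x);
  cudnnDestroy(handle);
}
```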
@@ -2706,42 +2685,18 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
   return scratch_allocator->AllocateBytes(size_in_bytes);
 }
 
-port::StatusOr<bool> UseTensorOps(Stream* stream, dnn::DataType type,
-                                  absl::optional<dnn::AlgorithmDesc> desc) {
-  bool use_tensor_ops;
-  if (desc.has_value()) {
-    use_tensor_ops = desc->tensor_ops_enabled();
-    if (use_tensor_ops && !IsTensorMathAllowed(stream, type)) {
-      return port::Status(port::error::INVALID_ARGUMENT,
-                          "Algo requests disallowed tensor op evaluation.");
-    }
-  } else {
-    use_tensor_ops = IsTensorMathAllowed(stream, type);
-  }
-  return use_tensor_ops;
+static bool TensorOpMathAvailable(int cc_major) {
+  return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled();
 }
 
-cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
-dnn::DataType GetConvAccumulatorType(dnn::DataType data_type);
-
 port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
-    dnn::DataType element_type,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const CudnnConvolutionDescriptor& conv,
     const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
    DeviceMemory<uint8>* scratch) {
   absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
-
-  CudnnConvolutionDescriptor conv(
-      convolution_descriptor,
-      ToCudnnDataType(GetConvAccumulatorType(element_type)));
-  bool use_tensor_ops;
-  SE_ASSIGN_OR_RETURN(use_tensor_ops,
-                      UseTensorOps(stream, element_type, algo_desc));
-  conv.set_use_tensor_op_math(use_tensor_ops);
-
   if (!algo_desc.has_value()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
     // heuristics.
@@ -2754,7 +2709,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
         GetCudnnConvolutionForwardAlgo(
             cudnn, input_nd, filter, conv, output_nd,
             specify_workspace_limit, memory_limit_bytes));
-    algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
+    int cc_major, cc_minor;
+    std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
+    algo_desc = dnn::AlgorithmDesc(
+        algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
   }
 
   const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
@@ -2778,9 +2736,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
         "Returned status: ", scratch_or.status().ToString()));
   }
 
-  SE_ASSIGN_OR_RETURN(use_tensor_ops,
-                      UseTensorOps(stream, element_type, algo_desc));
-  conv.set_use_tensor_op_math(use_tensor_ops);
   SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace(
                                     stream, cudnn, input_nd, filter, conv,
                                     output_nd, *algo_desc, scratch_allocator));
@@ -2791,19 +2746,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
-    dnn::DataType element_type,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const CudnnConvolutionDescriptor& conv,
     const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
     DeviceMemory<uint8>* scratch) {
   absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
-  CudnnConvolutionDescriptor conv(
-      convolution_descriptor,
-      ToCudnnDataType(GetConvAccumulatorType(element_type)));
-  bool use_tensor_ops;
-  SE_ASSIGN_OR_RETURN(use_tensor_ops,
-                      UseTensorOps(stream, element_type, algo_desc));
-  conv.set_use_tensor_op_math(use_tensor_ops);
-
   if (!algo_desc.has_value()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
     // heuristics.
@@ -2816,7 +2762,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
         GetCudnnConvolutionBackwardDataAlgo(
             cudnn, input_nd, filter, conv, output_nd,
             specify_workspace_limit, memory_limit_bytes));
-    algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
+    int cc_major, cc_minor;
+    std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
+    algo_desc = dnn::AlgorithmDesc(
+        algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
   }
 
   const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
@@ -2839,9 +2788,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
         "while a secondary algorithm is not provided.");
   }
 
-  SE_ASSIGN_OR_RETURN(use_tensor_ops,
-                      UseTensorOps(stream, element_type, algo_desc));
-  conv.set_use_tensor_op_math(use_tensor_ops);
   SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
                                     stream, cudnn, input_nd, filter, conv,
                                     output_nd, *algo_desc, scratch_allocator));
@@ -2852,19 +2798,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
-    dnn::DataType element_type,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const CudnnConvolutionDescriptor& conv,
     const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
     DeviceMemory<uint8>* scratch) {
   absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
-  CudnnConvolutionDescriptor conv(
-      convolution_descriptor,
-      ToCudnnDataType(GetConvAccumulatorType(element_type)));
-  bool use_tensor_ops;
-  SE_ASSIGN_OR_RETURN(use_tensor_ops,
-                      UseTensorOps(stream, element_type, algo_desc));
-  conv.set_use_tensor_op_math(use_tensor_ops);
-
   if (!algo_desc.has_value()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
     // heuristics.
@@ -2877,7 +2814,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
         GetCudnnConvolutionBackwardFilterAlgo(
             cudnn, input_nd, filter, conv, output_nd,
             specify_workspace_limit, memory_limit_bytes));
-    algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
+    int cc_major, cc_minor;
+    std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
+    algo_desc = dnn::AlgorithmDesc(
+        algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
   }
 
   auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(
@@ -2900,9 +2840,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
         "while a secondary algorithm is not provided.");
   }
 
-  SE_ASSIGN_OR_RETURN(use_tensor_ops,
-                      UseTensorOps(stream, element_type, algo_desc));
-  conv.set_use_tensor_op_math(use_tensor_ops);
   SE_ASSIGN_OR_RETURN(*scratch,
                       AllocateCudnnConvolutionBackwardFilterWorkspace(
                           stream, cudnn, input_nd, filter, conv, output_nd,
                           *algo_desc, scratch_allocator));
@@ -3067,32 +3004,35 @@ port::Status CudnnSupport::DoPrepareForConvolution(
   CudnnTensorDescriptor output_nd(
       output_descriptor,
       ToCudnnDataType(element_type, output_descriptor.layout()));
+  CudnnConvolutionDescriptor conv(
+      convolution_descriptor,
+      ToCudnnDataType(GetConvAccumulatorType(element_type)));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
   switch (kind) {
     case dnn::ConvolutionKind::FORWARD: {
-      SE_ASSIGN_OR_RETURN(*algorithm_desc,
-                          GetCudnnConvolutionForwardAlgorithm(
-                              stream, cudnn, algorithm_config, input_nd,
-                              filter_nd, element_type, convolution_descriptor,
-                              output_nd, scratch_allocator, scratch_memory));
+      SE_ASSIGN_OR_RETURN(
+          *algorithm_desc,
+          GetCudnnConvolutionForwardAlgorithm(
+              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
+              output_nd, scratch_allocator, scratch_memory));
       break;
     }
     case dnn::ConvolutionKind::BACKWARD_DATA: {
-      SE_ASSIGN_OR_RETURN(*algorithm_desc,
-                          GetCudnnConvolutionBackwardDataAlgorithm(
-                              stream, cudnn, algorithm_config, input_nd,
-                              filter_nd, element_type, convolution_descriptor,
-                              output_nd, scratch_allocator, scratch_memory));
+      SE_ASSIGN_OR_RETURN(
+          *algorithm_desc,
+          GetCudnnConvolutionBackwardDataAlgorithm(
+              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
+              output_nd, scratch_allocator, scratch_memory));
       break;
     }
     case dnn::ConvolutionKind::BACKWARD_FILTER: {
-      SE_ASSIGN_OR_RETURN(*algorithm_desc,
-                          GetCudnnConvolutionBackwardFilterAlgorithm(
-                              stream, cudnn, algorithm_config, input_nd,
-                              filter_nd, element_type, convolution_descriptor,
-                              output_nd, scratch_allocator, scratch_memory));
+      SE_ASSIGN_OR_RETURN(
+          *algorithm_desc,
+          GetCudnnConvolutionBackwardFilterAlgorithm(
+              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
+              output_nd, scratch_allocator, scratch_memory));
       break;
     }
     default:
@@ -3121,9 +3061,8 @@ port::Status CudnnSupport::DoConvolve(
   auto accumulator_type = GetConvAccumulatorType(element_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
                                   ToCudnnDataType(accumulator_type));
-  SE_ASSIGN_OR_RETURN(bool use_tensor_ops,
-                      UseTensorOps(stream, element_type, algorithm_desc));
-  conv.set_use_tensor_op_math(use_tensor_ops);
+  // Set use_tensor_math param to correct value
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   // Alpha is the scaling factor for input.
@@ -3356,6 +3295,14 @@ port::Status CudnnSupport::DoConvolve(
   return port::Status::OK();
 }
 
+// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
+// set
+static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
+  cudnnMathType_t math_type;
+  CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
+  return math_type == CUDNN_TENSOR_OP_MATH;
+}
+
 template <typename AccumulatorType, typename ElementType, typename BiasType,
           typename OutputType>
 port::Status CudnnSupport::DoFusedConvolveImpl(
@@ -3389,6 +3336,8 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
       filter_descriptor,
       GetCudnnDataType<ElementType>(conv_input_descriptor.layout()));
   CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType<BiasType>());
+  CudnnConvolutionDescriptor conv(convolution_descriptor,
+                                  ToCudnnDataType(accumulator_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
@@ -3398,14 +3347,9 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
   SE_ASSIGN_OR_RETURN(
       dnn::AlgorithmDesc algo_desc,
       GetCudnnConvolutionForwardAlgorithm(
-          stream, cudnn, algorithm_config, conv_input_nd, filter,
-          dnn::ToDataType<ElementType>::value, convolution_descriptor,
+          stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
          output_nd, scratch_allocator, &scratch));
 
-  CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  ToCudnnDataType(accumulator_type));
-  conv.set_use_tensor_op_math(algo_desc.tensor_ops_enabled());
-
   std::unique_ptr<GpuTimer> timer;
   if (is_profiling) {
     timer.reset(new GpuTimer(parent_));  // NOLINT
@@ -3536,7 +3480,9 @@ bool CudnnSupport::GetRnnAlgorithms(
   for (auto i : algo_types) {
     out_algorithms->push_back({i, /*use_tensor_ops=*/false});
 #if CUDNN_VERSION >= 7100
-    out_algorithms->push_back({i, /*use_tensor_ops=*/true});
+    if (RnnTensorOpMathEnabled()) {
+      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
+    }
 #endif
   }
   return true;
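The re-added IsTensorMathOpSet simply reads the math type back from the descriptor. The same round trip on a bare descriptor, outside any wrapper class:

```cpp
#include <cassert>
#include <cstdio>
#include <cudnn.h>

int main() {
  cudnnConvolutionDescriptor_t conv;
  assert(cudnnCreateConvolutionDescriptor(&conv) == CUDNN_STATUS_SUCCESS);
  // Set tensor-op math, then query it back the way IsTensorMathOpSet does.
  assert(cudnnSetConvolutionMathType(conv, CUDNN_TENSOR_OP_MATH) ==
         CUDNN_STATUS_SUCCESS);
  cudnnMathType_t math_type;
  assert(cudnnGetConvolutionMathType(conv, &math_type) ==
         CUDNN_STATUS_SUCCESS);
  printf("tensor op math set: %d\n", math_type == CUDNN_TENSOR_OP_MATH);
  cudnnDestroyConvolutionDescriptor(conv);
}
```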