diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index c9f0fc462c9..65c07e72154 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -101,18 +101,6 @@ static std::string ToString(cublasStatus_t status) {
   }
 }
 
-// Decide whether to enable TENSOR_OP_MATH
-static bool TensorOpMathEnabled() {
-  static bool is_enabled = [] {
-    bool is_disabled;
-    TF_CHECK_OK(
-        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
-                                       /*default_val=*/false, &is_disabled));
-    return !is_disabled;
-  }();
-  return is_enabled;
-}
-
 // cuBLAS has interfaces that permit pointers to be passed from either the host
 // memory space or the device memory space; however, you must instruct it as to
 // which address space those pointers are in with cublasSetPointerMode.
@@ -1640,7 +1628,7 @@ bool CUDABlas::DoBlasGemm(
                                                                      &cc_minor);
 
     // GPUs < sm_70 don't support tensor ops.
-    if (cc_major >= 7 && TensorOpMathEnabled()) {
+    if (cc_major >= 7) {
      use_tensor_ops = true;
     }
 #endif
@@ -1921,8 +1909,7 @@ static bool TensorOpsAvailable(int cc_major) {
   // strictly correct. We can't simply enable it, though, as that would change
   // clients' behavior significantly: Using tensor ops on fp32 inputs cause them
   // to be rounded to fp16.
-  if (cc_major >= 7 && TensorOpMathEnabled() &&
-      std::is_same<InT, Eigen::half>::value) {
+  if (cc_major >= 7 && std::is_same<InT, Eigen::half>::value) {
     return true;
   }
 #endif
@@ -2270,7 +2257,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
           &cc_major, &cc_minor) &&
       cc_major >= 5) {
-    bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
+    bool use_tensor_ops = data_type == CUDA_R_16F;
     cublasGemmAlgo_t algo =
         (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
     cudaDataType_t compute_type =
@@ -2425,7 +2412,7 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
           &cc_major, &cc_minor)) {
     // GPUs < sm_70 don't support tensor ops.
-    if (cc_major >= 7 && TensorOpMathEnabled()) {
+    if (cc_major >= 7) {
       use_tensor_ops = true;
     }
 #if CUDA_VERSION >= 9010
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index be18c989861..e46c271443b 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -601,31 +601,6 @@ class CudnnFilterDescriptor {
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor);
 };
 
-// A helper function to decide whether to enable the TENSOR_OP_MATH math type
-bool TensorOpMathEnabled() {
-  static bool is_enabled = [] {
-    bool is_disabled = false;
-    TF_CHECK_OK(
-        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH",
-                                       /*default_val=*/false, &is_disabled));
-    return !is_disabled;
-  }();
-  return is_enabled;
-}
-
-// A helper function to decide whether to enable the TENSOR_OP_MATH math type
-// for RNNs.
-bool RnnTensorOpMathEnabled() {
-  static bool is_enabled = [] {
-    bool is_disabled = false;
-    TF_CHECK_OK(
-        tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH",
-                                       /*default_val=*/false, &is_disabled));
-    return !is_disabled;
-  }();
-  return is_enabled;
-}
-
 // A helper function to decide whether to use
 // CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
 // some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
@@ -749,9 +724,7 @@ class CudnnConvolutionDescriptor {
 #if CUDNN_VERSION >= 7000
     cudnnMathType_t math_type =
         (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
-    if (TensorOpMathEnabled()) {
-      CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
-    }
+    CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
 #endif
   }
 
@@ -1155,21 +1128,19 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
     // in profile mode, which is run with algorithms returned from
     // GetRnnAlgorithms() (which are non-default and explicitly set whether to
     // use tensor ops). CuDNN 7.2.1 fixed this issue
-    if (RnnTensorOpMathEnabled()) {
-      cudnnMathType_t math_type;
-      if (algorithm_config.algorithm().has_value()) {
-        math_type = algorithm_config.algorithm()->tensor_ops_enabled()
-                        ? CUDNN_TENSOR_OP_MATH
-                        : CUDNN_DEFAULT_MATH;
-      } else {
+    cudnnMathType_t math_type;
+    if (algorithm_config.algorithm().has_value()) {
+      math_type = algorithm_config.algorithm()->tensor_ops_enabled()
+                      ? CUDNN_TENSOR_OP_MATH
+                      : CUDNN_DEFAULT_MATH;
+    } else {
 #if CUDNN_VERSION >= 7201
-        math_type = CUDNN_TENSOR_OP_MATH;
+      math_type = CUDNN_TENSOR_OP_MATH;
 #else
-        math_type = CUDNN_DEFAULT_MATH;
+      math_type = CUDNN_DEFAULT_MATH;
 #endif  // CUDNN_VERSION >= 7201
-      }
-      CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
     }
+    CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
 #endif  // CUDNN_VERSION >= 7000
 
     return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan),
@@ -2686,7 +2657,7 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
 }
 
 static bool TensorOpMathAvailable(int cc_major) {
-  return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled();
+  return cc_major >= 7 && CUDNN_VERSION >= 7000;
 }
 
 port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
@@ -3480,9 +3451,7 @@ bool CudnnSupport::GetRnnAlgorithms(
   for (auto i : algo_types) {
     out_algorithms->push_back({i, /*use_tensor_ops=*/false});
 #if CUDNN_VERSION >= 7100
-    if (RnnTensorOpMathEnabled()) {
-      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
-    }
+    out_algorithms->push_back({i, /*use_tensor_ops=*/true});
 #endif
   }
   return true;
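Note: every helper removed by this diff follows the same memoized env-var kill-switch pattern: a function-local static, initialized once by a lambda that reads a TF_DISABLE_* environment variable, so the environment is consulted only on first use. Below is a minimal standalone sketch of that pattern, assuming plain std::getenv in place of tensorflow::ReadBoolFromEnvVar (which additionally parses values such as "true"/"false" and returns a Status); TensorOpMathEnabledSketch and the main() driver are illustrative names, not TensorFlow API.

#include <cstdlib>
#include <cstring>
#include <iostream>

// Returns true unless TF_DISABLE_CUBLAS_TENSOR_OP_MATH is set to "1" or
// "true". The function-local static is initialized exactly once (thread-safe
// since C++11), so later changes to the environment have no effect.
static bool TensorOpMathEnabledSketch() {
  static const bool is_enabled = [] {
    const char* value = std::getenv("TF_DISABLE_CUBLAS_TENSOR_OP_MATH");
    const bool is_disabled =
        value != nullptr &&
        (std::strcmp(value, "1") == 0 || std::strcmp(value, "true") == 0);
    return !is_disabled;
  }();
  return is_enabled;
}

int main() {
  // Mirrors the removed call sites: before this diff, tensor ops required
  // both a capable GPU (cc_major >= 7) and the env-var gate; afterwards only
  // the hardware check remains.
  const int cc_major = 7;  // stands in for the queried compute capability
  const bool use_tensor_ops = cc_major >= 7 && TensorOpMathEnabledSketch();
  std::cout << "use_tensor_ops = " << std::boolalpha << use_tensor_ops
            << std::endl;
  return 0;
}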