Internal change

PiperOrigin-RevId: 317230321
Change-Id: I043dae37768f6e9cf946d4db2a8c36123ed2d6d9
A. Unique TensorFlower 2020-06-18 19:17:40 -07:00 committed by TensorFlower Gardener
parent 4a14e778d6
commit 9c4b749b09
10 changed files with 172 additions and 336 deletions

View File

@@ -938,13 +938,6 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "tf32_utils",
srcs = ["tf32_utils.cc"],
hdrs = ["tf32_utils.h"],
copts = tf_copts(),
)
tf_cc_tests(
name = "low_level_library_tests",
size = "small",

View File

@@ -1,30 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/platform/tf32_utils.h"
#include <atomic>
namespace tensorflow {
// Whether TensorFloat-32 should be used where supported.
// TODO(nluehr): Maybe enable by default after TF32 Ampere testing.
static std::atomic<bool> tf32_allowed{false};
void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; }
bool tf32_execution_allowed() { return tf32_allowed; }
} // namespace tensorflow
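The deleted tensorflow/core/platform/tf32_utils.cc is essentially a process-wide atomic toggle that call sites query before picking a math mode. A minimal standalone sketch of that pattern follows; RunGemm is a hypothetical call site for illustration, not part of the TensorFlow sources.

#include <atomic>
#include <cstdio>

namespace demo {

// Process-wide flag, default "TF32 off", mirroring the deleted file.
static std::atomic<bool> tf32_allowed{false};

void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; }
bool tf32_execution_allowed() { return tf32_allowed; }

// Hypothetical call site: consult the flag when choosing between a plain
// FP32 kernel and a TF32 tensor-op kernel.
void RunGemm() {
  if (tf32_execution_allowed()) {
    std::puts("dispatching TF32 tensor-op kernel");
  } else {
    std::puts("dispatching plain FP32 kernel");
  }
}

}  // namespace demo

int main() {
  demo::RunGemm();                   // default: FP32 path
  demo::allow_tf32_execution(true);
  demo::RunGemm();                   // TF32 path
}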

View File

@@ -1,27 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
#define TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_
namespace tensorflow {
void allow_tf32_execution(bool allowed);
bool tf32_execution_allowed();
} // namespace tensorflow
#endif // TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_

View File

@@ -788,16 +788,6 @@ tf_python_pybind_extension(
],
)
tf_python_pybind_extension(
name = "_pywrap_tf32_execution",
srcs = ["util/tf32.cc"],
module_name = "_pywrap_tf32_execution",
deps = [
"//tensorflow/core/platform:tf32_utils",
"@pybind11",
],
)
tf_python_pybind_extension(
name = "_pywrap_util_port",
srcs = ["util/port_wrapper.cc"],
@@ -5688,7 +5678,6 @@ py_library(
"//tensorflow:composite_tensor_whitelist",
],
deps = [
":_pywrap_tf32_execution",
":tf_decorator",
":tf_export",
":tf_stack",

View File

@@ -18,42 +18,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python import _pywrap_tf32_execution
from tensorflow.python.eager import context
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export
# No tf_export until TF is built against CUDA 11 which is required for TF32.
def tensor_float_32_execution_allowed():
"""Get if TensorFloat-32 operations are enabled on supported hardware.
Returns:
True if TensorFloat-32 execution is enabled and False otherwise.
"""
return _pywrap_tf32_execution.is_allowed()
# No tf_export until TF is built against CUDA 11 which is required for TF32.
def allow_tensor_float_32_execution(allowed):
"""Allow use of TensorFloat-32 with float32 ops on supported hardware.
TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture.
TensorFloat-32 kernels take float32 inputs and produce float32 outputs.
Internally, the inputs are cast to a custom representation with 10-bit
mantissa (similar to float16) and 8-bit exponent (similar to float32) and are
executed using TensorCores with float32 accumulation. For more information,
see https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/.
TensorFloat-32 execution is disabled by default, but this may change in a
future version.
Args:
allowed: whether to allow TensorFloat-32 execution
"""
_pywrap_tf32_execution.allow(allowed)
@tf_export('config.threading.get_intra_op_parallelism_threads')
def get_intra_op_parallelism_threads():
"""Get number of threads used within an individual op for parallelism.

View File

@@ -1,22 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "pybind11/pybind11.h"
#include "tensorflow/core/platform/tf32_utils.h"
PYBIND11_MODULE(_pywrap_tf32_execution, m) {
m.def("allow", &tensorflow::allow_tf32_execution);
m.def("is_allowed", &tensorflow::tf32_execution_allowed);
}

View File

@@ -251,7 +251,6 @@ cc_library(
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core/platform:tf32_utils",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:host_or_device_scalar",
@@ -357,7 +356,6 @@ cc_library(
"@local_config_cuda//cuda:cudnn_header",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core/platform:tf32_utils",
"//tensorflow/stream_executor:dnn",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:plugin_registry",

View File

@@ -49,7 +49,6 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/platform/tf32_utils.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
@@ -102,6 +101,18 @@ static std::string ToString(cublasStatus_t status) {
}
}
// Decide whether to enable TENSOR_OP_MATH
static bool TensorOpMathEnabled() {
static bool is_enabled = [] {
bool is_disabled;
TF_CHECK_OK(
tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH",
/*default_val=*/false, &is_disabled));
return !is_disabled;
}();
return is_enabled;
}
// cuBLAS has interfaces that permit pointers to be passed from either the host
// memory space or the device memory space; however, you must instruct it as to
// which address space those pointers are in with cublasSetPointerMode.
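TensorOpMathEnabled above reads TF_DISABLE_CUBLAS_TENSOR_OP_MATH once and caches the result in a function-local static. A standalone sketch of the same init-once gate follows, with std::getenv standing in for tensorflow::ReadBoolFromEnvVar so it compiles outside the TensorFlow tree; the accepted spellings of "true" here are an assumption.

#include <cstdlib>
#include <cstring>

static bool CublasTensorOpMathEnabled() {
  // The lambda runs exactly once per process; C++11 guarantees thread-safe
  // initialization of the function-local static.
  static const bool is_enabled = [] {
    const char* v = std::getenv("TF_DISABLE_CUBLAS_TENSOR_OP_MATH");
    const bool is_disabled =
        v != nullptr && (std::strcmp(v, "1") == 0 || std::strcmp(v, "true") == 0);
    return !is_disabled;
  }();
  return is_enabled;
}

int main() { return CublasTensorOpMathEnabled() ? 0 : 1; }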
@@ -226,19 +237,6 @@ bool CUDABlas::Init() {
return false;
}
absl::MutexLock lock(&mu_);
#if CUDA_VERSION >= 9000
#if CUBLAS_VER_MAJOR >= 11
ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH);
#else
ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH);
#endif
if (ret != CUBLAS_STATUS_SUCCESS) {
LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret);
return false;
}
#endif
return true;
}
@@ -401,7 +399,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) {
template <typename FuncT, typename... Args>
bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
bool pointer_mode_host, bool err_on_failure,
Args... args) {
bool use_tensor_op_math, Args... args) {
absl::MutexLock lock(&mu_);
CHECK(blas_ != nullptr);
@@ -415,10 +413,10 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
: CUBLAS_POINTER_MODE_DEVICE)) {
return false;
}
#if CUBLAS_VER_MAJOR >= 11
#if CUDA_VERSION >= 9000
ScopedCublasMathMode math_mode{blas_};
if (!tensorflow::tf32_execution_allowed()) {
if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) {
if (use_tensor_op_math) {
if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) {
return false;
}
}
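ScopedCublasMathMode applies the requested math mode only for the duration of one DoBlasInternalImpl call and restores the previous mode when it leaves scope. A generic sketch of that RAII pattern with stand-in types (MathMode and g_math_mode are illustrative, not cuBLAS state):

#include <cstdio>

enum class MathMode { kDefault, kTensorOp };

static MathMode g_math_mode = MathMode::kDefault;  // stands in for the handle's mode

class ScopedMathMode {
 public:
  explicit ScopedMathMode(MathMode new_mode) : old_mode_(g_math_mode) {
    g_math_mode = new_mode;
  }
  ~ScopedMathMode() { g_math_mode = old_mode_; }  // restore on scope exit

 private:
  MathMode old_mode_;
};

void DoGemm(bool use_tensor_op_math) {
  // Switch the mode only for this call, as DoBlasInternalImpl does.
  ScopedMathMode scoped(use_tensor_op_math ? MathMode::kTensorOp
                                           : MathMode::kDefault);
  std::printf("running with %s math\n",
              g_math_mode == MathMode::kTensorOp ? "tensor-op" : "default");
}  // previous mode restored here

int main() {
  DoGemm(true);
  DoGemm(false);
}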
@@ -1635,9 +1633,21 @@ bool CUDABlas::DoBlasGemm(
}
}
bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
int cc_major, cc_minor;
stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major,
&cc_minor);
// GPUs < sm_70 don't support tensor ops.
if (cc_major >= 7 && TensorOpMathEnabled()) {
use_tensor_ops = true;
}
#endif
return DoBlasInternalImpl(
cublasSgemmEx, stream, true /* = pointer_mode_host */,
true /* = err_on_failure= */, CUDABlasTranspose(transa),
true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
@@ -1911,7 +1921,8 @@ static bool TensorOpsAvailable(int cc_major) {
// strictly correct. We can't simply enable it, though, as that would change
// clients' behavior significantly: Using tensor ops on fp32 inputs cause them
// to be rounded to fp16.
if (cc_major >= 7 && std::is_same<InType, Eigen::half>::value) {
if (cc_major >= 7 && TensorOpMathEnabled() &&
std::is_same<InType, Eigen::half>::value) {
return true;
}
#endif
@@ -2259,8 +2270,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
&cc_major, &cc_minor) &&
cc_major >= 5) {
bool use_tensor_ops =
data_type == CUDA_R_16F || tensorflow::tf32_execution_allowed();
bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F;
cublasGemmAlgo_t algo =
(use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
cudaDataType_t compute_type =
@@ -2274,7 +2284,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
bool ok;
ok = DoBlasInternalImpl(
AS_LAMBDA(cublasGemmBatchedEx), stream, true /* = pointer_mode_host */,
true /* = err_on_failure */, CUDABlasTranspose(transa),
true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda,
b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc,
batch_count, compute_type, algo);
@@ -2409,25 +2419,33 @@ bool CUDABlas::DoBlasGemmStridedBatched(
int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
int64 stride_c, int batch_count) {
#if CUDA_VERSION >= 9010
bool use_tensor_ops = false;
#if CUDA_VERSION >= 9000
int cc_major, cc_minor;
if (stream->parent()->GetDeviceDescription().cuda_compute_capability(
&cc_major, &cc_minor) &&
cc_major >= 5) {
cublasGemmAlgo_t algo =
(cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
bool ok = DoBlasInternalImpl(
AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
true /* = pointer_mode_host */, true /* = err_on_failure */,
CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb,
stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c,
batch_count, CUDA_R_32F, algo);
if (ok) {
return true;
&cc_major, &cc_minor)) {
// GPUs < sm_70 don't support tensor ops.
if (cc_major >= 7 && TensorOpMathEnabled()) {
use_tensor_ops = true;
}
LOG(ERROR) << "failed BLAS call, see log for details";
return false;
#if CUDA_VERSION >= 9010
if (cc_major >= 5) {
cublasGemmAlgo_t algo =
(use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
bool ok = DoBlasInternalImpl(
AS_LAMBDA(cublasGemmStridedBatchedEx), stream,
true /* = pointer_mode_host */, true /* = err_on_failure */,
use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb),
m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a,
GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c),
CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo);
if (ok) {
return true;
}
LOG(ERROR) << "failed BLAS call, see log for details";
return false;
}
#endif
}
#endif
// Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
@@ -2440,7 +2458,7 @@ bool CUDABlas::DoBlasGemmStridedBatched(
reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
bool ok = DoBlasInternalImpl(
cublasSgemmEx, stream, true /* = pointer_mode_host */,
true /* = err_on_failure= */, CUDABlasTranspose(transa),
true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF,
lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix,
SE_CUDA_DATA_HALF, ldc);
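Each cuBLAS call site in this file re-derives use_tensor_ops from the env-var gate plus a compute-capability or data-type check. A hypothetical helper that condenses that decision is sketched below; ShouldUseTensorOps and DataType are illustrative names, and the exact conditions vary slightly between call sites in the patch.

#include <cstdlib>

enum class DataType { kHalf, kFloat };

// Stand-in for the TF_DISABLE_CUBLAS_TENSOR_OP_MATH gate shown earlier.
static bool TensorOpMathEnabled() {
  return std::getenv("TF_DISABLE_CUBLAS_TENSOR_OP_MATH") == nullptr;
}

// Tensor ops require the env-var gate, an sm_70+ GPU, and (absent an
// explicit algorithm choice) fp16 data.
static bool ShouldUseTensorOps(int cc_major, DataType type) {
  return TensorOpMathEnabled() && cc_major >= 7 && type == DataType::kHalf;
}

int main() {
  return ShouldUseTensorOps(/*cc_major=*/7, DataType::kHalf) ? 0 : 1;
}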

View File

@@ -83,7 +83,7 @@ class CUDABlas : public blas::BlasSupport {
template <typename FuncT, typename... Args>
bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream,
bool pointer_mode_host, bool err_on_failure,
Args... args);
bool use_tensor_op_math, Args... args);
// Convenience functions that call DoBlasInternalImpl with different values
// for err_on_failure.
@@ -91,7 +91,8 @@ class CUDABlas : public blas::BlasSupport {
bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host,
Args... args) {
return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
/*err_on_failure=*/true, args...);
/*err_on_failure=*/true, /*use_tensor_ops=*/false,
args...);
}
template <typename FuncT, typename... Args>
bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream,
@@ -99,7 +100,8 @@ class CUDABlas : public blas::BlasSupport {
// Tensor ops are hard-coded off in this path, but can still be enabled with
// a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl().
return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host,
/*err_on_failure=*/false, args...);
/*err_on_failure=*/false,
/*use_tensor_ops=*/false, args...);
}
// A helper function to implement DoBlasGemmBatched interfaces for generic
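DoBlasInternal and DoBlasInternalFailureOK are thin variadic wrappers: they forward their arguments to DoBlasInternalImpl and pin err_on_failure and the new use_tensor_ops flag to fixed values. A generic sketch of that forwarding pattern (Call, CallImpl, and the fake GEMM are stand-ins, not the StreamExecutor API):

#include <cstdio>
#include <utility>

template <typename Func, typename... Args>
bool CallImpl(Func func, bool err_on_failure, bool use_tensor_ops,
              Args&&... args) {
  if (use_tensor_ops) {
    // A real implementation would switch the math mode for this call.
  }
  const bool ok = func(std::forward<Args>(args)...);
  if (!ok && err_on_failure) std::fputs("call failed\n", stderr);
  return ok;
}

template <typename Func, typename... Args>
bool Call(Func func, Args&&... args) {
  // Defaults injected here, as DoBlasInternal does for DoBlasInternalImpl.
  return CallImpl(func, /*err_on_failure=*/true, /*use_tensor_ops=*/false,
                  std::forward<Args>(args)...);
}

int main() {
  auto fake_gemm = [](int m, int n) { return m > 0 && n > 0; };
  return Call(fake_gemm, 128, 256) ? 0 : 1;
}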

View File

@@ -22,7 +22,6 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/tf32_utils.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -602,6 +601,31 @@ class CudnnFilterDescriptor {
SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor);
};
// A helper function to decide whether to enable the TENSOR_OP_MATH math type
bool TensorOpMathEnabled() {
static bool is_enabled = [] {
bool is_disabled = false;
TF_CHECK_OK(
tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH",
/*default_val=*/false, &is_disabled));
return !is_disabled;
}();
return is_enabled;
}
// A helper function to decide whether to enable the TENSOR_OP_MATH math type
// for RNNs.
bool RnnTensorOpMathEnabled() {
static bool is_enabled = [] {
bool is_disabled = false;
TF_CHECK_OK(
tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH",
/*default_val=*/false, &is_disabled));
return !is_disabled;
}();
return is_enabled;
}
// A helper function to decide whether to use
// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
@@ -706,6 +730,10 @@ class CudnnConvolutionDescriptor {
: CUDNN_CROSS_CORRELATION,
data_type));
// NOTE(benbarsdell): This only applies if tensor op math is enabled
// and algo selection is set to Default.
this->set_use_tensor_op_math(true);
#if CUDNN_MAJOR >= 7
VLOG(2) << "Requesting grouped convolution: "
<< convolution_descriptor.group_count();
@@ -717,15 +745,13 @@
#endif
}
void set_use_tensor_op_math(bool use_tensor_op_math) {
void set_use_tensor_op_math(bool use_tensor_op_math) const {
#if CUDNN_VERSION >= 7000
cudnnMathType_t math_type =
#if CUDNN_VERSION >= 8000
(use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH);
#else
(use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
#endif
CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
if (TensorOpMathEnabled()) {
CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
}
#endif
}
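Condensed, the choice made in set_use_tensor_op_math above is: request CUDNN_TENSOR_OP_MATH when tensor ops are wanted, otherwise the "no tensor ops" mode (renamed to FMA math in cuDNN 8), and only apply the setting when the TF_DISABLE_CUDNN_TENSOR_OP_MATH gate allows it. A stand-alone sketch with a placeholder enum and version constant instead of the cuDNN headers:

#include <cstdlib>

enum MathType { kDefaultMath, kFmaMath, kTensorOpMath };
constexpr int kCudnnVersion = 8000;  // placeholder for CUDNN_VERSION

static bool CudnnTensorOpMathEnabled() {
  return std::getenv("TF_DISABLE_CUDNN_TENSOR_OP_MATH") == nullptr;
}

// Returns the math type to request; *should_apply says whether the env-var
// gate permits applying it to the convolution descriptor.
MathType ChooseConvMathType(bool use_tensor_op_math, bool* should_apply) {
  *should_apply = CudnnTensorOpMathEnabled();
  if (use_tensor_op_math) return kTensorOpMath;
  return kCudnnVersion >= 8000 ? kFmaMath : kDefaultMath;
}

int main() {
  bool apply = false;
  MathType mt = ChooseConvMathType(/*use_tensor_op_math=*/true, &apply);
  return (apply && mt == kTensorOpMath) ? 0 : 1;
}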
@@ -737,40 +763,6 @@
SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor);
};
// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
// set
static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
cudnnMathType_t math_type;
CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
#if CUDNN_VERSION >= 8000
return math_type != CUDNN_FMA_MATH;
#else
return math_type == CUDNN_TENSOR_OP_MATH;
#endif
}
static bool TensorOpMathAvailable(int cc_major) {
return cc_major >= 7 && CUDNN_VERSION >= 7000;
}
static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) {
int cc_major, cc_minor;
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
if (!TensorOpMathAvailable(cc_major)) {
return false;
}
if (input_type == dnn::DataType::kFloat) {
#if CUDNN_VERSION < 8000
return false;
#else
if (!tensorflow::tf32_execution_allowed()) {
return false;
}
#endif
}
return true;
}
// Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle
// within a scope.
class CudnnPoolingDescriptor {
@@ -1163,31 +1155,21 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
// in profile mode, which is run with algorithms returned from
// GetRnnAlgorithms() (which are non-default and explicitly set whether to
// use tensor ops). CuDNN 7.2.1 fixed this issue
bool allow_tensor_ops =
data_type != CUDNN_DATA_FLOAT || tensorflow::tf32_execution_allowed();
bool use_tensor_ops;
if (algorithm_config.algorithm().has_value()) {
use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled();
} else {
use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops;
}
if (use_tensor_ops && !allow_tensor_ops) {
return port::Status(port::error::INVALID_ARGUMENT,
"Algo requests disallowed tensor op evaluation.");
}
cudnnMathType_t math_type;
if (use_tensor_ops) {
math_type = CUDNN_TENSOR_OP_MATH;
} else {
#if CUDNN_VERSION >= 8000
math_type = CUDNN_FMA_MATH;
if (RnnTensorOpMathEnabled()) {
cudnnMathType_t math_type;
if (algorithm_config.algorithm().has_value()) {
math_type = algorithm_config.algorithm()->tensor_ops_enabled()
? CUDNN_TENSOR_OP_MATH
: CUDNN_DEFAULT_MATH;
} else {
#if CUDNN_VERSION >= 7201
math_type = CUDNN_TENSOR_OP_MATH;
#else
math_type = CUDNN_DEFAULT_MATH;
#endif // CUDNN_VERSION >= 8000
math_type = CUDNN_DEFAULT_MATH;
#endif // CUDNN_VERSION >= 7201
}
CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
}
CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
#endif // CUDNN_VERSION >= 7000
return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan),
@@ -2578,11 +2560,10 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
const CudnnTensorDescriptor& output_nd,
const dnn::AlgorithmDesc& algorithm_desc,
ScratchAllocator* scratch_allocator) {
if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
return port::Status(
port::error::INTERNAL,
"Mismatch between cudnn conv and algorithm descriptors.");
}
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2622,11 +2603,10 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
const CudnnTensorDescriptor& output_nd,
const dnn::AlgorithmDesc& algorithm_desc,
ScratchAllocator* scratch_allocator) {
if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
return port::Status(
port::error::INTERNAL,
"Mismatch between cudnn conv and algorithm descriptors.");
}
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2668,11 +2648,10 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
const CudnnTensorDescriptor& output_nd,
const dnn::AlgorithmDesc& algorithm_desc,
ScratchAllocator* scratch_allocator) {
if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) {
return port::Status(
port::error::INTERNAL,
"Mismatch between cudnn conv and algorithm descriptors.");
}
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2706,42 +2685,18 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
return scratch_allocator->AllocateBytes(size_in_bytes);
}
port::StatusOr<bool> UseTensorOps(Stream* stream, dnn::DataType type,
absl::optional<dnn::AlgorithmDesc> desc) {
bool use_tensor_ops;
if (desc.has_value()) {
use_tensor_ops = desc->tensor_ops_enabled();
if (use_tensor_ops && !IsTensorMathAllowed(stream, type)) {
return port::Status(port::error::INVALID_ARGUMENT,
"Algo requests disallowed tensor op evaluation.");
}
} else {
use_tensor_ops = IsTensorMathAllowed(stream, type);
}
return use_tensor_ops;
static bool TensorOpMathAvailable(int cc_major) {
return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled();
}
cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
dnn::DataType GetConvAccumulatorType(dnn::DataType data_type);
port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmConfig& algorithm_config,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
dnn::DataType element_type,
const dnn::ConvolutionDescriptor& convolution_descriptor,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
DeviceMemory<uint8>* scratch) {
absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
CudnnConvolutionDescriptor conv(
convolution_descriptor,
ToCudnnDataType(GetConvAccumulatorType(element_type)));
bool use_tensor_ops;
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
if (!algo_desc.has_value()) {
// Pick fastest algorithm within memory limit according to cuDNN's
// heuristics.
@@ -2754,7 +2709,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
GetCudnnConvolutionForwardAlgo(
cudnn, input_nd, filter, conv, output_nd,
specify_workspace_limit, memory_limit_bytes));
algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
int cc_major, cc_minor;
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
algo_desc = dnn::AlgorithmDesc(
algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
}
const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
@@ -2778,9 +2736,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
"Returned status: ", scratch_or.status().ToString()));
}
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace(
stream, cudnn, input_nd, filter, conv,
output_nd, *algo_desc, scratch_allocator));
@@ -2791,19 +2746,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmConfig& algorithm_config,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
dnn::DataType element_type,
const dnn::ConvolutionDescriptor& convolution_descriptor,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
DeviceMemory<uint8>* scratch) {
absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
CudnnConvolutionDescriptor conv(
convolution_descriptor,
ToCudnnDataType(GetConvAccumulatorType(element_type)));
bool use_tensor_ops;
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
if (!algo_desc.has_value()) {
// Pick fastest algorithm within memory limit according to cuDNN's
// heuristics.
@@ -2816,7 +2762,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
GetCudnnConvolutionBackwardDataAlgo(
cudnn, input_nd, filter, conv, output_nd,
specify_workspace_limit, memory_limit_bytes));
algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
int cc_major, cc_minor;
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
algo_desc = dnn::AlgorithmDesc(
algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
}
const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
@@ -2839,9 +2788,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
"while a secondary algorithm is not provided.");
}
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
stream, cudnn, input_nd, filter, conv,
output_nd, *algo_desc, scratch_allocator));
@@ -2852,19 +2798,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
Stream* stream, const CudnnHandle& cudnn,
const dnn::AlgorithmConfig& algorithm_config,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
dnn::DataType element_type,
const dnn::ConvolutionDescriptor& convolution_descriptor,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
DeviceMemory<uint8>* scratch) {
absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
CudnnConvolutionDescriptor conv(
convolution_descriptor,
ToCudnnDataType(GetConvAccumulatorType(element_type)));
bool use_tensor_ops;
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
if (!algo_desc.has_value()) {
// Pick fastest algorithm within memory limit according to cuDNN's
// heuristics.
@@ -2877,7 +2814,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
GetCudnnConvolutionBackwardFilterAlgo(
cudnn, input_nd, filter, conv, output_nd,
specify_workspace_limit, memory_limit_bytes));
algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops);
int cc_major, cc_minor;
std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream);
algo_desc = dnn::AlgorithmDesc(
algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major));
}
auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(
@@ -2900,9 +2840,6 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
"while a secondary algorithm is not provided.");
}
SE_ASSIGN_OR_RETURN(use_tensor_ops,
UseTensorOps(stream, element_type, algo_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace(
stream, cudnn, input_nd, filter, conv,
output_nd, *algo_desc, scratch_allocator));
@@ -3067,32 +3004,35 @@ port::Status CudnnSupport::DoPrepareForConvolution(
CudnnTensorDescriptor output_nd(
output_descriptor,
ToCudnnDataType(element_type, output_descriptor.layout()));
CudnnConvolutionDescriptor conv(
convolution_descriptor,
ToCudnnDataType(GetConvAccumulatorType(element_type)));
auto cudnn = cudnn_->GetHandle(parent_, stream);
switch (kind) {
case dnn::ConvolutionKind::FORWARD: {
SE_ASSIGN_OR_RETURN(*algorithm_desc,
GetCudnnConvolutionForwardAlgorithm(
stream, cudnn, algorithm_config, input_nd,
filter_nd, element_type, convolution_descriptor,
output_nd, scratch_allocator, scratch_memory));
SE_ASSIGN_OR_RETURN(
*algorithm_desc,
GetCudnnConvolutionForwardAlgorithm(
stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
output_nd, scratch_allocator, scratch_memory));
break;
}
case dnn::ConvolutionKind::BACKWARD_DATA: {
SE_ASSIGN_OR_RETURN(*algorithm_desc,
GetCudnnConvolutionBackwardDataAlgorithm(
stream, cudnn, algorithm_config, input_nd,
filter_nd, element_type, convolution_descriptor,
output_nd, scratch_allocator, scratch_memory));
SE_ASSIGN_OR_RETURN(
*algorithm_desc,
GetCudnnConvolutionBackwardDataAlgorithm(
stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
output_nd, scratch_allocator, scratch_memory));
break;
}
case dnn::ConvolutionKind::BACKWARD_FILTER: {
SE_ASSIGN_OR_RETURN(*algorithm_desc,
GetCudnnConvolutionBackwardFilterAlgorithm(
stream, cudnn, algorithm_config, input_nd,
filter_nd, element_type, convolution_descriptor,
output_nd, scratch_allocator, scratch_memory));
SE_ASSIGN_OR_RETURN(
*algorithm_desc,
GetCudnnConvolutionBackwardFilterAlgorithm(
stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
output_nd, scratch_allocator, scratch_memory));
break;
}
default:
@@ -3121,9 +3061,8 @@ port::Status CudnnSupport::DoConvolve(
auto accumulator_type = GetConvAccumulatorType(element_type);
CudnnConvolutionDescriptor conv(convolution_descriptor,
ToCudnnDataType(accumulator_type));
SE_ASSIGN_OR_RETURN(bool use_tensor_ops,
UseTensorOps(stream, element_type, algorithm_desc));
conv.set_use_tensor_op_math(use_tensor_ops);
// Set use_tensor_math param to correct value
conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
auto cudnn = cudnn_->GetHandle(parent_, stream);
// Alpha is the scaling factor for input.
@@ -3356,6 +3295,14 @@ port::Status CudnnSupport::DoConvolve(
return port::Status::OK();
}
// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math
// set
static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) {
cudnnMathType_t math_type;
CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type));
return math_type == CUDNN_TENSOR_OP_MATH;
}
template <typename ElementType, typename BiasType, typename ScaleType,
typename OutputType>
port::Status CudnnSupport::DoFusedConvolveImpl(
@@ -3389,6 +3336,8 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
filter_descriptor,
GetCudnnDataType<ElementType>(conv_input_descriptor.layout()));
CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType<BiasType>());
CudnnConvolutionDescriptor conv(convolution_descriptor,
ToCudnnDataType(accumulator_type));
auto cudnn = cudnn_->GetHandle(parent_, stream);
@@ -3398,14 +3347,9 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
SE_ASSIGN_OR_RETURN(
dnn::AlgorithmDesc algo_desc,
GetCudnnConvolutionForwardAlgorithm(
stream, cudnn, algorithm_config, conv_input_nd, filter,
dnn::ToDataType<ElementType>::value, convolution_descriptor,
stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
output_nd, scratch_allocator, &scratch));
CudnnConvolutionDescriptor conv(convolution_descriptor,
ToCudnnDataType(accumulator_type));
conv.set_use_tensor_op_math(algo_desc.tensor_ops_enabled());
std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
if (is_profiling) {
timer.reset(new GpuTimer(parent_)); // NOLINT
@@ -3536,7 +3480,9 @@ bool CudnnSupport::GetRnnAlgorithms(
for (auto i : algo_types) {
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
#if CUDNN_VERSION >= 7100
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
if (RnnTensorOpMathEnabled()) {
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
}
#endif
}
return true;
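The GetRnnAlgorithms change above advertises each RNN algorithm without tensor ops and, only when the TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH gate allows, a second variant with tensor ops enabled. A stand-alone sketch of that enumeration (AlgorithmDesc is a stand-in pair, and the CUDNN_VERSION >= 7100 guard is omitted):

#include <cstdlib>
#include <utility>
#include <vector>

// Stand-in for dnn::AlgorithmDesc: algorithm id plus tensor-op flag.
using AlgorithmDesc = std::pair<int, bool>;

static bool RnnTensorOpMathEnabled() {
  return std::getenv("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH") == nullptr;
}

std::vector<AlgorithmDesc> EnumerateRnnAlgorithms(
    const std::vector<int>& algo_types) {
  std::vector<AlgorithmDesc> out;
  for (int algo : algo_types) {
    out.push_back({algo, /*use_tensor_ops=*/false});
    if (RnnTensorOpMathEnabled()) {
      out.push_back({algo, /*use_tensor_ops=*/true});
    }
  }
  return out;
}

int main() {
  const auto algos = EnumerateRnnAlgorithms({0, 1, 2});
  // 6 entries when the env var is unset, 3 when tensor ops are disabled.
  return algos.size() >= 3 ? 0 : 1;
}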