Internal change

PiperOrigin-RevId: 291690233 Change-Id: I294cf4f577d1d17ad5bbb6e3be022ece6268575c
2020-01-27 03:15:10 -08:00 · 2020-01-27 03:15:10 -08:00 · 2938772a08
commit 2938772a08
parent 4f5d1cc221
6 changed files with 110 additions and 214 deletions
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@ -672,7 +672,6 @@ cc_library(
        "//tensorflow/compiler/xla/service:hlo_pass",
        "//tensorflow/core:autotuning_proto_cc",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core:stream_executor_no_cuda",
        "//tensorflow/core/util/proto:proto_utils",
        "//tensorflow/stream_executor:device_memory_allocator",
--- a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/proto/proto_utils.h"
 #include "tensorflow/stream_executor/gpu/redzone_allocator.h"
@ -333,35 +332,6 @@ StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(
  return result_or;
 }
 // The following function allows deterministic ops to be implemented relatively
 // quickly using environment variables. It is intended to be temporary. The
 // longer-term intention is to enable deterministic ops via tf.config and
 // appropriate plumbing. See the discussion on PR 34951 for more information:
 // https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
 // This function and associated comment are replicated in the following three
 // places:
 //   1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
 //   2. tensorflow/core/kernels/gpu_utils.cc
 //   3. tensorflow/stream_executor/cuda/cuda_dnn.cc
 // When implementing the plumbing, you should also search for the use of
 // TF_DETERMINISTIC_OPS on its own.
 // TODO(duncanriach): move to an API that uses tf.config and implement the first
 //                    phase of plumbing.
 static bool RequireCudnnDeterminism() {
  static bool require_cudnn_determinism = [] {
    bool deterministic_ops = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
                                               /*default_val=*/false,
                                               &deterministic_ops));
    bool cudnn_deterministic = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
                                               /*default_val=*/false,
                                               &cudnn_deterministic));
    return deterministic_ops || cudnn_deterministic;
  }();
  return require_cudnn_determinism;
 }
 StatusOr<tensorflow::AutotuneResult>
 GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
    const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
@ -598,41 +568,43 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
    }
  }
  // For now, we ignore WRONG_RESULT failures because false-positives are
  // possible (e.g. perhaps the reference algorithm is the one that's
  // incorrect!).  But we don't ignore REDZONE_MODIFIED failures because they're
  // quite severe and can be detected with high accuracy.
  auto has_failure = [](const AutotuneResult& r) {
    return r.has_failure() &&
           r.failure().kind() != AutotuneResult::WRONG_RESULT;
  };
  // Choose the fastest convolution that doesn't produce a REDZONE_MODIFIED
  // error.
  //
  // TODO(jlebar): We ought to be able to detect redzone reads by noticing NaNs
  // in the output of the conv and skip those.
  //
-  // For now, we ignore WRONG_RESULT failures because false-positives are
+  // The successful one should have a smaller key, since we are doing
-  // possible (e.g. perhaps the reference algorithm is the one that's
+  // min_element. If they are both unsuccessful, keep the earlier one in
-  // incorrect!).  But we don't ignore REDZONE_MODIFIED failures because they're
+  // the vector by comparing pointers.
-  // quite severe and can be detected with high accuracy.
+  auto result_comparison_key = [&has_failure](const AutotuneResult& r) {
-  std::vector<AutotuneResult> filtered_results;
+    return std::make_tuple(
-  absl::c_copy_if(
+        has_failure(r),
-      profile_results, std::back_inserter(filtered_results),
+        tensorflow::proto_utils::FromDurationProto(r.run_time()));
-      [](const AutotuneResult& r) {
+  };
-        return !(r.has_failure() &&
+  const auto& best_result = absl::c_min_element(
-                 r.failure().kind() != AutotuneResult::WRONG_RESULT);
+      profile_results,
      [&](const AutotuneResult& lhs, const AutotuneResult& rhs) {
        return result_comparison_key(lhs) < result_comparison_key(rhs);
      });
-  if (filtered_results.empty()) {
+
-    return InternalError(
+  if (best_result != profile_results.end() && !has_failure(*best_result)) {
-        "All algorithms tried for convolution %s failed. Falling back to "
+    return *best_result;
        "default algorithm. ",
        instr->ToString());
  }
-  auto selected_result = filtered_results.begin();
+  return InternalError(
-  if (!RequireCudnnDeterminism()) {
+      "All algorithms tried for convolution %s failed.  Falling back to "
-    selected_result = absl::c_min_element(
+      "default algorithm.",
-        filtered_results,
+      instr->ToString());
        [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
          return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) <
                 tensorflow::proto_utils::FromDurationProto(rhs.run_time());
        });
  }
  return *selected_result;
 }
 StatusOr<tensorflow::AutotuneResult>
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@ -532,7 +532,6 @@ tf_cuda_library(
        "//tensorflow/core:conv_autotuning_proto_cc",
        "//tensorflow/core:lib",
        "//tensorflow/core:stream_executor",
        "//tensorflow/core/util:env_var",
        "//tensorflow/core/util/proto:proto_utils",
        "//tensorflow/stream_executor/gpu:asm_compiler",
        "//tensorflow/stream_executor/gpu:redzone_allocator",
--- a/tensorflow/core/kernels/gpu_utils.cc
+++ b/tensorflow/core/kernels/gpu_utils.cc
@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/protobuf/autotuning.pb.h"
 #include "tensorflow/core/protobuf/conv_autotuning.pb.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/proto/proto_utils.h"
 #include "tensorflow/stream_executor/gpu/asm_compiler.h"
 #include "tensorflow/stream_executor/gpu/redzone_allocator.h"
@ -211,35 +210,6 @@ void LogFusedConvForwardAutotuneResults(
  Logger::GetSingleton()->LogProto(log);
 }
 // The following function allows deterministic ops to be implemented relatively
 // quickly using environment variables. It is intended to be temporary. The
 // longer-term intention is to enable deterministic ops via tf.config and
 // appropriate plumbing. See the discussion on PR 34951 for more information:
 // https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
 // This function and associated comment are replicated in the following three
 // places:
 //   1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
 //   2. tensorflow/core/kernels/gpu_utils.cc
 //   3. tensorflow/stream_executor/cuda/cuda_dnn.cc
 // When implementing the plumbing, you should also search for the use of
 // TF_DETERMINISTIC_OPS on its own.
 // TODO(duncanriach): move to an API that uses tf.config and implement the first
 //                    phase of plumbing.
 bool RequireCudnnDeterminism() {
  static bool require_cudnn_determinism = [] {
    bool deterministic_ops = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
                                               /*default_val=*/false,
                                               &deterministic_ops));
    bool cudnn_deterministic = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
                                               /*default_val=*/false,
                                               &cudnn_deterministic));
    return deterministic_ops || cudnn_deterministic;
  }();
  return require_cudnn_determinism;
 }
 Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
                              se::dnn::AlgorithmConfig* algo) {
  std::vector<AutotuneResult> filtered_results;
@ -249,32 +219,31 @@ Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
  if (filtered_results.empty()) {
    return errors::NotFound("No algorithm worked!");
  }
  std::vector<AutotuneResult> filtered_results_no_scratch;
  absl::c_copy_if(
      filtered_results, std::back_inserter(filtered_results_no_scratch),
      [](const AutotuneResult& result) { return result.scratch_bytes() == 0; });
-  auto selected_result = filtered_results.begin();
+  const auto best_result = absl::c_min_element(
-  auto selected_result_no_scratch = filtered_results_no_scratch.begin();
+      filtered_results,
-  if (!RequireCudnnDeterminism()) {
+      [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
-    auto compare_run_times = [](const AutotuneResult& lhs,
+        return proto_utils::FromDurationProto(lhs.run_time()) <
-                                const AutotuneResult& rhs) {
+               proto_utils::FromDurationProto(rhs.run_time());
-      return proto_utils::FromDurationProto(lhs.run_time()) <
+      });
             proto_utils::FromDurationProto(rhs.run_time());
    };
    selected_result = absl::c_min_element(filtered_results, compare_run_times);
    selected_result_no_scratch =
        absl::c_min_element(filtered_results_no_scratch, compare_run_times);
  }
-  algo->set_algorithm({selected_result->conv().algorithm(),
+  const auto best_result_no_scratch = absl::c_min_element(
-                       selected_result->conv().tensor_ops_enabled()});
+      filtered_results,
-  if (selected_result_no_scratch != filtered_results_no_scratch.end()) {
+      [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
        return std::make_tuple(lhs.scratch_bytes(),
                               proto_utils::FromDurationProto(lhs.run_time())) <
               std::make_tuple(rhs.scratch_bytes(),
                               proto_utils::FromDurationProto(rhs.run_time()));
      });
  algo->set_algorithm({best_result->conv().algorithm(),
                       best_result->conv().tensor_ops_enabled()});
  if (best_result_no_scratch != filtered_results.end() &&
      best_result_no_scratch->scratch_bytes() == 0) {
    algo->set_algorithm_no_scratch(
-        {selected_result_no_scratch->conv().algorithm(),
+        {best_result_no_scratch->conv().algorithm(),
-         selected_result_no_scratch->conv().tensor_ops_enabled()});
+         best_result_no_scratch->conv().tensor_ops_enabled()});
  }
  return Status::OK();
 }
--- a/tensorflow/python/kernel_tests/cudnn_deterministic_base.py
+++ b/tensorflow/python/kernel_tests/cudnn_deterministic_base.py
@ -28,39 +28,22 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
-# Notes:
+# Setting either of the two environment variables TF_CUDNN_DETERMINISTIC or
-#
+# TF_DETERMINISTIC_OPS to "true" or "1" will disable autotuning of cuDNN
-# Deterministic cuDNN operation is selected by setting either of the two
+# algorithms and cause deterministic cuDNN algorithms to be selected when both
-# environment variables TF_CUDNN_DETERMINISTIC or TF_DETERMINISTIC_OPS to 'true'
+# deterministic and non-deterministic algorithms are available. These tests are
-# or '1' while also not setting the environment variable TF_CUDNN_USE_AUTOTUNE
+# intended to confirm that deterministic algorithms are chosen when either
-# to 'false' or '0'.
+# environment variable is set to "true" or "1". The tested configurations were
-#
+# first confirmed to produce non-deterministic results when the environment
-# Where both deterministic and non-deterministic cuDNN algorithms are available,
+# variables are not set.
 # selecting determinitic operation will lead to only the deterministic
 # algorithms being chosen. Additionally, selecting deterministic operation will
 # result in a deterministic, or reproducible, selection of algorithms (for any
 # given layer configuration) for each of the forward and the two backward paths.
 #
 # These tests intend to confirm that deterministic algorithms are chosen (for
 # the back-prop paths) when desterministic operation is selected. The tested
 # configurations were first confirmed to produce non-deterministic results when
 # the above-mentioned environment variables are not set.
 #
 # Even though selecting determinitic operation should ensure that the same
 # algorithms, for a given layer configuration, are always used (i.e. that
 # algorithm selection is deterministic / reproducible), this is not tested.
-# TODO(duncanriach): Add test for deterministic cuDNN max-pooling
+_PADDING = 'SAME'
 _STRIDES = [1, 1, 1, 1]
-LayerShapeNHWC = collections.namedtuple('LayerShapeNHWC',
+LayerShape = collections.namedtuple('LayerShape',
-                                        'batch, height, width, channels')
+                                    'batch, height, width, channels')
-FilterShape2D = collections.namedtuple(
+FilterShape = collections.namedtuple(
-    'FilterShape2D', 'height, width, in_channels, out_channels')
+    'FilterShape', 'height, width, in_channels, out_channels')
 LayerShapeNCDHW = collections.namedtuple(
    'LayerShapeNCDHW', 'batch, channels, depth, height, width')
 FilterShape3D = collections.namedtuple(
    'FilterShape3D', 'depth, height, width, in_channels, out_channels')
 class ConvolutionTest(test.TestCase):
@ -71,13 +54,14 @@ class ConvolutionTest(test.TestCase):
    return constant_op.constant(
        2 * np.random.random_sample(shape) - 1, dtype=dtypes.float32)
-  def _random_out_op(self, in_shape, filter_shape, strides, padding):
+  def _random_out_op(self, in_shape, filter_shape):
    # Choosing not to use array_op.zeros() to prevent possible removal by
    # optimization
    in_op = self._random_data_op(in_shape)
    filter_op = self._random_data_op(filter_shape)
    # Use the forward op's shape-inference
-    conv_op = nn_ops.conv2d(in_op, filter_op, strides=strides, padding=padding)
+    conv_op = nn_ops.conv2d(
        in_op, filter_op, strides=_STRIDES, padding=_PADDING)
    out_shape = conv_op.get_shape()
    out_op = self._random_data_op(out_shape)
    return out_op
@ -88,54 +72,29 @@ class ConvolutionTest(test.TestCase):
      result_2 = self.evaluate(operation)
    self.assertAllEqual(result_1, result_2)
  # The default forward algorithm choice, when using cuDNN 7, does not support
  # the following layer configuration. This test case intends to confirm that
  # an alternative algorithm is selected. Note that, in cuDNN 7, all forward
  # algorithms are determnistic.
  @test_util.run_cuda_only
  def testForward(self):
    np.random.seed(3)
    in_shape = LayerShapeNCDHW(batch=2, channels=3, depth=5, height=7, width=6)
    filter_shape = FilterShape3D(
        depth=3, height=3, width=3, in_channels=3, out_channels=2)
    in_op = self._random_data_op(in_shape)
    filter_op = self._random_data_op(filter_shape)
    strides = [1, 1, 1, 1, 1]
    padding = 'VALID'
    dilations = [1, 1, 2, 2, 2]
    out_op = nn_ops.conv3d(
        in_op,
        filter_op,
        strides=strides,
        padding=padding,
        data_format='NCDHW',
        dilations=dilations)
    self._assert_reproducible(out_op)
  @test_util.run_cuda_only
  def testBackwardFilterGradient(self):
    np.random.seed(1)
-    in_shape = LayerShapeNHWC(batch=8, height=128, width=128, channels=8)
+    in_shape = LayerShape(batch=8, height=128, width=128, channels=8)
-    filter_shape = FilterShape2D(
+    filter_shape = FilterShape(height=3, width=3, in_channels=8, out_channels=8)
        height=3, width=3, in_channels=8, out_channels=8)
    in_op = self._random_data_op(in_shape)
-    strides = [1, 1, 1, 1]
+    out_op = self._random_out_op(in_shape, filter_shape)
    padding = 'SAME'
    out_op = self._random_out_op(in_shape, filter_shape, strides, padding)
    filter_gradient_op = nn_ops.conv2d_backprop_filter(
-        in_op, filter_shape, out_op, strides=strides, padding=padding)
+        in_op, filter_shape, out_op, strides=_STRIDES, padding=_PADDING)
    self._assert_reproducible(filter_gradient_op)
  @test_util.run_cuda_only
  def testBackwardInputGradient(self):
    np.random.seed(2)
-    in_shape = LayerShapeNHWC(batch=8, height=32, width=32, channels=8)
+    in_shape = LayerShape(batch=8, height=32, width=32, channels=8)
-    filter_shape = FilterShape2D(
+    filter_shape = FilterShape(
        height=7, width=7, in_channels=8, out_channels=128)
    filter_op = self._random_data_op(filter_shape)
-    strides = [1, 1, 1, 1]
+    out_op = self._random_out_op(in_shape, filter_shape)
    padding = 'SAME'
    out_op = self._random_out_op(in_shape, filter_shape, strides, padding)
    input_gradient_op = nn_ops.conv2d_backprop_input(
-        in_shape, filter_op, out_op, strides=strides, padding=padding)
+        in_shape, filter_op, out_op, strides=_STRIDES, padding=_PADDING)
    self._assert_reproducible(input_gradient_op)
  # TODO(duncanriach): (1) add test to confirm that forward autotuning is
  #   disabled for cuDNN convolution; (2) add test for deterministic cuDNN
  #   max-pooling
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@ -648,22 +648,9 @@ bool BatchnormSpatialPersistentEnabled() {
  return is_enabled;
 }
-// The following function allows deterministic ops to be implemented relatively
+// A helper function to decide whether to enable deterministic functionality.
-// quickly using environment variables. It is intended to be temporary. The
+bool RequireDeterminism() {
-// longer-term intention is to enable deterministic ops via tf.config and
+  static bool require_determinism = [] {
 // appropriate plumbing. See the discussion on PR 34951 for more information:
 // https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
 // This function and associated comment are replicated in the following three
 // places:
 //   1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
 //   2. tensorflow/core/kernels/gpu_utils.cc
 //   3. tensorflow/stream_executor/cuda/cuda_dnn.cc
 // When implementing the plumbing, you should also search for the use of
 // TF_DETERMINISTIC_OPS on its own.
 // TODO(duncanriach): move to an API that uses tf.config and implement the first
 //                    phase of plumbing.
 bool RequireCudnnDeterminism() {
  static bool require_cudnn_determinism = [] {
    bool deterministic_ops = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
                                               /*default_val=*/false,
@ -674,7 +661,7 @@ bool RequireCudnnDeterminism() {
                                               &cudnn_deterministic));
    return deterministic_ops || cudnn_deterministic;
  }();
-  return require_cudnn_determinism;
+  return require_determinism;
 }
 std::tuple<int, int> GetCcMajorMinor(Stream* stream) {
@ -775,7 +762,7 @@ class CudnnPoolingDescriptor {
    std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
                   &CheckedNarrowing<int64, int>);
    bool propagate_nans = pooling_descriptor.propagate_nans();
-    const auto cudnn_max_pooling_mode = RequireCudnnDeterminism()
+    const auto cudnn_max_pooling_mode = RequireDeterminism()
                                            ? CUDNN_POOLING_MAX_DETERMINISTIC
                                            : CUDNN_POOLING_MAX;
    CHECK_CUDNN_OK(cudnnSetPoolingNdDescriptor(
@ -3343,15 +3330,21 @@ bool CudnnSupport::GetConvolveAlgorithms(
  bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
  out_algorithms->clear();
  if (RequireDeterminism()) {
    out_algorithms->push_back({CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
                               tensor_op_math_available});
    return true;
  }
  std::vector<dnn::AlgorithmDesc::Index> algo_types = {
-      // clang-format off
+    // clang-format off
    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
    CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
    CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
    CUDNN_CONVOLUTION_FWD_ALGO_FFT,
    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
-      // clang-format on
+    // clang-format on
  };
  if (CudnnEnvVar<FftTilingForward>::IsEnabled()) {
    algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING);
@ -3360,12 +3353,11 @@ bool CudnnSupport::GetConvolveAlgorithms(
    algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED);
  }
  // The algorithms are intentionally ordered for deterministic operation
  for (auto i : algo_types) {
    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
    if (tensor_op_math_available) {
      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
    }
    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
  }
  return true;
@ -3399,8 +3391,15 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
  bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
  out_algorithms->clear();
  if (RequireDeterminism()) {
    out_algorithms->push_back(
        {CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, tensor_op_math_available});
    return true;
  }
  std::vector<dnn::AlgorithmDesc::Index> algo_types = {
      // clang-format off
    CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
    CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
    CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT,
    CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
@ -3410,16 +3409,12 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
  if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
    algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED);
  }
  if (!RequireCudnnDeterminism()) {
    algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0);
  }
  // The algorithms are intentionally ordered for deterministic operation
  for (auto i : algo_types) {
    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
    if (tensor_op_math_available) {
      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
    }
    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
  }
  return true;
@ -3431,10 +3426,18 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
  bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
  out_algorithms->clear();
  if (RequireDeterminism()) {
    out_algorithms->push_back(
        {CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, tensor_op_math_available});
    return true;
  }
  std::vector<dnn::AlgorithmDesc::Index> algo_types = {
      // clang-format off
      CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
      CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
      CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
      CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
      // Based on cudnn.h, the following is not implemented.
      // CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
@ -3446,17 +3449,12 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
  if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
    algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
  }
  if (!RequireCudnnDeterminism()) {
    algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0);
    algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3);
  }
  // The algorithms are intentionally ordered for deterministic operation
  for (auto i : algo_types) {
    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
    if (tensor_op_math_available) {
      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
    }
    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
  }
  return true;