Internal change
PiperOrigin-RevId: 291690233 Change-Id: I294cf4f577d1d17ad5bbb6e3be022ece6268575c
This commit is contained in:
parent
4f5d1cc221
commit
2938772a08
@ -672,7 +672,6 @@ cc_library(
|
||||
"//tensorflow/compiler/xla/service:hlo_pass",
|
||||
"//tensorflow/core:autotuning_proto_cc",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:lib_internal",
|
||||
"//tensorflow/core:stream_executor_no_cuda",
|
||||
"//tensorflow/core/util/proto:proto_utils",
|
||||
"//tensorflow/stream_executor:device_memory_allocator",
|
||||
|
@ -35,7 +35,6 @@ limitations under the License.
|
||||
#include "tensorflow/core/lib/strings/numbers.h"
|
||||
#include "tensorflow/core/platform/logger.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/util/env_var.h"
|
||||
#include "tensorflow/core/util/proto/proto_utils.h"
|
||||
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"
|
||||
|
||||
@ -333,35 +332,6 @@ StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(
|
||||
return result_or;
|
||||
}
|
||||
|
||||
// The following function allows deterministic ops to be implemented relatively
|
||||
// quickly using environment variables. It is intended to be temporary. The
|
||||
// longer-term intention is to enable deterministic ops via tf.config and
|
||||
// appropriate plumbing. See the discussion on PR 34951 for more information:
|
||||
// https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
|
||||
// This function and associated comment are replicated in the following three
|
||||
// places:
|
||||
// 1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
|
||||
// 2. tensorflow/core/kernels/gpu_utils.cc
|
||||
// 3. tensorflow/stream_executor/cuda/cuda_dnn.cc
|
||||
// When implementing the plumbing, you should also search for the use of
|
||||
// TF_DETERMINISTIC_OPS on its own.
|
||||
// TODO(duncanriach): move to an API that uses tf.config and implement the first
|
||||
// phase of plumbing.
|
||||
static bool RequireCudnnDeterminism() {
|
||||
static bool require_cudnn_determinism = [] {
|
||||
bool deterministic_ops = false;
|
||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
|
||||
/*default_val=*/false,
|
||||
&deterministic_ops));
|
||||
bool cudnn_deterministic = false;
|
||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
|
||||
/*default_val=*/false,
|
||||
&cudnn_deterministic));
|
||||
return deterministic_ops || cudnn_deterministic;
|
||||
}();
|
||||
return require_cudnn_determinism;
|
||||
}
|
||||
|
||||
StatusOr<tensorflow::AutotuneResult>
|
||||
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
|
||||
const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
|
||||
@ -598,41 +568,43 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
|
||||
}
|
||||
}
|
||||
|
||||
// For now, we ignore WRONG_RESULT failures because false-positives are
|
||||
// possible (e.g. perhaps the reference algorithm is the one that's
|
||||
// incorrect!). But we don't ignore REDZONE_MODIFIED failures because they're
|
||||
// quite severe and can be detected with high accuracy.
|
||||
auto has_failure = [](const AutotuneResult& r) {
|
||||
return r.has_failure() &&
|
||||
r.failure().kind() != AutotuneResult::WRONG_RESULT;
|
||||
};
|
||||
|
||||
// Choose the fastest convolution that doesn't produce a REDZONE_MODIFIED
|
||||
// error.
|
||||
//
|
||||
// TODO(jlebar): We ought to be able to detect redzone reads by noticing NaNs
|
||||
// in the output of the conv and skip those.
|
||||
//
|
||||
// For now, we ignore WRONG_RESULT failures because false-positives are
|
||||
// possible (e.g. perhaps the reference algorithm is the one that's
|
||||
// incorrect!). But we don't ignore REDZONE_MODIFIED failures because they're
|
||||
// quite severe and can be detected with high accuracy.
|
||||
std::vector<AutotuneResult> filtered_results;
|
||||
absl::c_copy_if(
|
||||
profile_results, std::back_inserter(filtered_results),
|
||||
[](const AutotuneResult& r) {
|
||||
return !(r.has_failure() &&
|
||||
r.failure().kind() != AutotuneResult::WRONG_RESULT);
|
||||
// The successful one should have a smaller key, since we are doing
|
||||
// min_element. If they are both unsuccessful, keep the earlier one in
|
||||
// the vector by comparing pointers.
|
||||
auto result_comparison_key = [&has_failure](const AutotuneResult& r) {
|
||||
return std::make_tuple(
|
||||
has_failure(r),
|
||||
tensorflow::proto_utils::FromDurationProto(r.run_time()));
|
||||
};
|
||||
const auto& best_result = absl::c_min_element(
|
||||
profile_results,
|
||||
[&](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
||||
return result_comparison_key(lhs) < result_comparison_key(rhs);
|
||||
});
|
||||
if (filtered_results.empty()) {
|
||||
return InternalError(
|
||||
"All algorithms tried for convolution %s failed. Falling back to "
|
||||
"default algorithm. ",
|
||||
instr->ToString());
|
||||
|
||||
if (best_result != profile_results.end() && !has_failure(*best_result)) {
|
||||
return *best_result;
|
||||
}
|
||||
|
||||
auto selected_result = filtered_results.begin();
|
||||
if (!RequireCudnnDeterminism()) {
|
||||
selected_result = absl::c_min_element(
|
||||
filtered_results,
|
||||
[](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
||||
return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) <
|
||||
tensorflow::proto_utils::FromDurationProto(rhs.run_time());
|
||||
});
|
||||
}
|
||||
|
||||
return *selected_result;
|
||||
return InternalError(
|
||||
"All algorithms tried for convolution %s failed. Falling back to "
|
||||
"default algorithm.",
|
||||
instr->ToString());
|
||||
}
|
||||
|
||||
StatusOr<tensorflow::AutotuneResult>
|
||||
|
@ -532,7 +532,6 @@ tf_cuda_library(
|
||||
"//tensorflow/core:conv_autotuning_proto_cc",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:stream_executor",
|
||||
"//tensorflow/core/util:env_var",
|
||||
"//tensorflow/core/util/proto:proto_utils",
|
||||
"//tensorflow/stream_executor/gpu:asm_compiler",
|
||||
"//tensorflow/stream_executor/gpu:redzone_allocator",
|
||||
|
@ -24,7 +24,6 @@ limitations under the License.
|
||||
#include "tensorflow/core/platform/logger.h"
|
||||
#include "tensorflow/core/protobuf/autotuning.pb.h"
|
||||
#include "tensorflow/core/protobuf/conv_autotuning.pb.h"
|
||||
#include "tensorflow/core/util/env_var.h"
|
||||
#include "tensorflow/core/util/proto/proto_utils.h"
|
||||
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
|
||||
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"
|
||||
@ -211,35 +210,6 @@ void LogFusedConvForwardAutotuneResults(
|
||||
Logger::GetSingleton()->LogProto(log);
|
||||
}
|
||||
|
||||
// The following function allows deterministic ops to be implemented relatively
|
||||
// quickly using environment variables. It is intended to be temporary. The
|
||||
// longer-term intention is to enable deterministic ops via tf.config and
|
||||
// appropriate plumbing. See the discussion on PR 34951 for more information:
|
||||
// https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
|
||||
// This function and associated comment are replicated in the following three
|
||||
// places:
|
||||
// 1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
|
||||
// 2. tensorflow/core/kernels/gpu_utils.cc
|
||||
// 3. tensorflow/stream_executor/cuda/cuda_dnn.cc
|
||||
// When implementing the plumbing, you should also search for the use of
|
||||
// TF_DETERMINISTIC_OPS on its own.
|
||||
// TODO(duncanriach): move to an API that uses tf.config and implement the first
|
||||
// phase of plumbing.
|
||||
bool RequireCudnnDeterminism() {
|
||||
static bool require_cudnn_determinism = [] {
|
||||
bool deterministic_ops = false;
|
||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
|
||||
/*default_val=*/false,
|
||||
&deterministic_ops));
|
||||
bool cudnn_deterministic = false;
|
||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
|
||||
/*default_val=*/false,
|
||||
&cudnn_deterministic));
|
||||
return deterministic_ops || cudnn_deterministic;
|
||||
}();
|
||||
return require_cudnn_determinism;
|
||||
}
|
||||
|
||||
Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
|
||||
se::dnn::AlgorithmConfig* algo) {
|
||||
std::vector<AutotuneResult> filtered_results;
|
||||
@ -249,32 +219,31 @@ Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
|
||||
if (filtered_results.empty()) {
|
||||
return errors::NotFound("No algorithm worked!");
|
||||
}
|
||||
std::vector<AutotuneResult> filtered_results_no_scratch;
|
||||
absl::c_copy_if(
|
||||
filtered_results, std::back_inserter(filtered_results_no_scratch),
|
||||
[](const AutotuneResult& result) { return result.scratch_bytes() == 0; });
|
||||
|
||||
auto selected_result = filtered_results.begin();
|
||||
auto selected_result_no_scratch = filtered_results_no_scratch.begin();
|
||||
if (!RequireCudnnDeterminism()) {
|
||||
auto compare_run_times = [](const AutotuneResult& lhs,
|
||||
const AutotuneResult& rhs) {
|
||||
return proto_utils::FromDurationProto(lhs.run_time()) <
|
||||
proto_utils::FromDurationProto(rhs.run_time());
|
||||
};
|
||||
selected_result = absl::c_min_element(filtered_results, compare_run_times);
|
||||
selected_result_no_scratch =
|
||||
absl::c_min_element(filtered_results_no_scratch, compare_run_times);
|
||||
}
|
||||
const auto best_result = absl::c_min_element(
|
||||
filtered_results,
|
||||
[](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
||||
return proto_utils::FromDurationProto(lhs.run_time()) <
|
||||
proto_utils::FromDurationProto(rhs.run_time());
|
||||
});
|
||||
|
||||
algo->set_algorithm({selected_result->conv().algorithm(),
|
||||
selected_result->conv().tensor_ops_enabled()});
|
||||
if (selected_result_no_scratch != filtered_results_no_scratch.end()) {
|
||||
const auto best_result_no_scratch = absl::c_min_element(
|
||||
filtered_results,
|
||||
[](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
||||
return std::make_tuple(lhs.scratch_bytes(),
|
||||
proto_utils::FromDurationProto(lhs.run_time())) <
|
||||
std::make_tuple(rhs.scratch_bytes(),
|
||||
proto_utils::FromDurationProto(rhs.run_time()));
|
||||
});
|
||||
|
||||
algo->set_algorithm({best_result->conv().algorithm(),
|
||||
best_result->conv().tensor_ops_enabled()});
|
||||
if (best_result_no_scratch != filtered_results.end() &&
|
||||
best_result_no_scratch->scratch_bytes() == 0) {
|
||||
algo->set_algorithm_no_scratch(
|
||||
{selected_result_no_scratch->conv().algorithm(),
|
||||
selected_result_no_scratch->conv().tensor_ops_enabled()});
|
||||
{best_result_no_scratch->conv().algorithm(),
|
||||
best_result_no_scratch->conv().tensor_ops_enabled()});
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -28,39 +28,22 @@ from tensorflow.python.framework import test_util
|
||||
from tensorflow.python.ops import nn_ops
|
||||
from tensorflow.python.platform import test
|
||||
|
||||
# Notes:
|
||||
#
|
||||
# Deterministic cuDNN operation is selected by setting either of the two
|
||||
# environment variables TF_CUDNN_DETERMINISTIC or TF_DETERMINISTIC_OPS to 'true'
|
||||
# or '1' while also not setting the environment variable TF_CUDNN_USE_AUTOTUNE
|
||||
# to 'false' or '0'.
|
||||
#
|
||||
# Where both deterministic and non-deterministic cuDNN algorithms are available,
|
||||
# selecting determinitic operation will lead to only the deterministic
|
||||
# algorithms being chosen. Additionally, selecting deterministic operation will
|
||||
# result in a deterministic, or reproducible, selection of algorithms (for any
|
||||
# given layer configuration) for each of the forward and the two backward paths.
|
||||
#
|
||||
# These tests intend to confirm that deterministic algorithms are chosen (for
|
||||
# the back-prop paths) when desterministic operation is selected. The tested
|
||||
# configurations were first confirmed to produce non-deterministic results when
|
||||
# the above-mentioned environment variables are not set.
|
||||
#
|
||||
# Even though selecting determinitic operation should ensure that the same
|
||||
# algorithms, for a given layer configuration, are always used (i.e. that
|
||||
# algorithm selection is deterministic / reproducible), this is not tested.
|
||||
# Setting either of the two environment variables TF_CUDNN_DETERMINISTIC or
|
||||
# TF_DETERMINISTIC_OPS to "true" or "1" will disable autotuning of cuDNN
|
||||
# algorithms and cause deterministic cuDNN algorithms to be selected when both
|
||||
# deterministic and non-deterministic algorithms are available. These tests are
|
||||
# intended to confirm that deterministic algorithms are chosen when either
|
||||
# environment variable is set to "true" or "1". The tested configurations were
|
||||
# first confirmed to produce non-deterministic results when the environment
|
||||
# variables are not set.
|
||||
|
||||
# TODO(duncanriach): Add test for deterministic cuDNN max-pooling
|
||||
_PADDING = 'SAME'
|
||||
_STRIDES = [1, 1, 1, 1]
|
||||
|
||||
LayerShapeNHWC = collections.namedtuple('LayerShapeNHWC',
|
||||
'batch, height, width, channels')
|
||||
FilterShape2D = collections.namedtuple(
|
||||
'FilterShape2D', 'height, width, in_channels, out_channels')
|
||||
|
||||
LayerShapeNCDHW = collections.namedtuple(
|
||||
'LayerShapeNCDHW', 'batch, channels, depth, height, width')
|
||||
FilterShape3D = collections.namedtuple(
|
||||
'FilterShape3D', 'depth, height, width, in_channels, out_channels')
|
||||
LayerShape = collections.namedtuple('LayerShape',
|
||||
'batch, height, width, channels')
|
||||
FilterShape = collections.namedtuple(
|
||||
'FilterShape', 'height, width, in_channels, out_channels')
|
||||
|
||||
|
||||
class ConvolutionTest(test.TestCase):
|
||||
@ -71,13 +54,14 @@ class ConvolutionTest(test.TestCase):
|
||||
return constant_op.constant(
|
||||
2 * np.random.random_sample(shape) - 1, dtype=dtypes.float32)
|
||||
|
||||
def _random_out_op(self, in_shape, filter_shape, strides, padding):
|
||||
def _random_out_op(self, in_shape, filter_shape):
|
||||
# Choosing not to use array_op.zeros() to prevent possible removal by
|
||||
# optimization
|
||||
in_op = self._random_data_op(in_shape)
|
||||
filter_op = self._random_data_op(filter_shape)
|
||||
# Use the forward op's shape-inference
|
||||
conv_op = nn_ops.conv2d(in_op, filter_op, strides=strides, padding=padding)
|
||||
conv_op = nn_ops.conv2d(
|
||||
in_op, filter_op, strides=_STRIDES, padding=_PADDING)
|
||||
out_shape = conv_op.get_shape()
|
||||
out_op = self._random_data_op(out_shape)
|
||||
return out_op
|
||||
@ -88,54 +72,29 @@ class ConvolutionTest(test.TestCase):
|
||||
result_2 = self.evaluate(operation)
|
||||
self.assertAllEqual(result_1, result_2)
|
||||
|
||||
# The default forward algorithm choice, when using cuDNN 7, does not support
|
||||
# the following layer configuration. This test case intends to confirm that
|
||||
# an alternative algorithm is selected. Note that, in cuDNN 7, all forward
|
||||
# algorithms are determnistic.
|
||||
@test_util.run_cuda_only
|
||||
def testForward(self):
|
||||
np.random.seed(3)
|
||||
in_shape = LayerShapeNCDHW(batch=2, channels=3, depth=5, height=7, width=6)
|
||||
filter_shape = FilterShape3D(
|
||||
depth=3, height=3, width=3, in_channels=3, out_channels=2)
|
||||
in_op = self._random_data_op(in_shape)
|
||||
filter_op = self._random_data_op(filter_shape)
|
||||
strides = [1, 1, 1, 1, 1]
|
||||
padding = 'VALID'
|
||||
dilations = [1, 1, 2, 2, 2]
|
||||
out_op = nn_ops.conv3d(
|
||||
in_op,
|
||||
filter_op,
|
||||
strides=strides,
|
||||
padding=padding,
|
||||
data_format='NCDHW',
|
||||
dilations=dilations)
|
||||
self._assert_reproducible(out_op)
|
||||
|
||||
@test_util.run_cuda_only
|
||||
def testBackwardFilterGradient(self):
|
||||
np.random.seed(1)
|
||||
in_shape = LayerShapeNHWC(batch=8, height=128, width=128, channels=8)
|
||||
filter_shape = FilterShape2D(
|
||||
height=3, width=3, in_channels=8, out_channels=8)
|
||||
in_shape = LayerShape(batch=8, height=128, width=128, channels=8)
|
||||
filter_shape = FilterShape(height=3, width=3, in_channels=8, out_channels=8)
|
||||
in_op = self._random_data_op(in_shape)
|
||||
strides = [1, 1, 1, 1]
|
||||
padding = 'SAME'
|
||||
out_op = self._random_out_op(in_shape, filter_shape, strides, padding)
|
||||
out_op = self._random_out_op(in_shape, filter_shape)
|
||||
filter_gradient_op = nn_ops.conv2d_backprop_filter(
|
||||
in_op, filter_shape, out_op, strides=strides, padding=padding)
|
||||
in_op, filter_shape, out_op, strides=_STRIDES, padding=_PADDING)
|
||||
self._assert_reproducible(filter_gradient_op)
|
||||
|
||||
@test_util.run_cuda_only
|
||||
def testBackwardInputGradient(self):
|
||||
np.random.seed(2)
|
||||
in_shape = LayerShapeNHWC(batch=8, height=32, width=32, channels=8)
|
||||
filter_shape = FilterShape2D(
|
||||
in_shape = LayerShape(batch=8, height=32, width=32, channels=8)
|
||||
filter_shape = FilterShape(
|
||||
height=7, width=7, in_channels=8, out_channels=128)
|
||||
filter_op = self._random_data_op(filter_shape)
|
||||
strides = [1, 1, 1, 1]
|
||||
padding = 'SAME'
|
||||
out_op = self._random_out_op(in_shape, filter_shape, strides, padding)
|
||||
out_op = self._random_out_op(in_shape, filter_shape)
|
||||
input_gradient_op = nn_ops.conv2d_backprop_input(
|
||||
in_shape, filter_op, out_op, strides=strides, padding=padding)
|
||||
in_shape, filter_op, out_op, strides=_STRIDES, padding=_PADDING)
|
||||
self._assert_reproducible(input_gradient_op)
|
||||
|
||||
# TODO(duncanriach): (1) add test to confirm that forward autotuning is
|
||||
# disabled for cuDNN convolution; (2) add test for deterministic cuDNN
|
||||
# max-pooling
|
||||
|
@ -648,22 +648,9 @@ bool BatchnormSpatialPersistentEnabled() {
|
||||
return is_enabled;
|
||||
}
|
||||
|
||||
// The following function allows deterministic ops to be implemented relatively
|
||||
// quickly using environment variables. It is intended to be temporary. The
|
||||
// longer-term intention is to enable deterministic ops via tf.config and
|
||||
// appropriate plumbing. See the discussion on PR 34951 for more information:
|
||||
// https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
|
||||
// This function and associated comment are replicated in the following three
|
||||
// places:
|
||||
// 1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
|
||||
// 2. tensorflow/core/kernels/gpu_utils.cc
|
||||
// 3. tensorflow/stream_executor/cuda/cuda_dnn.cc
|
||||
// When implementing the plumbing, you should also search for the use of
|
||||
// TF_DETERMINISTIC_OPS on its own.
|
||||
// TODO(duncanriach): move to an API that uses tf.config and implement the first
|
||||
// phase of plumbing.
|
||||
bool RequireCudnnDeterminism() {
|
||||
static bool require_cudnn_determinism = [] {
|
||||
// A helper function to decide whether to enable deterministic functionality.
|
||||
bool RequireDeterminism() {
|
||||
static bool require_determinism = [] {
|
||||
bool deterministic_ops = false;
|
||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
|
||||
/*default_val=*/false,
|
||||
@ -674,7 +661,7 @@ bool RequireCudnnDeterminism() {
|
||||
&cudnn_deterministic));
|
||||
return deterministic_ops || cudnn_deterministic;
|
||||
}();
|
||||
return require_cudnn_determinism;
|
||||
return require_determinism;
|
||||
}
|
||||
|
||||
std::tuple<int, int> GetCcMajorMinor(Stream* stream) {
|
||||
@ -775,7 +762,7 @@ class CudnnPoolingDescriptor {
|
||||
std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
|
||||
&CheckedNarrowing<int64, int>);
|
||||
bool propagate_nans = pooling_descriptor.propagate_nans();
|
||||
const auto cudnn_max_pooling_mode = RequireCudnnDeterminism()
|
||||
const auto cudnn_max_pooling_mode = RequireDeterminism()
|
||||
? CUDNN_POOLING_MAX_DETERMINISTIC
|
||||
: CUDNN_POOLING_MAX;
|
||||
CHECK_CUDNN_OK(cudnnSetPoolingNdDescriptor(
|
||||
@ -3343,15 +3330,21 @@ bool CudnnSupport::GetConvolveAlgorithms(
|
||||
bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
|
||||
out_algorithms->clear();
|
||||
|
||||
if (RequireDeterminism()) {
|
||||
out_algorithms->push_back({CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
|
||||
tensor_op_math_available});
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
|
||||
// clang-format off
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
|
||||
// clang-format off
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
|
||||
// clang-format on
|
||||
// clang-format on
|
||||
};
|
||||
if (CudnnEnvVar<FftTilingForward>::IsEnabled()) {
|
||||
algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING);
|
||||
@ -3360,12 +3353,11 @@ bool CudnnSupport::GetConvolveAlgorithms(
|
||||
algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED);
|
||||
}
|
||||
|
||||
// The algorithms are intentionally ordered for deterministic operation
|
||||
for (auto i : algo_types) {
|
||||
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
||||
if (tensor_op_math_available) {
|
||||
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
|
||||
}
|
||||
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -3399,8 +3391,15 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
|
||||
bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
|
||||
out_algorithms->clear();
|
||||
|
||||
if (RequireDeterminism()) {
|
||||
out_algorithms->push_back(
|
||||
{CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, tensor_op_math_available});
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
|
||||
// clang-format off
|
||||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
|
||||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
|
||||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT,
|
||||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
|
||||
@ -3410,16 +3409,12 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
|
||||
if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
|
||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED);
|
||||
}
|
||||
if (!RequireCudnnDeterminism()) {
|
||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0);
|
||||
}
|
||||
|
||||
// The algorithms are intentionally ordered for deterministic operation
|
||||
for (auto i : algo_types) {
|
||||
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
||||
if (tensor_op_math_available) {
|
||||
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
|
||||
}
|
||||
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -3431,10 +3426,18 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
|
||||
bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
|
||||
out_algorithms->clear();
|
||||
|
||||
if (RequireDeterminism()) {
|
||||
out_algorithms->push_back(
|
||||
{CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, tensor_op_math_available});
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
|
||||
// clang-format off
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
|
||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
|
||||
// Based on cudnn.h, the following is not implemented.
|
||||
// CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
|
||||
|
||||
@ -3446,17 +3449,12 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
|
||||
if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
|
||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
|
||||
}
|
||||
if (!RequireCudnnDeterminism()) {
|
||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0);
|
||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3);
|
||||
}
|
||||
|
||||
// The algorithms are intentionally ordered for deterministic operation
|
||||
for (auto i : algo_types) {
|
||||
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
||||
if (tensor_op_math_available) {
|
||||
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
|
||||
}
|
||||
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
||||
}
|
||||
|
||||
return true;
|
||||
|
Loading…
Reference in New Issue
Block a user