Internal change
PiperOrigin-RevId: 291690233 Change-Id: I294cf4f577d1d17ad5bbb6e3be022ece6268575c
This commit is contained in:
parent
4f5d1cc221
commit
2938772a08
@ -672,7 +672,6 @@ cc_library(
|
|||||||
"//tensorflow/compiler/xla/service:hlo_pass",
|
"//tensorflow/compiler/xla/service:hlo_pass",
|
||||||
"//tensorflow/core:autotuning_proto_cc",
|
"//tensorflow/core:autotuning_proto_cc",
|
||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//tensorflow/core:lib_internal",
|
|
||||||
"//tensorflow/core:stream_executor_no_cuda",
|
"//tensorflow/core:stream_executor_no_cuda",
|
||||||
"//tensorflow/core/util/proto:proto_utils",
|
"//tensorflow/core/util/proto:proto_utils",
|
||||||
"//tensorflow/stream_executor:device_memory_allocator",
|
"//tensorflow/stream_executor:device_memory_allocator",
|
||||||
|
@ -35,7 +35,6 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/lib/strings/numbers.h"
|
#include "tensorflow/core/lib/strings/numbers.h"
|
||||||
#include "tensorflow/core/platform/logger.h"
|
#include "tensorflow/core/platform/logger.h"
|
||||||
#include "tensorflow/core/platform/mutex.h"
|
#include "tensorflow/core/platform/mutex.h"
|
||||||
#include "tensorflow/core/util/env_var.h"
|
|
||||||
#include "tensorflow/core/util/proto/proto_utils.h"
|
#include "tensorflow/core/util/proto/proto_utils.h"
|
||||||
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"
|
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"
|
||||||
|
|
||||||
@ -333,35 +332,6 @@ StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(
|
|||||||
return result_or;
|
return result_or;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The following function allows deterministic ops to be implemented relatively
|
|
||||||
// quickly using environment variables. It is intended to be temporary. The
|
|
||||||
// longer-term intention is to enable deterministic ops via tf.config and
|
|
||||||
// appropriate plumbing. See the discussion on PR 34951 for more information:
|
|
||||||
// https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
|
|
||||||
// This function and associated comment are replicated in the following three
|
|
||||||
// places:
|
|
||||||
// 1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
|
|
||||||
// 2. tensorflow/core/kernels/gpu_utils.cc
|
|
||||||
// 3. tensorflow/stream_executor/cuda/cuda_dnn.cc
|
|
||||||
// When implementing the plumbing, you should also search for the use of
|
|
||||||
// TF_DETERMINISTIC_OPS on its own.
|
|
||||||
// TODO(duncanriach): move to an API that uses tf.config and implement the first
|
|
||||||
// phase of plumbing.
|
|
||||||
static bool RequireCudnnDeterminism() {
|
|
||||||
static bool require_cudnn_determinism = [] {
|
|
||||||
bool deterministic_ops = false;
|
|
||||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
|
|
||||||
/*default_val=*/false,
|
|
||||||
&deterministic_ops));
|
|
||||||
bool cudnn_deterministic = false;
|
|
||||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
|
|
||||||
/*default_val=*/false,
|
|
||||||
&cudnn_deterministic));
|
|
||||||
return deterministic_ops || cudnn_deterministic;
|
|
||||||
}();
|
|
||||||
return require_cudnn_determinism;
|
|
||||||
}
|
|
||||||
|
|
||||||
StatusOr<tensorflow::AutotuneResult>
|
StatusOr<tensorflow::AutotuneResult>
|
||||||
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
|
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
|
||||||
const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
|
const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
|
||||||
@ -598,41 +568,43 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// For now, we ignore WRONG_RESULT failures because false-positives are
|
||||||
|
// possible (e.g. perhaps the reference algorithm is the one that's
|
||||||
|
// incorrect!). But we don't ignore REDZONE_MODIFIED failures because they're
|
||||||
|
// quite severe and can be detected with high accuracy.
|
||||||
|
auto has_failure = [](const AutotuneResult& r) {
|
||||||
|
return r.has_failure() &&
|
||||||
|
r.failure().kind() != AutotuneResult::WRONG_RESULT;
|
||||||
|
};
|
||||||
|
|
||||||
// Choose the fastest convolution that doesn't produce a REDZONE_MODIFIED
|
// Choose the fastest convolution that doesn't produce a REDZONE_MODIFIED
|
||||||
// error.
|
// error.
|
||||||
//
|
//
|
||||||
// TODO(jlebar): We ought to be able to detect redzone reads by noticing NaNs
|
// TODO(jlebar): We ought to be able to detect redzone reads by noticing NaNs
|
||||||
// in the output of the conv and skip those.
|
// in the output of the conv and skip those.
|
||||||
//
|
//
|
||||||
// For now, we ignore WRONG_RESULT failures because false-positives are
|
// The successful one should have a smaller key, since we are doing
|
||||||
// possible (e.g. perhaps the reference algorithm is the one that's
|
// min_element. If they are both unsuccessful, keep the earlier one in
|
||||||
// incorrect!). But we don't ignore REDZONE_MODIFIED failures because they're
|
// the vector by comparing pointers.
|
||||||
// quite severe and can be detected with high accuracy.
|
auto result_comparison_key = [&has_failure](const AutotuneResult& r) {
|
||||||
std::vector<AutotuneResult> filtered_results;
|
return std::make_tuple(
|
||||||
absl::c_copy_if(
|
has_failure(r),
|
||||||
profile_results, std::back_inserter(filtered_results),
|
tensorflow::proto_utils::FromDurationProto(r.run_time()));
|
||||||
[](const AutotuneResult& r) {
|
};
|
||||||
return !(r.has_failure() &&
|
const auto& best_result = absl::c_min_element(
|
||||||
r.failure().kind() != AutotuneResult::WRONG_RESULT);
|
profile_results,
|
||||||
|
[&](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
||||||
|
return result_comparison_key(lhs) < result_comparison_key(rhs);
|
||||||
});
|
});
|
||||||
if (filtered_results.empty()) {
|
|
||||||
return InternalError(
|
if (best_result != profile_results.end() && !has_failure(*best_result)) {
|
||||||
"All algorithms tried for convolution %s failed. Falling back to "
|
return *best_result;
|
||||||
"default algorithm. ",
|
|
||||||
instr->ToString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
auto selected_result = filtered_results.begin();
|
return InternalError(
|
||||||
if (!RequireCudnnDeterminism()) {
|
"All algorithms tried for convolution %s failed. Falling back to "
|
||||||
selected_result = absl::c_min_element(
|
"default algorithm.",
|
||||||
filtered_results,
|
instr->ToString());
|
||||||
[](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
|
||||||
return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) <
|
|
||||||
tensorflow::proto_utils::FromDurationProto(rhs.run_time());
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return *selected_result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
StatusOr<tensorflow::AutotuneResult>
|
StatusOr<tensorflow::AutotuneResult>
|
||||||
|
@ -532,7 +532,6 @@ tf_cuda_library(
|
|||||||
"//tensorflow/core:conv_autotuning_proto_cc",
|
"//tensorflow/core:conv_autotuning_proto_cc",
|
||||||
"//tensorflow/core:lib",
|
"//tensorflow/core:lib",
|
||||||
"//tensorflow/core:stream_executor",
|
"//tensorflow/core:stream_executor",
|
||||||
"//tensorflow/core/util:env_var",
|
|
||||||
"//tensorflow/core/util/proto:proto_utils",
|
"//tensorflow/core/util/proto:proto_utils",
|
||||||
"//tensorflow/stream_executor/gpu:asm_compiler",
|
"//tensorflow/stream_executor/gpu:asm_compiler",
|
||||||
"//tensorflow/stream_executor/gpu:redzone_allocator",
|
"//tensorflow/stream_executor/gpu:redzone_allocator",
|
||||||
|
@ -24,7 +24,6 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/platform/logger.h"
|
#include "tensorflow/core/platform/logger.h"
|
||||||
#include "tensorflow/core/protobuf/autotuning.pb.h"
|
#include "tensorflow/core/protobuf/autotuning.pb.h"
|
||||||
#include "tensorflow/core/protobuf/conv_autotuning.pb.h"
|
#include "tensorflow/core/protobuf/conv_autotuning.pb.h"
|
||||||
#include "tensorflow/core/util/env_var.h"
|
|
||||||
#include "tensorflow/core/util/proto/proto_utils.h"
|
#include "tensorflow/core/util/proto/proto_utils.h"
|
||||||
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
|
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
|
||||||
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"
|
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"
|
||||||
@ -211,35 +210,6 @@ void LogFusedConvForwardAutotuneResults(
|
|||||||
Logger::GetSingleton()->LogProto(log);
|
Logger::GetSingleton()->LogProto(log);
|
||||||
}
|
}
|
||||||
|
|
||||||
// The following function allows deterministic ops to be implemented relatively
|
|
||||||
// quickly using environment variables. It is intended to be temporary. The
|
|
||||||
// longer-term intention is to enable deterministic ops via tf.config and
|
|
||||||
// appropriate plumbing. See the discussion on PR 34951 for more information:
|
|
||||||
// https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
|
|
||||||
// This function and associated comment are replicated in the following three
|
|
||||||
// places:
|
|
||||||
// 1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
|
|
||||||
// 2. tensorflow/core/kernels/gpu_utils.cc
|
|
||||||
// 3. tensorflow/stream_executor/cuda/cuda_dnn.cc
|
|
||||||
// When implementing the plumbing, you should also search for the use of
|
|
||||||
// TF_DETERMINISTIC_OPS on its own.
|
|
||||||
// TODO(duncanriach): move to an API that uses tf.config and implement the first
|
|
||||||
// phase of plumbing.
|
|
||||||
bool RequireCudnnDeterminism() {
|
|
||||||
static bool require_cudnn_determinism = [] {
|
|
||||||
bool deterministic_ops = false;
|
|
||||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
|
|
||||||
/*default_val=*/false,
|
|
||||||
&deterministic_ops));
|
|
||||||
bool cudnn_deterministic = false;
|
|
||||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
|
|
||||||
/*default_val=*/false,
|
|
||||||
&cudnn_deterministic));
|
|
||||||
return deterministic_ops || cudnn_deterministic;
|
|
||||||
}();
|
|
||||||
return require_cudnn_determinism;
|
|
||||||
}
|
|
||||||
|
|
||||||
Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
|
Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
|
||||||
se::dnn::AlgorithmConfig* algo) {
|
se::dnn::AlgorithmConfig* algo) {
|
||||||
std::vector<AutotuneResult> filtered_results;
|
std::vector<AutotuneResult> filtered_results;
|
||||||
@ -249,32 +219,31 @@ Status BestCudnnConvAlgorithm(absl::Span<const AutotuneResult> results,
|
|||||||
if (filtered_results.empty()) {
|
if (filtered_results.empty()) {
|
||||||
return errors::NotFound("No algorithm worked!");
|
return errors::NotFound("No algorithm worked!");
|
||||||
}
|
}
|
||||||
std::vector<AutotuneResult> filtered_results_no_scratch;
|
|
||||||
absl::c_copy_if(
|
|
||||||
filtered_results, std::back_inserter(filtered_results_no_scratch),
|
|
||||||
[](const AutotuneResult& result) { return result.scratch_bytes() == 0; });
|
|
||||||
|
|
||||||
auto selected_result = filtered_results.begin();
|
const auto best_result = absl::c_min_element(
|
||||||
auto selected_result_no_scratch = filtered_results_no_scratch.begin();
|
filtered_results,
|
||||||
if (!RequireCudnnDeterminism()) {
|
[](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
||||||
auto compare_run_times = [](const AutotuneResult& lhs,
|
return proto_utils::FromDurationProto(lhs.run_time()) <
|
||||||
const AutotuneResult& rhs) {
|
proto_utils::FromDurationProto(rhs.run_time());
|
||||||
return proto_utils::FromDurationProto(lhs.run_time()) <
|
});
|
||||||
proto_utils::FromDurationProto(rhs.run_time());
|
|
||||||
};
|
|
||||||
selected_result = absl::c_min_element(filtered_results, compare_run_times);
|
|
||||||
selected_result_no_scratch =
|
|
||||||
absl::c_min_element(filtered_results_no_scratch, compare_run_times);
|
|
||||||
}
|
|
||||||
|
|
||||||
algo->set_algorithm({selected_result->conv().algorithm(),
|
const auto best_result_no_scratch = absl::c_min_element(
|
||||||
selected_result->conv().tensor_ops_enabled()});
|
filtered_results,
|
||||||
if (selected_result_no_scratch != filtered_results_no_scratch.end()) {
|
[](const AutotuneResult& lhs, const AutotuneResult& rhs) {
|
||||||
|
return std::make_tuple(lhs.scratch_bytes(),
|
||||||
|
proto_utils::FromDurationProto(lhs.run_time())) <
|
||||||
|
std::make_tuple(rhs.scratch_bytes(),
|
||||||
|
proto_utils::FromDurationProto(rhs.run_time()));
|
||||||
|
});
|
||||||
|
|
||||||
|
algo->set_algorithm({best_result->conv().algorithm(),
|
||||||
|
best_result->conv().tensor_ops_enabled()});
|
||||||
|
if (best_result_no_scratch != filtered_results.end() &&
|
||||||
|
best_result_no_scratch->scratch_bytes() == 0) {
|
||||||
algo->set_algorithm_no_scratch(
|
algo->set_algorithm_no_scratch(
|
||||||
{selected_result_no_scratch->conv().algorithm(),
|
{best_result_no_scratch->conv().algorithm(),
|
||||||
selected_result_no_scratch->conv().tensor_ops_enabled()});
|
best_result_no_scratch->conv().tensor_ops_enabled()});
|
||||||
}
|
}
|
||||||
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -28,39 +28,22 @@ from tensorflow.python.framework import test_util
|
|||||||
from tensorflow.python.ops import nn_ops
|
from tensorflow.python.ops import nn_ops
|
||||||
from tensorflow.python.platform import test
|
from tensorflow.python.platform import test
|
||||||
|
|
||||||
# Notes:
|
# Setting either of the two environment variables TF_CUDNN_DETERMINISTIC or
|
||||||
#
|
# TF_DETERMINISTIC_OPS to "true" or "1" will disable autotuning of cuDNN
|
||||||
# Deterministic cuDNN operation is selected by setting either of the two
|
# algorithms and cause deterministic cuDNN algorithms to be selected when both
|
||||||
# environment variables TF_CUDNN_DETERMINISTIC or TF_DETERMINISTIC_OPS to 'true'
|
# deterministic and non-deterministic algorithms are available. These tests are
|
||||||
# or '1' while also not setting the environment variable TF_CUDNN_USE_AUTOTUNE
|
# intended to confirm that deterministic algorithms are chosen when either
|
||||||
# to 'false' or '0'.
|
# environment variable is set to "true" or "1". The tested configurations were
|
||||||
#
|
# first confirmed to produce non-deterministic results when the environment
|
||||||
# Where both deterministic and non-deterministic cuDNN algorithms are available,
|
# variables are not set.
|
||||||
# selecting determinitic operation will lead to only the deterministic
|
|
||||||
# algorithms being chosen. Additionally, selecting deterministic operation will
|
|
||||||
# result in a deterministic, or reproducible, selection of algorithms (for any
|
|
||||||
# given layer configuration) for each of the forward and the two backward paths.
|
|
||||||
#
|
|
||||||
# These tests intend to confirm that deterministic algorithms are chosen (for
|
|
||||||
# the back-prop paths) when desterministic operation is selected. The tested
|
|
||||||
# configurations were first confirmed to produce non-deterministic results when
|
|
||||||
# the above-mentioned environment variables are not set.
|
|
||||||
#
|
|
||||||
# Even though selecting determinitic operation should ensure that the same
|
|
||||||
# algorithms, for a given layer configuration, are always used (i.e. that
|
|
||||||
# algorithm selection is deterministic / reproducible), this is not tested.
|
|
||||||
|
|
||||||
# TODO(duncanriach): Add test for deterministic cuDNN max-pooling
|
_PADDING = 'SAME'
|
||||||
|
_STRIDES = [1, 1, 1, 1]
|
||||||
|
|
||||||
LayerShapeNHWC = collections.namedtuple('LayerShapeNHWC',
|
LayerShape = collections.namedtuple('LayerShape',
|
||||||
'batch, height, width, channels')
|
'batch, height, width, channels')
|
||||||
FilterShape2D = collections.namedtuple(
|
FilterShape = collections.namedtuple(
|
||||||
'FilterShape2D', 'height, width, in_channels, out_channels')
|
'FilterShape', 'height, width, in_channels, out_channels')
|
||||||
|
|
||||||
LayerShapeNCDHW = collections.namedtuple(
|
|
||||||
'LayerShapeNCDHW', 'batch, channels, depth, height, width')
|
|
||||||
FilterShape3D = collections.namedtuple(
|
|
||||||
'FilterShape3D', 'depth, height, width, in_channels, out_channels')
|
|
||||||
|
|
||||||
|
|
||||||
class ConvolutionTest(test.TestCase):
|
class ConvolutionTest(test.TestCase):
|
||||||
@ -71,13 +54,14 @@ class ConvolutionTest(test.TestCase):
|
|||||||
return constant_op.constant(
|
return constant_op.constant(
|
||||||
2 * np.random.random_sample(shape) - 1, dtype=dtypes.float32)
|
2 * np.random.random_sample(shape) - 1, dtype=dtypes.float32)
|
||||||
|
|
||||||
def _random_out_op(self, in_shape, filter_shape, strides, padding):
|
def _random_out_op(self, in_shape, filter_shape):
|
||||||
# Choosing not to use array_op.zeros() to prevent possible removal by
|
# Choosing not to use array_op.zeros() to prevent possible removal by
|
||||||
# optimization
|
# optimization
|
||||||
in_op = self._random_data_op(in_shape)
|
in_op = self._random_data_op(in_shape)
|
||||||
filter_op = self._random_data_op(filter_shape)
|
filter_op = self._random_data_op(filter_shape)
|
||||||
# Use the forward op's shape-inference
|
# Use the forward op's shape-inference
|
||||||
conv_op = nn_ops.conv2d(in_op, filter_op, strides=strides, padding=padding)
|
conv_op = nn_ops.conv2d(
|
||||||
|
in_op, filter_op, strides=_STRIDES, padding=_PADDING)
|
||||||
out_shape = conv_op.get_shape()
|
out_shape = conv_op.get_shape()
|
||||||
out_op = self._random_data_op(out_shape)
|
out_op = self._random_data_op(out_shape)
|
||||||
return out_op
|
return out_op
|
||||||
@ -88,54 +72,29 @@ class ConvolutionTest(test.TestCase):
|
|||||||
result_2 = self.evaluate(operation)
|
result_2 = self.evaluate(operation)
|
||||||
self.assertAllEqual(result_1, result_2)
|
self.assertAllEqual(result_1, result_2)
|
||||||
|
|
||||||
# The default forward algorithm choice, when using cuDNN 7, does not support
|
|
||||||
# the following layer configuration. This test case intends to confirm that
|
|
||||||
# an alternative algorithm is selected. Note that, in cuDNN 7, all forward
|
|
||||||
# algorithms are determnistic.
|
|
||||||
@test_util.run_cuda_only
|
|
||||||
def testForward(self):
|
|
||||||
np.random.seed(3)
|
|
||||||
in_shape = LayerShapeNCDHW(batch=2, channels=3, depth=5, height=7, width=6)
|
|
||||||
filter_shape = FilterShape3D(
|
|
||||||
depth=3, height=3, width=3, in_channels=3, out_channels=2)
|
|
||||||
in_op = self._random_data_op(in_shape)
|
|
||||||
filter_op = self._random_data_op(filter_shape)
|
|
||||||
strides = [1, 1, 1, 1, 1]
|
|
||||||
padding = 'VALID'
|
|
||||||
dilations = [1, 1, 2, 2, 2]
|
|
||||||
out_op = nn_ops.conv3d(
|
|
||||||
in_op,
|
|
||||||
filter_op,
|
|
||||||
strides=strides,
|
|
||||||
padding=padding,
|
|
||||||
data_format='NCDHW',
|
|
||||||
dilations=dilations)
|
|
||||||
self._assert_reproducible(out_op)
|
|
||||||
|
|
||||||
@test_util.run_cuda_only
|
@test_util.run_cuda_only
|
||||||
def testBackwardFilterGradient(self):
|
def testBackwardFilterGradient(self):
|
||||||
np.random.seed(1)
|
np.random.seed(1)
|
||||||
in_shape = LayerShapeNHWC(batch=8, height=128, width=128, channels=8)
|
in_shape = LayerShape(batch=8, height=128, width=128, channels=8)
|
||||||
filter_shape = FilterShape2D(
|
filter_shape = FilterShape(height=3, width=3, in_channels=8, out_channels=8)
|
||||||
height=3, width=3, in_channels=8, out_channels=8)
|
|
||||||
in_op = self._random_data_op(in_shape)
|
in_op = self._random_data_op(in_shape)
|
||||||
strides = [1, 1, 1, 1]
|
out_op = self._random_out_op(in_shape, filter_shape)
|
||||||
padding = 'SAME'
|
|
||||||
out_op = self._random_out_op(in_shape, filter_shape, strides, padding)
|
|
||||||
filter_gradient_op = nn_ops.conv2d_backprop_filter(
|
filter_gradient_op = nn_ops.conv2d_backprop_filter(
|
||||||
in_op, filter_shape, out_op, strides=strides, padding=padding)
|
in_op, filter_shape, out_op, strides=_STRIDES, padding=_PADDING)
|
||||||
self._assert_reproducible(filter_gradient_op)
|
self._assert_reproducible(filter_gradient_op)
|
||||||
|
|
||||||
@test_util.run_cuda_only
|
@test_util.run_cuda_only
|
||||||
def testBackwardInputGradient(self):
|
def testBackwardInputGradient(self):
|
||||||
np.random.seed(2)
|
np.random.seed(2)
|
||||||
in_shape = LayerShapeNHWC(batch=8, height=32, width=32, channels=8)
|
in_shape = LayerShape(batch=8, height=32, width=32, channels=8)
|
||||||
filter_shape = FilterShape2D(
|
filter_shape = FilterShape(
|
||||||
height=7, width=7, in_channels=8, out_channels=128)
|
height=7, width=7, in_channels=8, out_channels=128)
|
||||||
filter_op = self._random_data_op(filter_shape)
|
filter_op = self._random_data_op(filter_shape)
|
||||||
strides = [1, 1, 1, 1]
|
out_op = self._random_out_op(in_shape, filter_shape)
|
||||||
padding = 'SAME'
|
|
||||||
out_op = self._random_out_op(in_shape, filter_shape, strides, padding)
|
|
||||||
input_gradient_op = nn_ops.conv2d_backprop_input(
|
input_gradient_op = nn_ops.conv2d_backprop_input(
|
||||||
in_shape, filter_op, out_op, strides=strides, padding=padding)
|
in_shape, filter_op, out_op, strides=_STRIDES, padding=_PADDING)
|
||||||
self._assert_reproducible(input_gradient_op)
|
self._assert_reproducible(input_gradient_op)
|
||||||
|
|
||||||
|
# TODO(duncanriach): (1) add test to confirm that forward autotuning is
|
||||||
|
# disabled for cuDNN convolution; (2) add test for deterministic cuDNN
|
||||||
|
# max-pooling
|
||||||
|
@ -648,22 +648,9 @@ bool BatchnormSpatialPersistentEnabled() {
|
|||||||
return is_enabled;
|
return is_enabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The following function allows deterministic ops to be implemented relatively
|
// A helper function to decide whether to enable deterministic functionality.
|
||||||
// quickly using environment variables. It is intended to be temporary. The
|
bool RequireDeterminism() {
|
||||||
// longer-term intention is to enable deterministic ops via tf.config and
|
static bool require_determinism = [] {
|
||||||
// appropriate plumbing. See the discussion on PR 34951 for more information:
|
|
||||||
// https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
|
|
||||||
// This function and associated comment are replicated in the following three
|
|
||||||
// places:
|
|
||||||
// 1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
|
|
||||||
// 2. tensorflow/core/kernels/gpu_utils.cc
|
|
||||||
// 3. tensorflow/stream_executor/cuda/cuda_dnn.cc
|
|
||||||
// When implementing the plumbing, you should also search for the use of
|
|
||||||
// TF_DETERMINISTIC_OPS on its own.
|
|
||||||
// TODO(duncanriach): move to an API that uses tf.config and implement the first
|
|
||||||
// phase of plumbing.
|
|
||||||
bool RequireCudnnDeterminism() {
|
|
||||||
static bool require_cudnn_determinism = [] {
|
|
||||||
bool deterministic_ops = false;
|
bool deterministic_ops = false;
|
||||||
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
|
TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
|
||||||
/*default_val=*/false,
|
/*default_val=*/false,
|
||||||
@ -674,7 +661,7 @@ bool RequireCudnnDeterminism() {
|
|||||||
&cudnn_deterministic));
|
&cudnn_deterministic));
|
||||||
return deterministic_ops || cudnn_deterministic;
|
return deterministic_ops || cudnn_deterministic;
|
||||||
}();
|
}();
|
||||||
return require_cudnn_determinism;
|
return require_determinism;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::tuple<int, int> GetCcMajorMinor(Stream* stream) {
|
std::tuple<int, int> GetCcMajorMinor(Stream* stream) {
|
||||||
@ -775,7 +762,7 @@ class CudnnPoolingDescriptor {
|
|||||||
std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
|
std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
|
||||||
&CheckedNarrowing<int64, int>);
|
&CheckedNarrowing<int64, int>);
|
||||||
bool propagate_nans = pooling_descriptor.propagate_nans();
|
bool propagate_nans = pooling_descriptor.propagate_nans();
|
||||||
const auto cudnn_max_pooling_mode = RequireCudnnDeterminism()
|
const auto cudnn_max_pooling_mode = RequireDeterminism()
|
||||||
? CUDNN_POOLING_MAX_DETERMINISTIC
|
? CUDNN_POOLING_MAX_DETERMINISTIC
|
||||||
: CUDNN_POOLING_MAX;
|
: CUDNN_POOLING_MAX;
|
||||||
CHECK_CUDNN_OK(cudnnSetPoolingNdDescriptor(
|
CHECK_CUDNN_OK(cudnnSetPoolingNdDescriptor(
|
||||||
@ -3343,15 +3330,21 @@ bool CudnnSupport::GetConvolveAlgorithms(
|
|||||||
bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
|
bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
|
||||||
out_algorithms->clear();
|
out_algorithms->clear();
|
||||||
|
|
||||||
|
if (RequireDeterminism()) {
|
||||||
|
out_algorithms->push_back({CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
|
||||||
|
tensor_op_math_available});
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
|
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
|
||||||
// clang-format off
|
// clang-format off
|
||||||
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
|
|
||||||
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
|
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
|
||||||
|
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
|
||||||
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
|
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
|
||||||
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
|
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
|
||||||
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
|
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
|
||||||
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
|
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
|
||||||
// clang-format on
|
// clang-format on
|
||||||
};
|
};
|
||||||
if (CudnnEnvVar<FftTilingForward>::IsEnabled()) {
|
if (CudnnEnvVar<FftTilingForward>::IsEnabled()) {
|
||||||
algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING);
|
algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING);
|
||||||
@ -3360,12 +3353,11 @@ bool CudnnSupport::GetConvolveAlgorithms(
|
|||||||
algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED);
|
algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED);
|
||||||
}
|
}
|
||||||
|
|
||||||
// The algorithms are intentionally ordered for deterministic operation
|
|
||||||
for (auto i : algo_types) {
|
for (auto i : algo_types) {
|
||||||
|
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
||||||
if (tensor_op_math_available) {
|
if (tensor_op_math_available) {
|
||||||
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
|
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
|
||||||
}
|
}
|
||||||
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -3399,8 +3391,15 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
|
|||||||
bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
|
bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
|
||||||
out_algorithms->clear();
|
out_algorithms->clear();
|
||||||
|
|
||||||
|
if (RequireDeterminism()) {
|
||||||
|
out_algorithms->push_back(
|
||||||
|
{CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, tensor_op_math_available});
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
|
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
|
||||||
// clang-format off
|
// clang-format off
|
||||||
|
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
|
||||||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
|
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
|
||||||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT,
|
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT,
|
||||||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
|
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
|
||||||
@ -3410,16 +3409,12 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
|
|||||||
if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
|
if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
|
||||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED);
|
algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED);
|
||||||
}
|
}
|
||||||
if (!RequireCudnnDeterminism()) {
|
|
||||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// The algorithms are intentionally ordered for deterministic operation
|
|
||||||
for (auto i : algo_types) {
|
for (auto i : algo_types) {
|
||||||
|
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
||||||
if (tensor_op_math_available) {
|
if (tensor_op_math_available) {
|
||||||
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
|
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
|
||||||
}
|
}
|
||||||
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
@ -3431,10 +3426,18 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
|
|||||||
bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
|
bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
|
||||||
out_algorithms->clear();
|
out_algorithms->clear();
|
||||||
|
|
||||||
|
if (RequireDeterminism()) {
|
||||||
|
out_algorithms->push_back(
|
||||||
|
{CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, tensor_op_math_available});
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
|
std::vector<dnn::AlgorithmDesc::Index> algo_types = {
|
||||||
// clang-format off
|
// clang-format off
|
||||||
|
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
|
||||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
|
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
|
||||||
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
|
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
|
||||||
|
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
|
||||||
// Based on cudnn.h, the following is not implemented.
|
// Based on cudnn.h, the following is not implemented.
|
||||||
// CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
|
// CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD,
|
||||||
|
|
||||||
@ -3446,17 +3449,12 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
|
|||||||
if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
|
if (CudnnEnvVar<WinogradNonfused>::IsEnabled() && with_winograd_nonfused) {
|
||||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
|
algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
|
||||||
}
|
}
|
||||||
if (!RequireCudnnDeterminism()) {
|
|
||||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0);
|
|
||||||
algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3);
|
|
||||||
}
|
|
||||||
|
|
||||||
// The algorithms are intentionally ordered for deterministic operation
|
|
||||||
for (auto i : algo_types) {
|
for (auto i : algo_types) {
|
||||||
|
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
||||||
if (tensor_op_math_available) {
|
if (tensor_op_math_available) {
|
||||||
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
|
out_algorithms->push_back({i, /*use_tensor_ops=*/true});
|
||||||
}
|
}
|
||||||
out_algorithms->push_back({i, /*use_tensor_ops=*/false});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
Loading…
Reference in New Issue
Block a user