From bb2a1d3ee8d15ca950864ef4abd556fc8de1f456 Mon Sep 17 00:00:00 2001
From: George Karpenkov
Date: Wed, 26 Jun 2019 17:12:27 -0700
Subject: [PATCH] Provide a flag TF_DISABLE_RZ_CHECK to disable redzone check
 during TF convolution autotuning

The flag has to be set to "1" for the redzone checking to be disabled.

PiperOrigin-RevId: 255295979
---
 tensorflow/core/kernels/conv_ops.cc | 42 ++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 0ea367d4250..56162cae1d7 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -73,6 +73,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
 namespace {
+
 template
 struct LaunchGeneric {
   void operator()(OpKernelContext* ctx, const Tensor& input,
@@ -575,6 +576,11 @@ template struct LaunchConv2DOp;
 template struct LaunchConv2DOp;
 
 #if GOOGLE_CUDA
+static bool RedzoneCheckDisabled() {
+  const char* disable_rz_str = std::getenv("TF_DISABLE_RZ_CHECK");
+  return disable_rz_str != nullptr && std::strcmp(disable_rz_str, "1") == 0;
+}
+
 int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
                            int64 default_value_in_bytes) {
   const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str());
@@ -997,19 +1003,24 @@ void LaunchConv2DOp::operator()(
         se::cuda::PtxCompilationOptions());
 
     se::DeviceMemory output_tensor;
-    auto output_rz_or = rz_allocator.AllocateBytes(stream, output_ptr.size());
-    if (!output_rz_or.ok()) {
-      static std::once_flag rz_allocation_failure_logged;
-      std::call_once(rz_allocation_failure_logged, []() {
-        LOG(WARNING)
-            << "Failed to allocate memory for convolution redzone "
-            << "checking; skipping this check. This is benign and only "
-            << "means that we won't check cudnn for out-of-bounds reads "
-            << "and writes. This message will only be printed once.";
-      });
-      output_tensor = output_ptr;
+
+    if (!RedzoneCheckDisabled()) {
+      auto output_rz_or = rz_allocator.AllocateBytes(stream, output_ptr.size());
+      if (!output_rz_or.ok()) {
+        static std::once_flag rz_allocation_failure_logged;
+        std::call_once(rz_allocation_failure_logged, []() {
+          LOG(WARNING)
+              << "Failed to allocate memory for convolution redzone "
+              << "checking; skipping this check. This is benign and only "
+              << "means that we won't check cudnn for out-of-bounds reads "
+              << "and writes. This message will only be printed once.";
+        });
+        output_tensor = output_ptr;
+      } else {
+        output_tensor = se::DeviceMemory(output_rz_or.ValueOrDie());
+      }
     } else {
-      output_tensor = se::DeviceMemory(output_rz_or.ValueOrDie());
+      output_tensor = output_ptr;
     }
 
     std::vector results;
@@ -1019,13 +1030,18 @@ void LaunchConv2DOp::operator()(
       se::cuda::RedzoneAllocator rz_scratch_allocator(
           stream->parent()->device_ordinal(), &tf_allocator_adapter,
           se::cuda::PtxCompilationOptions());
+      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+      se::ScratchAllocator* allocator_used =
+          !RedzoneCheckDisabled()
+              ? static_cast<se::ScratchAllocator*>(&rz_scratch_allocator)
+              : static_cast<se::ScratchAllocator*>(&scratch_allocator);
       ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
               ->ThenConvolveWithAlgorithm(
                   input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
-                  output_desc, &output_tensor, &rz_scratch_allocator,
+                  output_desc, &output_tensor, allocator_used,
                   AlgorithmConfig(profile_algorithm), &profile_result)
               .ok();
       if (cudnn_launch_status && profile_result.is_valid()) {