From 1bf8f49335661fa330877db7badfa6ed264a1bb7 Mon Sep 17 00:00:00 2001
From: Tim Shen <timshen@google.com>
Date: Tue, 30 Jun 2020 19:04:16 -0700
Subject: [PATCH] [StreamExecutor] Workaround the cuFFT bug in CUDA
 10.1/10.2/11.

See https://github.com/google/jax/issues/2874 for details.

PiperOrigin-RevId: 319143928
Change-Id: I8c4759e90d6e9f6e134e5f2a241cb946d7db99b3
---
 tensorflow/stream_executor/cuda/cuda_fft.cc | 36 +++++++++++++++++++--
 tensorflow/stream_executor/cuda/cuda_fft.h  |  6 +++-
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index 79047d989bb..aab263280b5 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
 namespace stream_executor {
@@ -82,6 +83,7 @@ port::Status CUDAFftPlan::Initialize(
     LOG(FATAL) << "Try to repeatedly initialize.";
   }
   is_initialized_ = true;
+  scratch_allocator_ = scratch_allocator;
   cuda::ScopedActivateExecutorContext sac(parent);
   int elem_count_[3], input_embed_[3], output_embed_[3];
   for (int i = 0; i < rank; ++i) {
@@ -243,6 +245,8 @@ port::Status CUDAFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
 
 port::Status CUDAFftPlan::UpdateScratchAllocator(
     Stream *stream, ScratchAllocator *scratch_allocator) {
+  scratch_allocator_ = scratch_allocator;
+
   if (scratch_size_bytes_ != 0) {
     auto allocated = scratch_allocator->AllocateBytes(scratch_size_bytes_);
     if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
@@ -455,6 +459,9 @@ bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
                             const DeviceMemory<InputT> &input,
                             DeviceMemory<OutputT> *output) {
   CUDAFftPlan *cuda_fft_plan = dynamic_cast<CUDAFftPlan *>(plan);
+
+  DeviceMemory<InputT> input_maybe_copy = input;
+
   if (cuda_fft_plan == nullptr) {
     LOG(ERROR) << "the passed-in plan is not a CUDAFftPlan object.";
     return false;
@@ -464,10 +471,33 @@ bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
     return false;
   }
 
+  // Workaround a cuFFT bug, which mutates the input buffer when it shouldn't.
+  // See b/155276727 and go/nvbugs/2959622.
+  // TODO(b/155276727): refine the bounding condition.
+  if (input.opaque() != output->opaque() && CUDA_VERSION >= 10010 &&
+      CUDA_VERSION <= 11000 &&
+      std::is_same<InputT, std::complex<double>>::value &&
+      std::is_same<OutputT, std::complex<double>>::value && input.size() > 0) {
+    auto *allocator = cuda_fft_plan->GetScratchAllocator();
+    if (allocator) {
+      auto allocated = allocator->AllocateBytes(input.size());
+      if (allocated.ok()) {
+        if (stream->ThenMemcpy(&allocated.ValueOrDie(), input, input.size())
+                .ok()) {
+          input_maybe_copy = DeviceMemory<InputT>(allocated.ValueOrDie());
+        }
+      }
+      // Keep going even the workaround fails, since we don't have a good
+      // bounding box. We don't want to give up on a potentially correct
+      // execution just because the allocation for the incorrect case fails.
+    }
+  }
+
   cuda::ScopedActivateExecutorContext sac(parent_);
-  auto ret = cufftExec(cuda_fft_plan->GetPlan(),
-                       GpuComplex(const_cast<InputT *>(GpuMemory(input))),
-                       GpuComplex(GpuMemoryMutable(output)));
+  auto ret =
+      cufftExec(cuda_fft_plan->GetPlan(),
+                GpuComplex(const_cast<InputT *>(GpuMemory(input_maybe_copy))),
+                GpuComplex(GpuMemoryMutable(output)));
 
   if (ret != CUFFT_SUCCESS) {
     LOG(ERROR) << "failed to run cuFFT routine: " << ret;
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
index e7b0d66cb13..9cb3fd94245 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.h
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -50,7 +50,8 @@ class CUDAFftPlan : public fft::Plan {
         fft_type_(fft::Type::kInvalid),
         scratch_(nullptr),
         scratch_size_bytes_(0),
-        is_initialized_(false) {}
+        is_initialized_(false),
+        scratch_allocator_(nullptr) {}
   ~CUDAFftPlan() override;
 
   // Get FFT direction in cuFFT based on FFT type.
@@ -79,6 +80,8 @@ class CUDAFftPlan : public fft::Plan {
   port::Status UpdateScratchAllocator(Stream *stream,
                                       ScratchAllocator *scratch_allocator);
 
+  ScratchAllocator* GetScratchAllocator() const { return scratch_allocator_; }
+
  protected:
   bool IsInitialized() const { return is_initialized_; }
 
@@ -89,6 +92,7 @@ class CUDAFftPlan : public fft::Plan {
   DeviceMemory<uint8> scratch_;
   size_t scratch_size_bytes_;
   bool is_initialized_;
+  ScratchAllocator* scratch_allocator_;
 };
 
 // FFT support for CUDA platform via cuFFT library.