diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 637098884a5..5ad2489076e 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -1030,7 +1030,8 @@ void LaunchConv2DOp::operator()(
     // TODO(zhengxq): profile each algorithm multiple times to better
     // accuracy.
     se::cuda::RedzoneAllocator rz_scratch_allocator(
-        stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions());
+        stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions(),
+        /*memory_limit=*/ConvolveScratchSize);
     DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
     se::ScratchAllocator* allocator_used =
         !RedzoneCheckDisabled()
diff --git a/tensorflow/stream_executor/cuda/redzone_allocator.cc b/tensorflow/stream_executor/cuda/redzone_allocator.cc
index cebf5852403..afd4f57024d 100644
--- a/tensorflow/stream_executor/cuda/redzone_allocator.cc
+++ b/tensorflow/stream_executor/cuda/redzone_allocator.cc
@@ -46,13 +46,14 @@ using RedzoneCheckStatus = RedzoneAllocator::RedzoneCheckStatus;
 
 RedzoneAllocator::RedzoneAllocator(
     Stream* stream, DeviceMemoryAllocator* memory_allocator,
-    cuda::PtxCompilationOptions ptx_compilation_opts, uint64 redzone_size,
-    uint8 redzone_pattern)
+    cuda::PtxCompilationOptions ptx_compilation_opts, int64 memory_limit,
+    int64 redzone_size, uint8 redzone_pattern)
     : device_ordinal_(stream->parent()->device_ordinal()),
       stream_(stream),
+      memory_limit_(memory_limit),
       redzone_size_(RoundUpToNearest(
           redzone_size,
-          static_cast<uint64>(tensorflow::Allocator::kAllocatorAlignment))),
+          static_cast<int64>(tensorflow::Allocator::kAllocatorAlignment))),
       redzone_pattern_(redzone_pattern),
       memory_allocator_(memory_allocator),
       ptx_compilation_opts_(ptx_compilation_opts) {}
diff --git a/tensorflow/stream_executor/cuda/redzone_allocator.h b/tensorflow/stream_executor/cuda/redzone_allocator.h
index c78b54e0c5f..d09a5c0903b 100644
--- a/tensorflow/stream_executor/cuda/redzone_allocator.h
+++ b/tensorflow/stream_executor/cuda/redzone_allocator.h
@@ -39,15 +39,19 @@ namespace cuda {
 // memory for cudnn convolutions.
 class RedzoneAllocator : public ScratchAllocator {
  public:
+  static const int64 kDefaultMemoryLimit = 1LL << 32;  // 4GiB
+  static const int64 kDefaultRedzoneSize =
+      1LL << 23;  // 8MiB per side, 16MiB total.
+  static const uint8 kDefaultRedzonePattern = -1;
   RedzoneAllocator(Stream* stream, DeviceMemoryAllocator* memory_allocator,
                    cuda::PtxCompilationOptions ptx_compilation_opts,
-                   uint64 redzone_size = 1 << 23,  // 8MiB per side, 16MiB total
-                   uint8 redzone_pattern = -1);
+                   int64 memory_limit = kDefaultMemoryLimit,
+                   int64 redzone_size = kDefaultRedzoneSize,
+                   uint8 redzone_pattern = kDefaultRedzonePattern);
 
   // Redzones don't count towards the memory limit.
-  int64 GetMemoryLimitInBytes() override {
-    return 1LL << 32;  // 4GB. TODO(jlebar): Tune this?
-  }
+  int64 GetMemoryLimitInBytes() override { return memory_limit_; }
+
   int64 TotalAllocatedBytesExcludingRedzones() const {
     return allocated_bytes_excluding_redzones_;
   }
@@ -97,7 +101,10 @@ class RedzoneAllocator : public ScratchAllocator {
   const int device_ordinal_;
   Stream* stream_;
 
-  // Redzone size on *one side* of allocation.
+  // Memory limit of the allocator in bytes.
+  const int64 memory_limit_;
+
+  // Redzone size on *one side* of allocation in bytes.
   //
   // Must be a multiple of kXlaAllocatedBufferAlignBytes, otherwise the buffers
   // returned to users will be misaligned.
diff --git a/tensorflow/stream_executor/cuda/redzone_allocator_test.cc b/tensorflow/stream_executor/cuda/redzone_allocator_test.cc
index 9f6d1bd6046..97aa2c9e301 100644
--- a/tensorflow/stream_executor/cuda/redzone_allocator_test.cc
+++ b/tensorflow/stream_executor/cuda/redzone_allocator_test.cc
@@ -58,8 +58,11 @@ TEST(RedzoneAllocatorTest, WriteToRedzone) {
   Stream stream(stream_exec);
   stream.Init();
 
-  RedzoneAllocator allocator(&stream, &se_allocator, opts, kRedzoneSize,
-                             kRedzonePattern);
+  RedzoneAllocator allocator(
+      &stream, &se_allocator, opts,
+      /*memory_limit=*/RedzoneAllocator::kDefaultMemoryLimit,
+      /*redzone_size=*/kRedzoneSize,
+      /*redzone_pattern=*/kRedzonePattern);
   TF_ASSERT_OK_AND_ASSIGN(DeviceMemory<uint8> buf,
                           allocator.AllocateBytes(/*byte_size=*/kAllocSize));
   EXPECT_REDZONE_OK(allocator.CheckRedzones());
@@ -129,8 +132,11 @@ TEST(RedzoneAllocatorTest, VeryLargeRedzone) {
   StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
   Stream stream(stream_exec);
   stream.Init();
-  RedzoneAllocator allocator(&stream, &se_allocator, opts, kRedzoneSize,
-                             /*redzone_pattern=*/-1);
+  RedzoneAllocator allocator(
+      &stream, &se_allocator, opts,
+      /*memory_limit=*/RedzoneAllocator::kDefaultMemoryLimit,
+      /*redzone_size=*/kRedzoneSize,
+      /*redzone_pattern=*/-1);
   (void)allocator.AllocateBytes(/*byte_size=*/1);
   EXPECT_REDZONE_OK(allocator.CheckRedzones());
 }
diff --git a/tensorflow/stream_executor/device_memory_allocator.h b/tensorflow/stream_executor/device_memory_allocator.h
index c9213cfe390..35b6b605a4e 100644
--- a/tensorflow/stream_executor/device_memory_allocator.h
+++ b/tensorflow/stream_executor/device_memory_allocator.h
@@ -194,7 +194,7 @@ class DeviceMemoryAllocator {
 
   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
-  virtual bool AllowsAsynchronousDeallocation() const = 0;
+  virtual bool AllowsAsynchronousDeallocation() const { return false; }
 
  protected:
   const Platform* platform_;