[SE] Accept memory limit as an argument for redzone allocator. Correctly pass it through in convolution kernel.

PiperOrigin-RevId: 261808345
2019-08-05 17:39:16 -07:00 · 2019-08-05 17:39:16 -07:00 · 3c8582bf36
commit 3c8582bf36
parent 3c98b456af
5 changed files with 30 additions and 15 deletions
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -1030,7 +1030,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
      // TODO(zhengxq): profile each algorithm multiple times to better
      // accuracy.
      se::cuda::RedzoneAllocator rz_scratch_allocator(
-          stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions());
+          stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions(),
+          /*memory_limit=*/ConvolveScratchSize);
      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
      se::ScratchAllocator* allocator_used =
          !RedzoneCheckDisabled()
--- a/tensorflow/stream_executor/cuda/redzone_allocator.cc
+++ b/tensorflow/stream_executor/cuda/redzone_allocator.cc
@@ -46,13 +46,14 @@ using RedzoneCheckStatus = RedzoneAllocator::RedzoneCheckStatus;

 RedzoneAllocator::RedzoneAllocator(
    Stream* stream, DeviceMemoryAllocator* memory_allocator,
-    cuda::PtxCompilationOptions ptx_compilation_opts, uint64 redzone_size,
-    uint8 redzone_pattern)
+    cuda::PtxCompilationOptions ptx_compilation_opts, int64 memory_limit,
+    int64 redzone_size, uint8 redzone_pattern)
    : device_ordinal_(stream->parent()->device_ordinal()),
      stream_(stream),
+      memory_limit_(memory_limit),
      redzone_size_(RoundUpToNearest(
          redzone_size,
-          static_cast<uint64>(tensorflow::Allocator::kAllocatorAlignment))),
+          static_cast<int64>(tensorflow::Allocator::kAllocatorAlignment))),
      redzone_pattern_(redzone_pattern),
      memory_allocator_(memory_allocator),
      ptx_compilation_opts_(ptx_compilation_opts) {}
--- a/tensorflow/stream_executor/cuda/redzone_allocator.h
+++ b/tensorflow/stream_executor/cuda/redzone_allocator.h
@@ -39,15 +39,19 @@ namespace cuda {
 // memory for cudnn convolutions.
 class RedzoneAllocator : public ScratchAllocator {
 public:
+  static const int64 kDefaultMemoryLimit = 1LL << 32;  // 4GB
+  static const int64 kDefaultRedzoneSize =
+      1LL << 23;  // 8MiB per side, 16MiB total.
+  static const uint8 kDefaultRedzonePattern = -1;
  RedzoneAllocator(Stream* stream, DeviceMemoryAllocator* memory_allocator,
                   cuda::PtxCompilationOptions ptx_compilation_opts,
-                   uint64 redzone_size = 1 << 23,  // 8MiB per side, 16MiB total
-                   uint8 redzone_pattern = -1);
+                   int64 memory_limit = kDefaultMemoryLimit,
+                   int64 redzone_size = kDefaultRedzoneSize,
+                   uint8 redzone_pattern = kDefaultRedzonePattern);

  // Redzones don't count towards the memory limit.
-  int64 GetMemoryLimitInBytes() override {
-    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
-  }
+  int64 GetMemoryLimitInBytes() override { return memory_limit_; }
+
  int64 TotalAllocatedBytesExcludingRedzones() const {
    return allocated_bytes_excluding_redzones_;
  }
@@ -97,7 +101,10 @@ class RedzoneAllocator : public ScratchAllocator {
  const int device_ordinal_;
  Stream* stream_;

-  // Redzone size on *one side* of allocation.
+  // Memory limit of the allocator in bytes.
+  const int64 memory_limit_;
+
+  // Redzone size on *one side* of allocation in bytes.
  //
  // Must be a multiple of kXlaAllocatedBufferAlignBytes, otherwise the buffers
  // returned to users will be misaligned.
--- a/tensorflow/stream_executor/cuda/redzone_allocator_test.cc
+++ b/tensorflow/stream_executor/cuda/redzone_allocator_test.cc
@@ -58,8 +58,11 @@ TEST(RedzoneAllocatorTest, WriteToRedzone) {

  Stream stream(stream_exec);
  stream.Init();
-  RedzoneAllocator allocator(&stream, &se_allocator, opts, kRedzoneSize,
-                             kRedzonePattern);
+  RedzoneAllocator allocator(
+      &stream, &se_allocator, opts,
+      /*memory_limit=*/RedzoneAllocator::kDefaultMemoryLimit,
+      /*redzone_size=*/kRedzoneSize,
+      /*redzone_pattern=*/kRedzonePattern);
  TF_ASSERT_OK_AND_ASSIGN(DeviceMemory<uint8> buf,
                          allocator.AllocateBytes(/*byte_size=*/kAllocSize));
  EXPECT_REDZONE_OK(allocator.CheckRedzones());
@@ -129,8 +132,11 @@ TEST(RedzoneAllocatorTest, VeryLargeRedzone) {
  StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
  Stream stream(stream_exec);
  stream.Init();
-  RedzoneAllocator allocator(&stream, &se_allocator, opts, kRedzoneSize,
-                             /*redzone_pattern=*/-1);
+  RedzoneAllocator allocator(
+      &stream, &se_allocator, opts,
+      /*memory_limit=*/RedzoneAllocator::kDefaultMemoryLimit,
+      /*redzone_size=*/kRedzoneSize,
+      /*redzone_pattern=*/-1);
  (void)allocator.AllocateBytes(/*byte_size=*/1);
  EXPECT_REDZONE_OK(allocator.CheckRedzones());
 }
--- a/tensorflow/stream_executor/device_memory_allocator.h
+++ b/tensorflow/stream_executor/device_memory_allocator.h
@@ -194,7 +194,7 @@ class DeviceMemoryAllocator {

  // Can we call Deallocate() as soon as a computation has been scheduled on
  // a stream, or do we have to wait for the computation to complete first?
-  virtual bool AllowsAsynchronousDeallocation() const = 0;
+  virtual bool AllowsAsynchronousDeallocation() const { return false; }

 protected:
  const Platform* platform_;