[SE] Accept memory limit as an argument for redzone allocator. Correctly pass it through in convolution kernel.

PiperOrigin-RevId: 261808345
This commit is contained in:
George Karpenkov 2019-08-05 17:39:16 -07:00 committed by TensorFlower Gardener
parent 3c98b456af
commit 3c8582bf36
5 changed files with 30 additions and 15 deletions

View File

@ -1030,7 +1030,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
// TODO(zhengxq): profile each algorithm multiple times for better
// accuracy.
se::cuda::RedzoneAllocator rz_scratch_allocator(
stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions());
stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions(),
/*memory_limit=*/ConvolveScratchSize);
DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
se::ScratchAllocator* allocator_used =
!RedzoneCheckDisabled()

View File

@ -46,13 +46,14 @@ using RedzoneCheckStatus = RedzoneAllocator::RedzoneCheckStatus;
RedzoneAllocator::RedzoneAllocator(
Stream* stream, DeviceMemoryAllocator* memory_allocator,
cuda::PtxCompilationOptions ptx_compilation_opts, uint64 redzone_size,
uint8 redzone_pattern)
cuda::PtxCompilationOptions ptx_compilation_opts, int64 memory_limit,
int64 redzone_size, uint8 redzone_pattern)
: device_ordinal_(stream->parent()->device_ordinal()),
stream_(stream),
memory_limit_(memory_limit),
redzone_size_(RoundUpToNearest(
redzone_size,
static_cast<uint64>(tensorflow::Allocator::kAllocatorAlignment))),
static_cast<int64>(tensorflow::Allocator::kAllocatorAlignment))),
redzone_pattern_(redzone_pattern),
memory_allocator_(memory_allocator),
ptx_compilation_opts_(ptx_compilation_opts) {}

View File

@ -39,15 +39,19 @@ namespace cuda {
// memory for cudnn convolutions.
class RedzoneAllocator : public ScratchAllocator {
public:
static const int64 kDefaultMemoryLimit = 1LL << 32; // 4GB
static const int64 kDefaultRedzoneSize =
1LL << 23; // 8MiB per side, 16MiB total.
static const uint8 kDefaultRedzonePattern = -1;
RedzoneAllocator(Stream* stream, DeviceMemoryAllocator* memory_allocator,
cuda::PtxCompilationOptions ptx_compilation_opts,
uint64 redzone_size = 1 << 23, // 8MiB per side, 16MiB total
uint8 redzone_pattern = -1);
int64 memory_limit = kDefaultMemoryLimit,
int64 redzone_size = kDefaultRedzoneSize,
uint8 redzone_pattern = kDefaultRedzonePattern);
// Redzones don't count towards the memory limit.
int64 GetMemoryLimitInBytes() override {
return 1LL << 32; // 4GB. TODO(jlebar): Tune this?
}
int64 GetMemoryLimitInBytes() override { return memory_limit_; }
// Read-only accessor for allocated_bytes_excluding_redzones_ (judging by the
// member's name, the bytes allocated so far, not counting redzones).
int64 TotalAllocatedBytesExcludingRedzones() const {
return allocated_bytes_excluding_redzones_;
}
@ -97,7 +101,10 @@ class RedzoneAllocator : public ScratchAllocator {
const int device_ordinal_;
Stream* stream_;
// Redzone size on *one side* of allocation.
// Memory limit of the allocator in bytes.
const int64 memory_limit_;
// Redzone size on *one side* of allocation in bytes.
//
// Must be a multiple of kXlaAllocatedBufferAlignBytes, otherwise the buffers
// returned to users will be misaligned.

View File

@ -58,8 +58,11 @@ TEST(RedzoneAllocatorTest, WriteToRedzone) {
Stream stream(stream_exec);
stream.Init();
RedzoneAllocator allocator(&stream, &se_allocator, opts, kRedzoneSize,
kRedzonePattern);
RedzoneAllocator allocator(
&stream, &se_allocator, opts,
/*memory_limit=*/RedzoneAllocator::kDefaultMemoryLimit,
/*redzone_size=*/kRedzoneSize,
/*redzone_pattern=*/kRedzonePattern);
TF_ASSERT_OK_AND_ASSIGN(DeviceMemory<uint8> buf,
allocator.AllocateBytes(/*byte_size=*/kAllocSize));
EXPECT_REDZONE_OK(allocator.CheckRedzones());
@ -129,8 +132,11 @@ TEST(RedzoneAllocatorTest, VeryLargeRedzone) {
StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
Stream stream(stream_exec);
stream.Init();
RedzoneAllocator allocator(&stream, &se_allocator, opts, kRedzoneSize,
/*redzone_pattern=*/-1);
RedzoneAllocator allocator(
&stream, &se_allocator, opts,
/*memory_limit=*/RedzoneAllocator::kDefaultMemoryLimit,
/*redzone_size=*/kRedzoneSize,
/*redzone_pattern=*/-1);
(void)allocator.AllocateBytes(/*byte_size=*/1);
EXPECT_REDZONE_OK(allocator.CheckRedzones());
}

View File

@ -194,7 +194,7 @@ class DeviceMemoryAllocator {
// Can we call Deallocate() as soon as a computation has been scheduled on
// a stream, or do we have to wait for the computation to complete first?
virtual bool AllowsAsynchronousDeallocation() const = 0;
virtual bool AllowsAsynchronousDeallocation() const { return false; }
protected:
const Platform* platform_;