[SE] Accept memory limit as an argument for redzone allocator. Correctly pass it through in convolution kernel.
PiperOrigin-RevId: 261808345
This commit is contained in:
parent
3c98b456af
commit
3c8582bf36
@ -1030,7 +1030,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
|
||||
// TODO(zhengxq): profile each algorithm multiple times for better
|
||||
// accuracy.
|
||||
se::cuda::RedzoneAllocator rz_scratch_allocator(
|
||||
stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions());
|
||||
stream, &tf_allocator_adapter, se::cuda::PtxCompilationOptions(),
|
||||
/*memory_limit=*/ConvolveScratchSize);
|
||||
DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
|
||||
se::ScratchAllocator* allocator_used =
|
||||
!RedzoneCheckDisabled()
|
||||
|
@ -46,13 +46,14 @@ using RedzoneCheckStatus = RedzoneAllocator::RedzoneCheckStatus;
|
||||
|
||||
RedzoneAllocator::RedzoneAllocator(
|
||||
Stream* stream, DeviceMemoryAllocator* memory_allocator,
|
||||
cuda::PtxCompilationOptions ptx_compilation_opts, uint64 redzone_size,
|
||||
uint8 redzone_pattern)
|
||||
cuda::PtxCompilationOptions ptx_compilation_opts, int64 memory_limit,
|
||||
int64 redzone_size, uint8 redzone_pattern)
|
||||
: device_ordinal_(stream->parent()->device_ordinal()),
|
||||
stream_(stream),
|
||||
memory_limit_(memory_limit),
|
||||
redzone_size_(RoundUpToNearest(
|
||||
redzone_size,
|
||||
static_cast<uint64>(tensorflow::Allocator::kAllocatorAlignment))),
|
||||
static_cast<int64>(tensorflow::Allocator::kAllocatorAlignment))),
|
||||
redzone_pattern_(redzone_pattern),
|
||||
memory_allocator_(memory_allocator),
|
||||
ptx_compilation_opts_(ptx_compilation_opts) {}
|
||||
|
@ -39,15 +39,19 @@ namespace cuda {
|
||||
// memory for cudnn convolutions.
|
||||
class RedzoneAllocator : public ScratchAllocator {
|
||||
public:
|
||||
static const int64 kDefaultMemoryLimit = 1LL << 32; // 4GB
|
||||
static const int64 kDefaultRedzoneSize =
|
||||
1LL << 23; // 8MiB per side, 16MiB total.
|
||||
static const uint8 kDefaultRedzonePattern = -1;
|
||||
RedzoneAllocator(Stream* stream, DeviceMemoryAllocator* memory_allocator,
|
||||
cuda::PtxCompilationOptions ptx_compilation_opts,
|
||||
uint64 redzone_size = 1 << 23, // 8MiB per side, 16MiB total
|
||||
uint8 redzone_pattern = -1);
|
||||
int64 memory_limit = kDefaultMemoryLimit,
|
||||
int64 redzone_size = kDefaultRedzoneSize,
|
||||
uint8 redzone_pattern = kDefaultRedzonePattern);
|
||||
|
||||
// Redzones don't count towards the memory limit.
|
||||
int64 GetMemoryLimitInBytes() override {
|
||||
return 1LL << 32; // 4GB. TODO(jlebar): Tune this?
|
||||
}
|
||||
int64 GetMemoryLimitInBytes() override { return memory_limit_; }
|
||||
|
||||
int64 TotalAllocatedBytesExcludingRedzones() const {
|
||||
return allocated_bytes_excluding_redzones_;
|
||||
}
|
||||
@ -97,7 +101,10 @@ class RedzoneAllocator : public ScratchAllocator {
|
||||
const int device_ordinal_;
|
||||
Stream* stream_;
|
||||
|
||||
// Redzone size on *one side* of allocation.
|
||||
// Memory limit of the allocator in bytes.
|
||||
const int64 memory_limit_;
|
||||
|
||||
// Redzone size on *one side* of allocation in bytes.
|
||||
//
|
||||
// Must be a multiple of kXlaAllocatedBufferAlignBytes, otherwise the buffers
|
||||
// returned to users will be misaligned.
|
||||
|
@ -58,8 +58,11 @@ TEST(RedzoneAllocatorTest, WriteToRedzone) {
|
||||
|
||||
Stream stream(stream_exec);
|
||||
stream.Init();
|
||||
RedzoneAllocator allocator(&stream, &se_allocator, opts, kRedzoneSize,
|
||||
kRedzonePattern);
|
||||
RedzoneAllocator allocator(
|
||||
&stream, &se_allocator, opts,
|
||||
/*memory_limit=*/RedzoneAllocator::kDefaultMemoryLimit,
|
||||
/*redzone_size=*/kRedzoneSize,
|
||||
/*redzone_pattern=*/kRedzonePattern);
|
||||
TF_ASSERT_OK_AND_ASSIGN(DeviceMemory<uint8> buf,
|
||||
allocator.AllocateBytes(/*byte_size=*/kAllocSize));
|
||||
EXPECT_REDZONE_OK(allocator.CheckRedzones());
|
||||
@ -129,8 +132,11 @@ TEST(RedzoneAllocatorTest, VeryLargeRedzone) {
|
||||
StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
|
||||
Stream stream(stream_exec);
|
||||
stream.Init();
|
||||
RedzoneAllocator allocator(&stream, &se_allocator, opts, kRedzoneSize,
|
||||
/*redzone_pattern=*/-1);
|
||||
RedzoneAllocator allocator(
|
||||
&stream, &se_allocator, opts,
|
||||
/*memory_limit=*/RedzoneAllocator::kDefaultMemoryLimit,
|
||||
/*redzone_size=*/kRedzoneSize,
|
||||
/*redzone_pattern=*/-1);
|
||||
(void)allocator.AllocateBytes(/*byte_size=*/1);
|
||||
EXPECT_REDZONE_OK(allocator.CheckRedzones());
|
||||
}
|
||||
|
@ -194,7 +194,7 @@ class DeviceMemoryAllocator {
|
||||
|
||||
// Can we call Deallocate() as soon as a computation has been scheduled on
|
||||
// a stream, or do we have to wait for the computation to complete first?
|
||||
virtual bool AllowsAsynchronousDeallocation() const = 0;
|
||||
virtual bool AllowsAsynchronousDeallocation() const { return false; }
|
||||
|
||||
protected:
|
||||
const Platform* platform_;
|
||||
|
Loading…
Reference in New Issue
Block a user