diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 5e8da1634d8..915e90fcdf4 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -367,6 +367,16 @@ cc_library( ], ) +cc_library( + name = "cudnn_scratch_allocator", + srcs = ["util/cudnn_scratch_allocator.cc"], + hdrs = ["util/cudnn_scratch_allocator.h"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/stream_executor:scratch_allocator", + ], +) + filegroup( name = "util_port_hdrs", srcs = [ @@ -2885,6 +2895,7 @@ tf_cuda_library( "util/version_info.cc", "util/env_var.cc", "util/port.cc", + "util/cudnn_scratch_allocator.cc", ], ) + select({ "//tensorflow:windows": [], diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 8c634df061a..896a8352f3f 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2298,6 +2298,7 @@ tf_kernel_library( "//tensorflow/core/util/ctc:ctc_loss_calculator_lib", ] + if_cuda([ "//tensorflow/core:stream_executor", + "//tensorflow/core:cudnn_scratch_allocator", ]), ) diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc index 2a2b32f5d28..f3d4f0cf12d 100644 --- a/tensorflow/core/kernels/ctc_loss_op.cc +++ b/tensorflow/core/kernels/ctc_loss_op.cc @@ -32,6 +32,7 @@ limitations under the License. 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/util/stream_executor_util.h"
+#include "tensorflow/core/util/cudnn_scratch_allocator.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -41,14 +42,11 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 using GPUDevice = Eigen::GpuDevice;
 
 namespace {
-using se::DeviceMemory;
 using se::Stream;
 using se::StreamExecutor;
-using se::ScratchAllocator;
 using se::dnn::CtcLossDescriptor;
 using se::dnn::RnnStateTensorDescriptor;
 using se::dnn::ToDataType;
-using se::port::StatusOr;
 
 template <typename T>
 void DoHistogram(OpKernelContext* ctx, const Tensor* labels_indices,
@@ -56,56 +54,11 @@ void DoHistogram(OpKernelContext* ctx, const Tensor* labels_indices,
                  std::vector<int> *labels_lengths) {
   const T* h_in = labels_indices->flat<T>().data();
   for(int i = 0; i < num_indices; i++) {
-    T key = h_in[i * 2];
+    const T& key = h_in[i * 2];
     (*labels_lengths)[key]++;
   }
 }
 
-// A helper to allocate temporary scratch memory for cudnnCTCLoss ops. It
-// takes the ownership of the underlying memory. The expectation is that the
-// memory should be alive for the span of the cudnnCTCLoss itself.
-template <typename T>
-class CudnnCtcLossAllocatorInTemp : public ScratchAllocator {
- public:
-  ~CudnnCtcLossAllocatorInTemp() override = default;
-
-  explicit CudnnCtcLossAllocatorInTemp(OpKernelContext* context)
-      : context_(context) {}
-
-  int64 GetMemoryLimitInBytes() override {
-    return std::numeric_limits<int64>::max();
-  }
-
-  StatusOr<DeviceMemory<T>> AllocateBytes(int64 byte_size) override {
-    Tensor temporary_memory;
-    const DataType tf_data_type = DataTypeToEnum<T>::v();
-    int64 allocate_count =
-        Eigen::divup(byte_size, static_cast<int64>(sizeof(T)));
-    Status allocation_status(context_->allocate_temp(
-        tf_data_type, TensorShape({allocate_count}), &temporary_memory));
-    if (!allocation_status.ok()) {
-      return allocation_status;
-    }
-    // Hold the reference of the allocated tensors until the end of the
-    // allocator.
-    allocated_tensors_.push_back(temporary_memory);
-    total_byte_size_ += byte_size;
-    return DeviceMemory<T>::MakeFromByteSize(
-        temporary_memory.template flat<T>().data(),
-        temporary_memory.template flat<T>().size() * sizeof(T));
-  }
-
-  int64 TotalByteSize() const { return total_byte_size_; }
-
-  Tensor get_allocated_tensor(int index) const {
-    return allocated_tensors_[index];
-  }
-
- private:
-  int64 total_byte_size_ = 0;
-  OpKernelContext* context_;  // not owned
-  std::vector<Tensor> allocated_tensors_;
-};
-
 } // end namespace
 
 #endif  // GOOGLE_CUDA
@@ -389,7 +342,7 @@ class CTCLossOpGPU : public OpKernel {
     auto costs_data = StreamExecutorUtil::AsDeviceMemory<float>(*loss);
     auto grads_data = StreamExecutorUtil::AsDeviceMemory<float>(*gradient);
 
-    CudnnCtcLossAllocatorInTemp<uint8> workspace_allocator(ctx);
+    CudnnAllocatorInTemp workspace_allocator(ctx);
 
     Stream* stream = ctx->op_device_context()->stream();
     bool cudnn_launch_status =
diff --git a/tensorflow/core/util/cudnn_scratch_allocator.cc b/tensorflow/core/util/cudnn_scratch_allocator.cc
new file mode 100644
index 00000000000..dae49972c3c
--- /dev/null
+++ b/tensorflow/core/util/cudnn_scratch_allocator.cc
@@ -0,0 +1,57 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/cudnn_scratch_allocator.h"
+
+namespace tensorflow {
+
+CudnnAllocatorInTemp::~CudnnAllocatorInTemp() {}
+
+CudnnAllocatorInTemp::CudnnAllocatorInTemp(OpKernelContext* context)
+    : context_(context) {}
+
+int64 CudnnAllocatorInTemp::GetMemoryLimitInBytes() {
+  return std::numeric_limits<int64>::max();
+}
+
+StatusOr<DeviceMemory<uint8>> CudnnAllocatorInTemp::AllocateBytes(
+    int64 byte_size) {
+  Tensor temporary_memory;
+  const DataType tf_data_type = DataTypeToEnum<uint8>::v();
+  int64 allocate_count =
+      Eigen::divup(byte_size, static_cast<int64>(sizeof(uint8)));
+  Status allocation_status(context_->allocate_temp(
+      tf_data_type, TensorShape({allocate_count}), &temporary_memory));
+  if (!allocation_status.ok()) {
+    return allocation_status;
+  }
+  // Hold the reference of the allocated tensors until the end of the
+  // allocator.
+  allocated_tensors_.push_back(temporary_memory);
+  total_byte_size_ += byte_size;
+  return DeviceMemory<uint8>::MakeFromByteSize(
+      temporary_memory.template flat<uint8>().data(),
+      temporary_memory.template flat<uint8>().size() * sizeof(uint8));
+}
+
+int64 CudnnAllocatorInTemp::TotalByteSize() const {
+  return total_byte_size_;
+}
+
+Tensor CudnnAllocatorInTemp::get_allocated_tensor(int index) const {
+  return allocated_tensors_[index];
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/util/cudnn_scratch_allocator.h b/tensorflow/core/util/cudnn_scratch_allocator.h
new file mode 100644
index 00000000000..770eafbbd8d
--- /dev/null
+++ b/tensorflow/core/util/cudnn_scratch_allocator.h
@@ -0,0 +1,50 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_CUDNN_SCRATCH_ALLOCATOR_H_
+#define TENSORFLOW_CORE_UTIL_CUDNN_SCRATCH_ALLOCATOR_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+
+namespace tensorflow {
+
+using stream_executor::ScratchAllocator;
+using stream_executor::port::StatusOr;
+using stream_executor::DeviceMemory;
+
+// A helper to allocate temporary scratch memory for CUDNN ops. It
+// takes the ownership of the underlying memory. The expectation is that the
+// memory should be alive for the span of the cudnnXXX itself.
+class CudnnAllocatorInTemp : public ScratchAllocator {
+ public:
+  explicit CudnnAllocatorInTemp(OpKernelContext* context);
+  ~CudnnAllocatorInTemp() override;
+  int64 GetMemoryLimitInBytes() override;
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(int64 byte_size) override;
+  int64 TotalByteSize() const;
+  Tensor get_allocated_tensor(int index) const;
+
+ private:
+  int64 total_byte_size_ = 0;
+  OpKernelContext* context_;  // not owned
+  std::vector<Tensor> allocated_tensors_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnAllocatorInTemp);
+};
+
+} // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_CUDNN_SCRATCH_ALLOCATOR_H_