Move the reusable class CudnnAllocatorInTemp into a separate file
parent
1ab863f591
commit
46aa1ca220
@@ -367,6 +367,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "cudnn_scratch_allocator",
+    srcs = ["util/cudnn_scratch_allocator.cc"],
+    hdrs = ["util/cudnn_scratch_allocator.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/stream_executor:scratch_allocator",
+    ],
+)
+
 filegroup(
     name = "util_port_hdrs",
     srcs = [
@@ -2885,6 +2895,7 @@ tf_cuda_library(
         "util/version_info.cc",
         "util/env_var.cc",
         "util/port.cc",
+        "util/cudnn_scratch_allocator.cc",
     ],
 ) + select({
     "//tensorflow:windows": [],
@@ -2298,6 +2298,7 @@ tf_kernel_library(
         "//tensorflow/core/util/ctc:ctc_loss_calculator_lib",
     ] + if_cuda([
         "//tensorflow/core:stream_executor",
+        "//tensorflow/core:cudnn_scratch_allocator",
     ]),
 )
 
@@ -32,6 +32,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/util/stream_executor_util.h"
+#include "tensorflow/core/util/cudnn_scratch_allocator.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -41,14 +42,11 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 using GPUDevice = Eigen::GpuDevice;
 
 namespace {
-using se::DeviceMemory;
 using se::Stream;
 using se::StreamExecutor;
-using se::ScratchAllocator;
 using se::dnn::CtcLossDescriptor;
 using se::dnn::RnnStateTensorDescriptor;
 using se::dnn::ToDataType;
-using se::port::StatusOr;
 
 template<typename T>
 void DoHistogram(OpKernelContext* ctx, const Tensor* labels_indices,
@@ -56,56 +54,11 @@ void DoHistogram(OpKernelContext* ctx, const Tensor* labels_indices,
                  std::vector<int> *labels_lengths) {
   const T* h_in = labels_indices->flat<T>().data();
   for(int i = 0; i < num_indices; i++) {
-    T key = h_in[i * 2];
+    const T& key = h_in[i * 2];
     (*labels_lengths)[key]++;
   }
 }
 
-// A helper to allocate temporary scratch memory for cudnnCTCLoss ops. It
-// takes the ownership of the underlying memory. The expectation is that the
-// memory should be alive for the span of the cudnnCTCLoss itself.
-template <typename T>
-class CudnnCtcLossAllocatorInTemp : public ScratchAllocator {
- public:
-  ~CudnnCtcLossAllocatorInTemp() override = default;
-
-  explicit CudnnCtcLossAllocatorInTemp(OpKernelContext* context)
-      : context_(context) {}
-
-  int64 GetMemoryLimitInBytes() override {
-    return std::numeric_limits<int64>::max();
-  }
-
-  StatusOr<DeviceMemory<uint8>> AllocateBytes(int64 byte_size) override {
-    Tensor temporary_memory;
-    const DataType tf_data_type = DataTypeToEnum<T>::v();
-    int64 allocate_count =
-        Eigen::divup(byte_size, static_cast<int64>(sizeof(T)));
-    Status allocation_status(context_->allocate_temp(
-        tf_data_type, TensorShape({allocate_count}), &temporary_memory));
-    if (!allocation_status.ok()) {
-      return allocation_status;
-    }
-    // Hold the reference of the allocated tensors until the end of the
-    // allocator.
-    allocated_tensors_.push_back(temporary_memory);
-    total_byte_size_ += byte_size;
-    return DeviceMemory<uint8>::MakeFromByteSize(
-        temporary_memory.template flat<T>().data(),
-        temporary_memory.template flat<T>().size() * sizeof(T));
-  }
-
-  int64 TotalByteSize() const { return total_byte_size_; }
-
-  Tensor get_allocated_tensor(int index) const {
-    return allocated_tensors_[index];
-  }
-
- private:
-  int64 total_byte_size_ = 0;
-  OpKernelContext* context_;  // not owned
-  std::vector<Tensor> allocated_tensors_;
-};
 }  // end namespace
 #endif  // GOOGLE_CUDA
 
@@ -389,7 +342,7 @@ class CTCLossOpGPU : public OpKernel {
     auto costs_data = StreamExecutorUtil::AsDeviceMemory<float>(*loss);
     auto grads_data = StreamExecutorUtil::AsDeviceMemory<float>(*gradient);
 
-    CudnnCtcLossAllocatorInTemp<uint8> workspace_allocator(ctx);
+    CudnnAllocatorInTemp workspace_allocator(ctx);
 
     Stream* stream = ctx->op_device_context()->stream();
     bool cudnn_launch_status =
tensorflow/core/util/cudnn_scratch_allocator.cc (new file, 57 lines)
@@ -0,0 +1,57 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/cudnn_scratch_allocator.h"
+
+namespace tensorflow {
+
+CudnnAllocatorInTemp::~CudnnAllocatorInTemp() {}
+
+CudnnAllocatorInTemp::CudnnAllocatorInTemp(OpKernelContext* context)
+    : context_(context) {}
+
+int64 CudnnAllocatorInTemp::GetMemoryLimitInBytes() {
+  return std::numeric_limits<int64>::max();
+}
+
+StatusOr<DeviceMemory<uint8>> CudnnAllocatorInTemp::AllocateBytes(
+    int64 byte_size) {
+  Tensor temporary_memory;
+  const DataType tf_data_type = DataTypeToEnum<uint8>::v();
+  int64 allocate_count =
+      Eigen::divup(byte_size, static_cast<int64>(sizeof(uint8)));
+  Status allocation_status(context_->allocate_temp(
+      tf_data_type, TensorShape({allocate_count}), &temporary_memory));
+  if (!allocation_status.ok()) {
+    return allocation_status;
+  }
+  // Hold the reference of the allocated tensors until the end of the
+  // allocator.
+  allocated_tensors_.push_back(temporary_memory);
+  total_byte_size_ += byte_size;
+  return DeviceMemory<uint8>::MakeFromByteSize(
+      temporary_memory.template flat<uint8>().data(),
+      temporary_memory.template flat<uint8>().size() * sizeof(uint8));
+}
+
+int64 CudnnAllocatorInTemp::TotalByteSize() const {
+  return total_byte_size_;
+}
+
+Tensor CudnnAllocatorInTemp::get_allocated_tensor(int index) const {
+  return allocated_tensors_[index];
+}
+
+}  // namespace tensorflow
tensorflow/core/util/cudnn_scratch_allocator.h (new file, 50 lines)
@@ -0,0 +1,50 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_CUDNN_SCRATCH_ALLOCATOR_H_
+#define TENSORFLOW_CORE_UTIL_CUDNN_SCRATCH_ALLOCATOR_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+
+namespace tensorflow {
+
+using stream_executor::ScratchAllocator;
+using stream_executor::port::StatusOr;
+using stream_executor::DeviceMemory;
+
+// A helper to allocate temporary scratch memory for cuDNN ops. It takes
+// ownership of the underlying memory. The expectation is that the memory
+// stays alive for the span of the cuDNN call itself.
+class CudnnAllocatorInTemp : public ScratchAllocator {
+ public:
+  explicit CudnnAllocatorInTemp(OpKernelContext* context);
+  ~CudnnAllocatorInTemp() override;
+  int64 GetMemoryLimitInBytes() override;
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(int64 byte_size) override;
+  int64 TotalByteSize() const;
+  Tensor get_allocated_tensor(int index) const;
+
+ private:
+  int64 total_byte_size_ = 0;
+  OpKernelContext* context_;  // not owned
+  std::vector<Tensor> allocated_tensors_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnAllocatorInTemp);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_CUDNN_SCRATCH_ALLOCATOR_H_
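For readers skimming this change, here is a hypothetical usage sketch, not part of the commit: it assumes a CUDA build and an OpKernelContext* obtained inside an OpKernel's Compute() call, the RunWithCudnnScratch helper name is invented, and the comment marking the cuDNN launch stands in for whichever stream_executor call actually consumes the scratch memory. Only the CudnnAllocatorInTemp interface shown in the header above is taken from the diff; the StatusOr accessors follow the stream_executor conventions of this era.

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/cudnn_scratch_allocator.h"

namespace tensorflow {

// Hypothetical helper, for illustration only: requests cuDNN workspace whose
// lifetime is tied to the allocator, i.e. to this function invocation.
void RunWithCudnnScratch(OpKernelContext* ctx) {
  // The allocator keeps a reference to every Tensor it hands out, so it must
  // outlive the cuDNN launch that uses the scratch memory.
  CudnnAllocatorInTemp scratch_allocator(ctx);

  StatusOr<DeviceMemory<uint8>> scratch_or =
      scratch_allocator.AllocateBytes(/*byte_size=*/4096);
  OP_REQUIRES(ctx, scratch_or.ok(),
              errors::Internal("failed to allocate cuDNN workspace"));
  DeviceMemory<uint8> scratch = scratch_or.ValueOrDie();

  // ... hand `scratch` (or `&scratch_allocator` itself) to the cuDNN call ...

  // Bookkeeping exposed by the class, valid while the allocator is alive.
  DCHECK_EQ(scratch_allocator.TotalByteSize(), 4096);
  Tensor backing = scratch_allocator.get_allocated_tensor(0);
  (void)backing;  // Same device memory as `scratch`, viewed as a Tensor.
}

}  // namespace tensorflow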