diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 5e8da1634d8..915e90fcdf4 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -367,6 +367,16 @@ cc_library( ], ) +cc_library( + name = "cudnn_scratch_allocator", + srcs = ["util/cudnn_scratch_allocator.cc"], + hdrs = ["util/cudnn_scratch_allocator.h"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/stream_executor:scratch_allocator", + ], +) + filegroup( name = "util_port_hdrs", srcs = [ @@ -2885,6 +2895,7 @@ tf_cuda_library( "util/version_info.cc", "util/env_var.cc", "util/port.cc", + "util/cudnn_scratch_allocator.cc", ], ) + select({ "//tensorflow:windows": [], diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 8c634df061a..896a8352f3f 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -2298,6 +2298,7 @@ tf_kernel_library( "//tensorflow/core/util/ctc:ctc_loss_calculator_lib", ] + if_cuda([ "//tensorflow/core:stream_executor", + "//tensorflow/core:cudnn_scratch_allocator", ]), ) diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc index 2a2b32f5d28..f3d4f0cf12d 100644 --- a/tensorflow/core/kernels/ctc_loss_op.cc +++ b/tensorflow/core/kernels/ctc_loss_op.cc @@ -32,6 +32,7 @@ limitations under the License. 
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/util/stream_executor_util.h"
+#include "tensorflow/core/util/cudnn_scratch_allocator.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
@@ -41,14 +42,11 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 using GPUDevice = Eigen::GpuDevice;
 
 namespace {
-using se::DeviceMemory;
 using se::Stream;
 using se::StreamExecutor;
-using se::ScratchAllocator;
 using se::dnn::CtcLossDescriptor;
 using se::dnn::RnnStateTensorDescriptor;
 using se::dnn::ToDataType;
-using se::port::StatusOr;
 
 template <typename T>
 void DoHistogram(OpKernelContext* ctx, const Tensor* labels_indices,
@@ -56,56 +54,11 @@ void DoHistogram(OpKernelContext* ctx, const Tensor* labels_indices,
                  std::vector<int> *labels_lengths) {
   const T* h_in = labels_indices->flat<T>().data();
   for(int i = 0; i < num_indices; i++) {
-    T key = h_in[i * 2];
+    const T& key = h_in[i * 2];
     (*labels_lengths)[key]++;
   }
 }
 
-// A helper to allocate temporary scratch memory for cudnnCTCLoss ops. It
-// takes the ownership of the underlying memory. The expectation is that the
-// memory should be alive for the span of the cudnnCTCLoss itself.
-template <typename T>
-class CudnnCtcLossAllocatorInTemp : public ScratchAllocator {
- public:
-  ~CudnnCtcLossAllocatorInTemp() override = default;
-
-  explicit CudnnCtcLossAllocatorInTemp(OpKernelContext* context)
-      : context_(context) {}
-
-  int64 GetMemoryLimitInBytes() override {
-    return std::numeric_limits<int64>::max();
-  }
-
-  StatusOr<DeviceMemory<T>> AllocateBytes(int64 byte_size) override {
-    Tensor temporary_memory;
-    const DataType tf_data_type = DataTypeToEnum<T>::v();
-    int64 allocate_count =
-        Eigen::divup(byte_size, static_cast<int64>(sizeof(T)));
-    Status allocation_status(context_->allocate_temp(
-        tf_data_type, TensorShape({allocate_count}), &temporary_memory));
-    if (!allocation_status.ok()) {
-      return allocation_status;
-    }
-    // Hold the reference of the allocated tensors until the end of the
-    // allocator.
-    allocated_tensors_.push_back(temporary_memory);
-    total_byte_size_ += byte_size;
-    return DeviceMemory<T>::MakeFromByteSize(
-        temporary_memory.template flat<T>().data(),
-        temporary_memory.template flat<T>().size() * sizeof(T));
-  }
-
-  int64 TotalByteSize() const { return total_byte_size_; }
-
-  Tensor get_allocated_tensor(int index) const {
-    return allocated_tensors_[index];
-  }
-
- private:
-  int64 total_byte_size_ = 0;
-  OpKernelContext* context_;  // not owned
-  std::vector<Tensor> allocated_tensors_;
-};
-
 } // end namespace
 
 #endif  // GOOGLE_CUDA
@@ -389,7 +342,7 @@ class CTCLossOpGPU : public OpKernel {
     auto costs_data = StreamExecutorUtil::AsDeviceMemory<float>(*loss);
     auto grads_data = StreamExecutorUtil::AsDeviceMemory<float>(*gradient);
 
-    CudnnCtcLossAllocatorInTemp<uint8> workspace_allocator(ctx);
+    CudnnAllocatorInTemp workspace_allocator(ctx);
 
     Stream* stream = ctx->op_device_context()->stream();
     bool cudnn_launch_status =
diff --git a/tensorflow/core/util/cudnn_scratch_allocator.cc b/tensorflow/core/util/cudnn_scratch_allocator.cc
new file mode 100644
index 00000000000..dae49972c3c
--- /dev/null
+++ b/tensorflow/core/util/cudnn_scratch_allocator.cc
@@ -0,0 +1,57 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/cudnn_scratch_allocator.h"
+
+namespace tensorflow {
+
+CudnnAllocatorInTemp::~CudnnAllocatorInTemp() {}
+
+CudnnAllocatorInTemp::CudnnAllocatorInTemp(OpKernelContext* context)
+    : context_(context) {}
+
+int64 CudnnAllocatorInTemp::GetMemoryLimitInBytes() {
+  return std::numeric_limits<int64>::max();
+}
+
+StatusOr<DeviceMemory<uint8>> CudnnAllocatorInTemp::AllocateBytes(
+    int64 byte_size) {
+  Tensor temporary_memory;
+  const DataType tf_data_type = DataTypeToEnum<uint8>::v();
+  int64 allocate_count =
+      Eigen::divup(byte_size, static_cast<int64>(sizeof(uint8)));
+  Status allocation_status(context_->allocate_temp(
+      tf_data_type, TensorShape({allocate_count}), &temporary_memory));
+  if (!allocation_status.ok()) {
+    return allocation_status;
+  }
+  // Hold the reference of the allocated tensors until the end of the
+  // allocator.
+  allocated_tensors_.push_back(temporary_memory);
+  total_byte_size_ += byte_size;
+  return DeviceMemory<uint8>::MakeFromByteSize(
+      temporary_memory.template flat<uint8>().data(),
+      temporary_memory.template flat<uint8>().size() * sizeof(uint8));
+}
+
+int64 CudnnAllocatorInTemp::TotalByteSize() const {
+  return total_byte_size_;
+}
+
+Tensor CudnnAllocatorInTemp::get_allocated_tensor(int index) const {
+  return allocated_tensors_[index];
+}
+
+} // namespace tensorflow
diff --git a/tensorflow/core/util/cudnn_scratch_allocator.h b/tensorflow/core/util/cudnn_scratch_allocator.h
new file mode 100644
index 00000000000..770eafbbd8d
--- /dev/null
+++ b/tensorflow/core/util/cudnn_scratch_allocator.h
@@ -0,0 +1,50 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_CUDNN_SCRATCH_ALLOCATOR_H_
+#define TENSORFLOW_CORE_UTIL_CUDNN_SCRATCH_ALLOCATOR_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+
+namespace tensorflow {
+
+using stream_executor::ScratchAllocator;
+using stream_executor::port::StatusOr;
+using stream_executor::DeviceMemory;
+
+// A helper to allocate temporary scratch memory for CUDNN ops. It
+// takes the ownership of the underlying memory. The expectation is that the
+// memory should be alive for the span of the cudnnXXX itself.
+class CudnnAllocatorInTemp : public ScratchAllocator {
+ public:
+  explicit CudnnAllocatorInTemp(OpKernelContext* context);
+  ~CudnnAllocatorInTemp() override;
+  int64 GetMemoryLimitInBytes() override;
+  StatusOr<DeviceMemory<uint8>> AllocateBytes(int64 byte_size) override;
+  int64 TotalByteSize() const;
+  Tensor get_allocated_tensor(int index) const;
+
+ private:
+  int64 total_byte_size_ = 0;
+  OpKernelContext* context_;  // not owned
+  std::vector<Tensor> allocated_tensors_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnAllocatorInTemp);
+};
+
+} // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_CUDNN_SCRATCH_ALLOCATOR_H_