Add ROCm support for pooling operators
max pooling for qint8 is disabled on ROCm for now
parent f0ae8bef6a
commit 4f9e60968e
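
The pattern applied throughout this diff is TensorFlow's CUDA/ROCm portability layer: device loops use GPU_1D_KERNEL_LOOP instead of CUDA_1D_KERNEL_LOOP, launches go through GetGpuLaunchConfig and GpuLaunchKernel instead of the Cuda-prefixed helpers, atomics use GpuAtomicAdd, and GPU-only code is guarded by GOOGLE_CUDA || TENSORFLOW_USE_ROCM. The sketch below only illustrates that pattern and is not part of this commit; the kernel ScaleInPlace and its launcher are hypothetical, and the helper header path may differ between TensorFlow versions.

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU

#include "tensorflow/core/util/gpu_kernel_helper.h"

namespace tensorflow {

// Hypothetical element-wise kernel that scales a buffer in place. The
// GPU_1D_KERNEL_LOOP macro expands to the appropriate grid-stride loop on
// both CUDA and ROCm builds.
template <typename T>
__global__ void ScaleInPlace(const int nthreads, const T scale, T* data) {
  GPU_1D_KERNEL_LOOP(index, nthreads) { data[index] = data[index] * scale; }
}

// Host-side launcher using the platform-neutral helpers this commit switches
// the pooling kernels to (GetGpuLaunchConfig / GpuLaunchKernel).
template <typename T>
bool RunScaleInPlace(T* data, int n, T scale, const Eigen::GpuDevice& d) {
  if (n == 0) return true;
  GpuLaunchConfig config = GetGpuLaunchConfig(n, d);
  TF_CHECK_OK(GpuLaunchKernel(ScaleInPlace<T>, config.block_count,
                              config.thread_per_block, 0, d.stream(),
                              config.virtual_thread_count, scale, data));
  return d.ok();
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

On a ROCm build the same source is compiled with the HIP toolchain, so the pooling code compiles for both platforms; the main exception is the qint8 NCHW_VECT_C max-pool kernel, which relies on the CUDA __vmaxs4 intrinsic and therefore stays behind a GOOGLE_CUDA-only guard for now (hence the note in the commit message).
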
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

@@ -48,7 +48,7 @@ __global__ void AvePoolBackwardNHWC(const int nthreads,
 const int kernel_w, const int stride_h,
 const int stride_w, const int pad_t,
 const int pad_l, dtype* const bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // find out the local index
 // find out the local offset
 const int c = index % channels;
@@ -90,8 +90,8 @@ bool RunAvePoolBackwardNHWC(const T* const top_diff, const int num,
 const int pad_l, T* const bottom_diff,
 const GPUDevice& d) {
 int x_size = num * height * width * channels;
-GpuLaunchConfig config = GetCudaLaunchConfig(x_size, d);
-TF_CHECK_OK(CudaLaunchKernel(
+GpuLaunchConfig config = GetGpuLaunchConfig(x_size, d);
+TF_CHECK_OK(GpuLaunchKernel(
 AvePoolBackwardNHWC<T>, config.block_count, config.thread_per_block, 0,
 d.stream(), config.virtual_thread_count, top_diff, num, height, width,
 channels, pooled_height, pooled_width, kernel_h, kernel_w, stride_h,
@@ -121,4 +121,4 @@ template bool RunAvePoolBackwardNHWC(

 } // end namespace tensorflow

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -608,6 +608,7 @@ typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
 se::dnn::AlgorithmConfig>
 AutoTuneConv;

+#if GOOGLE_CUDA
 // Check the passed allocator for redzone violations.
 // If violations have occurred, mark the corresponding autotune result
 // as a failure.
@@ -646,6 +647,7 @@ static void CheckRedzones(const se::cuda::RedzoneAllocator& rz_allocator,
 LOG(ERROR) << rz_check_status.RedzoneFailureMsg();
 }
 }
+#endif

 template <typename T>
 void LaunchConv2DOp<GPUDevice, T>::operator()(

@@ -28,7 +28,7 @@ typedef Eigen::GpuDevice GPUDevice;

 namespace tensorflow {

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename T>
 void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
@@ -103,7 +103,7 @@ void DnnPooling3dOp<T>::Compute(OpKernelContext* context,
 output_desc, &output_data)
 .ok();
 OP_REQUIRES(context, status,
-errors::Internal("cudnn PoolForward launch failed"));
+errors::Internal("dnn PoolForward launch failed"));

 if (data_format == FORMAT_NHWC) {
 auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
@@ -232,7 +232,7 @@ void DnnPooling3dGradOp<T>::Compute(
 output_backprop_data, &input_backprop_data)
 .ok();
 OP_REQUIRES(context, status,
-errors::Internal("cudnn PoolBackward launch failed"));
+errors::Internal("dnn PoolBackward launch failed"));

 if (data_format == FORMAT_NHWC) {
 auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
@@ -249,6 +249,6 @@ void DnnPooling3dGradOp<T>::Compute(
 TF_CALL_float(DEFINE_DNN_OPS) TF_CALL_half(DEFINE_DNN_OPS)
 #undef DEFINE_DNN_OPS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 } // namespace tensorflow

@@ -22,7 +22,7 @@ limitations under the License.

 #include "tensorflow/core/framework/op_kernel.h"

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/platform/stream_executor.h"
 #endif

@@ -30,7 +30,7 @@ limitations under the License.

 namespace tensorflow {

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 // Runs (avg/max)pooling on GPU.
 // Dimension order for all array arguments is: x, y, z.

@@ -42,10 +42,12 @@ limitations under the License.

 #if GOOGLE_CUDA
 #include "third_party/gpus/cudnn/cudnn.h"
+#endif // GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 namespace tensorflow {

@@ -317,7 +319,7 @@ class MaxPoolingGradOp : public OpKernel {
 TensorFormat data_format_;
 };

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename T>
 static void MaxPoolingBackwardCustomKernel(
@@ -438,7 +440,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
 bool propagate_nans_;
 };

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 // The operation to compute gradient of MaxPool gradients.
 // It takes three inputs:
@@ -647,7 +649,7 @@ class MaxPoolingGradGradOp : public OpKernel {
 TensorFormat data_format_;
 };

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <class T>
 class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
@@ -744,7 +746,7 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
 bool use_dnn_;
 };

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename Device, typename T>
 struct LaunchMaxPoolingNoMask;
@@ -1112,7 +1114,7 @@ class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
 bool include_batch_in_index_;
 };

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <typename T>
 class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
 public:
@@ -1383,7 +1385,7 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
 }
 };

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define REGISTER_MAX_POOL_KERNELS(D, T) \
 REGISTER_KERNEL_BUILDER( \
@@ -1430,7 +1432,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS);
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS);
 #undef REGISTER_CPU_KERNELS

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 // Forward declarations for the functor specializations for GPU.
 namespace functor {
@@ -1509,7 +1511,7 @@ REGISTER_KERNEL_BUILDER(Name("MaxPoolV2")

 #undef REGISTER_GPU_ONLY_POOL_KERNELS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #undef REGISTER_MAX_POOL_KERNELS


@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

@@ -70,7 +70,7 @@ __global__ void MaxPoolForwardNCHW(
 const int pooled_width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 dtype* top_data, int64* mask, const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 int pw = index % pooled_width;
 int ph = (index / pooled_width) % pooled_height;
 int c = (index / pooled_width / pooled_height) % channels;
@@ -101,6 +101,7 @@ __global__ void MaxPoolForwardNCHW(
 }
 }

+#if GOOGLE_CUDA
 // The parameters for MaxPoolForwardNoMaskKernel_NCHW_VECT_C are the same as for
 // MaxPoolForwardNCHW above, except that mask is not supported, and each
 // element of the input and output contains 4 adjacent channel values for
@@ -114,7 +115,7 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
 int32* top_data) {
 // TODO(pauldonnelly): Implement a better optimized version of this kernel.
 const int32 kMinINT8X4 = 0x80808080;
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 int pw = index % pooled_width;
 int ph = (index / pooled_width) % pooled_height;
 int c = (index / pooled_width / pooled_height) % channels;
@@ -136,6 +137,7 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
 top_data[index] = maxval;
 }
 }
+#endif // GOOGLE_CUDA

 template <bool propagate_nans, typename dtype>
 __global__ void MaxPoolForwardNHWC(
@@ -144,7 +146,7 @@ __global__ void MaxPoolForwardNHWC(
 const int pooled_width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 dtype* top_data, int64* mask, const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 int n = index;
 int c = n % channels;
 n /= channels;
@@ -183,7 +185,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
 const int pooled_width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 const dtype* top_diff, dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int n = index;
 int c = n % channels;
@@ -212,7 +214,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
 // Atomically accumulate the bottom diff. The index could still be
 // uninitialized, if all the bottom_data are NaN.
 if (maxidx != -1) {
-CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
+GpuAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
 top_diff[index]);
 }
 }
@@ -234,7 +236,7 @@ __global__ void MaxPoolBackwardNoMaskNHWC(
 // bottom_diff: the gradient with respect to the input.
 // include_batch_in_index: whether to include batch dimension in flattened
 // index of `argmax`.
-// This function relies on CudaAtomicAdd to avoid race conditions. Also, before
+// This function relies on GpuAtomicAdd to avoid race conditions. Also, before
 // the kernel is run, you will need to make sure that bottom_diff is filled with
 // zero first.
 template <typename dtype>
@@ -242,10 +244,10 @@ __global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
 const int64* mask, const int top_offset,
 const int bottom_offset, dtype* bottom_diff,
 const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 const int offset =
 include_batch_in_index ? 0 : (index / top_offset) * bottom_offset;
-CudaAtomicAdd(bottom_diff + offset + mask[index], top_diff[index]);
+GpuAtomicAdd(bottom_diff + offset + mask[index], top_diff[index]);
 }
 }

@@ -270,7 +272,7 @@ __global__ void MaxPoolGradBackwardNoMaskNCHW(
 const int height, const int width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 const dtype* top_diff, dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int pw = index % pooled_width;
 int ph = (index / pooled_width) % pooled_height;
@@ -310,7 +312,7 @@ __global__ void MaxPoolGradBackwardNoMaskNHWC(
 const int height, const int width, const int kernel_h, const int kernel_w,
 const int stride_h, const int stride_w, const int pad_t, const int pad_l,
 const dtype* top_diff, dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int n = index;
 int c = n % channels;
@@ -369,18 +371,19 @@ __global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
 const int64* mask, const int top_offset,
 const int bottom_offset, dtype* bottom_diff,
 const bool include_batch_in_index) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 const int offset =
 include_batch_in_index ? 0 : (index / bottom_offset) * top_offset;
 bottom_diff[index] = top_diff[offset + mask[index]];
 }
 }

-#undef CUDA_1D_KERNEL_LOOP
+#undef GPU_1D_KERNEL_LOOP
 } // namespace

 namespace functor {

+#if GOOGLE_CUDA
 // Note: channels is the outer channels (dim 1) which has already been
 // divided by 4.
 bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
@@ -392,7 +395,7 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
 const int kThreadsPerBlock = 1024;
 const int output_size = batch * channels * pooled_height * pooled_width;
 if (output_size == 0) return true;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolForwardNoMaskKernel_NCHW_VECT_C,
 (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
 0, d.stream(), output_size, bottom_data, height, width, channels,
@@ -400,6 +403,7 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
 pad_t, pad_l, top_data));
 return d.ok();
 }
+#endif // GOOGLE_CUDA

 template <typename T>
 bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
@@ -413,7 +417,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
 const int output_size = batch * channels * pooled_height * pooled_width;
 if (output_size == 0) return true;
 if (propagate_nans) {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolForwardNHWC<true, T>,
 (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 kThreadsPerBlock, 0, d.stream(), output_size, bottom_data, height,
@@ -421,7 +425,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
 stride_h, stride_w, pad_t, pad_l, top_data, mask,
 include_batch_in_index));
 } else {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolForwardNHWC<false, T>,
 (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 kThreadsPerBlock, 0, d.stream(), output_size, bottom_data, height,
@@ -443,12 +447,12 @@ bool MaxPoolBackwardNoMask<T>::operator()(

 const int bottom_size = batch * channels * height * width;
 if (bottom_size == 0) return true;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 SetZero<T>, (bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 kThreadsPerBlock, 0, d.stream(), bottom_size, bottom_diff));

 const int top_size = batch * channels * pooled_height * pooled_width;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolBackwardNoMaskNHWC<T>,
 (top_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0,
 d.stream(), top_size, bottom_data, height, width, channels, pooled_height,
@@ -465,10 +469,10 @@ bool MaxPoolBackwardWithArgmax<T>::operator()(
 const bool include_batch_in_index) {
 const int kThreadsPerBlock = 1024;
 if (input_size == 0) return true;
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 SetZero<T>, (input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 kThreadsPerBlock, 0, d.stream(), input_size, bottom_diff));
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolBackward<T>,
 (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
 0, d.stream(), output_size, top_diff, mask, top_offset, bottom_offset,
@@ -486,18 +490,18 @@ bool MaxPoolGradBackwardNoMask<T>::operator()(
 const Eigen::GpuDevice& d) {
 const int num_kernels = batch * channels * pooled_height * pooled_width;
 if (num_kernels == 0) return true;
-GpuLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
+GpuLaunchConfig config = GetGpuLaunchConfig(num_kernels, d);

 if (data_format == FORMAT_NHWC) {
 TF_CHECK_OK(
-CudaLaunchKernel(MaxPoolGradBackwardNoMaskNHWC<T>, config.block_count,
+GpuLaunchKernel(MaxPoolGradBackwardNoMaskNHWC<T>, config.block_count,
 config.thread_per_block, 0, d.stream(), num_kernels,
 bottom_data, output_data, pooled_height, pooled_width,
 channels, height, width, kernel_h, kernel_w, stride_h,
 stride_w, pad_t, pad_l, top_diff, bottom_diff));
 } else {
 TF_CHECK_OK(
-CudaLaunchKernel(MaxPoolGradBackwardNoMaskNCHW<T>, config.block_count,
+GpuLaunchKernel(MaxPoolGradBackwardNoMaskNCHW<T>, config.block_count,
 config.thread_per_block, 0, d.stream(), num_kernels,
 bottom_data, output_data, pooled_height, pooled_width,
 channels, height, width, kernel_h, kernel_w, stride_h,
@@ -513,8 +517,8 @@ bool MaxPoolGradBackwardWithArgmax<T>::operator()(
 T* bottom_diff, const Eigen::GpuDevice& d,
 const bool include_batch_in_index) {
 if (input_size == 0) return true;
-GpuLaunchConfig config = GetCudaLaunchConfig(output_size, d);
-TF_CHECK_OK(CudaLaunchKernel(
+GpuLaunchConfig config = GetGpuLaunchConfig(output_size, d);
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolGradBackward<T>, config.block_count, config.thread_per_block, 0,
 d.stream(), output_size, top_diff, mask, top_offset, bottom_offset,
 bottom_diff, include_batch_in_index));
@@ -539,4 +543,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);

 } // end namespace tensorflow

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -33,7 +33,7 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/work_sharder.h"

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/cudnn_pooling_gpu.h"
 #include "tensorflow/core/kernels/pooling_ops_3d_gpu.h"
 #endif
@@ -738,7 +738,7 @@ class MaxPooling3dGradGradOp : public OpKernel {
 TF_CALL_float(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename T>
 struct LaunchPoolingOp<GPUDevice, T, AVG> {
@@ -826,7 +826,7 @@ struct LaunchMaxPooling3dGradGradOp<GPUDevice, T> {
 TF_CALL_float(REGISTER_GPU_KERNELS) TF_CALL_half(REGISTER_GPU_KERNELS)
 #undef REGISTER_GPU_KERNELS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #ifdef TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNELS(T) REGISTER_KERNELS(SYCL, T)

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 #define EIGEN_USE_GPU

@@ -35,7 +35,7 @@ __global__ void MaxPoolGradBackwardNoMaskNCDHW(
 const int stride_p, const int stride_h, const int stride_w, const int pad_p,
 const int pad_t, const int pad_l, const dtype* top_diff,
 dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int pw = index % pooled_width;
 int ph = (index / pooled_width) % pooled_height;
@@ -85,7 +85,7 @@ __global__ void MaxPoolGradBackwardNoMaskNDHWC(
 const int stride_p, const int stride_h, const int stride_w, const int pad_p,
 const int pad_t, const int pad_l, const dtype* top_diff,
 dtype* bottom_diff) {
-CUDA_1D_KERNEL_LOOP(index, nthreads) {
+GPU_1D_KERNEL_LOOP(index, nthreads) {
 // First find out the index to the maximum, since we have no mask.
 int n = index;
 int c = n % channels;
@@ -142,16 +142,16 @@ bool MaxPool3dGradBackward<T>::operator()(
 const T* top_diff, T* bottom_diff, const Eigen::GpuDevice& d) {
 int num_kernels =
 batch * channels * pooled_plane * pooled_height * pooled_width;
-GpuLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
+GpuLaunchConfig config = GetGpuLaunchConfig(num_kernels, d);
 if (data_format == FORMAT_NHWC) {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolGradBackwardNoMaskNDHWC<T>, config.block_count,
 config.thread_per_block, 0, d.stream(), num_kernels, bottom_data,
 output_data, pooled_plane, pooled_height, pooled_width, channels, plane,
 height, width, kernel_p, kernel_h, kernel_w, stride_p, stride_h,
 stride_w, pad_p, pad_t, pad_l, top_diff, bottom_diff));
 } else {
-TF_CHECK_OK(CudaLaunchKernel(
+TF_CHECK_OK(GpuLaunchKernel(
 MaxPoolGradBackwardNoMaskNCDHW<T>, config.block_count,
 config.thread_per_block, 0, d.stream(), num_kernels, bottom_data,
 output_data, pooled_plane, pooled_height, pooled_width, channels, plane,
@@ -169,4 +169,4 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);

 } // namespace tensorflow

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#if !GOOGLE_CUDA
-#error This file must only be included when building with Cuda support
+#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
+#error This file must only be included when building with Cuda or ROCm support
 #endif

 #ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_3D_GPU_H_

@@ -22,10 +22,12 @@ limitations under the License.

 #if GOOGLE_CUDA
 #include "third_party/gpus/cudnn/cudnn.h"
+#endif // GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 namespace tensorflow {

@@ -125,7 +127,7 @@ TensorShape PoolParameters::forward_output_shape() {
 }
 }

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 namespace {
 template <typename T>
@@ -267,7 +269,7 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
 output_desc, &output_data)
 .ok();
 OP_REQUIRES(context, status,
-errors::Internal("cudnn PoolForward launch failed"));
+errors::Internal("dnn PoolForward launch failed"));
 #if CUDNN_VERSION < 7300
 if (data_format == FORMAT_NHWC) {
 /// Transform the output data from NCHW back to NHWC
@@ -420,7 +422,7 @@ void DnnPoolingGradOp<T>::Compute(
 output_backprop_data, &input_backprop_data)
 .ok();
 OP_REQUIRES(context, status,
-errors::Internal("cudnn PoolBackward launch failed"));
+errors::Internal("dnn PoolBackward launch failed"));

 if (data_format == FORMAT_NHWC) {
 /// Transform the output data from NCHW back to NHWC.
@@ -443,6 +445,6 @@ template class DnnPoolingOp<qint8>;

 #undef DEFINE_DNN_OPS

-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 } // namespace tensorflow

@@ -29,9 +29,9 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/work_sharder.h"

-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
-#endif // GOOGLE_CUDA
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 namespace tensorflow {

@@ -269,11 +269,12 @@ class MaxPoolingOp : public OpKernel {
 template <typename Device>
 struct LaunchMaxPoolingNoMask_NCHW_VECT_C;

-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <>
 struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
 static void launch(OpKernelContext* context, const PoolParameters& params,
 const Tensor& input, Tensor* output) {
+#if GOOGLE_CUDA
 bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
 reinterpret_cast<const int32*>(input.flat<qint8>().data()),
 params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
@@ -286,9 +287,14 @@ struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
 context->SetStatus(errors::Internal(
 "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
 }
+#else
+// ROCm TODO: add support __vmaxs4 on ROCm
+context->SetStatus(errors::Internal(
+"Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
+#endif // GOOGLE_CUDA
 }
 };
-#endif
+#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

 template <typename Device, typename T>
 class MaxPoolingV2Op : public OpKernel {
@@ -405,7 +411,7 @@ class MaxPoolingV2Op : public OpKernel {
 // Spatial MaxPooling implementation.
 //
 // TODO(vrv): Remove this once we no longer need it.
-#ifdef GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 if (std::is_same<Device, GPUDevice>::value) {
 Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
 if (std::is_same<T, qint8>::value) {