From 9c281ba516526b63990632ba70051d3d830464fd Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Fri, 4 Sep 2020 01:40:02 +0000
Subject: [PATCH] [ROCm] Fix for ROCm CSB Breakage - 200902 - 2

The following commit introduces regressions in one unit-test in the
ROCm TF build:

https://github.com/tensorflow/tensorflow/commit/458d0906ebe75072ebd87cf5088772405d0cd03e

```
//tensorflow/python/kernel_tests:pooling_ops_test_gpu   FAILED in 3 out of 3 in 41.8s
```

The failures occur in the newly added subtests for testing the explicit
padding feature. The code in the ROCm path has a couple of bugs in it,
which lead to the failures. This commit fixes those bugs.

Note that the changes in PR #42897 are also required to make the above
unit test pass again with the ROCm TF build. The issue being addressed
in that PR is orthogonal to the one being addressed here, hence the two
separate PRs.
---
 tensorflow/core/kernels/maxpooling_op.cc      |  2 +
 tensorflow/core/kernels/pooling_ops_common.cc | 63 +++++++++++--------
 2 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 53168105aba..cd9680c8212 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -1212,9 +1212,11 @@ class MaxPoolingNoMaskOp : public OpKernel {
                                data_format_, tensor_in, out_shape,
                                propagate_nans_);
     } else {
+#if !defined(TENSORFLOW_USE_ROCM)
       OP_REQUIRES(context, padding_ != EXPLICIT,
                   errors::Unimplemented("Explicit padding is not supported ",
                                         "when CUDNN is not enabled."));
+#endif
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
       if (is_int8x4) {
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index bbe03dd39cb..a0c07f31b3d 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -463,6 +463,8 @@ void DnnPoolingGradOp<T>::Compute(
     return;
   }
 
+  TensorFormat transformed_input_data_format = data_format;
+
 #if CUDNN_VERSION < 7300
   /// For now, cudnn does not support NHWC format, so we need to convert it
   /// to NCHW before calling cudnn. We need to get rid of this once it is done
@@ -516,6 +518,7 @@ void DnnPoolingGradOp<T>::Compute(
       functor::NHWCToNCHW<GPUDevice, T, 4>()(
          context->eigen_device<GPUDevice>(), tensor_in->tensor<T, 4>(),
          transformed_input.tensor<T, 4>());
+      transformed_input_data_format = FORMAT_NCHW;
     }
     if (tensor_out) {
       // For AvgPoolGrad, the original output tensor is not necessary. However,
@@ -577,6 +580,8 @@ void DnnPoolingGradOp<T>::Compute(
   int64 input_pad_left = 0;
   int64 input_pad_right = 0;
 
+  Tensor transformed_and_padded_input_backprop;
+
   if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                               params.pad_left != params.pad_right)) {
     // Pad the input in the same way we did during the forward pass, so that
@@ -588,7 +593,6 @@ void DnnPoolingGradOp<T>::Compute(
         std::min(params.pad_left, params.pad_right);
 
     Tensor padded_input;
-    Tensor padded_input_backprop;
     const int64 padding_rows_diff =
         std::abs(params.pad_top - params.pad_bottom);
     const int64 padding_cols_diff =
@@ -607,18 +611,18 @@ void DnnPoolingGradOp<T>::Compute(
                             << " stride_rows" << params.row_stride;
 
     OP_REQUIRES_OK(
-        context,
-        context->allocate_temp(DataTypeToEnum<T>::value,
-                               ShapeFromFormat(data_format, batch_size,
-                                               new_in_rows, new_in_cols, depth),
-                               &padded_input));
+        context, context->allocate_temp(
+                     DataTypeToEnum<T>::value,
+                     ShapeFromFormat(transformed_input_data_format, batch_size,
+                                     new_in_rows, new_in_cols, depth),
+                     &padded_input));
 
     OP_REQUIRES_OK(
-        context,
-        context->allocate_temp(DataTypeToEnum<T>::value,
-                               ShapeFromFormat(data_format, batch_size,
-                                               new_in_rows, new_in_cols, depth),
-                               &transformed_input_backprop));
+        context, context->allocate_temp(
+                     DataTypeToEnum<T>::value,
+                     ShapeFromFormat(transformed_input_data_format, batch_size,
+                                     new_in_rows, new_in_cols, depth),
+                     &transformed_and_padded_input_backprop));
 
     input_pad_top = params.pad_top - common_padding_rows;
     input_pad_bottom = params.pad_bottom - common_padding_rows;
@@ -644,7 +648,8 @@ void DnnPoolingGradOp<T>::Compute(
             To32Bit(const_transformed_input.tensor<T, 4>()),
            static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom),
            static_cast<int>(input_pad_left), static_cast<int>(input_pad_right),
-            To32Bit(padded_input.tensor<T, 4>()), data_format));
+            To32Bit(padded_input.tensor<T, 4>()),
+            transformed_input_data_format));
 
     transformed_input = padded_input;
 
@@ -654,6 +659,8 @@ void DnnPoolingGradOp<T>::Compute(
              << " horizontal padding set to: " << horizontal_padding;
     tensor_in_rows = new_in_rows;
     tensor_in_cols = new_in_cols;
+  } else {
+    transformed_and_padded_input_backprop = transformed_input_backprop;
   }
 
   /// Get ready to call cudnn
@@ -690,9 +697,9 @@ void DnnPoolingGradOp<T>::Compute(
   auto output_backprop_data =
       AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                      transformed_output_backprop.template flat<T>().size());
-  auto input_backprop_data =
-      AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
-                     transformed_input_backprop.template flat<T>().size());
+  auto input_backprop_data = AsDeviceMemory(
+      transformed_and_padded_input_backprop.template flat<T>().data(),
+      transformed_and_padded_input_backprop.template flat<T>().size());
 
   auto* stream = context->op_device_context()->stream();
   OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
@@ -722,6 +729,20 @@ void DnnPoolingGradOp<T>::Compute(
   OP_REQUIRES(context, status,
               errors::Internal("dnn PoolBackward launch failed"));
 
+  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
+                              params.pad_left != params.pad_right)) {
+    // Remove the padding that was added to the input shape above.
+    functor::PadInput<GPUDevice, T, int, 4>()(
+        context->eigen_device<GPUDevice>(),
+        To32Bit(const_cast<const Tensor&>(transformed_and_padded_input_backprop)
+                    .tensor<T, 4>()),
+        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
+        {{static_cast<int>(-input_pad_bottom),
+          static_cast<int>(-input_pad_right)}},
+        To32Bit(transformed_input_backprop.template tensor<T, 4>()),
+        transformed_input_data_format, T{});
+  }
+
 #if CUDNN_VERSION < 7300
   if (data_format == FORMAT_NHWC) {
     /// Transform the output data from NCHW back to NHWC.
@@ -732,18 +753,6 @@ void DnnPoolingGradOp<T>::Compute(
         input_backprop->tensor<T, 4>());
   }
 #endif  // CUDNN_VERSION < 7300
-  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
-                              params.pad_left != params.pad_right)) {
-    // Remove the padding that was added to the input shape above.
-    functor::PadInput<GPUDevice, T, int, 4>()(
-        context->eigen_device<GPUDevice>(),
-        To32Bit(const_cast<const Tensor&>(transformed_input_backprop)
-                    .tensor<T, 4>()),
-        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
-        {{static_cast<int>(-input_pad_bottom),
-          static_cast<int>(-input_pad_right)}},
-        To32Bit(input_backprop->tensor<T, 4>()), data_format, T{});
-  }
 }
 
 #define DEFINE_DNN_OPS(T) \
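
--
Background on the padding arithmetic this patch touches: the explicit-padding
scheme splits each asymmetric padding pair into a symmetric part, which the
DNN library (cudnn/MIOpen) applies itself, and a residual part that is
materialized by physically padding the input and later stripped from the
computed input backprop by the `PadInput` call with negated offsets seen
above. A minimal standalone sketch of that decomposition (hypothetical helper
names, not TF code):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Split one asymmetric padding pair into the symmetric amount handed to the
// DNN library and the residual amounts materialized by explicitly padding the
// input tensor. Mirrors the common_padding_rows / input_pad_top /
// input_pad_bottom arithmetic in the patch above.
struct PaddingSplit {
  int64_t common;        // symmetric padding the library applies itself
  int64_t extra_before;  // residual padding added before the data
  int64_t extra_after;   // residual padding added after the data
};

PaddingSplit SplitPadding(int64_t pad_before, int64_t pad_after) {
  const int64_t common = std::min(pad_before, pad_after);
  return {common, pad_before - common, pad_after - common};
}

int main() {
  // Example: pad_top = 3, pad_bottom = 1 (asymmetric vertical padding).
  const PaddingSplit rows = SplitPadding(/*pad_before=*/3, /*pad_after=*/1);
  // Prints: common=1 extra_before=2 extra_after=0
  std::cout << "common=" << rows.common
            << " extra_before=" << rows.extra_before
            << " extra_after=" << rows.extra_after << "\n";
  return 0;
}
```

The two bugs fixed here follow directly from that scheme: in the
CUDNN_VERSION < 7300 (MIOpen/ROCm) path the tensors are already in NCHW when
the residual padding is applied, so the padded temporaries must be shaped
with transformed_input_data_format rather than data_format, and the
un-padding must run on the transformed backprop before the NCHW-to-NHWC
conversion, which is why the PadInput block moves ahead of the
`#if CUDNN_VERSION < 7300` section.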