[ROCm] Fix for ROCm CSB Breakage - 200902 - 1

The following commit introduces regressions in two unit tests in the ROCm TF build:

458d0906eb

```
//tensorflow/python/kernel_tests:conv_ops_3d_test_gpu                    FAILED in 3 out of 3 in 212.4s
//tensorflow/python/kernel_tests:conv_ops_test_gpu                       FAILED in 3 out of 3 in 591.0s
```

The failures are within subtests that test padding for the float16 type.

The cause of the failure is a strange one. In the ROCm TF build,
* the call(s) to the `PadInput` functor are in the files (conv_*.cc) that are compiled by the "CPU" compiler, while
* the GPUDevice-specific template instantiations of the `PadInput` functor are in the files that are compiled by the "GPU" compiler.

For T == Eigen::half, the value of the "padding_value" argument (prior to this commit, when it was pass-by-value) was getting corrupted, leading to regressions in the convolution unit tests.
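The layout is roughly the following. This is a minimal sketch only: the file names and the drastically simplified signature are illustrative, not the actual TensorFlow sources (the real signature appears in the diffs below).

```
// pad_input.h -- shared header, seen by both compilers (illustrative name).
template <typename Device, typename T>
struct PadInput {
  // Pass-by-value form that showed the corruption for T == Eigen::half.
  void operator()(const Device& d, const T* in, T* out, T padding_value);
};

// conv_ops.cc -- compiled by the host ("CPU") compiler (gcc 5.4 / 7.5).
// The call site encodes the Eigen::half argument using gcc's ABI rules:
//
//   PadInput<GPUDevice, Eigen::half>()(dev, in, out, Eigen::half(0.0f));
//
// conv_ops_gpu.cu.cc -- compiled by the device ("GPU") compiler (hipclang).
// The definition and explicit instantiation decode that same argument using
// hipclang's ABI rules:
//
//   template struct PadInput<GPUDevice, Eigen::half>;
//
// If the two compilers disagree on how a 2-byte Eigen::half is passed by
// value, padding_value arrives corrupted on the GPU side.
```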

I do not understand the exact reason for this, but based on similar past issues, it is likely due to a combination of
* an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler (hipclang, which is based on the latest clang), AND
* Eigen::half having the same size but different internals on the CPU and GPU sides (unsigned short on CPU, a union of {unsigned short, _Float16} on GPU); see the sketch below.
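A rough sketch of that second point follows. The types are simplified stand-ins, not the real Eigen definitions, and the union member requires a compiler with the _Float16 extension (e.g. the ROCm clang).

```
// Simplified stand-ins for the two views of Eigen::half described above.
struct HalfOnCpu {
  unsigned short x;   // host side: just the raw 16-bit storage
};

union HalfOnGpu {
  unsigned short x;   // device side: same raw 16-bit storage ...
  _Float16 native;    // ... plus a native half-precision member
};

// Same size on both sides, but a plain struct and a union may be treated
// differently at an ABI boundary when passed by value.
static_assert(sizeof(HalfOnCpu) == sizeof(HalfOnGpu),
              "same size, different internals");
```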

Changing the "padding_value" argument to be a const reference type seems to suppress the bug.
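In condensed form, the fix looks like the sketch below; the struct names here are illustrative, and the real signatures and call sites are in the diffs that follow.

```
// Before: padding value passed by value, with a default argument.
template <typename Device, typename T>
struct PadInputByValue {
  void operator()(const Device& d, const T* in, T* out,
                  T padding_value = T{});
};

// After: padding value passed by const reference; the default is dropped, so
// every call site now passes the zero value explicitly, e.g.
//   functor::PadInput<GPUDevice, T, int, 4>()(..., data_format, T{});
template <typename Device, typename T>
struct PadInputByConstRef {
  void operator()(const Device& d, const T* in, T* out,
                  const T& padding_value);
};
```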

This workaround can be removed once the "CPU" compiler is gcc8 or higher.
Author: Deven Desai, 2020-09-02 16:15:00 +00:00
parent ff1ecf14de
commit c25b1c4a55
9 changed files with 42 additions and 18 deletions


@@ -309,6 +309,28 @@ struct TransformDepth {
}
};
// Note on the use of const reference for the "padding_value" argument
//
// In the ROCm TF build,
// ++ the call(s) to the functor are in the files (conv_*.cc) that are compiled
// by the "CPU" compiler, while
// ++ the GPUDevice specific template instantiations are in the files that are
// compiled by the "GPU" compiler.
//
// For T == Eigen::half, the value of the "padding_value" argument (when it was
// pass-by-value) was getting corrupted, leading to regressions in the
// convolution unit tests.
//
// I do not understand the exact reason for this, but based on similar past
// issues, it is likely due to a combination of
// ++ an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for
// Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler
// (hipclang which is based on latest clang), AND
// ++ Eigen::half having the same size but different internals on the CPU and
// GPU sides (unsigned short on CPU, union {unsigned short, _Float16} on GPU)
//
// Changing the "padding value" argument to be a const reference type seems to
// suppress the bug.
template <typename Device, typename T, typename IndexType, int NDIMS>
struct PadInput {
void operator()(const Device& d,
@@ -316,7 +338,7 @@ struct PadInput {
const std::array<int, NDIMS - 2>& padding_left,
const std::array<int, NDIMS - 2>& padding_right,
typename TTypes<T, NDIMS, IndexType>::Tensor out,
TensorFormat format, T padding_value = T{}) {
TensorFormat format, const T& padding_value) {
Eigen::array<Eigen::IndexPair<IndexType>, NDIMS> padding;
padding[GetTensorDimIndex<NDIMS - 2>(format, 'N')] = {0, 0};
for (int i = 0; i < NDIMS - 2; ++i) {


@@ -568,7 +568,7 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
const std::array<int, NDIMS - 2>& padding_left,
const std::array<int, NDIMS - 2>& padding_right,
typename TTypes<T, NDIMS, int>::Tensor out,
TensorFormat format, T padding_value = T{}) {
TensorFormat format, const T& padding_value) {
GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
Dimension<NDIMS> input_dims;
for (int i = 0; i < NDIMS; ++i) {
@@ -579,6 +579,8 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
output_dims[i] = out.dimension(i);
}
VLOG(-1) << static_cast<float>(padding_value);
const Dimension<NDIMS - 2> padding_left_dim(padding_left);
if (format == FORMAT_NHWC) {


@@ -812,7 +812,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
{{static_cast<int>(input_pad_bottom),
static_cast<int>(input_pad_right)}},
To32Bit(compatible_input.tensor<T, 4>()), data_format);
To32Bit(compatible_input.tensor<T, 4>()), data_format, T{});
} else {
compatible_input = input;
}
@@ -1148,7 +1148,7 @@ namespace functor {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>;
DECLARE_GPU_SPEC(float);


@@ -1313,8 +1313,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
{{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
{{static_cast<int>(-input_pad_bottom),
static_cast<int>(-input_pad_right)}},
To32Bit(in_backprop_remove_padding.tensor<T, 4>()),
compute_data_format);
To32Bit(in_backprop_remove_padding.tensor<T, 4>()), compute_data_format,
T{});
pre_transformed_in_backprop = in_backprop_remove_padding;
}
@@ -1346,7 +1346,7 @@ namespace functor {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>;
DECLARE_GPU_SPEC(float);


@@ -1083,7 +1083,7 @@ namespace functor {
const std::array<int, 3>& padding_left, \
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format, \
T padding_value);
const T& padding_value);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
@@ -1560,7 +1560,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
.tensor<T, 5>()),
{{0, 0, 0}}, {{-planes_odd, -rows_odd, -cols_odd}},
To32Bit(in_backprop_remove_padding.tensor<T, 5>()),
compute_data_format);
compute_data_format, T{});
pre_transformed_in_backprop = in_backprop_remove_padding;
}
@@ -1765,7 +1765,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
context->template eigen_device<GPUDevice>(),
To32Bit(input.tensor<T, 5>()), {{0, 0, 0}},
{{planes_odd, rows_odd, cols_odd}},
To32Bit(compatible_input.tensor<T, 5>()), data_format_);
To32Bit(compatible_input.tensor<T, 5>()), data_format_, T{});
} else {
compatible_input = input;
}


@@ -827,7 +827,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
{{static_cast<int>(input_pad_bottom),
static_cast<int>(input_pad_right)}},
To32Bit(transformed_input.tensor<T, 4>()), data_format);
To32Bit(transformed_input.tensor<T, 4>()), data_format, T{});
input = transformed_input;
in_rows = new_in_rows;
@@ -1184,7 +1184,7 @@ namespace functor {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>
DECLARE_GPU_SPEC(float);


@@ -194,7 +194,7 @@ struct LaunchConvOp<GPUDevice, T, OpKernelContext> {
functor::PadInput<GPUDevice, T, int, 5>()(
ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 5>()),
{{0, 0, 0}}, {{planes_odd, rows_odd, cols_odd}},
To32Bit(transformed_input.tensor<T, 5>()), data_format);
To32Bit(transformed_input.tensor<T, 5>()), data_format, T{});
input = transformed_input;
in_rows = new_in_rows;
in_cols = new_in_cols;
@@ -540,7 +540,7 @@ namespace functor {
const std::array<int, 3>& padding_left, \
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format, \
T padding_value); \
const T& padding_value); \
template <> \
void NHWCToNCHW<GPUDevice, T, 5>::operator()( \
const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in, \


@@ -531,7 +531,7 @@ struct LaunchFusedConv2DOp<GPUDevice, T> {
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
{{static_cast<int>(input_pad_bottom),
static_cast<int>(input_pad_right)}},
To32Bit(transformed_input.tensor<T, 4>()), params.data_format);
To32Bit(transformed_input.tensor<T, 4>()), params.data_format, T{});
input = transformed_input;
in_rows = new_in_rows;
in_cols = new_in_cols;
@@ -837,7 +837,7 @@ class FusedConv2DOp : public OpKernel {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>
// Registration of the GPU implementations.


@@ -428,7 +428,7 @@ namespace functor {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>;
DECLARE_GPU_SPEC(float);
@@ -742,7 +742,7 @@ void DnnPoolingGradOp<T>::Compute(
{{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
{{static_cast<int>(-input_pad_bottom),
static_cast<int>(-input_pad_right)}},
To32Bit(input_backprop->tensor<T, 4>()), data_format);
To32Bit(input_backprop->tensor<T, 4>()), data_format, T{});
}
}