[ROCm] Fix for ROCm CSB Breakage - 200902 - 1

The following commit introduces regressions in two unit tests in the ROCm TF build:

458d0906eb

```
//tensorflow/python/kernel_tests:conv_ops_3d_test_gpu                    FAILED in 3 out of 3 in 212.4s
//tensorflow/python/kernel_tests:conv_ops_test_gpu                       FAILED in 3 out of 3 in 591.0s
```

The failures are within subtests that test padding for the float16 type.

The cause of the failure is a strange one. In the ROCm TF build,
* the call(s) to the `PadInput` functor are in the files (conv_*.cc) that are compiled by the "CPU" compiler, while
* the GPUDevice-specific template instantiations of the `PadInput` functor are in the files that are compiled by the "GPU" compiler.

For T == Eigen::half, the value of the "padding_value" argument (prior to this commit, when it was pass-by-value) was getting corrupted, leading to regressions in the convolution unit tests.
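The layout is roughly the following. This is a minimal sketch only: the file names and the drastically simplified signature are illustrative, not the actual TensorFlow sources (the real signature appears in the diffs below).

```
// pad_input.h -- shared header, seen by both compilers (illustrative name).
template <typename Device, typename T>
struct PadInput {
  // Pass-by-value form that showed the corruption for T == Eigen::half.
  void operator()(const Device& d, const T* in, T* out, T padding_value);
};

// conv_ops.cc -- compiled by the host ("CPU") compiler (gcc 5.4 / 7.5).
// The call site encodes the Eigen::half argument using gcc's ABI rules:
//
//   PadInput<GPUDevice, Eigen::half>()(dev, in, out, Eigen::half(0.0f));
//
// conv_ops_gpu.cu.cc -- compiled by the device ("GPU") compiler (hipclang).
// The definition and explicit instantiation decode that same argument using
// hipclang's ABI rules:
//
//   template struct PadInput<GPUDevice, Eigen::half>;
//
// If the two compilers disagree on how a 2-byte Eigen::half is passed by
// value, padding_value arrives corrupted on the GPU side.
```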

I do not understand the exact reason for this, but based on similar past issues, it is likely due to a combination of
* an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler (hipclang, which is based on the latest clang), AND
* Eigen::half having the same size but different internals on the CPU and GPU sides (unsigned short on CPU, a union of {unsigned short, _Float16} on GPU); see the sketch below.
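A rough sketch of that second point follows. The types are simplified stand-ins, not the real Eigen definitions, and the union member requires a compiler with the _Float16 extension (e.g. the ROCm clang).

```
// Simplified stand-ins for the two views of Eigen::half described above.
struct HalfOnCpu {
  unsigned short x;   // host side: just the raw 16-bit storage
};

union HalfOnGpu {
  unsigned short x;   // device side: same raw 16-bit storage ...
  _Float16 native;    // ... plus a native half-precision member
};

// Same size on both sides, but a plain struct and a union may be treated
// differently at an ABI boundary when passed by value.
static_assert(sizeof(HalfOnCpu) == sizeof(HalfOnGpu),
              "same size, different internals");
```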

Changing the "padding_value" argument to be a const reference type seems to suppress the bug.
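In condensed form, the fix looks like the sketch below; the struct names here are illustrative, and the real signatures and call sites are in the diffs that follow.

```
// Before: padding value passed by value, with a default argument.
template <typename Device, typename T>
struct PadInputByValue {
  void operator()(const Device& d, const T* in, T* out,
                  T padding_value = T{});
};

// After: padding value passed by const reference; the default is dropped, so
// every call site now passes the zero value explicitly, e.g.
//   functor::PadInput<GPUDevice, T, int, 4>()(..., data_format, T{});
template <typename Device, typename T>
struct PadInputByConstRef {
  void operator()(const Device& d, const T* in, T* out,
                  const T& padding_value);
};
```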

This workaround can be removed once the "CPU" compiler is gcc8 or higher.
Author: Deven Desai, 2020-09-02 16:15:00 +00:00
parent ff1ecf14de
commit c25b1c4a55
9 changed files with 42 additions and 18 deletions


@@ -309,6 +309,28 @@ struct TransformDepth {
}
};
// Note on the use of const reference for the "padding_value" argument
//
// In the ROCm TF build,
// ++ the call(s) to the functor are in the files (conv_*.cc) that are compiled
// by the "CPU" compiler, while
// ++ the GPUDevice specific template instantiations are in the files that are
// compiled by the "GPU" compiler.
//
// For T == Eigen::half, the value of the "padding_value" argument (when it was
// pass-by-value) was getting corrupted, leading to regressions in the
// convolution unit tests.
//
// I do not understand the exact reason for this, but based on similar past
// issues, it is likely due to a combination of
// ++ an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for
// Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler
// (hipclang which is based on latest clang), AND
// ++ Eigen::half having the same size but different internals on the CPU and
// GPU sides (unsigned short on CPU, union {unsigned short, _Float16} on GPU)
//
// Changing the "padding value" argument to be a const reference type seems to
// suppress the bug.
template <typename Device, typename T, typename IndexType, int NDIMS>
struct PadInput {
void operator()(const Device& d,
@@ -316,7 +338,7 @@ struct PadInput {
const std::array<int, NDIMS - 2>& padding_left,
const std::array<int, NDIMS - 2>& padding_right,
typename TTypes<T, NDIMS, IndexType>::Tensor out,
TensorFormat format, T padding_value = T{}) {
TensorFormat format, const T& padding_value) {
Eigen::array<Eigen::IndexPair<IndexType>, NDIMS> padding;
padding[GetTensorDimIndex<NDIMS - 2>(format, 'N')] = {0, 0};
for (int i = 0; i < NDIMS - 2; ++i) {


@@ -568,7 +568,7 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
const std::array<int, NDIMS - 2>& padding_left,
const std::array<int, NDIMS - 2>& padding_right,
typename TTypes<T, NDIMS, int>::Tensor out,
TensorFormat format, T padding_value = T{}) {
TensorFormat format, const T& padding_value) {
GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
Dimension<NDIMS> input_dims;
for (int i = 0; i < NDIMS; ++i) {
@@ -579,6 +579,8 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
output_dims[i] = out.dimension(i);
}
VLOG(-1) << static_cast<float>(padding_value);
const Dimension<NDIMS - 2> padding_left_dim(padding_left);
if (format == FORMAT_NHWC) {


@@ -812,7 +812,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
{{static_cast<int>(input_pad_bottom),
static_cast<int>(input_pad_right)}},
To32Bit(compatible_input.tensor<T, 4>()), data_format);
To32Bit(compatible_input.tensor<T, 4>()), data_format, T{});
} else {
compatible_input = input;
}
@@ -1148,7 +1148,7 @@ namespace functor {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>;
DECLARE_GPU_SPEC(float);


@@ -1313,8 +1313,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
{{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
{{static_cast<int>(-input_pad_bottom),
static_cast<int>(-input_pad_right)}},
To32Bit(in_backprop_remove_padding.tensor<T, 4>()),
compute_data_format);
To32Bit(in_backprop_remove_padding.tensor<T, 4>()), compute_data_format,
T{});
pre_transformed_in_backprop = in_backprop_remove_padding;
}
@@ -1346,7 +1346,7 @@ namespace functor {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>;
DECLARE_GPU_SPEC(float);


@@ -1083,7 +1083,7 @@ namespace functor {
const std::array<int, 3>& padding_left, \
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format, \
T padding_value);
const T& padding_value);
DECLARE_GPU_SPEC(Eigen::half);
DECLARE_GPU_SPEC(float);
@@ -1560,7 +1560,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
.tensor<T, 5>()),
{{0, 0, 0}}, {{-planes_odd, -rows_odd, -cols_odd}},
To32Bit(in_backprop_remove_padding.tensor<T, 5>()),
compute_data_format);
compute_data_format, T{});
pre_transformed_in_backprop = in_backprop_remove_padding;
}
@@ -1765,7 +1765,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
context->template eigen_device<GPUDevice>(),
To32Bit(input.tensor<T, 5>()), {{0, 0, 0}},
{{planes_odd, rows_odd, cols_odd}},
To32Bit(compatible_input.tensor<T, 5>()), data_format_);
To32Bit(compatible_input.tensor<T, 5>()), data_format_, T{});
} else {
compatible_input = input;
}


@@ -827,7 +827,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
{{static_cast<int>(input_pad_bottom),
static_cast<int>(input_pad_right)}},
To32Bit(transformed_input.tensor<T, 4>()), data_format);
To32Bit(transformed_input.tensor<T, 4>()), data_format, T{});
input = transformed_input;
in_rows = new_in_rows;
@@ -1184,7 +1184,7 @@ namespace functor {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>
DECLARE_GPU_SPEC(float);


@@ -194,7 +194,7 @@ struct LaunchConvOp<GPUDevice, T, OpKernelContext> {
functor::PadInput<GPUDevice, T, int, 5>()(
ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 5>()),
{{0, 0, 0}}, {{planes_odd, rows_odd, cols_odd}},
To32Bit(transformed_input.tensor<T, 5>()), data_format);
To32Bit(transformed_input.tensor<T, 5>()), data_format, T{});
input = transformed_input;
in_rows = new_in_rows;
in_cols = new_in_cols;
@@ -540,7 +540,7 @@ namespace functor {
const std::array<int, 3>& padding_left, \
const std::array<int, 3>& padding_right, \
typename TTypes<T, 5, int>::Tensor out, TensorFormat format, \
T padding_value); \
const T& padding_value); \
template <> \
void NHWCToNCHW<GPUDevice, T, 5>::operator()( \
const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in, \


@@ -531,7 +531,7 @@ struct LaunchFusedConv2DOp<GPUDevice, T> {
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
{{static_cast<int>(input_pad_bottom),
static_cast<int>(input_pad_right)}},
To32Bit(transformed_input.tensor<T, 4>()), params.data_format);
To32Bit(transformed_input.tensor<T, 4>()), params.data_format, T{});
input = transformed_input;
in_rows = new_in_rows;
in_cols = new_in_cols;
@@ -837,7 +837,7 @@ class FusedConv2DOp : public OpKernel {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>
// Registration of the GPU implementations.


@@ -428,7 +428,7 @@ namespace functor {
const std::array<int, 2>& padding_left, \
const std::array<int, 2>& padding_right, \
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
T padding_value); \
const T& padding_value); \
extern template struct PadInput<GPUDevice, T, int, 4>;
DECLARE_GPU_SPEC(float);
@@ -742,7 +742,7 @@ void DnnPoolingGradOp<T>::Compute(
{{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
{{static_cast<int>(-input_pad_bottom),
static_cast<int>(-input_pad_right)}},
To32Bit(input_backprop->tensor<T, 4>()), data_format);
To32Bit(input_backprop->tensor<T, 4>()), data_format, T{});
}
}