diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
index d65cca6dd36..b9a8c977e11 100644
--- a/tensorflow/core/kernels/conv_2d.h
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -309,6 +309,28 @@ struct TransformDepth {
   }
 };
 
+// Note on the use of a const reference for the "padding_value" argument:
+//
+// In the ROCm TF build,
+// ++ the call(s) to the functor are in files (conv_*.cc) that are compiled
+//    by the "CPU" compiler, while
+// ++ the GPUDevice-specific template instantiations are in files that are
+//    compiled by the "GPU" compiler.
+//
+// For T == Eigen::half, the value of the "padding_value" argument (when it
+// was passed by value) was getting corrupted, leading to regressions in the
+// convolution unit tests.
+//
+// I do not understand the exact reason for this, but based on similar past
+// issues, it is likely due to a combination of
+// ++ an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for
+//    Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler
+//    (hipclang, which is based on the latest clang), AND
+// ++ Eigen::half having the same size but different internals on the CPU and
+//    GPU sides (unsigned short on CPU, union {unsigned short, _Float16} on GPU).
+//
+// Changing the "padding_value" argument to a const reference suppresses the
+// bug.
 template <typename Device, typename T, typename IndexType, int NDIMS>
 struct PadInput {
   void operator()(const Device& d,
@@ -316,7 +338,7 @@ struct PadInput {
                   const std::array<int, NDIMS - 2>& padding_left,
                   const std::array<int, NDIMS - 2>& padding_right,
                   typename TTypes<T, NDIMS, IndexType>::Tensor out,
-                  TensorFormat format, T padding_value = T{}) {
+                  TensorFormat format, const T& padding_value) {
     Eigen::array<Eigen::IndexPair<IndexType>, NDIMS> padding;
     padding[GetTensorDimIndex<NDIMS - 2>(format, 'N')] = {0, 0};
     for (int i = 0; i < NDIMS - 2; ++i) {
diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h
index 31aa04247fa..a235d977e91 100644
--- a/tensorflow/core/kernels/conv_2d_gpu.h
+++ b/tensorflow/core/kernels/conv_2d_gpu.h
@@ -568,7 +568,7 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
                   const std::array<int, NDIMS - 2>& padding_left,
                   const std::array<int, NDIMS - 2>& padding_right,
                   typename TTypes<T, NDIMS, int>::Tensor out,
-                  TensorFormat format, T padding_value = T{}) {
+                  TensorFormat format, const T& padding_value) {
     GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
     Dimension<NDIMS> input_dims;
     for (int i = 0; i < NDIMS; ++i) {
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 840d2c13c77..94078264f24 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -812,7 +812,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
         {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
         {{static_cast<int>(input_pad_bottom),
           static_cast<int>(input_pad_right)}},
-        To32Bit(compatible_input.tensor<T, 4>()), data_format);
+        To32Bit(compatible_input.tensor<T, 4>()), data_format, T{});
   } else {
     compatible_input = input;
   }
@@ -1148,7 +1148,7 @@ namespace functor {
       const std::array<int, 2>& padding_left,                            \
       const std::array<int, 2>& padding_right,                           \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format,  \
-      T padding_value);                                                   \
+      const T& padding_value);                                            \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index c1dadcf767c..0cb701494e0 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -1313,8 +1313,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
         {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
         {{static_cast<int>(-input_pad_bottom),
           static_cast<int>(-input_pad_right)}},
-        To32Bit(in_backprop_remove_padding.tensor<T, 4>()),
-        compute_data_format);
+        To32Bit(in_backprop_remove_padding.tensor<T, 4>()), compute_data_format,
+        T{});
 
     pre_transformed_in_backprop = in_backprop_remove_padding;
   }
@@ -1346,7 +1346,7 @@ namespace functor {
       const std::array<int, 2>& padding_left,                            \
       const std::array<int, 2>& padding_right,                           \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format,  \
-      T padding_value);                                                   \
+      const T& padding_value);                                            \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 7c61dc92604..f57456da854 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -1083,7 +1083,7 @@ namespace functor {
      const std::array<int, 3>& padding_left,                            \
       const std::array<int, 3>& padding_right,                           \
       typename TTypes<T, 5, int>::Tensor out, TensorFormat format,       \
-      T padding_value);
+      const T& padding_value);
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -1560,7 +1560,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
                       .tensor<T, 5>()),
           {{0, 0, 0}}, {{-planes_odd, -rows_odd, -cols_odd}},
           To32Bit(in_backprop_remove_padding.tensor<T, 5>()),
-          compute_data_format);
+          compute_data_format, T{});
 
       pre_transformed_in_backprop = in_backprop_remove_padding;
     }
@@ -1765,7 +1765,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
           context->template eigen_device<GPUDevice>(),
           To32Bit(input.tensor<T, 5>()), {{0, 0, 0}},
           {{planes_odd, rows_odd, cols_odd}},
-          To32Bit(compatible_input.tensor<T, 5>()), data_format_);
+          To32Bit(compatible_input.tensor<T, 5>()), data_format_, T{});
     } else {
       compatible_input = input;
     }
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index b8c2671e7d2..e4c3ab6a642 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -827,7 +827,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
         {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
         {{static_cast<int>(input_pad_bottom),
           static_cast<int>(input_pad_right)}},
-        To32Bit(transformed_input.tensor<T, 4>()), data_format);
+        To32Bit(transformed_input.tensor<T, 4>()), data_format, T{});
 
     input = transformed_input;
     in_rows = new_in_rows;
@@ -1184,7 +1184,7 @@ namespace functor {
       const std::array<int, 2>& padding_left,                            \
       const std::array<int, 2>& padding_right,                           \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format,  \
-      T padding_value);                                                   \
+      const T& padding_value);                                            \
   extern template struct PadInput<GPUDevice, T, int, 4>
 
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 660f4b6febc..d4e551a013c 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -194,7 +194,7 @@ struct LaunchConvOp<GPUDevice, T, OpKernelContext> {
       functor::PadInput<GPUDevice, T, int, 5>()(
           ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 5>()),
           {{0, 0, 0}}, {{planes_odd, rows_odd, cols_odd}},
-          To32Bit(transformed_input.tensor<T, 5>()), data_format);
+          To32Bit(transformed_input.tensor<T, 5>()), data_format, T{});
       input = transformed_input;
       in_rows = new_in_rows;
       in_cols = new_in_cols;
@@ -540,7 +540,7 @@ namespace functor {
       const std::array<int, 3>& padding_left,                            \
       const std::array<int, 3>& padding_right,                           \
       typename TTypes<T, 5, int>::Tensor out, TensorFormat format,       \
-      T padding_value);                                                   \
+      const T& padding_value);                                            \
   template <>                                                            \
   void NHWCToNCHW<GPUDevice, T, 5>::operator()(                          \
       const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,         \
diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h
index 5011e8ba7a1..0fe3131e63c 100644
--- a/tensorflow/core/kernels/conv_ops_fused_impl.h
+++ b/tensorflow/core/kernels/conv_ops_fused_impl.h
@@ -531,7 +531,7 @@ struct LaunchFusedConv2DOp<GPUDevice, T> {
          {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
          {{static_cast<int>(input_pad_bottom),
            static_cast<int>(input_pad_right)}},
-          To32Bit(transformed_input.tensor<T, 4>()), params.data_format);
+          To32Bit(transformed_input.tensor<T, 4>()), params.data_format, T{});
       input = transformed_input;
       in_rows = new_in_rows;
       in_cols = new_in_cols;
@@ -837,7 +837,7 @@ class FusedConv2DOp : public OpKernel {
       const std::array<int, 2>& padding_left,                            \
       const std::array<int, 2>& padding_right,                           \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format,  \
-      T padding_value);                                                   \
+      const T& padding_value);                                            \
   extern template struct PadInput<GPUDevice, T, int, 4>
 
 // Registration of the GPU implementations.
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index ca2f0ecfe21..bbe03dd39cb 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -428,7 +428,7 @@ namespace functor {
       const std::array<int, 2>& padding_left,                            \
       const std::array<int, 2>& padding_right,                           \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format,  \
-      T padding_value);                                                   \
+      const T& padding_value);                                            \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
 DECLARE_GPU_SPEC(float);
@@ -742,7 +742,7 @@ void DnnPoolingGradOp<T>::Compute(
         {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
         {{static_cast<int>(-input_pad_bottom),
           static_cast<int>(-input_pad_right)}},
-        To32Bit(input_backprop->tensor<T, 4>()), data_format);
+        To32Bit(input_backprop->tensor<T, 4>()), data_format, T{});
   }
 }
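
The comment added to conv_2d.h describes a corruption that only shows up when the caller and the GPU instantiation of PadInput are built by different compilers. The stand-alone sketch below (not TensorFlow code; the names HalfCpu, HalfGpu, ReadByValue, and ReadByRef are hypothetical) illustrates the shape of that problem: two half-precision types with the same size but different internals, one read by value and one read through a const reference. Built with a single compiler it runs correctly; the corruption can only occur when the two sides of the call disagree on how the 2-byte struct is passed by value, which a single translation unit cannot reproduce.

// half_abi_sketch.cc -- a minimal illustration, not TensorFlow code.
// All names below (HalfCpu, HalfGpu, ReadByValue, ReadByRef) are hypothetical.
#include <cstdint>
#include <cstring>
#include <iostream>

// Stand-in for Eigen::half as the CPU-side compiler might see it:
// a plain 16-bit integer payload.
struct HalfCpu {
  uint16_t x;
};

// Stand-in for Eigen::half as the GPU-side compiler might see it:
// same size, but the payload lives inside a union, which an ABI may
// classify (and pass in registers) differently than a bare integer.
struct HalfGpu {
  union {
    uint16_t raw;
    // _Float16 f;  // present on compilers that support _Float16
  };
};

// By value: correctness depends on caller and callee agreeing on how a
// 2-byte struct is classified and passed. If they were built by compilers
// that disagree, the callee can read garbage.
uint16_t ReadByValue(HalfGpu h) { return h.raw; }

// By const reference: only an address crosses the call boundary, so the
// callee reinterprets the same two bytes no matter how the type would be
// classified for by-value passing.
uint16_t ReadByRef(const HalfGpu& h) { return h.raw; }

int main() {
  static_assert(sizeof(HalfCpu) == sizeof(HalfGpu),
                "same size, different internals");

  HalfCpu cpu_half{0x3C00};  // bit pattern of 1.0 in IEEE half precision
  HalfGpu gpu_half;
  std::memcpy(&gpu_half, &cpu_half, sizeof(gpu_half));

  // With a single compiler both calls print 3c00; the mismatch described in
  // the conv_2d.h comment needs two compilers with different views of the
  // type on the two sides of the call.
  std::cout << std::hex << ReadByValue(gpu_half) << "\n";
  std::cout << std::hex << ReadByRef(gpu_half) << "\n";
  return 0;
}

Passing the padding value as "const T&" means only a pointer crosses the CPU/GPU compile boundary, which is why the signature change suppresses the bug. Because the default argument "= T{}" is dropped at the same time, every call site in the diff now passes the padding value explicitly (T{} in all current callers).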