Merge pull request #42897 from ROCmSoftwarePlatform:google_upstream_rocm_fix_200902_1
PiperOrigin-RevId: 333777275 Change-Id: I4dbe0f969987cf92775f81d3a6ca19869a796be8
This commit is contained in:
commit
ed2e5c0cee
@ -309,6 +309,28 @@ struct TransformDepth {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Note on the use of const reference for the "padding_value" argument
|
||||||
|
//
|
||||||
|
// In the ROCm TF build,
|
||||||
|
// ++ the call(s) to the functor are in the files (conv_*.cc) that are compiled
|
||||||
|
// by the "CPU" compiler, while the
|
||||||
|
// ++ the GPUDevice specific template instantiations are in the files that are
|
||||||
|
// compiled by the "GPU" compiler.
|
||||||
|
//
|
||||||
|
// For T == Eigen::half, the value of the "padding_value" argument (when it was
|
||||||
|
// pass-by-value) was getting corrupted, leading to regressions in the
|
||||||
|
// convolution unit tests.
|
||||||
|
//
|
||||||
|
// I do not understand the exact reason for the this, but based on similar past
|
||||||
|
// issues, it is likely due to a combination of
|
||||||
|
// ++ an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for
|
||||||
|
// Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler
|
||||||
|
// (hipclang which is based on latest clang), AND
|
||||||
|
// ++ Eigen::half having the same size but different internals on the CPU and
|
||||||
|
// GPU sides (unsigned short on CPU, union {unsigned short, _Float16} on GPU
|
||||||
|
//
|
||||||
|
// Changing the "padding value" argument to be a const reference type seems to
|
||||||
|
// suppress the bug
|
||||||
template <typename Device, typename T, typename IndexType, int NDIMS>
|
template <typename Device, typename T, typename IndexType, int NDIMS>
|
||||||
struct PadInput {
|
struct PadInput {
|
||||||
void operator()(const Device& d,
|
void operator()(const Device& d,
|
||||||
@ -316,7 +338,7 @@ struct PadInput {
|
|||||||
const std::array<int, NDIMS - 2>& padding_left,
|
const std::array<int, NDIMS - 2>& padding_left,
|
||||||
const std::array<int, NDIMS - 2>& padding_right,
|
const std::array<int, NDIMS - 2>& padding_right,
|
||||||
typename TTypes<T, NDIMS, IndexType>::Tensor out,
|
typename TTypes<T, NDIMS, IndexType>::Tensor out,
|
||||||
TensorFormat format, T padding_value = T{}) {
|
TensorFormat format, const T& padding_value) {
|
||||||
Eigen::array<Eigen::IndexPair<IndexType>, NDIMS> padding;
|
Eigen::array<Eigen::IndexPair<IndexType>, NDIMS> padding;
|
||||||
padding[GetTensorDimIndex<NDIMS - 2>(format, 'N')] = {0, 0};
|
padding[GetTensorDimIndex<NDIMS - 2>(format, 'N')] = {0, 0};
|
||||||
for (int i = 0; i < NDIMS - 2; ++i) {
|
for (int i = 0; i < NDIMS - 2; ++i) {
|
||||||
|
@ -568,7 +568,7 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
|
|||||||
const std::array<int, NDIMS - 2>& padding_left,
|
const std::array<int, NDIMS - 2>& padding_left,
|
||||||
const std::array<int, NDIMS - 2>& padding_right,
|
const std::array<int, NDIMS - 2>& padding_right,
|
||||||
typename TTypes<T, NDIMS, int>::Tensor out,
|
typename TTypes<T, NDIMS, int>::Tensor out,
|
||||||
TensorFormat format, T padding_value = T{}) {
|
TensorFormat format, const T& padding_value) {
|
||||||
GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
|
GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
|
||||||
Dimension<NDIMS> input_dims;
|
Dimension<NDIMS> input_dims;
|
||||||
for (int i = 0; i < NDIMS; ++i) {
|
for (int i = 0; i < NDIMS; ++i) {
|
||||||
|
@ -812,7 +812,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
|
|||||||
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
|
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
|
||||||
{{static_cast<int>(input_pad_bottom),
|
{{static_cast<int>(input_pad_bottom),
|
||||||
static_cast<int>(input_pad_right)}},
|
static_cast<int>(input_pad_right)}},
|
||||||
To32Bit(compatible_input.tensor<T, 4>()), data_format);
|
To32Bit(compatible_input.tensor<T, 4>()), data_format, T{});
|
||||||
} else {
|
} else {
|
||||||
compatible_input = input;
|
compatible_input = input;
|
||||||
}
|
}
|
||||||
@ -1132,7 +1132,7 @@ namespace functor {
|
|||||||
const std::array<int, 2>& padding_left, \
|
const std::array<int, 2>& padding_left, \
|
||||||
const std::array<int, 2>& padding_right, \
|
const std::array<int, 2>& padding_right, \
|
||||||
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
||||||
T padding_value); \
|
const T& padding_value); \
|
||||||
extern template struct PadInput<GPUDevice, T, int, 4>;
|
extern template struct PadInput<GPUDevice, T, int, 4>;
|
||||||
|
|
||||||
DECLARE_GPU_SPEC(float);
|
DECLARE_GPU_SPEC(float);
|
||||||
|
@ -1299,8 +1299,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
|
|||||||
{{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
|
{{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
|
||||||
{{static_cast<int>(-input_pad_bottom),
|
{{static_cast<int>(-input_pad_bottom),
|
||||||
static_cast<int>(-input_pad_right)}},
|
static_cast<int>(-input_pad_right)}},
|
||||||
To32Bit(in_backprop_remove_padding.tensor<T, 4>()),
|
To32Bit(in_backprop_remove_padding.tensor<T, 4>()), compute_data_format,
|
||||||
compute_data_format);
|
T{});
|
||||||
|
|
||||||
pre_transformed_in_backprop = in_backprop_remove_padding;
|
pre_transformed_in_backprop = in_backprop_remove_padding;
|
||||||
}
|
}
|
||||||
@ -1332,7 +1332,7 @@ namespace functor {
|
|||||||
const std::array<int, 2>& padding_left, \
|
const std::array<int, 2>& padding_left, \
|
||||||
const std::array<int, 2>& padding_right, \
|
const std::array<int, 2>& padding_right, \
|
||||||
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
||||||
T padding_value); \
|
const T& padding_value); \
|
||||||
extern template struct PadInput<GPUDevice, T, int, 4>;
|
extern template struct PadInput<GPUDevice, T, int, 4>;
|
||||||
|
|
||||||
DECLARE_GPU_SPEC(float);
|
DECLARE_GPU_SPEC(float);
|
||||||
|
@ -1083,7 +1083,7 @@ namespace functor {
|
|||||||
const std::array<int, 3>& padding_left, \
|
const std::array<int, 3>& padding_left, \
|
||||||
const std::array<int, 3>& padding_right, \
|
const std::array<int, 3>& padding_right, \
|
||||||
typename TTypes<T, 5, int>::Tensor out, TensorFormat format, \
|
typename TTypes<T, 5, int>::Tensor out, TensorFormat format, \
|
||||||
T padding_value);
|
const T& padding_value);
|
||||||
|
|
||||||
DECLARE_GPU_SPEC(Eigen::half);
|
DECLARE_GPU_SPEC(Eigen::half);
|
||||||
DECLARE_GPU_SPEC(float);
|
DECLARE_GPU_SPEC(float);
|
||||||
@ -1547,7 +1547,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
|
|||||||
.tensor<T, 5>()),
|
.tensor<T, 5>()),
|
||||||
{{0, 0, 0}}, {{-planes_odd, -rows_odd, -cols_odd}},
|
{{0, 0, 0}}, {{-planes_odd, -rows_odd, -cols_odd}},
|
||||||
To32Bit(in_backprop_remove_padding.tensor<T, 5>()),
|
To32Bit(in_backprop_remove_padding.tensor<T, 5>()),
|
||||||
compute_data_format);
|
compute_data_format, T{});
|
||||||
|
|
||||||
pre_transformed_in_backprop = in_backprop_remove_padding;
|
pre_transformed_in_backprop = in_backprop_remove_padding;
|
||||||
}
|
}
|
||||||
@ -1752,7 +1752,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
|
|||||||
context->template eigen_device<GPUDevice>(),
|
context->template eigen_device<GPUDevice>(),
|
||||||
To32Bit(input.tensor<T, 5>()), {{0, 0, 0}},
|
To32Bit(input.tensor<T, 5>()), {{0, 0, 0}},
|
||||||
{{planes_odd, rows_odd, cols_odd}},
|
{{planes_odd, rows_odd, cols_odd}},
|
||||||
To32Bit(compatible_input.tensor<T, 5>()), data_format_);
|
To32Bit(compatible_input.tensor<T, 5>()), data_format_, T{});
|
||||||
} else {
|
} else {
|
||||||
compatible_input = input;
|
compatible_input = input;
|
||||||
}
|
}
|
||||||
|
@ -827,7 +827,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
|
|||||||
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
|
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
|
||||||
{{static_cast<int>(input_pad_bottom),
|
{{static_cast<int>(input_pad_bottom),
|
||||||
static_cast<int>(input_pad_right)}},
|
static_cast<int>(input_pad_right)}},
|
||||||
To32Bit(transformed_input.tensor<T, 4>()), data_format);
|
To32Bit(transformed_input.tensor<T, 4>()), data_format, T{});
|
||||||
|
|
||||||
input = transformed_input;
|
input = transformed_input;
|
||||||
in_rows = new_in_rows;
|
in_rows = new_in_rows;
|
||||||
@ -1170,7 +1170,7 @@ namespace functor {
|
|||||||
const std::array<int, 2>& padding_left, \
|
const std::array<int, 2>& padding_left, \
|
||||||
const std::array<int, 2>& padding_right, \
|
const std::array<int, 2>& padding_right, \
|
||||||
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
||||||
T padding_value); \
|
const T& padding_value); \
|
||||||
extern template struct PadInput<GPUDevice, T, int, 4>
|
extern template struct PadInput<GPUDevice, T, int, 4>
|
||||||
|
|
||||||
DECLARE_GPU_SPEC(float);
|
DECLARE_GPU_SPEC(float);
|
||||||
|
@ -194,7 +194,7 @@ struct LaunchConvOp<GPUDevice, T, OpKernelContext> {
|
|||||||
functor::PadInput<GPUDevice, T, int, 5>()(
|
functor::PadInput<GPUDevice, T, int, 5>()(
|
||||||
ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 5>()),
|
ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 5>()),
|
||||||
{{0, 0, 0}}, {{planes_odd, rows_odd, cols_odd}},
|
{{0, 0, 0}}, {{planes_odd, rows_odd, cols_odd}},
|
||||||
To32Bit(transformed_input.tensor<T, 5>()), data_format);
|
To32Bit(transformed_input.tensor<T, 5>()), data_format, T{});
|
||||||
input = transformed_input;
|
input = transformed_input;
|
||||||
in_rows = new_in_rows;
|
in_rows = new_in_rows;
|
||||||
in_cols = new_in_cols;
|
in_cols = new_in_cols;
|
||||||
@ -528,7 +528,7 @@ namespace functor {
|
|||||||
const std::array<int, 3>& padding_left, \
|
const std::array<int, 3>& padding_left, \
|
||||||
const std::array<int, 3>& padding_right, \
|
const std::array<int, 3>& padding_right, \
|
||||||
typename TTypes<T, 5, int>::Tensor out, TensorFormat format, \
|
typename TTypes<T, 5, int>::Tensor out, TensorFormat format, \
|
||||||
T padding_value); \
|
const T& padding_value); \
|
||||||
template <> \
|
template <> \
|
||||||
void NHWCToNCHW<GPUDevice, T, 5>::operator()( \
|
void NHWCToNCHW<GPUDevice, T, 5>::operator()( \
|
||||||
const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in, \
|
const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in, \
|
||||||
|
@ -531,7 +531,7 @@ struct LaunchFusedConv2DOp<GPUDevice, T> {
|
|||||||
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
|
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
|
||||||
{{static_cast<int>(input_pad_bottom),
|
{{static_cast<int>(input_pad_bottom),
|
||||||
static_cast<int>(input_pad_right)}},
|
static_cast<int>(input_pad_right)}},
|
||||||
To32Bit(transformed_input.tensor<T, 4>()), params.data_format);
|
To32Bit(transformed_input.tensor<T, 4>()), params.data_format, T{});
|
||||||
input = transformed_input;
|
input = transformed_input;
|
||||||
in_rows = new_in_rows;
|
in_rows = new_in_rows;
|
||||||
in_cols = new_in_cols;
|
in_cols = new_in_cols;
|
||||||
@ -831,7 +831,7 @@ class FusedConv2DOp : public OpKernel {
|
|||||||
const std::array<int, 2>& padding_left, \
|
const std::array<int, 2>& padding_left, \
|
||||||
const std::array<int, 2>& padding_right, \
|
const std::array<int, 2>& padding_right, \
|
||||||
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
||||||
T padding_value); \
|
const T& padding_value); \
|
||||||
extern template struct PadInput<GPUDevice, T, int, 4>
|
extern template struct PadInput<GPUDevice, T, int, 4>
|
||||||
|
|
||||||
// Registration of the GPU implementations.
|
// Registration of the GPU implementations.
|
||||||
|
@ -428,7 +428,7 @@ namespace functor {
|
|||||||
const std::array<int, 2>& padding_left, \
|
const std::array<int, 2>& padding_left, \
|
||||||
const std::array<int, 2>& padding_right, \
|
const std::array<int, 2>& padding_right, \
|
||||||
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
|
||||||
T padding_value); \
|
const T& padding_value); \
|
||||||
extern template struct PadInput<GPUDevice, T, int, 4>;
|
extern template struct PadInput<GPUDevice, T, int, 4>;
|
||||||
|
|
||||||
DECLARE_GPU_SPEC(float);
|
DECLARE_GPU_SPEC(float);
|
||||||
@ -742,7 +742,7 @@ void DnnPoolingGradOp<T>::Compute(
|
|||||||
{{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
|
{{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
|
||||||
{{static_cast<int>(-input_pad_bottom),
|
{{static_cast<int>(-input_pad_bottom),
|
||||||
static_cast<int>(-input_pad_right)}},
|
static_cast<int>(-input_pad_right)}},
|
||||||
To32Bit(input_backprop->tensor<T, 4>()), data_format);
|
To32Bit(input_backprop->tensor<T, 4>()), data_format, T{});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user