commit 221282c169
Author: TensorFlower Gardener
Date:   2020-09-30 11:21:30 -07:00

    Merge pull request #42944 from ROCmSoftwarePlatform:google_upstream_rocm_fix_200902_2

    PiperOrigin-RevId: 334635468
    Change-Id: I06271b0c24b68e87913262fe9378f1ba2650a69b

2 changed files with 38 additions and 27 deletions

@@ -1212,9 +1212,11 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
                                data_format_, tensor_in, out_shape,
                                propagate_nans_);
     } else {
+#if !defined(TENSORFLOW_USE_ROCM)
       OP_REQUIRES(context, padding_ != EXPLICIT,
                   errors::Unimplemented("Explicit padding is not supported ",
                                         "when CUDNN is not enabled."));
+#endif
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
       if (is_int8x4) {
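Note on this hunk: with the new guard, ROCm builds no longer reject EXPLICIT padding on the non-cuDNN path, while CUDA builds keep the old Unimplemented error. A minimal standalone sketch of the same compile-time pattern; the Padding enum and SupportsExplicitPadding helper are hypothetical illustrations, not TensorFlow API:

    #include <cstdio>

    enum Padding { VALID, SAME, EXPLICIT };  // mirrors the padding kinds in the diff

    // Hypothetical helper showing the guard's effect per build flavor.
    bool SupportsExplicitPadding(Padding p) {
    #if !defined(TENSORFLOW_USE_ROCM)
      // CUDA build without cuDNN: explicit padding is rejected up front.
      if (p == EXPLICIT) return false;
    #endif
      // ROCm build: the check is compiled out; the request falls through
      // to the generic GPU pooling kernel.
      return true;
    }

    int main() {
      std::printf("EXPLICIT ok: %s\n",
                  SupportsExplicitPadding(EXPLICIT) ? "yes" : "no");
      return 0;
    }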

@@ -463,6 +463,8 @@ void DnnPoolingGradOp<T>::Compute(
     return;
   }
 
+  TensorFormat transformed_input_data_format = data_format;
+
 #if CUDNN_VERSION < 7300
   /// For now, cudnn does not support NHWC format, so we need to convert it
   /// to NCHW before calling cudnn. We need to get rid of this once it is done
@@ -516,6 +518,7 @@ void DnnPoolingGradOp<T>::Compute(
       functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<Device>(),
                                              tensor_in->tensor<T, 4>(),
                                              transformed_input.tensor<T, 4>());
+      transformed_input_data_format = FORMAT_NCHW;
     }
     if (tensor_out) {
       // For AvgPoolGrad, the original output tensor is not necessary. However,
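Note on the two hunks above: transformed_input_data_format tracks the layout that transformed_input actually holds. It starts as the op's data_format and flips to FORMAT_NCHW when the pre-cuDNN-7.3 path converts NHWC inputs; the later hunks allocate temporaries using the tracked format instead of the original one. A standalone sketch of why that matters, with simplified stand-ins for TensorFlow's TensorFormat and ShapeFromFormat (not the real signatures):

    #include <array>
    #include <cstdio>

    enum TensorFormat { FORMAT_NHWC, FORMAT_NCHW };  // simplified stand-in

    // Simplified stand-in for ShapeFromFormat: the same logical
    // (batch, rows, cols, depth) yields a different dimension order per format.
    std::array<int, 4> ShapeFromFormat(TensorFormat f, int n, int h, int w, int c) {
      if (f == FORMAT_NHWC) return {n, h, w, c};
      return {n, c, h, w};
    }

    int main() {
      // After an NHWC->NCHW conversion, temporaries must be allocated with the
      // transformed format, or the buffer shape will not match the data layout.
      auto nhwc = ShapeFromFormat(FORMAT_NHWC, 2, 5, 7, 3);
      auto nchw = ShapeFromFormat(FORMAT_NCHW, 2, 5, 7, 3);
      std::printf("NHWC: %d %d %d %d\n", nhwc[0], nhwc[1], nhwc[2], nhwc[3]);
      std::printf("NCHW: %d %d %d %d\n", nchw[0], nchw[1], nchw[2], nchw[3]);
      return 0;
    }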
@@ -577,6 +580,8 @@ void DnnPoolingGradOp<T>::Compute(
   int64 input_pad_left = 0;
   int64 input_pad_right = 0;
 
+  Tensor transformed_and_padded_input_backprop;
+
   if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
                               params.pad_left != params.pad_right)) {
     // Pad the input in the same way we did during the forward pass, so that
@@ -588,7 +593,6 @@ void DnnPoolingGradOp<T>::Compute(
         std::min(params.pad_left, params.pad_right);
 
     Tensor padded_input;
-    Tensor padded_input_backprop;
     const int64 padding_rows_diff =
         std::abs(params.pad_top - params.pad_bottom);
     const int64 padding_cols_diff =
@@ -607,18 +611,18 @@ void DnnPoolingGradOp<T>::Compute(
             << " stride_rows" << params.row_stride;
 
     OP_REQUIRES_OK(
-        context,
-        context->allocate_temp(DataTypeToEnum<T>::value,
-                               ShapeFromFormat(data_format, batch_size,
-                                               new_in_rows, new_in_cols, depth),
-                               &padded_input));
+        context, context->allocate_temp(
+                     DataTypeToEnum<T>::value,
+                     ShapeFromFormat(transformed_input_data_format, batch_size,
+                                     new_in_rows, new_in_cols, depth),
+                     &padded_input));
     OP_REQUIRES_OK(
-        context,
-        context->allocate_temp(DataTypeToEnum<T>::value,
-                               ShapeFromFormat(data_format, batch_size,
-                                               new_in_rows, new_in_cols, depth),
-                               &transformed_input_backprop));
+        context, context->allocate_temp(
+                     DataTypeToEnum<T>::value,
+                     ShapeFromFormat(transformed_input_data_format, batch_size,
+                                     new_in_rows, new_in_cols, depth),
+                     &transformed_and_padded_input_backprop));
 
     input_pad_top = params.pad_top - common_padding_rows;
     input_pad_bottom = params.pad_bottom - common_padding_rows;
@@ -644,7 +648,8 @@ void DnnPoolingGradOp<T>::Compute(
           To32Bit(const_transformed_input.tensor<T, 4>()),
           static_cast<int>(input_pad_top), static_cast<int>(input_pad_bottom),
           static_cast<int>(input_pad_left), static_cast<int>(input_pad_right),
-          To32Bit(padded_input.tensor<T, 4>()), data_format));
+          To32Bit(padded_input.tensor<T, 4>()),
+          transformed_input_data_format));
 
       transformed_input = padded_input;
@@ -654,6 +659,8 @@ void DnnPoolingGradOp<T>::Compute(
             << " horizontal padding set to: " << horizontal_padding;
     tensor_in_rows = new_in_rows;
     tensor_in_cols = new_in_cols;
+  } else {
+    transformed_and_padded_input_backprop = transformed_input_backprop;
   }
 
   /// Get ready to call cudnn
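Note on the else branch added above: when no asymmetric explicit padding is in play, transformed_and_padded_input_backprop simply aliases transformed_input_backprop. In TensorFlow, Tensor assignment is a cheap, reference-counted buffer share rather than a deep copy, so both names refer to one allocation and the cuDNN call writes through either. A simplified sketch of that aliasing; FakeTensor is a hypothetical stand-in for tensorflow::Tensor:

    #include <cstdio>
    #include <memory>
    #include <vector>

    struct FakeTensor {  // hypothetical stand-in for tensorflow::Tensor
      std::shared_ptr<std::vector<float>> buf;
    };

    int main() {
      FakeTensor transformed_input_backprop{
          std::make_shared<std::vector<float>>(8, 0.f)};
      FakeTensor transformed_and_padded_input_backprop;

      // Mirrors the diff's else branch: assignment shares the buffer.
      transformed_and_padded_input_backprop = transformed_input_backprop;

      // Writes through one handle are visible through the other.
      (*transformed_and_padded_input_backprop.buf)[0] = 1.f;
      std::printf("%f\n", (*transformed_input_backprop.buf)[0]);  // 1.000000
      return 0;
    }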
@@ -690,9 +697,9 @@ void DnnPoolingGradOp<T>::Compute(
   auto output_backprop_data =
       AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
                      transformed_output_backprop.template flat<T>().size());
-  auto input_backprop_data =
-      AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
-                     transformed_input_backprop.template flat<T>().size());
+  auto input_backprop_data = AsDeviceMemory(
+      transformed_and_padded_input_backprop.template flat<T>().data(),
+      transformed_and_padded_input_backprop.template flat<T>().size());
 
   auto* stream = context->op_device_context()->stream();
   OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
@@ -722,6 +729,20 @@ void DnnPoolingGradOp<T>::Compute(
   OP_REQUIRES(context, status,
               errors::Internal("dnn PoolBackward launch failed"));
 
+  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
+                              params.pad_left != params.pad_right)) {
+    // Remove the padding that was added to the input shape above.
+    functor::PadInput<GPUDevice, T, int, 4>()(
+        context->eigen_device<GPUDevice>(),
+        To32Bit(const_cast<const Tensor&>(transformed_and_padded_input_backprop)
+                    .tensor<T, 4>()),
+        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
+        {{static_cast<int>(-input_pad_bottom),
+          static_cast<int>(-input_pad_right)}},
+        To32Bit(transformed_input_backprop.template tensor<T, 4>()),
+        transformed_input_data_format, T{});
+  }
+
 #if CUDNN_VERSION < 7300
   if (data_format == FORMAT_NHWC) {
     /// Transform the output data from NCHW back to NHWC.
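Note on this hunk (paired with the removal in the next one): the de-padding step now runs before the pre-cuDNN-7.3 NCHW-to-NHWC conversion and uses the tracked transformed_input_data_format, where previously it ran after the conversion using the original data_format, which could mismatch the buffer's actual layout. PadInput with negated amounts acts as a crop, copying only the interior of the padded gradient. A simplified 2-D sketch of that crop; CropPad is a hypothetical helper, not TensorFlow's functor::PadInput:

    #include <cstdio>
    #include <vector>

    // Hypothetical helper: copies src minus (top, bottom, left, right) rows and
    // columns into dst, i.e. a crop expressed as negative padding.
    void CropPad(const std::vector<float>& src, int rows, int cols, int top,
                 int bottom, int left, int right, std::vector<float>* dst) {
      for (int r = top; r < rows - bottom; ++r)
        for (int c = left; c < cols - right; ++c)
          dst->push_back(src[r * cols + c]);
    }

    int main() {
      // A 4x4 padded gradient; strip one row/column of explicit padding per side.
      std::vector<float> padded(16);
      for (int i = 0; i < 16; ++i) padded[i] = static_cast<float>(i);
      std::vector<float> cropped;
      CropPad(padded, 4, 4, /*top=*/1, /*bottom=*/1, /*left=*/1, /*right=*/1,
              &cropped);
      for (float v : cropped) std::printf("%g ", v);  // prints: 5 6 9 10
      std::printf("\n");
      return 0;
    }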
@@ -732,18 +753,6 @@ void DnnPoolingGradOp<T>::Compute(
                                  input_backprop->tensor<T, 4>());
   }
 #endif  // CUDNN_VERSION < 7300
-  if (padding == EXPLICIT && (params.pad_top != params.pad_bottom ||
-                              params.pad_left != params.pad_right)) {
-    // Remove the padding that was added to the input shape above.
-    functor::PadInput<GPUDevice, T, int, 4>()(
-        context->eigen_device<GPUDevice>(),
-        To32Bit(const_cast<const Tensor&>(transformed_input_backprop)
-                    .tensor<T, 4>()),
-        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
-        {{static_cast<int>(-input_pad_bottom),
-          static_cast<int>(-input_pad_right)}},
-        To32Bit(input_backprop->tensor<T, 4>()), data_format, T{});
-  }
 }
 
 #define DEFINE_DNN_OPS(T) \