diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
index d65cca6dd36..b9a8c977e11 100644
--- a/tensorflow/core/kernels/conv_2d.h
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -309,6 +309,28 @@ struct TransformDepth {
   }
 };
 
+// Note on the use of const reference for the "padding_value" argument
+//
+// In the ROCm TF build,
+// ++ the call(s) to the functor are in the files (conv_*.cc) that are compiled
+//    by the "CPU" compiler, while the
+// ++ the GPUDevice specific template instantiations are in the files that are
+//     compiled by the "GPU" compiler.
+//
+// For T == Eigen::half, the value of the "padding_value" argument (when it was
+// pass-by-value) was getting corrupted, leading to regressions in the
+// convolution unit tests.
+//
+// I do not understand the exact reason for the this, but based on similar past
+// issues, it is likely due to a combination of
+// ++ an ABI incompatibility between the "old" CPU compiler (gcc 5.4 for
+//    Ubuntu 16.04, gcc 7.5 for Ubuntu 18.04) and the "new" ROCm GPU compiler
+//    (hipclang which is based on latest clang), AND
+// ++ Eigen::half having the same size but different internals on the CPU and
+//    GPU sides (unsigned short on CPU, union {unsigned short, _Float16} on GPU
+//
+// Changing the "padding value" argument to be a const reference type seems to
+// suppress the bug
 template <typename Device, typename T, typename IndexType, int NDIMS>
 struct PadInput {
   void operator()(const Device& d,
@@ -316,7 +338,7 @@ struct PadInput {
                   const std::array<int, NDIMS - 2>& padding_left,
                   const std::array<int, NDIMS - 2>& padding_right,
                   typename TTypes<T, NDIMS, IndexType>::Tensor out,
-                  TensorFormat format, T padding_value = T{}) {
+                  TensorFormat format, const T& padding_value) {
     Eigen::array<Eigen::IndexPair<IndexType>, NDIMS> padding;
     padding[GetTensorDimIndex<NDIMS - 2>(format, 'N')] = {0, 0};
     for (int i = 0; i < NDIMS - 2; ++i) {
diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h
index 31aa04247fa..a235d977e91 100644
--- a/tensorflow/core/kernels/conv_2d_gpu.h
+++ b/tensorflow/core/kernels/conv_2d_gpu.h
@@ -568,7 +568,7 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
                   const std::array<int, NDIMS - 2>& padding_left,
                   const std::array<int, NDIMS - 2>& padding_right,
                   typename TTypes<T, NDIMS, int>::Tensor out,
-                  TensorFormat format, T padding_value = T{}) {
+                  TensorFormat format, const T& padding_value) {
     GpuLaunchConfig config = GetGpuLaunchConfig(out.size(), d);
     Dimension<NDIMS> input_dims;
     for (int i = 0; i < NDIMS; ++i) {
@@ -579,6 +579,8 @@ struct PadInput<GPUDevice, T, int, NDIMS> {
       output_dims[i] = out.dimension(i);
     }
 
+    VLOG(-1) << static_cast<float>(padding_value);
+
     const Dimension<NDIMS - 2> padding_left_dim(padding_left);
 
     if (format == FORMAT_NHWC) {
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 840d2c13c77..94078264f24 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -812,7 +812,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
         {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
         {{static_cast<int>(input_pad_bottom),
           static_cast<int>(input_pad_right)}},
-        To32Bit(compatible_input.tensor<T, 4>()), data_format);
+        To32Bit(compatible_input.tensor<T, 4>()), data_format, T{});
   } else {
     compatible_input = input;
   }
@@ -1148,7 +1148,7 @@ namespace functor {
       const std::array<int, 2>& padding_left,                           \
       const std::array<int, 2>& padding_right,                          \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
-      T padding_value);                                                 \
+      const T& padding_value);                                          \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index c1dadcf767c..0cb701494e0 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -1313,8 +1313,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
         {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
         {{static_cast<int>(-input_pad_bottom),
           static_cast<int>(-input_pad_right)}},
-        To32Bit(in_backprop_remove_padding.tensor<T, 4>()),
-        compute_data_format);
+        To32Bit(in_backprop_remove_padding.tensor<T, 4>()), compute_data_format,
+        T{});
 
     pre_transformed_in_backprop = in_backprop_remove_padding;
   }
@@ -1346,7 +1346,7 @@ namespace functor {
       const std::array<int, 2>& padding_left,                           \
       const std::array<int, 2>& padding_right,                          \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
-      T padding_value);                                                 \
+      const T& padding_value);                                          \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 7c61dc92604..f57456da854 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -1083,7 +1083,7 @@ namespace functor {
       const std::array<int, 3>& padding_left,                         \
       const std::array<int, 3>& padding_right,                        \
       typename TTypes<T, 5, int>::Tensor out, TensorFormat format,    \
-      T padding_value);
+      const T& padding_value);
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
@@ -1560,7 +1560,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
                       .tensor<T, 5>()),
           {{0, 0, 0}}, {{-planes_odd, -rows_odd, -cols_odd}},
           To32Bit(in_backprop_remove_padding.tensor<T, 5>()),
-          compute_data_format);
+          compute_data_format, T{});
 
       pre_transformed_in_backprop = in_backprop_remove_padding;
     }
@@ -1765,7 +1765,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
           context->template eigen_device<GPUDevice>(),
           To32Bit(input.tensor<T, 5>()), {{0, 0, 0}},
           {{planes_odd, rows_odd, cols_odd}},
-          To32Bit(compatible_input.tensor<T, 5>()), data_format_);
+          To32Bit(compatible_input.tensor<T, 5>()), data_format_, T{});
     } else {
       compatible_input = input;
     }
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index b8c2671e7d2..e4c3ab6a642 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -827,7 +827,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
         {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
         {{static_cast<int>(input_pad_bottom),
           static_cast<int>(input_pad_right)}},
-        To32Bit(transformed_input.tensor<T, 4>()), data_format);
+        To32Bit(transformed_input.tensor<T, 4>()), data_format, T{});
 
     input = transformed_input;
     in_rows = new_in_rows;
@@ -1184,7 +1184,7 @@ namespace functor {
       const std::array<int, 2>& padding_left,                               \
       const std::array<int, 2>& padding_right,                              \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format,     \
-      T padding_value);                                                     \
+      const T& padding_value);                                              \
   extern template struct PadInput<GPUDevice, T, int, 4>
 
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 660f4b6febc..d4e551a013c 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -194,7 +194,7 @@ struct LaunchConvOp<GPUDevice, T, OpKernelContext> {
         functor::PadInput<GPUDevice, T, int, 5>()(
             ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 5>()),
             {{0, 0, 0}}, {{planes_odd, rows_odd, cols_odd}},
-            To32Bit(transformed_input.tensor<T, 5>()), data_format);
+            To32Bit(transformed_input.tensor<T, 5>()), data_format, T{});
         input = transformed_input;
         in_rows = new_in_rows;
         in_cols = new_in_cols;
@@ -540,7 +540,7 @@ namespace functor {
       const std::array<int, 3>& padding_left,                         \
       const std::array<int, 3>& padding_right,                        \
       typename TTypes<T, 5, int>::Tensor out, TensorFormat format,    \
-      T padding_value);                                               \
+      const T& padding_value);                                        \
   template <>                                                         \
   void NHWCToNCHW<GPUDevice, T, 5>::operator()(                       \
       const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,      \
diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h
index 5011e8ba7a1..0fe3131e63c 100644
--- a/tensorflow/core/kernels/conv_ops_fused_impl.h
+++ b/tensorflow/core/kernels/conv_ops_fused_impl.h
@@ -531,7 +531,7 @@ struct LaunchFusedConv2DOp<GPUDevice, T> {
           {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
           {{static_cast<int>(input_pad_bottom),
             static_cast<int>(input_pad_right)}},
-          To32Bit(transformed_input.tensor<T, 4>()), params.data_format);
+          To32Bit(transformed_input.tensor<T, 4>()), params.data_format, T{});
       input = transformed_input;
       in_rows = new_in_rows;
       in_cols = new_in_cols;
@@ -837,7 +837,7 @@ class FusedConv2DOp : public OpKernel {
       const std::array<int, 2>& padding_left,                           \
       const std::array<int, 2>& padding_right,                          \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
-      T padding_value);                                                 \
+      const T& padding_value);                                          \
   extern template struct PadInput<GPUDevice, T, int, 4>
 
 // Registration of the GPU implementations.
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index ca2f0ecfe21..bbe03dd39cb 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -428,7 +428,7 @@ namespace functor {
       const std::array<int, 2>& padding_left,                           \
       const std::array<int, 2>& padding_right,                          \
       typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \
-      T padding_value);                                                 \
+      const T& padding_value);                                          \
   extern template struct PadInput<GPUDevice, T, int, 4>;
 
 DECLARE_GPU_SPEC(float);
@@ -742,7 +742,7 @@ void DnnPoolingGradOp<T>::Compute(
         {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
         {{static_cast<int>(-input_pad_bottom),
           static_cast<int>(-input_pad_right)}},
-        To32Bit(input_backprop->tensor<T, 4>()), data_format);
+        To32Bit(input_backprop->tensor<T, 4>()), data_format, T{});
   }
 }