diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 8acae6f00c6..e941a2bb036 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4366,7 +4366,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "lrn_op",
     prefix = "lrn_op",
-    deps = NN_DEPS,
+    deps = NN_DEPS + if_rocm([":conv_ops_gpu_hdrs"]),
 )
 
 tf_kernel_library(
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index a1997579037..afe94edd4ba 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -36,9 +36,17 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda.h"
+#endif
+
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/gpu_utils.h"
+#if TENSORFLOW_USE_ROCM
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#endif
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/util/stream_executor_util.h"
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 namespace tensorflow {
 
@@ -164,7 +172,7 @@ struct LaunchLRN<CPUDevice, T> {
   T beta_;
 };
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 template <typename T>
 struct LaunchLRN<GPUDevice, T> {
@@ -173,6 +181,7 @@ struct LaunchLRN<GPUDevice, T> {
 
   void launch(OpKernelContext* context, OpKernel* kernel, const Tensor& in,
               Tensor* output) {
+#if GOOGLE_CUDA
     OP_REQUIRES(
         context, beta_ >= 0.01,
         errors::InvalidArgument("cuDNN requires beta >= 0.01, got: ", beta_));
@@ -217,6 +226,71 @@ struct LaunchLRN<GPUDevice, T> {
             .ok();
     OP_REQUIRES(context, status,
                 errors::Internal("NormalizeWithDimensions launch failed"));
+#elif TENSORFLOW_USE_ROCM
+    // The input/output tensors are NHWC; convert to NCHW because it is the
+    // only layout MIOpen supports for now.
+
+    // Cast to platform-specific int to avoid conversion warnings.
+    const int batch = static_cast<int>(in.dim_size(0));
+    const int rows = static_cast<int>(in.dim_size(1));
+    const int cols = static_cast<int>(in.dim_size(2));
+    const int depth = static_cast<int>(in.dim_size(3));
+
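+    // Allocate an NCHW-shaped temporary and transpose the NHWC input into it.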
+    Tensor transformed_input;
+    OP_REQUIRES_OK(context,
+                   context->allocate_temp(
+                       DataTypeToEnum<T>::value,
+                       ShapeFromFormat(FORMAT_NCHW, in.shape(), FORMAT_NHWC),
+                       &transformed_input));
+    functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<GPUDevice>(),
+                                           in.tensor<T, 4>(),
+                                           transformed_input.tensor<T, 4>());
+
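+    // Allocate an NCHW-shaped temporary to receive the MIOpen output.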
+    Tensor transformed_output;
+    OP_REQUIRES_OK(
+        context, context->allocate_temp(
+                     DataTypeToEnum<T>::value,
+                     ShapeFromFormat(FORMAT_NCHW, output->shape(), FORMAT_NHWC),
+                     &transformed_output));
+
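+    // Describe the NCHW tensor geometry and the LRN parameters for the
+    // StreamExecutor DNN interface.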
+    perftools::gputools::dnn::BatchDescriptor dimensions_desc;
+    dimensions_desc.set_count(batch)
+        .set_height(rows)
+        .set_width(cols)
+        .set_feature_map_count(depth)
+        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+    perftools::gputools::dnn::NormalizeDescriptor normalize_desc;
+    normalize_desc.set_bias(bias_)
+        .set_range(depth_radius_)
+        .set_alpha(alpha_)
+        .set_beta(beta_);
+
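+    // Wrap the transformed tensors as device memory for the stream call.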
+    auto input_data =
+        AsDeviceMemory(transformed_input.template flat<T>().data(),
+                       transformed_input.template flat<T>().size());
+    auto output_data =
+        AsDeviceMemory(transformed_output.template flat<T>().data(),
+                       transformed_output.template flat<T>().size());
+
+    auto* stream = context->op_device_context()->stream();
+    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
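+    // Launch the forward normalization and check that the enqueue succeeded.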
+    bool status =
+        stream
+            ->ThenNormalizeWithDimensions(normalize_desc, dimensions_desc,
+                                          input_data, &output_data)
+            .ok();
+    OP_REQUIRES(context, status,
+                errors::Internal("NormalizeWithDimensions launch failed"));
+
+    // Convert the result back to NHWC once the MIOpen kernel finishes.
+    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+    functor::NCHWToNHWC<GPUDevice, T, 4>()(
+        context->eigen_device<GPUDevice>(),
+        toConstTensor(transformed_output).template tensor<T, 4>(),
+        output->tensor<T, 4>());
+#endif
   }
 
   int depth_radius_;
@@ -225,7 +299,7 @@ struct LaunchLRN<GPUDevice, T> {
   T beta_;
 };
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 template <typename Device, typename T>
 class LRNOp : public OpKernel {
@@ -292,7 +366,7 @@ TF_CALL_half(REGISTER_CPU);
 
 #undef REGISTER_CPU
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define REGISTER_GPU(T)                                      \
   REGISTER_KERNEL_BUILDER(                                   \
@@ -302,7 +376,7 @@ TF_CALL_float(REGISTER_GPU);
 
 #undef REGISTER_GPU
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #if !defined(IS_MOBILE_PLATFORM)
 
@@ -390,7 +464,7 @@ struct LaunchLRNGrad<CPUDevice, T> {
   T alpha_beta_2_;
 };
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 template <typename T>
 struct LaunchLRNGrad<GPUDevice, T> {
@@ -400,6 +474,7 @@ struct LaunchLRNGrad<GPUDevice, T> {
   void launch(OpKernelContext* context, OpKernel* kernel,
               const Tensor& in_grads, const Tensor& in_image,
               const Tensor& out_image, Tensor* output) {
+#if GOOGLE_CUDA
     OP_REQUIRES(
         context, beta_ >= 0.01,
         errors::InvalidArgument("cuDNN requires beta >= 0.01, got: ", beta_));
@@ -447,6 +522,105 @@ struct LaunchLRNGrad<GPUDevice, T> {
     OP_REQUIRES(
         context, status,
         errors::Internal("NormalizeBackwardWithDimensions launch failed"));
+#elif TENSORFLOW_USE_ROCM
+    // The input/output tensors are NHWC; convert to NCHW because it is the
+    // only layout MIOpen supports for now.
+    const int64 batch = in_grads.dim_size(0);
+    const int64 rows = in_grads.dim_size(1);
+    const int64 cols = in_grads.dim_size(2);
+    const int64 depth = in_grads.dim_size(3);
+
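+    // Transpose the NHWC gradients, input image, and output image into
+    // NCHW temporaries.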
+    Tensor transformed_in_grads;
+    OP_REQUIRES_OK(context, context->allocate_temp(
+                                DataTypeToEnum<T>::value,
+                                ShapeFromFormat(FORMAT_NCHW, in_grads.shape(),
+                                                FORMAT_NHWC),
+                                &transformed_in_grads));
+    functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<GPUDevice>(),
+                                           in_grads.tensor<T, 4>(),
+                                           transformed_in_grads.tensor<T, 4>());
+
+    Tensor transformed_in_image;
+    OP_REQUIRES_OK(context, context->allocate_temp(
+                                DataTypeToEnum<T>::value,
+                                ShapeFromFormat(FORMAT_NCHW, in_image.shape(),
+                                                FORMAT_NHWC),
+                                &transformed_in_image));
+    functor::NHWCToNCHW<GPUDevice, T, 4>()(context->eigen_device<GPUDevice>(),
+                                           in_image.tensor<T, 4>(),
+                                           transformed_in_image.tensor<T, 4>());
+
+    Tensor transformed_out_image;
+    OP_REQUIRES_OK(context, context->allocate_temp(
+                                DataTypeToEnum<T>::value,
+                                ShapeFromFormat(FORMAT_NCHW, out_image.shape(),
+                                                FORMAT_NHWC),
+                                &transformed_out_image));
+    functor::NHWCToNCHW<GPUDevice, T, 4>()(
+        context->eigen_device<GPUDevice>(), out_image.tensor<T, 4>(),
+        transformed_out_image.tensor<T, 4>());
+
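+    // Allocate an NCHW-shaped temporary for the backward output.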
+    Tensor transformed_output;
+    OP_REQUIRES_OK(
+        context, context->allocate_temp(
+                     DataTypeToEnum<T>::value,
+                     ShapeFromFormat(FORMAT_NCHW, output->shape(), FORMAT_NHWC),
+                     &transformed_output));
+
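+    // Describe the NCHW tensor geometry and the LRN parameters for the
+    // StreamExecutor DNN interface.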
+    perftools::gputools::dnn::BatchDescriptor dimensions_desc;
+    dimensions_desc.set_count(batch)
+        .set_height(rows)
+        .set_width(cols)
+        .set_feature_map_count(depth)
+        .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+    perftools::gputools::dnn::NormalizeDescriptor normalize_desc;
+    normalize_desc.set_bias(bias_)
+        .set_range(depth_radius_)
+        .set_alpha(alpha_)
+        .set_beta(beta_);
+
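+    // Wrap the transformed tensors as device memory for the stream call.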
+    auto input_grads_data =
+        AsDeviceMemory(transformed_in_grads.template flat<T>().data(),
+                       transformed_in_grads.template flat<T>().size());
+    auto input_image_data =
+        AsDeviceMemory(transformed_in_image.template flat<T>().data(),
+                       transformed_in_image.template flat<T>().size());
+    auto output_image_data =
+        AsDeviceMemory(transformed_out_image.template flat<T>().data(),
+                       transformed_out_image.template flat<T>().size());
+    auto output_grads_data =
+        AsDeviceMemory(transformed_output.template flat<T>().data(),
+                       transformed_output.template flat<T>().size());
+
+    auto* stream = context->op_device_context()->stream();
+    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
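+    // Let MIOpen allocate scratch space, capped by the cuDNN workspace-limit
+    // environment variable.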
+    static int64 NormalizeBackwardScratchSize = GetDnnWorkspaceLimit(
+        // The default value is in bytes despite the name of the environment
+        // variable.
+        "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
+    );
+
+    DnnScratchAllocator scratch_allocator(NormalizeBackwardScratchSize,
+                                          context);
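+    // Launch the backward normalization and check that the enqueue succeeded.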
+    bool status = stream
+                      ->ThenNormalizeBackwardWithDimensions(
+                          normalize_desc, dimensions_desc, input_image_data,
+                          output_image_data, input_grads_data,
+                          &output_grads_data, &scratch_allocator)
+                      .ok();
+    OP_REQUIRES(
+        context, status,
+        errors::Internal("NormalizeBackwardWithDimensions launch failed"));
+
+    // Convert the result back to NHWC once the MIOpen kernel finishes.
+    auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+    functor::NCHWToNHWC<GPUDevice, T, 4>()(
+        context->eigen_device<GPUDevice>(),
+        toConstTensor(transformed_output).template tensor<T, 4>(),
+        output->tensor<T, 4>());
+#endif
   }
 
   int depth_radius_;
@@ -455,7 +629,7 @@ struct LaunchLRNGrad<GPUDevice, T> {
   T beta_;
 };
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 template <typename Device, typename T>
 class LRNGradOp : public OpKernel {
@@ -524,7 +698,7 @@ TF_CALL_half(REGISTER_CPU);
 
 #undef REGISTER_CPU
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define REGISTER_GPU(T)                                          \
   REGISTER_KERNEL_BUILDER(                                       \
@@ -534,7 +708,7 @@ TF_CALL_float(REGISTER_GPU);
 
 #undef REGISTER_GPU
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #endif  // !defined(IS_MOBILE_PLATFORM)