diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index b16d3c7270f..a923df5c477 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -301,7 +301,6 @@ class Conv2DBackpropFilterOp : public OpKernel {
                                               /*num_dims=*/4, data_format_));
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
-    use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
 
     if (std::is_same<Device, CPUDevice>::value) {
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 86090864ddb..158f93fdec1 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -426,7 +426,6 @@ class Conv2DBackpropInputOp : public OpKernel {
                                               /*num_dims=*/4, data_format_));
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
-    use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
 
     if (std::is_same<Device, CPUDevice>::value ||
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 8db796c216b..ca2abce0b15 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -512,7 +512,6 @@ class Conv2DOp : public BinaryOp {
     OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));
 
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
-    use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
   }
 
diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h
index fb31fc14a7c..f838d05decf 100644
--- a/tensorflow/core/kernels/conv_ops_fused_impl.h
+++ b/tensorflow/core/kernels/conv_ops_fused_impl.h
@@ -670,7 +670,6 @@ class FusedConv2DOp : public OpKernel {
     OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));
 
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
-    use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
 
     using FCT = FusedComputationType;
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index b809e1d1065..9a613c1d845 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -579,8 +579,6 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                               /*num_dims=*/4, data_format_));
 
-    // For in_depth == 1 and grouped convolutions.
-    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
     cudnn_use_autotune_ = CudnnUseAutotune();
     dtype_ = DataTypeToEnum<T>::value;
 #if CUDNN_VERSION >= 8000
@@ -638,13 +636,13 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
 
     // If in_depth==1, this operation is just a standard convolution.
     // Depthwise convolution is a special case of cuDNN's grouped convolution.
-    bool use_cudnn =
-        use_cudnn_ && (in_depth == 1 ||
-                       (use_cudnn_grouped_conv_ &&
-                        IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
-                                                   /*filter_cols=*/filter_cols,
-                                                   /*in_depth=*/in_depth,
-                                                   /*out_depth=*/out_depth)));
+    bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
+                     (in_depth == 1 ||
+                      (use_cudnn_grouped_conv_ &&
+                       IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
+                                                  /*filter_cols=*/filter_cols,
+                                                  /*in_depth=*/in_depth,
+                                                  /*out_depth=*/out_depth)));
 
     VLOG(2) << "DepthwiseConv2dNativeBackpropInput: "
             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
@@ -674,7 +672,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
                       "Failed to reshape filter tensor for grouped convolution."));
       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
       // conv is supported.
-      launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop,
+      launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
                 stride_, stride_, padding_, explicit_paddings_, in_backprop,
                 data_format_);
@@ -701,7 +699,6 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
 
   // For in_depth == 1 and grouped convolutions.
   LaunchConv2DBackpropInputOp<Device, T> launcher_;
-  bool use_cudnn_;
   bool cudnn_use_autotune_;
   DataType dtype_;
 
@@ -1085,8 +1082,6 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                               /*num_dims=*/4, data_format_));
 
-    // For in_depth == 1 and grouped convolutions.
-    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
     cudnn_use_autotune_ = CudnnUseAutotune();
 
     if (std::is_same<T, Eigen::half>::value) {
@@ -1138,13 +1133,13 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
 
     // If in_depth==1, this operation is just a standard convolution.
     // Depthwise convolution is a special case of cuDNN's grouped convolution.
-    bool use_cudnn =
-        use_cudnn_ && (in_depth == 1 ||
-                       (use_cudnn_grouped_conv_ &&
-                        IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
-                                                   /*filter_cols=*/filter_cols,
-                                                   /*in_depth=*/in_depth,
-                                                   /*out_depth=*/out_depth)));
+    bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
+                     (in_depth == 1 ||
+                      (use_cudnn_grouped_conv_ &&
+                       IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
+                                                  /*filter_cols=*/filter_cols,
+                                                  /*in_depth=*/in_depth,
+                                                  /*out_depth=*/out_depth)));
 
     VLOG(2) << "DepthwiseConv2dNativeBackpropFilter: "
             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
@@ -1175,7 +1170,8 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
 
       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
      // conv is supported.
-      launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
+      launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
+                input,
                 /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
                 padding_, explicit_paddings_, &reshaped_filter, data_format_);
       return;
@@ -1234,7 +1230,6 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
 
   // For in_depth == 1 and grouped convolutions.
   LaunchConv2DBackpropFilterOp<Device, T> launcher_;
-  bool use_cudnn_;
   bool cudnn_use_autotune_;
   DataType dtype_;
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index fe6a9e3e377..a03de90007f 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -298,8 +298,6 @@ class DepthwiseConv2dNativeOp : public BinaryOp {
     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                               /*num_dims=*/4, data_format_));
 
-    // For in_depth == 1 and grouped convolutions.
-    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
     cudnn_use_autotune_ = CudnnUseAutotune();
     dtype_ = DataTypeToEnum<T>::value;
 #if CUDNN_VERSION >= 8000
@@ -407,13 +405,13 @@ class DepthwiseConv2dNativeOp : public BinaryOp {
     // TODO(csigg): Have autotune decide if native is faster than cuDNN.
     // If in_depth==1, this operation is just a standard convolution.
     // Depthwise convolution is a special case of cuDNN's grouped convolution.
-    bool use_cudnn =
-        use_cudnn_ && (in_depth == 1 ||
-                       (use_cudnn_grouped_conv_ &&
-                        IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
-                                                   /*filter_cols=*/filter_cols,
-                                                   /*in_depth=*/in_depth,
-                                                   /*out_depth=*/out_depth)));
+    bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
+                     (in_depth == 1 ||
+                      (use_cudnn_grouped_conv_ &&
+                       IsCudnnSupportedFilterSize(/*filter_rows=*/filter_rows,
+                                                  /*filter_cols=*/filter_cols,
+                                                  /*in_depth=*/in_depth,
+                                                  /*out_depth=*/out_depth)));
 
     VLOG(2) << "DepthwiseConv2dNative: "
             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
@@ -443,7 +441,7 @@ class DepthwiseConv2dNativeOp : public BinaryOp {
                       "Failed to reshape filter tensor for grouped convolution."));
       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
       // conv is supported.
-      launcher_(context, use_cudnn_, cudnn_use_autotune_, input,
+      launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, input,
                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
                 stride_, stride_, padding_, explicit_paddings_, output,
                 data_format_);
@@ -485,7 +483,6 @@ class DepthwiseConv2dNativeOp : public BinaryOp {
 
   // For in_depth == 1 and grouped convolutions.
   LaunchConv2DOp<Device, T> launcher_;
-  bool use_cudnn_;
   bool cudnn_use_autotune_;
   DataType dtype_;
 
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 5652addd00a..36ab1d71671 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -321,30 +321,6 @@ class MaxPoolingGradOp : public OpKernel {
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
-template <typename T>
-static void MaxPoolingBackwardCustomKernel(
-    OpKernelContext* context, const std::vector<int32>& size,
-    const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
-    const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
-  Tensor* output = nullptr;
-  OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                              {0}, 0, tensor_in_shape, &output));
-
-  PoolParameters params{context, size,        stride,
-                        padding, FORMAT_NHWC, tensor_in_shape};
-  if (!context->status().ok()) {
-    return;
-  }
-
-  functor::MaxPoolBackwardNoMask<T>()(
-      tensor_in->flat<T>().data(), params.tensor_in_batch,
-      params.tensor_in_rows, params.tensor_in_cols, params.depth,
-      params.out_height, params.out_width, params.window_rows,
-      params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
-      params.pad_cols, out_backprop.flat<T>().data(), output->flat<T>().data(),
-      context->eigen_device<Eigen::GpuDevice>());
-}
-
 template <class T>
 class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
  public:
@@ -372,7 +348,6 @@ class MaxPoolingGradOp : public OpKernel {
     }
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
 
-    use_dnn_ = CanUseCudnn();
     TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                    &propagate_nans_));
   }
@@ -417,18 +392,10 @@ class MaxPoolingGradOp : public OpKernel {
     OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
-
-    if (use_dnn_) {
-      DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
-                                   ksize, stride, padding_, data_format_,
-                                   &tensor_in, &tensor_out, out_backprop,
-                                   output_shape, propagate_nans_);
-    } else {
-      CHECK(data_format_ == FORMAT_NHWC)
-          << "Non-Cudnn MaxPoolGrad only supports NHWC format";
-      MaxPoolingBackwardCustomKernel<T>(context, ksize, stride, padding_,
-                                        &tensor_in, out_backprop, output_shape);
-    }
+    DnnPoolingGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
+                                 stride, padding_, data_format_, &tensor_in,
+                                 &tensor_out, out_backprop, output_shape,
+                                 propagate_nans_);
   }
 
  private:
@@ -436,7 +403,6 @@ class MaxPoolingGradOp : public OpKernel {
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
-  bool use_dnn_;
   bool propagate_nans_;
 };
 
@@ -1139,7 +1105,6 @@ class MaxPoolingNoMaskOp : public OpKernel {
     OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
-    use_dnn_ = CanUseCudnn();
 
     TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                    &propagate_nans_));
@@ -1165,17 +1130,15 @@ class MaxPoolingNoMaskOp : public OpKernel {
                     "qint8 should be used with data_format NCHW_VECT_C."));
 
 #if CUDNN_VERSION >= 7300
-    if (use_dnn_) {
-      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
-                               stride_, padding_, data_format_, tensor_in,
-                               out_shape, propagate_nans_);
+    DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
+                             stride_, padding_, data_format_, tensor_in,
+                             out_shape, propagate_nans_);
 #else
     // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
-    if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
+    if (!is_int8x4 && data_format_ == FORMAT_NCHW) {
       DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
                                stride_, padding_, data_format_, tensor_in,
                                out_shape, propagate_nans_);
-#endif
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
@@ -1195,6 +1158,7 @@ class MaxPoolingNoMaskOp : public OpKernel {
                  << ") is not supported.";
       }
     }
+#endif
   }
 
  private:
@@ -1202,7 +1166,6 @@ class MaxPoolingNoMaskOp : public OpKernel {
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
-  bool use_dnn_;
   bool propagate_nans_;
 };
 
@@ -1232,7 +1195,6 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
                       "Pooling is not yet supported on the batch dimension."));
     }
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    use_dnn_ = CanUseCudnn();
     TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
                                    &propagate_nans_));
   }
@@ -1275,13 +1237,13 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
     TensorShape out_shape =
         ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                         params.out_width, params.depth);
-    if (use_dnn_ && data_format_ == FORMAT_NCHW) {
+    if (data_format_ == FORMAT_NCHW) {
       DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize,
                                stride, padding_, data_format_, tensor_in,
                                out_shape, propagate_nans_);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
-          << "Non-Cudnn MaxPool only supports NHWC format";
+          << "MaxPool only supports NCHW or NHWC format";
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
       LaunchMaxPoolingNoMask<GPUDevice, T>::launch(context, params, tensor_in,
@@ -1294,7 +1256,6 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
-  bool use_dnn_;
   bool propagate_nans_;
 };
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 2a4bb9a94fe..4de2f29aa30 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -180,48 +180,6 @@ __global__ void MaxPoolForwardNHWC(
   }
 }
 
-template <typename dtype>
-__global__ void MaxPoolBackwardNoMaskNHWC(
-    const int nthreads, const dtype* __restrict__ bottom_data, const int height,
-    const int width, const int channels, const int pooled_height,
-    const int pooled_width, const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
-    const dtype* __restrict__ top_diff, dtype* __restrict__ bottom_diff) {
-  GPU_1D_KERNEL_LOOP(index, nthreads) {
-    // First find out the index to the maximum, since we have no mask.
-    int n = index;
-    int c = n % channels;
-    n /= channels;
-    int wstart = (n % pooled_width) * stride_w - pad_l;
-    n /= pooled_width;
-    int hstart = (n % pooled_height) * stride_h - pad_t;
-    n /= pooled_height;
-    int hend = min(hstart + kernel_h, height);
-    int wend = min(wstart + kernel_w, width);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    dtype maxval = Eigen::NumTraits<dtype>::lowest();
-    int maxidx = -1;
-    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        int idx = (h * width + w) * channels + c;
-        if (bottom_data_n[idx] > maxval) {
-          maxidx = idx;
-          maxval = bottom_data_n[idx];
-        }
-      }
-    }
-
-    // Atomically accumulate the bottom diff.  The index could still be
-    // uninitialized, if all the bottom_data are NaN.
-    if (maxidx != -1) {
-      GpuAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
-                   top_diff[index]);
-    }
-  }
-}
-
 // The parameters to the kernels in the backward function is as follows:
 //     nthreads: the number of threads, which is equal to the output size.
 //     top_diff: the gradient of the output data, of size N*Hout*Wout*C (or
@@ -445,31 +403,6 @@ bool MaxPoolForwardWithOptionalArgmax::operator()(
   return d.ok();
 }
 
-template <typename T>
-bool MaxPoolBackwardNoMask<T>::operator()(
-    const T* bottom_data, const int batch, const int height, const int width,
-    const int channels, const int pooled_height, const int pooled_width,
-    const int kernel_h, const int kernel_w, const int stride_h,
-    const int stride_w, const int pad_t, const int pad_l, const T* top_diff,
-    T* bottom_diff, const Eigen::GpuDevice& d) {
-  const int kThreadsPerBlock = 1024;
-
-  const int bottom_size = batch * channels * height * width;
-  if (bottom_size == 0) return true;
-  TF_CHECK_OK(GpuLaunchKernel(
-      SetZero<T>, (bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
-      kThreadsPerBlock, 0, d.stream(), bottom_size, bottom_diff));
-
-  const int top_size = batch * channels * pooled_height * pooled_width;
-  TF_CHECK_OK(GpuLaunchKernel(
-      MaxPoolBackwardNoMaskNHWC<T>,
-      (top_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0,
-      d.stream(), top_size, bottom_data, height, width, channels, pooled_height,
-      pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
-      top_diff, bottom_diff));
-  return d.ok();
-}
-
 template <typename T>
 bool MaxPoolBackwardWithArgmax<T>::operator()(
     const int output_size, const int input_size, const T* top_diff,
@@ -540,7 +473,6 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct SpatialMaxPooling<GPUDevice, T>;      \
   template struct MaxPoolForwardWithOptionalArgmax<T>;  \
   template struct MaxPoolBackwardWithArgmax<T>;         \
-  template struct MaxPoolBackwardNoMask<T>;             \
   template struct MaxPoolGradBackwardWithArgmax<T>;     \
   template struct MaxPoolGradBackwardNoMask<T>;
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
index 5383833b318..44ccdfd9a76 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.h
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -60,16 +60,6 @@ struct MaxPoolBackwardWithArgmax {
                   const Eigen::GpuDevice& d, const bool include_batch_in_index);
 };
 
-template <typename T>
-struct MaxPoolBackwardNoMask {
-  bool operator()(const T* bottom_data, const int batch, const int height,
-                  const int width, const int channels, const int pooled_height,
-                  const int pooled_width, const int kernel_h,
-                  const int kernel_w, const int stride_h, const int stride_w,
-                  const int pad_t, const int pad_l, const T* top_diff,
-                  T* bottom_diff, const Eigen::GpuDevice& d);
-};
-
 template <typename T>
 struct MaxPoolGradBackwardWithArgmax {
   bool operator()(const int output_size, const int input_size,
diff --git a/tensorflow/core/util/use_cudnn.cc b/tensorflow/core/util/use_cudnn.cc
index 54296c3c570..442b3725db5 100644
--- a/tensorflow/core/util/use_cudnn.cc
+++ b/tensorflow/core/util/use_cudnn.cc
@@ -22,9 +22,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-// TODO(b/155239286): Remove this function
-bool CanUseCudnn() { return true; }
-
 #define ADD_BOOL_CUDNN_FLAG(func_name, flag_name, default_value) \
   bool func_name() {                                              \
     bool value = default_value;                                   \
@@ -73,25 +70,6 @@ ADD_BOOL_CUDNN_FLAG(DebugCudnnRnnUseTensorOps,
 ADD_INT64_CUDNN_FLAG(DebugCudnnRnnAlgo, TF_DEBUG_CUDNN_RNN_ALGO, -1);
 #undef ADD_INT64_CUDNN_FLAG
 
-FP16ConvMode CudnnConvComputeMode() {
-  string value;
-  Status status = ReadStringFromEnvVar("TF_FP16_CONV_MODE", "accurate", &value);
-  if (!status.ok()) {
-    LOG(ERROR) << status;
-  }
-  string lowercase_value = absl::AsciiStrToLower(value);
-  if (lowercase_value == "accurate") {
-    return FP16ConvMode::kAccurate;
-  } else if (lowercase_value == "fast") {
-    return FP16ConvMode::kFast;
-  } else {
-    LOG(ERROR) << "FP16ConvMode only supports two modes, ACCURATE and FAST. "
-                  "Got unknown mode: "
-               << value;
-  }
-  return FP16ConvMode::kAccurate;
-}
-
 bool IsCudnnSupportedFilterSize(const int32 filter_rows,
                                 const int32 filter_cols, const int32 in_depth,
                                 const int32 out_depth) {
diff --git a/tensorflow/core/util/use_cudnn.h b/tensorflow/core/util/use_cudnn.h
index bbacd349daf..f59a6950269 100644
--- a/tensorflow/core/util/use_cudnn.h
+++ b/tensorflow/core/util/use_cudnn.h
@@ -22,20 +22,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-// FP16ConvMode: The mode to set the internal compute type for cudnn convolution
-// when the input data type is float16. Two types of modes are supported:
-//   kAccurate: Always use float32 as the internal compute type.
-//   kFast: Include both float32 and float16 compute type in the autotune.
-enum class FP16ConvMode {
-  kAccurate = 1,
-  kFast = 2,
-};
-
-bool CanUseCudnn();
 bool CudnnUseAutotune();
 bool CudnnRnnUseAutotune();
 bool CudnnDisableConv1x1Optimization();
-FP16ConvMode CudnnConvComputeMode();
 bool DebugCudnnRnn();
 bool DebugCudnnRnnUseTensorOps();
 int64 DebugCudnnRnnAlgo();