diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 1012fcb3254..310bd73ba65 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -582,10 +582,22 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
     use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
     cudnn_use_autotune_ = CudnnUseAutotune();
     dtype_ = DataTypeToEnum<T>::value;
+#if CUDNN_VERSION >= 8000
+    // From the cuDNN release note 8.0: We’ve extended the fprop and dgrad
+    // NHWC depthwise kernels to support more combinations (filter
+    // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
+    // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
+    // good performance. (https://docs.nvidia.com/deeplearning/sdk/cudnn-
+    // release-notes/rel_8.html#rel_8)
+    use_cudnn_grouped_conv_ =
+        dtype_ == DT_HALF &&
+        ((data_format_ == FORMAT_NCHW && stride_ == 1 && stride_w == 1) ||
+         (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
+          (stride_ == 1 || stride_ == 2)));
+#elif CUDNN_VERSION >= 7603
     // Use CuDNN grouped conv (input gradient) when stride = 1, input/output is
     // NCHW and float16(half). See cudnn release note 7.6.3 (https://docs.nvidi
     // a.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763).
-#if CUDNN_VERSION >= 7603
     use_cudnn_grouped_conv_ = dtype_ == DT_HALF &&
                               data_format_ == FORMAT_NCHW && stride_ == 1 &&
                               stride_w == 1;
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index 624ba69092b..fe6a9e3e377 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -302,10 +302,22 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
     use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
     cudnn_use_autotune_ = CudnnUseAutotune();
     dtype_ = DataTypeToEnum<T>::value;
+#if CUDNN_VERSION >= 8000
+    // From the cuDNN release note 8.0: We’ve extended the fprop and dgrad
+    // NHWC depthwise kernels to support more combinations (filter
+    // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
+    // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
+    // good performance. (https://docs.nvidia.com/deeplearning/sdk/cudnn-
+    // release-notes/rel_8.html#rel_8)
+    use_cudnn_grouped_conv_ =
+        dtype_ == DT_HALF &&
+        (data_format_ == FORMAT_NCHW ||
+         (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
+          (stride_ == 1 || stride_ == 2)));
+#elif CUDNN_VERSION >= 7603
     // Use CuDNN grouped conv only when input/output is NCHW and float16(half).
     // See cudnn release note 7.6.3. (https://docs.nvidia.com/deeplearning/sdk/c
     // udnn-release-notes/rel_763.html#rel_763)
-#if CUDNN_VERSION >= 7603
     use_cudnn_grouped_conv_ = dtype_ == DT_HALF && data_format_ == FORMAT_NCHW;
 #else
     use_cudnn_grouped_conv_ = false;