Merge pull request #40303 from kaixih:test_nhwc_depwise_pr

PiperOrigin-RevId: 315828768 Change-Id: Id8fa2d9eb6f651c2b9dd85f594fbeffa216472ca
2020-06-10 21:26:08 -07:00 · 2020-06-10 21:26:08 -07:00 · 9172bf8f74
commit 9172bf8f74
parent d77064eb7b f0322d6c68
2 changed files with 26 additions and 2 deletions
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -582,10 +582,22 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
    cudnn_use_autotune_ = CudnnUseAutotune();
    dtype_ = DataTypeToEnum<T>::value;
+#if CUDNN_VERSION >= 8000
+    // From the cuDNN 8.0 release notes: "We've extended the fprop and dgrad
+    // NHWC depthwise kernels to support more combinations (filter
+    // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
+    // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
+    // good performance." See
+    // https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_8.html#rel_8
+    use_cudnn_grouped_conv_ =
+        dtype_ == DT_HALF &&
+        ((data_format_ == FORMAT_NCHW && stride_ == 1 && stride_w == 1) ||
+         (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
+          (stride_ == 1 || stride_ == 2)));
+#elif CUDNN_VERSION >= 7603
    // Use cuDNN grouped conv (input gradient) when stride = 1, input/output is
    // NCHW and float16 (half). See the cuDNN 7.6.3 release notes:
    // https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763
-#if CUDNN_VERSION >= 7603
    use_cudnn_grouped_conv_ = dtype_ == DT_HALF &&
                              data_format_ == FORMAT_NCHW && stride_ == 1 &&
                              stride_w == 1;
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -302,10 +302,22 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
    use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
    cudnn_use_autotune_ = CudnnUseAutotune();
    dtype_ = DataTypeToEnum<T>::value;
+#if CUDNN_VERSION >= 8000
+    // From the cuDNN 8.0 release notes: "We've extended the fprop and dgrad
+    // NHWC depthwise kernels to support more combinations (filter
+    // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
+    // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
+    // good performance." See
+    // https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_8.html#rel_8
+    use_cudnn_grouped_conv_ =
+        dtype_ == DT_HALF &&
+        (data_format_ == FORMAT_NCHW ||
+         (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
+          (stride_ == 1 || stride_ == 2)));
+#elif CUDNN_VERSION >= 7603
    // Use cuDNN grouped conv only when input/output is NCHW and float16 (half).
    // See the cuDNN 7.6.3 release notes:
    // https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763
-#if CUDNN_VERSION >= 7603
    use_cudnn_grouped_conv_ = dtype_ == DT_HALF && data_format_ == FORMAT_NCHW;
 #else
    use_cudnn_grouped_conv_ = false;