Merge pull request #40303 from kaixih:test_nhwc_depwise_pr
PiperOrigin-RevId: 315828768 Change-Id: Id8fa2d9eb6f651c2b9dd85f594fbeffa216472ca
This commit is contained in:
commit
9172bf8f74
tensorflow/core/kernels
@ -582,10 +582,22 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
|
||||
use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
|
||||
cudnn_use_autotune_ = CudnnUseAutotune();
|
||||
dtype_ = DataTypeToEnum<T>::value;
|
||||
#if CUDNN_VERSION >= 8000
|
||||
// From the cuDNN release note 8.0: We’ve extended the fprop and dgrad
|
||||
// NHWC depthwise kernels to support more combinations (filter
|
||||
// sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
|
||||
// to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
|
||||
// good performance.
|
||||
// (https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_8.html#rel_8)
|
||||
use_cudnn_grouped_conv_ =
|
||||
dtype_ == DT_HALF &&
|
||||
((data_format_ == FORMAT_NCHW && stride_ == 1 && stride_w == 1) ||
|
||||
(data_format_ == FORMAT_NHWC && stride_ == stride_w &&
|
||||
(stride_ == 1 || stride_ == 2)));
|
||||
#elif CUDNN_VERSION >= 7603
|
||||
// Use CuDNN grouped conv (input gradient) when stride = 1, input/output is
|
||||
// NCHW and float16(half). See cudnn release note 7.6.3
|
||||
// (https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763).
|
||||
#if CUDNN_VERSION >= 7603
|
||||
use_cudnn_grouped_conv_ = dtype_ == DT_HALF &&
|
||||
data_format_ == FORMAT_NCHW && stride_ == 1 &&
|
||||
stride_w == 1;
|
||||
|
@ -302,10 +302,22 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
|
||||
use_cudnn_ = CanUseCudnn() && std::is_same<Device, GPUDevice>::value;
|
||||
cudnn_use_autotune_ = CudnnUseAutotune();
|
||||
dtype_ = DataTypeToEnum<T>::value;
|
||||
#if CUDNN_VERSION >= 8000
|
||||
// From the cuDNN release note 8.0: We’ve extended the fprop and dgrad
|
||||
// NHWC depthwise kernels to support more combinations (filter
|
||||
// sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
|
||||
// to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
|
||||
// good performance.
|
||||
// (https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_8.html#rel_8)
|
||||
use_cudnn_grouped_conv_ =
|
||||
dtype_ == DT_HALF &&
|
||||
(data_format_ == FORMAT_NCHW ||
|
||||
(data_format_ == FORMAT_NHWC && stride_ == stride_w &&
|
||||
(stride_ == 1 || stride_ == 2)));
|
||||
#elif CUDNN_VERSION >= 7603
|
||||
// Use CuDNN grouped conv only when input/output is NCHW and float16(half).
|
||||
// See cudnn release note 7.6.3.
|
||||
// (https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763)
|
||||
#if CUDNN_VERSION >= 7603
|
||||
use_cudnn_grouped_conv_ = dtype_ == DT_HALF && data_format_ == FORMAT_NCHW;
|
||||
#else
|
||||
use_cudnn_grouped_conv_ = false;
|
||||
|
Loading…
Reference in New Issue
Block a user