From 65849ef4e0adb191d92734bbe26b894f64857668 Mon Sep 17 00:00:00 2001
From: Dan Ganea
Date: Fri, 9 Aug 2019 20:20:58 +0200
Subject: [PATCH] Added group_count parameter to autotune descriptor

Some cuDNN algorithms only support a certain number of groups. However, if
group_count is not taken into account when autotuning for the best cuDNN
algorithm, a grouped convolution might receive a "cached" algorithm that does
not support the number of groups it needs.
---
 tensorflow/core/kernels/conv_grad_filter_ops.cc |  1 +
 tensorflow/core/kernels/conv_grad_input_ops.cc  |  1 +
 tensorflow/core/kernels/conv_grad_ops_3d.cc     |  2 ++
 tensorflow/core/kernels/conv_ops.cc             |  1 +
 tensorflow/core/kernels/conv_ops_3d.cc          |  1 +
 tensorflow/core/kernels/conv_ops_fused_impl.h   |  1 +
 tensorflow/core/kernels/conv_ops_gpu.h          | 15 ++++++++++-----
 7 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 9d5f316ff6f..9b375703ef2 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -965,6 +965,7 @@ void LaunchConv2DBackpropFilterOp<GPUDevice, T>::operator()(
         common_padding_cols}},   // padding_cols
       dtype,                     // tensor datatype
       device_id,                 // device_id
+      conv_desc.group_count()    // group_count
   };
   AlgorithmConfig algorithm_config;
   if (cudnn_use_autotune && !AutoTuneConvBwdFilter::GetInstance()->Find(
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 8974aa1e11d..3586b0ccf06 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -1091,6 +1091,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
         common_padding_cols}},   // padding_cols
       dtype,                     // tensor data type
       device_id,                 // device_id
+      conv_desc.group_count()    // group_count
   };
   AlgorithmConfig algorithm_config;
   if (cudnn_use_autotune && !AutoTuneConvBwdData::GetInstance()->Find(
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 3ba6a9a6f39..037339b37ef 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -1352,6 +1352,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
         device_id,
+        conv_desc.group_count()
     };
 
     using se::dnn::AlgorithmConfig;
@@ -1777,6 +1778,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
         device_id,
+        conv_desc.group_count()
     };
 
     using se::dnn::AlgorithmConfig;
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 4ea31861e7a..15c767f1ab8 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -984,6 +984,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
         common_padding_cols}},   // padding_cols
       dtype,                     // tensor datatype
       device_id,                 // device_id
+      conv_desc.group_count()
   };
   AlgorithmConfig algorithm_config;
   if (cudnn_use_autotune &&
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 076db5c5442..c92dccd2f81 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -425,6 +425,7 @@ struct LaunchConvOp<GPUDevice, T> {
         {{pad_planes, pad_rows, pad_cols}},
         dtype,
         device_id,
+        conv_desc.group_count()
     };
 
     using se::dnn::AlgorithmConfig;
diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h
index 8fba8ce679b..6c4e4d91a23 100644
--- a/tensorflow/core/kernels/conv_ops_fused_impl.h
+++ b/tensorflow/core/kernels/conv_ops_fused_impl.h
@@ -581,6 +581,7 @@ struct LaunchFusedConv2DOp<GPUDevice, T> {
          common_padding_cols}},   // padding_cols
        dtype,                     // tensor datatype
        device_id,                 // device_id
+       conv_desc.group_count()
       },
       dnn_activation_mode         // activation_mode
   };
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 7906f74c616..8751937ddc7 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -97,7 +97,7 @@ class ConvParameters {
                  TensorFormat data_format, int64 out_depths,
                  const SpatialArray& filter, const SpatialArray& dilation,
                  const SpatialArray& stride, const SpatialArray& padding,
-                 DataType dtype, int device_id)
+                 DataType dtype, int device_id, int group_count = 1)
       : batch_(batch),
         in_depths_(in_depths),
         out_depths_(out_depths),
@@ -108,7 +108,8 @@ class ConvParameters {
         stride_(CheckSpatialArraySize(stride)),
         padding_(CheckSpatialArraySize(padding)),
         dtype_(dtype),
-        device_id_(device_id) {
+        device_id_(device_id),
+        group_count_(group_count) {
     hash_code_ = batch;
     hash_code_ = Hash64Combine(hash_code_, in_depths);
     for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
@@ -120,7 +121,9 @@ class ConvParameters {
     for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val);
     hash_code_ = Hash64Combine(hash_code_, dtype);
     hash_code_ = Hash64Combine(hash_code_, device_id);
+    hash_code_ = Hash64Combine(hash_code_, group_count);
   }
+
   bool operator==(const ConvParameters& other) const {
     return this->get_data_as_tuple() == other.get_data_as_tuple();
   }
@@ -142,7 +145,8 @@ class ConvParameters {
         "(", str_util::Join(stride_, ", "), "), ",
         "(", str_util::Join(padding_, ", "), "), ",
         dtype_, ", ",
-        device_id_);
+        device_id_,
+        group_count_);
     // clang-format on
   }
 
@@ -166,12 +170,12 @@ class ConvParameters {
  protected:
   using ParameterDataType =
       std::tuple<int64, int64, SpatialArray, TensorFormat, int64, SpatialArray,
-                 SpatialArray, SpatialArray, SpatialArray, DataType, int>;
+                 SpatialArray, SpatialArray, SpatialArray, DataType, int, int>;
 
   ParameterDataType get_data_as_tuple() const {
     return std::make_tuple(batch_, in_depths_, in_, data_format_, out_depths_,
                            filter_, dilation_, stride_, padding_, dtype_,
-                           device_id_);
+                           device_id_, group_count_);
   }
 
   uint64 hash_code_;
@@ -208,6 +212,7 @@ class ConvParameters {
   SpatialArray padding_;
   DataType dtype_;
   int device_id_;
+  int group_count_;
 };
 
 typedef Eigen::GpuDevice GPUDevice;
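
Reviewer note: the snippet below is a minimal, self-contained sketch of the caching
problem this change addresses. It does not use TensorFlow's real ConvParameters or
AutoTune classes; SimpleConvKey, SimpleConvKeyHash, and AutotuneCache are made-up
names. It only illustrates why group_count has to participate in both the equality
check and the hash of the autotune cache key: if it is left out, a grouped
convolution with otherwise identical shapes maps to the same cache entry as an
ungrouped one and can inherit a cuDNN algorithm that does not support its number of
groups.

// Minimal sketch (hypothetical names, not TensorFlow's actual classes).
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <tuple>
#include <unordered_map>

struct SimpleConvKey {
  int64_t batch;
  int64_t in_depth;
  int64_t out_depth;
  int group_count;  // Without this field, grouped and ungrouped convolutions
                    // with otherwise identical shapes share one cache entry.

  bool operator==(const SimpleConvKey& other) const {
    return std::tie(batch, in_depth, out_depth, group_count) ==
           std::tie(other.batch, other.in_depth, other.out_depth,
                    other.group_count);
  }
};

struct SimpleConvKeyHash {
  std::size_t operator()(const SimpleConvKey& k) const {
    std::size_t h = std::hash<int64_t>()(k.batch);
    auto combine = [&h](std::size_t v) {
      h ^= v + 0x9e3779b9 + (h << 6) + (h >> 2);  // standard hash-combine step
    };
    combine(std::hash<int64_t>()(k.in_depth));
    combine(std::hash<int64_t>()(k.out_depth));
    combine(std::hash<int>()(k.group_count));
    return h;
  }
};

// Maps a convolution configuration to the name of the "best" algorithm found
// by a previous autotuning pass.
using AutotuneCache =
    std::unordered_map<SimpleConvKey, std::string, SimpleConvKeyHash>;

int main() {
  AutotuneCache cache;

  // An ungrouped convolution was autotuned first and cached an algorithm that
  // only supports group_count == 1.
  SimpleConvKey ungrouped{32, 64, 64, /*group_count=*/1};
  cache[ungrouped] = "algo_supporting_only_one_group";

  // A grouped convolution with the same shapes but group_count == 32.
  SimpleConvKey grouped{32, 64, 64, /*group_count=*/32};

  // Because group_count participates in operator== and the hash, this lookup
  // misses, so the grouped convolution gets its own autotuning pass instead of
  // inheriting an incompatible algorithm.
  if (cache.find(grouped) == cache.end()) {
    std::cout << "cache miss: re-autotune for 32 groups\n";
  } else {
    std::cout << "cache hit: " << cache[grouped] << "\n";
  }
  return 0;
}

The actual change in conv_ops_gpu.h does the analogous thing: group_count_ is
appended to get_data_as_tuple() (which backs operator==) and folded into
hash_code_ via Hash64Combine, so a grouped convolution can no longer collide
with an algorithm cached for a different group count.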