Improved convolution performance in Metal backend.

Improved performance on small sizes.
Better convolution selection for A11.

PiperOrigin-RevId: 302455205
Change-Id: I5e01385da9354eea28e325b7c19201b6db169318
This commit is contained in:
Raman Sarokin 2020-03-23 09:53:33 -07:00 committed by TensorFlower Gardener
parent b8054c93d0
commit 8e9b8a438e
4 changed files with 735 additions and 1120 deletions

View File

@ -51,39 +51,6 @@ namespace tflite {
namespace gpu {
namespace metal {
namespace {
std::vector<ComputeTaskDescriptorPtr> SelectConvolution(
    const GraphFloat32& graph, int id, ValueId input_id, ValueId output_id,
    const Convolution2DAttributes& attr, const metal::RuntimeOptions& options) {
  // Picks the convolution kernel variant best suited to the detected Apple
  // GPU generation and this node's output shape.
  //
  // graph:     model graph; used only to look up the node's output shape.
  // id:        node id whose output tensor shape drives the selection.
  // input_id / output_id: value ids wired into the generated task descriptor.
  // attr:      convolution parameters (kernel size, strides, padding, ...).
  // options:   runtime options forwarded to the kernel generators.
  const auto gpu_type = GetGpuType();
  const bool is_a11_or_a12 =
      gpu_type == GpuType::kA11 || gpu_type == GpuType::kA12;
  const auto dst_shape = graph.FindOutputs(id)[0]->tensor.shape;
  // Hoisted: the ratio depends only on dst_shape, so compute it once.
  // (The original re-evaluated the same expression in the inner check,
  // where it was always >= 1.2f by construction.)
  const float threads_ratio =
      GetThreadsRatioUsualToPreciseConvolution(dst_shape);
  // Special precise version, in case we cover dst_shape poorly with the
  // standard work group size.
  if (threads_ratio >= 1.2f) {
    // Special version for PowerVR >= iPhone 6S/SE.
    // Metal has a bad driver for PowerVR in iPhone 6, so for iPhone 6 we
    // should use the default kernel with shared memory.
    if ((gpu_type == GpuType::kA9 || gpu_type == GpuType::kA10) &&
        CheckConvolutionPrecise1x1Support(attr)) {
      return ConvolutionPrecise1x1PowerVR(id, input_id, output_id, attr,
                                          options);
    }
    if (is_a11_or_a12) {
      return ConvolutionPrecise(id, input_id, output_id, attr, options);
    }
  }
  if (is_a11_or_a12) {
    // A11/A12 have a dedicated fast path for supported 1x1 convolutions.
    if (CheckConvolution1x1Support(attr)) {
      return Convolution1x1(id, input_id, output_id, attr, options);
    }
    return ConvolutionGeneric(id, input_id, output_id, attr, options);
  }
  return Convolution(id, input_id, output_id, attr, options);
}
std::vector<ComputeTaskDescriptorPtr> SelectDepthWiseConv(
int id, ValueId input_id, ValueId output_id,
const DepthwiseConvolution2DAttributes& attr,
@ -182,12 +149,14 @@ Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
input_shapes);
break;
}
case OperationType::CONVOLUTION_2D:
*tasks = SelectConvolution(
graph, node_id, inputs[0], outputs[0],
absl::any_cast<Convolution2DAttributes>(node->operation.attributes),
options);
case OperationType::CONVOLUTION_2D: {
const auto dst_shape = graph.FindOutputs(node_id)[0]->tensor.shape;
auto attr =
absl::any_cast<Convolution2DAttributes>(node->operation.attributes);
*tasks = ConvolutionGeneric(node_id, inputs[0], outputs[0], dst_shape,
attr, options);
break;
}
case OperationType::CONVOLUTION_TRANSPOSED:
*tasks = SelectConvolutionTransposed(
node_id, inputs[0], outputs[0],

View File

@ -127,6 +127,7 @@ cc_library(
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor",
"//tensorflow/lite/delegates/gpu/metal:environment",
"//tensorflow/lite/delegates/gpu/metal:runtime_options",
"@com_google_absl//absl/strings",
],

File diff suppressed because it is too large Load Diff

View File

@ -27,67 +27,10 @@ namespace tflite {
namespace gpu {
namespace metal {
std::vector<ComputeTaskDescriptorPtr> Convolution(
int id, ValueId input_id, ValueId output_id,
const Convolution2DAttributes& params,
const metal::RuntimeOptions& options);
// Convolution for kernel 1x1
// require:
// kernel_size = 1x1;
// padding prepended and appended = 0x0
// dilation = 1x1;
// stride = 1x1;
// Works very well on A12 (iPhone XS, etc.).
// Works well on A9/A10/A11 (iPhone 6S, iPhone 7, iPhone X, etc.).
// Works poorly on A7/A8 (iPhone 5S, iPhone 6, etc.).
std::vector<ComputeTaskDescriptorPtr> Convolution1x1(
int id, ValueId input_id, ValueId output_id,
const Convolution2DAttributes& params, const RuntimeOptions& options);
// TODO(impjdi): Move it inside module.
bool CheckConvolution1x1Support(const Convolution2DAttributes& attr);
// This convolution passes all conv parameters (besides output_channels)
// as dynamic arguments (uniform buffer) to the kernel.
// Depending on output_channels, different kernels can be generated.
// The kernel can process 4/8/12/16 output channels per thread.
// 16-channel output is the fastest but the least flexible.
std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
int id, ValueId input_id, ValueId output_id,
int id, ValueId input_id, ValueId output_id, const BHWC& dst_shape,
const Convolution2DAttributes& params, const RuntimeOptions& options);
// This convolution makes a more precise mapping of threads onto elements.
// For example, if we have an output tensor of 12x7 and a work group of 8x4,
// then we need 4 workgroups to cover this tensor in the usual case.
// But in general we have only 84 elements (12*7), and we can cover them with
// 3 workgroups of size 32. So this version of convolution uses this precise
// mapping.
// However, due to some hardware limitations, this convolution does not always
// perform better. In general it works well on A12.
// Each thread processes 2 pixels in the XY dimension and a variable number of
// pixels in the Z dimension (depends on dst_channels).
std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise(
int id, ValueId input_id, ValueId output_id,
const Convolution2DAttributes& params, const RuntimeOptions& options);
// As above, but specialized for 1x1; each thread processes 1 pixel in the XY
// dimension.
// This convolution is intended for PowerVR in FP16 mode with an FP32
// accumulator. It will also work in other modes, but not with good
// performance.
std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise1x1PowerVR(
int id, ValueId input_id, ValueId output_id,
const Convolution2DAttributes& params, const RuntimeOptions& options);
// TODO(impjdi): Move it inside module.
bool CheckConvolutionPrecise1x1Support(const Convolution2DAttributes& attr);
// This function calculates the number of threads that would be launched for
// ConvolutionGeneric or Convolution1x1 (threads_count1) and the number of
// threads that would be launched for ConvolutionPrecise (threads_count2),
// and returns threads_count1 / threads_count2.
float GetThreadsRatioUsualToPreciseConvolution(const BHWC& dst_shape);
} // namespace metal
} // namespace gpu
} // namespace tflite