Improved convolution performance in Metal backend.
Improved performance on small sizes. Better convolution selection for A11.

PiperOrigin-RevId: 302455205
Change-Id: I5e01385da9354eea28e325b7c19201b6db169318
This commit is contained in:
parent b8054c93d0
commit 8e9b8a438e
@@ -51,39 +51,6 @@ namespace tflite {
 namespace gpu {
 namespace metal {
 namespace {
 
-std::vector<ComputeTaskDescriptorPtr> SelectConvolution(
-    const GraphFloat32& graph, int id, ValueId input_id, ValueId output_id,
-    const Convolution2DAttributes& attr, const metal::RuntimeOptions& options) {
-  // Special precise version, in case we cover dst_shape poorly with standard
-  // work group size.
-  auto gpu_type = GetGpuType();
-  bool a11_12 = gpu_type == GpuType::kA11 || gpu_type == GpuType::kA12;
-  const auto dst_shape = graph.FindOutputs(id)[0]->tensor.shape;
-  if (GetThreadsRatioUsualToPreciseConvolution(dst_shape) >= 1.2f) {
-    // Special version for PowerVR >= IPhone6S/SE
-    // Metal has bad driver for PowerVR in IPhone6, so for Iphone6 we should use
-    // default kernel with shared memory.
-    if ((gpu_type == GpuType::kA9 || gpu_type == GpuType::kA10) &&
-        CheckConvolutionPrecise1x1Support(attr)) {
-      return ConvolutionPrecise1x1PowerVR(id, input_id, output_id, attr,
-                                          options);
-    }
-    if (a11_12 && GetThreadsRatioUsualToPreciseConvolution(dst_shape) >= 1.2f) {
-      return ConvolutionPrecise(id, input_id, output_id, attr, options);
-    }
-  }
-  if (a11_12) {
-    if (CheckConvolution1x1Support(attr)) {
-      return Convolution1x1(id, input_id, output_id, attr, options);
-    } else {
-      return ConvolutionGeneric(id, input_id, output_id, attr, options);
-    }
-  } else {
-    return Convolution(id, input_id, output_id, attr, options);
-  }
-}
-
 std::vector<ComputeTaskDescriptorPtr> SelectDepthWiseConv(
     int id, ValueId input_id, ValueId output_id,
     const DepthwiseConvolution2DAttributes& attr,
@@ -182,12 +149,14 @@ Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
           input_shapes);
       break;
     }
-    case OperationType::CONVOLUTION_2D:
-      *tasks = SelectConvolution(
-          graph, node_id, inputs[0], outputs[0],
-          absl::any_cast<Convolution2DAttributes>(node->operation.attributes),
-          options);
+    case OperationType::CONVOLUTION_2D: {
+      const auto dst_shape = graph.FindOutputs(node_id)[0]->tensor.shape;
+      auto attr =
+          absl::any_cast<Convolution2DAttributes>(node->operation.attributes);
+      *tasks = ConvolutionGeneric(node_id, inputs[0], outputs[0], dst_shape,
+                                  attr, options);
       break;
+    }
     case OperationType::CONVOLUTION_TRANSPOSED:
       *tasks = SelectConvolutionTransposed(
           node_id, inputs[0], outputs[0],
@@ -127,6 +127,7 @@ cc_library(
         "//tensorflow/lite/delegates/gpu/common:types",
         "//tensorflow/lite/delegates/gpu/common:util",
         "//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor",
+        "//tensorflow/lite/delegates/gpu/metal:environment",
         "//tensorflow/lite/delegates/gpu/metal:runtime_options",
        "@com_google_absl//absl/strings",
    ],
File diff suppressed because it is too large
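The suppressed file is presumably the convolution kernel implementation whose header changes below. The call-site hunk above now passes dst_shape straight to ConvolutionGeneric, and the BUILD hunk appears to add the :environment dependency that provides GetGpuType, so the device- and shape-dependent dispatch that used to live in SelectConvolution most likely moved behind ConvolutionGeneric. The following is only a standalone sketch of that style of dispatch; it mirrors the removed SelectConvolution rather than the actual new code, every name is hypothetical, and the thread ratio is passed in instead of computed:

// Standalone sketch only: not the code in the suppressed diff. It mirrors the
// removed SelectConvolution to show the kind of decision that can be made once
// dst_shape and the GPU type are available inside the kernel factory itself.
#include <cstdio>

enum class GpuType { kA9, kA10, kA11, kA12 };
enum class ConvVariant { kDefault, kOneByOne, kGeneric, kPrecise, kPrecise1x1PowerVR };

// usual_to_precise_ratio stands in for GetThreadsRatioUsualToPreciseConvolution;
// the two flags stand in for CheckConvolution1x1Support and
// CheckConvolutionPrecise1x1Support.
ConvVariant SelectVariant(GpuType gpu, float usual_to_precise_ratio,
                          bool conv_1x1_ok, bool precise_1x1_ok) {
  const bool a11_12 = gpu == GpuType::kA11 || gpu == GpuType::kA12;
  if (usual_to_precise_ratio >= 1.2f) {
    // PowerVR-friendly precise 1x1 path for A9/A10.
    if ((gpu == GpuType::kA9 || gpu == GpuType::kA10) && precise_1x1_ok) {
      return ConvVariant::kPrecise1x1PowerVR;
    }
    // The precise thread mapping pays off on A11/A12 when the usual grid is wasteful.
    if (a11_12) return ConvVariant::kPrecise;
  }
  if (a11_12) return conv_1x1_ok ? ConvVariant::kOneByOne : ConvVariant::kGeneric;
  return ConvVariant::kDefault;  // shared-memory kernel on older GPUs
}

int main() {
  // Example: a small output on an A12 that the usual 8x4 grid covers poorly.
  const ConvVariant v = SelectVariant(GpuType::kA12, 1.33f,
                                      /*conv_1x1_ok=*/false, /*precise_1x1_ok=*/false);
  std::printf("selected variant = %d\n", static_cast<int>(v));
}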
@@ -27,67 +27,10 @@ namespace tflite {
 namespace gpu {
 namespace metal {
 
-std::vector<ComputeTaskDescriptorPtr> Convolution(
-    int id, ValueId input_id, ValueId output_id,
-    const Convolution2DAttributes& params,
-    const metal::RuntimeOptions& options);
-
-// Convolution for kernel 1x1
-// require:
-//   kernel_size = 1x1;
-//   padding prepended and appended = 0x0
-//   dilation = 1x1;
-//   stride = 1x1;
-// Works very good on A12 (IPhoneXS, etc).
-// Works good on A9/A10/A11 (IPhone6S, IPhone7, IPhoneX, etc).
-// Works bad on A7/A8 (IPhone5S, IPhone6, etc).
-std::vector<ComputeTaskDescriptorPtr> Convolution1x1(
-    int id, ValueId input_id, ValueId output_id,
-    const Convolution2DAttributes& params, const RuntimeOptions& options);
-
-// TODO(impjdi): Move it inside module.
-bool CheckConvolution1x1Support(const Convolution2DAttributes& attr);
-
-// This convolution pass all conv parameters (beside output_channels)
-// as dynamic arguments (uniform buffer) to kernel.
-// Depending on output_channels can be generated different kernels
-// Kernel can proceed 4/8/12/16 output channels per one thread.
-// 16 channels output is the fastest but the least flexible.
 std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
-    int id, ValueId input_id, ValueId output_id,
+    int id, ValueId input_id, ValueId output_id, const BHWC& dst_shape,
     const Convolution2DAttributes& params, const RuntimeOptions& options);
 
-// This convolution makes more precise mapping of threads on elements.
-// For example, if we have output tensor 12x7 and work group = 8x4,
-// then we need 4 workgroups to cover this tensor in usual case.
-// But in general we have only 84 elements(12*7), and we can cover it with 3
-// workgroups of size 32. So this version of convolution use this precise
-// mapping.
-// But this convolution, due to some hardware limitations, doesn't work better
-// always. In general it works good on A12.
-// Each thread process 2 pixels in XY dimension and variable amount of pixels
-// in Z dimension(depends on dst_channels).
-std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise(
-    int id, ValueId input_id, ValueId output_id,
-    const Convolution2DAttributes& params, const RuntimeOptions& options);
-
-// As previous, but specific for 1x1 and each thread process 1 pixel in XY
-// dimension.
-// This convolution for PowerVR in FP16 mode with FP32 accumulator
-// It will work in other modes also, but not with good performance
-std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise1x1PowerVR(
-    int id, ValueId input_id, ValueId output_id,
-    const Convolution2DAttributes& params, const RuntimeOptions& options);
-
-// TODO(impjdi): Move it inside module.
-bool CheckConvolutionPrecise1x1Support(const Convolution2DAttributes& attr);
-
-// This function calculates amount of threads that should be launched for
-// ConvolutionGeneric or Convolution1x1 (threads_count1) and amount of threads
-// that should be launched for ConvolutionPrecise (threads_count2) and returns
-// threads_count1 / threads_count2.
-float GetThreadsRatioUsualToPreciseConvolution(const BHWC& dst_shape);
-
 }  // namespace metal
 }  // namespace gpu
 }  // namespace tflite
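The removed ConvolutionPrecise comment works through a 12x7 output with 8x4 work groups; the arithmetic below reproduces that example end to end. The exact formula inside GetThreadsRatioUsualToPreciseConvolution is not shown in this diff, so computing the ratio as usual threads over precise threads is an assumption that simply follows the comment's wording:

// Worked example of the 12x7 case from the removed comment (standalone;
// not the actual GetThreadsRatioUsualToPreciseConvolution implementation).
#include <cstdio>

int main() {
  const int dst_w = 12, dst_h = 7;     // output tensor XY size
  const int group_w = 8, group_h = 4;  // usual 2D work group (32 threads)
  const int precise_group = 32;        // precise mapping: linear 32-thread groups

  // Usual mapping: pad each dimension up to a multiple of the work group size.
  const int usual_groups = ((dst_w + group_w - 1) / group_w) *
                           ((dst_h + group_h - 1) / group_h);  // 2 * 2 = 4
  const int usual_threads = usual_groups * group_w * group_h;  // 128

  // Precise mapping: cover the 84 real elements with linear groups.
  const int elements = dst_w * dst_h;                                          // 84
  const int precise_groups = (elements + precise_group - 1) / precise_group;   // 3
  const int precise_threads = precise_groups * precise_group;                  // 96

  const float ratio =
      static_cast<float>(usual_threads) / static_cast<float>(precise_threads);  // ~1.33
  std::printf("usual=%d precise=%d ratio=%.2f\n", usual_threads, precise_threads, ratio);
  // In the removed selection logic, ratio >= 1.2 is what triggered the
  // precise variants on A11/A12.
}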