Improved convolution performance in Metal backend.

Improved performance on small sizes.
Better convolution selection for A11.

PiperOrigin-RevId: 302455205
Change-Id: I5e01385da9354eea28e325b7c19201b6db169318
This commit is contained in:
Raman Sarokin 2020-03-23 09:53:33 -07:00 committed by TensorFlower Gardener
parent b8054c93d0
commit 8e9b8a438e
4 changed files with 735 additions and 1120 deletions

View File

@ -51,39 +51,6 @@ namespace tflite {
namespace gpu {
namespace metal {
namespace {
std::vector<ComputeTaskDescriptorPtr> SelectConvolution(
    const GraphFloat32& graph, int id, ValueId input_id, ValueId output_id,
    const Convolution2DAttributes& attr, const metal::RuntimeOptions& options) {
  // Picks the convolution kernel variant best suited to the detected Apple
  // GPU generation and this node's output shape.
  //
  // graph:     model graph; used only to look up the node's output shape.
  // id:        node id whose output tensor shape drives the selection.
  // input_id / output_id: value ids wired into the generated task descriptor.
  // attr:      convolution parameters (kernel size, strides, padding, ...).
  // options:   runtime options forwarded to the kernel generators.
  const auto gpu_type = GetGpuType();
  const bool is_a11_or_a12 =
      gpu_type == GpuType::kA11 || gpu_type == GpuType::kA12;
  const auto dst_shape = graph.FindOutputs(id)[0]->tensor.shape;
  // Hoisted: the ratio depends only on dst_shape, so compute it once.
  // (The original re-evaluated the same expression in the inner check,
  // where it was always >= 1.2f by construction.)
  const float threads_ratio =
      GetThreadsRatioUsualToPreciseConvolution(dst_shape);
  // Special precise version, in case we cover dst_shape poorly with the
  // standard work group size.
  if (threads_ratio >= 1.2f) {
    // Special version for PowerVR >= iPhone 6S/SE.
    // Metal has a bad driver for PowerVR in iPhone 6, so for iPhone 6 we
    // should use the default kernel with shared memory.
    if ((gpu_type == GpuType::kA9 || gpu_type == GpuType::kA10) &&
        CheckConvolutionPrecise1x1Support(attr)) {
      return ConvolutionPrecise1x1PowerVR(id, input_id, output_id, attr,
                                          options);
    }
    if (is_a11_or_a12) {
      return ConvolutionPrecise(id, input_id, output_id, attr, options);
    }
  }
  if (is_a11_or_a12) {
    // A11/A12 have a dedicated fast path for supported 1x1 convolutions.
    if (CheckConvolution1x1Support(attr)) {
      return Convolution1x1(id, input_id, output_id, attr, options);
    }
    return ConvolutionGeneric(id, input_id, output_id, attr, options);
  }
  return Convolution(id, input_id, output_id, attr, options);
}
std::vector<ComputeTaskDescriptorPtr> SelectDepthWiseConv(
int id, ValueId input_id, ValueId output_id,
const DepthwiseConvolution2DAttributes& attr,
@ -182,12 +149,14 @@ Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
input_shapes);
break;
}
case OperationType::CONVOLUTION_2D:
*tasks = SelectConvolution(
graph, node_id, inputs[0], outputs[0],
absl::any_cast<Convolution2DAttributes>(node->operation.attributes),
options);
case OperationType::CONVOLUTION_2D: {
const auto dst_shape = graph.FindOutputs(node_id)[0]->tensor.shape;
auto attr =
absl::any_cast<Convolution2DAttributes>(node->operation.attributes);
*tasks = ConvolutionGeneric(node_id, inputs[0], outputs[0], dst_shape,
attr, options);
break;
}
case OperationType::CONVOLUTION_TRANSPOSED:
*tasks = SelectConvolutionTransposed(
node_id, inputs[0], outputs[0],

View File

@ -127,6 +127,7 @@ cc_library(
"//tensorflow/lite/delegates/gpu/common:types",
"//tensorflow/lite/delegates/gpu/common:util",
"//tensorflow/lite/delegates/gpu/metal:compute_task_descriptor",
"//tensorflow/lite/delegates/gpu/metal:environment",
"//tensorflow/lite/delegates/gpu/metal:runtime_options",
"@com_google_absl//absl/strings",
],

File diff suppressed because it is too large Load Diff

View File

@ -27,67 +27,10 @@ namespace tflite {
namespace gpu {
namespace metal {
std::vector<ComputeTaskDescriptorPtr> Convolution(
int id, ValueId input_id, ValueId output_id,
const Convolution2DAttributes& params,
const metal::RuntimeOptions& options);
// Convolution for kernel 1x1
// require:
// kernel_size = 1x1;
// padding prepended and appended = 0x0
// dilation = 1x1;
// stride = 1x1;
// Works very well on A12 (iPhone XS, etc.).
// Works well on A9/A10/A11 (iPhone 6S, iPhone 7, iPhone X, etc.).
// Works poorly on A7/A8 (iPhone 5S, iPhone 6, etc.).
std::vector<ComputeTaskDescriptorPtr> Convolution1x1(
int id, ValueId input_id, ValueId output_id,
const Convolution2DAttributes& params, const RuntimeOptions& options);
// TODO(impjdi): Move it inside module.
bool CheckConvolution1x1Support(const Convolution2DAttributes& attr);
// This convolution passes all conv parameters (besides output_channels)
// as dynamic arguments (uniform buffer) to the kernel.
// Depending on output_channels, different kernels can be generated.
// The kernel can process 4/8/12/16 output channels per thread.
// 16-channel output is the fastest but the least flexible.
std::vector<ComputeTaskDescriptorPtr> ConvolutionGeneric(
int id, ValueId input_id, ValueId output_id,
int id, ValueId input_id, ValueId output_id, const BHWC& dst_shape,
const Convolution2DAttributes& params, const RuntimeOptions& options);
// This convolution makes a more precise mapping of threads onto elements.
// For example, if we have an output tensor of 12x7 and a work group of 8x4,
// then we need 4 workgroups to cover this tensor in the usual case.
// But in general we have only 84 elements (12*7), and we can cover them with
// 3 workgroups of size 32. So this version of convolution uses this precise
// mapping.
// However, due to some hardware limitations, this convolution does not always
// perform better. In general it works well on A12.
// Each thread processes 2 pixels in the XY dimension and a variable number of
// pixels in the Z dimension (depends on dst_channels).
std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise(
int id, ValueId input_id, ValueId output_id,
const Convolution2DAttributes& params, const RuntimeOptions& options);
// As above, but specialized for 1x1; each thread processes 1 pixel in the XY
// dimension.
// This convolution is intended for PowerVR in FP16 mode with an FP32
// accumulator. It will also work in other modes, but not with good
// performance.
std::vector<ComputeTaskDescriptorPtr> ConvolutionPrecise1x1PowerVR(
int id, ValueId input_id, ValueId output_id,
const Convolution2DAttributes& params, const RuntimeOptions& options);
// TODO(impjdi): Move it inside module.
bool CheckConvolutionPrecise1x1Support(const Convolution2DAttributes& attr);
// This function calculates the number of threads that would be launched for
// ConvolutionGeneric or Convolution1x1 (threads_count1) and the number of
// threads that would be launched for ConvolutionPrecise (threads_count2),
// and returns threads_count1 / threads_count2.
float GetThreadsRatioUsualToPreciseConvolution(const BHWC& dst_shape);
} // namespace metal
} // namespace gpu
} // namespace tflite