Moved the work-group size selection function to util.

Pooling now uses this function for better performance.

PiperOrigin-RevId: 242153291
A. Unique TensorFlower 2019-04-05 10:53:58 -07:00 committed by TensorFlower Gardener
parent 0d4d88c051
commit 0fdb21c045
5 changed files with 130 additions and 47 deletions
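
Illustrative only: a minimal sketch of the dispatch pattern that the resize_functions updated below (in pooling.cc and reshape.cc) follow. The ComputeDispatch helper name is hypothetical and not part of this change; it assumes the headers shown in the diffs.

#include <utility>

#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"

namespace tflite {
namespace gpu {
namespace metal {

// Hypothetical helper mirroring the updated resize_function pattern.
std::pair<uint3, uint3> ComputeDispatch(const BHWC& dst_shape) {
  // One grid point per output (x, y) position and per slice of 4 channels.
  const uint3 grid(dst_shape.w, dst_shape.h,
                   IntegralDivideRoundUp(dst_shape.c, 4));
  const uint3 group_size = GetWorkGroupSizeForGrid(grid);
  // E.g. dst_shape with w=17, h=13, c=40 gives grid (17, 13, 10),
  // group_size (8, 8, 1) and a group count of (3, 2, 10); the dispatch then
  // spans 24x16x10 threads, so the kernel must bound-check all three axes.
  const uint3 group_count(IntegralDivideRoundUp(grid.x, group_size.x),
                          IntegralDivideRoundUp(grid.y, group_size.y),
                          IntegralDivideRoundUp(grid.z, group_size.z));
  return std::make_pair(group_size, group_count);
}

}  // namespace metal
}  // namespace gpu
}  // namespace tflite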

tensorflow/lite/delegates/gpu/metal/kernels/BUILD

@@ -173,6 +173,7 @@ cc_library(
srcs = ["pooling.cc"],
hdrs = ["pooling.h"],
deps = [
":util",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:shape",
@@ -219,6 +220,7 @@ cc_library(
srcs = ["reshape.cc"],
hdrs = ["reshape.h"],
deps = [
":util",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:shape",
@@ -291,3 +293,12 @@ cc_library(
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "util",
srcs = ["util.cc"],
hdrs = ["util.h"],
deps = [
"//tensorflow/lite/delegates/gpu/common:types",
],
)

tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc

@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"
namespace tflite {
namespace gpu {
@@ -40,8 +41,8 @@ std::string GetMaxPoolingCode(const HW& kernel_size) {
constant int window_w = $0;
constant int window_h = $1;
struct uniforms {
int2 src_size;
int2 dst_size;
int4 src_size;
int4 dst_size;
int2 stride;
int2 offset;
};
@@ -51,7 +52,8 @@ std::string GetMaxPoolingCode(const HW& kernel_size) {
$$1
uint3 gid[[thread_position_in_grid]]) {
if (static_cast<int>(gid.x) >= params.dst_size.x ||
static_cast<int>(gid.y) >= params.dst_size.y) {
static_cast<int>(gid.y) >= params.dst_size.y ||
static_cast<int>(gid.z) >= params.dst_size.z) {
return;
}
@@ -84,8 +86,8 @@ std::string GetMaxPoolingIndicesCode(const HW& kernel_size) {
constant int window_w = $0;
constant int window_h = $1;
struct uniforms {
int2 src_size;
int2 dst_size;
int4 src_size;
int4 dst_size;
int2 stride;
int2 offset;
};
@@ -95,7 +97,8 @@ std::string GetMaxPoolingIndicesCode(const HW& kernel_size) {
$$1
uint3 gid[[thread_position_in_grid]]) {
if (static_cast<int>(gid.x) >= params.dst_size.x ||
static_cast<int>(gid.y) >= params.dst_size.y) {
static_cast<int>(gid.y) >= params.dst_size.y ||
static_cast<int>(gid.z) >= params.dst_size.z) {
return;
}
@@ -147,8 +150,8 @@ std::string GetAveragePoolingCode(const HW& kernel_size) {
constant int window_h = $1;
constant float multiplier = $2;
struct uniforms {
int2 src_size;
int2 dst_size;
int4 src_size;
int4 dst_size;
int2 stride;
int2 offset;
};
@@ -158,7 +161,8 @@ std::string GetAveragePoolingCode(const HW& kernel_size) {
uint tid[[thread_index_in_threadgroup]],
uint3 gid[[thread_position_in_grid]]) {
if (static_cast<int>(gid.x) >= params.dst_size.x ||
static_cast<int>(gid.y) >= params.dst_size.y) {
static_cast<int>(gid.y) >= params.dst_size.y ||
static_cast<int>(gid.z) >= params.dst_size.z) {
return;
}
@@ -219,8 +223,12 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
std::vector<int> uniform_params = {
dimension.w,
dimension.h,
IntegralDivideRoundUp(dimension.c, 4),
dimension.w * dimension.h,
output_dimension.w,
output_dimension.h,
IntegralDivideRoundUp(dimension.c, 4),
output_dimension.w * output_dimension.h,
params.strides.w,
params.strides.h,
params.padding.prepended.w,
@@ -230,14 +238,14 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
}},
};
desc->resize_function = [input_id,
params](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{16, 16, 1};
const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, params);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(dst_shape.c, 4);
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
BHWC dst_shape = buffers.find(output_id)->second;
const uint3 grid =
uint3(dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};

tensorflow/lite/delegates/gpu/metal/kernels/reshape.cc

@@ -27,31 +27,12 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"
namespace tflite {
namespace gpu {
namespace metal {
namespace {
uint GetBestSize(int grid_size) {
if (grid_size % 8 == 0 || grid_size % 8 >= 4 || grid_size >= 16) {
return 8;
} else if (grid_size % 4 == 0 || grid_size % 4 >= 2 || grid_size >= 8) {
return 4;
} else if (grid_size % 2 == 0 || grid_size >= 4) {
return 2;
} else {
return 1;
}
}
uint3 GetWorkGroupSize(const BHWC& dst_shape) {
uint x_size = GetBestSize(dst_shape.w);
uint y_size = GetBestSize(dst_shape.h);
uint z_size = std::max(1u, 32u / (x_size * y_size));
return {x_size, y_size, z_size};
}
std::string GetReshapeCode() {
std::string code = R"(
#include <metal_stdlib>
@@ -177,11 +158,12 @@ std::vector<ComputeTaskDescriptorPtr> Reshape(int id, ValueId input_id,
};
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size = GetWorkGroupSize(attr.new_shape);
int groups_x = IntegralDivideRoundUp(attr.new_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(attr.new_shape.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(attr.new_shape.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
@@ -235,11 +217,12 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
};
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size = GetWorkGroupSize(attr.new_shape);
int groups_x = IntegralDivideRoundUp(attr.new_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(attr.new_shape.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(attr.new_shape.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};

tensorflow/lite/delegates/gpu/metal/kernels/util.cc

@@ -0,0 +1,47 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"
namespace tflite {
namespace gpu {
namespace metal {
namespace {
unsigned int GetOptimalSize(unsigned int grid_size) {
if (grid_size % 8 == 0 || grid_size % 8 >= 4 || grid_size >= 16) {
return 8;
}
if (grid_size % 4 == 0 || grid_size % 4 >= 2 || grid_size >= 8) {
return 4;
}
if (grid_size % 2 == 0 || grid_size >= 4) {
return 2;
}
return 1;
}
} // namespace
uint3 GetWorkGroupSizeForGrid(const uint3& grid_size) {
unsigned int x_size = GetOptimalSize(grid_size.x);
unsigned int y_size = GetOptimalSize(grid_size.y);
unsigned int z_size = std::max(1u, 32u / (x_size * y_size));
return {x_size, y_size, z_size};
}
} // namespace metal
} // namespace gpu
} // namespace tflite
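
For illustration, a few hand-checked values of the heuristic above, showing how it splits the 32-thread budget across the grid dimensions (the CheckWorkGroupHeuristic helper is hypothetical, not part of this change):

#include <cassert>

#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"

void CheckWorkGroupHeuristic() {
  using tflite::gpu::uint3;
  using tflite::gpu::metal::GetWorkGroupSizeForGrid;
  // Wide, shallow grid: 8x8 in xy leaves nothing for z.
  const uint3 a = GetWorkGroupSizeForGrid(uint3(17, 13, 10));
  assert(a.x == 8 && a.y == 8 && a.z == 1);
  // Small xy extents fall back to 4 and 8 respectively.
  const uint3 b = GetWorkGroupSizeForGrid(uint3(3, 5, 8));
  assert(b.x == 4 && b.y == 8 && b.z == 1);
  // Skinny but deep grid: the whole budget goes to z.
  const uint3 c = GetWorkGroupSizeForGrid(uint3(1, 1, 64));
  assert(c.x == 1 && c.y == 1 && c.z == 32);
}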

tensorflow/lite/delegates/gpu/metal/kernels/util.h

@@ -0,0 +1,34 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_UTIL_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_UTIL_H_
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace metal {
// Returns a work-group size that tries to cover the grid optimally.
// If you use the work-group size generated by this method, you MUST check
// all three dimensions of the thread position against the grid bounds in
// your kernel.
uint3 GetWorkGroupSizeForGrid(const uint3& grid_size);
} // namespace metal
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_UTIL_H_
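
The check that comment mandates is the same three-axis guard the updated pooling kernels above emit; any kernel dispatched with this work-group size needs an equivalent early-out (here params.dst_size stands for whatever uniform carries the grid extents):

  if (static_cast<int>(gid.x) >= params.dst_size.x ||
      static_cast<int>(gid.y) >= params.dst_size.y ||
      static_cast<int>(gid.z) >= params.dst_size.z) {
    return;
  }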