Added new ways of uploading weights.

Improved Intel performance.
Added a macOS test for convolution.

PiperOrigin-RevId: 304262331
Change-Id: If9b433364d4e6122f6f039e9da490cdfa770677d
This commit is contained in:
Raman Sarokin 2020-04-01 14:25:51 -07:00 committed by TensorFlower Gardener
parent 93fd484c16
commit 1778de6f64
2 changed files with 102 additions and 28 deletions
tensorflow/lite/delegates/gpu/metal/kernels

View File

@ -1,4 +1,5 @@
load("@build_bazel_rules_apple//apple:ios.bzl", "ios_unit_test")
load("@build_bazel_rules_apple//apple:macos.bzl", "macos_unit_test")
load(
"//tensorflow/lite:special_rules.bzl",
"tflite_ios_lab_runner",
@ -159,6 +160,15 @@ ios_unit_test(
deps = [":conv_test_lib"],
)
# Runs the convolution kernel tests (":conv_test_lib") as a macOS unit test.
# NOTE(review): "local" presumably keeps this off remote execution — confirm.
macos_unit_test(
    name = "macos_conv_test",
    minimum_os_version = "10.13",
    tags = ["local"],
    deps = [":conv_test_lib"],
)
cc_library(
name = "custom_registry",
srcs = ["custom_registry.cc"],

View File

@ -40,6 +40,9 @@ namespace gpu {
namespace metal {
enum class WeightsUploadType {
PRIVATE_MEM_SIMD8_BROADCAST,
PRIVATE_MEM_SIMD16_BROADCAST,
PRIVATE_MEM_SIMD32_BROADCAST,
LOCAL_MEM_BY_THREADS,
GLOBAL_MEM,
CONSTANT_MEM,
@ -206,6 +209,25 @@ std::string GenerateConvolution(const ConvParams& params) {
const int local_mem_size =
params.block_size.z * 4 * params.src_depth_loop_size;
const bool use_simd_broadcast =
params.weights_upload_type ==
WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST ||
params.weights_upload_type ==
WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST ||
params.weights_upload_type ==
WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST;
int simd_size = 1;
if (params.weights_upload_type ==
WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST) {
simd_size = 8;
} else if (params.weights_upload_type ==
WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST) {
simd_size = 16;
} else if (params.weights_upload_type ==
WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST) {
simd_size = 32;
}
const bool use_filters_constants =
!params.need_dst_loop && !params.need_src_loop && params.x_kernel_is_1 &&
params.y_kernel_is_1;
@ -228,16 +250,19 @@ struct uniforms {
$0
kernel void ComputeFunction(
$1
uint tid[[thread_index_in_threadgroup]],
uint3 group_id[[threadgroup_position_in_grid]],
uint3 tid3d[[thread_position_in_threadgroup]],
uint3 ugid[[thread_position_in_grid]])
{
$1
uint tid[[thread_index_in_threadgroup]],
uint3 group_id[[threadgroup_position_in_grid]],
uint3 tid3d[[thread_position_in_threadgroup]],
)";
if (use_simd_broadcast) {
c += " uint simd_id[[thread_index_in_simdgroup]],\n";
}
c += " uint3 ugid[[thread_position_in_grid]]){\n";
c += GlobalIdsGen(ids_params);
c += " if (Z >= params.dst_size.w) return;\n";
if (!use_local_mem && !params.linear_whs) {
bool late_xy_check = use_local_mem || use_simd_broadcast;
if (!late_xy_check && !params.linear_whs) {
c += " if (X >= params.dst_size.x || Y >= params.dst_size.y) return;\n";
}
for (int z = 0; z < params.block_size.z; ++z) {
@ -377,6 +402,20 @@ kernel void ComputeFunction(
/*global_offset_name*/ "", "tid",
total_work_items, local_mem_size);
c += " BARRIER(mem_flags::mem_threadgroup);\n";
} else if (use_simd_broadcast) {
int parts = local_mem_size / simd_size;
int reminder = local_mem_size % simd_size;
for (int i = 0; i < parts; ++i) {
c += " FLT4 simd_w" + std::to_string(i) + " = tmp[simd_id + " +
std::to_string(i * simd_size) + "];\n";
}
if (reminder) {
c += " FLT4 simd_w" + std::to_string(parts) + ";\n";
c += " if (simd_id < " + std::to_string(reminder) + ") {\n";
c += " simd_w" + std::to_string(parts) + " = tmp[simd_id + " +
std::to_string(parts * simd_size) + "];\n";
c += " }\n";
}
}
auto declare_src = [&]() {
for (int y = 0; y < params.block_size.y; ++y) {
@ -418,6 +457,12 @@ kernel void ComputeFunction(
std::to_string(z) + std::to_string(y) + std::to_string(x);
std::string f_val =
name + "[" + std::to_string(z * 4 + ch + offset) + "]";
if (use_simd_broadcast) {
int simd_id = (z * 4 + ch + offset) / simd_size;
int thread_id = (z * 4 + ch + offset) % simd_size;
f_val = "simd_broadcast(simd_w" + std::to_string(simd_id) + ", " +
std::to_string(thread_id) + "u)";
}
std::string s_val = "src" + s_id;
std::string r_val = "r" + r_id;
if (params.weight_layout == WeightsInnerBlockLayout::O4I4) {
@ -458,7 +503,7 @@ kernel void ComputeFunction(
c += " } while (y < params.kernel_dilation.y);\n";
}
if (use_local_mem && !params.linear_whs) {
if (late_xy_check && !params.linear_whs) {
c += " if (X >= params.dst_size.x || Y >= params.dst_size.y) return;\n";
}
@ -872,15 +917,22 @@ ConvParams GetConvParamsForA9AndHigher(const AppleGPUInfo& apple_info,
ConvParams GetConvParamsForIntel(const Convolution2DAttributes& attr,
const RuntimeOptions& options,
const BHWC& dst_shape) {
const int dst_slices = IntegralDivideRoundUp(dst_shape.c, 4);
const int src_slices = IntegralDivideRoundUp(attr.weights.shape.i, 4);
ConvParams params;
params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST;
params.x_kernel_is_1 = IsKernelXIs1(attr);
params.y_kernel_is_1 = IsKernelYIs1(attr);
params.src_depth_loop_size = 1;
params.linear_wh = false;
params.linear_whs = false;
params.work_group_launch_order = int3(2, 0, 1);
params.block_size = int3(1, 2, 4);
params.block_size = int3(1, 1, 1);
if (dst_slices % 4 == 0 || dst_slices >= 8) {
params.block_size.z = 4;
} else if (dst_slices % 2 == 0 || dst_slices >= 4) {
params.block_size.z = 2;
}
params.work_group_size = int3(8, 2, 1);
if (options.storage_precision == RuntimeOptions::Precision::FP16 &&
options.accumulator_precision == RuntimeOptions::Precision::FP32) {
@ -889,9 +941,12 @@ ConvParams GetConvParamsForIntel(const Convolution2DAttributes& attr,
params.weight_layout = WeightsInnerBlockLayout::I4O4;
}
if (src_slices % 2 == 0) {
params.src_depth_loop_size = 2;
}
int g1 = GetGroupsCount(dst_shape, params.work_group_size, params.block_size);
int g2 = GetGroupsCountForLinearWH(dst_shape, {16, 1, 1}, params.block_size);
int g3 = GetGroupsCountForLinearWHS(dst_shape, {16, 1, 1}, params.block_size);
if (g2 < g1) {
params.linear_wh = true;
@ -899,15 +954,6 @@ ConvParams GetConvParamsForIntel(const Convolution2DAttributes& attr,
params.work_group_launch_order = int3(1, 0, 2);
}
float precise_threshold = 2.0f;
float precise_ratio = static_cast<float>(g2) / static_cast<float>(g3);
if (precise_ratio > precise_threshold) {
params.linear_wh = false;
params.linear_whs = true;
params.work_group_size = int3(16, 1, 1);
params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
}
return params;
}
@ -1073,15 +1119,33 @@ std::vector<ComputeTaskDescriptorPtr> ConvolutionWino4x4To6x6(
params.different_weights_for_height = true;
params.x_kernel_is_1 = true;
params.y_kernel_is_1 = true;
params.weight_layout = WeightsInnerBlockLayout::O4I4;
if (device_info.apple_info.IsLocalMemoryPreferredOverGlobal()) {
params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
params.work_group_size = int3(32, 1, 1);
params.block_size = int3(4, 1, 4);
} else {
if (device_info.IsAppleGPU()) {
params.weight_layout = WeightsInnerBlockLayout::O4I4;
if (device_info.apple_info.IsLocalMemoryPreferredOverGlobal()) {
params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
params.work_group_size = int3(32, 1, 1);
params.block_size = int3(4, 1, 4);
} else {
params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
params.work_group_size = int3(8, 4, 1);
params.block_size = int3(4, 1, 4);
}
} else if (device_info.IsIntelGPU()) {
params.weight_layout = WeightsInnerBlockLayout::I4O4;
params.weights_upload_type =
WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST;
params.work_group_size = int3(16, 1, 1);
params.block_size = int3(1, 1, 4);
} else if (device_info.IsAMDGPU()) {
params.weight_layout = WeightsInnerBlockLayout::I4O4;
params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
params.work_group_size = int3(8, 4, 1);
params.block_size = int3(4, 1, 4);
params.work_group_size = int3(32, 1, 1);
params.block_size = int3(2, 1, 4);
} else {
params.weight_layout = WeightsInnerBlockLayout::I4O4;
params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
params.work_group_size = int3(32, 1, 1);
params.block_size = int3(2, 1, 4);
}
auto desc = std::make_shared<ComputeTaskDescriptor>();