Moved the work-group size selection function to util.

Pooling now uses this function for better performance.

PiperOrigin-RevId: 242153291
A. Unique TensorFlower 2019-04-05 10:53:58 -07:00 committed by TensorFlower Gardener
parent 0d4d88c051
commit 0fdb21c045
5 changed files with 130 additions and 47 deletions
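
Illustrative only: a minimal sketch of the dispatch pattern that the resize_functions updated below (in pooling.cc and reshape.cc) follow. The ComputeDispatch helper name is hypothetical and not part of this change; it assumes the headers shown in the diffs.

#include <utility>

#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"

namespace tflite {
namespace gpu {
namespace metal {

// Hypothetical helper mirroring the updated resize_function pattern.
std::pair<uint3, uint3> ComputeDispatch(const BHWC& dst_shape) {
  // One grid point per output (x, y) position and per slice of 4 channels.
  const uint3 grid(dst_shape.w, dst_shape.h,
                   IntegralDivideRoundUp(dst_shape.c, 4));
  const uint3 group_size = GetWorkGroupSizeForGrid(grid);
  // E.g. dst_shape with w=17, h=13, c=40 gives grid (17, 13, 10),
  // group_size (8, 8, 1) and a group count of (3, 2, 10); the dispatch then
  // spans 24x16x10 threads, so the kernel must bound-check all three axes.
  const uint3 group_count(IntegralDivideRoundUp(grid.x, group_size.x),
                          IntegralDivideRoundUp(grid.y, group_size.y),
                          IntegralDivideRoundUp(grid.z, group_size.z));
  return std::make_pair(group_size, group_count);
}

}  // namespace metal
}  // namespace gpu
}  // namespace tflite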

tensorflow/lite/delegates/gpu/metal/kernels/BUILD

@@ -173,6 +173,7 @@ cc_library(
srcs = ["pooling.cc"],
hdrs = ["pooling.h"],
deps = [
":util",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:shape",
@@ -219,6 +220,7 @@ cc_library(
srcs = ["reshape.cc"],
hdrs = ["reshape.h"],
deps = [
":util",
"//tensorflow/lite/delegates/gpu/common:model",
"//tensorflow/lite/delegates/gpu/common:operations",
"//tensorflow/lite/delegates/gpu/common:shape",
@@ -291,3 +293,12 @@ cc_library(
"@com_google_absl//absl/strings",
],
)
cc_library(
name = "util",
srcs = ["util.cc"],
hdrs = ["util.h"],
deps = [
"//tensorflow/lite/delegates/gpu/common:types",
],
)

tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc

@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"
namespace tflite {
namespace gpu {
@@ -40,8 +41,8 @@ std::string GetMaxPoolingCode(const HW& kernel_size) {
constant int window_w = $0;
constant int window_h = $1;
struct uniforms {
int2 src_size;
int2 dst_size;
int4 src_size;
int4 dst_size;
int2 stride;
int2 offset;
};
@@ -51,7 +52,8 @@ std::string GetMaxPoolingCode(const HW& kernel_size) {
$$1
uint3 gid[[thread_position_in_grid]]) {
if (static_cast<int>(gid.x) >= params.dst_size.x ||
static_cast<int>(gid.y) >= params.dst_size.y) {
static_cast<int>(gid.y) >= params.dst_size.y ||
static_cast<int>(gid.z) >= params.dst_size.z) {
return;
}
@@ -84,8 +86,8 @@ std::string GetMaxPoolingIndicesCode(const HW& kernel_size) {
constant int window_w = $0;
constant int window_h = $1;
struct uniforms {
int2 src_size;
int2 dst_size;
int4 src_size;
int4 dst_size;
int2 stride;
int2 offset;
};
@@ -95,7 +97,8 @@ std::string GetMaxPoolingIndicesCode(const HW& kernel_size) {
$$1
uint3 gid[[thread_position_in_grid]]) {
if (static_cast<int>(gid.x) >= params.dst_size.x ||
static_cast<int>(gid.y) >= params.dst_size.y) {
static_cast<int>(gid.y) >= params.dst_size.y ||
static_cast<int>(gid.z) >= params.dst_size.z) {
return;
}
@@ -147,8 +150,8 @@ std::string GetAveragePoolingCode(const HW& kernel_size) {
constant int window_h = $1;
constant float multiplier = $2;
struct uniforms {
int2 src_size;
int2 dst_size;
int4 src_size;
int4 dst_size;
int2 stride;
int2 offset;
};
@@ -158,7 +161,8 @@ std::string GetAveragePoolingCode(const HW& kernel_size) {
uint tid[[thread_index_in_threadgroup]],
uint3 gid[[thread_position_in_grid]]) {
if (static_cast<int>(gid.x) >= params.dst_size.x ||
static_cast<int>(gid.y) >= params.dst_size.y) {
static_cast<int>(gid.y) >= params.dst_size.y ||
static_cast<int>(gid.z) >= params.dst_size.z) {
return;
}
@@ -219,8 +223,12 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
std::vector<int> uniform_params = {
dimension.w,
dimension.h,
IntegralDivideRoundUp(dimension.c, 4),
dimension.w * dimension.h,
output_dimension.w,
output_dimension.h,
IntegralDivideRoundUp(dimension.c, 4),
output_dimension.w * output_dimension.h,
params.strides.w,
params.strides.h,
params.padding.prepended.w,
@@ -230,14 +238,14 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
}},
};
desc->resize_function = [input_id,
params](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size{16, 16, 1};
const auto& src_shape = buffers.find(input_id)->second;
BHWC dst_shape = CalculateOutputShape(src_shape, params);
int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
int groups_z = IntegralDivideRoundUp(dst_shape.c, 4);
desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
BHWC dst_shape = buffers.find(output_id)->second;
const uint3 grid =
uint3(dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};

tensorflow/lite/delegates/gpu/metal/kernels/reshape.cc

@@ -27,31 +27,12 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"
namespace tflite {
namespace gpu {
namespace metal {
namespace {
uint GetBestSize(int grid_size) {
if (grid_size % 8 == 0 || grid_size % 8 >= 4 || grid_size >= 16) {
return 8;
} else if (grid_size % 4 == 0 || grid_size % 4 >= 2 || grid_size >= 8) {
return 4;
} else if (grid_size % 2 == 0 || grid_size >= 4) {
return 2;
} else {
return 1;
}
}
uint3 GetWorkGroupSize(const BHWC& dst_shape) {
uint x_size = GetBestSize(dst_shape.w);
uint y_size = GetBestSize(dst_shape.h);
uint z_size = std::max(1u, 32u / (x_size * y_size));
return {x_size, y_size, z_size};
}
std::string GetReshapeCode() {
std::string code = R"(
#include <metal_stdlib>
@@ -177,11 +158,12 @@ std::vector<ComputeTaskDescriptorPtr> Reshape(int id, ValueId input_id,
};
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size = GetWorkGroupSize(attr.new_shape);
int groups_x = IntegralDivideRoundUp(attr.new_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(attr.new_shape.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(attr.new_shape.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};
@@ -235,11 +217,12 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
};
desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
const uint3 groups_size = GetWorkGroupSize(attr.new_shape);
int groups_x = IntegralDivideRoundUp(attr.new_shape.w, groups_size.x);
int groups_y = IntegralDivideRoundUp(attr.new_shape.h, groups_size.y);
const int dst_layers = IntegralDivideRoundUp(attr.new_shape.c, 4);
int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
IntegralDivideRoundUp(attr.new_shape.c, 4));
const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
};

tensorflow/lite/delegates/gpu/metal/kernels/util.cc

@@ -0,0 +1,47 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"
namespace tflite {
namespace gpu {
namespace metal {
namespace {
unsigned int GetOptimalSize(unsigned int grid_size) {
if (grid_size % 8 == 0 || grid_size % 8 >= 4 || grid_size >= 16) {
return 8;
}
if (grid_size % 4 == 0 || grid_size % 4 >= 2 || grid_size >= 8) {
return 4;
}
if (grid_size % 2 == 0 || grid_size >= 4) {
return 2;
}
return 1;
}
} // namespace
uint3 GetWorkGroupSizeForGrid(const uint3& grid_size) {
unsigned int x_size = GetOptimalSize(grid_size.x);
unsigned int y_size = GetOptimalSize(grid_size.y);
unsigned int z_size = std::max(1u, 32u / (x_size * y_size));
return {x_size, y_size, z_size};
}
} // namespace metal
} // namespace gpu
} // namespace tflite
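
For illustration, a few hand-checked values of the heuristic above, showing how it splits the 32-thread budget across the grid dimensions (the CheckWorkGroupHeuristic helper is hypothetical, not part of this change):

#include <cassert>

#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"

void CheckWorkGroupHeuristic() {
  using tflite::gpu::uint3;
  using tflite::gpu::metal::GetWorkGroupSizeForGrid;
  // Wide, shallow grid: 8x8 in xy leaves nothing for z.
  const uint3 a = GetWorkGroupSizeForGrid(uint3(17, 13, 10));
  assert(a.x == 8 && a.y == 8 && a.z == 1);
  // Small xy extents fall back to 4 and 8 respectively.
  const uint3 b = GetWorkGroupSizeForGrid(uint3(3, 5, 8));
  assert(b.x == 4 && b.y == 8 && b.z == 1);
  // Skinny but deep grid: the whole budget goes to z.
  const uint3 c = GetWorkGroupSizeForGrid(uint3(1, 1, 64));
  assert(c.x == 1 && c.y == 1 && c.z == 32);
}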

tensorflow/lite/delegates/gpu/metal/kernels/util.h

@@ -0,0 +1,34 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_UTIL_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_UTIL_H_
#include "tensorflow/lite/delegates/gpu/common/types.h"
namespace tflite {
namespace gpu {
namespace metal {
// Returns a work-group size that tries to cover the grid optimally.
// If you use the work-group size generated by this method, you MUST check
// all three dimensions of the thread position against the grid bounds in
// your kernel.
uint3 GetWorkGroupSizeForGrid(const uint3& grid_size);
} // namespace metal
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_UTIL_H_
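
The check that comment mandates is the same three-axis guard the updated pooling kernels above emit; any kernel dispatched with this work-group size needs an equivalent early-out (here params.dst_size stands for whatever uniform carries the grid extents):

  if (static_cast<int>(gid.x) >= params.dst_size.x ||
      static_cast<int>(gid.y) >= params.dst_size.y ||
      static_cast<int>(gid.z) >= params.dst_size.z) {
    return;
  }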