Moved the function for better work-group picking to util.
Pooling now uses this function for better performance.

PiperOrigin-RevId: 242153291

parent 0d4d88c051
commit 0fdb21c045
tensorflow/lite/delegates/gpu/metal/kernels/BUILD
@@ -173,6 +173,7 @@ cc_library(
     srcs = ["pooling.cc"],
     hdrs = ["pooling.h"],
     deps = [
+        ":util",
         "//tensorflow/lite/delegates/gpu/common:model",
         "//tensorflow/lite/delegates/gpu/common:operations",
         "//tensorflow/lite/delegates/gpu/common:shape",
@@ -219,6 +220,7 @@ cc_library(
     srcs = ["reshape.cc"],
     hdrs = ["reshape.h"],
     deps = [
+        ":util",
         "//tensorflow/lite/delegates/gpu/common:model",
         "//tensorflow/lite/delegates/gpu/common:operations",
         "//tensorflow/lite/delegates/gpu/common:shape",
@@ -291,3 +293,12 @@ cc_library(
         "@com_google_absl//absl/strings",
     ],
 )
+
+cc_library(
+    name = "util",
+    srcs = ["util.cc"],
+    hdrs = ["util.h"],
+    deps = [
+        "//tensorflow/lite/delegates/gpu/common:types",
+    ],
+)
tensorflow/lite/delegates/gpu/metal/kernels/pooling.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/shape.h"
 #include "tensorflow/lite/delegates/gpu/common/util.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
+#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"
 
 namespace tflite {
 namespace gpu {
@@ -40,8 +41,8 @@ std::string GetMaxPoolingCode(const HW& kernel_size) {
     constant int window_w = $0;
     constant int window_h = $1;
     struct uniforms {
-      int2 src_size;
-      int2 dst_size;
+      int4 src_size;
+      int4 dst_size;
       int2 stride;
       int2 offset;
     };
@@ -51,7 +52,8 @@ std::string GetMaxPoolingCode(const HW& kernel_size) {
         $$1
         uint3 gid[[thread_position_in_grid]]) {
       if (static_cast<int>(gid.x) >= params.dst_size.x ||
-          static_cast<int>(gid.y) >= params.dst_size.y) {
+          static_cast<int>(gid.y) >= params.dst_size.y ||
+          static_cast<int>(gid.z) >= params.dst_size.z) {
         return;
       }
 
@@ -84,8 +86,8 @@ std::string GetMaxPoolingIndicesCode(const HW& kernel_size) {
     constant int window_w = $0;
     constant int window_h = $1;
     struct uniforms {
-      int2 src_size;
-      int2 dst_size;
+      int4 src_size;
+      int4 dst_size;
       int2 stride;
       int2 offset;
     };
@@ -95,7 +97,8 @@ std::string GetMaxPoolingIndicesCode(const HW& kernel_size) {
         $$1
         uint3 gid[[thread_position_in_grid]]) {
       if (static_cast<int>(gid.x) >= params.dst_size.x ||
-          static_cast<int>(gid.y) >= params.dst_size.y) {
+          static_cast<int>(gid.y) >= params.dst_size.y ||
+          static_cast<int>(gid.z) >= params.dst_size.z) {
         return;
       }
 
@@ -147,8 +150,8 @@ std::string GetAveragePoolingCode(const HW& kernel_size) {
     constant int window_h = $1;
     constant float multiplier = $2;
     struct uniforms {
-      int2 src_size;
-      int2 dst_size;
+      int4 src_size;
+      int4 dst_size;
       int2 stride;
       int2 offset;
     };
@@ -158,7 +161,8 @@ std::string GetAveragePoolingCode(const HW& kernel_size) {
         uint tid[[thread_index_in_threadgroup]],
         uint3 gid[[thread_position_in_grid]]) {
       if (static_cast<int>(gid.x) >= params.dst_size.x ||
-          static_cast<int>(gid.y) >= params.dst_size.y) {
+          static_cast<int>(gid.y) >= params.dst_size.y ||
+          static_cast<int>(gid.z) >= params.dst_size.z) {
         return;
       }
 
@@ -219,8 +223,12 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
       std::vector<int> uniform_params = {
           dimension.w,
           dimension.h,
+          IntegralDivideRoundUp(dimension.c, 4),
+          dimension.w * dimension.h,
           output_dimension.w,
          output_dimension.h,
+          IntegralDivideRoundUp(dimension.c, 4),
+          output_dimension.w * output_dimension.h,
           params.strides.w,
           params.strides.h,
           params.padding.prepended.w,
@@ -230,14 +238,14 @@ ComputeTaskDescriptorPtr PoolingInternal(int id, ValueId input_id,
       }},
   };
 
-  desc->resize_function = [input_id,
-                           params](const std::map<ValueId, BHWC>& buffers) {
-    const uint3 groups_size{16, 16, 1};
-    const auto& src_shape = buffers.find(input_id)->second;
-    BHWC dst_shape = CalculateOutputShape(src_shape, params);
-    int groups_x = IntegralDivideRoundUp(dst_shape.w, groups_size.x);
-    int groups_y = IntegralDivideRoundUp(dst_shape.h, groups_size.y);
-    int groups_z = IntegralDivideRoundUp(dst_shape.c, 4);
+  desc->resize_function = [output_id](const std::map<ValueId, BHWC>& buffers) {
+    BHWC dst_shape = buffers.find(output_id)->second;
+    const uint3 grid =
+        uint3(dst_shape.w, dst_shape.h, IntegralDivideRoundUp(dst_shape.c, 4));
+    const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
+    int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
+    int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
+    int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
     return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
   };
 
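To make the new dispatch arithmetic concrete, here is a minimal standalone sketch of what the rewritten resize_function computes for a hypothetical destination shape (the 17x13x24 shape is invented for illustration; DivideRoundUp mirrors the assumed ceil-division semantics of IntegralDivideRoundUp, and GetOptimalSize is copied from util.cc below):

#include <algorithm>
#include <cstdio>

// Mirrors IntegralDivideRoundUp from gpu/common/util.h (assumed to be plain
// ceiling division).
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

// Copied from util.cc in this commit.
unsigned int GetOptimalSize(unsigned int grid_size) {
  if (grid_size % 8 == 0 || grid_size % 8 >= 4 || grid_size >= 16) return 8;
  if (grid_size % 4 == 0 || grid_size % 4 >= 2 || grid_size >= 8) return 4;
  if (grid_size % 2 == 0 || grid_size >= 4) return 2;
  return 1;
}

int main() {
  // Hypothetical pooling output: 17x13 spatial, 24 channels -> 6 depth slices.
  const int grid_x = 17, grid_y = 13, grid_z = DivideRoundUp(24, 4);
  // GetWorkGroupSizeForGrid, inlined.
  const unsigned int x = GetOptimalSize(grid_x);       // 17 >= 16         -> 8
  const unsigned int y = GetOptimalSize(grid_y);       // 13 % 8 == 5 >= 4 -> 8
  const unsigned int z = std::max(1u, 32u / (x * y));  // 32 / 64 == 0     -> 1
  // Thread-group counts, exactly as in the new resize_function.
  std::printf("groups_size = (%u, %u, %u)\n", x, y, z);  // (8, 8, 1)
  std::printf("groups      = (%d, %d, %d)\n",
              DivideRoundUp(grid_x, x), DivideRoundUp(grid_y, y),
              DivideRoundUp(grid_z, z));                 // (3, 2, 6)
  return 0;
}

With these numbers the dispatch launches 3*2*6 groups of 8x8x1 threads, i.e. a 24x16x6 thread grid covering a 17x13x6 work grid. The surplus threads in x and y are why the bounds guards are mandatory, and shapes whose depth slices do not divide evenly produce surplus in z as well, hence the new gid.z check in the shaders above.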
tensorflow/lite/delegates/gpu/metal/kernels/reshape.cc
@@ -27,31 +27,12 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/types.h"
 #include "tensorflow/lite/delegates/gpu/common/util.h"
 #include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
+#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"
 
 namespace tflite {
 namespace gpu {
 namespace metal {
 namespace {
 
-uint GetBestSize(int grid_size) {
-  if (grid_size % 8 == 0 || grid_size % 8 >= 4 || grid_size >= 16) {
-    return 8;
-  } else if (grid_size % 4 == 0 || grid_size % 4 >= 2 || grid_size >= 8) {
-    return 4;
-  } else if (grid_size % 2 == 0 || grid_size >= 4) {
-    return 2;
-  } else {
-    return 1;
-  }
-}
-
-uint3 GetWorkGroupSize(const BHWC& dst_shape) {
-  uint x_size = GetBestSize(dst_shape.w);
-  uint y_size = GetBestSize(dst_shape.h);
-  uint z_size = std::max(1u, 32u / (x_size * y_size));
-  return {x_size, y_size, z_size};
-}
-
 std::string GetReshapeCode() {
   std::string code = R"(
     #include <metal_stdlib>
@@ -177,11 +158,12 @@ std::vector<ComputeTaskDescriptorPtr> Reshape(int id, ValueId input_id,
   };
 
   desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
-    const uint3 groups_size = GetWorkGroupSize(attr.new_shape);
-    int groups_x = IntegralDivideRoundUp(attr.new_shape.w, groups_size.x);
-    int groups_y = IntegralDivideRoundUp(attr.new_shape.h, groups_size.y);
-    const int dst_layers = IntegralDivideRoundUp(attr.new_shape.c, 4);
-    int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
+    const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
+                             IntegralDivideRoundUp(attr.new_shape.c, 4));
+    const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
+    int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
+    int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
+    int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
     return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
   };
 
@@ -235,11 +217,12 @@ std::vector<ComputeTaskDescriptorPtr> Reshapex4(int id, ValueId input_id,
   };
 
   desc->resize_function = [attr](const std::map<ValueId, BHWC>& buffers) {
-    const uint3 groups_size = GetWorkGroupSize(attr.new_shape);
-    int groups_x = IntegralDivideRoundUp(attr.new_shape.w, groups_size.x);
-    int groups_y = IntegralDivideRoundUp(attr.new_shape.h, groups_size.y);
-    const int dst_layers = IntegralDivideRoundUp(attr.new_shape.c, 4);
-    int groups_z = IntegralDivideRoundUp(dst_layers, groups_size.z);
+    const uint3 grid = uint3(attr.new_shape.w, attr.new_shape.h,
+                             IntegralDivideRoundUp(attr.new_shape.c, 4));
+    const uint3 groups_size = GetWorkGroupSizeForGrid(grid);
+    int groups_x = IntegralDivideRoundUp(grid.x, groups_size.x);
+    int groups_y = IntegralDivideRoundUp(grid.y, groups_size.y);
+    int groups_z = IntegralDivideRoundUp(grid.z, groups_size.z);
     return std::make_pair(groups_size, uint3{groups_x, groups_y, groups_z});
   };
 
tensorflow/lite/delegates/gpu/metal/kernels/util.cc (new file, 47 lines)
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/gpu/metal/kernels/util.h"
+
+namespace tflite {
+namespace gpu {
+namespace metal {
+namespace {
+
+unsigned int GetOptimalSize(unsigned int grid_size) {
+  if (grid_size % 8 == 0 || grid_size % 8 >= 4 || grid_size >= 16) {
+    return 8;
+  }
+  if (grid_size % 4 == 0 || grid_size % 4 >= 2 || grid_size >= 8) {
+    return 4;
+  }
+  if (grid_size % 2 == 0 || grid_size >= 4) {
+    return 2;
+  }
+  return 1;
+}
+
+}  // namespace
+
+uint3 GetWorkGroupSizeForGrid(const uint3& grid_size) {
+  unsigned int x_size = GetOptimalSize(grid_size.x);
+  unsigned int y_size = GetOptimalSize(grid_size.y);
+  unsigned int z_size = std::max(1u, 32u / (x_size * y_size));
+  return {x_size, y_size, z_size};
+}
+
+}  // namespace metal
+}  // namespace gpu
+}  // namespace tflite
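One property of this helper worth noting: because GetOptimalSize only returns 1, 2, 4, or 8, the z_size = max(1u, 32u / (x_size * y_size)) formula pins almost every work group at exactly 32 threads. A throwaway enumeration (illustrative only, not part of the commit) makes this visible:

#include <algorithm>
#include <cstdio>

int main() {
  // x and y can only be 1, 2, 4, or 8 -- the values GetOptimalSize returns.
  const unsigned int sizes[] = {1, 2, 4, 8};
  for (unsigned int x : sizes) {
    for (unsigned int y : sizes) {
      const unsigned int z = std::max(1u, 32u / (x * y));
      std::printf("x=%u y=%u -> z=%u (%u threads)\n", x, y, z, x * y * z);
    }
  }
  return 0;
}

Every combination lands on a 32-thread group except x = y = 8, where the clamp to z = 1 yields a 64-thread group.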
tensorflow/lite/delegates/gpu/metal/kernels/util.h (new file, 34 lines)
@@ -0,0 +1,34 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_UTIL_H_
+#define TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_UTIL_H_
+
+#include "tensorflow/lite/delegates/gpu/common/types.h"
+
+namespace tflite {
+namespace gpu {
+namespace metal {
+
+// Returns a work-group size that tries to cover the grid optimally.
+// If you use a work-group size generated by this method, you MUST check
+// all three thread dimensions against the grid bounds in your kernel.
+uint3 GetWorkGroupSizeForGrid(const uint3& grid_size);
+
+}  // namespace metal
+}  // namespace gpu
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_GPU_METAL_KERNELS_UTIL_H_
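The MUST in this comment is the contract the pooling shaders above now satisfy. For reference, this is the guard any kernel dispatched with a GetWorkGroupSizeForGrid()-derived work-group size needs, shown as a raw-string fragment of the kind the kernel generators embed (the guard text is copied from the pooling change; the kBoundsGuard name is invented for this sketch, and params/gid follow the naming in pooling.cc):

// Bounds guard for kernels dispatched with a work-group size from
// GetWorkGroupSizeForGrid(); body taken from the pooling shaders above.
const char* kBoundsGuard = R"(
  if (static_cast<int>(gid.x) >= params.dst_size.x ||
      static_cast<int>(gid.y) >= params.dst_size.y ||
      static_cast<int>(gid.z) >= params.dst_size.z) {
    return;
  }
)";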