Added support for reduction along any axis (besides Channels) for Mean in OpenCL.
PiperOrigin-RevId: 342984160 Change-Id: I8dbca35369c71e2cc69c02622dd758904330de66
commit 5fff6ad020 (parent 9da79c731c)
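For reviewers, a minimal usage sketch of the updated API (the storage type, precision, and shape below are illustrative, not part of this change; env_ is the test fixture from mean_test.cc): Mean is now built from MeanAttributes, so any combination of axes except Channels can be reduced, mirroring the updated test further down.

  // Illustrative only: mean over HEIGHT and WIDTH with the new signature.
  MeanAttributes attr;
  attr.dims.insert(Axis::HEIGHT);
  attr.dims.insert(Axis::WIDTH);

  OperationDef op_def;
  op_def.precision = CalculationsPrecision::F32;
  op_def.src_tensors.push_back({DataType::FLOAT32, TensorStorageType::BUFFER, Layout::HWC});
  op_def.dst_tensors.push_back({DataType::FLOAT32, TensorStorageType::BUFFER, Layout::HWC});

  // CreateMean now takes the attributes in addition to the definition.
  Mean operation = CreateMean(attr, op_def, env_.GetDevicePtr()->GetInfo());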
@@ -733,6 +733,7 @@ cc_library(
        ":work_group_picking",
        "//tensorflow/lite/delegates/gpu/cl:cl_kernel",
        "//tensorflow/lite/delegates/gpu/cl:tensor",
        "//tensorflow/lite/delegates/gpu/common:operations",
        "//tensorflow/lite/delegates/gpu/common:status",
        "//tensorflow/lite/delegates/gpu/common:types",
        "//tensorflow/lite/delegates/gpu/common:util",
@@ -25,28 +25,83 @@ limitations under the License.
namespace tflite {
namespace gpu {
namespace cl {

Mean::Mean(const OperationDef& definition, const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  // for workgroup size:
  // must be: (x * y) % 4 = 0;
  // must be: z = 1;
  work_group_size_ = int3(16, 16, 1);
  if (gpu_info.IsAdreno()) {
    if (gpu_info.adreno_info.IsAdreno3xx()) {
      work_group_size_ = int3(16, 8, 1);
namespace {
// total_wg_size is pot, dimensions is {1, 2, 3}
int3 GetWGSizeFromTotalSize(int total_wg_size, int dimensions) {
  if (dimensions == 1) {
    return {total_wg_size, 1, 1};
  } else if (dimensions == 2) {
    int3 wg_size = int3(1, 1, 1);
    while (total_wg_size != 1) {
      if (total_wg_size >= 4) {
        wg_size.x *= 2;
        wg_size.y *= 2;
        total_wg_size /= 4;
      } else {
        // total_wg_size == 2
        wg_size.x *= 2;
        total_wg_size /= 2;
      }
    }
    return wg_size;
  } else {
    // dimensions == 3
    int3 wg_size = int3(1, 1, 1);
    while (total_wg_size != 1) {
      if (total_wg_size >= 8) {
        wg_size.x *= 2;
        wg_size.y *= 2;
        wg_size.z *= 2;
        total_wg_size /= 8;
      } else if (total_wg_size == 4) {
        wg_size.x *= 2;
        wg_size.y *= 2;
        total_wg_size /= 4;
      } else {
        // total_wg_size == 2
        wg_size.x *= 2;
        total_wg_size /= 2;
      }
    }
    return wg_size;
  }
}

int GetWGTotalSize(const GpuInfo& gpu_info) {
  // total_wg_size must be power of 2 and >= 4;
  int total_wg_size = 256;
  if (gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx()) {
    total_wg_size = 128;
  }
  if (gpu_info.IsMali()) {
    const MaliInfo& mali_info = gpu_info.mali_info;
    if (mali_info.IsMaliT6xx() || mali_info.IsMaliT7xx() ||
        mali_info.IsMaliT8xx()) {
      work_group_size_ = int3(8, 4, 1);
      total_wg_size = 32;
    } else {
      work_group_size_ = int3(8, 8, 1);
      total_wg_size = 64;
    }
  }
  code_ = GetMeanKernelCode(definition_, work_group_size_);
  return total_wg_size;
}

}  // namespace

Mean::Mean(const MeanAttributes& attr, const OperationDef& definition,
           const GpuInfo& gpu_info)
    : GPUOperation(definition) {
  std::vector<Axis> ordered_axis_to_reduce;
  for (const auto& a :
       {Axis::CHANNELS, Axis::DEPTH, Axis::HEIGHT, Axis::WIDTH, Axis::BATCH}) {
    if (attr.dims.count(a)) {
      ordered_axis_to_reduce.push_back(a);
    }
  }
  int wg_dims = std::min(3, static_cast<int>(ordered_axis_to_reduce.size()));
  const int total_wg_size = GetWGTotalSize(gpu_info);
  work_group_size_ = GetWGSizeFromTotalSize(total_wg_size, wg_dims);
  code_ =
      GetMeanKernelCode(definition_, work_group_size_, ordered_axis_to_reduce);
}

Mean::Mean(Mean&& operation) : GPUOperation(std::move(operation)) {}
@@ -59,43 +114,138 @@ Mean& Mean::operator=(Mean&& operation) {
}

std::string Mean::GetMeanKernelCode(const OperationDef& op_def,
                                    const int3& work_group_size) {
                                    const int3& work_group_size,
                                    const std::vector<Axis>& axis_to_reduce) {
  AddSrcTensor("src_tensor", op_def.src_tensors[0]);
  AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
  args_.AddFloat("inv_multiplier_1");
  args_.AddFloat("inv_multiplier_2");

  std::set<Axis> axis_to_leave;
  const std::vector<Axis> all_axis = {Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH,
                                      Axis::CHANNELS, Axis::BATCH};
  for (const auto& a : all_axis) {
    if (op_def.dst_tensors[0].HasAxis(a)) {
      bool leave = true;
      for (const auto& a_to_reduce : axis_to_reduce) {
        if (a_to_reduce == a) {
          leave = false;
          break;
        }
      }
      if (leave) {
        axis_to_leave.insert(a);
      }
    }
  }
  int wg_dims = std::min(3, static_cast<int>(axis_to_reduce.size()));

  std::string c = GetCommonDefines(op_def.precision);
  const std::string wg_x = std::to_string(work_group_size.x);
  const std::string wg_y = std::to_string(work_group_size.y);
  const std::string wg_z = std::to_string(work_group_size.z);
  const int wg_total_size =
      work_group_size.x * work_group_size.y * work_group_size.z;
  c += "__kernel void main_function(\n";
  c += "$0) {\n";
  c += " __local float4 accum[" +
       std::to_string(work_group_size.x * work_group_size.y) + "];\n";
  c += " int local_x = get_local_id(0);\n";
  c += " int local_y = get_local_id(1);\n";
  c += " int local_id = local_y * " + wg_x + " + local_x;\n";
  if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) {
    c += " int linear_id_2 = get_global_id(2);\n";
    c += " int S = linear_id_2 / args.dst_tensor.Batch();\n";
    c += " int B = linear_id_2 % args.dst_tensor.Batch();\n";
    c += " args.dst_tensor.SetBatchRef(B);\n";
    c += " args.src_tensor.SetBatchRef(B);\n";
  } else {
    c += " int S = get_global_id(2);\n";
  c += " __local float4 accum[" + std::to_string(wg_total_size) + "];\n";
  if (wg_dims == 1) {
    c += " int local_x = get_local_id(0);\n";
    c += " int local_id = local_x;\n";
  } else if (wg_dims == 2) {
    c += " int local_x = get_local_id(0);\n";
    c += " int local_y = get_local_id(1);\n";
    c += " int local_id = local_y * " + wg_x + " + local_x;\n";
  } else if (wg_dims == 3) {
    c += " int local_x = get_local_id(0);\n";
    c += " int local_y = get_local_id(1);\n";
    c += " int local_z = get_local_id(2);\n";
    c += " int local_id = (local_z * " + wg_y + " + local_y) * " + wg_x +
         " + local_x;\n";
  }
  c += " if (S >= args.dst_tensor.Slices()) return;\n";
  if (axis_to_leave.count(Axis::WIDTH)) {
    if (axis_to_leave.count(Axis::BATCH)) {
      c += " int linear_id = get_group_id(0);\n";
      c += " int DST_X = linear_id / args.dst_tensor.Batch();\n";
      c += " int DST_B = linear_id % args.dst_tensor.Batch();\n";
    } else {
      c += " int DST_X = get_group_id(0);\n";
    }
  } else if (axis_to_leave.count(Axis::BATCH)) {
    c += " int DST_B = get_group_id(0);\n";
  }
  if (axis_to_leave.count(Axis::HEIGHT)) {
    if (axis_to_leave.count(Axis::DEPTH)) {
      c += " int linear_id = get_group_id(1);\n";
      c += " int DST_Y = linear_id % args.dst_tensor.Height();\n";
      c += " int DST_Z = linear_id / args.dst_tensor.Height();\n";
    } else {
      c += " int DST_Y = get_group_id(1);\n";
    }
  } else if (axis_to_leave.count(Axis::DEPTH)) {
    c += " int DST_Z = get_group_id(1);\n";
  }
  if (axis_to_leave.count(Axis::CHANNELS)) {
    c += " int DST_S = get_group_id(2);\n";
  }
  std::map<Axis, std::string> axis_to_selector = {
      {Axis::BATCH, "Batch()"}, {Axis::WIDTH, "Width()"},
      {Axis::HEIGHT, "Height()"}, {Axis::DEPTH, "Depth()"},
      {Axis::CHANNELS, "Slices()"},
  };
  std::map<Axis, std::string> axis_to_coord = {
      {Axis::BATCH, "B"}, {Axis::WIDTH, "X"}, {Axis::HEIGHT, "Y"},
      {Axis::DEPTH, "Z"}, {Axis::CHANNELS, "S"},
  };
  std::string dst_check;
  for (auto& axis : axis_to_leave) {
    if (!dst_check.empty()) {
      dst_check += " && ";
    }
    dst_check += "DST_" + axis_to_coord[axis] + " >= args.dst_tensor." +
                 axis_to_selector[axis];
  }
  c += " if (" + dst_check + ") return;\n";
  c += " accum[local_id] = (float4)(0.0f);\n";
  c += " for (int s_y = local_y; s_y < args.src_tensor.Height(); s_y += " +
       wg_y + ") {\n";
  c += " for (int s_x = local_x; s_x < args.src_tensor.Width(); s_x += " +
       wg_x + ") {\n";
  c += " accum[local_id] += args.src_tensor.Read<float>(s_x, s_y, S);\n";
  c += " }\n";
  c += " }\n";
  const std::vector<std::string> local_ids = {"local_x", "local_y", "local_z"};
  const std::vector<std::string> local_sizes = {wg_x, wg_y, wg_z};
  std::map<Axis, std::string> src_coords;
  for (const auto& a : all_axis) {
    if (op_def.dst_tensors[0].HasAxis(a)) {
      src_coords[a] = "DST_" + axis_to_coord[a];
    } else {
      src_coords[a] = "0";
    }
  }
  for (int i = 0; i < axis_to_reduce.size(); ++i) {
    const auto& axis = axis_to_reduce[i];
    const int index = axis_to_reduce.size() - 1 - i;
    const std::string first = index < wg_dims ? local_ids[index] : "0";
    const std::string step = index < wg_dims ? local_sizes[index] : "1";
    const std::string src_coord = "SRC_" + axis_to_coord[axis];
    src_coords[axis] = src_coord;
    c += " for (int " + src_coord + " = " + first + "; " + src_coord +
         " < args.src_tensor." + axis_to_selector[axis] + "; " + src_coord +
         " += " + step + ") {\n";
  }
  std::string src_coordinates;
  for (const auto& a : all_axis) {
    if (op_def.src_tensors[0].HasAxis(a)) {
      if (!src_coordinates.empty()) {
        src_coordinates += ", ";
      }
      src_coordinates += src_coords[a];
    }
  }
  c += " accum[local_id] += args.src_tensor.Read<float>(" + src_coordinates +
       ");\n";
  for (int i = 0; i < axis_to_reduce.size(); ++i) {
    c += " }\n";
  }
  c += " accum[local_id] *= args.inv_multiplier_1;\n";
  c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
  const int total_size = work_group_size.x * work_group_size.y;
  const int total_size =
      work_group_size.x * work_group_size.y * work_group_size.z;
  int offset = 1;
  int reminder = total_size / 4;
  for (; reminder >= 8; reminder /= 4, offset *= 4) {
@@ -114,29 +264,50 @@ std::string Mean::GetMeanKernelCode(const OperationDef& op_def,
    c += " sum += accum[" + std::to_string(offset * i) + "];\n";
  }
  c += " FLT4 result = TO_FLT4(sum * args.inv_multiplier_2);\n";
  c += " args.dst_tensor.Write(result, 0, 0, S);\n";
  std::string dst_coordinates;
  for (const auto& a : all_axis) {
    if (op_def.dst_tensors[0].HasAxis(a)) {
      if (!dst_coordinates.empty()) {
        dst_coordinates += ", ";
      }
      if (axis_to_leave.count(a)) {
        dst_coordinates += "DST_" + axis_to_coord[a];
      } else {
        dst_coordinates += "0";
      }
    }
  }
  c += " args.dst_tensor.Write(result, " + dst_coordinates + ");\n";
  c += "}\n";
  return c;
}

absl::Status Mean::BindArguments(ArgumentsBinder* args) {
  const double total_size = src_[0]->Width() * src_[0]->Height();
  const double size_0 = work_group_size_.x * work_group_size_.y;
  const double size_1 = total_size / size_0;
  const double total_src_elements = 1.0 * src_[0]->Batch() * src_[0]->Width() *
                                    src_[0]->Height() * src_[0]->Depth() *
                                    src_[0]->Channels();
  const double total_dst_elements = 1.0 * dst_[0]->Batch() * dst_[0]->Width() *
                                    dst_[0]->Height() * dst_[0]->Depth() *
                                    dst_[0]->Channels();
  const double reduction_size = total_src_elements / total_dst_elements;
  const double size_0 =
      work_group_size_.x * work_group_size_.y * work_group_size_.z;
  const double size_1 = reduction_size / size_0;
  RETURN_IF_ERROR(args->SetFloat("inv_multiplier_1", 1.0 / size_1));
  RETURN_IF_ERROR(args->SetFloat("inv_multiplier_2", 1.0 / size_0));
  return absl::OkStatus();
}

int3 Mean::GetGridSize() const {
  const int grid_x = work_group_size_.x;
  const int grid_y = work_group_size_.y;
  const int grid_z = dst_[0]->Slices() * dst_[0]->Batch();
  const int grid_x = work_group_size_.x * dst_[0]->Width() * dst_[0]->Batch();
  const int grid_y = work_group_size_.y * dst_[0]->Height() * dst_[0]->Depth();
  const int grid_z = work_group_size_.z * dst_[0]->Slices();
  return int3(grid_x, grid_y, grid_z);
}

Mean CreateMean(const OperationDef& definition, const GpuInfo& gpu_info) {
  return Mean(definition, gpu_info);
Mean CreateMean(const MeanAttributes& attr, const OperationDef& definition,
                const GpuInfo& gpu_info) {
  return Mean(attr, definition, gpu_info);
}

}  // namespace cl
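For reference, a worked sketch of the new normalization in BindArguments (the shapes are assumed for illustration): the per-thread accumulation loop divides by size_1 and the in-group tree reduction divides by size_0, so together they divide the sum by the full reduction size.

  // Assumed shapes: BHWC(1, 32, 32, 8) source, mean over HEIGHT and WIDTH,
  // BHWC(1, 1, 1, 8) destination.
  const double total_src_elements = 1.0 * 1 * 32 * 32 * 1 * 8;            // 8192
  const double total_dst_elements = 1.0 * 1 * 1 * 1 * 1 * 8;              // 8
  const double reduction_size = total_src_elements / total_dst_elements;  // 1024
  // Two reduction axes -> GetWGSizeFromTotalSize(256, 2) == int3(16, 16, 1).
  const double size_0 = 16 * 16 * 1;              // threads per work group: 256
  const double size_1 = reduction_size / size_0;  // elements per thread: 4
  // inv_multiplier_1 = 1 / size_1 = 0.25 and inv_multiplier_2 = 1 / 256,
  // so the accumulated sum ends up divided by 1024 overall.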
@@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
@@ -28,7 +29,8 @@ namespace cl {
class Mean : public GPUOperation {
 public:
  Mean() = default;
  Mean(const OperationDef& definition, const GpuInfo& gpu_info);
  Mean(const MeanAttributes& attr, const OperationDef& definition,
       const GpuInfo& gpu_info);

  void GetPossibleKernelWorkGroups(
      TuningType tuning_type, const GpuInfo& gpu_info,
@@ -47,10 +49,12 @@ class Mean : public GPUOperation {

 private:
  std::string GetMeanKernelCode(const OperationDef& op_def,
                                const int3& work_group_size);
                                const int3& work_group_size,
                                const std::vector<Axis>& axis_to_reduce);
};

Mean CreateMean(const OperationDef& definition, const GpuInfo& gpu_info);
Mean CreateMean(const MeanAttributes& attr, const OperationDef& definition,
                const GpuInfo& gpu_info);

}  // namespace cl
}  // namespace gpu
@@ -33,10 +33,13 @@ namespace gpu {
namespace cl {
namespace {

TEST_F(OpenCLOperationTest, Mean) {
TEST_F(OpenCLOperationTest, MeanHW) {
  TensorFloat32 src_tensor;
  src_tensor.shape = BHWC(1, 2, 2, 1);
  src_tensor.data = {1.0f, 2.0f, 3.0f, 4.0f};
  MeanAttributes attr;
  attr.dims.insert(Axis::HEIGHT);
  attr.dims.insert(Axis::WIDTH);

  for (auto storage : env_.GetSupportedStorages()) {
    for (auto precision : env_.GetSupportedPrecisions()) {
@@ -47,7 +50,7 @@ TEST_F(OpenCLOperationTest, Mean) {
      op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
      op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
      TensorFloat32 dst_tensor;
      Mean operation = CreateMean(op_def, env_.GetDevicePtr()->info_);
      Mean operation = CreateMean(attr, op_def, env_.GetDevicePtr()->GetInfo());
      ASSERT_OK(
          ExecuteGPUOperation(src_tensor, creation_context_,
                              absl::make_unique<Mean>(std::move(operation)),
@@ -150,10 +150,11 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def,
absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def,
                        const GpuInfo& gpu_info,
                        std::unique_ptr<GPUOperation>* ptr) {
  if (attr.dims != std::set<Axis>({Axis::HEIGHT, Axis::WIDTH})) {
    return absl::UnimplementedError("Mean operation supports only HW plane");
  if (attr.dims.find(Axis::CHANNELS) != attr.dims.end()) {
    return absl::UnimplementedError(
        "Mean operation doesn't support reduction in Channels dimension.");
  }
  Mean operation = CreateMean(op_def, gpu_info);
  Mean operation = CreateMean(attr, op_def, gpu_info);
  *ptr = absl::make_unique<Mean>(std::move(operation));
  return absl::OkStatus();
}
@@ -586,6 +586,15 @@ BHWC CalculateOutputShape(const BHWC& input, const MeanAttributes& attr) {
  return BHWC(b, h, w, c);
}

BHWDC CalculateOutputShape(const BHWDC& input, const MeanAttributes& attr) {
  const int b = attr.dims.find(Axis::BATCH) == attr.dims.end() ? input.b : 1;
  const int h = attr.dims.find(Axis::HEIGHT) == attr.dims.end() ? input.h : 1;
  const int w = attr.dims.find(Axis::WIDTH) == attr.dims.end() ? input.w : 1;
  const int d = attr.dims.find(Axis::DEPTH) == attr.dims.end() ? input.d : 1;
  const int c = attr.dims.find(Axis::CHANNELS) == attr.dims.end() ? input.c : 1;
  return BHWDC(b, h, w, d, c);
}

absl::Status CalculateOutputShape(const std::vector<BHWC>& input,
                                  const ConcatAttributes& attr,
                                  BHWC* output_shape) {
@@ -505,6 +505,9 @@ BHWC CalculateOutputShape(const BHWC& input,
// @return shape of a tensor after Mean operation is applied to the given input.
BHWC CalculateOutputShape(const BHWC& input, const MeanAttributes& attr);

// @return shape of a tensor after Mean operation is applied to the given input.
BHWDC CalculateOutputShape(const BHWDC& input, const MeanAttributes& attr);

struct ElementwiseAttributes {
  TensorOrScalar param;
  // For elementwise operation with 2 inputs op(A, B), runtime_tensor_is_second
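A small worked example of the new 5D shape helper (input values chosen for illustration): only the reduced axes collapse to 1.

  MeanAttributes attr;
  attr.dims = {Axis::HEIGHT, Axis::WIDTH};
  const BHWDC out = CalculateOutputShape(BHWDC(2, 8, 8, 4, 16), attr);
  // out == BHWDC(2, 1, 1, 4, 16); BATCH, DEPTH and CHANNELS stay untouched.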