Optimized performance of Mean(OpenCL).

Removed Tuning.

PiperOrigin-RevId: 292206606
Change-Id: I3795a9f05177a46d5aa72f6a302aedeeb53293db
Raman Sarokin 2020-01-29 13:29:37 -08:00 committed by TensorFlower Gardener
parent d8ec57451f
commit 720b16121e
3 changed files with 52 additions and 21 deletions

@@ -858,6 +858,7 @@ cc_library(
         "//tensorflow/lite/delegates/gpu/cl:tensor",
         "//tensorflow/lite/delegates/gpu/common:status",
         "//tensorflow/lite/delegates/gpu/common:types",
+        "//tensorflow/lite/delegates/gpu/common:util",
     ],
 )

@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
 #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/common/util.h"
 
 namespace tflite {
 namespace gpu {
@@ -28,7 +29,8 @@ namespace {
 
 std::string GetMeanKernelCode(
     const OperationDef& op_def,
-    const std::vector<ElementwiseOperation*>& linked_operations) {
+    const std::vector<ElementwiseOperation*>& linked_operations,
+    const int3& work_group_size) {
   TensorCodeGenerator src_tensor(
       "src_data", WHSPoint{"src_size.x", "src_size.y", "src_size.z"},
       op_def.src_tensors[0]);
@@ -36,24 +38,50 @@ std::string GetMeanKernelCode(
       op_def.dst_tensors[0]);
 
   std::string c = GetCommonDefines(op_def.precision);
+  const std::string wg_x = std::to_string(work_group_size.x);
+  const std::string wg_y = std::to_string(work_group_size.y);
   c += "__kernel void main_function(\n";
   c += src_tensor.GetDeclaration(AccessType::READ);
   c += GetArgsDeclaration(linked_operations);
   c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
-  c += "    int4 src_size\n";
+  c += "    int4 src_size,\n";
+  c += "    float2 inv_multipliers\n";
   c += ") {\n";
-  c += "  int X = get_global_id(0);\n";
-  c += "  int Y = get_global_id(1);\n";
+  c += "  __local float4 accum[" +
+       std::to_string(work_group_size.x * work_group_size.y) + "];\n";
+  c += "  int local_x = get_local_id(0);\n";
+  c += "  int local_y = get_local_id(1);\n";
+  c += "  int local_id = local_y * " + wg_x + " + local_x;\n";
   c += "  int S = get_global_id(2);\n";
-  c += "  if (X >= 1 || Y >= 1 || S >= src_size.z) return;\n";
-  c += "  float4 sum = (float4)(0.0f);\n";
-  c += "  for (int y = 0; y < src_size.y; ++y) {\n";
-  c += "    for (int x = 0; x < src_size.x; ++x) {\n";
-  c += "      sum += " + src_tensor.ReadAsFloatWHS("x", "y", "S") + ";\n";
+  c += "  if (S >= src_size.z) return;\n";
+  c += "  accum[local_id] = (float4)(0.0f);\n";
+  c += "  for (int s_y = local_y; s_y < src_size.y; s_y += " + wg_y + ") {\n";
+  c += "    for (int s_x = local_x; s_x < src_size.x; s_x += " + wg_x + ") {\n";
+  c += "      accum[local_id] += " +
+       src_tensor.ReadAsFloatWHS("s_x", "s_y", "S") + ";\n";
   c += "    }\n";
   c += "  }\n";
-  c += "  sum /= (float)(src_size.x * src_size.y);\n";
-  c += "  FLT4 result = TO_FLT4(sum);\n";
+  c += "  accum[local_id] *= inv_multipliers.x;\n";
+  c += "  barrier(CLK_LOCAL_MEM_FENCE);\n";
+  const int total_size = work_group_size.x * work_group_size.y;
+  int offset = 1;
+  int reminder = total_size / 4;
+  for (; reminder >= 8; reminder /= 4, offset *= 4) {
+    c += "  if (local_id < " + std::to_string(reminder) + ") {\n";
+    c += "    int t = local_id * " + std::to_string(offset * 4) + ";\n";
+    c += "    float4 sum = accum[t + " + std::to_string(offset) + "];\n";
+    c += "    sum += accum[t + " + std::to_string(offset * 2) + "];\n";
+    c += "    sum += accum[t + " + std::to_string(offset * 3) + "];\n";
+    c += "    accum[t] += sum;\n";
+    c += "  }\n";
+    c += "  barrier(CLK_LOCAL_MEM_FENCE);\n";
+  }
+  c += "  float4 sum = accum[0];\n";
+  reminder *= 4;
+  for (int i = 1; i < reminder; ++i) {
+    c += "  sum += accum[" + std::to_string(offset * i) + "];\n";
+  }
+  c += "  FLT4 result = TO_FLT4(sum * inv_multipliers.y);\n";
   c += PostProcess(linked_operations, {"result", "0", "0", "S"});
   c += "  " + dst_tensor.WriteWHS("result", "0", "0", "S");
   c += "}\n";
@ -76,7 +104,8 @@ Mean& Mean::operator=(Mean&& operation) {
} }
Status Mean::Compile(const CreationContext& creation_context) { Status Mean::Compile(const CreationContext& creation_context) {
const auto code = GetMeanKernelCode(definition_, linked_operations_); const auto code =
GetMeanKernelCode(definition_, linked_operations_, work_group_size_);
return creation_context.cache->GetOrCreateCLKernel( return creation_context.cache->GetOrCreateCLKernel(
code, "main_function", *creation_context.context, code, "main_function", *creation_context.context,
*creation_context.device, &kernel_); *creation_context.device, &kernel_);
@@ -88,21 +117,20 @@ Status Mean::BindArguments() {
   RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_));
   RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting()));
   RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWBatchedHSB()));
+  const double total_size = src_[0]->Width() * src_[0]->Height();
+  const double size_0 = work_group_size_.x * work_group_size_.y;
+  const double size_1 = total_size / size_0;
+  RETURN_IF_ERROR(kernel_.SetBytesAuto(float2(1.0 / size_1, 1.0 / size_0)));
   return OkStatus();
 }
 
 int3 Mean::GetGridSize() const {
-  const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
-  const int grid_y = dst_[0]->Height();
+  const int grid_x = work_group_size_.x * dst_[0]->Batch();
+  const int grid_y = work_group_size_.y;
   const int grid_z = dst_[0]->Slices();
   return int3(grid_x, grid_y, grid_z);
 }
 
-Status Mean::Tune(const TuningParameters& params) {
-  RETURN_IF_ERROR(BindArguments());
-  return GetBestWorkGroup(params, kernel_, GetGridSize(), &work_group_size_);
-}
-
 Status Mean::AddToQueue(CLCommandQueue* queue) {
   RETURN_IF_ERROR(BindArguments());
   return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);

@@ -31,7 +31,6 @@ class Mean : public GPUOperation {
   Mean() = default;
   explicit Mean(const OperationDef& definition) : GPUOperation(definition) {}
   Status AddToQueue(CLCommandQueue* queue) override;
-  Status Tune(const TuningParameters& params) override;
 
   Status Compile(const CreationContext& creation_context) override;
 
@@ -45,7 +44,10 @@ class Mean : public GPUOperation {
   Status BindArguments();
   int3 GetGridSize() const;
 
   CLKernel kernel_;
-  int3 work_group_size_ = int3(1, 1, 1);
+
+  // must be: (x * y) % 4 = 0;
+  // must be: z = 1;
+  int3 work_group_size_ = int3(16, 16, 1);
 };
 
 Mean CreateMean(const OperationDef& definition);