Changed the signature of queue->Dispatch: it now takes an explicit work_groups_count together with the work_group_size instead of a raw grid size.

Better logic separation: work-group-count computation (including work_group_launch_order handling) now lives in GPUOperation rather than in the command queue and individual kernels.

PiperOrigin-RevId: 336787734
Change-Id: I753df0b758e9314086f22b9a25b4d8533c933368
Raman Sarokin 2020-10-12 18:33:47 -07:00 committed by TensorFlower Gardener
parent e18ff653b2
commit 1da2ac286f
11 changed files with 137 additions and 93 deletions
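As a quick orientation before the per-file diffs, here is a minimal standalone C++ sketch of the new contract (not part of the change itself): callers now compute a work-groups count from the grid and the chosen work group size, and Dispatch derives the global NDRange size as count * size per axis instead of aligning the grid internally. The int3 struct, DivideRoundUp helper, and example numbers below are local stand-ins for illustration, not the TFLite GPU delegate sources.

// Standalone sketch of the new dispatch contract. GetWorkGroupsCount below
// mirrors the helper added to util.cc in this change; everything else is a
// local stand-in.
#include <cstdio>

struct int3 {
  int x, y, z;
};

int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

// One work group per work_group_size tile of the grid, rounded up.
int3 GetWorkGroupsCount(const int3& grid_size, const int3& work_group_size) {
  return {DivideRoundUp(grid_size.x, work_group_size.x),
          DivideRoundUp(grid_size.y, work_group_size.y),
          DivideRoundUp(grid_size.z, work_group_size.z)};
}

int main() {
  const int3 grid = {640, 480, 8};       // hypothetical W*B, H, Slices
  const int3 work_group_size = {16, 8, 1};
  const int3 wgc = GetWorkGroupsCount(grid, work_group_size);
  // Inside the new Dispatch, the global size is simply count * size per axis
  // (previously AlignByN(grid, work_group_size) was computed in the queue).
  std::printf("work_groups_count = (%d, %d, %d), global = (%d, %d, %d)\n",
              wgc.x, wgc.y, wgc.z,
              wgc.x * work_group_size.x, wgc.y * work_group_size.y,
              wgc.z * work_group_size.z);
  return 0;
}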


@@ -56,14 +56,15 @@ void CLCommandQueue::Release() {
}
}
absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size,
CLEvent* event) {
absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
const int3& work_groups_count,
const int3& work_group_size,
CLEvent* event) {
std::vector<size_t> local(3);
std::vector<size_t> global(3);
for (int i = 0; i < 3; ++i) {
local[i] = work_group_size[i];
global[i] = AlignByN(grid[i], work_group_size[i]);
global[i] = work_groups_count[i] * work_group_size[i];
}
cl_event resulting_event;
const int error_code = clEnqueueNDRangeKernel(
@@ -80,9 +81,10 @@ absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
return absl::OkStatus();
}
absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size) {
return DispatchImplicit(kernel, grid, work_group_size, nullptr);
absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
const int3& work_groups_count,
const int3& work_group_size) {
return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
}
absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
@@ -191,12 +193,13 @@ void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
absl::Status ProfilingCommandQueue::DispatchImplicit(const CLKernel& kernel,
int3 grid,
int3 work_group_size) {
absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
const int3& work_groups_count,
const int3& work_group_size) {
events_.push_back(CLEvent());
RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
kernel, grid, work_group_size, &events_[events_.size() - 1]));
RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
work_group_size,
&events_[events_.size() - 1]));
events_.back().SetName(current_label_);
return absl::OkStatus();
}
@@ -213,14 +216,15 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
}
absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
const CLKernel& kernel, const DeviceInfo& device_info, const int3& grid,
const CLKernel& kernel, const DeviceInfo& device_info,
const std::vector<int3>& work_groups_count,
const std::vector<int3>& work_group_sizes, int* index) {
// Some Adreno 3xx can have wrong numbers for some events
const bool possible_bug_with_events = device_info.IsAdreno3xx();
events_.resize(work_group_sizes.size());
for (int i = 0; i < work_group_sizes.size(); ++i) {
RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
kernel, grid, work_group_sizes[i], &events_[i]));
RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
work_group_sizes[i], &events_[i]));
// reducing the speed of memory leak on Mali for some kernels
if (device_info.IsMali() && i % 8 == 7) {


@@ -74,14 +74,15 @@ class CLCommandQueue {
cl_command_queue queue() const { return queue_; }
virtual absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size);
virtual absl::Status Dispatch(const CLKernel& kernel,
const int3& work_groups_count,
const int3& work_group_size);
absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
const int3& work_group_size, CLEvent* event);
absl::Status EnqueueEvent(CLEvent* event);
absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size, CLEvent* event);
absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data);
absl::Status EnqueueReadImage(cl_mem memory, int3 region, void* data);
@@ -110,13 +111,13 @@ class ProfilingCommandQueue : public CLCommandQueue {
ProfilingCommandQueue(const ProfilingCommandQueue&) = delete;
ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete;
absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
int3 work_group_size) override;
absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
const int3& work_group_size) override;
// will write index for fastest work_group among work_group_sizes
absl::Status GetBestWorkGroupIndex(const CLKernel& kernel,
const DeviceInfo& device_info,
const int3& grid,
const std::vector<int3>& work_groups_count,
const std::vector<int3>& work_group_sizes,
int* index);


@@ -222,6 +222,9 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
}
void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
if (conv_params_.linear_spatial) {
grid_dimension_ = 2;
}
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ =
@@ -294,34 +297,13 @@ int3 ConvPowerVR::GetGridSize() const {
if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
grid_x *= task_size_z;
}
if (conv_params_.work_group_launch_order[0] == 0 &&
conv_params_.work_group_launch_order[1] == 1) {
return int3(grid_x, task_size_s, 1);
} else {
wg.x = DivideRoundUp(grid_x, work_group_size_.x);
wg.y = DivideRoundUp(task_size_s, work_group_size_.y);
return int3(
wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1);
}
return int3(grid_x, task_size_s, 1);
} else {
int grid_y = task_size_y;
if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
grid_y *= task_size_z;
}
if (conv_params_.work_group_launch_order[0] == 0 &&
conv_params_.work_group_launch_order[1] == 1 &&
conv_params_.work_group_launch_order[2] == 2) {
return int3(task_size_x, grid_y, task_size_s);
} else {
wg.x = DivideRoundUp(task_size_x, work_group_size_.x);
wg.y = DivideRoundUp(grid_y, work_group_size_.y);
wg.z = DivideRoundUp(task_size_s, work_group_size_.z);
return int3(
wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y,
wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z);
}
return int3(task_size_x, grid_y, task_size_s);
}
}
@@ -336,14 +318,8 @@ void ConvPowerVR::GetPossibleKernelWorkGroups(
work_groups->push_back(work_group_size_);
return;
}
if (conv_params_.work_group_launch_order[0] == 0 &&
conv_params_.work_group_launch_order[1] == 1 &&
conv_params_.work_group_launch_order[2] == 2) {
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
work_groups);
} else {
work_groups->push_back(work_group_size_);
}
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
work_groups);
}
std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
@@ -513,9 +489,9 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
}
c += "__kernel void main_function(\n";
c += "$0) {\n";
c += GenerateBlockCoords(
conv_params.block_size, conv_params.work_group_launch_order,
conv_params.linear_spatial, src_def.HasAxis(Axis::DEPTH));
c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_,
conv_params.linear_spatial,
src_def.HasAxis(Axis::DEPTH));
if (!late_oob_check) {
c += " if (" + dst_oob_check + ") {\n";
c += " return;\n";
@@ -1051,12 +1027,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
if (device_info.IsNvidia()) {
if (different_weights_for_height) {
work_group_size_ = int3(32, 1, 1);
conv_params.work_group_launch_order = int3(2, 0, 1);
work_group_launch_order_ = int3(2, 0, 1);
conv_params.fixed_work_group_size = true;
} else {
conv_params.linear_spatial = true;
work_group_size_ = int3(32, 1, 1);
conv_params.work_group_launch_order = int3(1, 0, 2);
work_group_launch_order_ = int3(1, 0, 2);
conv_params.fixed_work_group_size = true;
}
conv_params.block_size = int4(2, 1, 1, 4);
@@ -1096,12 +1072,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
} else if (device_info.IsPowerVR()) {
if (different_weights_for_height) {
work_group_size_ = int3(32, 1, 1);
conv_params.work_group_launch_order = int3(2, 0, 1);
work_group_launch_order_ = int3(2, 0, 1);
conv_params.fixed_work_group_size = true;
} else {
conv_params.linear_spatial = true;
work_group_size_ = int3(32, 1, 1);
conv_params.work_group_launch_order = int3(1, 0, 2);
work_group_launch_order_ = int3(1, 0, 2);
conv_params.fixed_work_group_size = true;
}
conv_params.weights_data_type =
@@ -1144,11 +1120,11 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
} else if (device_info.IsAMD()) {
if (different_weights_for_height) {
work_group_size_ = int3(32, 1, 1);
conv_params.work_group_launch_order = int3(2, 0, 1);
work_group_launch_order_ = int3(2, 0, 1);
conv_params.fixed_work_group_size = true;
} else {
work_group_size_ = int3(8, 4, 1);
conv_params.work_group_launch_order = int3(2, 0, 1);
work_group_launch_order_ = int3(2, 0, 1);
conv_params.fixed_work_group_size = true;
}
@@ -1207,7 +1183,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.src_depth_loop_size = 4;
}
work_group_size_ = int3(4, 4, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
work_group_launch_order_ = int3(0, 1, 2);
conv_params.fixed_work_group_size = false;
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
} else if (device_info.IsAdreno()) {
@@ -1222,7 +1198,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
}
}
work_group_size_ = int3(8, 2, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
work_group_launch_order_ = int3(0, 1, 2);
conv_params.fixed_work_group_size = false;
conv_params.src_depth_loop_size = 1;
if (definition.src_tensors.size() == 2) {
@@ -1234,12 +1210,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
} else if (device_info.IsIntel()) {
if (different_weights_for_height) {
work_group_size_ = int3(16, 1, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
work_group_launch_order_ = int3(0, 1, 2);
conv_params.fixed_work_group_size = true;
} else {
conv_params.linear_spatial = true;
work_group_size_ = int3(16, 1, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
work_group_launch_order_ = int3(0, 1, 2);
conv_params.fixed_work_group_size = true;
}
conv_params.block_size = int4(1, 1, 1, 4);
@@ -1274,7 +1250,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
} else {
conv_params.block_size = int4(1, 1, 1, 4);
work_group_size_ = int3(8, 2, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
work_group_launch_order_ = int3(0, 1, 2);
conv_params.fixed_work_group_size = false;
conv_params.src_depth_loop_size = 1;
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;


@@ -83,7 +83,6 @@ class ConvPowerVR : public GPUOperation {
// F32_F16 precision mode
DataType weights_data_type; // used for weights and biases
int4 block_size; // WHDS
int3 work_group_launch_order;
bool fixed_work_group_size;
bool linear_spatial; // spatial dimensions are Width/Height/Depth
bool different_weights_for_height;


@@ -46,9 +46,11 @@ class OpenClConverterImpl : public TensorObjectConverter {
RETURN_IF_ERROR(kernel_.SetMemoryAuto(buffer_mem));
RETURN_IF_ERROR(args_.SetObjectRef("tensor", tensor));
RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter()));
int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(),
tensor->Slices());
return queue_->DispatchImplicit(kernel_, grid, {16, 8, 1});
const int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(),
tensor->Slices());
const int3 work_group_size = {16, 8, 1};
const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
}
Arguments args_;
@@ -173,9 +175,11 @@ class TensorToTensorConverter : public OpenClConverterImpl {
RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", &src_tensor));
RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", &dst_tensor));
RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
int3 grid = int3(dst_tensor.Width() * dst_tensor.Batch(),
dst_tensor.Height(), dst_tensor.Slices());
return queue_->DispatchImplicit(kernel_, grid, {16, 8, 1});
const int3 grid = int3(dst_tensor.Width() * dst_tensor.Batch(),
dst_tensor.Height(), dst_tensor.Slices());
const int3 work_group_size = {16, 8, 1};
const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
}
private:


@@ -29,10 +29,9 @@ namespace gpu {
namespace cl {
ConvolutionTransposed3x3::ConvolutionTransposed3x3(
const OperationDef& definition, const DeviceInfo& device_info, int2 padding)
: GPUOperation(definition),
padding_(padding),
work_group_launch_order_(2, 0, 1) {
: GPUOperation(definition), padding_(padding) {
work_group_size_ = int3(8, 4, 1);
work_group_launch_order_ = int3(2, 0, 1);
if (device_info.IsPowerVR()) {
weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC;
} else if (device_info.IsNvidia() || device_info.IsIntel()) {
@@ -54,14 +53,12 @@ ConvolutionTransposed3x3::ConvolutionTransposed3x3(
ConvolutionTransposed3x3&& operation)
: GPUOperation(std::move(operation)),
padding_(operation.padding_),
work_group_launch_order_(operation.work_group_launch_order_),
weights_upload_type_(operation.weights_upload_type_) {}
ConvolutionTransposed3x3& ConvolutionTransposed3x3::operator=(
ConvolutionTransposed3x3&& operation) {
if (this != &operation) {
std::swap(padding_, operation.padding_);
std::swap(work_group_launch_order_, operation.work_group_launch_order_);
std::swap(weights_upload_type_, operation.weights_upload_type_);
GPUOperation::operator=(std::move(operation));
}
@@ -315,17 +312,23 @@ absl::Status ConvolutionTransposed3x3::BindArguments(ArgumentsBinder* args) {
return args->SetInt("padding_y", padding_y);
}
void ConvolutionTransposed3x3::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
if (weights_upload_type_ == WeightsUploadType::LOCAL_MEM_ASYNC ||
weights_upload_type_ == WeightsUploadType::LOCAL_MEM_BY_THREADS) {
work_groups->push_back(work_group_size_);
return;
}
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
work_groups);
}
int3 ConvolutionTransposed3x3::GetGridSize() const {
const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
const int grid_z = dst_[0]->Slices();
int3 wg;
wg.x = DivideRoundUp(grid_x, work_group_size_.x);
wg.y = DivideRoundUp(grid_y, work_group_size_.y);
wg.z = DivideRoundUp(grid_z, work_group_size_.z);
return int3(wg[work_group_launch_order_[0]] * work_group_size_.x,
wg[work_group_launch_order_[1]] * work_group_size_.y,
wg[work_group_launch_order_[2]] * work_group_size_.z);
return int3(grid_x, grid_y, grid_z);
}
bool IsConvolutionTransposed3x3Supported(


@@ -40,9 +40,7 @@ class ConvolutionTransposed3x3 : public GPUOperation {
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override {
work_groups->push_back(work_group_size_);
}
std::vector<int3>* work_groups) const override;
absl::Status BindArguments(ArgumentsBinder* args) override;
int3 GetGridSize() const override;
@@ -78,7 +76,6 @@ class ConvolutionTransposed3x3 : public GPUOperation {
int2 padding, int3 work_group_launch_order);
int2 padding_;
int3 work_group_launch_order_;
WeightsUploadType weights_upload_type_;
};


@@ -49,6 +49,33 @@ std::string GetElementWiseCode(const OperationDef& op_def,
return c;
}
int3 GetWorkGroupsCount(int grid_dimension, const int3& grid_size,
const int3& work_group_size,
const int3& work_group_launch_order) {
int3 work_groups_count;
if (grid_dimension == 1) {
work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
work_groups_count.y = 1;
work_groups_count.z = 1;
} else if (grid_dimension == 2) {
int3 wgs;
wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
work_groups_count.x = wgs[work_group_launch_order[0]];
work_groups_count.y = wgs[work_group_launch_order[1]];
work_groups_count.z = 1;
} else { // grid_dimension == 3
int3 wgs;
wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
wgs.z = DivideRoundUp(grid_size.z, work_group_size.z);
work_groups_count.x = wgs[work_group_launch_order[0]];
work_groups_count.y = wgs[work_group_launch_order[1]];
work_groups_count.z = wgs[work_group_launch_order[2]];
}
return work_groups_count;
}
} // namespace
DataType OperationDef::GetDataType() const {
@@ -106,9 +133,12 @@ GPUOperation::GPUOperation(GPUOperation&& operation)
src_(std::move(operation.src_)),
dst_(std::move(operation.dst_)),
kernel_(std::move(operation.kernel_)),
grid_dimension_(operation.grid_dimension_),
work_group_launch_order_(operation.work_group_launch_order_),
grid_size_(operation.grid_size_),
src_tensors_names_(std::move(operation.src_tensors_names_)),
dst_tensors_names_(std::move(operation.dst_tensors_names_)),
work_groups_count_(operation.work_groups_count_),
linkable_count_(operation.linkable_count_),
elementwise_code_(std::move(operation.elementwise_code_)) {}
@@ -126,9 +156,12 @@ GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
src_ = std::move(operation.src_);
dst_ = std::move(operation.dst_);
kernel_ = std::move(operation.kernel_);
std::swap(grid_dimension_, operation.grid_dimension_);
std::swap(work_group_launch_order_, operation.work_group_launch_order_);
std::swap(grid_size_, operation.grid_size_);
src_tensors_names_ = std::move(operation.src_tensors_names_);
dst_tensors_names_ = std::move(operation.dst_tensors_names_);
std::swap(work_groups_count_, operation.work_groups_count_);
std::swap(linkable_count_, operation.linkable_count_);
elementwise_code_ = std::move(operation.elementwise_code_);
}
@@ -185,6 +218,8 @@ absl::Status GPUOperation::UpdateParams() {
}
RETURN_IF_ERROR(BindArguments(&args_));
grid_size_ = GetGridSize();
work_groups_count_ = GetWorkGroupsCount(
grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_);
return absl::OkStatus();
}
@@ -245,14 +280,26 @@ absl::Status GPUOperation::Tune(const TuningParameters& params) {
}
if (possible_work_groups.size() == 1) {
work_group_size_ = possible_work_groups[0];
work_groups_count_ =
GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_,
work_group_launch_order_);
return absl::OkStatus();
} else {
std::vector<int3> work_groups_count(possible_work_groups.size());
for (int i = 0; i < work_groups_count.size(); ++i) {
work_groups_count[i] =
GetWorkGroupsCount(grid_dimension_, grid_size_,
possible_work_groups[i], work_group_launch_order_);
}
RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
int best_work_group_index;
RETURN_IF_ERROR(params.queue->GetBestWorkGroupIndex(
kernel_, *params.info, grid_size_, possible_work_groups,
kernel_, *params.info, work_groups_count, possible_work_groups,
&best_work_group_index));
work_group_size_ = possible_work_groups[best_work_group_index];
work_groups_count_ =
GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_,
work_group_launch_order_);
return absl::OkStatus();
}
}


@@ -120,7 +120,7 @@ class GPUOperation {
absl::Status AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
return queue->DispatchImplicit(kernel_, grid_size_, work_group_size_);
return queue->Dispatch(kernel_, work_groups_count_, work_group_size_);
}
virtual void GetPossibleKernelWorkGroups(
@@ -174,11 +174,14 @@ class GPUOperation {
std::vector<Tensor*> src_;
std::vector<Tensor*> dst_;
CLKernel kernel_;
int grid_dimension_ = 3; // can be 1, 2 or 3
int3 work_group_launch_order_ = int3(0, 1, 2);
int3 grid_size_ = int3(0, 0, 0);
std::vector<std::string> src_tensors_names_;
std::vector<std::string> dst_tensors_names_;
private:
int3 work_groups_count_ = int3(0, 0, 0);
int linkable_count_ = 0;
std::string elementwise_code_; // temporary, used during op construction
};


@@ -188,6 +188,14 @@ int GetRecommendedBlockSizeForConv(const DeviceInfo& device_info,
return block_size;
}
int3 GetWorkGroupsCount(const int3& grid_size, const int3& work_group_size) {
int3 work_groups_count;
work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
work_groups_count.y = DivideRoundUp(grid_size.y, work_group_size.y);
work_groups_count.z = DivideRoundUp(grid_size.z, work_group_size.z);
return work_groups_count;
}
} // namespace cl
} // namespace gpu
} // namespace tflite


@@ -213,6 +213,8 @@ int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size);
int GetRecommendedBlockSizeForConv(const DeviceInfo& device,
CalculationsPrecision precision,
int task_size);
int3 GetWorkGroupsCount(const int3& grid_size, const int3& work_group_size);
} // namespace cl
} // namespace gpu
} // namespace tflite