Changed signature of queue->Dispatch.
Better logic separation.

PiperOrigin-RevId: 336787734
Change-Id: I753df0b758e9314086f22b9a25b4d8533c933368
parent e18ff653b2
commit 1da2ac286f
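At call sites, the change replaces the implicit grid-based dispatch with an explicit work-groups count. A minimal before/after sketch of the migration (illustrative only; `queue`, `kernel`, and `grid` are placeholders, while `GetWorkGroupsCount` and `Dispatch` are the helpers and signature introduced in the diff below):

// Before: the queue derived the global size from the grid itself.
// RETURN_IF_ERROR(queue->DispatchImplicit(kernel, grid, {16, 8, 1}));

// After: the caller decides how many work groups to launch.
const int3 work_group_size = {16, 8, 1};
const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
RETURN_IF_ERROR(queue->Dispatch(kernel, work_groups_count, work_group_size));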
@@ -56,14 +56,15 @@ void CLCommandQueue::Release() {
   }
 }
 
-absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                              int3 work_group_size,
-                                              CLEvent* event) {
+absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
+                                      const int3& work_groups_count,
+                                      const int3& work_group_size,
+                                      CLEvent* event) {
   std::vector<size_t> local(3);
   std::vector<size_t> global(3);
   for (int i = 0; i < 3; ++i) {
     local[i] = work_group_size[i];
-    global[i] = AlignByN(grid[i], work_group_size[i]);
+    global[i] = work_groups_count[i] * work_group_size[i];
   }
   cl_event resulting_event;
   const int error_code = clEnqueueNDRangeKernel(
@@ -80,9 +81,10 @@ absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
   return absl::OkStatus();
 }
 
-absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                              int3 work_group_size) {
-  return DispatchImplicit(kernel, grid, work_group_size, nullptr);
+absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
+                                      const int3& work_groups_count,
+                                      const int3& work_group_size) {
+  return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
 }
 
 absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
@@ -191,12 +193,13 @@ void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
 
 void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
 
-absl::Status ProfilingCommandQueue::DispatchImplicit(const CLKernel& kernel,
-                                                     int3 grid,
-                                                     int3 work_group_size) {
+absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
+                                             const int3& work_groups_count,
+                                             const int3& work_group_size) {
   events_.push_back(CLEvent());
-  RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
-      kernel, grid, work_group_size, &events_[events_.size() - 1]));
+  RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
+                                           work_group_size,
+                                           &events_[events_.size() - 1]));
   events_.back().SetName(current_label_);
   return absl::OkStatus();
 }
@@ -213,14 +216,15 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
 }
 
 absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
-    const CLKernel& kernel, const DeviceInfo& device_info, const int3& grid,
+    const CLKernel& kernel, const DeviceInfo& device_info,
+    const std::vector<int3>& work_groups_count,
     const std::vector<int3>& work_group_sizes, int* index) {
   // Some Adreno 3xx can have wrong numbers for some events
   const bool possible_bug_with_events = device_info.IsAdreno3xx();
   events_.resize(work_group_sizes.size());
   for (int i = 0; i < work_group_sizes.size(); ++i) {
-    RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
-        kernel, grid, work_group_sizes[i], &events_[i]));
+    RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
+                                             work_group_sizes[i], &events_[i]));
 
     // reducing the speed of memory leak on Mali for some kernels
     if (device_info.IsMali() && i % 8 == 7) {
@@ -74,14 +74,15 @@ class CLCommandQueue {
 
   cl_command_queue queue() const { return queue_; }
 
-  virtual absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                        int3 work_group_size);
+  virtual absl::Status Dispatch(const CLKernel& kernel,
+                                const int3& work_groups_count,
+                                const int3& work_group_size);
 
-  absl::Status EnqueueEvent(CLEvent* event);
+  absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
+                        const int3& work_group_size, CLEvent* event);
 
-  absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                int3 work_group_size, CLEvent* event);
+  absl::Status EnqueueEvent(CLEvent* event);
 
   absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data);
   absl::Status EnqueueReadImage(cl_mem memory, int3 region, void* data);
 
@@ -110,13 +111,13 @@ class ProfilingCommandQueue : public CLCommandQueue {
   ProfilingCommandQueue(const ProfilingCommandQueue&) = delete;
   ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete;
 
-  absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                int3 work_group_size) override;
+  absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
+                        const int3& work_group_size) override;
 
   // will write index for fastest work_group among work_group_sizes
   absl::Status GetBestWorkGroupIndex(const CLKernel& kernel,
                                      const DeviceInfo& device_info,
-                                     const int3& grid,
+                                     const std::vector<int3>& work_groups_count,
                                      const std::vector<int3>& work_group_sizes,
                                      int* index);
 
@@ -222,6 +222,9 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
 }
 
 void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
+  if (conv_params_.linear_spatial) {
+    grid_dimension_ = 2;
+  }
   const bool stride_correction =
       definition_.IsBatchSupported() && stride_.x != 1;
   code_ =
@@ -294,34 +297,13 @@ int3 ConvPowerVR::GetGridSize() const {
     if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
       grid_x *= task_size_z;
     }
-    if (conv_params_.work_group_launch_order[0] == 0 &&
-        conv_params_.work_group_launch_order[1] == 1) {
-      return int3(grid_x, task_size_s, 1);
-    } else {
-      wg.x = DivideRoundUp(grid_x, work_group_size_.x);
-      wg.y = DivideRoundUp(task_size_s, work_group_size_.y);
-      return int3(
-          wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-          wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1);
-    }
+    return int3(grid_x, task_size_s, 1);
   } else {
     int grid_y = task_size_y;
     if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
       grid_y *= task_size_z;
     }
-    if (conv_params_.work_group_launch_order[0] == 0 &&
-        conv_params_.work_group_launch_order[1] == 1 &&
-        conv_params_.work_group_launch_order[2] == 2) {
-      return int3(task_size_x, grid_y, task_size_s);
-    } else {
-      wg.x = DivideRoundUp(task_size_x, work_group_size_.x);
-      wg.y = DivideRoundUp(grid_y, work_group_size_.y);
-      wg.z = DivideRoundUp(task_size_s, work_group_size_.z);
-      return int3(
-          wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-          wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y,
-          wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z);
-    }
+    return int3(task_size_x, grid_y, task_size_s);
   }
 }
 
@@ -336,14 +318,8 @@ void ConvPowerVR::GetPossibleKernelWorkGroups(
     work_groups->push_back(work_group_size_);
     return;
   }
-  if (conv_params_.work_group_launch_order[0] == 0 &&
-      conv_params_.work_group_launch_order[1] == 1 &&
-      conv_params_.work_group_launch_order[2] == 2) {
-    GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
-                              work_groups);
-  } else {
-    work_groups->push_back(work_group_size_);
-  }
+  GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
+                            work_groups);
 }
 
 std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
@@ -513,9 +489,9 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
   }
   c += "__kernel void main_function(\n";
   c += "$0) {\n";
-  c += GenerateBlockCoords(
-      conv_params.block_size, conv_params.work_group_launch_order,
-      conv_params.linear_spatial, src_def.HasAxis(Axis::DEPTH));
+  c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_,
+                           conv_params.linear_spatial,
+                           src_def.HasAxis(Axis::DEPTH));
   if (!late_oob_check) {
     c += " if (" + dst_oob_check + ") {\n";
     c += " return;\n";
@@ -1051,12 +1027,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   if (device_info.IsNvidia()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(2, 0, 1);
+      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(1, 0, 2);
+      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(2, 1, 1, 4);
@@ -1096,12 +1072,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   } else if (device_info.IsPowerVR()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(2, 0, 1);
+      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(1, 0, 2);
+      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.weights_data_type =
@@ -1144,11 +1120,11 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   } else if (device_info.IsAMD()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(2, 0, 1);
+      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      work_group_size_ = int3(8, 4, 1);
-      conv_params.work_group_launch_order = int3(2, 0, 1);
+      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    }
 
@@ -1207,7 +1183,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
      conv_params.src_depth_loop_size = 4;
    }
    work_group_size_ = int3(4, 4, 1);
-    conv_params.work_group_launch_order = int3(0, 1, 2);
+    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
   } else if (device_info.IsAdreno()) {
@@ -1222,7 +1198,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
      }
    }
    work_group_size_ = int3(8, 2, 1);
-    conv_params.work_group_launch_order = int3(0, 1, 2);
+    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    if (definition.src_tensors.size() == 2) {
@@ -1234,12 +1210,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   } else if (device_info.IsIntel()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(16, 1, 1);
-      conv_params.work_group_launch_order = int3(0, 1, 2);
+      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(16, 1, 1);
-      conv_params.work_group_launch_order = int3(0, 1, 2);
+      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(1, 1, 1, 4);
@@ -1274,7 +1250,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   } else {
    conv_params.block_size = int4(1, 1, 1, 4);
    work_group_size_ = int3(8, 2, 1);
-    conv_params.work_group_launch_order = int3(0, 1, 2);
+    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
@@ -83,7 +83,6 @@ class ConvPowerVR : public GPUOperation {
     // F32_F16 precision mode
     DataType weights_data_type;  // used for weights and biases
     int4 block_size;             // WHDS
-    int3 work_group_launch_order;
     bool fixed_work_group_size;
     bool linear_spatial;  // spatial dimensions are Width/Height/Depth
     bool different_weights_for_height;
@@ -46,9 +46,11 @@ class OpenClConverterImpl : public TensorObjectConverter {
     RETURN_IF_ERROR(kernel_.SetMemoryAuto(buffer_mem));
     RETURN_IF_ERROR(args_.SetObjectRef("tensor", tensor));
     RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter()));
-    int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(),
-                     tensor->Slices());
-    return queue_->DispatchImplicit(kernel_, grid, {16, 8, 1});
+    const int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(),
+                           tensor->Slices());
+    const int3 work_group_size = {16, 8, 1};
+    const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
+    return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
   }
 
   Arguments args_;
@@ -173,9 +175,11 @@ class TensorToTensorConverter : public OpenClConverterImpl {
     RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", &src_tensor));
     RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", &dst_tensor));
     RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
-    int3 grid = int3(dst_tensor.Width() * dst_tensor.Batch(),
-                     dst_tensor.Height(), dst_tensor.Slices());
-    return queue_->DispatchImplicit(kernel_, grid, {16, 8, 1});
+    const int3 grid = int3(dst_tensor.Width() * dst_tensor.Batch(),
+                           dst_tensor.Height(), dst_tensor.Slices());
+    const int3 work_group_size = {16, 8, 1};
+    const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
+    return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
   }
 
  private:
@@ -29,10 +29,9 @@ namespace gpu {
 namespace cl {
 ConvolutionTransposed3x3::ConvolutionTransposed3x3(
     const OperationDef& definition, const DeviceInfo& device_info, int2 padding)
-    : GPUOperation(definition),
-      padding_(padding),
-      work_group_launch_order_(2, 0, 1) {
+    : GPUOperation(definition), padding_(padding) {
   work_group_size_ = int3(8, 4, 1);
+  work_group_launch_order_ = int3(2, 0, 1);
   if (device_info.IsPowerVR()) {
     weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC;
   } else if (device_info.IsNvidia() || device_info.IsIntel()) {
@@ -54,14 +53,12 @@ ConvolutionTransposed3x3::ConvolutionTransposed3x3(
     ConvolutionTransposed3x3&& operation)
     : GPUOperation(std::move(operation)),
       padding_(operation.padding_),
-      work_group_launch_order_(operation.work_group_launch_order_),
       weights_upload_type_(operation.weights_upload_type_) {}
 
 ConvolutionTransposed3x3& ConvolutionTransposed3x3::operator=(
     ConvolutionTransposed3x3&& operation) {
   if (this != &operation) {
     std::swap(padding_, operation.padding_);
-    std::swap(work_group_launch_order_, operation.work_group_launch_order_);
     std::swap(weights_upload_type_, operation.weights_upload_type_);
     GPUOperation::operator=(std::move(operation));
   }
@@ -315,17 +312,23 @@ absl::Status ConvolutionTransposed3x3::BindArguments(ArgumentsBinder* args) {
   return args->SetInt("padding_y", padding_y);
 }
 
+void ConvolutionTransposed3x3::GetPossibleKernelWorkGroups(
+    TuningType tuning_type, const DeviceInfo& device_info,
+    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
+  if (weights_upload_type_ == WeightsUploadType::LOCAL_MEM_ASYNC ||
+      weights_upload_type_ == WeightsUploadType::LOCAL_MEM_BY_THREADS) {
+    work_groups->push_back(work_group_size_);
+    return;
+  }
+  GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
+                            work_groups);
+}
+
 int3 ConvolutionTransposed3x3::GetGridSize() const {
   const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
   const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
   const int grid_z = dst_[0]->Slices();
-  int3 wg;
-  wg.x = DivideRoundUp(grid_x, work_group_size_.x);
-  wg.y = DivideRoundUp(grid_y, work_group_size_.y);
-  wg.z = DivideRoundUp(grid_z, work_group_size_.z);
-  return int3(wg[work_group_launch_order_[0]] * work_group_size_.x,
-              wg[work_group_launch_order_[1]] * work_group_size_.y,
-              wg[work_group_launch_order_[2]] * work_group_size_.z);
+  return int3(grid_x, grid_y, grid_z);
 }
 
 bool IsConvolutionTransposed3x3Supported(
@@ -40,9 +40,7 @@ class ConvolutionTransposed3x3 : public GPUOperation {
   void GetPossibleKernelWorkGroups(
       TuningType tuning_type, const DeviceInfo& device_info,
       const KernelInfo& kernel_info,
-      std::vector<int3>* work_groups) const override {
-    work_groups->push_back(work_group_size_);
-  }
+      std::vector<int3>* work_groups) const override;
   absl::Status BindArguments(ArgumentsBinder* args) override;
   int3 GetGridSize() const override;
 
@@ -78,7 +76,6 @@ class ConvolutionTransposed3x3 : public GPUOperation {
                            int2 padding, int3 work_group_launch_order);
 
   int2 padding_;
-  int3 work_group_launch_order_;
   WeightsUploadType weights_upload_type_;
 };
 
@@ -49,6 +49,33 @@ std::string GetElementWiseCode(const OperationDef& op_def,
   return c;
 }
 
+int3 GetWorkGroupsCount(int grid_dimension, const int3& grid_size,
+                        const int3& work_group_size,
+                        const int3& work_group_launch_order) {
+  int3 work_groups_count;
+  if (grid_dimension == 1) {
+    work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    work_groups_count.y = 1;
+    work_groups_count.z = 1;
+  } else if (grid_dimension == 2) {
+    int3 wgs;
+    wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+    work_groups_count.x = wgs[work_group_launch_order[0]];
+    work_groups_count.y = wgs[work_group_launch_order[1]];
+    work_groups_count.z = 1;
+  } else {  // grid_dimension == 3
+    int3 wgs;
+    wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+    wgs.z = DivideRoundUp(grid_size.z, work_group_size.z);
+    work_groups_count.x = wgs[work_group_launch_order[0]];
+    work_groups_count.y = wgs[work_group_launch_order[1]];
+    work_groups_count.z = wgs[work_group_launch_order[2]];
+  }
+  return work_groups_count;
+}
+
 }  // namespace
 
 DataType OperationDef::GetDataType() const {
@@ -106,9 +133,12 @@ GPUOperation::GPUOperation(GPUOperation&& operation)
       src_(std::move(operation.src_)),
       dst_(std::move(operation.dst_)),
       kernel_(std::move(operation.kernel_)),
+      grid_dimension_(operation.grid_dimension_),
+      work_group_launch_order_(operation.work_group_launch_order_),
       grid_size_(operation.grid_size_),
       src_tensors_names_(std::move(operation.src_tensors_names_)),
       dst_tensors_names_(std::move(operation.dst_tensors_names_)),
+      work_groups_count_(operation.work_groups_count_),
       linkable_count_(operation.linkable_count_),
       elementwise_code_(std::move(operation.elementwise_code_)) {}
 
@@ -126,9 +156,12 @@ GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
     src_ = std::move(operation.src_);
     dst_ = std::move(operation.dst_);
     kernel_ = std::move(operation.kernel_);
+    std::swap(grid_dimension_, operation.grid_dimension_);
+    std::swap(work_group_launch_order_, operation.work_group_launch_order_);
     std::swap(grid_size_, operation.grid_size_);
     src_tensors_names_ = std::move(operation.src_tensors_names_);
     dst_tensors_names_ = std::move(operation.dst_tensors_names_);
+    std::swap(work_groups_count_, operation.work_groups_count_);
     std::swap(linkable_count_, operation.linkable_count_);
     elementwise_code_ = std::move(operation.elementwise_code_);
   }
@@ -185,6 +218,8 @@ absl::Status GPUOperation::UpdateParams() {
   }
   RETURN_IF_ERROR(BindArguments(&args_));
   grid_size_ = GetGridSize();
+  work_groups_count_ = GetWorkGroupsCount(
+      grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_);
   return absl::OkStatus();
 }
 
@@ -245,14 +280,26 @@ absl::Status GPUOperation::Tune(const TuningParameters& params) {
   }
   if (possible_work_groups.size() == 1) {
     work_group_size_ = possible_work_groups[0];
+    work_groups_count_ =
+        GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_,
+                           work_group_launch_order_);
     return absl::OkStatus();
   } else {
+    std::vector<int3> work_groups_count(possible_work_groups.size());
+    for (int i = 0; i < work_groups_count.size(); ++i) {
+      work_groups_count[i] =
+          GetWorkGroupsCount(grid_dimension_, grid_size_,
+                             possible_work_groups[i], work_group_launch_order_);
+    }
     RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
     int best_work_group_index;
     RETURN_IF_ERROR(params.queue->GetBestWorkGroupIndex(
-        kernel_, *params.info, grid_size_, possible_work_groups,
+        kernel_, *params.info, work_groups_count, possible_work_groups,
         &best_work_group_index));
     work_group_size_ = possible_work_groups[best_work_group_index];
+    work_groups_count_ =
+        GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_,
+                           work_group_launch_order_);
     return absl::OkStatus();
   }
 }
@@ -120,7 +120,7 @@ class GPUOperation {
 
   absl::Status AddToQueue(CLCommandQueue* queue) {
     RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
-    return queue->DispatchImplicit(kernel_, grid_size_, work_group_size_);
+    return queue->Dispatch(kernel_, work_groups_count_, work_group_size_);
   }
 
   virtual void GetPossibleKernelWorkGroups(
@@ -174,11 +174,14 @@ class GPUOperation {
   std::vector<Tensor*> src_;
   std::vector<Tensor*> dst_;
   CLKernel kernel_;
+  int grid_dimension_ = 3;  // can be 1, 2 or 3
+  int3 work_group_launch_order_ = int3(0, 1, 2);
   int3 grid_size_ = int3(0, 0, 0);
   std::vector<std::string> src_tensors_names_;
   std::vector<std::string> dst_tensors_names_;
 
  private:
+  int3 work_groups_count_ = int3(0, 0, 0);
   int linkable_count_ = 0;
   std::string elementwise_code_;  // temporary, used during op construction
 };
@@ -188,6 +188,14 @@ int GetRecommendedBlockSizeForConv(const DeviceInfo& device_info,
   return block_size;
 }
 
+int3 GetWorkGroupsCount(const int3& grid_size, const int3& work_group_size) {
+  int3 work_groups_count;
+  work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
+  work_groups_count.y = DivideRoundUp(grid_size.y, work_group_size.y);
+  work_groups_count.z = DivideRoundUp(grid_size.z, work_group_size.z);
+  return work_groups_count;
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
@@ -213,6 +213,8 @@ int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size);
 int GetRecommendedBlockSizeForConv(const DeviceInfo& device,
                                    CalculationsPrecision precision,
                                    int task_size);
+
+int3 GetWorkGroupsCount(const int3& grid_size, const int3& work_group_size);
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
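For reference, the arithmetic behind the new helpers: each axis launches DivideRoundUp(grid, work_group) groups, and work_group_launch_order only permutes which per-axis count lands on which launch dimension. A standalone sketch that mirrors the grid_dimension == 3 branch of the GetWorkGroupsCount helper added above (Int3 and DivideRoundUp are local stand-ins for the TFLite types, not the real headers):

#include <cstdio>

struct Int3 { int v[3]; };  // stand-in for tflite::gpu::int3

static int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

// Mirrors the grid_dimension == 3 branch of GetWorkGroupsCount.
static Int3 WorkGroupsCount3D(const Int3& grid, const Int3& wg, const Int3& order) {
  int counts[3];
  for (int i = 0; i < 3; ++i) counts[i] = DivideRoundUp(grid.v[i], wg.v[i]);
  return {{counts[order.v[0]], counts[order.v[1]], counts[order.v[2]]}};
}

int main() {
  const Int3 grid = {{100, 30, 8}};  // e.g. width*batch, height, slices
  const Int3 wg = {{8, 4, 1}};       // work group size
  const Int3 order = {{2, 0, 1}};    // one of the launch orders used by ConvPowerVR
  const Int3 n = WorkGroupsCount3D(grid, wg, order);
  // Per-axis counts are 13, 8, 8; the launch order permutes them to 8, 13, 8.
  std::printf("%d %d %d\n", n.v[0], n.v[1], n.v[2]);
  return 0;
}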