Changed signature of queue->Dispatch.
Better logic separation.

PiperOrigin-RevId: 336787734
Change-Id: I753df0b758e9314086f22b9a25b4d8533c933368
parent e18ff653b2
commit 1da2ac286f
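At call sites, the change replaces the implicit grid-based dispatch with an explicit work-groups count. A minimal before/after sketch of the migration (illustrative only; `queue`, `kernel`, and `grid` are placeholders, while `GetWorkGroupsCount` and `Dispatch` are the helpers and signature introduced in the diff below):

// Before: the queue derived the global size from the grid itself.
// RETURN_IF_ERROR(queue->DispatchImplicit(kernel, grid, {16, 8, 1}));

// After: the caller decides how many work groups to launch.
const int3 work_group_size = {16, 8, 1};
const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
RETURN_IF_ERROR(queue->Dispatch(kernel, work_groups_count, work_group_size));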
@@ -56,14 +56,15 @@ void CLCommandQueue::Release() {
   }
 }
 
-absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                              int3 work_group_size,
-                                              CLEvent* event) {
+absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
+                                      const int3& work_groups_count,
+                                      const int3& work_group_size,
+                                      CLEvent* event) {
   std::vector<size_t> local(3);
   std::vector<size_t> global(3);
   for (int i = 0; i < 3; ++i) {
     local[i] = work_group_size[i];
-    global[i] = AlignByN(grid[i], work_group_size[i]);
+    global[i] = work_groups_count[i] * work_group_size[i];
   }
   cl_event resulting_event;
   const int error_code = clEnqueueNDRangeKernel(
@@ -80,9 +81,10 @@ absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
   return absl::OkStatus();
 }
 
-absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                              int3 work_group_size) {
-  return DispatchImplicit(kernel, grid, work_group_size, nullptr);
+absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
+                                      const int3& work_groups_count,
+                                      const int3& work_group_size) {
+  return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
 }
 
 absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
@@ -191,12 +193,13 @@ void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
 
 void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
 
-absl::Status ProfilingCommandQueue::DispatchImplicit(const CLKernel& kernel,
-                                                     int3 grid,
-                                                     int3 work_group_size) {
+absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
+                                             const int3& work_groups_count,
+                                             const int3& work_group_size) {
   events_.push_back(CLEvent());
-  RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
-      kernel, grid, work_group_size, &events_[events_.size() - 1]));
+  RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
+                                           work_group_size,
+                                           &events_[events_.size() - 1]));
   events_.back().SetName(current_label_);
   return absl::OkStatus();
 }
@@ -213,14 +216,15 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
 }
 
 absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
-    const CLKernel& kernel, const DeviceInfo& device_info, const int3& grid,
+    const CLKernel& kernel, const DeviceInfo& device_info,
+    const std::vector<int3>& work_groups_count,
     const std::vector<int3>& work_group_sizes, int* index) {
   // Some Adreno 3xx can have wrong numbers for some events
   const bool possible_bug_with_events = device_info.IsAdreno3xx();
   events_.resize(work_group_sizes.size());
   for (int i = 0; i < work_group_sizes.size(); ++i) {
-    RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit(
-        kernel, grid, work_group_sizes[i], &events_[i]));
+    RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
+                                             work_group_sizes[i], &events_[i]));
 
     // reducing the speed of memory leak on Mali for some kernels
     if (device_info.IsMali() && i % 8 == 7) {
@@ -74,14 +74,15 @@ class CLCommandQueue {
 
   cl_command_queue queue() const { return queue_; }
 
-  virtual absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                        int3 work_group_size);
+  virtual absl::Status Dispatch(const CLKernel& kernel,
+                                const int3& work_groups_count,
+                                const int3& work_group_size);
 
-  absl::Status EnqueueEvent(CLEvent* event);
+  absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
+                        const int3& work_group_size, CLEvent* event);
 
-  absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                int3 work_group_size, CLEvent* event);
+  absl::Status EnqueueEvent(CLEvent* event);
 
   absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data);
   absl::Status EnqueueReadImage(cl_mem memory, int3 region, void* data);
 
@@ -110,13 +111,13 @@ class ProfilingCommandQueue : public CLCommandQueue {
   ProfilingCommandQueue(const ProfilingCommandQueue&) = delete;
   ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete;
 
-  absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid,
-                                int3 work_group_size) override;
+  absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
+                        const int3& work_group_size) override;
 
   // will write index for fastest work_group among work_group_sizes
   absl::Status GetBestWorkGroupIndex(const CLKernel& kernel,
                                      const DeviceInfo& device_info,
-                                     const int3& grid,
+                                     const std::vector<int3>& work_groups_count,
                                      const std::vector<int3>& work_group_sizes,
                                      int* index);
 
@@ -222,6 +222,9 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
 }
 
 void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
+  if (conv_params_.linear_spatial) {
+    grid_dimension_ = 2;
+  }
   const bool stride_correction =
       definition_.IsBatchSupported() && stride_.x != 1;
   code_ =
@@ -294,34 +297,13 @@ int3 ConvPowerVR::GetGridSize() const {
     if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
       grid_x *= task_size_z;
     }
-    if (conv_params_.work_group_launch_order[0] == 0 &&
-        conv_params_.work_group_launch_order[1] == 1) {
-      return int3(grid_x, task_size_s, 1);
-    } else {
-      wg.x = DivideRoundUp(grid_x, work_group_size_.x);
-      wg.y = DivideRoundUp(task_size_s, work_group_size_.y);
-      return int3(
-          wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-          wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1);
-    }
+    return int3(grid_x, task_size_s, 1);
   } else {
     int grid_y = task_size_y;
     if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
       grid_y *= task_size_z;
     }
-    if (conv_params_.work_group_launch_order[0] == 0 &&
-        conv_params_.work_group_launch_order[1] == 1 &&
-        conv_params_.work_group_launch_order[2] == 2) {
-      return int3(task_size_x, grid_y, task_size_s);
-    } else {
-      wg.x = DivideRoundUp(task_size_x, work_group_size_.x);
-      wg.y = DivideRoundUp(grid_y, work_group_size_.y);
-      wg.z = DivideRoundUp(task_size_s, work_group_size_.z);
-      return int3(
-          wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-          wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y,
-          wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z);
-    }
+    return int3(task_size_x, grid_y, task_size_s);
   }
 }
 
@@ -336,14 +318,8 @@ void ConvPowerVR::GetPossibleKernelWorkGroups(
     work_groups->push_back(work_group_size_);
     return;
   }
-  if (conv_params_.work_group_launch_order[0] == 0 &&
-      conv_params_.work_group_launch_order[1] == 1 &&
-      conv_params_.work_group_launch_order[2] == 2) {
-    GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
-                              work_groups);
-  } else {
-    work_groups->push_back(work_group_size_);
-  }
+  GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
+                            work_groups);
 }
 
 std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
@@ -513,9 +489,9 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
   }
   c += "__kernel void main_function(\n";
   c += "$0) {\n";
-  c += GenerateBlockCoords(
-      conv_params.block_size, conv_params.work_group_launch_order,
-      conv_params.linear_spatial, src_def.HasAxis(Axis::DEPTH));
+  c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_,
+                           conv_params.linear_spatial,
+                           src_def.HasAxis(Axis::DEPTH));
   if (!late_oob_check) {
     c += " if (" + dst_oob_check + ") {\n";
     c += " return;\n";
@@ -1051,12 +1027,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   if (device_info.IsNvidia()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(2, 0, 1);
+      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(1, 0, 2);
+      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(2, 1, 1, 4);
@@ -1096,12 +1072,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   } else if (device_info.IsPowerVR()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(2, 0, 1);
+      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(1, 0, 2);
+      work_group_launch_order_ = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.weights_data_type =
@@ -1144,11 +1120,11 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   } else if (device_info.IsAMD()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(32, 1, 1);
-      conv_params.work_group_launch_order = int3(2, 0, 1);
+      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
      work_group_size_ = int3(8, 4, 1);
-      conv_params.work_group_launch_order = int3(2, 0, 1);
+      work_group_launch_order_ = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    }
 
@@ -1207,7 +1183,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
      conv_params.src_depth_loop_size = 4;
    }
    work_group_size_ = int3(4, 4, 1);
-    conv_params.work_group_launch_order = int3(0, 1, 2);
+    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
   } else if (device_info.IsAdreno()) {
@@ -1222,7 +1198,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
      }
    }
    work_group_size_ = int3(8, 2, 1);
-    conv_params.work_group_launch_order = int3(0, 1, 2);
+    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    if (definition.src_tensors.size() == 2) {
@@ -1234,12 +1210,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   } else if (device_info.IsIntel()) {
    if (different_weights_for_height) {
      work_group_size_ = int3(16, 1, 1);
-      conv_params.work_group_launch_order = int3(0, 1, 2);
+      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    } else {
      conv_params.linear_spatial = true;
      work_group_size_ = int3(16, 1, 1);
-      conv_params.work_group_launch_order = int3(0, 1, 2);
+      work_group_launch_order_ = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    }
    conv_params.block_size = int4(1, 1, 1, 4);
@@ -1274,7 +1250,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
   } else {
    conv_params.block_size = int4(1, 1, 1, 4);
    work_group_size_ = int3(8, 2, 1);
-    conv_params.work_group_launch_order = int3(0, 1, 2);
+    work_group_launch_order_ = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
@@ -83,7 +83,6 @@ class ConvPowerVR : public GPUOperation {
     // F32_F16 precision mode
     DataType weights_data_type;  // used for weights and biases
     int4 block_size;             // WHDS
-    int3 work_group_launch_order;
     bool fixed_work_group_size;
     bool linear_spatial;  // spatial dimensions are Width/Height/Depth
     bool different_weights_for_height;
@@ -46,9 +46,11 @@ class OpenClConverterImpl : public TensorObjectConverter {
     RETURN_IF_ERROR(kernel_.SetMemoryAuto(buffer_mem));
     RETURN_IF_ERROR(args_.SetObjectRef("tensor", tensor));
     RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter()));
-    int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(),
-                     tensor->Slices());
-    return queue_->DispatchImplicit(kernel_, grid, {16, 8, 1});
+    const int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(),
+                           tensor->Slices());
+    const int3 work_group_size = {16, 8, 1};
+    const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
+    return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
   }
 
   Arguments args_;
@@ -173,9 +175,11 @@ class TensorToTensorConverter : public OpenClConverterImpl {
     RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", &src_tensor));
     RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", &dst_tensor));
     RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
-    int3 grid = int3(dst_tensor.Width() * dst_tensor.Batch(),
-                     dst_tensor.Height(), dst_tensor.Slices());
-    return queue_->DispatchImplicit(kernel_, grid, {16, 8, 1});
+    const int3 grid = int3(dst_tensor.Width() * dst_tensor.Batch(),
+                           dst_tensor.Height(), dst_tensor.Slices());
+    const int3 work_group_size = {16, 8, 1};
+    const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
+    return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
   }
 
  private:
@@ -29,10 +29,9 @@ namespace gpu {
 namespace cl {
 ConvolutionTransposed3x3::ConvolutionTransposed3x3(
     const OperationDef& definition, const DeviceInfo& device_info, int2 padding)
-    : GPUOperation(definition),
-      padding_(padding),
-      work_group_launch_order_(2, 0, 1) {
+    : GPUOperation(definition), padding_(padding) {
   work_group_size_ = int3(8, 4, 1);
+  work_group_launch_order_ = int3(2, 0, 1);
   if (device_info.IsPowerVR()) {
     weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC;
   } else if (device_info.IsNvidia() || device_info.IsIntel()) {
@@ -54,14 +53,12 @@ ConvolutionTransposed3x3::ConvolutionTransposed3x3(
     ConvolutionTransposed3x3&& operation)
     : GPUOperation(std::move(operation)),
       padding_(operation.padding_),
-      work_group_launch_order_(operation.work_group_launch_order_),
       weights_upload_type_(operation.weights_upload_type_) {}
 
 ConvolutionTransposed3x3& ConvolutionTransposed3x3::operator=(
     ConvolutionTransposed3x3&& operation) {
   if (this != &operation) {
     std::swap(padding_, operation.padding_);
-    std::swap(work_group_launch_order_, operation.work_group_launch_order_);
     std::swap(weights_upload_type_, operation.weights_upload_type_);
     GPUOperation::operator=(std::move(operation));
   }
@@ -315,17 +312,23 @@ absl::Status ConvolutionTransposed3x3::BindArguments(ArgumentsBinder* args) {
   return args->SetInt("padding_y", padding_y);
 }
 
+void ConvolutionTransposed3x3::GetPossibleKernelWorkGroups(
+    TuningType tuning_type, const DeviceInfo& device_info,
+    const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
+  if (weights_upload_type_ == WeightsUploadType::LOCAL_MEM_ASYNC ||
+      weights_upload_type_ == WeightsUploadType::LOCAL_MEM_BY_THREADS) {
+    work_groups->push_back(work_group_size_);
+    return;
+  }
+  GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
+                            work_groups);
+}
+
 int3 ConvolutionTransposed3x3::GetGridSize() const {
   const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
   const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
   const int grid_z = dst_[0]->Slices();
-  int3 wg;
-  wg.x = DivideRoundUp(grid_x, work_group_size_.x);
-  wg.y = DivideRoundUp(grid_y, work_group_size_.y);
-  wg.z = DivideRoundUp(grid_z, work_group_size_.z);
-  return int3(wg[work_group_launch_order_[0]] * work_group_size_.x,
-              wg[work_group_launch_order_[1]] * work_group_size_.y,
-              wg[work_group_launch_order_[2]] * work_group_size_.z);
+  return int3(grid_x, grid_y, grid_z);
 }
 
 bool IsConvolutionTransposed3x3Supported(
@@ -40,9 +40,7 @@ class ConvolutionTransposed3x3 : public GPUOperation {
   void GetPossibleKernelWorkGroups(
       TuningType tuning_type, const DeviceInfo& device_info,
       const KernelInfo& kernel_info,
-      std::vector<int3>* work_groups) const override {
-    work_groups->push_back(work_group_size_);
-  }
+      std::vector<int3>* work_groups) const override;
   absl::Status BindArguments(ArgumentsBinder* args) override;
   int3 GetGridSize() const override;
 
@@ -78,7 +76,6 @@ class ConvolutionTransposed3x3 : public GPUOperation {
                            int2 padding, int3 work_group_launch_order);
 
   int2 padding_;
-  int3 work_group_launch_order_;
   WeightsUploadType weights_upload_type_;
 };
 
@@ -49,6 +49,33 @@ std::string GetElementWiseCode(const OperationDef& op_def,
   return c;
 }
 
+int3 GetWorkGroupsCount(int grid_dimension, const int3& grid_size,
+                        const int3& work_group_size,
+                        const int3& work_group_launch_order) {
+  int3 work_groups_count;
+  if (grid_dimension == 1) {
+    work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    work_groups_count.y = 1;
+    work_groups_count.z = 1;
+  } else if (grid_dimension == 2) {
+    int3 wgs;
+    wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+    work_groups_count.x = wgs[work_group_launch_order[0]];
+    work_groups_count.y = wgs[work_group_launch_order[1]];
+    work_groups_count.z = 1;
+  } else {  // grid_dimension == 3
+    int3 wgs;
+    wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+    wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+    wgs.z = DivideRoundUp(grid_size.z, work_group_size.z);
+    work_groups_count.x = wgs[work_group_launch_order[0]];
+    work_groups_count.y = wgs[work_group_launch_order[1]];
+    work_groups_count.z = wgs[work_group_launch_order[2]];
+  }
+  return work_groups_count;
+}
+
 }  // namespace
 
 DataType OperationDef::GetDataType() const {
@@ -106,9 +133,12 @@ GPUOperation::GPUOperation(GPUOperation&& operation)
       src_(std::move(operation.src_)),
       dst_(std::move(operation.dst_)),
       kernel_(std::move(operation.kernel_)),
+      grid_dimension_(operation.grid_dimension_),
+      work_group_launch_order_(operation.work_group_launch_order_),
       grid_size_(operation.grid_size_),
       src_tensors_names_(std::move(operation.src_tensors_names_)),
       dst_tensors_names_(std::move(operation.dst_tensors_names_)),
+      work_groups_count_(operation.work_groups_count_),
       linkable_count_(operation.linkable_count_),
       elementwise_code_(std::move(operation.elementwise_code_)) {}
 
@@ -126,9 +156,12 @@ GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
     src_ = std::move(operation.src_);
     dst_ = std::move(operation.dst_);
     kernel_ = std::move(operation.kernel_);
+    std::swap(grid_dimension_, operation.grid_dimension_);
+    std::swap(work_group_launch_order_, operation.work_group_launch_order_);
     std::swap(grid_size_, operation.grid_size_);
     src_tensors_names_ = std::move(operation.src_tensors_names_);
     dst_tensors_names_ = std::move(operation.dst_tensors_names_);
+    std::swap(work_groups_count_, operation.work_groups_count_);
     std::swap(linkable_count_, operation.linkable_count_);
     elementwise_code_ = std::move(operation.elementwise_code_);
   }
@@ -185,6 +218,8 @@ absl::Status GPUOperation::UpdateParams() {
   }
   RETURN_IF_ERROR(BindArguments(&args_));
   grid_size_ = GetGridSize();
+  work_groups_count_ = GetWorkGroupsCount(
+      grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_);
   return absl::OkStatus();
 }
 
@@ -245,14 +280,26 @@ absl::Status GPUOperation::Tune(const TuningParameters& params) {
   }
   if (possible_work_groups.size() == 1) {
     work_group_size_ = possible_work_groups[0];
+    work_groups_count_ =
+        GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_,
+                           work_group_launch_order_);
     return absl::OkStatus();
   } else {
+    std::vector<int3> work_groups_count(possible_work_groups.size());
+    for (int i = 0; i < work_groups_count.size(); ++i) {
+      work_groups_count[i] =
+          GetWorkGroupsCount(grid_dimension_, grid_size_,
+                             possible_work_groups[i], work_group_launch_order_);
+    }
     RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
     int best_work_group_index;
     RETURN_IF_ERROR(params.queue->GetBestWorkGroupIndex(
-        kernel_, *params.info, grid_size_, possible_work_groups,
+        kernel_, *params.info, work_groups_count, possible_work_groups,
         &best_work_group_index));
     work_group_size_ = possible_work_groups[best_work_group_index];
+    work_groups_count_ =
+        GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_,
+                           work_group_launch_order_);
     return absl::OkStatus();
   }
 }
@@ -120,7 +120,7 @@ class GPUOperation {
 
   absl::Status AddToQueue(CLCommandQueue* queue) {
     RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
-    return queue->DispatchImplicit(kernel_, grid_size_, work_group_size_);
+    return queue->Dispatch(kernel_, work_groups_count_, work_group_size_);
   }
 
   virtual void GetPossibleKernelWorkGroups(
@@ -174,11 +174,14 @@ class GPUOperation {
   std::vector<Tensor*> src_;
   std::vector<Tensor*> dst_;
   CLKernel kernel_;
+  int grid_dimension_ = 3;  // can be 1, 2 or 3
+  int3 work_group_launch_order_ = int3(0, 1, 2);
   int3 grid_size_ = int3(0, 0, 0);
   std::vector<std::string> src_tensors_names_;
   std::vector<std::string> dst_tensors_names_;
 
  private:
+  int3 work_groups_count_ = int3(0, 0, 0);
   int linkable_count_ = 0;
   std::string elementwise_code_;  // temporary, used during op construction
 };
@@ -188,6 +188,14 @@ int GetRecommendedBlockSizeForConv(const DeviceInfo& device_info,
   return block_size;
 }
 
+int3 GetWorkGroupsCount(const int3& grid_size, const int3& work_group_size) {
+  int3 work_groups_count;
+  work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
+  work_groups_count.y = DivideRoundUp(grid_size.y, work_group_size.y);
+  work_groups_count.z = DivideRoundUp(grid_size.z, work_group_size.z);
+  return work_groups_count;
+}
+
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
@@ -213,6 +213,8 @@ int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size);
 int GetRecommendedBlockSizeForConv(const DeviceInfo& device,
                                    CalculationsPrecision precision,
                                    int task_size);
+
+int3 GetWorkGroupsCount(const int3& grid_size, const int3& work_group_size);
 }  // namespace cl
 }  // namespace gpu
 }  // namespace tflite
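For reference, the arithmetic behind the new helpers: each axis launches DivideRoundUp(grid, work_group) groups, and work_group_launch_order only permutes which per-axis count lands on which launch dimension. A standalone sketch that mirrors the grid_dimension == 3 branch of the GetWorkGroupsCount helper added above (Int3 and DivideRoundUp are local stand-ins for the TFLite types, not the real headers):

#include <cstdio>

struct Int3 { int v[3]; };  // stand-in for tflite::gpu::int3

static int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

// Mirrors the grid_dimension == 3 branch of GetWorkGroupsCount.
static Int3 WorkGroupsCount3D(const Int3& grid, const Int3& wg, const Int3& order) {
  int counts[3];
  for (int i = 0; i < 3; ++i) counts[i] = DivideRoundUp(grid.v[i], wg.v[i]);
  return {{counts[order.v[0]], counts[order.v[1]], counts[order.v[2]]}};
}

int main() {
  const Int3 grid = {{100, 30, 8}};  // e.g. width*batch, height, slices
  const Int3 wg = {{8, 4, 1}};       // work group size
  const Int3 order = {{2, 0, 1}};    // one of the launch orders used by ConvPowerVR
  const Int3 n = WorkGroupsCount3D(grid, wg, order);
  // Per-axis counts are 13, 8, 8; the launch order permutes them to 8, 13, 8.
  std::printf("%d %d %d\n", n.v[0], n.v[1], n.v[2]);
  return 0;
}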