ConvPowerVR improvements for FP16 mode.
PiperOrigin-RevId: 265767693
Commit: 7301b746f2
Parent: 28e24ad2f1
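The diff below replaces ConvPowerVR's single `block_size` parameter with a `ConvParams` struct (block size, work-group size, source-depth loop unrolling, explicit synchronization) and adds a `GuessBestParams` heuristic that tunes these values, most aggressively for FP16. The following standalone C++ sketch restates that selection logic outside the TFLite sources so it is easier to follow; `Int3`, `ConvParams`, and `GuessBestParamsSketch` are simplified stand-ins mirroring the names in the diff, not the upstream implementation.

```cpp
#include <algorithm>

// Simplified stand-ins for the TFLite GPU types used in the diff.
struct Int3 { int x, y, z; };
struct ConvParams {
  Int3 block_size{1, 1, 4};       // output elements computed per work-item
  Int3 work_group_size{8, 4, 1};  // OpenCL work-group shape
  int src_depth_loop_size = 1;    // unroll factor over source depth slices
  bool explicit_sync = true;      // emit barrier()/wait_group_events() calls
};

int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

// Mirrors the selection logic of GuessBestParams in the diff (assumption:
// 'is_power_vr' and 'f16' replace the CLDevice / precision queries).
ConvParams GuessBestParamsSketch(bool is_power_vr, bool f16,
                                 int output_channels, int input_channels) {
  ConvParams p;
  p.explicit_sync = !is_power_vr;  // barriers can be elided on PowerVR
  const int dst_depth = DivideRoundUp(output_channels, 4);
  const int src_depth = DivideRoundUp(input_channels, 4);
  if (dst_depth % 8 == 0 || dst_depth >= 32) {
    p.block_size.z = 8;
  } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
    p.block_size.z = 4;
  } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
    p.block_size.z = 2;
  } else {
    p.block_size.z = dst_depth;
  }
  if (f16) {
    // FP16: smaller Z block, wider X block, unrolling over source depth.
    p.block_size.z = std::min(4, p.block_size.z);
    if (src_depth % 2 == 0) p.src_depth_loop_size = 2;
    if (src_depth % 4 == 0 && p.block_size.z <= 2) p.src_depth_loop_size = 4;
    if (p.block_size.z == 1) {
      if (src_depth % 8 == 0) p.src_depth_loop_size = 8;
      if (src_depth % 4 == 0) p.src_depth_loop_size = 4;
      if (src_depth % 2 == 0) p.src_depth_loop_size = 2;
      if (src_depth <= 8) p.src_depth_loop_size = src_depth;
    }
    p.block_size.x = 2;
    p.work_group_size = {4, 8, 1};
  }
  return p;
}
```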
@@ -190,6 +190,7 @@ cc_library(
":gpu_operation",
":util",
"//tensorflow/lite/delegates/gpu/cl:buffer",
"//tensorflow/lite/delegates/gpu/cl:cl_device",
"//tensorflow/lite/delegates/gpu/cl:linear_storage",
"//tensorflow/lite/delegates/gpu/cl:precision",
"//tensorflow/lite/delegates/gpu/cl:tensor",

@@ -15,6 +15,7 @@ limitations under the License.

#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"

#include <algorithm>
#include <string>
#include <utility>

@@ -27,155 +28,16 @@ limitations under the License.
namespace tflite {
namespace gpu {
namespace cl {
namespace {

std::string GenerateConvPowerVR1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const int3& block_size,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);

bool power_vr = true;
c += "#define SIMD_BARRIER " +
(power_vr ? std::string("")
: std::string("barrier(CLK_LOCAL_MEM_FENCE)")) +
"\n";
c += "#define SIMD_WAIT_EVENT(E) " +
(power_vr ? std::string("") : std::string("wait_group_events(1, &E);")) +
"\n";
c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __global ACCUM_FLT4* filters_buffer, \n";
c += " __global ACCUM_FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = (get_group_id(1) * 8 + get_local_id(0)) * " +
std::to_string(block_size.x) + ";\n";
c += " int Y = (get_group_id(2) * 4 + get_local_id(1)) * " +
std::to_string(block_size.y) + ";\n";
c += " int Z = (get_group_id(0) * 1 + get_local_id(2)) * " +
std::to_string(block_size.z) + ";\n";
for (int z = 0; z < block_size.z; ++z) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
c += " ACCUM_FLT4 r" + std::to_string(z) + std::to_string(y) +
std::to_string(x) + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
}
}
c += " __local ACCUM_FLT4 data[" + std::to_string(block_size.z * 4) + "];\n";
|
||||
c += " __global ACCUM_FLT4* filters_loc = filters_buffer + Z * 4 * "
|
||||
"src_size.w;\n";
|
||||
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
|
||||
c += " const int src_layer_offset = src_size.x * src_size.y;\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
std::string xc = "min(X + " + std::to_string(x) + ", src_size.x - 1)";
|
||||
std::string yc = "min(Y + " + std::to_string(y) + ", src_size.y - 1)";
|
||||
std::string id = std::to_string(y) + std::to_string(x);
|
||||
c += " int src_a_" + id + " = " + yc + " * src_size.x + " + xc + ";\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
c += " int s = 0;\n";
|
||||
c += " do {\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
|
||||
std::string id = std::to_string(y) + std::to_string(x);
|
||||
if (precision == CalculationsPrecision::F32_F16) {
|
||||
c += " ACCUM_FLT4 src" + id + " = convert_float4(src_data[src_a_" +
|
||||
id + "]);\n";
|
||||
} else {
|
||||
c += " FLT4 src" + id + " = src_data[src_a_" + id + "];\n";
|
||||
}
|
||||
c += " src_a_" + id + " += src_layer_offset;\n";
|
||||
} else {
|
||||
std::string id = std::to_string(y) + std::to_string(x);
|
||||
if (precision == CalculationsPrecision::F32_F16) {
|
||||
c += " ACCUM_FLT4 src" + id + " = " +
|
||||
src_tensor.ReadAsFloat3D("X + " + std::to_string(x),
|
||||
"Y + " + std::to_string(y), "s",
|
||||
TextureAddressMode::DONT_CARE) +
|
||||
";\n";
|
||||
} else {
|
||||
c += " FLT4 src" + id + " = " +
|
||||
src_tensor.Read3D("X + " + std::to_string(x),
|
||||
"Y + " + std::to_string(y), "s",
|
||||
TextureAddressMode::DONT_CARE) +
|
||||
";\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
c += " SIMD_BARRIER;\n";
|
||||
c += " event_t e = async_work_group_copy(data, filters_loc, " +
|
||||
std::to_string(block_size.z * 4) + ", 0);\n";
|
||||
c += " SIMD_WAIT_EVENT(e);\n";
|
||||
c += " s += 1;\n";
|
||||
const std::string channels[] = {"x", "y", "z", "w"};
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
for (int ch = 0; ch < 4; ++ch) {
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
std::string id = std::to_string(y) + std::to_string(x);
|
||||
c += " r" + std::to_string(z) + id + " += data[" +
|
||||
std::to_string(z * 4 + ch) + "] * src" + id + "." +
|
||||
channels[ch] + ";\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
c += " filters_loc += " + std::to_string(block_size.z * 4) + ";\n";
|
||||
c += " } while (s < src_size.w);\n";
|
||||
c += " SIMD_BARRIER;\n";
|
||||
c += " event_t e = async_work_group_copy(data, biases + Z, " +
|
||||
std::to_string(block_size.z) + ", 0);\n";
|
||||
c += " SIMD_WAIT_EVENT(e);\n";
|
||||
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) {\n";
|
||||
c += " return;\n";
|
||||
c += " }\n";
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
c += " if (Z + " + std::to_string(z) + " >= dst_size.w) return;\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
const std::string xs = "X + " + std::to_string(x);
|
||||
const std::string ys = "Y + " + std::to_string(y);
|
||||
const std::string zs = "Z + " + std::to_string(z);
|
||||
const std::string r_id =
|
||||
std::to_string(z) + std::to_string(y) + std::to_string(x);
|
||||
c += " if (" + xs + " < dst_size.x && " + ys + " < dst_size.y) {\n";
|
||||
c += " FLT4 res = TO_FLT4(r" + r_id + " + data[" +
|
||||
std::to_string(z) + "]);\n";
|
||||
c += " " + dst_tensor.GetAddress("address", xs, ys, zs) + "\n";
|
||||
c += PostProcess(linked_operations, "res", zs, "address");
|
||||
c += " " + dst_tensor.Write3D("res", "address") + "\n";
|
||||
c += " }\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
c += "}\n";
|
||||
return c;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
ConvPowerVR::ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr,
const int3& block_size)
const ConvParams& conv_params)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
block_size_(block_size),
work_group_size_(8, 4, 1) {}
conv_params_(conv_params) {}

ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
: GPUOperation(std::move(operation)),
@@ -185,9 +47,8 @@ ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
block_size_(operation.block_size_),
kernel_(std::move(operation.kernel_)),
work_group_size_(operation.work_group_size_) {}
conv_params_(operation.conv_params_),
kernel_(std::move(operation.kernel_)) {}

ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
if (this != &operation) {
@@ -197,9 +58,8 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(block_size_, operation.block_size_);
std::swap(conv_params_, operation.conv_params_);
kernel_ = std::move(operation.kernel_);
std::swap(work_group_size_, operation.work_group_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
@@ -208,7 +68,7 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
Status ConvPowerVR::Compile(const CreationContext& creation_context) {
const std::string code = GenerateConvPowerVR1x1(
definition_.src_tensors[0], definition_.dst_tensors[0],
definition_.precision, block_size_, linked_operations_);
definition_.precision, conv_params_, linked_operations_);
std::vector<CompilerOptions> options;
if (definition_.precision == CalculationsPrecision::F16 &&
creation_context.device->IsPowerVR()) {
@@ -232,19 +92,210 @@ Status ConvPowerVR::BindArguments() {
}

int3 ConvPowerVR::GetGridSize() const {
const int grid_x = IntegralDivideRoundUp(dst_[0]->Width(), block_size_.x);
const int grid_y = IntegralDivideRoundUp(dst_[0]->Height(), block_size_.y);
const int grid_z = IntegralDivideRoundUp(dst_[0]->Depth(), block_size_.z);
const int wg_x = IntegralDivideRoundUp(grid_x, work_group_size_.x);
const int wg_y = IntegralDivideRoundUp(grid_y, work_group_size_.y);
const int wg_z = IntegralDivideRoundUp(grid_z, work_group_size_.z);
return int3(wg_z * work_group_size_.x, wg_x * work_group_size_.y,
wg_y * work_group_size_.z);
const int grid_x =
IntegralDivideRoundUp(dst_[0]->Width(), conv_params_.block_size.x);
const int grid_y =
IntegralDivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
const int grid_z =
IntegralDivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
const int wg_x =
IntegralDivideRoundUp(grid_x, conv_params_.work_group_size.x);
const int wg_y =
IntegralDivideRoundUp(grid_y, conv_params_.work_group_size.y);
const int wg_z =
IntegralDivideRoundUp(grid_z, conv_params_.work_group_size.z);
return int3(wg_z * conv_params_.work_group_size.x,
wg_x * conv_params_.work_group_size.y,
wg_y * conv_params_.work_group_size.z);
}

Status ConvPowerVR::AddToQueue(CLCommandQueue* queue) {
RETURN_IF_ERROR(BindArguments());
return queue->DispatchImplicit(kernel_, GetGridSize(), work_group_size_);
return queue->DispatchImplicit(kernel_, GetGridSize(),
conv_params_.work_group_size);
}

std::string GenerateConvPowerVR1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const ConvPowerVR::ConvParams& conv_params,
const std::vector<ElementwiseOperation*>& linked_operations) {
std::string c = GetCommonDefines(precision);
TensorCodeGenerator src_tensor("src_data", "src_size", src_descriptor);
TensorCodeGenerator dst_tensor("dst_data", "dst_size", dst_descriptor);

c += "#define SIMD_BARRIER " +
(!conv_params.explicit_sync
? std::string("")
: std::string("barrier(CLK_LOCAL_MEM_FENCE)")) +
"\n";
c += "#define SIMD_WAIT_EVENT(E) " +
(!conv_params.explicit_sync ? std::string("")
: std::string("wait_group_events(1, &E);")) +
"\n";
const int3 work_group_size = conv_params.work_group_size;
const int3 block_size = conv_params.block_size;
c += "__attribute__((reqd_work_group_size(" +
std::to_string(work_group_size.x) + ", " +
std::to_string(work_group_size.y) + ", " +
std::to_string(work_group_size.z) + ")))\n";
c += "__kernel void main_function(\n";
c += src_tensor.GetDeclaration(AccessType::READ) + ",\n";
c += " __global ACCUM_FLT4* filters_buffer, \n";
c += " __global ACCUM_FLT4* biases \n";
c += GetArgsDeclaration(linked_operations);
c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n";
c += " int4 src_size, \n";
c += " int4 dst_size \n";
c += ") {\n";
c += " int X = (get_group_id(1) * 8 + get_local_id(0)) * " +
std::to_string(block_size.x) + ";\n";
c += " int Y = (get_group_id(2) * 4 + get_local_id(1)) * " +
std::to_string(block_size.y) + ";\n";
c += " int Z = (get_group_id(0) * 1 + get_local_id(2)) * " +
std::to_string(block_size.z) + ";\n";
for (int z = 0; z < block_size.z; ++z) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
c += " ACCUM_FLT4 r" + std::to_string(z) + std::to_string(y) +
std::to_string(x) + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
}
}
c += " __local ACCUM_FLT4 data[" +
std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) +
"];\n";
c += " __global ACCUM_FLT4* filters_loc = filters_buffer + Z * 4 * "
"src_size.w;\n";
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
c += " const int src_layer_offset = src_size.x * src_size.y;\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
std::string xc = "min(X + " + std::to_string(x) + ", src_size.x - 1)";
std::string yc = "min(Y + " + std::to_string(y) + ", src_size.y - 1)";
std::string id = std::to_string(y) + std::to_string(x);
c += " int src_a_" + id + " = " + yc + " * src_size.x + " + xc + ";\n";
}
}
}

auto declare_src = [&]() {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
const std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " ACCUM_FLT4 src" + id + ";\n";
} else {
c += " FLT4 src" + id + ";\n";
}
}
}
};
auto read_src = [&]() {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
if (src_descriptor.storage_type == TensorStorageType::BUFFER) {
std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " src" + id + " = convert_float4(src_data[src_a_" + id +
"]);\n";
} else {
c += " src" + id + " = src_data[src_a_" + id + "];\n";
}
c += " src_a_" + id + " += src_layer_offset;\n";
} else {
std::string id = std::to_string(y) + std::to_string(x);
if (precision == CalculationsPrecision::F32_F16) {
c += " src" + id + " = " +
src_tensor.ReadAsFloat3D("X + " + std::to_string(x),
"Y + " + std::to_string(y), "s",
TextureAddressMode::DONT_CARE) +
";\n";
} else {
c += " src" + id + " = " +
src_tensor.Read3D("X + " + std::to_string(x),
"Y + " + std::to_string(y), "s",
TextureAddressMode::DONT_CARE) +
";\n";
}
}
}
}
};
auto conv_core = [&]() {
const std::string channels[] = {"x", "y", "z", "w"};
for (int z = 0; z < block_size.z; ++z) {
for (int ch = 0; ch < 4; ++ch) {
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
std::string id = std::to_string(y) + std::to_string(x);
c += " r" + std::to_string(z) + id + " += data[" +
std::to_string(z * 4 + ch) + "] * src" + id + "." +
channels[ch] + ";\n";
}
}
}
}
};

c += " int s = 0;\n";
|
||||
c += " do {\n";
|
||||
declare_src();
|
||||
c += " SIMD_BARRIER;\n";
|
||||
c += " event_t e = async_work_group_copy(data, filters_loc, " +
|
||||
std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) +
|
||||
", 0);\n";
|
||||
read_src();
|
||||
c += " SIMD_WAIT_EVENT(e);\n";
|
||||
c += " s += 1;\n";
|
||||
conv_core();
|
||||
for (int i = 1; i < conv_params.src_depth_loop_size; ++i) {
|
||||
read_src();
|
||||
conv_core();
|
||||
c += " s += 1;\n";
|
||||
}
|
||||
c += " filters_loc += " +
|
||||
std::to_string(block_size.z * 4 * conv_params.src_depth_loop_size) +
|
||||
";\n";
|
||||
c += " } while (s < src_size.w);\n";
|
||||
c += " SIMD_BARRIER;\n";
|
||||
c += " event_t e = async_work_group_copy(data, biases + Z, " +
|
||||
std::to_string(block_size.z) + ", 0);\n";
|
||||
c += " SIMD_WAIT_EVENT(e);\n";
|
||||
c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.w) {\n";
|
||||
c += " return;\n";
|
||||
c += " }\n";
|
||||
for (int z = 0; z < block_size.z; ++z) {
|
||||
c += " if (Z + " + std::to_string(z) + " >= dst_size.w) return;\n";
|
||||
for (int y = 0; y < block_size.y; ++y) {
|
||||
for (int x = 0; x < block_size.x; ++x) {
|
||||
const std::string xs = "X + " + std::to_string(x);
|
||||
const std::string ys = "Y + " + std::to_string(y);
|
||||
const std::string zs = "Z + " + std::to_string(z);
|
||||
const std::string r_id =
|
||||
std::to_string(z) + std::to_string(y) + std::to_string(x);
|
||||
bool need_x_check = x != 0;
|
||||
bool need_y_check = y != 0;
|
||||
if (need_x_check && need_y_check) {
|
||||
c += " if (" + xs + " < dst_size.x && " + ys + " < dst_size.y) {\n";
|
||||
} else if (need_x_check && !need_y_check) {
|
||||
c += " if (" + xs + " < dst_size.x) {\n";
|
||||
} else if (!need_x_check && need_y_check) {
|
||||
c += " if (" + ys + " < dst_size.y) {\n";
|
||||
} else {
|
||||
c += " {\n";
|
||||
}
|
||||
c += " FLT4 res = TO_FLT4(r" + r_id + " + data[" +
|
||||
std::to_string(z) + "]);\n";
|
||||
c += " " + dst_tensor.GetAddress("address", xs, ys, zs) + "\n";
|
||||
c += PostProcess(linked_operations, "res", zs, "address");
|
||||
c += " " + dst_tensor.Write3D("res", "address") + "\n";
|
||||
c += " }\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
c += "}\n";
|
||||
return c;
|
||||
}
|
||||
|
||||
bool IsConvPowerVRSupported(const OperationDef& definition,
@@ -255,25 +306,61 @@ bool IsConvPowerVRSupported(const OperationDef& definition,
attr.padding.appended == HW(0, 0);
}

ConvPowerVR::ConvParams GuessBestParams(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvPowerVR::ConvParams conv_params;
conv_params.block_size = int3(1, 1, 4);
conv_params.work_group_size = int3(8, 4, 1);
conv_params.src_depth_loop_size = 1;
conv_params.explicit_sync = !device.IsPowerVR();
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
const int src_depth = IntegralDivideRoundUp(attr.weights.shape.i, 4);
if (dst_depth % 8 == 0 || dst_depth >= 32) {
conv_params.block_size.z = 8;
} else if (dst_depth % 4 == 0 || dst_depth >= 8) {
conv_params.block_size.z = 4;
} else if (dst_depth % 2 == 0 || dst_depth >= 4) {
conv_params.block_size.z = 2;
} else {
conv_params.block_size.z = dst_depth;
}
if (definition.precision == CalculationsPrecision::F16) {
conv_params.block_size.z = std::min(4, conv_params.block_size.z);
if (src_depth % 2 == 0) {
conv_params.src_depth_loop_size = 2;
}
if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
conv_params.src_depth_loop_size = 4;
}
if (conv_params.block_size.z == 1) {
if (src_depth % 8 == 0) {
conv_params.src_depth_loop_size = 8;
}
if (src_depth % 4 == 0) {
conv_params.src_depth_loop_size = 4;
}
if (src_depth % 2 == 0) {
conv_params.src_depth_loop_size = 2;
}
if (src_depth <= 8) {
conv_params.src_depth_loop_size = src_depth;
}
}
conv_params.block_size.x = 2;
conv_params.work_group_size = int3(4, 8, 1);
}

return conv_params;
}

Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result) {
int3 block_size = int3(1, 1, 4);
const int dst_depth = IntegralDivideRoundUp(attr.weights.shape.o, 4);
if (dst_depth % 8 == 0 || dst_depth >= 32) {
block_size.z = 8;
} else if (dst_depth % 4 == 0 || dst_depth >= 8) {
block_size.z = 4;
} else if (dst_depth % 2 == 0 || dst_depth >= 4) {
block_size.z = 2;
} else {
block_size.z = dst_depth;
}
if (definition.precision == CalculationsPrecision::F16) {
block_size.y = 2;
}
*result = ConvPowerVR(definition, attr, block_size);
*result =
ConvPowerVR(definition, attr,
GuessBestParams(*creation_context.device, definition, attr));
RETURN_IF_ERROR(
result->UploadWeights(attr.weights, creation_context.context));
LinearStorageCreateInfo create_info;
@@ -19,6 +19,7 @@ limitations under the License.
#include <vector>

#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
@@ -49,12 +50,16 @@ class ConvPowerVR : public GPUOperation {
ConvPowerVR& operator=(const ConvPowerVR&) = delete;

private:
friend Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result);
struct ConvParams {
int3 block_size;
int3 work_group_size;
int src_depth_loop_size;
bool explicit_sync;
};

ConvPowerVR(const OperationDef& definition,
const Convolution2DAttributes& attr, const int3& block_size);
const Convolution2DAttributes& attr,
const ConvParams& conv_params);
template <DataType T>
Status UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
CLContext* context);
@@ -62,6 +67,21 @@ class ConvPowerVR : public GPUOperation {
void RearrangeWeight(const ::tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);

friend Status CreateConvPowerVR(const CreationContext& creation_context,
const OperationDef& definition,
const Convolution2DAttributes& attr,
ConvPowerVR* result);

friend std::string GenerateConvPowerVR1x1(
const TensorDescriptor& src_descriptor,
const TensorDescriptor& dst_descriptor, CalculationsPrecision precision,
const ConvParams& conv_params,
const std::vector<ElementwiseOperation*>& linked_operations);

friend ConvParams GuessBestParams(const CLDevice& device,
const OperationDef& definition,
const Convolution2DAttributes& attr);

Status BindArguments();
int3 GetGridSize() const;

@@ -72,10 +92,9 @@ class ConvPowerVR : public GPUOperation {
int2 stride_;
int2 padding_;
int2 dilation_;
int3 block_size_;
ConvParams conv_params_;

CLKernel kernel_;
int3 work_group_size_;
};

template <DataType T>
@@ -87,7 +106,7 @@ Status ConvPowerVR::UploadWeights(const ::tflite::gpu::Tensor<OHWI, T>& weights,
const bool f32_weights = definition_.precision != CalculationsPrecision::F16;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);

const int dst_depth_aligned = AlignByN(dst_depth, block_size_.z);
const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z);
const int elements_count =
weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;

@@ -113,16 +132,17 @@ void ConvPowerVR::RearrangeWeight(const ::tflite::gpu::Tensor<OHWI, S>& weights,
const int kernel_y = weights.shape.h;

int counter = 0;
for (int d = 0; d < IntegralDivideRoundUp(dst_depth, block_size_.z); ++d) {
for (int d = 0;
d < IntegralDivideRoundUp(dst_depth, conv_params_.block_size.z); ++d) {
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
for (int s = 0; s < src_depth; ++s) {
for (int k = 0; k < block_size_.z; ++k) {
for (int k = 0; k < conv_params_.block_size.z; ++k) {
T filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
const int s_ch = s * 4 + j;
const int d_ch = (d * block_size_.z + k) * 4 + i;
const int d_ch = (d * conv_params_.block_size.z + k) * 4 + i;
if (s_ch < weights.shape.i && d_ch < weights.shape.o) {
const int f_index =
weights.shape.LinearIndex({d_ch, y, x, s_ch});