Preparing ConvPowerVR to support 3d convolutions.

PiperOrigin-RevId: 331182880 Change-Id: I811603ff3677349752c6200d6e1cafa62ab3a2ee
2020-09-11 11:05:08 -07:00 · 2020-09-11 11:05:08 -07:00 · 721a91c7e9
commit 721a91c7e9
parent 3a366ec295
2 changed files with 199 additions and 150 deletions
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc
@ -71,32 +71,33 @@ std::string GenerateAsyncUpload(const std::string& local_ptr_name,
  return c;
 }

-std::string GenerateBlockCoords(const int3& block_size,
+std::string GenerateBlockCoords(const int4& block_size,
                                const int3& work_group_launch_order,
-                                bool linear_hw) {
+                                bool linear_spatial) {
  std::string c;
  int3 launch_remap;
  launch_remap[work_group_launch_order.x] = 0;
  launch_remap[work_group_launch_order.y] = 1;
  launch_remap[work_group_launch_order.z] = 2;
-  if (linear_hw) {
+  if (linear_spatial) {
    if (work_group_launch_order[0] == 0) {
-      c += "  int linear_hw = get_global_id(0);\n";
+      c += "  int linear_spatial = get_global_id(0);\n";
    } else {
-      c += "  int linear_hw = get_group_id(" + std::to_string(launch_remap[0]) +
+      c += "  int linear_spatial = get_group_id(" +
+           std::to_string(launch_remap[0]) +
           ") * get_local_size(0) + get_local_id(0);\n";
    }
-    c += "  int DST_Y = (linear_hw / args.task_size_x) * " +
+    c += "  int DST_Y = (linear_spatial / args.task_size_x) * " +
         std::to_string(block_size.y) + ";\n";
-    c += "  int DST_X = (linear_hw % args.task_size_x) * " +
+    c += "  int DST_X = (linear_spatial % args.task_size_x) * " +
         std::to_string(block_size.x) + ";\n";
    if (work_group_launch_order[1] == 1) {
-      c += "  int DST_S = get_global_id(1) * " + std::to_string(block_size.z) +
+      c += "  int DST_S = get_global_id(1) * " + std::to_string(block_size.w) +
           ";\n";
    } else {
      c += "  int DST_S = (get_group_id(" + std::to_string(launch_remap[1]) +
           ") * get_local_size(1) + get_local_id(1)) * " +
-           std::to_string(block_size.z) + ";\n";
+           std::to_string(block_size.w) + ";\n";
    }
  } else {
    if (work_group_launch_order[0] == 0) {
@ -116,12 +117,12 @@ std::string GenerateBlockCoords(const int3& block_size,
           std::to_string(block_size.y) + ";\n";
    }
    if (work_group_launch_order[2] == 2) {
-      c += "  int DST_S = get_global_id(2) * " + std::to_string(block_size.z) +
+      c += "  int DST_S = get_global_id(2) * " + std::to_string(block_size.w) +
           ";\n";
    } else {
      c += "  int DST_S = (get_group_id(" + std::to_string(launch_remap[2]) +
           ") * get_local_size(2) + get_local_id(2)) * " +
-           std::to_string(block_size.z) + ";\n";
+           std::to_string(block_size.w) + ";\n";
    }
  }

@ -133,10 +134,10 @@ ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const Convolution2DAttributes& attr,
                         const DeviceInfo& device_info, const BHWC* dst_shape)
    : GPUOperation(definition),
-      stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w,
-                      -attr.padding.prepended.h),
-      kernel_dilation_(attr.weights.shape.w, attr.weights.shape.h,
-                       attr.dilations.w, attr.dilations.h),
+      stride_(attr.strides.w, attr.strides.h, 1, 1),
+      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
+      kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1),
+      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}

 ConvPowerVR::ConvPowerVR(const OperationDef& definition,
@ -144,10 +145,10 @@ ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const BHWC& weights_shape,
                         const DeviceInfo& device_info, const BHWC* dst_shape)
    : GPUOperation(definition),
-      stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w,
-                      -attr.padding.prepended.h),
-      kernel_dilation_(weights_shape.w, weights_shape.h, attr.dilations.w,
-                       attr.dilations.h),
+      stride_(attr.strides.w, attr.strides.h, 1, 1),
+      padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
+      kernel_size_(weights_shape.w, weights_shape.h, 1, 1),
+      dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
      conv_params_(GuessBestParams(device_info, definition, attr, weights_shape,
                                   dst_shape)) {}

@ -155,25 +156,33 @@ ConvPowerVR::ConvPowerVR(const OperationDef& definition,
                         const FullyConnectedAttributes& attr,
                         const DeviceInfo& device_info, const BHWC* dst_shape)
    : GPUOperation(definition),
-      stride_padding_(1, 1, 0, 0),
-      kernel_dilation_(1, 1, 1, 1),
+      stride_(1, 1, 1, 1),
+      padding_(0, 0, 0, 0),
+      kernel_size_(1, 1, 1, 1),
+      dilation_(1, 1, 1, 1),
      conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {}

 ConvPowerVR::ConvPowerVR(const OperationDef& definition)
    : GPUOperation(definition),
-      stride_padding_(1, 1, 0, 0),
-      kernel_dilation_(1, 1, 1, 1) {}
+      stride_(1, 1, 1, 1),
+      padding_(0, 0, 0, 0),
+      kernel_size_(1, 1, 1, 1),
+      dilation_(1, 1, 1, 1) {}

 ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation)
    : GPUOperation(std::move(operation)),
-      stride_padding_(operation.stride_padding_),
-      kernel_dilation_(operation.kernel_dilation_),
+      stride_(operation.stride_),
+      padding_(operation.padding_),
+      kernel_size_(operation.kernel_size_),
+      dilation_(operation.dilation_),
      conv_params_(operation.conv_params_) {}

 ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {
  if (this != &operation) {
-    std::swap(stride_padding_, operation.stride_padding_);
-    std::swap(kernel_dilation_, operation.kernel_dilation_);
+    std::swap(stride_, operation.stride_);
+    std::swap(padding_, operation.padding_);
+    std::swap(kernel_size_, operation.kernel_size_);
+    std::swap(dilation_, operation.dilation_);
    std::swap(conv_params_, operation.conv_params_);
    GPUOperation::operator=(std::move(operation));
  }
@ -182,7 +191,7 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) {

 void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
  const bool stride_correction =
-      definition_.IsBatchSupported() && stride_padding_.x != 1;
+      definition_.IsBatchSupported() && stride_.x != 1;
  code_ =
      GenerateConv(device_info, definition_, stride_correction, conv_params_);
  if (definition_.precision == CalculationsPrecision::F16 &&
@ -196,18 +205,16 @@ void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {

 absl::Status ConvPowerVR::BindArguments() {
  if (!conv_params_.x_kernel_is_1 || !conv_params_.y_kernel_is_1) {
-    RETURN_IF_ERROR(args_.SetInt("stride_x", stride_padding_.x));
-    RETURN_IF_ERROR(args_.SetInt("stride_y", stride_padding_.y));
-    RETURN_IF_ERROR(
-        args_.SetInt("padding_x", stride_padding_.z * src_[0]->Batch()));
-    RETURN_IF_ERROR(args_.SetInt("padding_y", stride_padding_.w));
-    RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_dilation_.x));
-    RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_dilation_.y));
-    RETURN_IF_ERROR(
-        args_.SetInt("dilation_x", kernel_dilation_.z * src_[0]->Batch()));
-    RETURN_IF_ERROR(args_.SetInt("dilation_y", kernel_dilation_.w));
+    RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
+    RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
+    RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
+    RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
+    RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
+    RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
+    RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
+    RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y));
  }
-  if (conv_params_.linear_hw) {
+  if (conv_params_.linear_spatial) {
    const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(),
                                     conv_params_.block_size.x);
    RETURN_IF_ERROR(args_.SetInt("task_size_x", grid_x));
@ -221,10 +228,10 @@ int3 ConvPowerVR::GetGridSize() const {
  const int grid_y =
      DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
  const int grid_z =
-      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
+      DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w);
  int3 wg;

-  if (conv_params_.linear_hw) {
+  if (conv_params_.linear_spatial) {
    wg.x = DivideRoundUp(grid_x * grid_y, work_group_size_.x);
    wg.y = DivideRoundUp(grid_z, work_group_size_.y);
    return int3(
@ -285,6 +292,28 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
    AddSrcBuffer("weights", desc);
  }

+  const auto& src_def = op_def.src_tensors[0];
+
+  auto generate_id = [&](const std::string& x, const std::string& y,
+                         const std::string& z) {
+    std::string id;
+    if (src_def.HasAxis(Axis::WIDTH)) {
+      id += "_w" + x;
+    }
+    if (src_def.HasAxis(Axis::HEIGHT)) {
+      id += "_h" + y;
+    }
+    if (src_def.HasAxis(Axis::DEPTH)) {
+      id += "_d" + z;
+    }
+    return id;
+  };
+
+  auto generate_id_full = [&](const std::string& x, const std::string& y,
+                              const std::string& z, const std::string& s) {
+    return generate_id(x, y, z) + "_s" + s;
+  };
+
  auto dst_desc = op_def.dst_tensors[0];
  if (op_def.IsBatchSupported()) {
    dst_desc.SetStateVar("BatchedWidth", "true");
@ -302,7 +331,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
    args_.AddInt("dilation_x");
    args_.AddInt("dilation_y");
  }
-  if (conv_params_.linear_hw) {
+  if (conv_params_.linear_spatial) {
    args_.AddInt("task_size_x");
  }

@ -318,7 +347,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
          ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;

  const int local_mem_size =
-      conv_params.block_size.z * 4 * conv_params.src_depth_loop_size;
+      conv_params.block_size.w * 4 * conv_params.src_depth_loop_size;

  const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast();
  const int simd_size = conv_params.simd_size;
@ -343,7 +372,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
      c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
    }
  }
-  const int3 block_size = conv_params.block_size;
+  const int4 block_size = conv_params.block_size;
  if (conv_params.fixed_work_group_size) {
    c += "__attribute__((reqd_work_group_size(" +
         std::to_string(work_group_size_.x) + ", " +
@ -358,7 +387,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
  c += "$0) {\n";
  c += GenerateBlockCoords(conv_params.block_size,
                           conv_params.work_group_launch_order,
-                           conv_params.linear_hw);
+                           conv_params.linear_spatial);
  std::vector<std::string> dst_x(conv_params.block_size.x);
  for (int x = 0; x < conv_params.block_size.x; ++x) {
    dst_x[x] = "(DST_X + " + std::to_string(x) + ")";
@ -376,7 +405,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
  }
  if (conv_params.weights_upload_type ==
      ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
-    if (conv_params.linear_hw) {
+    if (conv_params.linear_spatial) {
      c += "  int lid = get_local_id(0);\n";
    } else {
      c += "  int lid = get_local_id(1) * " +
@ -386,11 +415,17 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
  if (use_simd_broadcast) {
    c += "  int simd_id = get_sub_group_local_id();\n";
  }
-  for (int z = 0; z < block_size.z; ++z) {
-    for (int y = 0; y < block_size.y; ++y) {
-      for (int x = 0; x < block_size.x; ++x) {
-        c += "  ACCUM_FLT4 r" + std::to_string(z) + std::to_string(y) +
-             std::to_string(x) + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
+  for (int s = 0; s < block_size.w; ++s) {
+    const std::string sind = std::to_string(s);
+    for (int z = 0; z < block_size.z; ++z) {
+      const std::string zind = std::to_string(z);
+      for (int y = 0; y < block_size.y; ++y) {
+        const std::string yind = std::to_string(y);
+        for (int x = 0; x < block_size.x; ++x) {
+          const std::string xind = std::to_string(x);
+          c += "  ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) +
+               " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
+        }
      }
    }
  }
@ -427,7 +462,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
        c += "  " + weights_global_ptr +
             " filters_loc = args.weights.GetPtr() + (DST_S * "
             "args.src_tensor.Height() + DST_Y * " +
-             std::to_string(block_size.z) +
+             std::to_string(block_size.w) +
             ") * 4 * args.src_tensor.Slices();\n";
      } else {
        c += "  " + weights_global_ptr +
@ -472,24 +507,28 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
  }
  if (buffer_type) {
    for (int y = 0; y < block_size.y; ++y) {
-      const std::string yck = "yck" + std::to_string(y);
+      const std::string yind = std::to_string(y);
+      const std::string yck = "yck" + yind;
      for (int x = 0; x < block_size.x; ++x) {
-        const std::string xck = "xck" + std::to_string(x);
+        const std::string xind = std::to_string(x);
+        const std::string xck = "xck" + xind;
        std::string xc =
            is1x1 ? "min(" + dst_x[x] + ", args.src_tensor.Width() - 1)" : xck;
        std::string yc =
            is1x1 ? "min(" + dst_y[y] + ", args.src_tensor.Height() - 1)" : yck;
-        std::string id = std::to_string(y) + std::to_string(x);
-        c += "  int src_a_" + id + " = " + yc +
-             " * args.src_tensor.Width() + " + xc + ";\n";
+        std::string id = generate_id(xind, yind, "");
+        c += "  int src_a" + id + " = " + yc + " * args.src_tensor.Width() + " +
+             xc + ";\n";
      }
    }
  }

  auto declare_src = [&]() {
    for (int y = 0; y < block_size.y; ++y) {
+      const std::string yind = std::to_string(y);
      for (int x = 0; x < block_size.x; ++x) {
-        const std::string id = std::to_string(y) + std::to_string(x);
+        const std::string xind = std::to_string(x);
+        const std::string id = generate_id(xind, yind, "");
        c += "    " + weights_data_type + " src" + id + ";\n";
      }
    }
@ -498,27 +537,28 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
  auto read_src = [&]() {
    const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
    for (int y = 0; y < block_size.y; ++y) {
+      const std::string yind = std::to_string(y);
      for (int x = 0; x < block_size.x; ++x) {
+        const std::string xind = std::to_string(x);
+        std::string id = generate_id(xind, yind, "");
        if (buffer_type) {
-          std::string id = std::to_string(y) + std::to_string(x);
          if (is1x1) {
            c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
-                 ">(src_a_" + id + ");\n";
+                 ">(src_a" + id + ");\n";
          } else {
            std::string condition =
                "mx" + std::to_string(x) + " && my" + std::to_string(y);
            if (conditional_read) {
              c += "    src" + id + " = " + condition +
-                   " ? args.src_tensor.Read<" + cl_type + ">(src_a_" + id +
+                   " ? args.src_tensor.Read<" + cl_type + ">(src_a" + id +
                   ") : (FLT4)(0.0f);\n";
            } else {
              c += "    src" + id + " = args.src_tensor.Read<" + cl_type +
-                   ">(src_a_" + id + ") * (FLT)(" + condition + ");\n";
+                   ">(src_a" + id + ") * (FLT)(" + condition + ");\n";
            }
          }
-          c += "    src_a_" + id + " += src_layer_offset;\n";
+          c += "    src_a" + id + " += src_layer_offset;\n";
        } else {
-          std::string id = std::to_string(y) + std::to_string(x);
          const std::string xc = is1x1 ? dst_x[x] : "xck" + std::to_string(x);
          const std::string yc = is1x1 ? dst_y[y] : "yck" + std::to_string(y);
          c += "    src" + id + " = args.src_tensor.Read<" + cl_type + ">(" +
@ -532,15 +572,19 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
        conv_params.weights_data_type == DataType::FLOAT16);
  auto conv_core = [&](int shared_offset) {
    const std::string channels[] = {"x", "y", "z", "w"};
-    for (int z = 0; z < block_size.z; ++z) {
+    for (int s = 0; s < block_size.w; ++s) {
+      const std::string sind = std::to_string(s);
      if (weights_type_as_accum_type) {
        for (int ch = 0; ch < 4; ++ch) {
          for (int y = 0; y < block_size.y; ++y) {
+            const std::string yind = std::to_string(y);
            for (int x = 0; x < block_size.x; ++x) {
-              std::string id = std::to_string(y) + std::to_string(x);
+              const std::string xind = std::to_string(x);
+              std::string R = "r" + generate_id_full(xind, yind, "", sind);
+              std::string S = "src" + generate_id(xind, yind, "");
              if (use_simd_broadcast) {
-                int simd_id = (z * 4 + ch + shared_offset) / simd_size;
-                int thread_id = (z * 4 + ch + shared_offset) % simd_size;
+                int simd_id = (s * 4 + ch + shared_offset) / simd_size;
+                int thread_id = (s * 4 + ch + shared_offset) % simd_size;
                std::string w_val_x = "sub_group_broadcast(simd_w" +
                                      std::to_string(simd_id) + ".x, " +
                                      std::to_string(thread_id) + "u)";
@ -553,38 +597,39 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
                std::string w_val_w = "sub_group_broadcast(simd_w" +
                                      std::to_string(simd_id) + ".w, " +
                                      std::to_string(thread_id) + "u)";
-                c += "    r" + std::to_string(z) + id + ".x += " + w_val_x +
-                     " * src" + id + "." + channels[ch] + ";\n";
-                c += "    r" + std::to_string(z) + id + ".y += " + w_val_y +
-                     " * src" + id + "." + channels[ch] + ";\n";
-                c += "    r" + std::to_string(z) + id + ".z += " + w_val_z +
-                     " * src" + id + "." + channels[ch] + ";\n";
-                c += "    r" + std::to_string(z) + id + ".w += " + w_val_w +
-                     " * src" + id + "." + channels[ch] + ";\n";
+                c += "    " + R + ".x += " + w_val_x + " * " + S + "." +
+                     channels[ch] + ";\n";
+                c += "    " + R + ".y += " + w_val_y + " * " + S + "." +
+                     channels[ch] + ";\n";
+                c += "    " + R + ".z += " + w_val_z + " * " + S + "." +
+                     channels[ch] + ";\n";
+                c += "    " + R + ".w += " + w_val_w + " * " + S + "." +
+                     channels[ch] + ";\n";
              } else {
                const std::string weight_id =
-                    std::to_string(z * 4 + ch + shared_offset);
+                    std::to_string(s * 4 + ch + shared_offset);
                std::string w_val;
                if (conv_params.AreWeightsBuffer()) {
                  w_val = "weights_cache[" + weight_id + "]";
                } else {
                  w_val = "f" + weight_id;
                }
-                c += "    r" + std::to_string(z) + id + " += " + w_val +
-                     " * src" + id + "." + channels[ch] + ";\n";
+                c += "    " + R + " += " + w_val + " * " + S + "." +
+                     channels[ch] + ";\n";
              }
            }
          }
        }
      } else {  // F32_F16 precision and weights type is float16
        for (int y = 0; y < block_size.y; ++y) {
+          const std::string yind = std::to_string(y);
          for (int x = 0; x < block_size.x; ++x) {
-            std::string id = std::to_string(y) + std::to_string(x);
-            std::string R = "r" + std::to_string(z) + id;
-            std::string S = "src" + id;
+            const std::string xind = std::to_string(x);
+            std::string R = "r" + generate_id_full(xind, yind, "", sind);
+            std::string S = "src" + generate_id(xind, yind, "");
            std::vector<std::string> F(4);
            for (int i = 0; i < 4; ++i) {
-              std::string weight_id = std::to_string(z * 4 + i + shared_offset);
+              std::string weight_id = std::to_string(s * 4 + i + shared_offset);
              if (conv_params.AreWeightsBuffer()) {
                F[i] = "weights_cache[" + weight_id + "]";
              } else {
@ -633,7 +678,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
  } else if (conv_params.AreWeightsBuffer()) {  // GLOBAL_MEM/CONSTANT_MEM
    c += "    weights_cache = filters_loc;\n";
  } else {  // TEXTURES_MEM
-    for (int dst_s = 0; dst_s < block_size.z; ++dst_s) {
+    for (int dst_s = 0; dst_s < block_size.w; ++dst_s) {
      std::string f_y = is1x1 ? "s" : "filter_offset";
      if (conv_params.different_weights_for_height) {
        f_y = "DST_Y * args.src_tensor.Slices() + s";
@ -660,7 +705,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
  conv_core(0);
  for (int i = 1; i < conv_params.src_depth_loop_size; ++i) {
    read_src();
-    conv_core(i * block_size.z * 4);
+    conv_core(i * block_size.w * 4);
    c += "    s += 1;\n";
  }
  if (conv_params.AreWeightsBuffer()) {
@ -675,13 +720,13 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
    if (conv_params.weights_upload_type ==
        ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) {
      c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S",
-                               block_size.z);
+                               block_size.w);
    } else if (conv_params.weights_upload_type ==
               ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) {
      c += "    barrier(CLK_LOCAL_MEM_FENCE);\n";
      c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()",
                                   "DST_S", "lid", total_work_items,
-                                   block_size.z);
+                                   block_size.w);
      c += "    barrier(CLK_LOCAL_MEM_FENCE);\n";
    } else {
      c += "    weights_cache = args.biases.GetPtr() + DST_S;\n";
@ -694,21 +739,23 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
    c += "    return;\n";
    c += "  }\n";
  }
-  for (int z = 0; z < block_size.z; ++z) {
-    const std::string sz = std::to_string(z);
-    c += "  if (DST_S + " + sz + " >= args.dst_tensor.Slices()) return;\n";
+  for (int s = 0; s < block_size.w; ++s) {
+    const std::string sind = std::to_string(s);
+    c += "  if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n";
    c += "  {\n";
    if (conv_params.AreWeightsBuffer()) {
-      c += "    FLT4 bias_val = TO_FLT4(weights_cache[" + sz + "]);\n";
+      c += "    FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n";
    } else {
-      c += "    FLT4 bias_val = args.biases.Read(DST_S + " + sz + ");\n";
+      c += "    FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n";
    }
    for (int y = 0; y < block_size.y; ++y) {
+      const std::string yind = std::to_string(y);
      for (int x = 0; x < block_size.x; ++x) {
+        const std::string xind = std::to_string(x);
        const std::string xs = dst_x[x];
        const std::string ys = dst_y[y];
-        const std::string zs = "DST_S + " + sz;
-        const std::string r_id = sz + std::to_string(y) + std::to_string(x);
+        const std::string zs = "DST_S + " + sind;
+        const std::string id = generate_id_full(xind, yind, "", sind);
        bool need_x_check = x != 0;
        bool need_y_check = y != 0;
        if (need_x_check && need_y_check) {
@ -721,7 +768,7 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info,
        } else {
          c += "  {\n";
        }
-        c += "    FLT4 res = TO_FLT4(r" + r_id + ") + bias_val;\n";
+        c += "    FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
        c += "    args.dst_tensor.Write(res, " + xs + ", " + ys + ", " + zs +
             ");\n";
        c += "  }\n";
@ -738,7 +785,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
    bool different_weights_for_height, const BHWC* dst_shape) {
  ConvParams conv_params;
-  conv_params.linear_hw = false;
+  conv_params.linear_spatial = false;
  conv_params.weights_data_type =
      DeduceDataTypeFromPrecision(definition.precision);
  conv_params.x_kernel_is_1 = x_kernel_is_1;
@ -750,43 +797,43 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
      conv_params.work_group_launch_order = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
-      conv_params.linear_hw = true;
+      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      conv_params.work_group_launch_order = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
    }
-    conv_params.block_size = int3(2, 1, 4);
+    conv_params.block_size = int4(2, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
-      conv_params.block_size.z = 4;
+      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
-      conv_params.block_size.z = 2;
+      conv_params.block_size.w = 2;
    } else {
-      conv_params.block_size.z = dst_depth;
+      conv_params.block_size.w = dst_depth;
    }
    if (dst_shape) {
      int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
      float task_size_per_cu =
          static_cast<float>(task_size) / device_info.compute_units_count;
      int block_size = conv_params.block_size.x * conv_params.block_size.y *
-                       conv_params.block_size.z;
+                       conv_params.block_size.w;
      float threads_per_cu = task_size_per_cu / block_size;
      float warps_per_cu = threads_per_cu / 32 /*warp_size*/;
      if (warps_per_cu < 8.0f) {
        conv_params.block_size.x = 1;
      }
-      if (warps_per_cu < 4.0f && conv_params.block_size.z >= 4) {
-        conv_params.block_size.z /= 2;
+      if (warps_per_cu < 4.0f && conv_params.block_size.w >= 4) {
+        conv_params.block_size.w /= 2;
      }
-      if (warps_per_cu < 2.0f && conv_params.block_size.z >= 2) {
-        conv_params.block_size.z /= 2;
+      if (warps_per_cu < 2.0f && conv_params.block_size.w >= 2) {
+        conv_params.block_size.w /= 2;
      }
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
-    if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
+    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else if (device_info.IsPowerVR()) {
@ -795,7 +842,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
      conv_params.work_group_launch_order = int3(2, 0, 1);
      conv_params.fixed_work_group_size = true;
    } else {
-      conv_params.linear_hw = true;
+      conv_params.linear_spatial = true;
      work_group_size_ = int3(32, 1, 1);
      conv_params.work_group_launch_order = int3(1, 0, 2);
      conv_params.fixed_work_group_size = true;
@ -803,28 +850,28 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    conv_params.weights_data_type =
        definition.precision == CalculationsPrecision::F16 ? DataType::FLOAT16
                                                           : DataType::FLOAT32;
-    conv_params.block_size = int3(1, 1, 4);
+    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type =
        WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
-      conv_params.block_size.z = 8;
+      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
-      conv_params.block_size.z = 4;
+      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
-      conv_params.block_size.z = 2;
+      conv_params.block_size.w = 2;
    } else {
-      conv_params.block_size.z = dst_depth;
+      conv_params.block_size.w = dst_depth;
    }
    if (definition.precision == CalculationsPrecision::F16) {
-      conv_params.block_size.z = std::min(4, conv_params.block_size.z);
+      conv_params.block_size.w = std::min(4, conv_params.block_size.w);
      if (src_depth % 2 == 0) {
        conv_params.src_depth_loop_size = 2;
      }
-      if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
+      if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
        conv_params.src_depth_loop_size = 4;
      }
-      if (conv_params.block_size.z == 1) {
+      if (conv_params.block_size.w == 1) {
        if (src_depth % 2 == 0) {
          conv_params.src_depth_loop_size = 2;
        }
@ -848,20 +895,20 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
      conv_params.fixed_work_group_size = true;
    }

-    conv_params.block_size = int3(2, 1, 1);
+    conv_params.block_size = int4(2, 1, 1, 1);
    if (x_kernel_is_1 && y_kernel_is_1) {
      conv_params.block_size.y = 2;
    }
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM;
    if (dst_depth % 8 == 0 || dst_depth >= 32) {
-      conv_params.block_size.z = 8;
+      conv_params.block_size.w = 8;
    } else if (dst_depth % 4 == 0 || dst_depth >= 8) {
-      conv_params.block_size.z = 4;
+      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
-      conv_params.block_size.z = 2;
+      conv_params.block_size.w = 2;
    } else {
-      conv_params.block_size.z = 1;
+      conv_params.block_size.w = 1;
    }
    if (src_depth % 2 == 0 && src_depth >= 16) {
      conv_params.src_depth_loop_size = 2;
@ -878,20 +925,20 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    }
    if (block_size == 8) {
      if (dst_depth == 1 || dst_depth == 3) {
-        conv_params.block_size = int3(2, 2, 1);
+        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
-        conv_params.block_size = int3(2, 2, 2);
+        conv_params.block_size = int4(2, 2, 1, 2);
      }
    } else if (block_size == 4) {
      if (dst_depth == 1 || dst_depth == 3) {
-        conv_params.block_size = int3(2, 2, 1);
+        conv_params.block_size = int4(2, 2, 1, 1);
      } else {
-        conv_params.block_size = int3(2, 1, 2);
+        conv_params.block_size = int4(2, 1, 1, 2);
      }
    } else if (block_size == 2) {
-      conv_params.block_size = int3(2, 1, 1);
+      conv_params.block_size = int4(2, 1, 1, 1);
    } else {
-      conv_params.block_size = int3(1, 1, 1);
+      conv_params.block_size = int4(1, 1, 1, 1);
    }
    conv_params.src_depth_loop_size = 1;
    MaliInfo mali_info = device_info.mali_info;
@ -907,7 +954,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
    conv_params.fixed_work_group_size = false;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
  } else if (device_info.IsAdreno()) {
-    conv_params.block_size = int3(2, 2, 1);
+    conv_params.block_size = int4(2, 2, 1, 1);
    work_group_size_ = int3(8, 2, 1);
    conv_params.work_group_launch_order = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
@ -924,12 +971,12 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
      conv_params.work_group_launch_order = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    } else {
-      conv_params.linear_hw = true;
+      conv_params.linear_spatial = true;
      work_group_size_ = int3(16, 1, 1);
      conv_params.work_group_launch_order = int3(0, 1, 2);
      conv_params.fixed_work_group_size = true;
    }
-    conv_params.block_size = int3(1, 1, 4);
+    conv_params.block_size = int4(1, 1, 1, 4);
    conv_params.src_depth_loop_size = 1;
    int sub_group_size = 16;
    if (definition.precision != CalculationsPrecision::F32_F16 &&
@ -944,36 +991,36 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
      conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
    }
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
-      conv_params.block_size.z = 4;
+      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
-      conv_params.block_size.z = 2;
+      conv_params.block_size.w = 2;
    } else {
-      conv_params.block_size.z = dst_depth;
+      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
-    if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
+    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  } else {
-    conv_params.block_size = int3(1, 1, 4);
+    conv_params.block_size = int4(1, 1, 1, 4);
    work_group_size_ = int3(8, 2, 1);
    conv_params.work_group_launch_order = int3(0, 1, 2);
    conv_params.fixed_work_group_size = false;
    conv_params.src_depth_loop_size = 1;
    conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
    if (dst_depth % 4 == 0 || dst_depth >= 8) {
-      conv_params.block_size.z = 4;
+      conv_params.block_size.w = 4;
    } else if (dst_depth % 2 == 0 || dst_depth >= 4) {
-      conv_params.block_size.z = 2;
+      conv_params.block_size.w = 2;
    } else {
-      conv_params.block_size.z = dst_depth;
+      conv_params.block_size.w = dst_depth;
    }
    if (src_depth % 2 == 0) {
      conv_params.src_depth_loop_size = 2;
    }
-    if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) {
+    if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) {
      conv_params.src_depth_loop_size = 4;
    }
  }
--- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h
+++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h
@ -53,7 +53,7 @@ class ConvPowerVR : public GPUOperation {
  ConvWeightsDescription GetConvWeightsDescription() const {
    ConvWeightsDescription desc;
    desc.layout = ConvWeightsLayout::kOHWIOGroupI4O4;
-    desc.output_group_size = conv_params_.block_size.z;
+    desc.output_group_size = conv_params_.block_size.w;
    return desc;
  }

@ -82,10 +82,10 @@ class ConvPowerVR : public GPUOperation {
    // weights, so for PowerVR in this kernel we have F32 weights for
    // F32_F16 precision mode
    DataType weights_data_type;  // used for weights and biases
-    int3 block_size;
+    int4 block_size;             // WHDS
    int3 work_group_launch_order;
    bool fixed_work_group_size;
-    bool linear_hw;
+    bool linear_spatial;  // spatial dimensions are Width/Height/Depth
    bool different_weights_for_height;
    int src_depth_loop_size;
    WeightsUploadType weights_upload_type;
@ -178,8 +178,10 @@ class ConvPowerVR : public GPUOperation {
                           const OperationDef& op_def, bool stride_correction,
                           const ConvParams& conv_params);

-  int4 stride_padding_;
-  int4 kernel_dilation_;
+  int4 stride_;
+  int4 padding_;
+  int4 kernel_size_;
+  int4 dilation_;
  ConvParams conv_params_;
 };

@ -214,7 +216,7 @@ void ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
  const int float_size = conv_params_.weights_data_type == DataType::FLOAT32
                             ? sizeof(float)
                             : sizeof(half);
-  int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.z);
+  int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w);
  desc.size = float_size * aligned_channels;
  desc.data.resize(desc.size);
  if (conv_params_.weights_data_type == DataType::FLOAT32) {
@ -235,7 +237,7 @@ void ConvPowerVR::UploadBias(const tflite::gpu::Tensor<Linear, T>& bias) {
 template <DataType T>
 void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
  const int dst_slices =
-      AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.z);
+      AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w);
  const int src_slices = DivideRoundUp(weights.shape.i, 4);

  const bool f32_weights = conv_params_.weights_data_type == DataType::FLOAT32;
@ -249,19 +251,19 @@ void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
  if (f32_weights) {
    float4* ptr = reinterpret_cast<float4*>(data.data());
    if (conv_params_.AreWeightsBuffer()) {
-      RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
+      RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.w,
                                       absl::MakeSpan(ptr, elements_count));
    } else {
-      RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.z,
+      RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.w,
                                       absl::MakeSpan(ptr, elements_count));
    }
  } else {
    half4* ptr = reinterpret_cast<half4*>(data.data());
    if (conv_params_.AreWeightsBuffer()) {
-      RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
+      RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.w,
                                       absl::MakeSpan(ptr, elements_count));
    } else {
-      RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.z,
+      RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.w,
                                       absl::MakeSpan(ptr, elements_count));
    }
  }