ConvTexture replaced with ConvPowerVR.

PiperOrigin-RevId: 336742538
Change-Id: I78fdaaa2d98d59ededfec90cffe1cb53747137f4
Raman Sarokin 2020-10-12 14:22:01 -07:00 committed by TensorFlower Gardener
parent 1782a45957
commit b082980a84
8 changed files with 57 additions and 833 deletions

tensorflow/lite/delegates/gpu/cl/kernels/BUILD

@@ -239,50 +239,6 @@ cc_test(
],
)
-cc_library(
-name = "conv_texture",
-srcs = ["conv_texture.cc"],
-hdrs = ["conv_texture.h"],
-deps = [
-":gpu_operation",
-":util",
-":work_group_picking",
-"//tensorflow/lite/delegates/gpu/cl:cl_command_queue",
-"//tensorflow/lite/delegates/gpu/cl:cl_context",
-"//tensorflow/lite/delegates/gpu/cl:linear_storage",
-"//tensorflow/lite/delegates/gpu/cl:precision",
-"//tensorflow/lite/delegates/gpu/cl:tensor",
-"//tensorflow/lite/delegates/gpu/cl:tensor_type",
-"//tensorflow/lite/delegates/gpu/cl:texture2d",
-"//tensorflow/lite/delegates/gpu/cl:util",
-"//tensorflow/lite/delegates/gpu/common:data_type",
-"//tensorflow/lite/delegates/gpu/common:operations",
-"//tensorflow/lite/delegates/gpu/common:shape",
-"//tensorflow/lite/delegates/gpu/common:status",
-"//tensorflow/lite/delegates/gpu/common:tensor",
-"//tensorflow/lite/delegates/gpu/common:types",
-"//tensorflow/lite/delegates/gpu/common:winograd_util",
-"@com_google_absl//absl/strings",
-],
-)
-cc_test(
-name = "conv_texture_test",
-srcs = ["conv_texture_test.cc"],
-linkstatic = True,
-tags = tf_gpu_tests_tags() + [
-"linux",
-"local",
-],
-deps = [
-":cl_test",
-":conv_texture",
-"//tensorflow/lite/delegates/gpu/common:operations",
-"//tensorflow/lite/delegates/gpu/common:status",
-"@com_google_googletest//:gtest_main",
-],
-)
cc_library(
name = "conv_weights_converter",
srcs = ["conv_weights_converter.cc"],
@@ -1397,7 +1353,6 @@ test_suite(
"conv_buffer_1x1_test",
"conv_constants_test",
"conv_powervr_test",
"conv_texture_test",
"convolution_transposed_3x3_thin_test",
"convolution_transposed_4x4_test",
"convolution_transposed_test",

tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc

@@ -233,6 +233,16 @@ void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) {
if (conv_params_.IsPrivateMemBroadcast() && device_info.IsCL20OrHigher()) {
compiler_options_.push_back(CompilerOptions::CL_2_0);
}
+bool kernel_is_trivial =
+conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
+if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
+kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1;
+}
+if (device_info.IsAdreno3xx() &&
+definition_.precision == CalculationsPrecision::F16 &&
+kernel_is_trivial) {
+compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
+}
}
absl::Status ConvPowerVR::BindArguments() {
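This added block appears to port the UseFP16SIMD heuristic from the deleted conv_texture.cc (visible further down: Adreno 3xx, F16 precision, 1x1 kernel), generalized here to also require a trivial z kernel when the source tensor has a DEPTH axis.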
@@ -284,23 +294,34 @@ int3 ConvPowerVR::GetGridSize() const {
if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
grid_x *= task_size_z;
}
-wg.x = DivideRoundUp(grid_x, work_group_size_.x);
-wg.y = DivideRoundUp(task_size_s, work_group_size_.y);
-return int3(
-wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1);
+if (conv_params_.work_group_launch_order[0] == 0 &&
+conv_params_.work_group_launch_order[1] == 1) {
+return int3(grid_x, task_size_s, 1);
+} else {
+wg.x = DivideRoundUp(grid_x, work_group_size_.x);
+wg.y = DivideRoundUp(task_size_s, work_group_size_.y);
+return int3(
+wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
+wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1);
+}
} else {
int grid_y = task_size_y;
if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) {
grid_y *= task_size_z;
}
-wg.x = DivideRoundUp(task_size_x, work_group_size_.x);
-wg.y = DivideRoundUp(grid_y, work_group_size_.y);
-wg.z = DivideRoundUp(task_size_s, work_group_size_.z);
-return int3(
-wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
-wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y,
-wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z);
+if (conv_params_.work_group_launch_order[0] == 0 &&
+conv_params_.work_group_launch_order[1] == 1 &&
+conv_params_.work_group_launch_order[2] == 2) {
+return int3(task_size_x, grid_y, task_size_s);
+} else {
+wg.x = DivideRoundUp(task_size_x, work_group_size_.x);
+wg.y = DivideRoundUp(grid_y, work_group_size_.y);
+wg.z = DivideRoundUp(task_size_s, work_group_size_.z);
+return int3(
+wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x,
+wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y,
+wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z);
+}
}
}
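In effect, the new code returns the raw task sizes when the launch order is the identity permutation and only falls back to the rounded, reordered grid otherwise. A minimal standalone sketch of the 3D branch, assuming DivideRoundUp is integer ceil-division as in the TFLite GPU helpers; GetConvGrid and the free-standing int3 are illustrative stand-ins, not names from the source:

#include <array>

struct int3 {
  int x, y, z;
};

// Integer ceil-division, as in tflite/delegates/gpu/common/util.h.
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }

// Hypothetical mirror of the branch above: with an identity launch order
// the task sizes are returned directly (the runtime rounds the grid up to
// the work-group size later); otherwise each dimension is rounded up to a
// whole number of work groups and the counts are permuted by the order.
int3 GetConvGrid(int task_size_x, int grid_y, int task_size_s,
                 const int3& wg_size, const std::array<int, 3>& order) {
  if (order[0] == 0 && order[1] == 1 && order[2] == 2) {
    return {task_size_x, grid_y, task_size_s};
  }
  const std::array<int, 3> wg = {DivideRoundUp(task_size_x, wg_size.x),
                                 DivideRoundUp(grid_y, wg_size.y),
                                 DivideRoundUp(task_size_s, wg_size.z)};
  return {wg[order[0]] * wg_size.x, wg[order[1]] * wg_size.y,
          wg[order[2]] * wg_size.z};
}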
@@ -1190,7 +1211,16 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(
conv_params.fixed_work_group_size = false;
conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
} else if (device_info.IsAdreno()) {
-conv_params.block_size = int4(2, 2, 1, 1);
+conv_params.block_size = int4(2, 2, 1, 2);
+if (device_info.IsAdreno3xx()) {
+if (definition.precision == CalculationsPrecision::F16) {
+conv_params.block_size = int4(2, 2, 1, 2);
+} else if (definition.precision == CalculationsPrecision::F32_F16) {
+conv_params.block_size = int4(2, 1, 1, 2);
+} else {  // F32
+conv_params.block_size = int4(2, 2, 1, 1);
+}
+}
work_group_size_ = int3(8, 2, 1);
conv_params.work_group_launch_order = int3(0, 1, 2);
conv_params.fixed_work_group_size = false;
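Assuming the int4 block-size lanes mean (x, y, z, output slices) of FLT4 results per thread, as elsewhere in ConvPowerVR, the Adreno 3xx choices trade spatial blocking against register pressure: F16 keeps 2 * 2 * 1 * 2 = 8 accumulators per thread, F32_F16 drops to 2 * 1 * 1 * 2 = 4, and full F32 uses 2 * 2 * 1 * 1 = 4 with wider accumulators.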

tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc (deleted)

@@ -1,461 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include <string>
#include <utility>
#include <vector>
#include "absl/strings/substitute.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/precision.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
namespace tflite {
namespace gpu {
namespace cl {
namespace {
bool UseFP16SIMD(const DeviceInfo& device_info, CalculationsPrecision precision,
bool kernel1x1) {
if (!device_info.IsAdreno()) {
return false;
}
switch (precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F32_F16:
return false;
case CalculationsPrecision::F16:
return device_info.IsAdreno3xx() && kernel1x1;
}
}
} // namespace
ConvTexture::ConvTexture(const OperationDef& definition,
const Convolution2DAttributes& attr)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
different_weights_for_height_(false),
block_size_(2, 2, 2) {
work_group_size_ = int3(4, 4, 2);
}
ConvTexture::ConvTexture(const OperationDef& definition)
: GPUOperation(definition),
kernel_size_(1, 1),
stride_(1, 1),
padding_(0, 0),
dilation_(1, 1),
different_weights_for_height_(false),
block_size_(4, 1, 2) {
work_group_size_ = int3(16, 1, 2);
}
ConvTexture::ConvTexture(ConvTexture&& operation)
: GPUOperation(std::move(operation)),
kernel_size_(operation.kernel_size_),
stride_(operation.stride_),
padding_(operation.padding_),
dilation_(operation.dilation_),
different_weights_for_height_(operation.different_weights_for_height_),
block_size_(operation.block_size_) {}
ConvTexture& ConvTexture::operator=(ConvTexture&& operation) {
if (this != &operation) {
std::swap(kernel_size_, operation.kernel_size_);
std::swap(stride_, operation.stride_);
std::swap(padding_, operation.padding_);
std::swap(dilation_, operation.dilation_);
std::swap(different_weights_for_height_,
operation.different_weights_for_height_);
std::swap(block_size_, operation.block_size_);
GPUOperation::operator=(std::move(operation));
}
return *this;
}
std::string ConvTexture::GenerateConvCode(const OperationDef& op_def,
const int3& block_size, bool is1x1,
bool adreno4xx_optimization,
bool stride_correction,
bool different_weights_for_height) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
AddSrcTensor("src_tensor", src_desc);
auto dst_desc = op_def.dst_tensors[0];
if (op_def.IsBatchSupported()) {
dst_desc.SetStateVar("BatchedWidth", "true");
}
AddDstTensor("dst_tensor", dst_desc);
if (!is1x1) {
args_.AddInt("kernel_size_x");
args_.AddInt("kernel_size_y");
args_.AddInt("dilation_x");
args_.AddInt("dilation_y");
}
args_.AddInt("stride_x");
args_.AddInt("stride_y");
args_.AddInt("padding_x");
args_.AddInt("padding_y");
const auto src_tensor_type = op_def.src_tensors[0].storage_type;
const bool is_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER ||
src_tensor_type == TensorStorageType::BUFFER;
std::vector<std::string> xs(block_size.x);
for (int x = 0; x < block_size.x; ++x) {
xs[x] = std::to_string(x);
}
std::vector<std::string> ys(block_size.y);
for (int y = 0; y < block_size.y; ++y) {
ys[y] = std::to_string(y);
}
std::vector<std::string> zs(block_size.z);
for (int z = 0; z < block_size.z; ++z) {
zs[z] = std::to_string(z);
}
std::string c = GetCommonDefines(op_def.precision);
for (int z = 0; z < block_size.z; ++z) {
const std::string f0 = std::to_string(z * 4 + 0);
const std::string f1 = std::to_string(z * 4 + 1);
const std::string f2 = std::to_string(z * 4 + 2);
const std::string f3 = std::to_string(z * 4 + 3);
switch (op_def.precision) {
case CalculationsPrecision::F32:
case CalculationsPrecision::F16:
c += "#define CONV" + zs[z] + "(R, S) \\\n";
c += "R += S.x * f" + f0 + "; \\\n";
c += "R += S.y * f" + f1 + "; \\\n";
c += "R += S.z * f" + f2 + "; \\\n";
c += "R += S.w * f" + f3 + "; \n";
break;
case CalculationsPrecision::F32_F16:
c += "#define CONV" + zs[z] + "(R, S) \\\n";
c += "R += convert_float4(S.x * f" + f0 + " + S.y * f" + f1 +
" + S.z * f" + f2 + " + S.w * f" + f3 + ");\n";
break;
}
}
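// For reference, for z == 0 at F32 or F16 precision the concatenation
// above emits the following macro text (a reconstruction from the string
// literals, including the backslash line continuations):
//
//   #define CONV0(R, S) \
//   R += S.x * f0; \
//   R += S.y * f1; \
//   R += S.z * f2; \
//   R += S.w * f3;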
c += "__kernel void main_function(\n";
c += "$0) {\n";
c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n";
c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
"|| Z >= args.dst_tensor.Slices()) return;\n";
std::vector<std::string> s_x(block_size.x);
std::vector<std::string> s_y(block_size.y);
for (int x = 0; x < block_size.x; ++x) {
if (stride_correction) {
c += " int xc" + xs[x] + " = " +
GetXStrideCorrected("X + " + xs[x], "args.src_tensor.Batch()",
"args.stride_x", "args.padding_x") +
";\n";
} else {
c += " int xc" + xs[x] + " = (X +" + xs[x] +
") * args.stride_x + args.padding_x;\n";
}
s_x[x] = is1x1 ? "xc" + xs[x] : "cx" + xs[x];
}
for (int y = 0; y < block_size.y; ++y) {
c += " int yc" + ys[y] + " = (Y +" + ys[y] +
") * args.stride_y + args.padding_y;\n";
s_y[y] = is1x1 ? "yc" + ys[y] : "cy" + ys[y];
}
for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) {
c += " ACCUM_FLT4 r" + std::to_string(i) +
" = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
}
std::string f_y = is1x1 ? "s" : "filter_offset";
if (different_weights_for_height) {
f_y = "Y * args.src_tensor.Slices() + s";
}
if (!is1x1) {
for (int x = 0; x < block_size.x; ++x) {
c += " int cx" + xs[x] + ";\n";
}
for (int y = 0; y < block_size.y; ++y) {
c += " int cy" + ys[y] + ";\n";
}
c += " int filter_offset = 0;\n";
c += " for (int y = 0; y < args.kernel_size_y; ++y) {\n";
for (int y = 0; y < block_size.y; ++y) {
c += " cy" + ys[y] + " = y * args.dilation_y + yc" + ys[y] + ";\n";
}
if (is_buffer) {
for (int y = 0; y < block_size.y; ++y) {
c += " bool in_y" + ys[y] + " = cy" + ys[y] + " >= 0 && cy" + ys[y] +
" < args.src_tensor.Height();\n";
if (src_tensor_type == TensorStorageType::BUFFER) {
c += " cy" + ys[y] + " = clamp(cy" + ys[y] +
", 0, args.src_tensor.Height() - 1);\n";
}
}
}
c += " for (int x = 0; x < args.kernel_size_x; ++x) {\n";
for (int x = 0; x < block_size.x; ++x) {
c += " cx" + xs[x] + " = x * args.dilation_x + xc" + xs[x] + ";\n";
}
if (is_buffer) {
for (int x = 0; x < block_size.x; ++x) {
c += " bool in_x" + xs[x] + " = cx" + xs[x] + " >= 0 && cx" + xs[x] +
" < args.src_tensor.Width();\n";
if (src_tensor_type == TensorStorageType::BUFFER) {
c += " cx" + xs[x] + " = clamp(cx" + xs[x] +
", 0, args.src_tensor.Width() - 1);\n";
}
}
for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x);
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
c += absl::Substitute(
" int addr_$0 = select(-1, cy$2 * args.src_tensor.Width() + "
"cx$1, (in_x$1 "
"&& "
"in_y$2));\n",
y * block_size.x + x, x, y);
c += absl::Substitute(
" int dz_$0 = select(0, args.src_tensor.Width() * "
"args.src_tensor.Height(), (in_x$1 && "
"in_y$2));\n",
y * block_size.x + x, x, y);
} else {
c += absl::Substitute(
" int addr_$0 = cy$2 * args.src_tensor.Width() + cx$1;\n",
y * block_size.x + x, x, y);
}
}
}
if (src_tensor_type == TensorStorageType::BUFFER) {
c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
}
}
} else if (is_buffer) {
for (int y = 0; y < block_size.y; ++y) {
c += " bool in_y" + ys[y] + " = yc" + ys[y] + " >= 0 && yc" + ys[y] +
" < args.src_tensor.Height();\n";
}
for (int x = 0; x < block_size.x; ++x) {
c += " bool in_x" + xs[x] + " = xc" + xs[x] + " >= 0 && xc" + xs[x] +
" < args.src_tensor.Width();\n";
}
for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x);
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
c += absl::Substitute(
" int addr_$0 = select(-1, yc$2 * args.src_tensor.Width() + "
"xc$1, (in_x$1 && "
"in_y$2));\n",
y * block_size.x + x, x, y);
c += absl::Substitute(
" int dz_$0 = select(0, args.src_tensor.Width() * "
"args.src_tensor.Height(), (in_x$1 && "
"in_y$2));\n",
y * block_size.x + x, x, y);
} else {
c += absl::Substitute(
" int addr_$0 = yc$2 * args.src_tensor.Width() + xc$1;\n",
y * block_size.x + x, x, y);
}
}
}
if (src_tensor_type == TensorStorageType::BUFFER) {
c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n";
}
}
c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
if (is_buffer) {
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
for (int index = 0; index < block_size.x * block_size.y; ++index) {
const std::string id = std::to_string(index);
c +=
" FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + ");\n";
}
} else {
for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x);
c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id +
") * (FLT)(in_x" + xs[x] + " && in_y" + ys[y] + "); addr_" + id +
" += dz;\n";
}
}
}
}
for (int z = 0; z < block_size.z; ++z) {
c += absl::Substitute(R"( FLT4 f$2 = args.weights0.Read($0, $1);
FLT4 f$3 = args.weights1.Read($0, $1);
FLT4 f$4 = args.weights2.Read($0, $1);
FLT4 f$5 = args.weights3.Read($0, $1);
)",
"Z + " + zs[z], f_y, z * 4 + 0, z * 4 + 1, z * 4 + 2,
z * 4 + 3);
}
if (!is_buffer) {
for (int x = 0; x < block_size.x; ++x) {
for (int y = 0; y < block_size.y; ++y) {
const std::string id = std::to_string(y * block_size.x + x);
c += " FLT4 src" + id + " = args.src_tensor.Read(" + s_x[x] + ", " +
s_y[y] + ", s);\n";
}
}
}
for (int z = 0; z < block_size.z; ++z) {
for (int i = 0; i < block_size.x * block_size.y; ++i) {
c += " CONV" + zs[z] + "(r" +
std::to_string(i + z * block_size.x * block_size.y) + ", src" +
std::to_string(i) + ");\n";
}
}
if (!is1x1) {
c += " filter_offset++;\n";
}
if (is_buffer) {
if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) {
for (int index = 0; index < block_size.x * block_size.y; ++index) {
const std::string id = std::to_string(index);
c += " addr_" + id + " += dz_" + id + ";\n";
}
}
}
c += " }\n"; // args.src_tensor.Slices()
if (!is1x1) {
c += " }\n"; // kernel_size_x
c += " }\n"; // kernel_size_y
}
// when is1x1 && adreno4xx_optimization is true, xc0 == X and yc0 == Y
std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X";
std::string dst_y = is1x1 && adreno4xx_optimization ? "yc0" : "Y";
for (int z = 0; z < block_size.z; ++z) {
c += " if (Z < args.dst_tensor.Slices()) {\n";
c += " FLT4 bias_val = args.biases.Read(Z);\n";
for (int y = 0; y < block_size.y; ++y) {
for (int x = 0; x < block_size.x; ++x) {
const std::string id =
std::to_string((z * block_size.y + y) * block_size.x + x);
c += " {\n";
c += " int xc = " + dst_x + " + " + xs[x] + ";\n";
c += " int yc = " + dst_y + " + " + ys[y] + ";\n";
c += " if (xc < args.dst_tensor.Width() && yc < "
"args.dst_tensor.Height()) {\n";
c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
c += " args.dst_tensor.Write(res, xc, yc, Z);\n";
c += " }\n";
c += " }\n";
}
}
c += " }\n";
c += " Z++;\n";
}
c += "}\n";
return c;
}
void ConvTexture::GenerateCode(const DeviceInfo& device_info) {
auto storage_type = definition_.GetPrimaryStorageType();
bool is1x1 = kernel_size_.x == 1 && kernel_size_.y == 1;
bool adreno4xx_optimization =
stride_.x == 1 && stride_.y == 1 && padding_.x == 0 && padding_.y == 0 &&
device_info.IsAdreno4xx() &&
storage_type == TensorStorageType::TEXTURE_ARRAY &&
definition_.precision == CalculationsPrecision::F16;
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ =
GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization,
stride_correction, different_weights_for_height_);
if (UseFP16SIMD(device_info, definition_.precision, is1x1)) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
}
absl::Status ConvTexture::BindArguments() {
if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) {
RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x));
RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y));
RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y));
}
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
return absl::OkStatus();
}
int3 ConvTexture::GetGridSize() const {
const int grid_x =
DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x);
const int grid_y = DivideRoundUp(dst_[0]->Height(), block_size_.y);
const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z);
return int3(grid_x, grid_y, grid_z);
}
void ConvTexture::GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
const KernelInfo& kernel_info, std::vector<int3>* work_groups) const {
GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_,
work_groups);
}
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvTexture result(definition, attr);
result.GenerateCode(device_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr) {
ConvTexture result(definition);
result.GenerateCode(device_info);
result.UploadData(attr.weights, attr.bias);
return result;
}
ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvTexture result(definition);
result.different_weights_for_height_ = true;
result.block_size_ = {4, 1, 2};
result.GenerateCode(device_info);
result.UploadDataForWinograd4x4To6x6(attr.weights);
return result;
}
} // namespace cl
} // namespace gpu
} // namespace tflite

tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h (deleted)

@@ -1,193 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_
#include <vector>
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h"
#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
#include "tensorflow/lite/delegates/gpu/common/winograd_util.h"
namespace tflite {
namespace gpu {
namespace cl {
// This convolution processes BLOCK_SIZE (X * Y * Z) FLT4 values per thread.
class ConvTexture : public GPUOperation {
public:
ConvTexture() = default;
void GetPossibleKernelWorkGroups(
TuningType tuning_type, const DeviceInfo& device_info,
const KernelInfo& kernel_info,
std::vector<int3>* work_groups) const override;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
// Move only
ConvTexture(ConvTexture&& operation);
ConvTexture& operator=(ConvTexture&& operation);
ConvTexture(const ConvTexture&) = delete;
ConvTexture& operator=(const ConvTexture&) = delete;
private:
friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
friend ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr);
friend ConvTexture CreateConvTextureWino4x4To6x6(
const DeviceInfo& device_info, const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvTexture(const OperationDef& definition,
const Convolution2DAttributes& attr);
explicit ConvTexture(const OperationDef& definition);
template <DataType T>
void UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases);
template <DataType T>
void UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
void GenerateCode(const DeviceInfo& device_info);
std::string GenerateConvCode(const OperationDef& op_def,
const int3& block_size, bool is1x1,
bool adreno4xx_optimization,
bool stride_correction,
bool different_weights_for_height);
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
// By default, a 2D convolution uses the same weights for the W and H dims,
// but in some cases separate weights are needed per H position; the
// convolution kernel requires only small modifications to support this.
bool different_weights_for_height_;
int3 block_size_ = int3(2, 2, 2);
};
template <DataType T>
void ConvTexture::UploadData(const tflite::gpu::Tensor<OHWI, T>& weights,
const tflite::gpu::Tensor<Linear, T>& biases) {
UploadWeights(weights);
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition_.GetDataType();
desc.UploadLinearData(biases);
args_.AddObject("biases",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}
template <DataType T>
void ConvTexture::UploadDataForWinograd4x4To6x6(
const tflite::gpu::Tensor<OHWI, T>& weights) {
tflite::gpu::Tensor<OHWI, T> wino_weights;
RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
UploadWeights(wino_weights);
tflite::gpu::Tensor<Linear, DataType::FLOAT32> bias;
bias.shape = Linear(1);
bias.data = {0.0f};
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::TEXTURE_2D;
desc.element_type = definition_.GetDataType();
desc.UploadLinearData(bias);
args_.AddObject("biases",
absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
}
template <DataType T>
void ConvTexture::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
int dst_depth = DivideRoundUp(weights.shape.o, 4);
dst_depth = AlignByN(dst_depth, block_size_.z);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
DataType data_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
const int elements_count = dst_depth * src_depth * kernel_x * kernel_y * 4;
const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4);
std::vector<uint8_t> data(float4_size * elements_count);
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(data.data());
RearrangeWeightsToI4HWIOOGroupO4(weights, block_size_.z,
absl::MakeSpan(ptr, elements_count));
} else {
half4* ptr = reinterpret_cast<half4*>(data.data());
RearrangeWeightsToI4HWIOOGroupO4(weights, block_size_.z,
absl::MakeSpan(ptr, elements_count));
}
const int texture_width = dst_depth;
const int texture_height = src_depth * kernel_x * kernel_y;
const int sub_size = float4_size * texture_width * texture_height;
for (int i = 0; i < 4; ++i) {
Texture2DDescriptor desc;
desc.element_type = data_type;
desc.size = int2(texture_width, texture_height);
desc.data.resize(sub_size);
memcpy(desc.data.data(), data.data() + sub_size * i, sub_size);
const std::string name = "weights" + std::to_string(i);
args_.AddObject(name,
absl::make_unique<Texture2DDescriptor>(std::move(desc)));
}
}
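// Illustrative sizing (hypothetical shape, not from this file): for weights
// of shape OHWI(8, 3, 3, 16) with block_size_.z == 2, dst_depth =
// DivideRoundUp(8, 4) = 2 (already aligned to 2) and src_depth =
// DivideRoundUp(16, 4) = 4, so elements_count = 2 * 4 * 3 * 3 * 4 = 288
// FLT4 values, split across four textures of 2 x 36 texels each.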
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvTexture CreateConvTexture(const DeviceInfo& device_info,
const OperationDef& definition,
const FullyConnectedAttributes& attr);
ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
} // namespace cl
} // namespace gpu
} // namespace tflite
#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_

tensorflow/lite/delegates/gpu/cl/kernels/conv_texture_test.cc (deleted)

@@ -1,107 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
using ::testing::FloatNear;
using ::testing::Pointwise;
namespace tflite {
namespace gpu {
namespace cl {
namespace {
TEST_F(OpenCLOperationTest, ConvTextureSimpleWeights) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(1, 2, 2, 2);
attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
attr.bias.shape = Linear(1);
attr.bias.data = {0.0f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvTexture operation =
CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f}));
}
}
}
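// For reference, with all-one 2x2x2 weights these expected values are window
// sums over the zero-padded input: output(0,0) covers all four input pixels
// (0+1+2+3+4+5+6+7 = 28), output(0,1) covers pixels (0,1) and (1,1)
// (2+3+6+7 = 18), output(1,0) covers (1,0) and (1,1) (4+5+6+7 = 22), and
// output(1,1) covers only (1,1) (6+7 = 13).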
TEST_F(OpenCLOperationTest, ConvTexture) {
TensorFloat32 src_tensor;
src_tensor.shape = BHWC(1, 2, 2, 2);
src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 1);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(2, 2, 2, 2);
attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
attr.bias.shape = Linear(2);
attr.bias.data = {0.5f, -0.5f};
for (auto storage : env_.GetSupportedStorages()) {
for (auto precision : env_.GetSupportedPrecisions()) {
const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f;
OperationDef op_def;
op_def.precision = precision;
auto data_type = DeduceDataTypeFromPrecision(precision);
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvTexture operation =
CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));
EXPECT_THAT(dst_tensor.data,
Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f,
60.5f, 235.5f, 20.5f, 123.5f}));
}
}
}
} // namespace
} // namespace cl
} // namespace gpu
} // namespace tflite

tensorflow/lite/delegates/gpu/cl/selectors/BUILD

@@ -14,7 +14,6 @@ cc_library(
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_common",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_constants",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_texture",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_weights_converter",
"//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
"//tensorflow/lite/delegates/gpu/cl/kernels:work_group_picking",
@@ -82,7 +81,6 @@ cc_library(
deps = [
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_buffer_1x1",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr",
"//tensorflow/lite/delegates/gpu/cl/kernels:conv_texture",
"//tensorflow/lite/delegates/gpu/cl/kernels:fully_connected",
"//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
"//tensorflow/lite/delegates/gpu/common:operations",

tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc

@@ -19,7 +19,6 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h"
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
@@ -38,8 +37,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionAdreno(
GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<GPUOperation>(std::move(conv));
} else {
-ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
}
}
@@ -47,8 +46,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionWinogradAdreno(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def,
ModelHints hints) {
-ConvTexture conv = CreateConvTextureWino4x4To6x6(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
}
std::unique_ptr<GPUOperation> SelectConvolutionDynamicWeightsAdreno(

tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc

@@ -18,7 +18,6 @@ limitations under the License.
#include "absl/memory/memory.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
@@ -31,8 +30,9 @@ std::unique_ptr<GPUOperation> SelectFullyConnectedGeneric(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
-ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
+ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
@@ -43,8 +43,9 @@ std::unique_ptr<GPUOperation> SelectFullyConnectedAdreno(
const FullyConnectedAttributes& attr, const DeviceInfo& device_info,
const OperationDef& op_def, int batch_size) {
if (op_def.IsBatchSupported()) {
-ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
+ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);
return absl::make_unique<FullyConnected>(std::move(fc));
@@ -71,8 +72,10 @@ std::unique_ptr<GPUOperation> SelectFullyConnectedMali(
ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr);
return absl::make_unique<ConvBuffer1x1>(std::move(conv));
} else {
-ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
-return absl::make_unique<ConvTexture>(std::move(conv));
+BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o);
+ConvPowerVR conv =
+CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+return absl::make_unique<ConvPowerVR>(std::move(conv));
}
} else {
FullyConnected fc = CreateFullyConnected(device_info, op_def, attr);