ConvConstants converted to generic GPUOperation.

PiperOrigin-RevId: 328431894
Change-Id: I464d830b55ab27cdd47761c2432f731e009fda15
Raman Sarokin 2020-08-25 16:56:47 -07:00 committed by TensorFlower Gardener
parent 776e040ae0
commit 0634d08af1
4 changed files with 96 additions and 172 deletions
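In short, the dedicated ConvConstants subclass goes away: its BindArguments()/GetGridSize() overrides and cached attribute fields are replaced by plain int args plus a declarative tensor_to_grid_ mapping, its member helpers become the free functions UploadWeightsForConvConstants / RearrangeWeightsForConvConstants, and CreateConvConstants now returns a fully configured generic GPUOperation. A minimal caller-side sketch of the resulting API follows; the wrapper MakeConvIfConstant is hypothetical and added only for illustration, while the types and factory signatures are taken from the diff below (the usual conv_constants / selector includes are assumed):

// Hypothetical wrapper, for illustration only; it mirrors the updated selector code below.
std::unique_ptr<GPUOperation> MakeConvIfConstant(
    const DeviceInfo& device_info, const OperationDef& op_def,
    const Convolution2DAttributes& attr) {
  if (!IsConvConstantsSupported(device_info, op_def, attr)) return nullptr;
  // The factory now returns a plain GPUOperation by value...
  GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
  // ...so callers move it into a generic GPUOperation, not a ConvConstants.
  return absl::make_unique<GPUOperation>(std::move(conv));
}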

@@ -45,84 +45,29 @@ int GetOptimalMaxConstantSize(const DeviceInfo& info) {
return GetAdrenoOptimalMaxConstantSize(info.adreno_info.gpu_version);
}
}
} // namespace
ConvConstants::ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info)
: GPUOperation(definition),
kernel_size_(attr.weights.shape.w, attr.weights.shape.h),
stride_(attr.strides.w, attr.strides.h),
padding_(-attr.padding.prepended.w, -attr.padding.prepended.h),
dilation_(attr.dilations.w, attr.dilations.h),
src_channels_(attr.weights.shape.i),
dst_channels_(attr.weights.shape.o) {
const bool stride_correction =
definition_.IsBatchSupported() && stride_.x != 1;
code_ =
GenerateConvolutionConstantCode(definition_, kernel_size_, src_channels_,
dst_channels_, stride_correction);
if (definition_.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno3xx()) {
compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
if (definition_.precision != CalculationsPrecision::F32 &&
device_info.IsPowerVR()) {
// BUG: some PowerVRs (GE8320) produce incorrect results without it
compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
}
ConvConstants::ConvConstants(ConvConstants&& kernel)
: GPUOperation(std::move(kernel)),
kernel_size_(kernel.kernel_size_),
stride_(kernel.stride_),
padding_(kernel.padding_),
dilation_(kernel.dilation_),
src_channels_(kernel.src_channels_),
dst_channels_(kernel.dst_channels_) {}
ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) {
if (this != &kernel) {
std::swap(kernel_size_, kernel.kernel_size_);
std::swap(stride_, kernel.stride_);
std::swap(padding_, kernel.padding_);
std::swap(dilation_, kernel.dilation_);
std::swap(src_channels_, kernel.src_channels_);
std::swap(dst_channels_, kernel.dst_channels_);
GPUOperation::operator=(std::move(kernel));
}
return *this;
}
std::string ConvConstants::GenerateConvolutionConstantCode(
const OperationDef& op_def, const int2& kernel_size, int src_channels,
int dst_channels, bool stride_correction) {
std::string GenerateConvolutionConstantCode(const OperationDef& op_def,
const OHWI& weights_shape,
bool stride_correction,
GPUOperation* op) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
AddSrcTensor("src_tensor", src_desc);
op->AddSrcTensor("src_tensor", src_desc);
auto dst_desc = op_def.dst_tensors[0];
if (op_def.IsBatchSupported()) {
dst_desc.SetStateVar("BatchedWidth", "true");
}
AddDstTensor("dst_tensor", dst_desc);
args_.AddInt("stride_x");
args_.AddInt("stride_y");
args_.AddInt("padding_x");
args_.AddInt("padding_y");
args_.AddInt("dilation_x");
args_.AddInt("dilation_y");
op->AddDstTensor("dst_tensor", dst_desc);
std::string c = GetCommonDefines(op_def.precision);
const int out_z = DivideRoundUp(dst_channels, 4);
const int out_z = DivideRoundUp(weights_shape.o, 4);
const std::string kOutZ = std::to_string(out_z);
const int src_depth = DivideRoundUp(src_channels, 4);
const int src_depth = DivideRoundUp(weights_shape.i, 4);
const auto src_tensor_type = op_def.src_tensors[0].storage_type;
const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
@@ -176,11 +121,16 @@ std::string ConvConstants::GenerateConvolutionConstantCode(
"return;\n";
if (stride_correction) {
c += " int start_x = " +
GetXStrideCorrected("X", "args.src_tensor.Batch()", "args.stride_x",
"args.padding_x") +
GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x",
"args.padding_x") +
";\n";
} else {
c += " int start_x = X * args.stride_x + args.padding_x;\n";
if (op_def.IsBatchSupported()) {
c += " int start_x = X * args.stride_x + args.padding_x * "
"args.src_tensor.Batch();\n";
} else {
c += " int start_x = X * args.stride_x + args.padding_x;\n";
}
}
c += " int start_y = Y * args.stride_y + args.padding_y;\n";
c += " ACCUM_FLT4 r[" + kOutZ + "];\n";
@@ -189,22 +139,25 @@ std::string ConvConstants::GenerateConvolutionConstantCode(
c += " }\n";
int filters_counter = 0;
for (int s = 0; s < src_depth; ++s) {
const int ch_count = std::min(4, src_channels - s * 4);
const int ch_count = std::min(4, weights_shape.i - s * 4);
const std::string s_conv = "CONV" + std::to_string(ch_count);
const std::string s_count = ch_count == 1 ? "" : std::to_string(ch_count);
const std::string s_type = absl::StrCat("FLT", s_count);
const std::string s_postfix = postfixes[ch_count - 1];
for (int ky = 0; ky < kernel_size.y; ++ky) {
const std::string dilation_x =
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
: "args.dilation_x";
for (int ky = 0; ky < weights_shape.h; ++ky) {
std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)");
if (manual_clamp) {
c += " {\n";
c += " bool y_out = " + s_y + " < 0 || " + s_y +
" >= args.src_tensor.Height();\n";
}
for (int kx = 0; kx < kernel_size.x; ++kx) {
for (int kx = 0; kx < weights_shape.w; ++kx) {
c += " {\n";
std::string s_x =
absl::StrCat("(start_x + ", kx, " * args.dilation_x)");
absl::StrCat("(start_x + ", kx, " * " + dilation_x + ")");
if (manual_clamp) {
c += " bool x_out = " + s_x + "< 0 || " + s_x +
">= args.src_tensor.Width();\n";
@@ -240,20 +193,7 @@ std::string ConvConstants::GenerateConvolutionConstantCode(
return c;
}
absl::Status ConvConstants::BindArguments() {
RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x));
RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y));
RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch()));
RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y));
RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
return args_.SetInt("dilation_y", dilation_.y);
}
int3 ConvConstants::GetGridSize() const {
const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
const int grid_y = dst_[0]->Height();
return int3(grid_x, grid_y, 1);
}
} // namespace
bool IsConvConstantsSupported(const DeviceInfo& device_info,
const OperationDef& definition,
@@ -277,20 +217,41 @@ bool IsConvConstantsSupported(const DeviceInfo& device_info,
return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
}
ConvConstants CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
ConvConstants result(definition, attr, device_info);
result.UploadWeights(attr.weights);
GPUOperation CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr) {
GPUOperation op(definition);
UploadWeightsForConvConstants(attr.weights, definition.precision, &op);
op.args_.AddInt("stride_x", attr.strides.w);
op.args_.AddInt("stride_y", attr.strides.h);
op.args_.AddInt("padding_x", -attr.padding.prepended.w);
op.args_.AddInt("padding_y", -attr.padding.prepended.h);
op.args_.AddInt("dilation_x", attr.dilations.w);
op.args_.AddInt("dilation_y", attr.dilations.h);
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
const bool stride_correction =
definition.IsBatchSupported() && attr.strides.w != 1;
op.code_ = GenerateConvolutionConstantCode(definition, attr.weights.shape,
stride_correction, &op);
if (definition.precision == CalculationsPrecision::F16 &&
device_info.IsAdreno3xx()) {
op.compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
}
if (definition.precision != CalculationsPrecision::F32 &&
device_info.IsPowerVR()) {
// BUG: some PowerVRs (GE8320) produce incorrect results without it
op.compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
}
TensorLinearDescriptor desc;
desc.storage_type = LinearStorageType::BUFFER;
desc.element_type = definition.GetDataType();
desc.memory_type = MemoryType::CONSTANT;
desc.UploadLinearData(attr.bias);
result.args_.AddObject(
op.args_.AddObject(
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
return result;
return op;
}
} // namespace cl

@@ -32,78 +32,8 @@ namespace tflite {
namespace gpu {
namespace cl {
class ConvConstants : public GPUOperation {
public:
ConvConstants() = default;
absl::Status BindArguments() override;
int3 GetGridSize() const override;
// Move only
ConvConstants(ConvConstants&& kernel);
ConvConstants& operator=(ConvConstants&& kernel);
ConvConstants(const ConvConstants&) = delete;
ConvConstants& operator=(const ConvConstants&) = delete;
private:
friend ConvConstants CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvConstants(const OperationDef& definition,
const Convolution2DAttributes& attr,
const DeviceInfo& device_info);
template <DataType T>
void UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights);
template <DataType S, typename T>
void RearrangeWeightsData(const tflite::gpu::Tensor<OHWI, S>& weights,
absl::Span<T> dst);
std::string GenerateConvolutionConstantCode(const OperationDef& op_def,
const int2& kernel_size,
int src_channels,
int dst_channels,
bool stride_correction);
int2 kernel_size_;
int2 stride_;
int2 padding_;
int2 dilation_;
int src_channels_;
int dst_channels_;
};
template <DataType T>
void ConvConstants::UploadWeights(const tflite::gpu::Tensor<OHWI, T>& weights) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
const int float_size = f32_weights ? 4 : 2;
const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y;
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.memory_type = MemoryType::CONSTANT;
desc.size = float_size * float_count;
desc.data.resize(desc.size);
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4));
} else {
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4));
}
args_.AddObject("weigths",
absl::make_unique<BufferDescriptor>(std::move(desc)));
}
template <DataType S, typename T>
void ConvConstants::RearrangeWeightsData(
void RearrangeWeightsForConvConstants(
const tflite::gpu::Tensor<OHWI, S>& weights, absl::Span<T> dst) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int src_depth = DivideRoundUp(weights.shape.i, 4);
@@ -115,7 +45,7 @@ void ConvConstants::RearrangeWeightsData(
for (int y = 0; y < kernel_y; ++y) {
for (int x = 0; x < kernel_x; ++x) {
for (int d = 0; d < dst_depth; ++d) {
const int channels_count = std::min(4, src_channels_ - s * 4);
const int channels_count = std::min(4, weights.shape.i - s * 4);
T filters[4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < channels_count; ++j) {
@@ -145,13 +75,46 @@ void ConvConstants::RearrangeWeightsData(
}
}
template <DataType T>
void UploadWeightsForConvConstants(const tflite::gpu::Tensor<OHWI, T>& weights,
CalculationsPrecision precision,
GPUOperation* op) {
const int dst_depth = DivideRoundUp(weights.shape.o, 4);
const int kernel_x = weights.shape.w;
const int kernel_y = weights.shape.h;
const bool f32_weights = precision == CalculationsPrecision::F32;
const int float_size = f32_weights ? 4 : 2;
const int float_count = weights.shape.i * dst_depth * 4 * kernel_x * kernel_y;
BufferDescriptor desc;
desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
desc.element_size = 4;
desc.memory_type = MemoryType::CONSTANT;
desc.size = float_size * float_count;
desc.data.resize(desc.size);
if (f32_weights) {
float4* ptr = reinterpret_cast<float4*>(desc.data.data());
RearrangeWeightsForConvConstants(weights,
absl::MakeSpan(ptr, float_count / 4));
} else {
half4* ptr = reinterpret_cast<half4*>(desc.data.data());
RearrangeWeightsForConvConstants(weights,
absl::MakeSpan(ptr, float_count / 4));
}
op->args_.AddObject("weigths",
absl::make_unique<BufferDescriptor>(std::move(desc)));
}
bool IsConvConstantsSupported(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
ConvConstants CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
GPUOperation CreateConvConstants(const DeviceInfo& device_info,
const OperationDef& definition,
const Convolution2DAttributes& attr);
} // namespace cl
} // namespace gpu

@@ -55,7 +55,7 @@ TEST_F(OpenCLOperationTest, ConvConstantsSimpleWeights) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvConstants operation =
GPUOperation operation =
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 1), &dst_tensor));
@@ -90,7 +90,7 @@ TEST_F(OpenCLOperationTest, ConvConstants) {
op_def.src_tensors.push_back({data_type, storage, Layout::HWC});
op_def.dst_tensors.push_back({data_type, storage, Layout::HWC});
TensorFloat32 dst_tensor;
ConvConstants operation =
GPUOperation operation =
CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr);
ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation,
BHWC(1, 2, 2, 2), &dst_tensor));

@@ -35,8 +35,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionAdreno(
const DeviceInfo& device_info, const OperationDef& op_def,
ModelHints hints) {
if (IsConvConstantsSupported(device_info, op_def, attr)) {
ConvConstants conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<ConvConstants>(std::move(conv));
GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<GPUOperation>(std::move(conv));
} else {
ConvTexture conv = CreateConvTexture(device_info, op_def, attr);
return absl::make_unique<ConvTexture>(std::move(conv));
@@ -66,8 +66,8 @@ std::unique_ptr<GPUOperation> SelectConvolutionNVidia(
const Convolution2DAttributes& attr, const BHWC& dst_shape,
const DeviceInfo& device_info, const OperationDef& op_def) {
if (IsConvConstantsSupported(device_info, op_def, attr)) {
ConvConstants conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<ConvConstants>(std::move(conv));
GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
return absl::make_unique<GPUOperation>(std::move(conv));
} else {
ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
return absl::make_unique<ConvPowerVR>(std::move(conv));