Added support for dynamic weights in DepthWise Convolution for OpenCL.

PiperOrigin-RevId: 335916814
Change-Id: I95d93791293291e3499cf99e58bbc404119d88f8
This commit is contained in:
Raman Sarokin 2020-10-07 11:44:13 -07:00 committed by TensorFlower Gardener
parent 6f2bb31b57
commit a2e00ba674
12 changed files with 147 additions and 36 deletions

View File

@ -67,17 +67,18 @@ std::string GetSrcValue(int channel_multiplier, const std::string coords) {
return c;
}
std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
bool stride_correction,
int channel_multiplier,
bool weights_are_buffer,
GPUOperation* op) {
std::string GenerateDepthwiseConvolutionCode(
const OperationDef& op_def, bool stride_correction, int channel_multiplier,
bool weights_are_buffer, bool dynamic_weights, GPUOperation* op) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
op->AddSrcTensor("src_tensor", src_desc);
if (dynamic_weights) {
op->AddSrcTensor("weights", op_def.src_tensors[1]);
}
auto dst_desc = op_def.dst_tensors[0];
if (op_def.IsBatchSupported()) {
@ -122,16 +123,24 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
}
}
c += " int y_offseted = Y * args.stride_y + args.padding_y;\n";
std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
c += " int z_offseted = Z * args.stride_z + args.padding_z;\n";
weights_offset += " * args.kernel_size_z";
}
if (weights_are_buffer) {
c += " int fx_c = S * " + weights_offset + ";\n";
} else {
c += " int fx_c = 0;\n";
if (!dynamic_weights) {
std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
c += " int z_offseted = Z * args.stride_z + args.padding_z;\n";
weights_offset += " * args.kernel_size_z";
}
if (weights_are_buffer) {
c += " int fx_c = S * " + weights_offset + ";\n";
} else {
c += " int fx_c = 0;\n";
}
}
std::string kernel_size_x =
dynamic_weights ? "args.weights.Width()" : "args.kernel_size_x";
std::string kernel_size_y =
dynamic_weights ? "args.weights.Height()" : "args.kernel_size_y";
std::string kernel_size_z =
dynamic_weights ? "args.weights.Depth()" : "args.kernel_size_z";
std::string flat_coords = "x_c, y_c";
if (manual_clamp) {
@ -139,29 +148,35 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
check += " && !outside_z";
flat_coords += ", z_c";
c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
c += " int z_c = z_offseted + kz * args.dilation_z;\n";
c += " bool outside_z = z_c < 0 || z_c >= args.src_tensor.Depth();\n";
}
c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
c += " int y_c = y_offseted + ky * args.dilation_y;\n";
c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n";
c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
const std::string dilation_x =
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
: "args.dilation_x";
c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
c += " bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n";
c += " if (" + check + ") {\n";
if (weights_are_buffer) {
c += " FLT4 f = args.weights.Read(fx_c);\n";
if (dynamic_weights) {
c += " FLT4 f = args.weights.Read(kx, ky, S);\n";
} else {
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
if (weights_are_buffer) {
c += " FLT4 f = args.weights.Read(fx_c);\n";
} else {
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
}
}
c += GetSrcValue(channel_multiplier, flat_coords);
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
c += " };\n";
c += " fx_c++;\n";
if (!dynamic_weights) {
c += " fx_c++;\n";
}
c += " }\n";
c += " }\n";
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
@ -170,7 +185,7 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
} else { // Texture types with ZERO clamping
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
flat_coords += ", z_c";
c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
c += " int z_c = z_offseted + kz * args.dilation_z;\n";
if (src_tensor_type !=
TensorStorageType::TEXTURE_3D) { // Only TEXTURE_3D supports clamping
@ -181,20 +196,24 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
c += " }\n";
}
}
c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
c += " int y_c = y_offseted + ky * args.dilation_y;\n";
c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
const std::string dilation_x =
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
: "args.dilation_x";
c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
c += GetSrcValue(channel_multiplier, flat_coords);
if (weights_are_buffer) {
c += " FLT4 f = args.weights.Read(fx_c);\n";
if (dynamic_weights) {
c += " FLT4 f = args.weights.Read(kx, ky, S);\n";
} else {
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
if (weights_are_buffer) {
c += " FLT4 f = args.weights.Read(fx_c);\n";
} else {
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
}
c += " fx_c++;\n";
}
c += " fx_c++;\n";
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
c += " }\n";
c += " }\n";
@ -234,7 +253,7 @@ GPUOperation CreateDepthwiseConvolution2D(
definition.IsBatchSupported() && attr.strides.w != 1;
op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
attr.weights.shape.o,
weights_are_buffer, &op);
weights_are_buffer, false, &op);
UploadWeightsForDWConv2D(attr.weights, weights_are_buffer,
definition.precision, &op);
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
@ -249,6 +268,32 @@ GPUOperation CreateDepthwiseConvolution2D(
return op;
}
// Builds a 2D depthwise convolution whose filter weights are supplied at
// runtime as a second source tensor (src_tensors[1]) rather than uploaded as
// constants. The dynamic-weights path runs with a channel multiplier of 1
// (enforced by the caller in GPUOperationFromNode).
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
    const DeviceInfo& device_info, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr) {
  GPUOperation result(definition);
  // Scalar arguments referenced by name from the generated kernel code.
  // Padding is negated because the kernel adds it to the output coordinate.
  result.args_.AddInt("stride_x", attr.strides.w);
  result.args_.AddInt("padding_x", -attr.padding.prepended.w);
  result.args_.AddInt("dilation_x", attr.dilations.w);
  result.args_.AddInt("stride_y", attr.strides.h);
  result.args_.AddInt("padding_y", -attr.padding.prepended.h);
  result.args_.AddInt("dilation_y", attr.dilations.h);
  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;
  result.code_ = GenerateDepthwiseConvolutionCode(
      definition, stride_correction, /*channel_multiplier=*/1,
      /*weights_are_buffer=*/false, /*dynamic_weights=*/true, &result);
  result.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
  // Bias stays a compile-time constant; pick the linear storage type best
  // suited to the GPU vendor.
  TensorLinearDescriptor bias_desc;
  if (device_info.IsMali()) {
    bias_desc.storage_type = LinearStorageType::BUFFER;
  } else {
    bias_desc.storage_type = LinearStorageType::TEXTURE_2D;
  }
  bias_desc.element_type = definition.GetDataType();
  bias_desc.UploadLinearData(attr.bias);
  result.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(
                                       std::move(bias_desc)));
  return result;
}
GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr) {
@ -273,7 +318,7 @@ GPUOperation CreateDepthwiseConvolution3D(
definition.IsBatchSupported() && attr.strides.w != 1;
op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
attr.weights.shape.o,
weights_are_buffer, &op);
weights_are_buffer, false, &op);
UploadWeightsForDWConv3D(attr.weights, weights_are_buffer,
definition.precision, &op);
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;

View File

@ -186,6 +186,10 @@ GPUOperation CreateDepthwiseConvolution2D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr);

View File

@ -132,6 +132,7 @@ cc_library(
"//tensorflow/lite/delegates/gpu/cl/kernels:add",
"//tensorflow/lite/delegates/gpu/cl/kernels:concat_xy",
"//tensorflow/lite/delegates/gpu/cl/kernels:concat_z",
"//tensorflow/lite/delegates/gpu/cl/kernels:depthwise_conv",
"//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
"//tensorflow/lite/delegates/gpu/cl/kernels:lstm",
"//tensorflow/lite/delegates/gpu/cl/kernels:max_unpooling",

View File

@ -315,7 +315,16 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
case OperationType::DEPTHWISE_CONVOLUTION: {
auto attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
node.operation.attributes);
*gpu_op = SelectDWConvolution(attr, device_info, op_def);
if (inputs.size() == 1) {
*gpu_op = SelectDWConvolution(attr, device_info, op_def);
} else {
if (inputs[1]->tensor.shape.b != 1) {
return absl::UnimplementedError(
"No support of depthwise runtime weights with channel multiplier "
"!= 1");
}
*gpu_op = SelectDWConvolutionDynamicWeights(attr, device_info, op_def);
}
return absl::OkStatus();
}
case OperationType::FULLY_CONNECTED: {

View File

@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/kernels/add.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/lstm.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/mean.h"
@ -110,6 +111,13 @@ absl::Status SelectConcat(const ConcatAttributes& attr,
}
}
// Wraps the dynamic-weights depthwise convolution in a heap-allocated
// GPUOperation so the operation selector can store it polymorphically.
std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
    const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
    const OperationDef& op_def) {
  GPUOperation operation =
      CreateDepthwiseConvolution2DDynamicWeights(device_info, op_def, attr);
  return absl::make_unique<GPUOperation>(std::move(operation));
}
void SelectReshape(int src_channels, int dst_channels,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr) {

View File

@ -57,6 +57,10 @@ absl::Status SelectConcat(const ConcatAttributes& attr,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const OperationDef& op_def);
void SelectReshape(int src_channels, int dst_channels,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);

View File

@ -40,6 +40,10 @@ absl::Status TryDepthwiseConvPlus1x1Conv(
OperationType::DEPTHWISE_CONVOLUTION) {
return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
}
auto dw_inputs = graph.FindInputs(dw_node->id);
if (dw_inputs.size() != 1) {
return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
}
auto dw_outputs = graph.FindOutputs(dw_node->id);
auto consumers = graph.FindConsumers(dw_outputs[0]->id);
if (consumers.size() != 1) {
@ -60,7 +64,6 @@ absl::Status TryDepthwiseConvPlus1x1Conv(
dw_node->operation.attributes);
auto conv_attr =
absl::any_cast<Convolution2DAttributes>(conv_node->operation.attributes);
auto dw_inputs = graph.FindInputs(dw_node->id);
auto conv_outputs = graph.FindOutputs(conv_node->id);
OperationDef op_def;
op_def.precision = precision;

View File

@ -511,9 +511,22 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser {
const TfLiteNode* tflite_node,
const TfLiteRegistration* registration) final {
RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 6));
RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node,
/*runtime_inputs=*/1, /*outputs=*/1));
RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1));
const int runtime_inputs =
GetNumberOfRuntimeInputsForNode(context, tflite_node);
if (runtime_inputs > 2) {
return absl::InternalError(
absl::StrCat("Expected 1 or 2 input tensor(s), but node has ",
runtime_inputs, " runtime inputs."));
}
const int runtime_outputs = NumOutputs(tflite_node);
if (runtime_outputs != 1) {
return absl::InternalError(
absl::StrCat("Expected 1 output tensor(s), but node has ",
runtime_outputs, " runtime outputs."));
}
if (runtime_inputs == 1) {
RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1));
}
const TfLiteDepthwiseConvParams* tf_options;
RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options));
RETURN_IF_ERROR(CheckStridesAndDilation(
@ -567,7 +580,12 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser {
RETURN_IF_ERROR(reader->AddOutputs(node));
DepthwiseConvolution2DAttributes attr;
RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights));
const int runtime_inputs = reader->GetNumberOfRuntimeInputs();
if (runtime_inputs == 2) {
RETURN_IF_ERROR(reader->AddInput(node, 1));
} else { // runtime_inputs == 1;
RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights));
}
reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional
const TfLiteDepthwiseConvParams* tf_options;
RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options));

View File

@ -70,6 +70,12 @@ class AddBias : public NodeTransformation {
}
if (node->operation.type ==
ToString(OperationType::DEPTHWISE_CONVOLUTION)) {
if (graph->FindInputs(node->id).size() != 1) {
return {TransformStatus::DECLINED,
"This transformation is only applicable to depth wise conv "
"with one "
"runtime input."};
}
auto& attr = absl::any_cast<DepthwiseConvolution2DAttributes&>(
node->operation.attributes);
return FillBias(attr.weights.shape.o * attr.weights.shape.i, &attr.bias);

View File

@ -54,6 +54,10 @@ class MergeConvolutionWithAdd : public SequenceTransformation {
TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
GraphFloat32* graph) final {
auto& conv_node = *sequence[0];
if (graph->FindInputs(conv_node.id).size() != 1) {
return {TransformStatus::DECLINED,
"This fusion is only applicable to ops with one runtime input."};
}
auto& add_node = *sequence[1];
if (add_node.operation.type != ToString(OperationType::ADD)) {
return {TransformStatus::SKIPPED, ""};

View File

@ -38,6 +38,10 @@ class DepthwiseConvolution : public NodeShader {
public:
absl::Status GenerateCode(const GenerationContext& ctx,
GeneratedCode* generated_code) const final {
if (ctx.input_shapes.size() != 1) {
return absl::UnimplementedError(
"DepthWise Convolution does not support more than 1 runtime tensor");
}
const auto& attr =
absl::any_cast<const DepthwiseConvolution2DAttributes&>(ctx.op_attr);
auto weights = attr.weights.shape;

View File

@ -267,6 +267,11 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
device_info, options);
break;
case OperationType::DEPTHWISE_CONVOLUTION:
if (graph.FindInputs(node->id).size() != 1) {
return absl::UnimplementedError(
"DepthWise Convolution does not support more than 1 runtime "
"tensor");
}
*tasks =
SelectDepthWiseConv(node_id, inputs[0], outputs[0],
absl::any_cast<DepthwiseConvolution2DAttributes>(