Added support for dynamic weights in DepthWise Convolution for OpenCL.
PiperOrigin-RevId: 335916814 Change-Id: I95d93791293291e3499cf99e58bbc404119d88f8
This commit is contained in:
parent
6f2bb31b57
commit
a2e00ba674
@ -67,17 +67,18 @@ std::string GetSrcValue(int channel_multiplier, const std::string coords) {
|
||||
return c;
|
||||
}
|
||||
|
||||
std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
|
||||
bool stride_correction,
|
||||
int channel_multiplier,
|
||||
bool weights_are_buffer,
|
||||
GPUOperation* op) {
|
||||
std::string GenerateDepthwiseConvolutionCode(
|
||||
const OperationDef& op_def, bool stride_correction, int channel_multiplier,
|
||||
bool weights_are_buffer, bool dynamic_weights, GPUOperation* op) {
|
||||
auto src_desc = op_def.src_tensors[0];
|
||||
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
|
||||
if (op_def.IsBatchSupported()) {
|
||||
src_desc.SetStateVar("BatchedWidth", "true");
|
||||
}
|
||||
op->AddSrcTensor("src_tensor", src_desc);
|
||||
if (dynamic_weights) {
|
||||
op->AddSrcTensor("weights", op_def.src_tensors[1]);
|
||||
}
|
||||
|
||||
auto dst_desc = op_def.dst_tensors[0];
|
||||
if (op_def.IsBatchSupported()) {
|
||||
@ -122,16 +123,24 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
|
||||
}
|
||||
}
|
||||
c += " int y_offseted = Y * args.stride_y + args.padding_y;\n";
|
||||
std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
|
||||
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
|
||||
c += " int z_offseted = Z * args.stride_z + args.padding_z;\n";
|
||||
weights_offset += " * args.kernel_size_z";
|
||||
}
|
||||
if (weights_are_buffer) {
|
||||
c += " int fx_c = S * " + weights_offset + ";\n";
|
||||
} else {
|
||||
c += " int fx_c = 0;\n";
|
||||
if (!dynamic_weights) {
|
||||
std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
|
||||
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
|
||||
c += " int z_offseted = Z * args.stride_z + args.padding_z;\n";
|
||||
weights_offset += " * args.kernel_size_z";
|
||||
}
|
||||
if (weights_are_buffer) {
|
||||
c += " int fx_c = S * " + weights_offset + ";\n";
|
||||
} else {
|
||||
c += " int fx_c = 0;\n";
|
||||
}
|
||||
}
|
||||
std::string kernel_size_x =
|
||||
dynamic_weights ? "args.weights.Width()" : "args.kernel_size_x";
|
||||
std::string kernel_size_y =
|
||||
dynamic_weights ? "args.weights.Height()" : "args.kernel_size_y";
|
||||
std::string kernel_size_z =
|
||||
dynamic_weights ? "args.weights.Depth()" : "args.kernel_size_z";
|
||||
|
||||
std::string flat_coords = "x_c, y_c";
|
||||
if (manual_clamp) {
|
||||
@ -139,29 +148,35 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
|
||||
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
|
||||
check += " && !outside_z";
|
||||
flat_coords += ", z_c";
|
||||
c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
|
||||
c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
|
||||
c += " int z_c = z_offseted + kz * args.dilation_z;\n";
|
||||
c += " bool outside_z = z_c < 0 || z_c >= args.src_tensor.Depth();\n";
|
||||
}
|
||||
c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
|
||||
c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
|
||||
c += " int y_c = y_offseted + ky * args.dilation_y;\n";
|
||||
c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n";
|
||||
c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
|
||||
c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
|
||||
const std::string dilation_x =
|
||||
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
|
||||
: "args.dilation_x";
|
||||
c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
|
||||
c += " bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n";
|
||||
c += " if (" + check + ") {\n";
|
||||
if (weights_are_buffer) {
|
||||
c += " FLT4 f = args.weights.Read(fx_c);\n";
|
||||
if (dynamic_weights) {
|
||||
c += " FLT4 f = args.weights.Read(kx, ky, S);\n";
|
||||
} else {
|
||||
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
|
||||
if (weights_are_buffer) {
|
||||
c += " FLT4 f = args.weights.Read(fx_c);\n";
|
||||
} else {
|
||||
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
|
||||
}
|
||||
}
|
||||
c += GetSrcValue(channel_multiplier, flat_coords);
|
||||
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
|
||||
c += " };\n";
|
||||
c += " fx_c++;\n";
|
||||
if (!dynamic_weights) {
|
||||
c += " fx_c++;\n";
|
||||
}
|
||||
c += " }\n";
|
||||
c += " }\n";
|
||||
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
|
||||
@ -170,7 +185,7 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
|
||||
} else { // Texture types with ZERO clamping
|
||||
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
|
||||
flat_coords += ", z_c";
|
||||
c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
|
||||
c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
|
||||
c += " int z_c = z_offseted + kz * args.dilation_z;\n";
|
||||
if (src_tensor_type !=
|
||||
TensorStorageType::TEXTURE_3D) { // Only TEXTURE_3D supports clamping
|
||||
@ -181,20 +196,24 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
|
||||
c += " }\n";
|
||||
}
|
||||
}
|
||||
c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
|
||||
c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
|
||||
c += " int y_c = y_offseted + ky * args.dilation_y;\n";
|
||||
c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
|
||||
c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
|
||||
const std::string dilation_x =
|
||||
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
|
||||
: "args.dilation_x";
|
||||
c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
|
||||
c += GetSrcValue(channel_multiplier, flat_coords);
|
||||
if (weights_are_buffer) {
|
||||
c += " FLT4 f = args.weights.Read(fx_c);\n";
|
||||
if (dynamic_weights) {
|
||||
c += " FLT4 f = args.weights.Read(kx, ky, S);\n";
|
||||
} else {
|
||||
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
|
||||
if (weights_are_buffer) {
|
||||
c += " FLT4 f = args.weights.Read(fx_c);\n";
|
||||
} else {
|
||||
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
|
||||
}
|
||||
c += " fx_c++;\n";
|
||||
}
|
||||
c += " fx_c++;\n";
|
||||
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
|
||||
c += " }\n";
|
||||
c += " }\n";
|
||||
@ -234,7 +253,7 @@ GPUOperation CreateDepthwiseConvolution2D(
|
||||
definition.IsBatchSupported() && attr.strides.w != 1;
|
||||
op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
|
||||
attr.weights.shape.o,
|
||||
weights_are_buffer, &op);
|
||||
weights_are_buffer, false, &op);
|
||||
UploadWeightsForDWConv2D(attr.weights, weights_are_buffer,
|
||||
definition.precision, &op);
|
||||
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
|
||||
@ -249,6 +268,32 @@ GPUOperation CreateDepthwiseConvolution2D(
|
||||
return op;
|
||||
}
|
||||
|
||||
// Builds the 2D depthwise-convolution GPUOperation for the case where the
// filter weights are supplied at runtime instead of being uploaded from
// `attr.weights`.
//
// The kernel source is produced by GenerateDepthwiseConvolutionCode with
// channel_multiplier = 1, weights_are_buffer = false and
// dynamic_weights = true. In that mode the generator registers
// `definition.src_tensors[1]` as the "weights" source tensor and takes the
// kernel extents from args.weights.Width()/Height(), which is why no
// kernel_size_* arguments are registered here and no weights are uploaded.
//
// Parameters:
//   device_info - only used to pick the storage type of the bias data.
//   definition  - operation definition forwarded to the GPUOperation and to
//                 the code generator.
//   attr        - supplies strides, prepended padding, dilations and bias.
//
// Returns the configured GPUOperation by value.
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const DepthwiseConvolution2DAttributes& attr) {
|
||||
GPUOperation op(definition);
|
||||
op.args_.AddInt("stride_x", attr.strides.w);
|
||||
// Padding is stored negated: the generated kernel adds args.padding_* to
// the strided output coordinate to obtain the source offset.
op.args_.AddInt("padding_x", -attr.padding.prepended.w);
|
||||
op.args_.AddInt("dilation_x", attr.dilations.w);
|
||||
op.args_.AddInt("stride_y", attr.strides.h);
|
||||
op.args_.AddInt("padding_y", -attr.padding.prepended.h);
|
||||
op.args_.AddInt("dilation_y", attr.dilations.h);
|
||||
// Same condition as the constant-weights 2D/3D creators in this file:
// batched tensors with a horizontal stride other than 1.
const bool stride_correction =
|
||||
definition.IsBatchSupported() && attr.strides.w != 1;
|
||||
// Positional arguments after stride_correction: channel_multiplier = 1,
// weights_are_buffer = false, dynamic_weights = true.
op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, 1,
|
||||
false, true, &op);
|
||||
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
|
||||
|
||||
// Bias is still constant data taken from `attr`; it is uploaded into a
// linear descriptor and exposed to the kernel as the "biases" object.
TensorLinearDescriptor desc;
|
||||
// Linear buffer storage on Mali, 2D texture storage on every other device.
desc.storage_type = device_info.IsMali() ? LinearStorageType::BUFFER
|
||||
: LinearStorageType::TEXTURE_2D;
|
||||
desc.element_type = definition.GetDataType();
|
||||
// NOTE(review): AddBias declines depthwise convolutions with more than one
// runtime input, so attr.bias may be empty here -- confirm that
// UploadLinearData handles an empty bias tensor.
desc.UploadLinearData(attr.bias);
|
||||
op.args_.AddObject(
|
||||
"biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
|
||||
return op;
|
||||
}
|
||||
|
||||
GPUOperation CreateDepthwiseConvolution3D(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const DepthwiseConvolution3DAttributes& attr) {
|
||||
@ -273,7 +318,7 @@ GPUOperation CreateDepthwiseConvolution3D(
|
||||
definition.IsBatchSupported() && attr.strides.w != 1;
|
||||
op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
|
||||
attr.weights.shape.o,
|
||||
weights_are_buffer, &op);
|
||||
weights_are_buffer, false, &op);
|
||||
UploadWeightsForDWConv3D(attr.weights, weights_are_buffer,
|
||||
definition.precision, &op);
|
||||
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
|
||||
|
@ -186,6 +186,10 @@ GPUOperation CreateDepthwiseConvolution2D(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const DepthwiseConvolution2DAttributes& attr);
|
||||
|
||||
// Creates a 2D depthwise-convolution operation whose weights are read from a
// second runtime source tensor rather than uploaded from `attr.weights`.
// Strides, padding, dilations and bias are still taken from `attr`.
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const DepthwiseConvolution2DAttributes& attr);
|
||||
|
||||
GPUOperation CreateDepthwiseConvolution3D(
|
||||
const DeviceInfo& device_info, const OperationDef& definition,
|
||||
const DepthwiseConvolution3DAttributes& attr);
|
||||
|
@ -132,6 +132,7 @@ cc_library(
|
||||
"//tensorflow/lite/delegates/gpu/cl/kernels:add",
|
||||
"//tensorflow/lite/delegates/gpu/cl/kernels:concat_xy",
|
||||
"//tensorflow/lite/delegates/gpu/cl/kernels:concat_z",
|
||||
"//tensorflow/lite/delegates/gpu/cl/kernels:depthwise_conv",
|
||||
"//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
|
||||
"//tensorflow/lite/delegates/gpu/cl/kernels:lstm",
|
||||
"//tensorflow/lite/delegates/gpu/cl/kernels:max_unpooling",
|
||||
|
@ -315,7 +315,16 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
|
||||
case OperationType::DEPTHWISE_CONVOLUTION: {
|
||||
auto attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
|
||||
node.operation.attributes);
|
||||
*gpu_op = SelectDWConvolution(attr, device_info, op_def);
|
||||
if (inputs.size() == 1) {
|
||||
*gpu_op = SelectDWConvolution(attr, device_info, op_def);
|
||||
} else {
|
||||
if (inputs[1]->tensor.shape.b != 1) {
|
||||
return absl::UnimplementedError(
|
||||
"No support of depthwise runtime weights with channel multiplier "
|
||||
"!= 1");
|
||||
}
|
||||
*gpu_op = SelectDWConvolutionDynamicWeights(attr, device_info, op_def);
|
||||
}
|
||||
return absl::OkStatus();
|
||||
}
|
||||
case OperationType::FULLY_CONNECTED: {
|
||||
|
@ -22,6 +22,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/delegates/gpu/cl/kernels/add.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/kernels/lstm.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h"
|
||||
#include "tensorflow/lite/delegates/gpu/cl/kernels/mean.h"
|
||||
@ -110,6 +111,13 @@ absl::Status SelectConcat(const ConcatAttributes& attr,
|
||||
}
|
||||
}
|
||||
|
||||
// Selector for depthwise convolution with runtime-provided weights.
// There is a single implementation, so this only heap-allocates the
// GPUOperation returned by CreateDepthwiseConvolution2DDynamicWeights and
// hands ownership to the caller. Note the argument order differs from the
// creator: (attr, device_info, op_def) here vs. (device_info, op_def, attr).
std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
|
||||
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
|
||||
const OperationDef& op_def) {
|
||||
return absl::make_unique<GPUOperation>(
|
||||
CreateDepthwiseConvolution2DDynamicWeights(device_info, op_def, attr));
|
||||
}
|
||||
|
||||
void SelectReshape(int src_channels, int dst_channels,
|
||||
const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr) {
|
||||
|
@ -57,6 +57,10 @@ absl::Status SelectConcat(const ConcatAttributes& attr,
|
||||
const DeviceInfo& device_info,
|
||||
std::unique_ptr<GPUOperation>* ptr);
|
||||
|
||||
// Returns a newly allocated depthwise-convolution operation that takes its
// weights from a runtime input tensor. The caller owns the result.
std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
|
||||
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
|
||||
const OperationDef& op_def);
|
||||
|
||||
void SelectReshape(int src_channels, int dst_channels,
|
||||
const OperationDef& op_def,
|
||||
std::unique_ptr<GPUOperation>* ptr);
|
||||
|
@ -40,6 +40,10 @@ absl::Status TryDepthwiseConvPlus1x1Conv(
|
||||
OperationType::DEPTHWISE_CONVOLUTION) {
|
||||
return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
|
||||
}
|
||||
auto dw_inputs = graph.FindInputs(dw_node->id);
|
||||
if (dw_inputs.size() != 1) {
|
||||
return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
|
||||
}
|
||||
auto dw_outputs = graph.FindOutputs(dw_node->id);
|
||||
auto consumers = graph.FindConsumers(dw_outputs[0]->id);
|
||||
if (consumers.size() != 1) {
|
||||
@ -60,7 +64,6 @@ absl::Status TryDepthwiseConvPlus1x1Conv(
|
||||
dw_node->operation.attributes);
|
||||
auto conv_attr =
|
||||
absl::any_cast<Convolution2DAttributes>(conv_node->operation.attributes);
|
||||
auto dw_inputs = graph.FindInputs(dw_node->id);
|
||||
auto conv_outputs = graph.FindOutputs(conv_node->id);
|
||||
OperationDef op_def;
|
||||
op_def.precision = precision;
|
||||
|
@ -511,9 +511,22 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser {
|
||||
const TfLiteNode* tflite_node,
|
||||
const TfLiteRegistration* registration) final {
|
||||
RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 6));
|
||||
RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node,
|
||||
/*runtime_inputs=*/1, /*outputs=*/1));
|
||||
RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1));
|
||||
const int runtime_inputs =
|
||||
GetNumberOfRuntimeInputsForNode(context, tflite_node);
|
||||
if (runtime_inputs > 2) {
|
||||
return absl::InternalError(
|
||||
absl::StrCat("Expected 1 or 2 input tensor(s), but node has ",
|
||||
runtime_inputs, " runtime inputs."));
|
||||
}
|
||||
const int runtime_outputs = NumOutputs(tflite_node);
|
||||
if (runtime_outputs != 1) {
|
||||
return absl::InternalError(
|
||||
absl::StrCat("Expected 1 output tensor(s), but node has ",
|
||||
runtime_outputs, " runtime outputs."));
|
||||
}
|
||||
if (runtime_inputs == 1) {
|
||||
RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1));
|
||||
}
|
||||
const TfLiteDepthwiseConvParams* tf_options;
|
||||
RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options));
|
||||
RETURN_IF_ERROR(CheckStridesAndDilation(
|
||||
@ -567,7 +580,12 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser {
|
||||
RETURN_IF_ERROR(reader->AddOutputs(node));
|
||||
|
||||
DepthwiseConvolution2DAttributes attr;
|
||||
RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights));
|
||||
const int runtime_inputs = reader->GetNumberOfRuntimeInputs();
|
||||
if (runtime_inputs == 2) {
|
||||
RETURN_IF_ERROR(reader->AddInput(node, 1));
|
||||
} else { // runtime_inputs == 1;
|
||||
RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights));
|
||||
}
|
||||
reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional
|
||||
const TfLiteDepthwiseConvParams* tf_options;
|
||||
RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options));
|
||||
|
@ -70,6 +70,12 @@ class AddBias : public NodeTransformation {
|
||||
}
|
||||
if (node->operation.type ==
|
||||
ToString(OperationType::DEPTHWISE_CONVOLUTION)) {
|
||||
if (graph->FindInputs(node->id).size() != 1) {
|
||||
return {TransformStatus::DECLINED,
|
||||
"This transformation is only applicable to depth wise conv "
|
||||
"with one "
|
||||
"runtime input."};
|
||||
}
|
||||
auto& attr = absl::any_cast<DepthwiseConvolution2DAttributes&>(
|
||||
node->operation.attributes);
|
||||
return FillBias(attr.weights.shape.o * attr.weights.shape.i, &attr.bias);
|
||||
|
@ -54,6 +54,10 @@ class MergeConvolutionWithAdd : public SequenceTransformation {
|
||||
TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
|
||||
GraphFloat32* graph) final {
|
||||
auto& conv_node = *sequence[0];
|
||||
if (graph->FindInputs(conv_node.id).size() != 1) {
|
||||
return {TransformStatus::DECLINED,
|
||||
"This fusion is only applicable to ops with one runtime input."};
|
||||
}
|
||||
auto& add_node = *sequence[1];
|
||||
if (add_node.operation.type != ToString(OperationType::ADD)) {
|
||||
return {TransformStatus::SKIPPED, ""};
|
||||
|
@ -38,6 +38,10 @@ class DepthwiseConvolution : public NodeShader {
|
||||
public:
|
||||
absl::Status GenerateCode(const GenerationContext& ctx,
|
||||
GeneratedCode* generated_code) const final {
|
||||
if (ctx.input_shapes.size() != 1) {
|
||||
return absl::UnimplementedError(
|
||||
"DepthWise Convolution does not support more than 1 runtime tensor");
|
||||
}
|
||||
const auto& attr =
|
||||
absl::any_cast<const DepthwiseConvolution2DAttributes&>(ctx.op_attr);
|
||||
auto weights = attr.weights.shape;
|
||||
|
@ -267,6 +267,11 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
|
||||
device_info, options);
|
||||
break;
|
||||
case OperationType::DEPTHWISE_CONVOLUTION:
|
||||
if (graph.FindInputs(node->id).size() != 1) {
|
||||
return absl::UnimplementedError(
|
||||
"DepthWise Convolution does not support more than 1 runtime "
|
||||
"tensor");
|
||||
}
|
||||
*tasks =
|
||||
SelectDepthWiseConv(node_id, inputs[0], outputs[0],
|
||||
absl::any_cast<DepthwiseConvolution2DAttributes>(
|
||||
|
Loading…
Reference in New Issue
Block a user