Added support for dynamic weights in DepthWise Convolution for OpenCL.

PiperOrigin-RevId: 335916814
Change-Id: I95d93791293291e3499cf99e58bbc404119d88f8
This commit is contained in:
Raman Sarokin 2020-10-07 11:44:13 -07:00 committed by TensorFlower Gardener
parent 6f2bb31b57
commit a2e00ba674
12 changed files with 147 additions and 36 deletions

View File

@ -67,17 +67,18 @@ std::string GetSrcValue(int channel_multiplier, const std::string coords) {
return c;
}
std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
bool stride_correction,
int channel_multiplier,
bool weights_are_buffer,
GPUOperation* op) {
std::string GenerateDepthwiseConvolutionCode(
const OperationDef& op_def, bool stride_correction, int channel_multiplier,
bool weights_are_buffer, bool dynamic_weights, GPUOperation* op) {
auto src_desc = op_def.src_tensors[0];
src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
if (op_def.IsBatchSupported()) {
src_desc.SetStateVar("BatchedWidth", "true");
}
op->AddSrcTensor("src_tensor", src_desc);
if (dynamic_weights) {
op->AddSrcTensor("weights", op_def.src_tensors[1]);
}
auto dst_desc = op_def.dst_tensors[0];
if (op_def.IsBatchSupported()) {
@ -122,16 +123,24 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
}
}
c += " int y_offseted = Y * args.stride_y + args.padding_y;\n";
std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
c += " int z_offseted = Z * args.stride_z + args.padding_z;\n";
weights_offset += " * args.kernel_size_z";
}
if (weights_are_buffer) {
c += " int fx_c = S * " + weights_offset + ";\n";
} else {
c += " int fx_c = 0;\n";
if (!dynamic_weights) {
std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
c += " int z_offseted = Z * args.stride_z + args.padding_z;\n";
weights_offset += " * args.kernel_size_z";
}
if (weights_are_buffer) {
c += " int fx_c = S * " + weights_offset + ";\n";
} else {
c += " int fx_c = 0;\n";
}
}
std::string kernel_size_x =
dynamic_weights ? "args.weights.Width()" : "args.kernel_size_x";
std::string kernel_size_y =
dynamic_weights ? "args.weights.Height()" : "args.kernel_size_y";
std::string kernel_size_z =
dynamic_weights ? "args.weights.Depth()" : "args.kernel_size_z";
std::string flat_coords = "x_c, y_c";
if (manual_clamp) {
@ -139,29 +148,35 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
check += " && !outside_z";
flat_coords += ", z_c";
c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
c += " int z_c = z_offseted + kz * args.dilation_z;\n";
c += " bool outside_z = z_c < 0 || z_c >= args.src_tensor.Depth();\n";
}
c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
c += " int y_c = y_offseted + ky * args.dilation_y;\n";
c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n";
c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
const std::string dilation_x =
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
: "args.dilation_x";
c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
c += " bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n";
c += " if (" + check + ") {\n";
if (weights_are_buffer) {
c += " FLT4 f = args.weights.Read(fx_c);\n";
if (dynamic_weights) {
c += " FLT4 f = args.weights.Read(kx, ky, S);\n";
} else {
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
if (weights_are_buffer) {
c += " FLT4 f = args.weights.Read(fx_c);\n";
} else {
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
}
}
c += GetSrcValue(channel_multiplier, flat_coords);
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
c += " };\n";
c += " fx_c++;\n";
if (!dynamic_weights) {
c += " fx_c++;\n";
}
c += " }\n";
c += " }\n";
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
@ -170,7 +185,7 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
} else { // Texture types with ZERO clamping
if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) {
flat_coords += ", z_c";
c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
c += " int z_c = z_offseted + kz * args.dilation_z;\n";
if (src_tensor_type !=
TensorStorageType::TEXTURE_3D) { // Only TEXTURE_3D supports clamping
@ -181,20 +196,24 @@ std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def,
c += " }\n";
}
}
c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
c += " int y_c = y_offseted + ky * args.dilation_y;\n";
c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
const std::string dilation_x =
op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()"
: "args.dilation_x";
c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
c += GetSrcValue(channel_multiplier, flat_coords);
if (weights_are_buffer) {
c += " FLT4 f = args.weights.Read(fx_c);\n";
if (dynamic_weights) {
c += " FLT4 f = args.weights.Read(kx, ky, S);\n";
} else {
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
if (weights_are_buffer) {
c += " FLT4 f = args.weights.Read(fx_c);\n";
} else {
c += " FLT4 f = args.weights.Read(fx_c, S);\n";
}
c += " fx_c++;\n";
}
c += " fx_c++;\n";
c += " r += TO_ACCUM_TYPE(src_final * f);\n";
c += " }\n";
c += " }\n";
@ -234,7 +253,7 @@ GPUOperation CreateDepthwiseConvolution2D(
definition.IsBatchSupported() && attr.strides.w != 1;
op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
attr.weights.shape.o,
weights_are_buffer, &op);
weights_are_buffer, false, &op);
UploadWeightsForDWConv2D(attr.weights, weights_are_buffer,
definition.precision, &op);
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
@ -249,6 +268,32 @@ GPUOperation CreateDepthwiseConvolution2D(
return op;
}
// Builds a 2D depthwise convolution whose filter weights are supplied at
// runtime as a second source tensor (src_tensors[1]) rather than uploaded as
// constants. The dynamic-weights path runs with a channel multiplier of 1
// (enforced by the caller in GPUOperationFromNode).
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
    const DeviceInfo& device_info, const OperationDef& definition,
    const DepthwiseConvolution2DAttributes& attr) {
  GPUOperation result(definition);
  // Scalar arguments referenced by name from the generated kernel code.
  // Padding is negated because the kernel adds it to the output coordinate.
  result.args_.AddInt("stride_x", attr.strides.w);
  result.args_.AddInt("padding_x", -attr.padding.prepended.w);
  result.args_.AddInt("dilation_x", attr.dilations.w);
  result.args_.AddInt("stride_y", attr.strides.h);
  result.args_.AddInt("padding_y", -attr.padding.prepended.h);
  result.args_.AddInt("dilation_y", attr.dilations.h);
  const bool stride_correction =
      definition.IsBatchSupported() && attr.strides.w != 1;
  result.code_ = GenerateDepthwiseConvolutionCode(
      definition, stride_correction, /*channel_multiplier=*/1,
      /*weights_are_buffer=*/false, /*dynamic_weights=*/true, &result);
  result.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
  // Bias stays a compile-time constant; pick the linear storage type best
  // suited to the GPU vendor.
  TensorLinearDescriptor bias_desc;
  if (device_info.IsMali()) {
    bias_desc.storage_type = LinearStorageType::BUFFER;
  } else {
    bias_desc.storage_type = LinearStorageType::TEXTURE_2D;
  }
  bias_desc.element_type = definition.GetDataType();
  bias_desc.UploadLinearData(attr.bias);
  result.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(
                                       std::move(bias_desc)));
  return result;
}
GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr) {
@ -273,7 +318,7 @@ GPUOperation CreateDepthwiseConvolution3D(
definition.IsBatchSupported() && attr.strides.w != 1;
op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction,
attr.weights.shape.o,
weights_are_buffer, &op);
weights_are_buffer, false, &op);
UploadWeightsForDWConv3D(attr.weights, weights_are_buffer,
definition.precision, &op);
op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;

View File

@ -186,6 +186,10 @@ GPUOperation CreateDepthwiseConvolution2D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
GPUOperation CreateDepthwiseConvolution2DDynamicWeights(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution2DAttributes& attr);
GPUOperation CreateDepthwiseConvolution3D(
const DeviceInfo& device_info, const OperationDef& definition,
const DepthwiseConvolution3DAttributes& attr);

View File

@ -132,6 +132,7 @@ cc_library(
"//tensorflow/lite/delegates/gpu/cl/kernels:add",
"//tensorflow/lite/delegates/gpu/cl/kernels:concat_xy",
"//tensorflow/lite/delegates/gpu/cl/kernels:concat_z",
"//tensorflow/lite/delegates/gpu/cl/kernels:depthwise_conv",
"//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation",
"//tensorflow/lite/delegates/gpu/cl/kernels:lstm",
"//tensorflow/lite/delegates/gpu/cl/kernels:max_unpooling",

View File

@ -315,7 +315,16 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info,
case OperationType::DEPTHWISE_CONVOLUTION: {
auto attr = absl::any_cast<DepthwiseConvolution2DAttributes>(
node.operation.attributes);
*gpu_op = SelectDWConvolution(attr, device_info, op_def);
if (inputs.size() == 1) {
*gpu_op = SelectDWConvolution(attr, device_info, op_def);
} else {
if (inputs[1]->tensor.shape.b != 1) {
return absl::UnimplementedError(
"No support of depthwise runtime weights with channel multiplier "
"!= 1");
}
*gpu_op = SelectDWConvolutionDynamicWeights(attr, device_info, op_def);
}
return absl::OkStatus();
}
case OperationType::FULLY_CONNECTED: {

View File

@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/lite/delegates/gpu/cl/kernels/add.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/lstm.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/mean.h"
@ -110,6 +111,13 @@ absl::Status SelectConcat(const ConcatAttributes& attr,
}
}
// Wraps the dynamic-weights depthwise convolution in a heap-allocated
// GPUOperation so the operation selector can store it polymorphically.
std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
    const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
    const OperationDef& op_def) {
  GPUOperation operation =
      CreateDepthwiseConvolution2DDynamicWeights(device_info, op_def, attr);
  return absl::make_unique<GPUOperation>(std::move(operation));
}
void SelectReshape(int src_channels, int dst_channels,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr) {

View File

@ -57,6 +57,10 @@ absl::Status SelectConcat(const ConcatAttributes& attr,
const DeviceInfo& device_info,
std::unique_ptr<GPUOperation>* ptr);
std::unique_ptr<GPUOperation> SelectDWConvolutionDynamicWeights(
const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info,
const OperationDef& op_def);
void SelectReshape(int src_channels, int dst_channels,
const OperationDef& op_def,
std::unique_ptr<GPUOperation>* ptr);

View File

@ -40,6 +40,10 @@ absl::Status TryDepthwiseConvPlus1x1Conv(
OperationType::DEPTHWISE_CONVOLUTION) {
return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
}
auto dw_inputs = graph.FindInputs(dw_node->id);
if (dw_inputs.size() != 1) {
return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable.");
}
auto dw_outputs = graph.FindOutputs(dw_node->id);
auto consumers = graph.FindConsumers(dw_outputs[0]->id);
if (consumers.size() != 1) {
@ -60,7 +64,6 @@ absl::Status TryDepthwiseConvPlus1x1Conv(
dw_node->operation.attributes);
auto conv_attr =
absl::any_cast<Convolution2DAttributes>(conv_node->operation.attributes);
auto dw_inputs = graph.FindInputs(dw_node->id);
auto conv_outputs = graph.FindOutputs(conv_node->id);
OperationDef op_def;
op_def.precision = precision;

View File

@ -511,9 +511,22 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser {
const TfLiteNode* tflite_node,
const TfLiteRegistration* registration) final {
RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 6));
RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node,
/*runtime_inputs=*/1, /*outputs=*/1));
RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1));
const int runtime_inputs =
GetNumberOfRuntimeInputsForNode(context, tflite_node);
if (runtime_inputs > 2) {
return absl::InternalError(
absl::StrCat("Expected 1 or 2 input tensor(s), but node has ",
runtime_inputs, " runtime inputs."));
}
const int runtime_outputs = NumOutputs(tflite_node);
if (runtime_outputs != 1) {
return absl::InternalError(
absl::StrCat("Expected 1 output tensor(s), but node has ",
runtime_outputs, " runtime outputs."));
}
if (runtime_inputs == 1) {
RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1));
}
const TfLiteDepthwiseConvParams* tf_options;
RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options));
RETURN_IF_ERROR(CheckStridesAndDilation(
@ -567,7 +580,12 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser {
RETURN_IF_ERROR(reader->AddOutputs(node));
DepthwiseConvolution2DAttributes attr;
RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights));
const int runtime_inputs = reader->GetNumberOfRuntimeInputs();
if (runtime_inputs == 2) {
RETURN_IF_ERROR(reader->AddInput(node, 1));
} else { // runtime_inputs == 1;
RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights));
}
reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional
const TfLiteDepthwiseConvParams* tf_options;
RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options));

View File

@ -70,6 +70,12 @@ class AddBias : public NodeTransformation {
}
if (node->operation.type ==
ToString(OperationType::DEPTHWISE_CONVOLUTION)) {
if (graph->FindInputs(node->id).size() != 1) {
return {TransformStatus::DECLINED,
"This transformation is only applicable to depth wise conv "
"with one "
"runtime input."};
}
auto& attr = absl::any_cast<DepthwiseConvolution2DAttributes&>(
node->operation.attributes);
return FillBias(attr.weights.shape.o * attr.weights.shape.i, &attr.bias);

View File

@ -54,6 +54,10 @@ class MergeConvolutionWithAdd : public SequenceTransformation {
TransformResult ApplyToNodesSequence(const std::vector<Node*>& sequence,
GraphFloat32* graph) final {
auto& conv_node = *sequence[0];
if (graph->FindInputs(conv_node.id).size() != 1) {
return {TransformStatus::DECLINED,
"This fusion is only applicable to ops with one runtime input."};
}
auto& add_node = *sequence[1];
if (add_node.operation.type != ToString(OperationType::ADD)) {
return {TransformStatus::SKIPPED, ""};

View File

@ -38,6 +38,10 @@ class DepthwiseConvolution : public NodeShader {
public:
absl::Status GenerateCode(const GenerationContext& ctx,
GeneratedCode* generated_code) const final {
if (ctx.input_shapes.size() != 1) {
return absl::UnimplementedError(
"DepthWise Convolution does not support more than 1 runtime tensor");
}
const auto& attr =
absl::any_cast<const DepthwiseConvolution2DAttributes&>(ctx.op_attr);
auto weights = attr.weights.shape;

View File

@ -267,6 +267,11 @@ absl::Status RegisterPrimaryOps(const GraphFloat32& graph, const Node* node,
device_info, options);
break;
case OperationType::DEPTHWISE_CONVOLUTION:
if (graph.FindInputs(node->id).size() != 1) {
return absl::UnimplementedError(
"DepthWise Convolution does not support more than 1 runtime "
"tensor");
}
*tasks =
SelectDepthWiseConv(node_id, inputs[0], outputs[0],
absl::any_cast<DepthwiseConvolution2DAttributes>(