Remove portable_optimized depthwise_conv implementation.
PR #34999 surfaced inconsistencies between the reference and portable_optimized implementations and we are also slowly transitioning to CMSIS-NN for the optimized implementations for ARM Cortex-M targets. There may still be an issue with the reference depthwise conv + dilation > 1 but that would be a separate bug. PiperOrigin-RevId: 317802639 Change-Id: Ic8c9acffb060bb3ec5f802a2045ac6ac8b9f2233
This commit is contained in:
parent
40e860b7a4
commit
934df8dcea
@ -102,30 +102,6 @@ cc_library(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO(b/144176795): This target should really be handled differently so that we
|
|
||||||
# do not have a fork in the build graph. The bug has some initial ideas.
|
|
||||||
cc_library(
|
|
||||||
name = "portable_optimized_op_resolver",
|
|
||||||
srcs = [
|
|
||||||
"all_ops_resolver.cc",
|
|
||||||
"micro_mutable_op_resolver.h",
|
|
||||||
"micro_op_resolver.h",
|
|
||||||
],
|
|
||||||
hdrs = [
|
|
||||||
"all_ops_resolver.h",
|
|
||||||
],
|
|
||||||
copts = micro_copts(),
|
|
||||||
deps = [
|
|
||||||
":micro_compatibility",
|
|
||||||
"//tensorflow/lite/c:common",
|
|
||||||
"//tensorflow/lite/core/api",
|
|
||||||
"//tensorflow/lite/kernels:op_macros",
|
|
||||||
"//tensorflow/lite/kernels/internal:compatibility",
|
|
||||||
"//tensorflow/lite/micro/kernels:portable_optimized_micro_ops",
|
|
||||||
"//tensorflow/lite/schema:schema_fbs",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "debug_log",
|
name = "debug_log",
|
||||||
srcs = [
|
srcs = [
|
||||||
|
|||||||
@ -20,7 +20,6 @@ package_group(
|
|||||||
packages = ["//tensorflow/lite/micro"],
|
packages = ["//tensorflow/lite/micro"],
|
||||||
)
|
)
|
||||||
|
|
||||||
# LINT.IfChange(micro_ops)
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "micro_ops",
|
name = "micro_ops",
|
||||||
srcs = [
|
srcs = [
|
||||||
@ -106,73 +105,6 @@ cc_library(
|
|||||||
],
|
],
|
||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
# LINT.ThenChange(//tensorflow/lite/micro/kernels/BUILD:portable_optimized_micro_ops)
|
|
||||||
|
|
||||||
# LINT.IfChange(portable_optimized_micro_ops)
|
|
||||||
cc_library(
|
|
||||||
name = "portable_optimized_micro_ops",
|
|
||||||
srcs = [
|
|
||||||
"activations.cc",
|
|
||||||
"add.cc",
|
|
||||||
"arg_min_max.cc",
|
|
||||||
"ceil.cc",
|
|
||||||
"circular_buffer.cc",
|
|
||||||
"comparisons.cc",
|
|
||||||
"concatenation.cc",
|
|
||||||
"conv.cc",
|
|
||||||
"dequantize.cc",
|
|
||||||
"elementwise.cc",
|
|
||||||
"ethosu.cc",
|
|
||||||
"floor.cc",
|
|
||||||
"fully_connected.cc",
|
|
||||||
"l2norm.cc",
|
|
||||||
"logical.cc",
|
|
||||||
"logistic.cc",
|
|
||||||
"maximum_minimum.cc",
|
|
||||||
"mul.cc",
|
|
||||||
"neg.cc",
|
|
||||||
"pack.cc",
|
|
||||||
"pad.cc",
|
|
||||||
"pooling.cc",
|
|
||||||
"portable_optimized/depthwise_conv.cc",
|
|
||||||
"prelu.cc",
|
|
||||||
"quantize.cc",
|
|
||||||
"reduce.cc",
|
|
||||||
"reshape.cc",
|
|
||||||
"resize_nearest_neighbor.cc",
|
|
||||||
"round.cc",
|
|
||||||
"softmax.cc",
|
|
||||||
"split.cc",
|
|
||||||
"strided_slice.cc",
|
|
||||||
"sub.cc",
|
|
||||||
"svdf.cc",
|
|
||||||
"tanh.cc",
|
|
||||||
"unpack.cc",
|
|
||||||
],
|
|
||||||
hdrs = ["micro_ops.h"],
|
|
||||||
copts = micro_copts(),
|
|
||||||
visibility = [
|
|
||||||
# Needed for micro:portable_optimized_ops_resolver but visibility can not be
|
|
||||||
# finer-grained than a package.
|
|
||||||
":micro_top_level",
|
|
||||||
],
|
|
||||||
deps = [
|
|
||||||
":activation_utils",
|
|
||||||
":micro_utils",
|
|
||||||
"//tensorflow/lite/c:common",
|
|
||||||
"//tensorflow/lite/kernels:kernel_util",
|
|
||||||
"//tensorflow/lite/kernels:op_macros",
|
|
||||||
"//tensorflow/lite/kernels:padding",
|
|
||||||
"//tensorflow/lite/kernels/internal:common",
|
|
||||||
"//tensorflow/lite/kernels/internal:compatibility",
|
|
||||||
"//tensorflow/lite/kernels/internal:quantization_util",
|
|
||||||
"//tensorflow/lite/kernels/internal:reference_base",
|
|
||||||
"//tensorflow/lite/kernels/internal:tensor",
|
|
||||||
"//tensorflow/lite/kernels/internal:types",
|
|
||||||
"//tensorflow/lite/micro:micro_utils",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
# LINT.ThenChange(//tensorflow/lite/micro/kernels/BUILD:micro_ops)
|
|
||||||
|
|
||||||
test_suite(
|
test_suite(
|
||||||
name = "all_tests",
|
name = "all_tests",
|
||||||
@ -214,19 +146,6 @@ tflite_micro_cc_test(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
tflite_micro_cc_test(
|
|
||||||
name = "portable_optimized_depthwise_conv_test",
|
|
||||||
srcs = [
|
|
||||||
"depthwise_conv_test.cc",
|
|
||||||
],
|
|
||||||
deps = [
|
|
||||||
"//tensorflow/lite/c:common",
|
|
||||||
"//tensorflow/lite/kernels/internal:tensor",
|
|
||||||
"//tensorflow/lite/micro:portable_optimized_op_resolver",
|
|
||||||
"//tensorflow/lite/micro/testing:micro_test",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
tflite_micro_cc_test(
|
tflite_micro_cc_test(
|
||||||
name = "fully_connected_test",
|
name = "fully_connected_test",
|
||||||
srcs = [
|
srcs = [
|
||||||
|
|||||||
@ -1,515 +0,0 @@
|
|||||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
==============================================================================*/
|
|
||||||
|
|
||||||
#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
|
|
||||||
|
|
||||||
#include "tensorflow/lite/c/builtin_op_data.h"
|
|
||||||
#include "tensorflow/lite/c/common.h"
|
|
||||||
#include "tensorflow/lite/kernels/internal/common.h"
|
|
||||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
|
||||||
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
|
|
||||||
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
|
|
||||||
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
|
|
||||||
#include "tensorflow/lite/kernels/kernel_util.h"
|
|
||||||
#include "tensorflow/lite/kernels/padding.h"
|
|
||||||
|
|
||||||
namespace tflite {
|
|
||||||
namespace ops {
|
|
||||||
namespace micro {
|
|
||||||
namespace depthwise_conv {
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
constexpr int kInputTensor = 0;
|
|
||||||
constexpr int kFilterTensor = 1;
|
|
||||||
constexpr int kBiasTensor = 2;
|
|
||||||
constexpr int kOutputTensor = 0;
|
|
||||||
constexpr int kMaxChannels = 256;
|
|
||||||
|
|
||||||
// Depthwise conv is quantized along dimension 3:
|
|
||||||
// https://www.tensorflow.org/lite/performance/quantization_spec
|
|
||||||
constexpr int kDepthwiseConvQuantizedDimension = 3;
|
|
||||||
|
|
||||||
// Size of the cached buffer we'll be using to hold reordered weights.
|
|
||||||
constexpr int kReshapedFilterDataSize = 1 * 1024;
|
|
||||||
|
|
||||||
struct OpData {
|
|
||||||
TfLitePaddingValues padding;
|
|
||||||
// The scaling factor from input to output (aka the 'real multiplier') can
|
|
||||||
// be represented as a fixed point multiplier plus a left shift.
|
|
||||||
int32_t output_multiplier;
|
|
||||||
int output_shift;
|
|
||||||
|
|
||||||
// Per channel output multiplier and shift.
|
|
||||||
int32_t per_channel_output_multiplier[kMaxChannels];
|
|
||||||
int32_t per_channel_output_shift[kMaxChannels];
|
|
||||||
|
|
||||||
// The range of the fused activation layer. For example for kNone and
|
|
||||||
// uint8_t these would be 0 and 255.
|
|
||||||
int32_t output_activation_min;
|
|
||||||
int32_t output_activation_max;
|
|
||||||
};
|
|
||||||
|
|
||||||
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
|
|
||||||
TfLiteDepthwiseConvParams* params, int width,
|
|
||||||
int height, int filter_width, int filter_height,
|
|
||||||
int out_width, int out_height,
|
|
||||||
const TfLiteType data_type, OpData* data) {
|
|
||||||
bool has_bias = node->inputs->size == 3;
|
|
||||||
// Check number of inputs/outputs
|
|
||||||
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
|
|
||||||
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
|
|
||||||
|
|
||||||
// Matching GetWindowedOutputSize in TensorFlow.
|
|
||||||
auto padding = params->padding;
|
|
||||||
data->padding = ComputePaddingHeightWidth(
|
|
||||||
params->stride_height, params->stride_width,
|
|
||||||
params->dilation_height_factor, params->dilation_width_factor, height,
|
|
||||||
width, filter_height, filter_width, padding, &out_height, &out_width);
|
|
||||||
|
|
||||||
// Note that quantized inference requires that all tensors have their
|
|
||||||
// parameters set. This is usually done during quantized training.
|
|
||||||
if (data_type != kTfLiteFloat32) {
|
|
||||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
|
||||||
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
|
|
||||||
const TfLiteTensor* bias =
|
|
||||||
GetOptionalInputTensor(context, node, kBiasTensor);
|
|
||||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
|
||||||
int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
|
|
||||||
|
|
||||||
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
|
|
||||||
context, input, filter, bias, output, params->activation,
|
|
||||||
&data->output_multiplier, &data->output_shift,
|
|
||||||
&data->output_activation_min, &data->output_activation_max,
|
|
||||||
data->per_channel_output_multiplier,
|
|
||||||
reinterpret_cast<int*>(data->per_channel_output_shift), num_channels));
|
|
||||||
}
|
|
||||||
return kTfLiteOk;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Specialized implementation of the depthwise convolution operation designed to
|
|
||||||
// work with the particular filter width of eight used by the default micro
|
|
||||||
// speech sample code. It uses 1KB of RAM to hold reordered weight parameters,
|
|
||||||
// converted from TFLite's NHWC format to NCHW format, and expressed as signed
|
|
||||||
// eight bit integers, rather than unsigned. Care must be taken when calling
|
|
||||||
// this not to use it for more than one node since there's only a single static
|
|
||||||
// buffer holding the weights. You should use this implementation if depthwise
|
|
||||||
// convolutions are a performance bottleneck, you have a layer that meets the
|
|
||||||
// parameter requirements, and the extra RAM usage and additional code size are
|
|
||||||
// not an issue.
|
|
||||||
static inline void DepthwiseConvOptimizedForFilterWidthEight(
|
|
||||||
TfLiteContext* context, const DepthwiseParams& params,
|
|
||||||
const RuntimeShape& input_shape, const uint8* input_data,
|
|
||||||
const RuntimeShape& filter_shape, const uint8* filter_data,
|
|
||||||
const RuntimeShape& bias_shape, const int32* bias_data,
|
|
||||||
const RuntimeShape& output_shape, uint8* output_data) {
|
|
||||||
const int stride_width = params.stride_width;
|
|
||||||
const int stride_height = params.stride_height;
|
|
||||||
const int pad_width = params.padding_values.width;
|
|
||||||
const int pad_height = params.padding_values.height;
|
|
||||||
const int depth_multiplier = params.depth_multiplier;
|
|
||||||
const int32 output_activation_min = params.quantized_activation_min;
|
|
||||||
const int32 output_activation_max = params.quantized_activation_max;
|
|
||||||
const int32 input_offset = params.input_offset;
|
|
||||||
const int32 filter_offset = params.weights_offset;
|
|
||||||
const int32 output_offset = params.output_offset;
|
|
||||||
const int32 output_multiplier = params.output_multiplier;
|
|
||||||
const int output_shift = params.output_shift;
|
|
||||||
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
|
||||||
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
|
||||||
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
|
||||||
|
|
||||||
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
|
||||||
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
|
||||||
const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
|
|
||||||
const int input_height = input_shape.Dims(1);
|
|
||||||
const int input_width = input_shape.Dims(2);
|
|
||||||
const int input_depth = input_shape.Dims(3);
|
|
||||||
const int filter_height = filter_shape.Dims(1);
|
|
||||||
const int filter_width = filter_shape.Dims(2);
|
|
||||||
const int output_height = output_shape.Dims(1);
|
|
||||||
const int output_width = output_shape.Dims(2);
|
|
||||||
TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
|
|
||||||
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
|
|
||||||
|
|
||||||
static int16_t reshaped_filter_data[kReshapedFilterDataSize];
|
|
||||||
const int needed_size =
|
|
||||||
output_depth * filter_width * filter_height * input_depth;
|
|
||||||
if (needed_size > kReshapedFilterDataSize) {
|
|
||||||
TF_LITE_KERNEL_LOG(
|
|
||||||
context,
|
|
||||||
"Size too large for reshaped weight buffer (%d needed, %d available)",
|
|
||||||
needed_size, kReshapedFilterDataSize);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
RuntimeShape reshaped_filter_shape;
|
|
||||||
reshaped_filter_shape.BuildFrom(
|
|
||||||
{1, output_depth, filter_height, filter_width});
|
|
||||||
|
|
||||||
// If this is the first time through, repack the weights into a cached buffer
|
|
||||||
// so that they can be accessed sequentially.
|
|
||||||
static bool is_reshaped_filter_initialized = false;
|
|
||||||
if (!is_reshaped_filter_initialized) {
|
|
||||||
for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
|
|
||||||
for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
|
|
||||||
for (int oc = 0; oc < output_depth; ++oc) {
|
|
||||||
const uint8* current_filter =
|
|
||||||
filter_data + Offset(filter_shape, 0, filter_y, filter_x, oc);
|
|
||||||
int16_t* reshaped_filter =
|
|
||||||
reshaped_filter_data +
|
|
||||||
Offset(reshaped_filter_shape, 0, oc, filter_y, filter_x);
|
|
||||||
*reshaped_filter =
|
|
||||||
static_cast<int16_t>(*current_filter) + filter_offset;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
is_reshaped_filter_initialized = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int b = 0; b < batches; ++b) {
|
|
||||||
for (int out_y = 0; out_y < output_height; ++out_y) {
|
|
||||||
for (int out_x = 0; out_x < output_width; ++out_x) {
|
|
||||||
for (int ic = 0; ic < input_depth; ++ic) {
|
|
||||||
for (int m = 0; m < depth_multiplier; m++) {
|
|
||||||
const int oc = m + ic * depth_multiplier;
|
|
||||||
const int in_x_origin = (out_x * stride_width) - pad_width;
|
|
||||||
const int in_y_origin = (out_y * stride_height) - pad_height;
|
|
||||||
int32 acc = 0;
|
|
||||||
int in_y_start = in_y_origin;
|
|
||||||
int filter_y_start = 0;
|
|
||||||
if (in_y_origin < 0) {
|
|
||||||
in_y_start = 0;
|
|
||||||
filter_y_start = 0 - in_y_origin;
|
|
||||||
}
|
|
||||||
int filter_y_end = filter_height;
|
|
||||||
if ((in_y_origin + filter_height) >= input_height) {
|
|
||||||
filter_y_end -= (in_y_origin + filter_height) - input_height;
|
|
||||||
}
|
|
||||||
int in_y = in_y_start;
|
|
||||||
int in_x_start = in_x_origin;
|
|
||||||
int filter_x_start = 0;
|
|
||||||
bool is_out_of_x_bounds = false;
|
|
||||||
if (in_x_origin < 0) {
|
|
||||||
in_x_start = 0;
|
|
||||||
filter_x_start = 0 - in_x_origin;
|
|
||||||
is_out_of_x_bounds = true;
|
|
||||||
}
|
|
||||||
int filter_x_end = filter_width;
|
|
||||||
if ((in_x_origin + filter_width) >= input_width) {
|
|
||||||
filter_x_end -= (in_x_origin + filter_width) - input_width;
|
|
||||||
is_out_of_x_bounds = true;
|
|
||||||
}
|
|
||||||
for (int filter_y = filter_y_start; filter_y < filter_y_end;
|
|
||||||
++filter_y, ++in_y) {
|
|
||||||
const uint8* current_input =
|
|
||||||
input_data + Offset(input_shape, b, in_y, in_x_start, ic);
|
|
||||||
if ((filter_width == 8) && !is_out_of_x_bounds) {
|
|
||||||
int16* current_filter =
|
|
||||||
reshaped_filter_data + Offset(reshaped_filter_shape, 0, oc,
|
|
||||||
filter_y, filter_x_start);
|
|
||||||
const uint32_t input_vals0 =
|
|
||||||
*reinterpret_cast<const uint32_t*>(current_input);
|
|
||||||
current_input += 4;
|
|
||||||
const int32_t filter_vals0 =
|
|
||||||
*reinterpret_cast<const int32_t*>(current_filter);
|
|
||||||
current_filter += 2;
|
|
||||||
const uint8 input_val0 = input_vals0 & 0xff;
|
|
||||||
const int16 filter_val0 = filter_vals0 & 0xffff;
|
|
||||||
acc += filter_val0 * input_val0;
|
|
||||||
const uint8 input_val1 = (input_vals0 >> 8) & 0xff;
|
|
||||||
const int16 filter_val1 = (filter_vals0 >> 16) & 0xffff;
|
|
||||||
acc += filter_val1 * input_val1;
|
|
||||||
|
|
||||||
const int32_t filter_vals1 =
|
|
||||||
*reinterpret_cast<const int32_t*>(current_filter);
|
|
||||||
current_filter += 2;
|
|
||||||
const uint8 input_val2 = (input_vals0 >> 16) & 0xff;
|
|
||||||
const int16 filter_val2 = filter_vals1 & 0xffff;
|
|
||||||
acc += filter_val2 * input_val2;
|
|
||||||
const uint8 input_val3 = (input_vals0 >> 24) & 0xff;
|
|
||||||
const int16 filter_val3 = (filter_vals1 >> 16) & 0xffff;
|
|
||||||
acc += filter_val3 * input_val3;
|
|
||||||
|
|
||||||
const uint32_t input_vals1 =
|
|
||||||
*reinterpret_cast<const uint32_t*>(current_input);
|
|
||||||
const int32_t filter_vals2 =
|
|
||||||
*reinterpret_cast<const int32_t*>(current_filter);
|
|
||||||
current_filter += 2;
|
|
||||||
const uint8 input_val4 = input_vals1 & 0xff;
|
|
||||||
const int16 filter_val4 = filter_vals2 & 0xffff;
|
|
||||||
acc += filter_val4 * input_val4;
|
|
||||||
const uint8 input_val5 = (input_vals1 >> 8) & 0xff;
|
|
||||||
const int16 filter_val5 = (filter_vals2 >> 16) & 0xffff;
|
|
||||||
acc += filter_val5 * input_val5;
|
|
||||||
|
|
||||||
const int32_t filter_vals3 =
|
|
||||||
*reinterpret_cast<const int32_t*>(current_filter);
|
|
||||||
const uint8 input_val6 = (input_vals1 >> 16) & 0xff;
|
|
||||||
const int16 filter_val6 = filter_vals3 & 0xffff;
|
|
||||||
acc += filter_val6 * input_val6;
|
|
||||||
const uint8 input_val7 = (input_vals1 >> 24) & 0xff;
|
|
||||||
const int16 filter_val7 = (filter_vals3 >> 16) & 0xffff;
|
|
||||||
acc += filter_val7 * input_val7;
|
|
||||||
} else {
|
|
||||||
const uint8* current_filter =
|
|
||||||
filter_data +
|
|
||||||
Offset(filter_shape, 0, filter_y, filter_x_start, oc);
|
|
||||||
for (int filter_x = filter_x_start; filter_x < filter_x_end;
|
|
||||||
++filter_x) {
|
|
||||||
int32 input_val = *current_input;
|
|
||||||
current_input += input_depth;
|
|
||||||
int32 filter_val = *current_filter;
|
|
||||||
current_filter += output_depth;
|
|
||||||
acc +=
|
|
||||||
(filter_val + filter_offset) * (input_val + input_offset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (bias_data) {
|
|
||||||
acc += bias_data[oc];
|
|
||||||
}
|
|
||||||
acc = reference_ops::depthwise_conv::DepthwiseConvRound<
|
|
||||||
DepthwiseConvOutputRounding::kAwayFromZero>(
|
|
||||||
acc, output_multiplier, output_shift);
|
|
||||||
acc += output_offset;
|
|
||||||
acc = std::max(acc, output_activation_min);
|
|
||||||
acc = std::min(acc, output_activation_max);
|
|
||||||
output_data[Offset(output_shape, b, out_y, out_x, oc)] =
|
|
||||||
static_cast<uint8>(acc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
|
||||||
TfLiteDepthwiseConvParams* params, OpData* data,
|
|
||||||
const TfLiteTensor* input, const TfLiteTensor* filter,
|
|
||||||
const TfLiteTensor* bias, TfLiteTensor* output) {
|
|
||||||
float output_activation_min, output_activation_max;
|
|
||||||
CalculateActivationRange(params->activation, &output_activation_min,
|
|
||||||
&output_activation_max);
|
|
||||||
|
|
||||||
tflite::DepthwiseParams op_params;
|
|
||||||
// Padding type is ignored, but still set.
|
|
||||||
op_params.padding_type = PaddingType::kSame;
|
|
||||||
op_params.padding_values.width = data->padding.width;
|
|
||||||
op_params.padding_values.height = data->padding.height;
|
|
||||||
op_params.stride_width = params->stride_width;
|
|
||||||
op_params.stride_height = params->stride_height;
|
|
||||||
op_params.dilation_width_factor = 1;
|
|
||||||
op_params.dilation_height_factor = 1;
|
|
||||||
op_params.depth_multiplier = params->depth_multiplier;
|
|
||||||
op_params.float_activation_min = output_activation_min;
|
|
||||||
op_params.float_activation_max = output_activation_max;
|
|
||||||
|
|
||||||
tflite::reference_ops::DepthwiseConv(
|
|
||||||
op_params, GetTensorShape(input), GetTensorData<float>(input),
|
|
||||||
GetTensorShape(filter), GetTensorData<float>(filter),
|
|
||||||
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
|
|
||||||
GetTensorData<float>(output));
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO(njeff): Optimize for int8 like we do for uint8.
|
|
||||||
|
|
||||||
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
|
|
||||||
TfLiteDepthwiseConvParams* params, OpData* data,
|
|
||||||
const TfLiteTensor* input,
|
|
||||||
const TfLiteTensor* filter,
|
|
||||||
const TfLiteTensor* bias, TfLiteTensor* output) {
|
|
||||||
DepthwiseParams op_params;
|
|
||||||
op_params.padding_type = PaddingType::kSame;
|
|
||||||
op_params.padding_values.width = data->padding.width;
|
|
||||||
op_params.padding_values.height = data->padding.height;
|
|
||||||
op_params.stride_width = params->stride_width;
|
|
||||||
op_params.stride_height = params->stride_height;
|
|
||||||
op_params.dilation_width_factor = params->dilation_width_factor;
|
|
||||||
op_params.dilation_height_factor = params->dilation_height_factor;
|
|
||||||
op_params.depth_multiplier = params->depth_multiplier;
|
|
||||||
op_params.input_offset = -input->params.zero_point;
|
|
||||||
op_params.weights_offset = 0;
|
|
||||||
op_params.output_offset = output->params.zero_point;
|
|
||||||
// TODO(b/130439627): Use calculated value for clamping.
|
|
||||||
op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
|
|
||||||
op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
|
|
||||||
|
|
||||||
reference_integer_ops::DepthwiseConvPerChannel(
|
|
||||||
op_params, data->per_channel_output_multiplier,
|
|
||||||
data->per_channel_output_shift, GetTensorShape(input),
|
|
||||||
GetTensorData<int8>(input), GetTensorShape(filter),
|
|
||||||
GetTensorData<int8>(filter), GetTensorShape(bias),
|
|
||||||
GetTensorData<int32>(bias), GetTensorShape(output),
|
|
||||||
GetTensorData<int8>(output));
|
|
||||||
}
|
|
||||||
|
|
||||||
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
|
||||||
TfLiteDepthwiseConvParams* params, OpData* data,
|
|
||||||
const TfLiteTensor* input, const TfLiteTensor* filter,
|
|
||||||
const TfLiteTensor* bias, TfLiteTensor* output) {
|
|
||||||
const int32_t input_offset = -input->params.zero_point;
|
|
||||||
const int32_t filter_offset = -filter->params.zero_point;
|
|
||||||
const int32_t output_offset = output->params.zero_point;
|
|
||||||
|
|
||||||
tflite::DepthwiseParams op_params;
|
|
||||||
// Padding type is ignored, but still set.
|
|
||||||
op_params.padding_type = PaddingType::kSame;
|
|
||||||
op_params.padding_values.width = data->padding.width;
|
|
||||||
op_params.padding_values.height = data->padding.height;
|
|
||||||
op_params.stride_width = params->stride_width;
|
|
||||||
op_params.stride_height = params->stride_height;
|
|
||||||
op_params.dilation_width_factor = 1;
|
|
||||||
op_params.dilation_height_factor = 1;
|
|
||||||
op_params.depth_multiplier = params->depth_multiplier;
|
|
||||||
op_params.quantized_activation_min = data->output_activation_min;
|
|
||||||
op_params.quantized_activation_max = data->output_activation_max;
|
|
||||||
op_params.input_offset = input_offset;
|
|
||||||
op_params.weights_offset = filter_offset;
|
|
||||||
op_params.output_offset = output_offset;
|
|
||||||
op_params.output_multiplier = data->output_multiplier;
|
|
||||||
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
|
|
||||||
op_params.output_shift = -data->output_shift;
|
|
||||||
|
|
||||||
// Figure out if we can use the optimized path for this set of parameters.
|
|
||||||
const int filter_width = GetTensorShape(filter).Dims(2);
|
|
||||||
const int input_depth = GetTensorShape(input).Dims(3);
|
|
||||||
const int output_depth = GetTensorShape(filter).Dims(3);
|
|
||||||
const int filter_height = GetTensorShape(filter).Dims(1);
|
|
||||||
const int needed_size =
|
|
||||||
output_depth * filter_width * filter_height * input_depth;
|
|
||||||
bool use_optimized_path = false;
|
|
||||||
if ((filter_width == 8) && (input_offset == 0) && (input_depth == 1) &&
|
|
||||||
(needed_size <= kReshapedFilterDataSize)) {
|
|
||||||
// FIXME(petewarden) - We need a more robust way of handling this, ideally
|
|
||||||
// with an allocation mechanism available through the context API.
|
|
||||||
// Use the address of the node as a proxy for its identity, since we need
|
|
||||||
// to ensure the weight values are consistent between calls, and there's
|
|
||||||
// no easy way to do that quickly other than relying on the identity of
|
|
||||||
// the owning node.
|
|
||||||
static TfLiteNode* initialized_node_address = node;
|
|
||||||
if (initialized_node_address == node) {
|
|
||||||
use_optimized_path = true;
|
|
||||||
} else {
|
|
||||||
static bool has_warned = false;
|
|
||||||
if (!has_warned) {
|
|
||||||
TF_LITE_KERNEL_LOG(
|
|
||||||
context,
|
|
||||||
"Multiple depthwise conv ops match optimization parameters, but "
|
|
||||||
"only the first will use the fast path, because there's only one "
|
|
||||||
"RAM cache available");
|
|
||||||
has_warned = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (use_optimized_path) {
|
|
||||||
DepthwiseConvOptimizedForFilterWidthEight(
|
|
||||||
context, op_params, GetTensorShape(input),
|
|
||||||
GetTensorData<uint8_t>(input), GetTensorShape(filter),
|
|
||||||
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
|
|
||||||
GetTensorData<int32_t>(bias), GetTensorShape(output),
|
|
||||||
GetTensorData<uint8_t>(output));
|
|
||||||
} else {
|
|
||||||
tflite::reference_ops::DepthwiseConv(
|
|
||||||
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
|
|
||||||
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
|
|
||||||
GetTensorShape(bias), GetTensorData<int32_t>(bias),
|
|
||||||
GetTensorShape(output), GetTensorData<uint8_t>(output));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
|
||||||
auto* params =
|
|
||||||
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
|
|
||||||
|
|
||||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
|
||||||
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
|
|
||||||
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
|
|
||||||
const TfLiteTensor* bias =
|
|
||||||
(NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
|
|
||||||
|
|
||||||
const TfLiteType data_type = input->type;
|
|
||||||
int width = SizeOfDimension(input, 2);
|
|
||||||
int height = SizeOfDimension(input, 1);
|
|
||||||
int filter_width = SizeOfDimension(filter, 2);
|
|
||||||
int filter_height = SizeOfDimension(filter, 1);
|
|
||||||
int out_width = ComputeOutSize(params->padding, width, filter_width,
|
|
||||||
params->stride_width);
|
|
||||||
int out_height = ComputeOutSize(params->padding, height, filter_height,
|
|
||||||
params->stride_height);
|
|
||||||
OpData data;
|
|
||||||
|
|
||||||
// All per-channel quantized tensors need valid zero point and scale arrays.
|
|
||||||
if (input->type == kTfLiteInt8) {
|
|
||||||
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
|
|
||||||
kTfLiteAffineQuantization);
|
|
||||||
|
|
||||||
const auto* affine_quantization =
|
|
||||||
reinterpret_cast<TfLiteAffineQuantization*>(
|
|
||||||
filter->quantization.params);
|
|
||||||
TF_LITE_ENSURE(context, affine_quantization);
|
|
||||||
TF_LITE_ENSURE(context, affine_quantization->scale);
|
|
||||||
TF_LITE_ENSURE(context, affine_quantization->zero_point);
|
|
||||||
TF_LITE_ENSURE(
|
|
||||||
context, affine_quantization->scale->size == 1 ||
|
|
||||||
affine_quantization->scale->size ==
|
|
||||||
filter->dims->data[kDepthwiseConvQuantizedDimension]);
|
|
||||||
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
|
|
||||||
affine_quantization->zero_point->size);
|
|
||||||
}
|
|
||||||
|
|
||||||
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
|
|
||||||
filter_width, filter_height, out_width,
|
|
||||||
out_height, data_type, &data));
|
|
||||||
|
|
||||||
// TODO(aselle): Consider whether float conv and quantized conv should be
|
|
||||||
// separate ops to avoid dispatch overhead here.
|
|
||||||
switch (input->type) { // Already know in/out types are same.
|
|
||||||
case kTfLiteFloat32:
|
|
||||||
EvalFloat(context, node, params, &data, input, filter, bias, output);
|
|
||||||
break;
|
|
||||||
case kTfLiteInt8:
|
|
||||||
EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
|
|
||||||
output);
|
|
||||||
break;
|
|
||||||
case kTfLiteUInt8:
|
|
||||||
EvalQuantized(context, node, params, &data, input, filter, bias, output);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
|
|
||||||
TfLiteTypeGetName(input->type), input->type);
|
|
||||||
return kTfLiteError;
|
|
||||||
}
|
|
||||||
return kTfLiteOk;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace depthwise_conv
|
|
||||||
|
|
||||||
TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
|
|
||||||
static TfLiteRegistration r = {/*init=*/nullptr,
|
|
||||||
/*free=*/nullptr,
|
|
||||||
/*prepare=*/nullptr,
|
|
||||||
/*invoke=*/depthwise_conv::Eval,
|
|
||||||
/*profiling_string=*/nullptr,
|
|
||||||
/*builtin_code=*/0,
|
|
||||||
/*custom_name=*/nullptr,
|
|
||||||
/*version=*/0};
|
|
||||||
return &r;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace micro
|
|
||||||
} // namespace ops
|
|
||||||
} // namespace tflite
|
|
||||||
@ -49,7 +49,7 @@ fi
|
|||||||
|
|
||||||
make -f tensorflow/lite/micro/tools/make/Makefile \
|
make -f tensorflow/lite/micro/tools/make/Makefile \
|
||||||
TARGET=${TARGET} \
|
TARGET=${TARGET} \
|
||||||
TAGS="portable_optimized disco_f746ng" \
|
TAGS="disco_f746ng" \
|
||||||
${PROJECTS}
|
${PROJECTS}
|
||||||
|
|
||||||
readable_run tensorflow/lite/micro/tools/ci_build/install_mbed_cli.sh
|
readable_run tensorflow/lite/micro/tools/ci_build/install_mbed_cli.sh
|
||||||
|
|||||||
@ -24,9 +24,6 @@ ifeq ($(TARGET),$(filter $(TARGET),\
|
|||||||
$(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)/$(SF_BSPS_DEST): $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)
|
$(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)/$(SF_BSPS_DEST): $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# Use the faster depthwise conv implementation.
|
|
||||||
ALL_TAGS += portable_optimized
|
|
||||||
|
|
||||||
PLATFORM_FLAGS = \
|
PLATFORM_FLAGS = \
|
||||||
-DPART_apollo3 \
|
-DPART_apollo3 \
|
||||||
-DAM_PACKAGE_BGA \
|
-DAM_PACKAGE_BGA \
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user