From dd473010edc8959354128138e42628f8c87f0fa2 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Tue, 5 Mar 2019 15:53:49 -0800 Subject: [PATCH] Optimize depthwise conv for particular filter width used in micro speech example PiperOrigin-RevId: 236937295 --- .../sparkfun_edge/command_responder.cc | 16 +- .../lite/experimental/micro/kernels/BUILD | 49 ++ .../micro/kernels/depthwise_conv_test.cc | 85 ++++ .../portable_optimized/depthwise_conv.cc | 439 ++++++++++++++++++ .../experimental/micro/tools/make/Makefile | 4 +- .../make/targets/apollo3evb_makefile.inc | 3 + .../micro/tools/make/targets/osx_makefile.inc | 3 + 7 files changed, 594 insertions(+), 5 deletions(-) create mode 100644 tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc index f1ad9b017e9..78469f2b7d7 100644 --- a/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc +++ b/tensorflow/lite/experimental/micro/examples/micro_speech/sparkfun_edge/command_responder.cc @@ -33,21 +33,29 @@ void RespondToCommand(tflite::ErrorReporter* error_reporter, } static int count = 0; - // Toggle the yellow LED every time an inference is performed. + // Toggle the blue LED every time an inference is performed. ++count; if (count & 1) { - am_hal_gpio_output_set(AM_BSP_GPIO_LED_YELLOW); + am_hal_gpio_output_set(AM_BSP_GPIO_LED_BLUE); } else { - am_hal_gpio_output_clear(AM_BSP_GPIO_LED_YELLOW); + am_hal_gpio_output_clear(AM_BSP_GPIO_LED_BLUE); } - // Turn on the red LED if 'yes' was heard. + // Turn on the yellow LED if 'yes' was heard. am_hal_gpio_output_clear(AM_BSP_GPIO_LED_RED); + am_hal_gpio_output_clear(AM_BSP_GPIO_LED_YELLOW); + am_hal_gpio_output_clear(AM_BSP_GPIO_LED_GREEN); if (is_new_command) { error_reporter->Report("Heard %s (%d) @%dms", found_command, score, current_time); if (found_command[0] == 'y') { + am_hal_gpio_output_set(AM_BSP_GPIO_LED_YELLOW); + } + if (found_command[0] == 'n') { am_hal_gpio_output_set(AM_BSP_GPIO_LED_RED); } + if (found_command[0] == 'u') { + am_hal_gpio_output_set(AM_BSP_GPIO_LED_GREEN); + } } } diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD index e2d3164d4c3..451eed28528 100644 --- a/tensorflow/lite/experimental/micro/kernels/BUILD +++ b/tensorflow/lite/experimental/micro/kernels/BUILD @@ -46,6 +46,42 @@ cc_library( ], ) +cc_library( + name = "portable_optimized_micro_ops", + srcs = [ + "fully_connected.cc", + "portable_optimized/depthwise_conv.cc", + "softmax.cc", + ], + hdrs = [ + ], + copts = tflite_copts(), + deps = [ + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/kernels:kernel_util", + "//tensorflow/lite/kernels:op_macros", + "//tensorflow/lite/kernels:padding", + "//tensorflow/lite/kernels/internal:quantization_util", + "//tensorflow/lite/kernels/internal:reference_base", + "//tensorflow/lite/kernels/internal:tensor", + ], +) + +cc_library( + name = "portable_optimized_ops_resolver", + srcs = [ + "all_ops_resolver.cc", + ], + hdrs = [ + "all_ops_resolver.h", + ], + copts = tflite_copts(), + deps = [ + ":portable_optimized_micro_ops", + "//tensorflow/lite/experimental/micro:micro_framework", + ], +) + tflite_micro_cc_test( name = "depthwise_conv_test", srcs = [ @@ -59,6 +95,19 @@ tflite_micro_cc_test( ], ) +tflite_micro_cc_test( + name = 
"portable_optimized_depthwise_conv_test", + srcs = [ + "depthwise_conv_test.cc", + ], + deps = [ + ":portable_optimized_ops_resolver", + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/experimental/micro:micro_framework", + "//tensorflow/lite/experimental/micro/testing:micro_test", + ], +) + tflite_micro_cc_test( name = "fully_connected_test", srcs = [ diff --git a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc index 05ba8798c0d..ff952b39c00 100644 --- a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc +++ b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc @@ -50,6 +50,11 @@ void TestDepthwiseConvFloat(std::initializer_list input_dims_data, CreateFloatTensor(output_data, output_dims, "output_tensor"), }; + // Place a unique value in the uninitialized output buffer. + for (int i = 0; i < output_dims_count; ++i) { + output_data[i] = 23; + } + TfLiteContext context; PopulateContext(tensors, tensors_size, &context); @@ -403,4 +408,84 @@ TF_LITE_MICRO_TEST(SimpleTestReluQuantized) { kTfLiteActRelu, output_data); } +TF_LITE_MICRO_TEST(SimpleTestOptimizedFilterWidth) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = 0; + const float input_max = 255.0f; + const float filter_min = -63.5f; + const float filter_max = 64.0f; + const float bias_min = 0.0f; + const float bias_max = 128.0f * (1 << 24); + const float output_min = -127.0f; + const float output_max = 128.0f; + const int output_dims_count = 9; + uint8_t output_data[output_dims_count]; + + tflite::testing::TestDepthwiseConvQuantized( // + {4, 1, 1, 9, 1}, // Input shape. + { + // Input values. + F2Q(1, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(7, input_min, input_max), + F2Q(8, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(9, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(5, input_min, input_max), + F2Q(6, input_min, input_max), + F2Q(11, input_min, input_max), + F2Q(12, input_min, input_max), + }, + input_min, input_max, // Input quantization range. + {4, 2, 1, 8, 1}, // Filter shape. + { + // Filter values. + F2Q(1, filter_min, filter_max), + F2Q(2, filter_min, filter_max), + F2Q(3, filter_min, filter_max), + F2Q(4, filter_min, filter_max), + F2Q(-9, filter_min, filter_max), + F2Q(10, filter_min, filter_max), + F2Q(-11, filter_min, filter_max), + F2Q(12, filter_min, filter_max), + F2Q(5, filter_min, filter_max), + F2Q(6, filter_min, filter_max), + F2Q(7, filter_min, filter_max), + F2Q(8, filter_min, filter_max), + F2Q(13, filter_min, filter_max), + F2Q(-14, filter_min, filter_max), + F2Q(15, filter_min, filter_max), + F2Q(-16, filter_min, filter_max), + }, + filter_min, filter_max, // Filter quantization range. + {1, 1}, // Bias shape. + { + // Bias values. + F2Q32(1, bias_min, bias_max), + F2Q32(2, bias_min, bias_max), + F2Q32(3, bias_min, bias_max), + F2Q32(4, bias_min, bias_max), + }, + bias_min, bias_max, // Bias quantization range. + { + // Expected results. + 220, + 184, + 140, + 150, + 161, + 200, + 172, + 148, + 133, + }, + {4, 1, 1, 9, 1}, // Output shape. + output_min, output_max, // Output quantization range. 
+ kTfLiteActNone, output_data); +} + TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc b/tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc new file mode 100644 index 00000000000..f1ddf6b0733 --- /dev/null +++ b/tensorflow/lite/experimental/micro/kernels/portable_optimized/depthwise_conv.cc @@ -0,0 +1,439 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/padding.h" + +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" +#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" + +namespace tflite { +namespace ops { +namespace micro { +namespace depthwise_conv { +namespace { + +constexpr int kInputTensor = 0; +constexpr int kFilterTensor = 1; +constexpr int kBiasTensor = 2; +constexpr int kOutputTensor = 0; + +// Size of the cached buffer we'll be using to hold reordered weights. +constexpr int kReshapedFilterDataSize = 1 * 1024; + +struct OpData { + TfLitePaddingValues padding; + // The scaling factor from input to output (aka the 'real multiplier') can + // be represented as a fixed point multiplier plus a left shift. + int32_t output_multiplier; + int output_shift; + // The range of the fused activation layer. For example for kNone and + // uint8_t these would be 0 and 255. + int32_t output_activation_min; + int32_t output_activation_max; +}; + +TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, int width, + int height, int filter_width, int filter_height, + int out_width, int out_height, + const TfLiteType data_type, OpData* data) { + data->padding.height = ComputePadding(params->stride_height, 1, height, + filter_height, out_height); + data->padding.width = + ComputePadding(params->stride_width, 1, width, filter_width, out_width); + + // Note that quantized inference requires that all tensors have their + // parameters set. This is usually done during quantized training. 
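+ // The combined scale (input_scale * filter_scale / output_scale) is then
+ // folded into a 32-bit fixed-point multiplier plus a shift. As a
+ // hypothetical worked example, a real multiplier of 0.1875 is normalized to
+ // 0.75 * 2^-2, so QuantizeMultiplier returns round(0.75 * 2^31) = 1610612736
+ // with an exponent of -2, which is stored below as output_shift = 2.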
+ if (data_type != kTfLiteFloat32) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); + const TfLiteTensor* bias = + GetOptionalInputTensor(context, node, kBiasTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + double real_multiplier = 0.0; + TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( + context, input, filter, bias, output, &real_multiplier)); + int exponent; + QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); + data->output_shift = -exponent; + CalculateActivationRangeUint8(params->activation, output, + &data->output_activation_min, + &data->output_activation_max); + } + return kTfLiteOk; +} + +// Specialized implementation of the depthwise convolution operation designed to +// work with the particular filter width of eight used by the default micro +// speech sample code. It uses 1KB of RAM to hold reordered weight parameters, +// converted from TFLite's NHWC format to NCHW format, and expressed as signed +// eight bit integers, rather than unsigned. Care must be taken when calling +// this not to use it for more than one node since there's only a single static +// buffer holding the weights. You should use this implementation if depthwise +// convolutions are a performance bottleneck, you have a layer that meets the +// parameter requirements, and the extra RAM usage and additional code size are +// not an issue. +static inline void DepthwiseConvOptimizedForFilterWidthEight( + TfLiteContext* context, const DepthwiseParams& params, + const RuntimeShape& input_shape, const uint8* input_data, + const RuntimeShape& filter_shape, const uint8* filter_data, + const RuntimeShape& bias_shape, const int32* bias_data, + const RuntimeShape& output_shape, uint8* output_data) { + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32 output_activation_min = params.quantized_activation_min; + const int32 output_activation_max = params.quantized_activation_max; + const int32 input_offset = params.input_offset; + const int32 filter_offset = params.weights_offset; + const int32 output_offset = params.output_offset; + const int32 output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + + TFLITE_DCHECK_LE(output_activation_min, output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + + static int8_t reshaped_filter_data[kReshapedFilterDataSize]; + const int needed_size = + output_depth * filter_width * filter_height * input_depth; + if (needed_size > kReshapedFilterDataSize) { 
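+ // EvalQuantized only selects this kernel when needed_size fits within
+ // kReshapedFilterDataSize, so this branch is a defensive check. As a
+ // hypothetical example, a 10x8 filter over a single-channel input with
+ // eight output channels needs 8 * 8 * 10 * 1 = 640 bytes and fits
+ // comfortably; anything larger than the 1KB cache is rejected here.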
+ context->ReportError(
+ context,
+ "Size too large for reshaped weight buffer (%d needed, %d available)",
+ needed_size, kReshapedFilterDataSize);
+ return;
+ }
+
+ RuntimeShape reshaped_filter_shape;
+ reshaped_filter_shape.BuildFrom(
+ {1, output_depth, filter_height, filter_width});
+
+ // If this is the first time through, repack the weights into a cached buffer
+ // so that they can be accessed sequentially.
+ static bool is_reshaped_filter_initialized = false;
+ if (!is_reshaped_filter_initialized) {
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+ for (int oc = 0; oc < output_depth; ++oc) {
+ const uint8* current_filter =
+ filter_data + Offset(filter_shape, 0, filter_y, filter_x, oc);
+ int8* reshaped_filter =
+ reshaped_filter_data +
+ Offset(reshaped_filter_shape, 0, oc, filter_y, filter_x);
+ *reshaped_filter = (int32_t)(*current_filter) + filter_offset;
+ }
+ }
+ }
+ is_reshaped_filter_initialized = true;
+ }
+
+ for (int b = 0; b < batches; ++b) {
+ for (int out_y = 0; out_y < output_height; ++out_y) {
+ for (int out_x = 0; out_x < output_width; ++out_x) {
+ for (int ic = 0; ic < input_depth; ++ic) {
+ for (int m = 0; m < depth_multiplier; m++) {
+ const int oc = m + ic * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32 acc = 0;
+ int in_y_start = in_y_origin;
+ int filter_y_start = 0;
+ if (in_y_origin < 0) {
+ in_y_start = 0;
+ filter_y_start = 0 - in_y_origin;
+ }
+ int filter_y_end = filter_height;
+ if ((in_y_origin + filter_height) >= input_height) {
+ filter_y_end -= (in_y_origin + filter_height) - input_height;
+ }
+ int in_y = in_y_start;
+ int in_x_start = in_x_origin;
+ int filter_x_start = 0;
+ bool is_out_of_x_bounds = false;
+ if (in_x_origin < 0) {
+ in_x_start = 0;
+ filter_x_start = 0 - in_x_origin;
+ is_out_of_x_bounds = true;
+ }
+ int filter_x_end = filter_width;
+ if ((in_x_origin + filter_width) >= input_width) {
+ filter_x_end -= (in_x_origin + filter_width) - input_width;
+ is_out_of_x_bounds = true;
+ }
+ for (int filter_y = filter_y_start; filter_y < filter_y_end;
+ ++filter_y, ++in_y) {
+ const uint8* current_input =
+ input_data + Offset(input_shape, b, in_y, in_x_start, ic);
+ if ((filter_width == 8) && !is_out_of_x_bounds) {
+ int8* current_filter =
+ reshaped_filter_data + Offset(reshaped_filter_shape, 0, oc,
+ filter_y, filter_x_start);
+ const uint32_t input_vals0 =
+ *reinterpret_cast<const uint32_t*>(current_input);
+ current_input += 4;
+ const int32_t filter_vals0 =
+ *reinterpret_cast<const int32_t*>(current_filter);
+ current_filter += 4;
+ const uint8 input_val0 = input_vals0 & 0xff;
+ const int8 filter_val0 = filter_vals0 & 0xff;
+ acc += filter_val0 * input_val0;
+ const uint8 input_val1 = (input_vals0 >> 8) & 0xff;
+ const int8 filter_val1 = (filter_vals0 >> 8) & 0xff;
+ acc += filter_val1 * input_val1;
+ const uint8 input_val2 = (input_vals0 >> 16) & 0xff;
+ const int8 filter_val2 = (filter_vals0 >> 16) & 0xff;
+ acc += filter_val2 * input_val2;
+ const uint8 input_val3 = (input_vals0 >> 24) & 0xff;
+ const int8 filter_val3 = (filter_vals0 >> 24) & 0xff;
+ acc += filter_val3 * input_val3;
+
+ const uint32_t input_vals1 =
+ *reinterpret_cast<const uint32_t*>(current_input);
+ const int32_t filter_vals1 =
+ *reinterpret_cast<const int32_t*>(current_filter);
+ const uint8 input_val4 = input_vals1 & 0xff;
+ const int8 filter_val4 = filter_vals1 & 0xff;
+ acc += filter_val4 * input_val4;
+ const uint8 input_val5 = (input_vals1 >> 8) & 0xff;
+ const int8 filter_val5 = (filter_vals1 >> 8) & 0xff;
+ acc += filter_val5 * input_val5;
+ const uint8 input_val6 = (input_vals1 >> 16) & 0xff;
+ const int8 filter_val6 = (filter_vals1 >> 16) & 0xff;
+ acc += filter_val6 * input_val6;
+ const uint8 input_val7 = (input_vals1 >> 24) & 0xff;
+ const int8 filter_val7 = (filter_vals1 >> 24) & 0xff;
+ acc += filter_val7 * input_val7;
+ } else {
+ const uint8* current_filter =
+ filter_data +
+ Offset(filter_shape, 0, filter_y, filter_x_start, oc);
+ for (int filter_x = filter_x_start; filter_x < filter_x_end;
+ ++filter_x) {
+ int32 input_val = *current_input;
+ current_input += input_depth;
+ int32 filter_val = *current_filter;
+ current_filter += output_depth;
+ acc +=
+ (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ }
+ }
+ if (bias_data) {
+ acc += bias_data[oc];
+ }
+ acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+ DepthwiseConvOutputRounding::kAwayFromZero>(
+ acc, output_multiplier, output_shift);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+ static_cast<uint8>(acc);
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ return kTfLiteOk;
+}
+
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+ TfLiteDepthwiseConvParams* params, OpData* data,
+ const TfLiteTensor* input, const TfLiteTensor* filter,
+ const TfLiteTensor* bias, TfLiteTensor* output) {
+ float output_activation_min, output_activation_max;
+ CalculateActivationRange(params->activation, &output_activation_min,
+ &output_activation_max);
+
+ tflite::DepthwiseParams op_params;
+ // Padding type is ignored, but still set.
+ op_params.padding_type = PaddingType::kSame;
+ op_params.padding_values.width = data->padding.width;
+ op_params.padding_values.height = data->padding.height;
+ op_params.stride_width = params->stride_width;
+ op_params.stride_height = params->stride_height;
+ op_params.dilation_width_factor = 1;
+ op_params.dilation_height_factor = 1;
+ op_params.depth_multiplier = params->depth_multiplier;
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+
+ tflite::reference_ops::DepthwiseConv(
+ op_params, GetTensorShape(input), GetTensorData<float>(input),
+ GetTensorShape(filter), GetTensorData<float>(filter),
+ GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+ GetTensorData<float>(output));
+}
+
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+ TfLiteDepthwiseConvParams* params, OpData* data,
+ const TfLiteTensor* input, const TfLiteTensor* filter,
+ const TfLiteTensor* bias, TfLiteTensor* output) {
+ const int32_t input_offset = -input->params.zero_point;
+ const int32_t filter_offset = -filter->params.zero_point;
+ const int32_t output_offset = output->params.zero_point;
+
+ tflite::DepthwiseParams op_params;
+ // Padding type is ignored, but still set.
+ op_params.padding_type = PaddingType::kSame;
+ op_params.padding_values.width = data->padding.width;
+ op_params.padding_values.height = data->padding.height;
+ op_params.stride_width = params->stride_width;
+ op_params.stride_height = params->stride_height;
+ op_params.dilation_width_factor = 1;
+ op_params.dilation_height_factor = 1;
+ op_params.depth_multiplier = params->depth_multiplier;
+ op_params.quantized_activation_min = data->output_activation_min;
+ op_params.quantized_activation_max = data->output_activation_max;
+ op_params.input_offset = input_offset;
+ op_params.weights_offset = filter_offset;
+ op_params.output_offset = output_offset;
+ op_params.output_multiplier = data->output_multiplier;
+ // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+ op_params.output_shift = -data->output_shift;
+
+ // Figure out if we can use the optimized path for this set of parameters.
+ const int filter_width = GetTensorShape(filter).Dims(2);
+ const int input_depth = GetTensorShape(input).Dims(3);
+ const int output_depth = GetTensorShape(filter).Dims(3);
+ const int filter_height = GetTensorShape(filter).Dims(1);
+ const int needed_size =
+ output_depth * filter_width * filter_height * input_depth;
+ bool use_optimized_path = false;
+ if ((filter_width == 8) && (input_offset == 0) && (filter_offset == -127) &&
+ (input_depth == 1) && (needed_size <= kReshapedFilterDataSize)) {
+ // FIXME(petewarden) - We need a more robust way of handling this, ideally
+ // with an allocation mechanism available through the context API.
+ // Use the address of the node as a proxy for its identity, since we need
+ // to ensure the weight values are consistent between calls, and there's
+ // no easy way to do that quickly other than relying on the identity of
+ // the owning node.
+ static TfLiteNode* initialized_node_address = node;
+ if (initialized_node_address == node) {
+ use_optimized_path = true;
+ } else {
+ static bool has_warned = false;
+ if (!has_warned) {
+ context->ReportError(
+ context,
+ "Multiple depthwise conv ops match optimization parameters, but "
+ "only the first will use the fast path, because there's only one "
+ "RAM cache available");
+ has_warned = true;
+ }
+ }
+ }
+ if (use_optimized_path) {
+ DepthwiseConvOptimizedForFilterWidthEight(
+ context, op_params, GetTensorShape(input),
+ GetTensorData<uint8_t>(input), GetTensorShape(filter),
+ GetTensorData<uint8_t>(filter), GetTensorShape(bias),
+ GetTensorData<int32_t>(bias), GetTensorShape(output),
+ GetTensorData<uint8_t>(output));
+ } else {
+ tflite::reference_ops::DepthwiseConv(
+ op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+ GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+ GetTensorShape(bias), GetTensorData<int32_t>(bias),
+ GetTensorShape(output), GetTensorData<uint8_t>(output));
+ }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ auto* params =
+ reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+ const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+ const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+ const TfLiteTensor* bias =
+ (NumInputs(node) == 3) ?
GetInput(context, node, kBiasTensor) : nullptr; + + const TfLiteType data_type = input->type; + int width = SizeOfDimension(input, 2); + int height = SizeOfDimension(input, 1); + int filter_width = SizeOfDimension(filter, 2); + int filter_height = SizeOfDimension(filter, 1); + int out_width = ComputeOutSize(params->padding, width, filter_width, + params->stride_width); + int out_height = ComputeOutSize(params->padding, height, filter_height, + params->stride_height); + OpData local_data_object; + OpData* data = &local_data_object; + TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height, + filter_width, filter_height, out_width, + out_height, data_type, data)); + + // TODO(aselle): Consider whether float conv and quantized conv should be + // separate ops to avoid dispatch overhead here. + switch (input->type) { // Already know in/out types are same. + case kTfLiteFloat32: + EvalFloat(context, node, params, data, input, filter, bias, output); + break; + case kTfLiteUInt8: + EvalQuantized(context, node, params, data, input, filter, bias, output); + break; + default: + context->ReportError(context, "Type %d not currently supported.", + input->type); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace depthwise_conv + +TfLiteRegistration* Register_DEPTHWISE_CONV_2D() { + static TfLiteRegistration r = {depthwise_conv::Init, depthwise_conv::Free, + depthwise_conv::Prepare, depthwise_conv::Eval}; + return &r; +} + +} // namespace micro +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile index 1179b285067..e11e8a8cf09 100644 --- a/tensorflow/lite/experimental/micro/tools/make/Makefile +++ b/tensorflow/lite/experimental/micro/tools/make/Makefile @@ -87,7 +87,6 @@ tensorflow/lite/core/api/op_resolver.cc \ tensorflow/lite/kernels/kernel_util.cc \ tensorflow/lite/kernels/internal/quantization_util.cc MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS)) -MICROLITE_CC_SRCS := $(call specialize,$(MICROLITE_CC_SRCS)) MICROLITE_CC_HDRS := \ $(wildcard tensorflow/lite/experimental/micro/*.h) \ @@ -151,6 +150,9 @@ KEIL_PROJECT_FILES := \ # keep this main makefile focused on the sources and dependencies. include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc) +# Call specialize here so that platform-specific tags can be taken into account. +MICROLITE_CC_SRCS := $(call specialize,$(MICROLITE_CC_SRCS)) + ALL_TAGS += $(TARGET_ARCH) ALL_SRCS := \ diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc index 4b3e79849e9..4df26a7bf70 100644 --- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc +++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc @@ -10,6 +10,9 @@ ifeq ($(TARGET),$(filter $(TARGET),apollo3evb sparkfun_edge)) # with the hard interfaces. GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/ + # Use the faster depthwise conv implementation. 
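+ # The portable_optimized tag makes the specialize call in the main Makefile
+ # substitute kernels/portable_optimized/depthwise_conv.cc for the reference
+ # kernel when building for this target.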
+ ALL_TAGS += portable_optimized + PLATFORM_FLAGS = \ -DPART_apollo3 \ -DAM_PACKAGE_BGA \ diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc index 3b91eeff9fd..090b4fa101d 100644 --- a/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc +++ b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc @@ -1,6 +1,9 @@ # Settings for Mac OS platforms. ifeq ($(TARGET), osx) + # Make sure we can find the embedded GCC compiler. + export PATH := ${PATH}:tensorflow/lite/experimental/micro/tools/make/downloads/gcc_embedded/bin/ + PLATFORM_FLAGS = \ -DTF_LITE_DISABLE_X86_NEON