From c0ddb9b4faaa8fea257947386a5a27b9f050710d Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 20 May 2020 15:55:36 +0100 Subject: [PATCH] TFLu: Update CMSIS-NN glue interface in conv.cc The CMSIS-NN glue for TFLu convolution now adopts a wrapper function (arm_convolve_wrapper_s8) to simplify the integration of future optimizations available in CMSIS-NN. The wrapper function is responsible for dispatching the most optimized kernel according to the parameters passed. --- .../lite/micro/kernels/cmsis-nn/conv.cc | 251 ++++++++++-------- .../tools/make/third_party_downloads.inc | 4 +- 2 files changed, 146 insertions(+), 109 deletions(-) diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc b/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc index 34d4e837f65..286e24a508d 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/conv.h" #include "arm_nnfunctions.h" +#include "arm_nn_types.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -116,7 +117,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { #if defined(__ARM_FEATURE_DSP) OpData data; - int32_t buf_size; + int32_t buf_size = 0; auto* params = reinterpret_cast(node->builtin_data); @@ -127,32 +128,51 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { RuntimeShape input_shape = GetTensorShape(input); RuntimeShape output_shape = GetTensorShape(output); - const int input_depth = input_shape.Dims(3); - const int input_width = input->dims->data[2]; - const int input_height = input->dims->data[1]; - const int filter_width = filter->dims->data[2]; - const int filter_height = filter->dims->data[1]; - const int output_width = output->dims->data[2]; - const 
int output_height = output->dims->data[1]; - const int batches = MatchingDim(input_shape, 0, output_shape, 0); + // Initialize cmsis-nn input dimensions + cmsis_nn_dims input_dims; + input_dims.n = MatchingDim(input_shape, 0, output_shape, 0); + input_dims.h = input->dims->data[1]; + input_dims.w = input->dims->data[2]; + input_dims.c = input_shape.Dims(3); + + // Initialize cmsis-nn filter dimensions + cmsis_nn_dims filter_dims; + filter_dims.n = output_shape.Dims(3); + filter_dims.h = filter->dims->data[1]; + filter_dims.w = filter->dims->data[2]; + filter_dims.c = input_dims.c; + + // Initialize cmsis-nn output dimensions + cmsis_nn_dims output_dims; + output_dims.n = input_dims.n; + output_dims.h = output->dims->data[1]; + output_dims.w = output->dims->data[2]; + output_dims.c = output_shape.Dims(3); int* buffer_idx = reinterpret_cast(node->user_data); TF_LITE_ENSURE_STATUS(CalculateOpData( - context, node, params, input_width, input_height, filter_width, - filter_height, output_width, output_height, input->type, &data)); + context, node, params, input_dims.w, input_dims.h, filter_dims.w, + filter_dims.h, output_dims.w, output_dims.h, input->type, &data)); - if (data.padding.width == 0 && data.padding.height == 0 && - (input_depth % 4 == 0) && params->stride_width == 1 && - params->stride_height == 1 && filter_width == 1 && filter_height == 1) { - buf_size = arm_convolve_1x1_s8_fast_get_buffer_size(input_depth); - } else if (output_height == 1 && input_height == 1 && filter_height == 1 && - (output_width % 4 == 0) && batches == 1) { - buf_size = arm_convolve_1_x_n_s8_get_buffer_size(input_depth, filter_width, - filter_height); - } else { - buf_size = arm_convolve_s8_get_buffer_size(input_depth, filter_width, - filter_height); + if(input->type == kTfLiteInt8) { + // Initialize cmsis-nn convolution parameters + cmsis_nn_conv_params conv_params; + conv_params.input_offset = -input->params.zero_point; + conv_params.output_offset = output->params.zero_point; + 
conv_params.stride.h = params->stride_height; + conv_params.stride.w = params->stride_width; + conv_params.dilation.h = params->dilation_height_factor; + conv_params.dilation.w = params->dilation_width_factor; + conv_params.padding.h = data.padding.height; + conv_params.padding.w = data.padding.width; + conv_params.activation.min = data.output_activation_min; + conv_params.activation.max = data.output_activation_max; + + buf_size = arm_convolve_wrapper_s8_get_buffer_size(&conv_params, + &input_dims, + &filter_dims, + &output_dims); } node->user_data = buffer_idx; @@ -204,6 +224,107 @@ TfLiteStatus EvalQuantizedPerChannel( TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output, TfLiteTensor* im2col) { + + // Initialize cmsis-nn convolution parameters + cmsis_nn_conv_params conv_params; + conv_params.input_offset = -input->params.zero_point; + conv_params.output_offset = output->params.zero_point; + conv_params.stride.h = params->stride_height; + conv_params.stride.w = params->stride_width; + conv_params.dilation.h = params->dilation_height_factor; + conv_params.dilation.w = params->dilation_width_factor; + conv_params.padding.h = data->padding.height; + conv_params.padding.w = data->padding.width; + conv_params.activation.min = data->output_activation_min; + conv_params.activation.max = data->output_activation_max; + + // Initialize cmsis-nn per channel quantization parameters + cmsis_nn_per_channel_quant_params quant_params; + quant_params.multiplier = data->per_channel_output_multiplier; + quant_params.shift = data->per_channel_output_shift; + +#if defined(__ARM_FEATURE_DSP) + RuntimeShape filter_shape = GetTensorShape(filter); + RuntimeShape input_shape = GetTensorShape(input); + RuntimeShape output_shape = GetTensorShape(output); + RuntimeShape bias_shape = GetTensorShape(bias); + + // Sanity check. 
+ TFLITE_DCHECK_LE(conv_params.activation.min, conv_params.activation.max); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); + const int batch_size = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (GetTensorData(bias)) { + TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); + } + + // Initialize cmsis-nn dimensions + // Input + cmsis_nn_dims input_dims; + input_dims.n = batch_size; + input_dims.h = input_shape.Dims(1); + input_dims.w = input_shape.Dims(2); + input_dims.c = input_depth; + + // Filter + cmsis_nn_dims filter_dims; + filter_dims.n = output_depth; + filter_dims.h = filter_shape.Dims(1); + filter_dims.w = filter_shape.Dims(2); + filter_dims.c = input_depth; + + // Bias + cmsis_nn_dims bias_dims; + bias_dims.n = 1; + bias_dims.h = 1; + bias_dims.w = 1; + bias_dims.c = output_depth; + + // Output + cmsis_nn_dims output_dims; + output_dims.n = batch_size; + output_dims.h = output_shape.Dims(1); + output_dims.w = output_shape.Dims(2); + output_dims.c = output_depth; + + // Initialize cmsis-nn context + cmsis_nn_context ctx; + ctx.buf = nullptr; + ctx.size = 0; + + auto* buffer_idx = reinterpret_cast(node->user_data); + if (*buffer_idx > -1) { + ctx.buf = context->GetScratchBuffer(context, *buffer_idx); + // Note: ctx.size is currently not used in cmsis-nn. 
+ // The buffer should be allocated in the Prepare function through arm_convolve_wrapper_s8_get_buffer_size + } + + // arm_convolve_wrapper_s8 dispatches the optimized kernel accordingly with the parameters passed + arm_status status = arm_convolve_wrapper_s8(&ctx, + &conv_params, + &quant_params, + &input_dims, + GetTensorData(input), + &filter_dims, + GetTensorData(filter), + &bias_dims, + GetTensorData(bias), + &output_dims, + GetTensorData(output)); + + if(status == ARM_MATH_SUCCESS) { + return kTfLiteOk; + } else { + return kTfLiteError; + } + +#else +#pragma message( \ + "CMSIS-NN optimization for conv not available for this target. Using reference kernel.") + ConvParams op_params; op_params.input_offset = -input->params.zero_point; op_params.output_offset = output->params.zero_point; @@ -216,91 +337,6 @@ TfLiteStatus EvalQuantizedPerChannel( op_params.quantized_activation_min = data->output_activation_min; op_params.quantized_activation_max = data->output_activation_max; -#if defined(__ARM_FEATURE_DSP) - RuntimeShape filter_shape = GetTensorShape(filter); - RuntimeShape input_shape = GetTensorShape(input); - RuntimeShape output_shape = GetTensorShape(output); - RuntimeShape bias_shape = GetTensorShape(bias); - - // Set min and max value of the output. - const int32 output_activation_min = std::numeric_limits::min(); - const int32 output_activation_max = std::numeric_limits::max(); - - // Sanity check. 
- TFLITE_DCHECK_LE(output_activation_min, output_activation_max); - TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); - TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); - const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); - if (GetTensorData(bias)) { - TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth); - } - - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); - const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - int16_t* buf = nullptr; - - auto* buffer_idx = reinterpret_cast(node->user_data); - if (*buffer_idx > -1) { - void* raw = context->GetScratchBuffer(context, *buffer_idx); - buf = reinterpret_cast(raw); - } - - if (op_params.padding_values.width == 0 && - op_params.padding_values.height == 0 && (input_depth % 4 == 0) && - op_params.stride_width == 1 && op_params.stride_height == 1 && - filter_width == 1 && filter_height == 1) { - if (arm_convolve_1x1_s8_fast( - GetTensorData(input), input_width, input_height, - input_depth, batches, GetTensorData(filter), output_depth, - op_params.padding_values.width, op_params.padding_values.height, - op_params.stride_width, op_params.stride_height, - GetTensorData(bias), GetTensorData(output), - data->per_channel_output_shift, data->per_channel_output_multiplier, - op_params.output_offset, op_params.input_offset, - output_activation_min, output_activation_max, output_width, - output_height, buf) != ARM_MATH_SUCCESS) { - return kTfLiteError; - } - - } else if (output_height == 1 && input_height == 1 && filter_height == 1 && - (output_width % 4 == 0) && batches == 1) { - if (arm_convolve_1_x_n_s8( - 
GetTensorData(input), input_width, input_depth, batches, - GetTensorData(filter), output_depth, filter_width, - op_params.padding_values.width, op_params.stride_width, - GetTensorData(bias), GetTensorData(output), - data->per_channel_output_shift, data->per_channel_output_multiplier, - op_params.output_offset, op_params.input_offset, - output_activation_min, output_activation_max, output_width, - buf) != ARM_MATH_SUCCESS) { - return kTfLiteError; - } - } else { - if (arm_convolve_s8( - GetTensorData(input), input_width, input_height, - input_depth, batches, GetTensorData(filter), output_depth, - filter_width, filter_height, op_params.padding_values.width, - op_params.padding_values.height, op_params.stride_width, - op_params.stride_height, GetTensorData(bias), - GetTensorData(output), data->per_channel_output_shift, - data->per_channel_output_multiplier, op_params.output_offset, - op_params.input_offset, output_activation_min, - output_activation_max, output_width, output_height, - buf) != ARM_MATH_SUCCESS) { - return kTfLiteError; - } - } -#else -#pragma message( \ - "CMSIS-NN optimization for conv not available for this target. 
Using reference kernel.") - reference_integer_ops::ConvPerChannel( op_params, data->per_channel_output_multiplier, data->per_channel_output_shift, GetTensorShape(input), @@ -420,3 +456,4 @@ TfLiteRegistration* Register_CONV_2D() { } // namespace micro } // namespace ops } // namespace tflite + diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 9251e4c161e..3b6d8f25de8 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -28,8 +28,8 @@ LEON_BCC2_MD5 := "cdf78082be4882da2a92c9baa82fe765" TSIM_URL := "https://www.gaisler.com/anonftp/tsim/tsim-eval-2.0.63.tar.gz" TSIM_MD5 := "afa0095d3ed989a949e1467f94e41d2f" -CMSIS_URL := "https://github.com/ARM-software/CMSIS_5/archive/8a4db53f69da06e97565fe2f2e8926d193a5759d.zip" -CMSIS_MD5 := "e9864fb71b65adc4f7d92a9dea6e1aab" +CMSIS_URL := "https://github.com/ARM-software/CMSIS_5/archive/1150e71e07c79b538efd842aba5b210a31827ae5.zip" +CMSIS_MD5 := "e05f4222ef58825193910b41a0871dcb" AM_SDK_URL := "http://s3.asia.ambiqmicro.com/downloads/AmbiqSuite-Rel2.2.0.zip" AM_SDK_MD5 := "7605fa2d4d97e6bb7a1190c92b66b597"