From fd50feed714e9a765636937bf502ff162ff453a0 Mon Sep 17 00:00:00 2001
From: Daniel Situnayake
Date: Sun, 12 Apr 2020 10:06:29 -0700
Subject: [PATCH 1/3] Enable TensorFlow Lite for Microcontrollers to build
 with CMSIS-NN

---
 tensorflow/lite/micro/kernels/cmsis-nn/README.md     | 11 ++++++++++-
 tensorflow/lite/micro/kernels/cmsis-nn/conv.cc       |  3 ---
 .../lite/micro/tools/make/third_party_downloads.inc  |  4 ++--
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/README.md b/tensorflow/lite/micro/kernels/cmsis-nn/README.md
index 4107ba466db..6224b3b3796 100644
--- a/tensorflow/lite/micro/kernels/cmsis-nn/README.md
+++ b/tensorflow/lite/micro/kernels/cmsis-nn/README.md
@@ -48,7 +48,16 @@ cp tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/DSP/Include/\
 arm_math.h mbed-os/cmsis/TARGET_CORTEX_M/arm_math.h
 ```
 
-This issue will be resolved soon. Now type
+There's also a dependency on an old cmsis_gcc.h, which you can fix with the following:
+
+```
+cp tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/Core/Include/\
+cmsis_gcc.h mbed-os/cmsis/TARGET_CORTEX_M/cmsis_gcc.h
+```
+
+This issue will be resolved soon.
+
+Now type:
 
 ```
 mbed compile -m DISCO_F746NG -t GCC_ARM
diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc b/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc
index 273fdaea65b..8b5a7c028e5 100644
--- a/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc
+++ b/tensorflow/lite/micro/kernels/cmsis-nn/conv.cc
@@ -268,9 +268,6 @@ TfLiteStatus EvalQuantizedPerChannel(
         (output_width % 4 == 0) && batches == 1) {
       const int32_t buf_size = arm_convolve_1_x_n_s8_get_buffer_size(
           input_depth, filter_width, filter_height);
-      if (get_cmsis_scratch_buffer(context, &buf, buf_size) != kTfLiteOk) {
-        return kTfLiteError;
-      }
       if (arm_convolve_1_x_n_s8(
               GetTensorData<int8_t>(input), input_width, input_depth, batches,
              GetTensorData<int8_t>(filter), output_depth, filter_width,
diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc
index 30a27c0a758..a3089e42d44 100644
--- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc
+++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc
@@ -20,8 +20,8 @@ LEON_BCC2_MD5 := "cdf78082be4882da2a92c9baa82fe765"
 TSIM_URL := "https://www.gaisler.com/anonftp/tsim/tsim-eval-2.0.63.tar.gz"
 TSIM_MD5 := "afa0095d3ed989a949e1467f94e41d2f"
 
-CMSIS_URL := "https://github.com/ARM-software/CMSIS_5/archive/b2937134bd2047bd569c4408391ae20d7677d35c.zip"
-CMSIS_MD5 := "04cb3a2cb4834284767a01e8f1c6f834"
+CMSIS_URL := "https://github.com/ARM-software/CMSIS_5/archive/8a4db53f69da06e97565fe2f2e8926d193a5759d.zip"
+CMSIS_MD5 := "e9864fb71b65adc4f7d92a9dea6e1aab"
 
 AM_SDK_URL := "http://s3.asia.ambiqmicro.com/downloads/AmbiqSuite-Rel2.2.0.zip"
 AM_SDK_MD5 := "7605fa2d4d97e6bb7a1190c92b66b597"
From 7ce67938d0f84d5e724ab6cde7adaa78e1756a10 Mon Sep 17 00:00:00 2001
From: Daniel Situnayake
Date: Sun, 12 Apr 2020 19:50:29 -0700
Subject: [PATCH 2/3] Integrate CMSIS-NN optimized function for max pooling

---
 .../lite/micro/kernels/cmsis-nn/pooling.cc | 79 ++++++++++++++++++-
 1 file changed, 75 insertions(+), 4 deletions(-)

diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc
index bf7370ee79a..61f703cf91e 100644
--- a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc
+++ b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc
@@ -145,7 +145,7 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node,
                     ARM_MATH_SUCCESS);
 #else
 #pragma message( \
-    "CMSIS-NN optimization for depthwise_conv not available for this target. Using reference kernel.")
+    "CMSIS-NN optimization for avg_pool not available for this target. Using reference kernel.")
 
   PoolParams op_params;
   op_params.stride_height = params->stride_height;
@@ -166,7 +166,7 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node,
 
 void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
                   TfLitePoolParams* params, OpData* data,
-                  const TfLiteTensor* input, TfLiteTensor* output) {
+                  TfLiteTensor* input, TfLiteTensor* output) {
   float activation_min, activation_max;
   CalculateActivationRange(params->activation, &activation_min,
                            &activation_max);
@@ -187,7 +187,7 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
 
 void MaxEvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node,
                            TfLitePoolParams* params, OpData* data,
-                           const TfLiteTensor* input, TfLiteTensor* output) {
+                           TfLiteTensor* input, TfLiteTensor* output) {
   int32_t activation_min, activation_max;
   (void)CalculateActivationRangeQuantized(context, params->activation, output,
                                           &activation_min, &activation_max);
@@ -206,6 +206,73 @@ void MaxEvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node,
       GetTensorData<uint8_t>(output));
 }
 
+TfLiteStatus MaxEvalInt8(TfLiteContext* context, const TfLiteNode* node,
+                         const TfLitePoolParams* params, const OpData* data,
+                         TfLiteTensor* input, TfLiteTensor* output) {
+  int32_t activation_min, activation_max;
+  (void)CalculateActivationRangeQuantized(context, params->activation, output,
+                                          &activation_min, &activation_max);
+
+  TFLITE_DCHECK_LE(activation_min, activation_max);
+
+#if defined(__ARM_FEATURE_DSP)
+  RuntimeShape input_shape = GetTensorShape(input);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+
+  RuntimeShape output_shape = GetTensorShape(output);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params->stride_height;
+  const int stride_width = params->stride_width;
+
+  const int filter_height = params->filter_height;
+  const int filter_width = params->filter_width;
+  const int padding_height = data->padding.height;
+  const int padding_width = data->padding.width;
+
+  int16_t* scratch_buffer = nullptr;
+
+  auto* buffer_idx = reinterpret_cast<int*>(node->user_data);
+
+  if (*buffer_idx > -1) {
+    void* raw = context->GetScratchBuffer(context, *buffer_idx);
+    scratch_buffer = reinterpret_cast<int16_t*>(raw);
+  }
+
+  TF_LITE_ENSURE_EQ(
+      context,
+      arm_max_pool_s8_opt(input_height, input_width, output_height,
+                          output_width, stride_height, stride_width,
+                          filter_height, filter_width, padding_height,
+                          padding_width, activation_min, activation_max,
+                          depth, GetTensorData<int8_t>(input), scratch_buffer,
+                          GetTensorData<int8_t>(output)),
+      ARM_MATH_SUCCESS);
+#else
+#pragma message( \
+    "CMSIS-NN optimization for max_pool not available for this target. Using reference kernel.")
+
+  PoolParams op_params;
+  op_params.stride_height = params->stride_height;
+  op_params.stride_width = params->stride_width;
+  op_params.filter_height = params->filter_height;
+  op_params.filter_width = params->filter_width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.padding_values.width = data->padding.width;
+  op_params.quantized_activation_min = activation_min;
+  op_params.quantized_activation_max = activation_max;
+  reference_integer_ops::MaxPool(
+      op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+      GetTensorShape(output), GetTensorData<int8_t>(output));
+
+#endif
+  return kTfLiteOk;
+}
+
 }  // namespace
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -277,7 +344,8 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
   OpData data;
 
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* input = &context->tensors[flatbuffers::EndianScalar(
+      node->inputs->data[kInputTensor])];
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
@@ -289,6 +357,9 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteUInt8:
       MaxEvalQuantizedUInt8(context, node, params, &data, input, output);
       break;
+    case kTfLiteInt8:
+      MaxEvalInt8(context, node, params, &data, input, output);
+      break;
     default:
       TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
                          TfLiteTypeGetName(input->type));
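A note on the scratch buffer in `MaxEvalInt8` above: the Eval step only reads a buffer index out of `node->user_data` and resolves it with `context->GetScratchBuffer`; the index itself must be requested while the interpreter plans memory, via TFLM's `RequestScratchBufferInArena`. The sketch below shows that preparation-side pattern; the function name `PrepareInt8Pool` and the `buf_size` value are illustrative assumptions, not part of this patch or of CMSIS-NN's actual buffer-size query.

```
// Sketch only: the preparation-side counterpart to MaxEvalInt8's buffer
// lookup. PrepareInt8Pool and the buf_size calculation are assumptions for
// illustration; a real kernel must size the buffer as CMSIS-NN requires.
TfLiteStatus PrepareInt8Pool(TfLiteContext* context, TfLiteNode* node) {
  int* buffer_idx = reinterpret_cast<int*>(node->user_data);
  *buffer_idx = -1;  // Sentinel: MaxEvalInt8 skips the lookup when negative.
#if defined(__ARM_FEATURE_DSP)
  const TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
  // Assumed size: one int16 element per output channel.
  const size_t buf_size = GetTensorShape(output).Dims(3) * sizeof(int16_t);
  if (context->RequestScratchBufferInArena != nullptr) {
    TF_LITE_ENSURE_STATUS(context->RequestScratchBufferInArena(
        context, buf_size, buffer_idx));
  }
#endif
  return kTfLiteOk;
}
```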
Using reference kernel.") + + PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = activation_min; + op_params.quantized_activation_max = activation_max; + reference_integer_ops::MaxPool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + +#endif + return kTfLiteOk; +} + } // namespace void* Init(TfLiteContext* context, const char* buffer, size_t length) { @@ -277,7 +344,8 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); OpData data; - const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* input = &context->tensors[flatbuffers::EndianScalar( + node->inputs->data[kInputTensor])]; TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data)); @@ -289,6 +357,9 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteUInt8: MaxEvalQuantizedUInt8(context, node, params, &data, input, output); break; + case kTfLiteInt8: + MaxEvalInt8(context, node, params, &data, input, output); + break; default: TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", TfLiteTypeGetName(input->type)); From a9aa8cb2d7caf22d61e0c1b9c12b6e002c3d7bb9 Mon Sep 17 00:00:00 2001 From: Daniel Situnayake Date: Mon, 27 Apr 2020 12:21:39 -0700 Subject: [PATCH 3/3] Ensure `ParseOpData` always has a return value --- tensorflow/lite/core/api/flatbuffer_conversions.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index 998b7d5fbf1..6c861151283 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -913,6 +913,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_SEGMENT_SUM: return kTfLiteOk; } + return kTfLiteError; } // NOLINT[readability/fn_size] } // namespace tflite