From 45d9e41015b9004e1e0c8afaa7e90eaa71c2aeed Mon Sep 17 00:00:00 2001
From: Advait Jain
Date: Tue, 16 Feb 2021 14:33:09 -0800
Subject: [PATCH] Use xa_nnlib for quantize for Fusion F1.

Copied the relevant function call from
https://github.com/pnikam-cad/tensorflow/blob/a737c1e3945bc70022259479ad24133a343ec906/tensorflow/lite/micro/kernels/xtensa_hifi/quantize.cc

Latency for the first quantize op (int16->int8) in the keyword_benchmark
went from 3758 ticks to 800 ticks. Overall latency went from 38516 ticks
to 34253 ticks.

Tested with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade OPTIMIZED_KERNEL_DIR=xtensa run_keyword_benchmark -j8
```

Full output (for completeness):
```
InitializeKeywordRunner took 160568 ticks (160 ms).
KeywordRunNIerations(1) took 34253 ticks (34 ms)
QUANTIZE took 800 ticks (0 ms).
SVDF took 4753 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 3145 ticks (3 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 2890 ticks (2 ms).
SVDF took 3583 ticks (3 ms).
SVDF took 3054 ticks (3 ms).
FULLY_CONNECTED took 1091 ticks (1 ms).
SOFTMAX took 749 ticks (0 ms).
QUANTIZE took 354 ticks (0 ms).
```

Also tested that the kernel test passes with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade OPTIMIZED_KERNEL_DIR=xtensa test_kernel_quantize_test -j8
```

Progress towards http://b/177457688
---
 .../lite/micro/kernels/quantize_test.cc      | 12 +++---
 .../lite/micro/kernels/xtensa/quantize.cc    | 39 ++++++++++++++++---
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/tensorflow/lite/micro/kernels/quantize_test.cc b/tensorflow/lite/micro/kernels/quantize_test.cc
index ad302f0438d..b5da97930fd 100644
--- a/tensorflow/lite/micro/kernels/quantize_test.cc
+++ b/tensorflow/lite/micro/kernels/quantize_test.cc
@@ -49,7 +49,7 @@ void ValidateQuantizeGoldens(TfLiteTensor* tensors, int tensors_size,
   }
 }
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 template <typename T>
 void TestQuantizeFloat(const int* input_dims_data, const float* input_data,
                        const int* output_dims_data, const float* golden,
@@ -79,7 +79,7 @@ void TestQuantizeFloat(const int* input_dims_data, const float* input_data,
   ValidateQuantizeGoldens(tensors, tensors_size, golden, golden_quantized,
                           scale, zero_point, output_dims_count, output_data);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
 template <typename InputType, typename OutputType>
 void TestRequantize(const int* input_dims_data, const float* input_data,
@@ -121,7 +121,7 @@ void TestRequantize(const int* input_dims_data, const float* input_data,
 
 TF_LITE_MICRO_TESTS_BEGIN
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 TF_LITE_MICRO_TEST(QuantizeOpTestUint8) {
   const int length = 10;
   const int dims[] = {2, 2, 5};
@@ -267,9 +267,9 @@ TF_LITE_MICRO_TEST(QuantizeOpTestInt8toInt8NoZeroPoint) {
                                   values_quantized, output_scale,
                                   output_zero_point, output_quantized);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 // TODO(b/155682734): Hifimini optimized quantize requires input scale to be
 // smaller then output scale.
 TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt8) {
@@ -288,7 +288,7 @@ TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt8) {
                                   values_quantized, output_scale,
                                   output_zero_point, output_quantized);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
 TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt32) {
   const int length = 10;
diff --git a/tensorflow/lite/micro/kernels/xtensa/quantize.cc b/tensorflow/lite/micro/kernels/xtensa/quantize.cc
index 3b84e0680f6..5418c69d18e 100644
--- a/tensorflow/lite/micro/kernels/xtensa/quantize.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/quantize.cc
@@ -109,25 +109,55 @@ void AffineQuantize(int scale_multiplier, const int32_t zero_point,
   }
 }
 
-TfLiteStatus EvalHifimini(TfLiteContext* context, TfLiteNode* node) {
+#endif  // defined(HIFIMINI)
+
+TfLiteStatus EvalXtensa(TfLiteContext* context, TfLiteNode* node) {
   TFLITE_DCHECK(node->user_data != nullptr);
+#if defined(HIFIMINI)
   auto* op_data = static_cast<OpData*>(node->user_data);
+#elif defined(FUSION_F1)
+  auto* op_data = static_cast<OpDataQuantizeReference*>(node->user_data);
+#endif
 
   const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
   TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
 
   if (output->type == kTfLiteInt8 && input->type == kTfLiteInt16) {
+#if defined(HIFIMINI)
     AffineQuantize(op_data->scale_multiplier, op_data->zero_point,
                    tflite::micro::GetTensorShape(input),
                    tflite::micro::GetTensorData<int16_t>(input),
                    tflite::micro::GetTensorShape(output),
                    tflite::micro::GetTensorData<int8_t>(output));
+#elif defined(FUSION_F1)
+    int size = ElementCount(*input->dims);
+    TF_LITE_ENSURE_EQ(
+        context,
+        xa_nn_elm_quantize_asym16s_asym8s(
+            tflite::micro::GetTensorData<int8_t>(output),
+            tflite::micro::GetTensorData<int16_t>(input),
+            op_data->input_zero_point, op_data->quantization_params.zero_point,
+            op_data->requantize_output_shift,
+            op_data->requantize_output_multiplier, size),
+        0);
+#else
+    static_assert(false, "Unsupported xtensa architecture.");
+#endif
   } else if (output->type == kTfLiteInt32 && input->type == kTfLiteInt16) {
     int size = ElementCount(*input->dims);
+
+    // This ifdef is only needed because the hifimini code is not following the
+    // convention of the rest of the codebase. Ideally we would be using the
+    // same structs as much as possible and reduce the need for such ifdefs.
+#if defined(HIFIMINI)
+    int32_t zero_point = op_data->zero_point;
+#elif defined(FUSION_F1)
+    int32_t zero_point = op_data->quantization_params.zero_point;
+#endif
     reference_ops::Requantize(tflite::micro::GetTensorData<int16_t>(input),
                               size, op_data->requantize_output_multiplier,
                               op_data->requantize_output_shift,
-                              op_data->input_zero_point, op_data->zero_point,
+                              op_data->input_zero_point, zero_point,
                               tflite::micro::GetTensorData<int32_t>(output));
   } else {
     TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
@@ -137,7 +167,6 @@ TfLiteStatus EvalHifimini(TfLiteContext* context, TfLiteNode* node) {
   }
   return kTfLiteOk;
 }
-#endif  // defined(HIFIMINI)
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
@@ -179,8 +208,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-#if defined(HIFIMINI)
-  return EvalHifimini(context, node);
+#if defined(HIFIMINI) || defined(FUSION_F1)
+  return EvalXtensa(context, node);
 #else
   return EvalQuantizeReference(context, node);
 #endif
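
Reviewer note (not part of the patch): the FUSION_F1 branch feeds the same
OpDataQuantizeReference fields to xa_nn_elm_quantize_asym16s_asym8s that the
int16->int32 branch feeds to reference_ops::Requantize, so the NN-library call
is expected to match the reference requantization semantics. Below is a
minimal sketch of that per-element int16 -> int8 requantization, with the
fixed-point rounding simplified to plain 64-bit arithmetic (the reference
kernel uses MultiplyByQuantizedMultiplier); both function names here are
hypothetical, for illustration only:
```cpp
#include <algorithm>
#include <cstdint>

// Simplified stand-in for TFLite's MultiplyByQuantizedMultiplier: scales
// `value` by (multiplier / 2^31) * 2^shift with round-to-nearest. Assumes
// shift < 31, which holds for requantization multipliers, and differs from
// the real helper only in rounding details near ties.
inline int32_t ScaleByQuantizedMultiplier(int32_t value, int32_t multiplier,
                                          int shift) {
  const int total_shift = 31 - shift;
  const int64_t rounding = int64_t{1} << (total_shift - 1);
  const int64_t product = int64_t{value} * int64_t{multiplier} + rounding;
  return static_cast<int32_t>(product >> total_shift);
}

// Per-element int16 -> int8 requantization using the same parameters the
// kernel reads out of OpDataQuantizeReference.
void RequantizeInt16ToInt8(const int16_t* input, int size, int32_t multiplier,
                           int shift, int32_t input_zero_point,
                           int32_t output_zero_point, int8_t* output) {
  for (int i = 0; i < size; ++i) {
    // Center on the input zero point, rescale, then shift to the output
    // zero point and clamp to the int8 range.
    int32_t value = input[i] - input_zero_point;
    value = ScaleByQuantizedMultiplier(value, multiplier, shift);
    value += output_zero_point;
    value = std::min<int32_t>(std::max<int32_t>(value, -128), 127);
    output[i] = static_cast<int8_t>(value);
  }
}
```
One detail visible in the diff itself: the xa_nnlib call takes the shift
before the multiplier (requantize_output_shift, then
requantize_output_multiplier), the reverse of reference_ops::Requantize's
argument order, so it is worth double-checking against the xa_nnlib header
when reviewing.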