Use xa_nnlib for the quantize op on Fusion F1.

Copied the relevant function call from a737c1e394/tensorflow/lite/micro/kernels/xtensa_hifi/quantize.cc.

Latency for the first quantize op (int16->int8) in the keyword_benchmark
went from 3758 ticks to 800 ticks (roughly 4.7x faster).

Overall latency went from 38516 ticks to 34253 ticks (roughly an 11% reduction).
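
For context, what the xa_nnlib call replaces is the scalar reference requantize: per element, subtract the input zero point, rescale by a fixed-point (multiplier, shift) pair that encodes input_scale / output_scale, add the output zero point, and saturate to int8. A minimal sketch of that per-element math (simplified rounding rather than TFLM's exact MultiplyByQuantizedMultiplier helper; `RequantizeOne` is a hypothetical name):
```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of the per-element requantize math (int16 -> int8) that the
// vectorized xa_nn_elm_quantize_asym16s_asym8s call performs on Fusion F1.
// The (multiplier, shift) pair is TFLM's usual Q31 fixed-point encoding of
// effective_scale = input_scale / output_scale.
int8_t RequantizeOne(int16_t in, int32_t in_zero_point, int32_t out_zero_point,
                     int32_t multiplier, int shift) {
  // (in - in_zero_point) * effective_scale, computed as a Q31 multiply
  // followed by a rounding right shift. (Simplified: TFLM's helper uses a
  // saturating doubling-high-mul with slightly different rounding.)
  const int64_t scaled = static_cast<int64_t>(in - in_zero_point) * multiplier;
  const int total_shift = 31 - shift;
  const int64_t rounded =
      (scaled + (int64_t{1} << (total_shift - 1))) >> total_shift;
  return static_cast<int8_t>(
      std::clamp<int64_t>(rounded + out_zero_point, -128, 127));
}
```
The xa_nnlib routine performs this same computation vectorized across the whole tensor, which is where the ~4.7x improvement comes from.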

Tested with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade OPTIMIZED_KERNEL_DIR=xtensa run_keyword_benchmark -j8
```

Full output (for completeness; note that the per-op ticks below sum to the 34253 total):
```
InitializeKeywordRunner took 160568 ticks (160 ms).

KeywordRunNIerations(1) took 34253 ticks (34 ms)
QUANTIZE took 800 ticks (0 ms).
SVDF took 4753 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 3145 ticks (3 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 2890 ticks (2 ms).
SVDF took 3583 ticks (3 ms).
SVDF took 3054 ticks (3 ms).
FULLY_CONNECTED took 1091 ticks (1 ms).
SOFTMAX took 749 ticks (0 ms).
QUANTIZE took 354 ticks (0 ms).
```

Also confirmed that the kernel test passes with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade OPTIMIZED_KERNEL_DIR=xtensa test_kernel_quantize_test -j8
```

Progress towards http://b/177457688
Author: Advait Jain
Date: 2021-02-16 14:33:09 -08:00
Commit: 45d9e41015 (parent: 63a277f28a)
2 changed files with 40 additions and 11 deletions

tensorflow/lite/micro/kernels/quantize_test.cc:

```diff
@@ -49,7 +49,7 @@ void ValidateQuantizeGoldens(TfLiteTensor* tensors, int tensors_size,
   }
 }
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 template <typename T>
 void TestQuantizeFloat(const int* input_dims_data, const float* input_data,
                        const int* output_dims_data, const float* golden,
@@ -79,7 +79,7 @@ void TestQuantizeFloat(const int* input_dims_data, const float* input_data,
   ValidateQuantizeGoldens(tensors, tensors_size, golden, golden_quantized,
                           scale, zero_point, output_dims_count, output_data);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
 template <typename InputType, typename OutputType>
 void TestRequantize(const int* input_dims_data, const float* input_data,
@@ -121,7 +121,7 @@ void TestRequantize(const int* input_dims_data, const float* input_data,
 
 TF_LITE_MICRO_TESTS_BEGIN
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 TF_LITE_MICRO_TEST(QuantizeOpTestUint8) {
   const int length = 10;
   const int dims[] = {2, 2, 5};
@@ -267,9 +267,9 @@ TF_LITE_MICRO_TEST(QuantizeOpTestInt8toInt8NoZeroPoint) {
                                   values_quantized, output_scale,
                                   output_zero_point, output_quantized);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 // TODO(b/155682734): Hifimini optimized quantize requires input scale to be
 // smaller then output scale.
 TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt8) {
@@ -288,7 +288,7 @@ TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt8) {
                                   values_quantized, output_scale,
                                   output_zero_point, output_quantized);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
 TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt32) {
   const int length = 10;
```

tensorflow/lite/micro/kernels/xtensa/quantize.cc:

```diff
@@ -109,25 +109,55 @@ void AffineQuantize(int scale_multiplier, const int32_t zero_point,
   }
 }
 
-TfLiteStatus EvalHifimini(TfLiteContext* context, TfLiteNode* node) {
+#endif  // defined(HIFIMINI)
+
+TfLiteStatus EvalXtensa(TfLiteContext* context, TfLiteNode* node) {
   TFLITE_DCHECK(node->user_data != nullptr);
+#if defined(HIFIMINI)
   auto* op_data = static_cast<OpData*>(node->user_data);
+#elif defined(FUSION_F1)
+  auto* op_data = static_cast<OpDataQuantizeReference*>(node->user_data);
+#endif
 
   const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
   TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
 
   if (output->type == kTfLiteInt8 && input->type == kTfLiteInt16) {
+#if defined(HIFIMINI)
     AffineQuantize(op_data->scale_multiplier, op_data->zero_point,
                    tflite::micro::GetTensorShape(input),
                    tflite::micro::GetTensorData<int16_t>(input),
                    tflite::micro::GetTensorShape(output),
                    tflite::micro::GetTensorData<int8_t>(output));
+#elif defined(FUSION_F1)
+    int size = ElementCount(*input->dims);
+    TF_LITE_ENSURE_EQ(
+        context,
+        xa_nn_elm_quantize_asym16s_asym8s(
+            tflite::micro::GetTensorData<int8_t>(output),
+            tflite::micro::GetTensorData<int16_t>(input),
+            op_data->input_zero_point, op_data->quantization_params.zero_point,
+            op_data->requantize_output_shift,
+            op_data->requantize_output_multiplier, size),
+        0);
+#else
+    static_assert(false, "Unsupported xtensa architecture.");
+#endif
   } else if (output->type == kTfLiteInt32 && input->type == kTfLiteInt16) {
     int size = ElementCount(*input->dims);
+
+    // This ifdef is only needed because the hifimini code is not following the
+    // convention of the rest of the codebase. Ideally we would be using the
+    // same structs as much as possible and reduce the need for such ifdefs.
+#if defined(HIFIMINI)
+    int32_t zero_point = op_data->zero_point;
+#elif defined(FUSION_F1)
+    int32_t zero_point = op_data->quantization_params.zero_point;
+#endif
     reference_ops::Requantize(tflite::micro::GetTensorData<int16_t>(input),
                               size, op_data->requantize_output_multiplier,
                               op_data->requantize_output_shift,
-                              op_data->input_zero_point, op_data->zero_point,
+                              op_data->input_zero_point, zero_point,
                               tflite::micro::GetTensorData<int32_t>(output));
   } else {
     TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
@@ -137,7 +167,6 @@ TfLiteStatus EvalHifimini(TfLiteContext* context, TfLiteNode* node) {
   }
   return kTfLiteOk;
 }
-#endif  // defined(HIFIMINI)
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
@@ -179,8 +208,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-#if defined(HIFIMINI)
-  return EvalHifimini(context, node);
+#if defined(HIFIMINI) || defined(FUSION_F1)
+  return EvalXtensa(context, node);
 #else
   return EvalQuantizeReference(context, node);
 #endif
```
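
For reference, the `op_data->requantize_output_multiplier` / `op_data->requantize_output_shift` pair passed to `xa_nn_elm_quantize_asym16s_asym8s` above is TFLM's standard Q31 decomposition of effective_scale = input scale / output scale. A sketch of that decomposition (modeled on `tflite::QuantizeMultiplier`, minus its edge-case checks; `QuantizeMultiplierSketch` is a hypothetical name):
```cpp
#include <cmath>
#include <cstdint>

// Splits effective_scale into multiplier * 2^shift with multiplier in Q31,
// i.e. effective_scale ~= (multiplier / 2^31) * 2^shift.
void QuantizeMultiplierSketch(double effective_scale, int32_t* multiplier,
                              int* shift) {
  if (effective_scale == 0.0) {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  // frexp: effective_scale = q * 2^shift with q in [0.5, 1).
  const double q = std::frexp(effective_scale, shift);
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {
    // Rounding pushed q up to 1.0; renormalize.
    q_fixed /= 2;
    ++*shift;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
}
```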