From 45d9e41015b9004e1e0c8afaa7e90eaa71c2aeed Mon Sep 17 00:00:00 2001
From: Advait Jain
Date: Tue, 16 Feb 2021 14:33:09 -0800
Subject: [PATCH] Use xa_nnlib for quantize for Fusion F1.

Copied the relevant function call from
https://github.com/pnikam-cad/tensorflow/blob/a737c1e3945bc70022259479ad24133a343ec906/tensorflow/lite/micro/kernels/xtensa_hifi/quantize.cc

Latency for the first quantize op (int16->int8) in the keyword_benchmark
went from 3758 ticks to 800 ticks. Overall latency went from 38516 ticks
to 34253 ticks.

Tested with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade OPTIMIZED_KERNEL_DIR=xtensa run_keyword_benchmark -j8
```

Full output (for completeness):
```
InitializeKeywordRunner took 160568 ticks (160 ms).
KeywordRunNIerations(1) took 34253 ticks (34 ms)
QUANTIZE took 800 ticks (0 ms).
SVDF took 4753 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 3145 ticks (3 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 2890 ticks (2 ms).
SVDF took 3583 ticks (3 ms).
SVDF took 3054 ticks (3 ms).
FULLY_CONNECTED took 1091 ticks (1 ms).
SOFTMAX took 749 ticks (0 ms).
QUANTIZE took 354 ticks (0 ms).
```

Also tested that the kernel test passes with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade OPTIMIZED_KERNEL_DIR=xtensa test_kernel_quantize_test -j8
```

Progress towards http://b/177457688
---
 .../lite/micro/kernels/quantize_test.cc      | 12 +++---
 .../lite/micro/kernels/xtensa/quantize.cc    | 39 ++++++++++++++++---
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/tensorflow/lite/micro/kernels/quantize_test.cc b/tensorflow/lite/micro/kernels/quantize_test.cc
index ad302f0438d..b5da97930fd 100644
--- a/tensorflow/lite/micro/kernels/quantize_test.cc
+++ b/tensorflow/lite/micro/kernels/quantize_test.cc
@@ -49,7 +49,7 @@ void ValidateQuantizeGoldens(TfLiteTensor* tensors, int tensors_size,
   }
 }
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 template <typename T>
 void TestQuantizeFloat(const int* input_dims_data, const float* input_data,
                        const int* output_dims_data, const float* golden,
@@ -79,7 +79,7 @@ void TestQuantizeFloat(const int* input_dims_data, const float* input_data,
   ValidateQuantizeGoldens(tensors, tensors_size, golden, golden_quantized,
                           scale, zero_point, output_dims_count, output_data);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
 template <typename InputType, typename OutputType>
 void TestRequantize(const int* input_dims_data, const float* input_data,
@@ -121,7 +121,7 @@ void TestRequantize(const int* input_dims_data, const float* input_data,
 
 TF_LITE_MICRO_TESTS_BEGIN
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 TF_LITE_MICRO_TEST(QuantizeOpTestUint8) {
   const int length = 10;
   const int dims[] = {2, 2, 5};
@@ -267,9 +267,9 @@ TF_LITE_MICRO_TEST(QuantizeOpTestInt8toInt8NoZeroPoint) {
                                   values_quantized, output_scale,
                                   output_zero_point, output_quantized);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 // TODO(b/155682734): Hifimini optimized quantize requires input scale to be
 // smaller then output scale.
 TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt8) {
@@ -288,7 +288,7 @@ TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt8) {
                                   values_quantized, output_scale,
                                   output_zero_point, output_quantized);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
 TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt32) {
   const int length = 10;
diff --git a/tensorflow/lite/micro/kernels/xtensa/quantize.cc b/tensorflow/lite/micro/kernels/xtensa/quantize.cc
index 3b84e0680f6..5418c69d18e 100644
--- a/tensorflow/lite/micro/kernels/xtensa/quantize.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/quantize.cc
@@ -109,25 +109,55 @@ void AffineQuantize(int scale_multiplier, const int32_t zero_point,
   }
 }
 
-TfLiteStatus EvalHifimini(TfLiteContext* context, TfLiteNode* node) {
+#endif  // defined(HIFIMINI)
+
+TfLiteStatus EvalXtensa(TfLiteContext* context, TfLiteNode* node) {
   TFLITE_DCHECK(node->user_data != nullptr);
+#if defined(HIFIMINI)
   auto* op_data = static_cast<OpData*>(node->user_data);
+#elif defined(FUSION_F1)
+  auto* op_data = static_cast<OpDataQuantizeReference*>(node->user_data);
+#endif
 
   const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
   TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
 
   if (output->type == kTfLiteInt8 && input->type == kTfLiteInt16) {
+#if defined(HIFIMINI)
     AffineQuantize(op_data->scale_multiplier, op_data->zero_point,
                    tflite::micro::GetTensorShape(input),
                    tflite::micro::GetTensorData<int16_t>(input),
                    tflite::micro::GetTensorShape(output),
                    tflite::micro::GetTensorData<int8_t>(output));
+#elif defined(FUSION_F1)
+    int size = ElementCount(*input->dims);
+    TF_LITE_ENSURE_EQ(
+        context,
+        xa_nn_elm_quantize_asym16s_asym8s(
+            tflite::micro::GetTensorData<int8_t>(output),
+            tflite::micro::GetTensorData<int16_t>(input),
+            op_data->input_zero_point, op_data->quantization_params.zero_point,
+            op_data->requantize_output_shift,
+            op_data->requantize_output_multiplier, size),
+        0);
+#else
+    static_assert(false, "Unsupported xtensa architecture.");
+#endif
   } else if (output->type == kTfLiteInt32 && input->type == kTfLiteInt16) {
     int size = ElementCount(*input->dims);
+
+    // This ifdef is only needed because the hifimini code is not following the
+    // convention of the rest of the codebase. Ideally we would be using the
+    // same structs as much as possible and reduce the need for such ifdefs.
+#if defined(HIFIMINI)
+    int32_t zero_point = op_data->zero_point;
+#elif defined(FUSION_F1)
+    int32_t zero_point = op_data->quantization_params.zero_point;
+#endif
     reference_ops::Requantize(tflite::micro::GetTensorData<int16_t>(input),
                               size, op_data->requantize_output_multiplier,
                               op_data->requantize_output_shift,
-                              op_data->input_zero_point, op_data->zero_point,
+                              op_data->input_zero_point, zero_point,
                               tflite::micro::GetTensorData<int32_t>(output));
   } else {
     TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
@@ -137,7 +167,6 @@ TfLiteStatus EvalHifimini(TfLiteContext* context, TfLiteNode* node) {
   }
   return kTfLiteOk;
 }
-#endif  // defined(HIFIMINI)
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
@@ -179,8 +208,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-#if defined(HIFIMINI)
-  return EvalHifimini(context, node);
+#if defined(HIFIMINI) || defined(FUSION_F1)
+  return EvalXtensa(context, node);
 #else
   return EvalQuantizeReference(context, node);
 #endif
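
Reviewer note (not part of the patch): the FUSION_F1 branch feeds the same
OpDataQuantizeReference fields to xa_nn_elm_quantize_asym16s_asym8s that the
int16->int32 branch feeds to reference_ops::Requantize, so the NN-library call
is expected to match the reference requantization semantics. Below is a
minimal sketch of that per-element int16 -> int8 requantization, with the
fixed-point rounding simplified to plain 64-bit arithmetic (the reference
kernel uses MultiplyByQuantizedMultiplier); both function names here are
hypothetical, for illustration only:
```cpp
#include <algorithm>
#include <cstdint>

// Simplified stand-in for TFLite's MultiplyByQuantizedMultiplier: scales
// `value` by (multiplier / 2^31) * 2^shift with round-to-nearest. Assumes
// shift < 31, which holds for requantization multipliers, and differs from
// the real helper only in rounding details near ties.
inline int32_t ScaleByQuantizedMultiplier(int32_t value, int32_t multiplier,
                                          int shift) {
  const int total_shift = 31 - shift;
  const int64_t rounding = int64_t{1} << (total_shift - 1);
  const int64_t product = int64_t{value} * int64_t{multiplier} + rounding;
  return static_cast<int32_t>(product >> total_shift);
}

// Per-element int16 -> int8 requantization using the same parameters the
// kernel reads out of OpDataQuantizeReference.
void RequantizeInt16ToInt8(const int16_t* input, int size, int32_t multiplier,
                           int shift, int32_t input_zero_point,
                           int32_t output_zero_point, int8_t* output) {
  for (int i = 0; i < size; ++i) {
    // Center on the input zero point, rescale, then shift to the output
    // zero point and clamp to the int8 range.
    int32_t value = input[i] - input_zero_point;
    value = ScaleByQuantizedMultiplier(value, multiplier, shift);
    value += output_zero_point;
    value = std::min<int32_t>(std::max<int32_t>(value, -128), 127);
    output[i] = static_cast<int8_t>(value);
  }
}
```
One detail visible in the diff itself: the xa_nnlib call takes the shift
before the multiplier (requantize_output_shift, then
requantize_output_multiplier), the reverse of reference_ops::Requantize's
argument order, so it is worth double-checking against the xa_nnlib header
when reviewing.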