Use xa_nnlib for the quantize op on Fusion F1.

Copied the relevant function call from a737c1e394/tensorflow/lite/micro/kernels/xtensa_hifi/quantize.cc.

Latency for the first quantize op (int16->int8) in the keyword_benchmark
went from 3758 ticks to 800 ticks (roughly 4.7x faster).

Overall latency went from 38516 ticks to 34253 ticks (roughly an 11% reduction).
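
For context, what the xa_nnlib call replaces is the scalar reference requantize: per element, subtract the input zero point, rescale by a fixed-point (multiplier, shift) pair that encodes input_scale / output_scale, add the output zero point, and saturate to int8. A minimal sketch of that per-element math (simplified rounding rather than TFLM's exact MultiplyByQuantizedMultiplier helper; `RequantizeOne` is a hypothetical name):
```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of the per-element requantize math (int16 -> int8) that the
// vectorized xa_nn_elm_quantize_asym16s_asym8s call performs on Fusion F1.
// The (multiplier, shift) pair is TFLM's usual Q31 fixed-point encoding of
// effective_scale = input_scale / output_scale.
int8_t RequantizeOne(int16_t in, int32_t in_zero_point, int32_t out_zero_point,
                     int32_t multiplier, int shift) {
  // (in - in_zero_point) * effective_scale, computed as a Q31 multiply
  // followed by a rounding right shift. (Simplified: TFLM's helper uses a
  // saturating doubling-high-mul with slightly different rounding.)
  const int64_t scaled = static_cast<int64_t>(in - in_zero_point) * multiplier;
  const int total_shift = 31 - shift;
  const int64_t rounded =
      (scaled + (int64_t{1} << (total_shift - 1))) >> total_shift;
  return static_cast<int8_t>(
      std::clamp<int64_t>(rounded + out_zero_point, -128, 127));
}
```
The xa_nnlib routine performs this same computation vectorized across the whole tensor, which is where the ~4.7x improvement comes from.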

Tested with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade OPTIMIZED_KERNEL_DIR=xtensa run_keyword_benchmark -j8
```

Full output (for completeness; note that the per-op ticks below sum to the 34253 total):
```
InitializeKeywordRunner took 160568 ticks (160 ms).

KeywordRunNIerations(1) took 34253 ticks (34 ms)
QUANTIZE took 800 ticks (0 ms).
SVDF took 4753 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 3145 ticks (3 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 2890 ticks (2 ms).
SVDF took 3583 ticks (3 ms).
SVDF took 3054 ticks (3 ms).
FULLY_CONNECTED took 1091 ticks (1 ms).
SOFTMAX took 749 ticks (0 ms).
QUANTIZE took 354 ticks (0 ms).
```

Also confirmed that the kernel test passes with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade OPTIMIZED_KERNEL_DIR=xtensa test_kernel_quantize_test -j8
```

Progress towards http://b/177457688
Author: Advait Jain
Date: 2021-02-16 14:33:09 -08:00
Commit: 45d9e41015 (parent: 63a277f28a)
2 changed files with 40 additions and 11 deletions

tensorflow/lite/micro/kernels/quantize_test.cc:

```diff
@@ -49,7 +49,7 @@ void ValidateQuantizeGoldens(TfLiteTensor* tensors, int tensors_size,
   }
 }
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 template <typename T>
 void TestQuantizeFloat(const int* input_dims_data, const float* input_data,
                        const int* output_dims_data, const float* golden,
@@ -79,7 +79,7 @@ void TestQuantizeFloat(const int* input_dims_data, const float* input_data,
   ValidateQuantizeGoldens(tensors, tensors_size, golden, golden_quantized,
                           scale, zero_point, output_dims_count, output_data);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
 template <typename InputType, typename OutputType>
 void TestRequantize(const int* input_dims_data, const float* input_data,
@@ -121,7 +121,7 @@ void TestRequantize(const int* input_dims_data, const float* input_data,
 
 TF_LITE_MICRO_TESTS_BEGIN
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 TF_LITE_MICRO_TEST(QuantizeOpTestUint8) {
   const int length = 10;
   const int dims[] = {2, 2, 5};
@@ -267,9 +267,9 @@ TF_LITE_MICRO_TEST(QuantizeOpTestInt8toInt8NoZeroPoint) {
                                   values_quantized, output_scale,
                                   output_zero_point, output_quantized);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
-#if !defined(HIFIMINI)
+#if !defined(XTENSA)
 // TODO(b/155682734): Hifimini optimized quantize requires input scale to be
 // smaller then output scale.
 TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt8) {
@@ -288,7 +288,7 @@ TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt8) {
                                   values_quantized, output_scale,
                                   output_zero_point, output_quantized);
 }
-#endif  // defined(HIFIMINI)
+#endif  // defined(XTENSA)
 
 TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt32) {
   const int length = 10;
```

tensorflow/lite/micro/kernels/xtensa/quantize.cc:

```diff
@@ -109,25 +109,55 @@ void AffineQuantize(int scale_multiplier, const int32_t zero_point,
   }
 }
 
-TfLiteStatus EvalHifimini(TfLiteContext* context, TfLiteNode* node) {
+#endif  // defined(HIFIMINI)
+
+TfLiteStatus EvalXtensa(TfLiteContext* context, TfLiteNode* node) {
   TFLITE_DCHECK(node->user_data != nullptr);
+#if defined(HIFIMINI)
   auto* op_data = static_cast<OpData*>(node->user_data);
+#elif defined(FUSION_F1)
+  auto* op_data = static_cast<OpDataQuantizeReference*>(node->user_data);
+#endif
 
   const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0);
   TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
 
   if (output->type == kTfLiteInt8 && input->type == kTfLiteInt16) {
+#if defined(HIFIMINI)
     AffineQuantize(op_data->scale_multiplier, op_data->zero_point,
                    tflite::micro::GetTensorShape(input),
                    tflite::micro::GetTensorData<int16_t>(input),
                    tflite::micro::GetTensorShape(output),
                    tflite::micro::GetTensorData<int8_t>(output));
+#elif defined(FUSION_F1)
+    int size = ElementCount(*input->dims);
+    TF_LITE_ENSURE_EQ(
+        context,
+        xa_nn_elm_quantize_asym16s_asym8s(
+            tflite::micro::GetTensorData<int8_t>(output),
+            tflite::micro::GetTensorData<int16_t>(input),
+            op_data->input_zero_point, op_data->quantization_params.zero_point,
+            op_data->requantize_output_shift,
+            op_data->requantize_output_multiplier, size),
+        0);
+#else
+    static_assert(false, "Unsupported xtensa architecture.");
+#endif
   } else if (output->type == kTfLiteInt32 && input->type == kTfLiteInt16) {
     int size = ElementCount(*input->dims);
+
+    // This ifdef is only needed because the hifimini code is not following the
+    // convention of the rest of the codebase. Ideally we would be using the
+    // same structs as much as possible and reduce the need for such ifdefs.
+#if defined(HIFIMINI)
+    int32_t zero_point = op_data->zero_point;
+#elif defined(FUSION_F1)
+    int32_t zero_point = op_data->quantization_params.zero_point;
+#endif
     reference_ops::Requantize(tflite::micro::GetTensorData<int16_t>(input),
                               size, op_data->requantize_output_multiplier,
                               op_data->requantize_output_shift,
-                              op_data->input_zero_point, op_data->zero_point,
+                              op_data->input_zero_point, zero_point,
                               tflite::micro::GetTensorData<int32_t>(output));
   } else {
     TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
@@ -137,7 +167,6 @@ TfLiteStatus EvalHifimini(TfLiteContext* context, TfLiteNode* node) {
   }
   return kTfLiteOk;
 }
-#endif  // defined(HIFIMINI)
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
@@ -179,8 +208,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-#if defined(HIFIMINI)
-  return EvalHifimini(context, node);
+#if defined(HIFIMINI) || defined(FUSION_F1)
+  return EvalXtensa(context, node);
 #else
   return EvalQuantizeReference(context, node);
 #endif
```
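
For reference, the `op_data->requantize_output_multiplier` / `op_data->requantize_output_shift` pair passed to `xa_nn_elm_quantize_asym16s_asym8s` above is TFLM's standard Q31 decomposition of effective_scale = input scale / output scale. A sketch of that decomposition (modeled on `tflite::QuantizeMultiplier`, minus its edge-case checks; `QuantizeMultiplierSketch` is a hypothetical name):
```cpp
#include <cmath>
#include <cstdint>

// Splits effective_scale into multiplier * 2^shift with multiplier in Q31,
// i.e. effective_scale ~= (multiplier / 2^31) * 2^shift.
void QuantizeMultiplierSketch(double effective_scale, int32_t* multiplier,
                              int* shift) {
  if (effective_scale == 0.0) {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  // frexp: effective_scale = q * 2^shift with q in [0.5, 1).
  const double q = std::frexp(effective_scale, shift);
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {
    // Rounding pushed q up to 1.0; renormalize.
    q_fixed /= 2;
    ++*shift;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
}
```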