From 55633bf5c1b9c9004a4bbc6b884dd6b6d5f9fab7 Mon Sep 17 00:00:00 2001
From: Advait Jain <advaitjain@users.noreply.github.com>
Date: Thu, 11 Feb 2021 15:52:01 -0800
Subject: [PATCH] Use xa_nnlib for svdf for Fusion F1.

The code in this change is the subset of functionality needed for int8 svdf
for Hifi4, copied from
https://github.com/pnikam-cad/tensorflow/blob/a737c1e3945bc70022259479ad24133a343ec906/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc

Note that the current change has not pulled in either the floating point
implementation or the Hifi5 implementation.

Profiling the keyword_benchmark with the following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa OPTIMIZED_KERNEL_DIR=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade run_keyword_benchmark -j8
```
gives a latency of 38516 ticks with this change vs 152642 ticks without
this change.

Per OP latency with this change:
```
KeywordRunNIerations(1) took 38516 ticks (38 ms)
QUANTIZE took 3758 ticks (3 ms).
SVDF took 4753 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 3145 ticks (3 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 4211 ticks (4 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 2890 ticks (2 ms).
SVDF took 3583 ticks (3 ms).
SVDF took 3054 ticks (3 ms).
FULLY_CONNECTED took 1091 ticks (1 ms).
SOFTMAX took 2042 ticks (2 ms).
QUANTIZE took 366 ticks (0 ms).
```

Without this change:
```
KeywordRunNIerations(1) took 152642 ticks (152 ms)
QUANTIZE took 3758 ticks (3 ms).
SVDF took 38003 ticks (38 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 18803 ticks (18 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 18803 ticks (18 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 18803 ticks (18 ms).
FULLY_CONNECTED took 1353 ticks (1 ms).
SVDF took 13907 ticks (13 ms).
SVDF took 15827 ticks (15 ms).
SVDF took 15827 ticks (15 ms).
FULLY_CONNECTED took 1091 ticks (1 ms).
SOFTMAX took 2042 ticks (2 ms).
QUANTIZE took 366 ticks (0 ms).
```

Also confirmed that the kernel_svdf_test passes with:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=xtensa OPTIMIZED_KERNEL_DIR=xtensa TARGET_ARCH=fusion_f1 XTENSA_CORE=F1_190305_swupgrade test_kernel_svdf_test -j8
```
---
 tensorflow/lite/micro/kernels/xtensa/svdf.cc | 108 ++++++++++++++++---
 1 file changed, 93 insertions(+), 15 deletions(-)

diff --git a/tensorflow/lite/micro/kernels/xtensa/svdf.cc b/tensorflow/lite/micro/kernels/xtensa/svdf.cc
index f9d6e18e219..6aea649a890 100644
--- a/tensorflow/lite/micro/kernels/xtensa/svdf.cc
+++ b/tensorflow/lite/micro/kernels/xtensa/svdf.cc
@@ -51,14 +51,14 @@ constexpr int kOutputTensor = 0;
  * Note: passing OpData by value might seem like an oversight but it helps
  * reduce the latency. See b/155656675 for more details.
  */
-void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
-                     const TfLiteEvalTensor* input_tensor,
-                     const TfLiteEvalTensor* weights_feature_tensor,
-                     const TfLiteEvalTensor* weights_time_tensor,
-                     const TfLiteEvalTensor* bias_tensor,
-                     const TfLiteSVDFParams* params,
-                     TfLiteEvalTensor* activation_state_tensor,
-                     TfLiteEvalTensor* output_tensor, OpData data) {
+void EvalIntegerSvdfHifimini(TfLiteContext* context, TfLiteNode* node,
+                             const TfLiteEvalTensor* input_tensor,
+                             const TfLiteEvalTensor* weights_feature_tensor,
+                             const TfLiteEvalTensor* weights_time_tensor,
+                             const TfLiteEvalTensor* bias_tensor,
+                             const TfLiteSVDFParams* params,
+                             TfLiteEvalTensor* activation_state_tensor,
+                             TfLiteEvalTensor* output_tensor, OpData data) {
   const int n_rank = params->rank;
   const int n_batch = input_tensor->dims->data[0];
   const int n_input = input_tensor->dims->data[1];
@@ -243,7 +243,76 @@ void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
     }
   }
 }
-#endif
+
+#elif defined(FUSION_F1)
+
+TfLiteStatus EvalIntegerSvdfHifi4(
+    TfLiteContext* context, TfLiteNode* node,
+    const TfLiteEvalTensor* input_tensor,
+    const TfLiteEvalTensor* weights_feature_tensor,
+    const TfLiteEvalTensor* weights_time_tensor,
+    const TfLiteEvalTensor* bias_tensor, const TfLiteSVDFParams* params,
+    TfLiteEvalTensor* activation_state_tensor, TfLiteEvalTensor* output_tensor,
+    const OpData& data) {
+  const int n_rank = params->rank;
+  const int n_batch = input_tensor->dims->data[0];
+  const int n_input = input_tensor->dims->data[1];
+  const int n_filter = weights_feature_tensor->dims->data[0];
+  const int n_unit = n_filter / n_rank;
+  const int n_memory = weights_time_tensor->dims->data[1];
+
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
+
+  // Shift states.
+  int16_t* const state_ptr =
+      tflite::micro::GetTensorData<int16_t>(activation_state_tensor);
+
+  // Left shift the activation_state.
+  int num_bytes = sizeof(*state_ptr) * (n_batch * n_filter * n_memory - 1);
+  xa_nn_memmove_16(state_ptr, state_ptr + 1, num_bytes);
+
+  // Note: no need to clear the latest activation, matmul is not accumulative.
+
+  // Feature matmul.
+  const int8_t* input = tflite::micro::GetTensorData<int8_t>(input_tensor);
+  const int8_t* weight_feature =
+      tflite::micro::GetTensorData<int8_t>(weights_feature_tensor);
+  int16_t* result_in_batch = state_ptr + (n_memory - 1);
+
+  for (int b = 0; b < n_batch; b++) {
+    TF_LITE_ENSURE_EQ(context,
+                      xa_nn_matXvec_out_stride_sym8sxasym8s_16(
+                          &result_in_batch[b * n_filter * n_memory],
+                          weight_feature, &input[b * n_input], NULL, n_filter,
+                          n_input, n_input, n_memory, -data.input_zero_point,
+                          (data.effective_scale_1_a), data.effective_scale_1_b),
+                      0);
+  }
+
+  // Time weights dot product + activation
+  for (int b = 0; b < n_batch; ++b) {
+    const int16_t* vector1_ptr =
+        tflite::micro::GetTensorData<int16_t>(weights_time_tensor);
+    const int16_t* vector2_ptr =
+        tflite::micro::GetTensorData<int16_t>(activation_state_tensor) +
+        b * n_memory * n_filter;
+    const int32_t* bias_ptr =
+        tflite::micro::GetTensorData<int32_t>(bias_tensor);
+    int8_t* output_ptr =
+        tflite::micro::GetTensorData<int8_t>(output_tensor) + b * n_unit;
+
+    TF_LITE_ENSURE_EQ(
+        context,
+        xa_nn_dot_prod_16x16_asym8s(
+            output_ptr, vector1_ptr, vector2_ptr, bias_ptr, n_memory * n_rank,
+            (data.effective_scale_2_a), data.effective_scale_2_b,
+            data.output_zero_point, n_unit),
+        0);
+  }
+  return kTfLiteOk;
+}
+#endif  // defined(FUSION_F1) || defined(HIFIMINI)
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   TFLITE_DCHECK(context != nullptr);
@@ -274,11 +343,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const int rank = params->rank;
   const int input_size = input->dims->data[1];
   const int batch_size = input->dims->data[0];
+
+#if defined(HIFIMINI)
   // Ensure the input size is a multiple of two. This is necessary since
   // optimized kernels access the memory in chunks of two, and all accesses
   // must be aligned to 16 bits.
   // TODO(b/153202598): Remove when padding is allowed in TFLite tensors.
   TF_LITE_ENSURE_EQ(context, input_size % 2, 0);
+#endif  // defined(HIFIMINI)
 
   const int num_filters = weights_feature->dims->data[0];
   TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
@@ -339,9 +411,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       static_cast<double>(activation_state->params.scale *
                           weights_time->params.scale / output->params.scale);
 
-  TF_LITE_ENSURE_EQ(context, static_cast<double>(bias->params.scale),
-                    static_cast<double>(activation_state->params.scale *
-                                        weights_time->params.scale));
+  TF_LITE_ENSURE_NEAR(context, static_cast<double>(bias->params.scale),
+                      static_cast<double>(activation_state->params.scale *
+                                          weights_time->params.scale),
+                      1e-5);
 
   TFLITE_DCHECK(node->user_data != nullptr);
   OpData* data = static_cast<OpData*>(node->user_data);
@@ -396,13 +469,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const OpData& data = *(static_cast<const OpData*>(node->user_data));
 
 #if defined(HIFIMINI)
-  EvalIntegerSVDF(context, node, input, weights_feature, weights_time, bias,
-                  params, activation_state, output, data);
+  EvalIntegerSvdfHifimini(context, node, input, weights_feature, weights_time,
+                          bias, params, activation_state, output, data);
+  return kTfLiteOk;
+#elif defined(FUSION_F1)
+  return EvalIntegerSvdfHifi4(context, node, input, weights_feature,
+                              weights_time, bias, params, activation_state,
+                              output, data);
 #else
   EvalIntegerSvdfReference(context, node, input, weights_feature, weights_time,
                            bias, params, activation_state, output, data);
-#endif
   return kTfLiteOk;
+#endif
 }
 
 }  // namespace
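
For reviewers who have not used the NNLib kernels before, below is a rough, self-contained C++ sketch of the per-batch arithmetic that the two library calls replace, modeled on the portable integer SVDF path (EvalIntegerSvdfReference) that the #else branch still uses. The names SvdfBatchReference and Requantize are hypothetical helpers introduced only for illustration, Requantize is a simplification of TFLite's fixed-point MultiplyByQuantizedMultiplier, and the sketch assumes the activation state has already been shifted left by one element per filter (the xa_nn_memmove_16 step above). It is not the library implementation; it is only meant to make the data layout and quantization steps easier to follow.
```
#include <algorithm>
#include <cstdint>

// Approximation of TFLite's MultiplyByQuantizedMultiplier: scales `acc` by
// multiplier * 2^shift / 2^31 with round-to-nearest, using 64-bit math.
// The production kernels use saturating fixed-point helpers; assumes shift < 31.
int32_t Requantize(int32_t acc, int32_t multiplier, int shift) {
  int64_t value = static_cast<int64_t>(acc) * multiplier;
  const int total_shift = 31 - shift;
  value += int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>(value >> total_shift);
}

// Hypothetical plain-C++ model of the per-batch work delegated to
// xa_nn_matXvec_out_stride_sym8sxasym8s_16 (feature matmul into the newest
// activation-state slot of each filter row) and xa_nn_dot_prod_16x16_asym8s
// (time-weights dot product + bias + requantization to int8).
void SvdfBatchReference(const int8_t* input,            // [n_input]
                        const int8_t* weights_feature,  // [n_filter, n_input]
                        const int16_t* weights_time,    // [n_filter, n_memory]
                        const int32_t* bias,            // [n_unit]
                        int16_t* state,                 // [n_filter, n_memory]
                        int8_t* output,                 // [n_unit]
                        int n_input, int n_filter, int n_memory, int n_rank,
                        int32_t input_zero_point, int32_t output_zero_point,
                        int32_t scale_1_a, int shift_1_b,    // effective_scale_1
                        int32_t scale_2_a, int shift_2_b) {  // effective_scale_2
  const int n_unit = n_filter / n_rank;

  // Feature matmul: one int16 result per filter, written into the newest
  // (last) slot of that filter's state row, matching the out-stride call.
  for (int f = 0; f < n_filter; ++f) {
    int32_t acc = 0;
    for (int i = 0; i < n_input; ++i) {
      acc += static_cast<int32_t>(weights_feature[f * n_input + i]) *
             (static_cast<int32_t>(input[i]) - input_zero_point);
    }
    acc = Requantize(acc, scale_1_a, shift_1_b);
    acc = std::min<int32_t>(32767, std::max<int32_t>(-32768, acc));
    state[f * n_memory + (n_memory - 1)] = static_cast<int16_t>(acc);
  }

  // Time-weights dot product: accumulate n_rank filter rows per output unit,
  // add the bias, requantize, add the output zero point, and clamp to int8.
  for (int u = 0; u < n_unit; ++u) {
    int32_t acc = 0;
    for (int r = 0; r < n_rank; ++r) {
      const int f = u * n_rank + r;
      for (int m = 0; m < n_memory; ++m) {
        acc += static_cast<int32_t>(weights_time[f * n_memory + m]) *
               state[f * n_memory + m];
      }
    }
    acc += bias[u];
    acc = Requantize(acc, scale_2_a, shift_2_b) + output_zero_point;
    acc = std::min<int32_t>(127, std::max<int32_t>(-128, acc));
    output[u] = static_cast<int8_t>(acc);
  }
}
```
The int32 accumulators mirror the portable reference kernel; the NNLib routines may use wider internal accumulators and different rounding, so this is for understanding the data flow rather than for bit-exact comparison.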