From cb4f8412f8d57392988c69c1bee49f8d594c3195 Mon Sep 17 00:00:00 2001 From: Niranjan Yadla Date: Wed, 8 Apr 2020 14:14:15 -0700 Subject: [PATCH] Cadence NNLib: Fixed various unit test failures Fixed person detection test, max pool and svdf unit tests Signed-off-by: Bhanu Prakash Bandaru Venkata --- .../lite/micro/kernels/xtensa_hifi/conv.cc | 2 +- .../kernels/xtensa_hifi/depthwise_conv.cc | 2 +- .../lite/micro/kernels/xtensa_hifi/svdf.cc | 191 ++++++++---------- .../tools/make/third_party_downloads.inc | 4 +- 4 files changed, 86 insertions(+), 113 deletions(-) diff --git a/tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc b/tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc index 425cdc58934..e0999750ffb 100755 --- a/tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc @@ -55,7 +55,7 @@ constexpr int kInputTensor = 0; constexpr int kFilterTensor = 1; constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; -constexpr int kMaxChannels = 8; +constexpr int kMaxChannels = 256; // Conv is quantized along dimension 0: // https://www.tensorflow.org/lite/performance/quantization_spec diff --git a/tensorflow/lite/micro/kernels/xtensa_hifi/depthwise_conv.cc b/tensorflow/lite/micro/kernels/xtensa_hifi/depthwise_conv.cc index 9dd8bd7326f..df26aa05a8c 100755 --- a/tensorflow/lite/micro/kernels/xtensa_hifi/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifi/depthwise_conv.cc @@ -58,7 +58,7 @@ constexpr int kFilterTensor = 1; constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; // Per channel quantization is not needed for any model on xtensa. 
-constexpr int kMaxChannels = 8; +constexpr int kMaxChannels = 256; // Depthwise conv is quantized along dimension 3: // https://www.tensorflow.org/lite/performance/quantization_spec diff --git a/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc b/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc index f21e6d67921..888eda8883a 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc @@ -78,14 +78,15 @@ struct OpData { */ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation( - TfLiteContext* context, int batch_size, int memory_size, int num_filters, - int num_units, int rank, const TfLiteTensor* weights_time, - const TfLiteTensor* bias, TfLiteFusedActivation activation, - TfLiteTensor* activation_state, TfLiteTensor* scratch, - TfLiteTensor* output) { - float* scratch_bias = GetTensorData(scratch); - if (bias) { - const float* bias_data = GetTensorData(bias); + TfLiteContext* context, int batch_size, int memory_size, int num_filters, int num_units, int rank, + const float* const __restrict__ weights_time_ptr, + const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation, + float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr, + float* const __restrict__ output_ptr) { + // Compute matmul(activation_state, weights_time). 
+ float* scratch_bias = scratch_ptr; + if (bias_ptr) { + const float* bias_data = bias_ptr; for (int j = 0; j < num_units; ++j) { scratch_bias[j] = *bias_data++; } @@ -96,15 +97,16 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation( } int err = 0; for (int b = 0; b < batch_size; ++b) { - const float* weights_time_vec = GetTensorData(weights_time); + const float* weights_time_vec = weights_time_ptr; const float* mat_ptr = - GetTensorData(activation_state) + b * memory_size * num_filters; - float* output_ptr_batch = GetTensorData(output) + b * num_units; + state_ptr + b * memory_size * num_filters; + float* output_ptr_batch = output_ptr + b * num_units; for (int j = 0; j < num_units; j++) { err = xa_nn_matXvec_f32xf32_f32( output_ptr_batch, mat_ptr, NULL, weights_time_vec, NULL, scratch_bias, 1, memory_size * rank, 0, memory_size * rank, 0); CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_matXvec_f32xf32_f32 failed"); + output_ptr_batch++; mat_ptr += memory_size * rank; weights_time_vec += memory_size * rank; @@ -113,30 +115,12 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation( // Apply activation. for (int b = 0; b < batch_size; ++b) { - float* output_ptr_batch = GetTensorData(output) + b * num_units; + float* output_ptr_batch = output_ptr + b * num_units; for (int i = 0; i < num_units; ++i) { *output_ptr_batch = ActivationValFloat(activation, *output_ptr_batch); ++output_ptr_batch; } } - - // Left shift the activation_state to make room for next cycle's activation. - // (alanchiao): explore collapsing this into a single loop. 
- for (int b = 0; b < batch_size; ++b) { - float* state_ptr_batch = - GetTensorData(activation_state) + b * memory_size * num_filters; - for (int f = 0; f < num_filters; ++f) { - // Shift the vector left: - float* batch_ptr = state_ptr_batch; - float* batch_start = state_ptr_batch + 1; - float* batch_end = state_ptr_batch + memory_size; - while (batch_start != batch_end) { - *batch_ptr++ = *batch_start++; - } - state_ptr_batch[memory_size - 1] = 0.0f; - state_ptr_batch += memory_size; - } - } return kTfLiteOk; } @@ -144,8 +128,7 @@ inline TfLiteStatus EvalFloatSVDF( TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input, const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time, const TfLiteTensor* bias, const TfLiteSVDFParams* params, - TfLiteTensor* scratch, TfLiteTensor* activation_state, - TfLiteTensor* output) { + TfLiteTensor* activation_state, TfLiteTensor* output) { const int rank = params->rank; const int batch_size = input->dims->data[0]; const int input_size = input->dims->data[1]; @@ -153,30 +136,47 @@ inline TfLiteStatus EvalFloatSVDF( const int num_units = num_filters / rank; const int memory_size = weights_time->dims->data[1]; - // Clear the activation (activation_state's leftmost column). - // (ghodrat): Add a test which initialize activation_state with invalid - // values in leftmost column and make sure it passes. - for (int b = 0; b < batch_size; ++b) { - float* state_ptr_batch = - GetTensorData(activation_state) + b * memory_size * num_filters; + const float* weights_feature_ptr = GetTensorData(weights_feature); + const float* weights_time_ptr = GetTensorData(weights_time); + const float* bias_ptr = GetTensorData(bias); + const float* input_ptr = GetTensorData(input); + + float* state_ptr = GetTensorData(activation_state); + + // TODO(b/132070898): Move this temp variable to the new scratch buffer API + // when ready. 
+ float scratch_tensor[kScratchTensorMaxSize]; + float* scratch_ptr = scratch_tensor; + + float* output_ptr = GetTensorData(output); + + // Left shift the activation_state. + { + float* new_state_start = state_ptr; + const float* old_state_start = state_ptr + 1; + const float* old_state_end = + state_ptr + batch_size * num_filters * memory_size; + while (old_state_start != old_state_end) { + *new_state_start++ = *old_state_start++; + } } + // Note: no need to clear the latest activation, matmul is not accumulative. + // Compute conv1d(inputs, weights_feature). // The activation_state's rightmost column is used to save current cycle - // activation. This is achieved by starting at - // GetTensorData(activation_state)[memory_size - 1] and having the - // stride equal to memory_size. + // activation. This is achieved by starting at state_ptr[memory_size - 1] and + // having the stride equal to memory_size. - const float* matrix = GetTensorData(weights_feature); - const float* vector = GetTensorData(input); - float* out_scratch = GetTensorData(scratch); - /* NNLib matXvec needs a bias buffer, so using output buffer to - avoid need for extra memory, output buffer size is batch * num_units, - batch is at least 1 so we use size num_units of it */ - float* bias_scratch = GetTensorData(output); - float* result = &GetTensorData(activation_state)[memory_size - 1]; - float* result_in_batch = result; + // Perform batched matrix vector multiply operation: + { + const float* matrix = weights_feature_ptr; + const float* vector = input_ptr; + float* result = &state_ptr[memory_size - 1]; + float* result_in_batch = result; + float* out_scratch = scratch_ptr; + float* bias_scratch = output_ptr; for (int i = 0; i < num_units; i++) bias_scratch[i] = 0.0f; int err = 0; @@ -196,11 +196,11 @@ inline TfLiteStatus EvalFloatSVDF( result_in_batch += memory_size; } } + } return ApplyTimeWeightsBiasAndActivation( - context, batch_size, memory_size, num_filters, num_units, rank, - weights_time, 
bias, params->activation, activation_state, scratch, - output); + context, batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr, + bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr); } void EvalIntegerSVDF( @@ -217,24 +217,26 @@ void EvalIntegerSVDF( const int n_unit = n_filter / n_rank; const int n_memory = weights_time_tensor->dims->data[1]; - // (b/132070898): Move these temp variables to the new scratch buffer API + // TODO(b/132070898): Move these temp variables to the new scratch buffer API // when ready. int32_t scratch_tensor[kScratchTensorMaxSize]; int32_t scratch_output_tensor[kScratchTensorMaxSize]; - // Rewrite last bit of state. + // Shift states. + int16_t* const state_ptr = GetTensorData(activation_state_tensor); + + // Left shift the activation_state. { - for (int b = 0; b < n_batch; ++b) { - int16_t* state_ptr_batch = - GetTensorData(activation_state_tensor) + - b * n_memory * n_filter; - for (int c = 0; c < n_filter; ++c) { - int16_t* state_ptr = state_ptr_batch + c * n_memory; - state_ptr[n_memory - 1] = 0; - } + int16_t* new_state_start = state_ptr; + const int16_t* old_state_start = state_ptr + 1; + const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory; + while (old_state_start != old_state_end) { + *new_state_start++ = *old_state_start++; } } + // Note: no need to clear the latest activation, matmul is not accumulative. + // Feature matmul. { int16_t* state = GetTensorData(activation_state_tensor); @@ -255,6 +257,12 @@ void EvalIntegerSVDF( dot_prod = MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b); dot_prod = std::min(std::max(output_min, dot_prod), output_max); + // This assumes state is symmetrically quantized. Otherwise last bit of + // state should be initialized to its zero point and accumulate the + // dot_prod. + // Equivalent as the following: + // result_in_batch = zero point, which happens to be zero. + // result_in_batch += dot_prod_56. 
*result_in_batch = dot_prod; result_in_batch += n_memory; } @@ -326,26 +334,6 @@ void EvalIntegerSVDF( GetTensorData(output_tensor)[i] = static_cast(x4); } } - - // Shift state. - { - for (int b = 0; b < n_batch; ++b) { - int16_t* state_ptr_batch = - GetTensorData(activation_state_tensor) + - b * n_memory * n_filter; - for (int f = 0; f < n_filter; ++f) { - // Shift the vector left: - int16_t* batch_ptr = state_ptr_batch; - int16_t* batch_start = state_ptr_batch + 1; - int16_t* batch_end = state_ptr_batch + n_memory; - while (batch_start != batch_end) { - *batch_ptr++ = *batch_start++; - } - state_ptr_batch[n_memory - 1] = 0; - state_ptr_batch += n_memory; - } - } - } } } // namespace @@ -385,12 +373,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const int rank = params->rank; const int input_size = input->dims->data[1]; const int batch_size = input->dims->data[0]; - // Ensure the input size is a multiple of two. This is necessary since - // optimized kernels access the memory in chunks of two, and all accesses - // must be aligned to 16 bits. - // TODO(b/153202598): Remove when padding is allowed in TFLite tensors. - TF_LITE_ENSURE_EQ(context, input_size % 2, 0); - const int num_filters = weights_feature->dims->data[0]; TF_LITE_ENSURE_EQ(context, num_filters % rank, 0); const int num_units = num_filters / rank; @@ -446,13 +428,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Validate Scratch Tensors: // [0] = (shared - see float block below for usage) // [1] = Output Temp, int8_t, {2, num_units, batch_size} - // (b/132070898): Scratch values are used as stack variables in + // TODO(b/132070898): Scratch values are used as stack variables in // EvalIntegerSVDF(). 
// Validate output tensor: TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8); } else { - TF_LITE_ENSURE_EQ(context, node->inputs->size, 6); + TF_LITE_ENSURE_EQ(context, node->inputs->size, 5); // Validate Input Tensor dtypes: TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32); @@ -467,19 +449,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // [0] = Holds dot-product of time-forward calculations in // ApplyTimeWeightsBiasAndActivation(): // float/int32, {2, batch_size, num_filters} - // (b/132070898): Use input tensor as variable until scratch tensor - // allocation has been implemented (b/132070898) TfLiteTensor* - // scratch_tensor = GetTemporary(context, node, 0); - TfLiteTensor* scratch_tensor = &context->tensors[node->inputs->data[5]]; - TF_LITE_ENSURE_EQ(context, scratch_tensor->type, kTfLiteFloat32); - - TF_LITE_ENSURE_EQ(context, NumDimensions(scratch_tensor), 2); - TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[0], batch_size); - TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[1], num_filters); + // TODO(b/132070898): Scratch values are used as stack variables in + // EvalFloatSVDF(). // Full-float SVDF only uses the one shared scratch tensor (see above for // usage). - // (b/132070898): Use input tensor as variable until scratch tensor + // TODO(b/132070898): Use input tensor as variable until scratch tensor // allocation has been implemented. // TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1); TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); @@ -505,18 +480,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { switch (weights_feature->type) { case kTfLiteFloat32: { - // (b/132070898): Use input tensor as variable until scratch tensor - // allocation has been implemented. 
TfLiteTensor* scratch = - // GetTemporary(context, node, /*index=*/0); - TfLiteTensor* scratch = &context->tensors[node->inputs->data[5]]; - return EvalFloatSVDF(context, node, input, weights_feature, weights_time, - bias, params, scratch, activation_state, output); + // TODO(b/132070898): Use input tensor as variable until scratch tensor + // allocation has been implemented. + // TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0); + return EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias, + params, activation_state, output); break; } case kTfLiteInt8: { if (is_full_integer) { - // (b/132070898): Store these values in ::Prepare() instead of + // TODO(b/132070898): Store these values in ::Prepare() instead of // ::Eval(): // Calculate effective scales. OpData op_data; @@ -574,7 +548,6 @@ TfLiteRegistration* Register_SVDF() { /*builtin_code=*/0, /*custom_name=*/nullptr, /*version=*/0}; - return &r; } diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 189d758eb96..30a27c0a758 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -63,6 +63,6 @@ EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip" EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56" -XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib.zip" -XTENSA_HIFI4_MD5 :="a517b653a75b96d0271e1b99ee2a8c14" +XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" +XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"