From cb4f8412f8d57392988c69c1bee49f8d594c3195 Mon Sep 17 00:00:00 2001
From: Niranjan Yadla <nyadla@cadence.com>
Date: Wed, 8 Apr 2020 14:14:15 -0700
Subject: [PATCH] Cadence NNLib: Fixed various unit test failures

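Fixed the person detection test and the max pool and SVDF unit tests.

- conv, depthwise_conv: raise kMaxChannels from 8 to 256.
- svdf: the float path no longer takes a scratch input tensor (5 inputs
  instead of 6) and uses a stack scratch buffer; the activation state is
  now shifted left before the feature matmul rather than after the time
  matmul; the even input_size check in Prepare() is removed.
- Point the HiFi4 NNLib download at xa_nnlib_04_07.zip.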

Signed-off-by: Bhanu Prakash Bandaru Venkata <bhanup@cadence.com>
---
 .../lite/micro/kernels/xtensa_hifi/conv.cc    |   2 +-
 .../kernels/xtensa_hifi/depthwise_conv.cc     |   2 +-
 .../lite/micro/kernels/xtensa_hifi/svdf.cc    | 191 ++++++++----------
 .../tools/make/third_party_downloads.inc      |   4 +-
 4 files changed, 86 insertions(+), 113 deletions(-)
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc b/tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc
index 425cdc58934..e0999750ffb 100755
--- a/tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc
+++ b/tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc
@@ -55,7 +55,7 @@ constexpr int kInputTensor = 0;
 constexpr int kFilterTensor = 1;
 constexpr int kBiasTensor = 2;
 constexpr int kOutputTensor = 0;
-constexpr int kMaxChannels = 8;
+constexpr int kMaxChannels = 256;
 
 // Conv is quantized along dimension 0:
 // https://www.tensorflow.org/lite/performance/quantization_spec
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifi/depthwise_conv.cc b/tensorflow/lite/micro/kernels/xtensa_hifi/depthwise_conv.cc
index 9dd8bd7326f..df26aa05a8c 100755
--- a/tensorflow/lite/micro/kernels/xtensa_hifi/depthwise_conv.cc
+++ b/tensorflow/lite/micro/kernels/xtensa_hifi/depthwise_conv.cc
@@ -58,7 +58,7 @@ constexpr int kFilterTensor = 1;
 constexpr int kBiasTensor = 2;
 constexpr int kOutputTensor = 0;
 // Per channel quantization is not needed for any model on xtensa.
-constexpr int kMaxChannels = 8;
+constexpr int kMaxChannels = 256;
 
 // Depthwise conv is quantized along dimension 3:
 // https://www.tensorflow.org/lite/performance/quantization_spec
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc b/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc
index f21e6d67921..888eda8883a 100644
--- a/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc
+++ b/tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc
@@ -78,14 +78,15 @@ struct OpData {
  */
 
 static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
-    TfLiteContext* context, int batch_size, int memory_size, int num_filters,
-    int num_units, int rank, const TfLiteTensor* weights_time,
-    const TfLiteTensor* bias, TfLiteFusedActivation activation,
-    TfLiteTensor* activation_state, TfLiteTensor* scratch,
-    TfLiteTensor* output) {
-  float* scratch_bias = GetTensorData<float>(scratch);
-  if (bias) {
-    const float* bias_data = GetTensorData<float>(bias);
+    TfLiteContext* context, int batch_size, int memory_size, int num_filters, int num_units, int rank,
+    const float* const __restrict__ weights_time_ptr,
+    const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
+    float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
+    float* const __restrict__ output_ptr) {
+  // Compute matmul(activation_state, weights_time).
+  float* scratch_bias = scratch_ptr;
+  if (bias_ptr) {
+    const float* bias_data = bias_ptr;
     for (int j = 0; j < num_units; ++j) {
       scratch_bias[j] = *bias_data++;
     }
@@ -96,15 +97,16 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
   }
   int err = 0;
   for (int b = 0; b < batch_size; ++b) {
-    const float* weights_time_vec = GetTensorData<float>(weights_time);
+    const float* weights_time_vec = weights_time_ptr;
     const float* mat_ptr =
-        GetTensorData<float>(activation_state) + b * memory_size * num_filters;
-    float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
+        state_ptr + b * memory_size * num_filters;
+    float* output_ptr_batch = output_ptr + b * num_units;
     for (int j = 0; j < num_units; j++) {
       err = xa_nn_matXvec_f32xf32_f32(
           output_ptr_batch, mat_ptr, NULL, weights_time_vec, NULL, scratch_bias,
           1, memory_size * rank, 0, memory_size * rank, 0);
       CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_matXvec_f32xf32_f32 failed");
+
       output_ptr_batch++;
       mat_ptr += memory_size * rank;
       weights_time_vec += memory_size * rank;
@@ -113,30 +115,12 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
 
   // Apply activation.
   for (int b = 0; b < batch_size; ++b) {
-    float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
+    float* output_ptr_batch = output_ptr + b * num_units;
     for (int i = 0; i < num_units; ++i) {
       *output_ptr_batch = ActivationValFloat(activation, *output_ptr_batch);
       ++output_ptr_batch;
     }
   }
-
-  // Left shift the activation_state to make room for next cycle's activation.
-  // (alanchiao): explore collapsing this into a single loop.
-  for (int b = 0; b < batch_size; ++b) {
-    float* state_ptr_batch =
-        GetTensorData<float>(activation_state) + b * memory_size * num_filters;
-    for (int f = 0; f < num_filters; ++f) {
-      // Shift the vector left:
-      float* batch_ptr = state_ptr_batch;
-      float* batch_start = state_ptr_batch + 1;
-      float* batch_end = state_ptr_batch + memory_size;
-      while (batch_start != batch_end) {
-        *batch_ptr++ = *batch_start++;
-      }
-      state_ptr_batch[memory_size - 1] = 0.0f;
-      state_ptr_batch += memory_size;
-    }
-  }
   return kTfLiteOk;
 }
 
@@ -144,8 +128,7 @@ inline TfLiteStatus EvalFloatSVDF(
     TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
     const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
     const TfLiteTensor* bias, const TfLiteSVDFParams* params,
-    TfLiteTensor* scratch, TfLiteTensor* activation_state,
-    TfLiteTensor* output) {
+    TfLiteTensor* activation_state, TfLiteTensor* output) {
   const int rank = params->rank;
   const int batch_size = input->dims->data[0];
   const int input_size = input->dims->data[1];
@@ -153,30 +136,47 @@ inline TfLiteStatus EvalFloatSVDF(
   const int num_units = num_filters / rank;
   const int memory_size = weights_time->dims->data[1];
 
-  // Clear the activation (activation_state's leftmost column).
-  // (ghodrat): Add a test which initialize activation_state with invalid
-  // values in leftmost column and make sure it passes.
-  for (int b = 0; b < batch_size; ++b) {
-    float* state_ptr_batch =
-        GetTensorData<float>(activation_state) + b * memory_size * num_filters;
+  const float* weights_feature_ptr = GetTensorData<float>(weights_feature);
+  const float* weights_time_ptr = GetTensorData<float>(weights_time);
+  const float* bias_ptr = GetTensorData<float>(bias);
+  const float* input_ptr = GetTensorData<float>(input);
+
+  float* state_ptr = GetTensorData<float>(activation_state);
+
+  // TODO(b/132070898): Move this temp variable to the new scratch buffer API
+  // when ready.
+  float scratch_tensor[kScratchTensorMaxSize];
+  float* scratch_ptr = scratch_tensor;
+
+  float* output_ptr = GetTensorData<float>(output);
+
+  // Left shift the activation_state.
+  {
+    float* new_state_start = state_ptr;
+    const float* old_state_start = state_ptr + 1;
+    const float* old_state_end =
+        state_ptr + batch_size * num_filters * memory_size;
+    while (old_state_start != old_state_end) {
+      *new_state_start++ = *old_state_start++;
+    }
   }
 
+  // Note: no need to clear the latest activation, matmul is not accumulative.
+
   // Compute conv1d(inputs, weights_feature).
   // The activation_state's rightmost column is used to save current cycle
-  // activation. This is achieved by starting at
-  // GetTensorData<float>(activation_state)[memory_size - 1] and having the
-  // stride equal to memory_size.
+  // activation. This is achieved by starting at state_ptr[memory_size - 1] and
+  // having the stride equal to memory_size.
 
-  const float* matrix = GetTensorData<float>(weights_feature);
-  const float* vector = GetTensorData<float>(input);
-  float* out_scratch = GetTensorData<float>(scratch);
-  /* NNLib matXvec needs a bias buffer, so using output buffer to
-  avoid need for extra memory, output buffer size is batch * num_units,
-  batch is at least 1 so we use size num_units of it */
-  float* bias_scratch = GetTensorData<float>(output);
-  float* result = &GetTensorData<float>(activation_state)[memory_size - 1];
-  float* result_in_batch = result;
+  // Perform batched matrix vector multiply operation:
+  {
+    const float* matrix = weights_feature_ptr;
+    const float* vector = input_ptr;
+    float* result = &state_ptr[memory_size - 1];
+    float* result_in_batch = result;
 
+  float* out_scratch = scratch_ptr;
+  float* bias_scratch = output_ptr;
   for (int i = 0; i < num_units; i++) bias_scratch[i] = 0.0f;
 
   int err = 0;
@@ -196,11 +196,11 @@ inline TfLiteStatus EvalFloatSVDF(
       result_in_batch += memory_size;
     }
   }
+  }
 
   return ApplyTimeWeightsBiasAndActivation(
-      context, batch_size, memory_size, num_filters, num_units, rank,
-      weights_time, bias, params->activation, activation_state, scratch,
-      output);
+      context, batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr,
+      bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr);
 }
 
 void EvalIntegerSVDF(
@@ -217,24 +217,26 @@ void EvalIntegerSVDF(
   const int n_unit = n_filter / n_rank;
   const int n_memory = weights_time_tensor->dims->data[1];
 
-  // (b/132070898): Move these temp variables to the new scratch buffer API
+  // TODO(b/132070898): Move these temp variables to the new scratch buffer API
   // when ready.
   int32_t scratch_tensor[kScratchTensorMaxSize];
   int32_t scratch_output_tensor[kScratchTensorMaxSize];
 
-  // Rewrite last bit of state.
+  // Shift states.
+  int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
+
+  // Left shift the activation_state.
   {
-    for (int b = 0; b < n_batch; ++b) {
-      int16_t* state_ptr_batch =
-          GetTensorData<int16_t>(activation_state_tensor) +
-          b * n_memory * n_filter;
-      for (int c = 0; c < n_filter; ++c) {
-        int16_t* state_ptr = state_ptr_batch + c * n_memory;
-        state_ptr[n_memory - 1] = 0;
-      }
+    int16_t* new_state_start = state_ptr;
+    const int16_t* old_state_start = state_ptr + 1;
+    const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
+    while (old_state_start != old_state_end) {
+      *new_state_start++ = *old_state_start++;
     }
   }
 
+  // Note: no need to clear the latest activation, matmul is not accumulative.
+
   // Feature matmul.
   {
     int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
@@ -255,6 +257,12 @@ void EvalIntegerSVDF(
         dot_prod =
             MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
         dot_prod = std::min(std::max(output_min, dot_prod), output_max);
+        // This assumes state is symmetrically quantized. Otherwise last bit of
+        // state should be initialized to its zero point and accumulate the
+        // dot_prod.
+        // Equivalent to the following:
+        //     *result_in_batch = zero point, which happens to be zero.
+        //     *result_in_batch += dot_prod.
         *result_in_batch = dot_prod;
         result_in_batch += n_memory;
       }
@@ -326,26 +334,6 @@ void EvalIntegerSVDF(
       GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
     }
   }
-
-  // Shift state.
-  {
-    for (int b = 0; b < n_batch; ++b) {
-      int16_t* state_ptr_batch =
-          GetTensorData<int16_t>(activation_state_tensor) +
-          b * n_memory * n_filter;
-      for (int f = 0; f < n_filter; ++f) {
-        // Shift the vector left:
-        int16_t* batch_ptr = state_ptr_batch;
-        int16_t* batch_start = state_ptr_batch + 1;
-        int16_t* batch_end = state_ptr_batch + n_memory;
-        while (batch_start != batch_end) {
-          *batch_ptr++ = *batch_start++;
-        }
-        state_ptr_batch[n_memory - 1] = 0;
-        state_ptr_batch += n_memory;
-      }
-    }
-  }
 }
 
 }  // namespace
@@ -385,12 +373,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const int rank = params->rank;
   const int input_size = input->dims->data[1];
   const int batch_size = input->dims->data[0];
-  // Ensure the input size is a multiple of two.  This is necessary since
-  // optimized kernels access the memory in chunks of two, and all accesses
-  // must be aligned to 16 bits.
-  // TODO(b/153202598): Remove when padding is allowed in TFLite tensors.
-  TF_LITE_ENSURE_EQ(context, input_size % 2, 0);
-
   const int num_filters = weights_feature->dims->data[0];
   TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
   const int num_units = num_filters / rank;
@@ -446,13 +428,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     // Validate Scratch Tensors:
     // [0] = (shared - see float block below for usage)
     // [1] = Output Temp, int8_t, {2, num_units, batch_size}
-    // (b/132070898): Scratch values are used as stack variables in
+    // TODO(b/132070898): Scratch values are used as stack variables in
     // EvalIntegerSVDF().
 
     // Validate output tensor:
     TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8);
   } else {
-    TF_LITE_ENSURE_EQ(context, node->inputs->size, 6);
+    TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
 
     // Validate Input Tensor dtypes:
     TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
@@ -467,19 +449,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     // [0] = Holds dot-product of time-forward calculations in
     //       ApplyTimeWeightsBiasAndActivation():
     //         float/int32, {2, batch_size, num_filters}
-    // (b/132070898): Use input tensor as variable until scratch tensor
-    // allocation has been implemented (b/132070898) TfLiteTensor*
-    // scratch_tensor = GetTemporary(context, node, 0);
-    TfLiteTensor* scratch_tensor = &context->tensors[node->inputs->data[5]];
-    TF_LITE_ENSURE_EQ(context, scratch_tensor->type, kTfLiteFloat32);
-
-    TF_LITE_ENSURE_EQ(context, NumDimensions(scratch_tensor), 2);
-    TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[0], batch_size);
-    TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[1], num_filters);
+    // TODO(b/132070898): Scratch values are used as stack variables in
+    // EvalFloatSVDF().
 
     // Full-float SVDF only uses the one shared scratch tensor (see above for
     // usage).
-    // (b/132070898): Use input tensor as variable until scratch tensor
+    // TODO(b/132070898): Use input tensor as variable until scratch tensor
     // allocation has been implemented.
     // TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1);
     TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
@@ -505,18 +480,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   switch (weights_feature->type) {
     case kTfLiteFloat32: {
-      // (b/132070898): Use input tensor as variable until scratch tensor
-      // allocation has been implemented. TfLiteTensor* scratch =
-      // GetTemporary(context, node, /*index=*/0);
-      TfLiteTensor* scratch = &context->tensors[node->inputs->data[5]];
-      return EvalFloatSVDF(context, node, input, weights_feature, weights_time,
-                           bias, params, scratch, activation_state, output);
+      // TODO(b/132070898): EvalFloatSVDF() keeps its scratch on the stack until
+      // scratch tensor allocation has been implemented.
+      // TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
+      return EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias,
+                    params, activation_state, output);
       break;
     }
 
     case kTfLiteInt8: {
       if (is_full_integer) {
-        // (b/132070898): Store these values in ::Prepare() instead of
+        // TODO(b/132070898): Store these values in ::Prepare() instead of
         // ::Eval():
         // Calculate effective scales.
         OpData op_data;
@@ -574,7 +548,6 @@ TfLiteRegistration* Register_SVDF() {
                                  /*builtin_code=*/0,
                                  /*custom_name=*/nullptr,
                                  /*version=*/0};
-
   return &r;
 }
 
diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc
index 189d758eb96..30a27c0a758 100644
--- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc
+++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc
@@ -63,6 +63,6 @@ EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776"
 EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip"
 EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56"
 
-XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib.zip"
-XTENSA_HIFI4_MD5 :="a517b653a75b96d0271e1b99ee2a8c14"
+XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
+XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"