Cadence NNLib: Fixed various unit test failures

Fixed the person detection, max pool, and SVDF unit tests: kMaxChannels is
raised from 8 to 256 in the conv and depthwise conv kernels, the SVDF state
handling is reworked for both the float and integer paths, and the HiFi4
NNLib archive is updated to xa_nnlib_04_07.zip.

Signed-off-by: Bhanu Prakash Bandaru Venkata <bhanup@cadence.com>
Niranjan Yadla 2020-04-08 14:14:15 -07:00
parent 5d08924cda
commit cb4f8412f8
4 changed files with 86 additions and 113 deletions

@@ -55,7 +55,7 @@ constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 8;
constexpr int kMaxChannels = 256;
// Conv is quantized along dimension 0:
// https://www.tensorflow.org/lite/performance/quantization_spec

@@ -58,7 +58,7 @@ constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
// Per channel quantization is not needed for any model on xtensa.
constexpr int kMaxChannels = 8;
constexpr int kMaxChannels = 256;
// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
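
For context, kMaxChannels sizes the kernels' per-channel quantization arrays
(the OpData definitions are not shown in these hunks), so raising it from 8 to
256 in the two hunks above lets conv and depthwise conv handle layers with more
than 8 output channels, presumably what the person detection test required.
A minimal sketch of that pattern, assuming the usual TFLite Micro OpData
layout; the struct and field names below are illustrative, not the file's
actual definitions:

#include <cstdint>

constexpr int kMaxChannels = 256;

// Hypothetical sketch: per-channel quantization parameters live in fixed-size
// arrays, so kMaxChannels caps how many output channels a layer may have.
struct OpDataSketch {
  int32_t per_channel_output_multiplier[kMaxChannels];
  int32_t per_channel_output_shift[kMaxChannels];
  int32_t output_activation_min;
  int32_t output_activation_max;
};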

@@ -78,14 +78,15 @@ struct OpData {
*/
static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
TfLiteContext* context, int batch_size, int memory_size, int num_filters,
int num_units, int rank, const TfLiteTensor* weights_time,
const TfLiteTensor* bias, TfLiteFusedActivation activation,
TfLiteTensor* activation_state, TfLiteTensor* scratch,
TfLiteTensor* output) {
float* scratch_bias = GetTensorData<float>(scratch);
if (bias) {
const float* bias_data = GetTensorData<float>(bias);
TfLiteContext* context, int batch_size, int memory_size, int num_filters, int num_units, int rank,
const float* const __restrict__ weights_time_ptr,
const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
float* const __restrict__ output_ptr) {
// Compute matmul(activation_state, weights_time).
float* scratch_bias = scratch_ptr;
if (bias_ptr) {
const float* bias_data = bias_ptr;
for (int j = 0; j < num_units; ++j) {
scratch_bias[j] = *bias_data++;
}
@@ -96,15 +97,16 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
}
int err = 0;
for (int b = 0; b < batch_size; ++b) {
const float* weights_time_vec = GetTensorData<float>(weights_time);
const float* weights_time_vec = weights_time_ptr;
const float* mat_ptr =
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
state_ptr + b * memory_size * num_filters;
float* output_ptr_batch = output_ptr + b * num_units;
for (int j = 0; j < num_units; j++) {
err = xa_nn_matXvec_f32xf32_f32(
output_ptr_batch, mat_ptr, NULL, weights_time_vec, NULL, scratch_bias,
1, memory_size * rank, 0, memory_size * rank, 0);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_matXvec_f32xf32_f32 failed");
output_ptr_batch++;
mat_ptr += memory_size * rank;
weights_time_vec += memory_size * rank;
@@ -113,30 +115,12 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
// Apply activation.
for (int b = 0; b < batch_size; ++b) {
float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
float* output_ptr_batch = output_ptr + b * num_units;
for (int i = 0; i < num_units; ++i) {
*output_ptr_batch = ActivationValFloat(activation, *output_ptr_batch);
++output_ptr_batch;
}
}
// Left shift the activation_state to make room for next cycle's activation.
// (alanchiao): explore collapsing this into a single loop.
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch =
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
for (int f = 0; f < num_filters; ++f) {
// Shift the vector left:
float* batch_ptr = state_ptr_batch;
float* batch_start = state_ptr_batch + 1;
float* batch_end = state_ptr_batch + memory_size;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch[memory_size - 1] = 0.0f;
state_ptr_batch += memory_size;
}
}
return kTfLiteOk;
}
@@ -144,8 +128,7 @@ inline TfLiteStatus EvalFloatSVDF(
TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
const TfLiteTensor* bias, const TfLiteSVDFParams* params,
TfLiteTensor* scratch, TfLiteTensor* activation_state,
TfLiteTensor* output) {
TfLiteTensor* activation_state, TfLiteTensor* output) {
const int rank = params->rank;
const int batch_size = input->dims->data[0];
const int input_size = input->dims->data[1];
@@ -153,30 +136,47 @@ inline TfLiteStatus EvalFloatSVDF(
const int num_units = num_filters / rank;
const int memory_size = weights_time->dims->data[1];
// Clear the activation (activation_state's leftmost column).
// (ghodrat): Add a test which initialize activation_state with invalid
// values in leftmost column and make sure it passes.
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch =
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
const float* weights_feature_ptr = GetTensorData<float>(weights_feature);
const float* weights_time_ptr = GetTensorData<float>(weights_time);
const float* bias_ptr = GetTensorData<float>(bias);
const float* input_ptr = GetTensorData<float>(input);
float* state_ptr = GetTensorData<float>(activation_state);
// TODO(b/132070898): Move this temp variable to the new scratch buffer API
// when ready.
float scratch_tensor[kScratchTensorMaxSize];
float* scratch_ptr = scratch_tensor;
float* output_ptr = GetTensorData<float>(output);
// Left shift the activation_state.
{
float* new_state_start = state_ptr;
const float* old_state_start = state_ptr + 1;
const float* old_state_end =
state_ptr + batch_size * num_filters * memory_size;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Compute conv1d(inputs, weights_feature).
// The activation_state's rightmost column is used to save current cycle
// activation. This is achieved by starting at
// GetTensorData<float>(activation_state)[memory_size - 1] and having the
// stride equal to memory_size.
// activation. This is achieved by starting at state_ptr[memory_size - 1] and
// having the stride equal to memory_size.
const float* matrix = GetTensorData<float>(weights_feature);
const float* vector = GetTensorData<float>(input);
float* out_scratch = GetTensorData<float>(scratch);
/* NNLib matXvec needs a bias buffer, so using output buffer to
avoid need for extra memory, output buffer size is batch * num_units,
batch is at least 1 so we use size num_units of it */
float* bias_scratch = GetTensorData<float>(output);
float* result = &GetTensorData<float>(activation_state)[memory_size - 1];
float* result_in_batch = result;
// Perform batched matrix vector multiply operation:
{
const float* matrix = weights_feature_ptr;
const float* vector = input_ptr;
float* result = &state_ptr[memory_size - 1];
float* result_in_batch = result;
float* out_scratch = scratch_ptr;
float* bias_scratch = output_ptr;
for (int i = 0; i < num_units; i++) bias_scratch[i] = 0.0f;
int err = 0;
@@ -196,11 +196,11 @@ inline TfLiteStatus EvalFloatSVDF(
result_in_batch += memory_size;
}
}
}
return ApplyTimeWeightsBiasAndActivation(
context, batch_size, memory_size, num_filters, num_units, rank,
weights_time, bias, params->activation, activation_state, scratch,
output);
context, batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr,
bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr);
}
void EvalIntegerSVDF(
@@ -217,24 +217,26 @@ void EvalIntegerSVDF(
const int n_unit = n_filter / n_rank;
const int n_memory = weights_time_tensor->dims->data[1];
// (b/132070898): Move these temp variables to the new scratch buffer API
// TODO(b/132070898): Move these temp variables to the new scratch buffer API
// when ready.
int32_t scratch_tensor[kScratchTensorMaxSize];
int32_t scratch_output_tensor[kScratchTensorMaxSize];
// Rewrite last bit of state.
// Shift states.
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
// Left shift the activation_state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int c = 0; c < n_filter; ++c) {
int16_t* state_ptr = state_ptr_batch + c * n_memory;
state_ptr[n_memory - 1] = 0;
}
int16_t* new_state_start = state_ptr;
const int16_t* old_state_start = state_ptr + 1;
const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Feature matmul.
{
int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
@@ -255,6 +257,12 @@ void EvalIntegerSVDF(
dot_prod =
MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
dot_prod = std::min(std::max(output_min, dot_prod), output_max);
// This assumes state is symmetrically quantized. Otherwise last bit of
// state should be initialized to its zero point and accumulate the
// dot_prod.
// Equivalent as the following:
// result_in_batch = zero point, which happens to be zero.
// result_in_batch += dot_prod_56.
*result_in_batch = dot_prod;
result_in_batch += n_memory;
}
@@ -326,26 +334,6 @@ void EvalIntegerSVDF(
GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
}
}
// Shift state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int f = 0; f < n_filter; ++f) {
// Shift the vector left:
int16_t* batch_ptr = state_ptr_batch;
int16_t* batch_start = state_ptr_batch + 1;
int16_t* batch_end = state_ptr_batch + n_memory;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch[n_memory - 1] = 0;
state_ptr_batch += n_memory;
}
}
}
}
} // namespace
@@ -385,12 +373,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const int rank = params->rank;
const int input_size = input->dims->data[1];
const int batch_size = input->dims->data[0];
// Ensure the input size is a multiple of two. This is necessary since
// optimized kernels access the memory in chunks of two, and all accesses
// must be aligned to 16 bits.
// TODO(b/153202598): Remove when padding is allowed in TFLite tensors.
TF_LITE_ENSURE_EQ(context, input_size % 2, 0);
const int num_filters = weights_feature->dims->data[0];
TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
const int num_units = num_filters / rank;
@@ -446,13 +428,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Validate Scratch Tensors:
// [0] = (shared - see float block below for usage)
// [1] = Output Temp, int8_t, {2, num_units, batch_size}
// (b/132070898): Scratch values are used as stack variables in
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().
// Validate output tensor:
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8);
} else {
TF_LITE_ENSURE_EQ(context, node->inputs->size, 6);
TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
// Validate Input Tensor dtypes:
TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
@@ -467,19 +449,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// [0] = Holds dot-product of time-forward calculations in
// ApplyTimeWeightsBiasAndActivation():
// float/int32, {2, batch_size, num_filters}
// (b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented (b/132070898) TfLiteTensor*
// scratch_tensor = GetTemporary(context, node, 0);
TfLiteTensor* scratch_tensor = &context->tensors[node->inputs->data[5]];
TF_LITE_ENSURE_EQ(context, scratch_tensor->type, kTfLiteFloat32);
TF_LITE_ENSURE_EQ(context, NumDimensions(scratch_tensor), 2);
TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[0], batch_size);
TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[1], num_filters);
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().
// Full-float SVDF only uses the one shared scratch tensor (see above for
// usage).
// (b/132070898): Use input tensor as variable until scratch tensor
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1);
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
@@ -505,18 +480,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
switch (weights_feature->type) {
case kTfLiteFloat32: {
// (b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented. TfLiteTensor* scratch =
// GetTemporary(context, node, /*index=*/0);
TfLiteTensor* scratch = &context->tensors[node->inputs->data[5]];
return EvalFloatSVDF(context, node, input, weights_feature, weights_time,
bias, params, scratch, activation_state, output);
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
return EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias,
params, activation_state, output);
break;
}
case kTfLiteInt8: {
if (is_full_integer) {
// (b/132070898): Store these values in ::Prepare() instead of
// TODO(b/132070898): Store these values in ::Prepare() instead of
// ::Eval():
// Calculate effective scales.
OpData op_data;
@@ -574,7 +548,6 @@ TfLiteRegistration* Register_SVDF() {
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
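
Both the float and the integer SVDF paths above now perform the same single
left shift over the entire activation state before the feature matmul, instead
of shifting each filter's window in a separate loop after the output is
written. A minimal standalone sketch of that shift, assuming a float state laid
out as [batch, filter, memory]; ShiftStateLeft is an illustrative name, not a
function in the file:

// Shift every element of the activation state one slot toward the front.
// The rightmost slot of each filter window is overwritten afterwards by the
// current cycle's activation (stored with stride memory_size), so nothing
// needs to be cleared here.
void ShiftStateLeft(float* state, int batch_size, int num_filters,
                    int memory_size) {
  float* dst = state;
  const float* src = state + 1;
  const float* end = state + batch_size * num_filters * memory_size;
  while (src != end) {
    *dst++ = *src++;
  }
}

The integer path does the same over the int16_t state before its feature
matmul.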

@@ -63,6 +63,6 @@ EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776"
EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip"
EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib.zip"
XTENSA_HIFI4_MD5 :="a517b653a75b96d0271e1b99ee2a8c14"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"