SVDF: Do state shifting and clearing of last state separately, so state shifting can be done in a single loop.

PiperOrigin-RevId: 298417000
Change-Id: Ia8a6756ffcc1bb7f761755461c085c55511602bc
Robert David 2020-03-02 12:30:31 -08:00 committed by TensorFlower Gardener
parent 7c12dad5fd
commit ae2fe82448
3 changed files with 60 additions and 80 deletions
tensorflow/lite/
  kernels/internal/reference/
  micro/kernels/
    svdf.cc
    xtensa_hifimini/
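For readers skimming the diff, the heart of the change: the old kernels shifted the activation state one memory_size-long row at a time (VectorShiftLeft per batch and filter), while the new code shifts the entire flattened state buffer in one pass and handles clearing of the last column as a separate step. A minimal sketch of both variants, assuming a float state buffer and illustrative function names (ShiftStatePerRow, ShiftStateFlat) rather than the actual kernel code:

#include <algorithm>

// Old approach: shift each memory_size-long row of the activation state on
// its own, padding with a zero (one inner shift per batch and per filter).
void ShiftStatePerRow(float* state, int batch_size, int num_filters,
                      int memory_size) {
  for (int b = 0; b < batch_size; ++b) {
    for (int f = 0; f < num_filters; ++f) {
      float* row = state + (b * num_filters + f) * memory_size;
      for (int i = 0; i < memory_size - 1; ++i) row[i] = row[i + 1];
      row[memory_size - 1] = 0.0f;
    }
  }
}

// New approach: shift the whole flattened buffer left by one in a single
// pass, then clear the rightmost column separately. The integer kernels skip
// the clearing loop entirely, since the feature matmul that follows assigns
// (rather than accumulates into) that column.
void ShiftStateFlat(float* state, int batch_size, int num_filters,
                    int memory_size) {
  const int total = batch_size * num_filters * memory_size;
  std::copy(state + 1, state + total, state);  // overlap is fine: dst < src
  for (int r = 0; r < batch_size * num_filters; ++r) {
    state[r * memory_size + memory_size - 1] = 0.0f;
  }
}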


@@ -85,28 +85,23 @@ inline void EvalIntegerSVDF(
const int n_unit = n_filter / n_rank;
const int n_memory = weights_time_tensor->dims->data[1];
// Shift state.
{
int16_t zero = 0;
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(state_tensor) + b * n_memory * n_filter;
for (int f = 0; f < n_filter; ++f) {
tensor_utils::VectorShiftLeft(state_ptr_batch, n_memory, zero);
state_ptr_batch += n_memory;
}
}
}
int16_t* const state_ptr = GetTensorData<int16_t>(state_tensor);
// Left shift the activation_state.
// std::copy is fine for overlapping ranges if the output is outside of the
// input range. (This is not true for copy_n.)
std::copy(state_ptr + 1, state_ptr + n_batch * n_memory * n_filter,
state_ptr);
// Feature matmul.
// Note: no need to clear the latest activation, matmul is not accumulative.
{
int16_t* state = GetTensorData<int16_t>(state_tensor);
const int8_t* input = GetTensorData<int8_t>(input_tensor);
const int8_t* weight_feature =
GetTensorData<int8_t>(weights_feature_tensor);
const int32_t output_max = std::numeric_limits<int16_t>::max();
const int32_t output_min = std::numeric_limits<int16_t>::min();
int16_t* result_in_batch = state + (n_memory - 1);
int16_t* result_in_batch = state_ptr + (n_memory - 1);
for (int b = 0; b < n_batch; b++) {
const int8_t* matrix_ptr = weight_feature;
for (int r = 0; r < n_filter; r++) {
@@ -133,8 +128,7 @@ inline void EvalIntegerSVDF(
// Time.
{
for (int b = 0; b < n_batch; ++b) {
const int16_t* state_ptr_batch =
GetTensorData<int16_t>(state_tensor) + b * n_memory * n_filter;
const int16_t* state_ptr_batch = state_ptr + b * n_memory * n_filter;
int32_t* scratch_ptr_batch =
GetTensorData<int32_t>(scratch_tensor) + b * n_filter;
tensor_utils::BatchVectorBatchVectorDotProduct(
@@ -199,15 +193,15 @@ inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node,
float* output_ptr = GetTensorData<float>(output);
// Left shift the activation_state, and clear the latest activation (the
// rightmost column).
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
for (int f = 0; f < num_filters; ++f) {
tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size,
/*shift_value=*/0.0f);
state_ptr_batch += memory_size;
}
// Left shift the activation_state.
// std::copy is fine for overlapping ranges if the output is outside of the
// input range. (This is not true for copy_n.)
std::copy(state_ptr + 1, state_ptr + batch_size * memory_size * num_filters,
state_ptr);
// Clear the latest activation (the rightmost column).
for (int i = 0; i < batch_size * num_filters; ++i) {
state_ptr[i * memory_size + memory_size - 1] = 0.0f;
}
// Compute conv1d(inputs, weights_feature).
@@ -252,15 +246,15 @@ inline void EvalHybridSVDF(
// Initialize the weights scale.
const float weights_feature_scale = weights_feature->params.scale;
// Left shift the activation_state, and clear the latest activation (the
// rightmost column).
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
for (int f = 0; f < num_filters; ++f) {
tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size,
/*shift_value=*/0.0f);
state_ptr_batch += memory_size;
}
// Left shift the activation_state.
// std::copy is fine for overlapping ranges if the output is outside of the
// input range. (This is not true for copy_n.)
std::copy(state_ptr + 1, state_ptr + batch_size * memory_size * num_filters,
state_ptr);
// Clear the latest activation (the rightmost column).
for (int i = 0; i < batch_size * num_filters; ++i) {
state_ptr[i * memory_size + memory_size - 1] = 0.0f;
}
if (!tensor_utils::IsZeroVector(input_ptr, batch_size * input_size)) {
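The comment introduced above relies on a guarantee from the C++ standard: std::copy is allowed when the destination begins before the source range (as in a left shift over overlapping memory), whereas std::copy_n makes no such promise. A small standalone illustration of the pattern, not TF code:

#include <algorithm>
#include <cstdio>

int main() {
  // Overlapping left shift: the destination (v) starts before the source
  // range (v + 1, v + 6), so std::copy is well-defined here.
  int v[6] = {1, 2, 3, 4, 5, 6};
  std::copy(v + 1, v + 6, v);  // v is now {2, 3, 4, 5, 6, 6}
  v[5] = 0;                    // clear the stale last slot, as the float path does
  for (int x : v) std::printf("%d ", x);  // prints: 2 3 4 5 6 0
  std::printf("\n");
  return 0;
}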


@@ -142,19 +142,16 @@ inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node,
float* output_ptr = GetTensorData<float>(output);
// Left shift the activation_state.
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
for (int f = 0; f < num_filters; ++f) {
// Shift the vector left:
float* batch_ptr = state_ptr_batch;
float* batch_start = state_ptr_batch + 1;
float* batch_end = state_ptr_batch + memory_size;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch += memory_size;
{
float* new_state_start = state_ptr;
const float* old_state_start = state_ptr + 1;
const float* old_state_end =
state_ptr + batch_size * num_filters * memory_size;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Compute conv1d(inputs, weights_feature).
@@ -206,25 +203,21 @@ void EvalIntegerSVDF(
int32_t scratch_tensor[kScratchTensorMaxSize];
int32_t scratch_output_tensor[kScratchTensorMaxSize];
// Shift states. No need to set last state, the matmul is not accumulative.
// Shift states.
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
// Left shift the activation_state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int f = 0; f < n_filter; ++f) {
// Shift the vector left:
int16_t* batch_ptr = state_ptr_batch;
int16_t* batch_start = state_ptr_batch + 1;
int16_t* batch_end = state_ptr_batch + n_memory;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch += n_memory;
}
int16_t* new_state_start = state_ptr;
const int16_t* old_state_start = state_ptr + 1;
const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Feature matmul.
{
int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
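The micro kernel above (and the Xtensa one below) expresses the same single-pass shift as a raw pointer loop instead of std::copy, which keeps the code free of <algorithm>; the behavior is the same as an overlapping forward copy. A sketch of that loop in isolation, with illustrative naming:

#include <cstdint>

// Shift an int16_t activation-state buffer of n_batch * n_filter * n_memory
// elements left by one, copying front to back so the overlap is safe.
void ShiftStateLeft(int16_t* state, int n_batch, int n_filter, int n_memory) {
  int16_t* dst = state;
  const int16_t* src = state + 1;
  const int16_t* end = state + n_batch * n_filter * n_memory;
  while (src != end) {
    *dst++ = *src++;
  }
  // The last element is deliberately left stale: the feature matmul that
  // follows writes the newest activation into that slot, so clearing it
  // would be wasted work.
}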


@@ -75,32 +75,27 @@ void EvalIntegerSVDF(
int32_t scratch_tensor[kScratchTensorMaxSize];
int32_t scratch_output_tensor[kScratchTensorMaxSize];
// Shift states. No need to set last state, the matmul is not accumulative.
// Shift states.
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
// Left shift the activation_state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int f = 0; f < n_filter; ++f) {
// Shift the vector left:
int16_t* batch_ptr = state_ptr_batch;
int16_t* batch_start = state_ptr_batch + 1;
int16_t* batch_end = state_ptr_batch + n_memory;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch += n_memory;
}
int16_t* new_state_start = state_ptr;
const int16_t* old_state_start = state_ptr + 1;
const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Feature matmul.
{
int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
const int8_t* input = GetTensorData<int8_t>(input_tensor);
const int8_t* weight_feature =
GetTensorData<int8_t>(weights_feature_tensor);
int16_t* result_in_batch = state + (n_memory - 1);
int16_t* result_in_batch = state_ptr + (n_memory - 1);
ae_q56s output_int16_max_56 = AE_CVTQ48A32S(INT16_MAX);
ae_q56s output_int16_min_56 = AE_CVTQ48A32S(INT16_MIN);
@@ -170,9 +165,7 @@ void EvalIntegerSVDF(
// Perform batched vector dot product:
const int16_t* vector1_ptr = GetTensorData<int16_t>(weights_time_tensor);
const int16_t* vector2_ptr =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
const int16_t* vector2_ptr = state_ptr + b * n_memory * n_filter;
int num_iters = n_filter / 2;
const ae_p16x2s* offset_vector1 = (const ae_p16x2s*)(vector1_ptr - 2);
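One subtlety the single-pass shift introduces: it also drags the first element of each row into the last slot of the previous row, so the rightmost column briefly holds values from the neighboring row. That is harmless because the feature matmul (integer paths) or the explicit clear loop (float and hybrid paths) overwrites that column immediately afterwards. A tiny worked example with made-up values, n_batch = 1, n_filter = 2, n_memory = 3:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Two filter rows of three activations each, laid out back to back.
  int16_t state[6] = {10, 11, 12,   // filter 0: m0 m1 m2
                      20, 21, 22};  // filter 1: m0 m1 m2
  std::copy(state + 1, state + 6, state);
  // state is now {11, 12, 20, 21, 22, 22}: filter 0's last slot picked up
  // filter 1's old m0, and filter 1's last slot is stale. Both are rewritten
  // right away by the matmul or the clearing loop, so nothing leaks through.
  for (int16_t x : state) std::printf("%d ", x);
  std::printf("\n");
  return 0;
}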