Remove handwritten NEON code to allow the compiler to generate better NEON code.

Change VectorCwiseProduct functions to a template so they can work with any type. Clang generates a slightly faster binary for ARM small cores (~1% faster LSTM op performance on ARM Cortex A55, no measurable difference for Cortex A76), GCC generates similar assembly to handwritten NEON. PiperOrigin-RevId: 286954815 Change-Id: I9bc4898f18e9a94538432ae550facaef40a4656e
2019-12-23 16:02:52 -08:00 · 2019-12-23 16:02:52 -08:00 · f92fc5d442
commit f92fc5d442
parent 734dfee862
8 changed files with 17 additions and 125 deletions
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
@ -1817,54 +1817,6 @@ void NeonSparseMatrixBatchVectorMultiplyAccumulate(
  free(aligned_vec_free);
 }

-void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                                  int v_size, float* result) {
-  // If v_size is not divisible by the vector size, then we need to process the
-  // final few elements sequentially. postamble_start shows the start index
-  // where this should happen.
-  const int postamble_start =
-      RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
-  int v = 0;
-  for (; v < postamble_start; v += kFloatValuesPerNeonVector) {
-    // Load 4 float values from vector1 and vector2.
-    const float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
-    const float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
-    // Vector multiply 4 float
-    const float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
-    // Save to result array.
-    vst1q_f32(result + v, mul_32x4);
-  }
-#pragma clang loop vectorize(disable) unroll(disable)
-  for (; v < v_size; v++) {
-    result[v] = vector1[v] * vector2[v];
-  }
-}
-
-void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
-                                            const float* vector2, int v_size,
-                                            float* result) {
-  // If v_size is not divisible by the vector size, then we need to process the
-  // final few elements sequentially. postamble_start shows the start index
-  // where this should happen.
-  const int postamble_start =
-      RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
-  int v = 0;
-  for (; v < postamble_start; v += kFloatValuesPerNeonVector) {
-    // Load 4 float values from vector1 and vector2 and accumulator.
-    const float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
-    const float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
-    float32x4_t acc_32x4 = vld1q_f32(result + v);
-    // Vector multiply-accumulate 4 float
-    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
-    // Save to result array.
-    vst1q_f32(result + v, acc_32x4);
-  }
-#pragma clang loop vectorize(disable) unroll(disable)
-  for (; v < v_size; v++) {
-    result[v] += vector1[v] * vector2[v];
-  }
-}
-
 void NeonSub1Vector(const float* vector, int v_size, float* result) {
  // If v_size is not divisible by the vector size, then we need to process the
  // final few elements sequentially. postamble_start shows the start index
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
@ -142,11 +142,6 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
  NEON_OR_PORTABLE(CwiseClipping, input, clipping_value, n_batch, n_input);
 }

-void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                              int v_size, float* result) {
-  NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
-}
-
 void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result,
@ -155,13 +150,6 @@ void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
      vector1, vector2, v_size, n_batch, result, result_stride);
 }

-void VectorVectorCwiseProductAccumulate(const float* vector1,
-                                        const float* vector2, int v_size,
-                                        float* result) {
-  NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
-                   result);
-}
-
 void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h
@ -107,16 +107,6 @@ void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

-// Cwise product of two vectors.
-void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                                  int v_size, float* result);
-
-// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
-// assumption here is that result array is initialized to valid values.
-void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
-                                            const float* vector2, int v_size,
-                                            float* result);
-
 // Dot product of two vectors.
 float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size);
--- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h
@ -152,11 +152,6 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
  PortableCwiseClipping(input, clipping_value, n_batch, n_input);
 }

-void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                              int v_size, float* result) {
-  NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
-}
-
 void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result,
@ -165,13 +160,6 @@ void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
      vector1, vector2, v_size, n_batch, result, result_stride);
 }

-void VectorVectorCwiseProductAccumulate(const float* vector1,
-                                        const float* vector2, int v_size,
-                                        float* result) {
-  NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
-                   result);
-}
-
 void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@ -504,14 +504,6 @@ void PortableCwiseClipping(int8_t* input, const int8_t clipping_value,
  }
 }

-void PortableVectorVectorCwiseProduct(const float* vector1,
-                                      const float* vector2, int v_size,
-                                      float* result) {
-  for (int v = 0; v < v_size; v++) {
-    result[v] = vector1[v] * vector2[v];
-  }
-}
-
 float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size) {
  float result = 0.0;
@ -545,14 +537,6 @@ void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
  }
 }

-void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
-                                                const float* vector2,
-                                                int v_size, float* result) {
-  for (int v = 0; v < v_size; v++) {
-    result[v] += vector1[v] * vector2[v];
-  }
-}
-
 void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result) {
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
@ -176,17 +176,6 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
  PortableCwiseClipping(input, clipping_value, n_batch, n_input);
 }

-void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                              int v_size, float* result) {
-  PortableVectorVectorCwiseProduct(vector1, vector2, v_size, result);
-}
-
-void VectorVectorCwiseProductAccumulate(const float* vector1,
-                                        const float* vector2, int v_size,
-                                        float* result) {
-  PortableVectorVectorCwiseProductAccumulate(vector1, vector2, v_size, result);
-}
-
 void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
@ -78,17 +78,6 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

-// Cwise product of two vectors.
-void PortableVectorVectorCwiseProduct(const float* vector1,
-                                      const float* vector2, int v_size,
-                                      float* result);
-
-// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
-// assumption here is that result array is initialized to valid values.
-void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
-                                                const float* vector2,
-                                                int v_size, float* result);
-
 // Dot product of two vectors.
 float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);
--- a/tensorflow/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/tensor_utils.h
@ -314,14 +314,26 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
                   int32_t n_input);

 // Cwise product of two vectors.
-void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                              int v_size, float* result);
+template <typename T>
+inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
+                                     const T* __restrict__ vector2, int v_size,
+                                     T* __restrict__ result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = *vector1++ * *vector2++;
+  }
+}

 // Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
 // assumption here is that result array is initialized to valid values.
-void VectorVectorCwiseProductAccumulate(const float* vector1,
-                                        const float* vector2, int v_size,
-                                        float* result);
+template <typename T>
+inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
+                                               const T* __restrict__ vector2,
+                                               int v_size,
+                                               T* __restrict__ result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ += *vector1++ * *vector2++;
+  }
+}

 // Dot product of two vectors.
 float VectorVectorDotProduct(const float* vector1, const float* vector2,