From 731981feba0fe1aac0dbeea75716b9fea624aeb9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 17 Jun 2019 12:43:10 -0700
Subject: [PATCH] Refactor *_tensor_utils_impl.h files: Both portable and neon
 versions now only declare Portable* and Neon* functions, and
 *_tensor_utils.cc files include only the appropriate headers;
 *_tensor_utils.h includes all of them.

This allows us to easily add SSE (and other CPU-specific targets) without
polluting the compilation with Sse* functions on an ARM target.

PiperOrigin-RevId: 253638991
---
 tensorflow/lite/kernels/internal/BUILD        |   9 +-
 .../internal/optimized/neon_tensor_utils.cc   |   6 +-
 .../internal/optimized/neon_tensor_utils.h    |   3 +-
 .../optimized/neon_tensor_utils_impl.h        | 130 ++++++++++++++++++
 .../reference/portable_tensor_utils.cc        |   6 +-
 .../reference/portable_tensor_utils.h         | 121 +---------------
 .../portable_tensor_utils_impl.h}             | 112 ++++-----------
 7 files changed, 170 insertions(+), 217 deletions(-)
 create mode 100644 tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h
 rename tensorflow/lite/kernels/internal/{optimized/tensor_utils_impl.h => reference/portable_tensor_utils_impl.h} (62%)
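For context on the dispatch this layout enables: the architecture-neutral
wrappers can now pick an implementation purely through the preprocessor. A
minimal sketch of the pattern, using the IsZeroVector entry point that both
impl headers declare (illustrative only; the real wrappers live in
neon_tensor_utils.h / portable_tensor_utils.h and route through cpu_check.h):

    // Sketch: an arch-neutral wrapper forwarding to whichever impl header
    // the build pulled in. USE_NEON is defined when __ARM_NEON__/__ARM_NEON
    // is set (see neon_tensor_utils_impl.h below).
    #ifdef USE_NEON
    inline bool IsZeroVector(const float* vector, int v_size) {
      return NeonIsZeroVector(vector, v_size);
    }
    #else
    inline bool IsZeroVector(const float* vector, int v_size) {
      return PortableIsZeroVector(vector, v_size);
    }
    #endif

An SSE build would add its own Sse* impl header and another branch here, so
ARM translation units never see any Sse* declaration.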
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 63eefbf6ef2..b413992cce1 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -476,6 +476,7 @@ cc_library(
     ],
     hdrs = [
         "reference/portable_tensor_utils.h",
+        "reference/portable_tensor_utils_impl.h",
     ],
     deps = [
         ":compatibility",
@@ -491,18 +492,17 @@ cc_library(
     name = "neon_tensor_utils",
     srcs = [
         "optimized/neon_tensor_utils.cc",
-        "reference/portable_tensor_utils.cc",
-        "reference/portable_tensor_utils.h",
     ],
     hdrs = [
         "optimized/neon_tensor_utils.h",
-        "optimized/tensor_utils_impl.h",
+        "optimized/neon_tensor_utils_impl.h",
    ],
     copts = NEON_FLAGS_IF_APPLICABLE + HARD_FP_FLAGS_IF_APPLICABLE,
     deps = [
         ":common",
         ":compatibility",
         ":cpu_check",
+        ":portable_tensor_utils",
         ":round",
         ":types",
         "//tensorflow/lite/c:c_api_internal",
@@ -551,9 +551,6 @@ cc_library(
         "tensor_utils.cc",
     ],
     hdrs = [
-        "optimized/neon_tensor_utils.h",
-        "optimized/tensor_utils_impl.h",
-        "reference/portable_tensor_utils.h",
         "tensor_utils.h",
     ],
     copts = NEON_FLAGS_IF_APPLICABLE,
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
index 54585cc0dee..389f25ccf9e 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
 #include 
-#include 
-#include 
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
-#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 
 #ifdef USE_NEON
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
index 71d67903347..a74d544a79d 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -19,7 +19,8 @@ limitations under the License.
 // structure.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
-#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
 
 namespace tflite {
 namespace tensor_utils {
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h
new file mode 100644
index 00000000000..71ac15556ee
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h
@@ -0,0 +1,130 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
+
+// TODO(ghodrat): Remove this header file and the dependency to internal data
+// structure.
+#include "tensorflow/lite/c/builtin_op_data.h"
+
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif  // USE_NEON
+
+namespace tflite {
+namespace tensor_utils {
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector.
+void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                             int m_cols, const float* vector,
+                                             int n_batch, float* result,
+                                             int result_stride);
+
+// Matrix multiplication for quantized values using symmetric quantization.
+void NeonMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors, const float* scaling_factors,
+    int n_batch, float* __restrict__ result, int result_stride);
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector. Sparse version.
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
+    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
+    float* __restrict__ result, int result_stride);
+
+// Matrix multiplication for quantized values using symmetric quantization.
+// Sparse version.
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
+// Cwise product of two vectors.
+void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                                  int v_size, float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that result array is initialized to valid values.
+void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
+                                            const float* vector2, int v_size,
+                                            float* result);
+
+// Dot product of two vectors.
+float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                 int v_size);
+
+// Dot product of two batch vectors.
+void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
+                                          const float* vector2, int v_size,
+                                          int n_batch, float* result,
+                                          int result_stride);
+
+// Cwise product of a vector and a batch-vector.
+void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
+                                       const float* batch_vector, int n_batch,
+                                       float* result);
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a
+// MAC operation, the assumption here is that result array is initialized to
+// valid values.
+void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                                 int v_size,
+                                                 const float* batch_vector,
+                                                 int n_batch, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void NeonSub1Vector(const float* vector, int v_size, float* result);
+
+// Clip elements of a vector using a abs_limit value.
+void NeonClipVector(const float* vector, int v_size, float abs_limit,
+                    float* result);
+
+// Multiply all elements of vector with a scalar.
+void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
+                              float* result);
+
+// Check if all entries of a vector are zero.
+bool NeonIsZeroVector(const float* vector, int v_size);
+
+// Symmetric quantizer.
+void NeonSymmetricQuantizeFloats(const float* values, const int size,
+                                 int8_t* quantized_values, float* min,
+                                 float* max, float* scaling_factor);
+
+// Shift left a vector in place with v_size size.
+void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void NeonReductionSumVector(const float* input_vector, float* output_vector,
+                            int output_size, int reduction_size);
+
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
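The reduce-sum contract documented at the end of the new header is the one
every backend has to match. A minimal portable sketch of the behavior the
comment describes (illustrative only; assumes the input holds
output_size * reduction_size elements):

    // Each output element is the sum of `reduction_size` consecutive input
    // elements, per the comment on NeonReductionSumVector above.
    void ReductionSumVectorSketch(const float* input_vector,
                                  float* output_vector, int output_size,
                                  int reduction_size) {
      for (int o = 0; o < output_size; ++o) {
        float sum = 0.f;
        for (int i = 0; i < reduction_size; ++i) {
          sum += input_vector[o * reduction_size + i];
        }
        output_vector[o] = sum;
      }
    }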
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
index d0d97aa06e6..472425e8e0a 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include 
-#include 
-
 #include 
 #include 
+#include 
+#include 
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 #include "tensorflow/lite/kernels/op_macros.h"
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
index fa737ecae69..28ca98160cd 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -18,6 +18,7 @@ limitations under the License.
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
 
 #if defined(_MSC_VER)
 #define __restrict__ __restrict
@@ -26,126 +27,6 @@ limitations under the License.
 namespace tflite {
 namespace tensor_utils {
 
-// Limit a float input f between +abs_limit and -abs_limit.
-float PortableClip(float f, float abs_limit);
-
-bool PortableIsZeroVector(const float* vector, int v_size);
-
-void PortableSymmetricQuantizeFloats(const float* values, const int size,
-                                     int8_t* quantized_values, float* min_value,
-                                     float* max_value, float* scaling_factor);
-
-// Multiply a matrix by a batch vector, and store results in a batch-size
-// vector.
-void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
-                                                 int m_rows, int m_cols,
-                                                 const float* vector,
-                                                 int n_batch, float* result,
-                                                 int result_stride);
-
-void PortableMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
-    const int8_t* __restrict__ vectors, const float* scaling_factors,
-    int n_batch, float* __restrict__ result, int result_stride);
-
-void PortableSparseMatrixBatchVectorMultiplyAccumulate(
-    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
-    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
-    float* __restrict__ result, int result_stride);
-
-void PortableSparseMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
-    const int m_cols, const int8_t* __restrict__ vectors,
-    const float* scaling_factors, int n_batch, float* __restrict__ result,
-    int result_stride);
-
-// Cwise product of two vectors.
-void PortableVectorVectorCwiseProduct(const float* vector1,
-                                      const float* vector2, int v_size,
-                                      float* result);
-
-// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
-// assumption here is that result array is initialized to valid values.
-void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
-                                                const float* vector2,
-                                                int v_size, float* result);
-
-// Dot product of two vectors.
-float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
-                                     int v_size);
-
-// Dot product of two batch vectors.
-void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
-                                              const float* vector2, int v_size,
-                                              int n_batch, float* result,
-                                              int result_stride);
-
-// Cwise product of a vector and a batch-vector.
-void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
-                                           const float* batch_vector,
-                                           int n_batch, float* result);
-
-// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
-// operation, the assumption here is that result array is initialized to valid
-// values.
-void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
-                                                     int v_size,
-                                                     const float* batch_vector,
-                                                     int n_batch,
-                                                     float* result);
-
-// Batch vector initialization with another vector.
-void PortableVectorBatchVectorAssign(const float* vector, int v_size,
-                                     int n_batch, float* batch_vector);
-
-// Add another vector for each batch in the batch vector.
-void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
-                                  float* batch_vector);
-
-// Apply sigmoid to elements of a vector.
-void PortableApplySigmoidToVector(const float* vector, int v_size,
-                                  float* result);
-
-// Apply activation function to elements of a vector.
-void PortableApplyActivationToVector(const float* vector, int v_size,
-                                     TfLiteFusedActivation activation,
-                                     float* result);
-
-// Copy vector to another vector.
-void PortableCopyVector(const float* vector, int v_size, float* result);
-
-// Compute "1.0f - elements of vector" (used in CIFG).
-void PortableSub1Vector(const float* vector, int v_size, float* result);
-
-// Fill vector with 0.f.
-void PortableZeroVector(float* vector, int v_size);
-
-// Multiply all elements of vector with a scalar.
-void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
-                                  float* result);
-
-// Clip elements of a vector using a abs_limit value.
-void PortableClipVector(const float* vector, int v_size, float abs_limit,
-                        float* result);
-
-// Shift left a vector in place with v_size size.
-void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
-
-// Reduce-sum on a float input vector:
-// input_vector: float pointer to input vector.
-// output_vector: float pointer to vector.
-// output_size: output vector size.
-// reduction_size: number of consecutive elements from input vector which are
-// added to get one element of output.
-void PortableReductionSumVector(const float* input_vector, float* output_vector,
-                                int output_size, int reduction_size);
-
-// Layer norm for each batch.
-// normalization_epsilon is added to avoid divergence.
-void PortableMeanStddevNormalization(const float* input_vector,
-                                     float* output_vector, int v_size,
-                                     int n_batch, float normalization_epsilon);
-
 float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }
 
 bool IsZeroVector(const float* vector, int v_size) {
diff --git a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
similarity index 62%
rename from tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
rename to tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
index 3a6061b6080..80503d7f6cd 100644
--- a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
-#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
+
+#include <cstdint>
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
@@ -23,15 +25,18 @@ limitations under the License.
 #define __restrict__ __restrict
 #endif
 
-#ifndef USE_NEON
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define USE_NEON
-#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
-#endif  // USE_NEON
-
 namespace tflite {
 namespace tensor_utils {
 
+// Limit a float input f between +abs_limit and -abs_limit.
+float PortableClip(float f, float abs_limit);
+
+bool PortableIsZeroVector(const float* vector, int v_size);
+
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float* min_value,
+                                     float* max_value, float* scaling_factor);
+
 // Multiply a matrix by a batch vector, and store results in a batch-size
 // vector.
 void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
@@ -39,84 +44,48 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                  const float* vector,
                                                  int n_batch, float* result,
                                                  int result_stride);
-void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
-                                             int m_cols, const float* vector,
-                                             int n_batch, float* result,
-                                             int result_stride);
 
-// Matrix multiplication for quantized values using symmetric quantization.
 void PortableMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);
-void NeonMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
-    const int8_t* __restrict__ vectors, const float* scaling_factors,
-    int n_batch, float* __restrict__ result, int result_stride);
 
-// Multiply a matrix by a batch vector, and store results in a batch-size
-// vector. Sparse version.
 void PortableSparseMatrixBatchVectorMultiplyAccumulate(
     const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
     int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
     float* __restrict__ result, int result_stride);
-void NeonSparseMatrixBatchVectorMultiplyAccumulate(
-    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
-    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
-    float* __restrict__ result, int result_stride);
 
-// Matrix multiplication for quantized values using symmetric quantization.
-// Sparse version.
 void PortableSparseMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
     const int m_cols, const int8_t* __restrict__ vectors,
     const float* scaling_factors, int n_batch, float* __restrict__ result,
     int result_stride);
-void NeonSparseMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
-    const int m_cols, const int8_t* __restrict__ vectors,
-    const float* scaling_factors, int n_batch, float* __restrict__ result,
-    int result_stride);
 
 // Cwise product of two vectors.
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
                                       float* result);
-void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                                  int v_size, float* result);
 
-// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
 // assumption here is that result array is initialized to valid values.
 void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
                                                 const float* vector2,
                                                 int v_size, float* result);
-void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
-                                            const float* vector2, int v_size,
-                                            float* result);
 
 // Dot product of two vectors.
 float PortableVectorVectorDotProduct(const float* vector1,
                                      const float* vector2, int v_size);
-float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
-                                 int v_size);
 
 // Dot product of two batch vectors.
 void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
                                               const float* vector2, int v_size,
                                               int n_batch, float* result,
                                               int result_stride);
-void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
-                                          const float* vector2, int v_size,
-                                          int n_batch, float* result,
-                                          int result_stride);
 
 // Cwise product of a vector and a batch-vector.
 void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                            const float* batch_vector,
                                            int n_batch, float* result);
-void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
-                                       const float* batch_vector, int n_batch,
-                                       float* result);
 
 // Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
 // operation, the assumption here is that result array is initialized to valid
@@ -126,29 +95,15 @@ void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                      const float* batch_vector,
                                                      int n_batch,
                                                      float* result);
-void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
-                                                 int v_size,
-                                                 const float* batch_vector,
-                                                 int n_batch, float* result);
-
-// Compute "1.0f - elements of vector" (used in CIFG).
-void PortableSub1Vector(const float* vector, int v_size, float* result);
-void NeonSub1Vector(const float* vector, int v_size, float* result);
-
-// Clip elements of a vector using a abs_limit value.
-void PortableClipVector(const float* vector, int v_size, float abs_limit,
-                        float* result);
-void NeonClipVector(const float* vector, int v_size, float abs_limit,
-                    float* result);
-
-// Add another vector for each batch in the batch vector.
-void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
-                                  float* batch_vector);
 
 // Batch vector initialization with another vector.
 void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                      int n_batch, float* batch_vector);
 
+// Add another vector for each batch in the batch vector.
+void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
+                                  float* batch_vector);
+
 // Apply sigmoid to elements of a vector.
 void PortableApplySigmoidToVector(const float* vector, int v_size,
                                   float* result);
@@ -161,33 +116,22 @@ void PortableApplyActivationToVector(const float* vector, int v_size,
 // Copy vector to another vector.
 void PortableCopyVector(const float* vector, int v_size, float* result);
 
+// Compute "1.0f - elements of vector" (used in CIFG).
+void PortableSub1Vector(const float* vector, int v_size, float* result);
+
 // Fill vector with 0.f.
 void PortableZeroVector(float* vector, int v_size);
 
 // Multiply all elements of vector with a scalar.
 void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                   float* result);
-void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
-                              float* result);
-
-// Limit a float input f between +abs_limit and -abs_limit.
-float PortableClip(float f, float abs_limit);
-
-// Check if all entries of a vector are zero.
-bool PortableIsZeroVector(const float* vector, int v_size);
-bool NeonIsZeroVector(const float* vector, int v_size);
-
-// Symmetric quantizer.
-void PortableSymmetricQuantizeFloats(const float* values, const int size,
-                                     int8_t* quantized_values, float* min,
-                                     float* max, float* scaling_factor);
-void NeonSymmetricQuantizeFloats(const float* values, const int size,
-                                 int8_t* quantized_values, float* min,
-                                 float* max, float* scaling_factor);
+
+// Clip elements of a vector using a abs_limit value.
+void PortableClipVector(const float* vector, int v_size, float abs_limit,
+                        float* result);
 
 // Shift left a vector in place with v_size size.
 void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
-void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
 
 // Reduce-sum on a float input vector:
 // input_vector: float pointer to input vector.
@@ -197,9 +141,9 @@ void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
 // added to get one element of output.
 void PortableReductionSumVector(const float* input_vector, float* output_vector,
                                 int output_size, int reduction_size);
-void NeonReductionSumVector(const float* input_vector, float* output_vector,
-                            int output_size, int reduction_size);
 
+// Layer norm for each batch.
+// normalization_epsilon is added to avoid divergence.
 void PortableMeanStddevNormalization(const float* input_vector,
                                      float* output_vector, int v_size,
                                      int n_batch, float normalization_epsilon);
@@ -207,4 +151,4 @@ void PortableMeanStddevNormalization(const float* input_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
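As the commit message notes, the payoff of this split is that a new backend
only needs its own impl header (plus a BUILD target); nothing ARM-specific
changes. A hypothetical sketch of an SSE counterpart (the file name, guard,
and signature merely mirror neon_tensor_utils_impl.h above; none of this is
part of the patch):

    // Hypothetical optimized/sse_tensor_utils_impl.h: declarations only,
    // included solely in x86 builds, mirroring the Neon header.
    #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_
    #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_

    #include <cstdint>

    namespace tflite {
    namespace tensor_utils {

    // Matrix multiplication for quantized values using symmetric quantization.
    void SseMatrixBatchVectorMultiplyAccumulate(
        const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
        const int8_t* __restrict__ vectors, const float* scaling_factors,
        int n_batch, float* __restrict__ result, int result_stride);

    }  // namespace tensor_utils
    }  // namespace tflite

    #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_

A matching cc_library(name = "sse_tensor_utils", ...) would then depend on
:portable_tensor_utils exactly as the neon_tensor_utils target does above.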