Refactor *_tensor_utils_impl.h files: both the portable and NEON versions now declare only Portable* and Neon* functions, and the *_tensor_utils.cc files include only the appropriate headers, while *_tensor_utils.h includes them all.
This allows us to easily add SSE (and other CPU-specific targets) without polluting the compilation with Sse* functions on an ARM target.

PiperOrigin-RevId: 253638991
parent bfa58c30dd
commit 731981feba
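The layout this establishes, in brief: each backend's *_impl.h declares only that backend's prefixed functions, so a build only ever sees the symbols of backends it actually compiles. A minimal sketch, assuming a future SSE target; the Sse* names and the sse_tensor_utils_impl.h path are illustrative, not part of this commit (namespaces elided for brevity):

// reference/portable_tensor_utils_impl.h: declares Portable* functions only.
bool PortableIsZeroVector(const float* vector, int v_size);

// optimized/neon_tensor_utils_impl.h: declares Neon* functions only, and is
// listed only in the neon_tensor_utils BUILD target shown below.
bool NeonIsZeroVector(const float* vector, int v_size);

// optimized/sse_tensor_utils_impl.h (hypothetical): an x86 target would add
// its Sse* declarations here; no ARM target includes this header, so Sse*
// symbols never enter an ARM compilation.
bool SseIsZeroVector(const float* vector, int v_size);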
@@ -476,6 +476,7 @@ cc_library(
    ],
    hdrs = [
        "reference/portable_tensor_utils.h",
        "reference/portable_tensor_utils_impl.h",
    ],
    deps = [
        ":compatibility",
@@ -491,18 +492,17 @@ cc_library(
    name = "neon_tensor_utils",
    srcs = [
        "optimized/neon_tensor_utils.cc",
        "reference/portable_tensor_utils.cc",
        "reference/portable_tensor_utils.h",
    ],
    hdrs = [
        "optimized/neon_tensor_utils.h",
        "optimized/tensor_utils_impl.h",
        "optimized/neon_tensor_utils_impl.h",
    ],
    copts = NEON_FLAGS_IF_APPLICABLE + HARD_FP_FLAGS_IF_APPLICABLE,
    deps = [
        ":common",
        ":compatibility",
        ":cpu_check",
        ":portable_tensor_utils",
        ":round",
        ":types",
        "//tensorflow/lite/c:c_api_internal",
@@ -551,9 +551,6 @@ cc_library(
        "tensor_utils.cc",
    ],
    hdrs = [
        "optimized/neon_tensor_utils.h",
        "optimized/tensor_utils_impl.h",
        "reference/portable_tensor_utils.h",
        "tensor_utils.h",
    ],
    copts = NEON_FLAGS_IF_APPLICABLE,
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <cstdlib>
#include <cstring>
#include <vector>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/activation_functor.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/round.h"

#ifdef USE_NEON
@@ -19,7 +19,8 @@ limitations under the License.
// structure.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

namespace tflite {
namespace tensor_utils {
@@ -0,0 +1,130 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_

// TODO(ghodrat): Remove this header file and the dependency to internal data
// structure.
#include "tensorflow/lite/c/builtin_op_data.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

#ifndef USE_NEON
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define USE_NEON
#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
#endif  // USE_NEON

namespace tflite {
namespace tensor_utils {

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                             int m_cols, const float* vector,
                                             int n_batch, float* result,
                                             int result_stride);

// Matrix multiplication for quantized values using symmetric quantization.
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector. Sparse version.
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result, int result_stride);

// Matrix multiplication for quantized values using symmetric quantization.
// Sparse version.
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

// Cwise product of two vectors.
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                  int v_size, float* result);

// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// assumption here is that result array is initialized to valid values.
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
                                            const float* vector2, int v_size,
                                            float* result);

// Dot product of two vectors.
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size);

// Dot product of two batch vectors.
void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
                                          const float* vector2, int v_size,
                                          int n_batch, float* result,
                                          int result_stride);

// Cwise product of a vector and a batch-vector.
void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                       const float* batch_vector, int n_batch,
                                       float* result);

// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
// operation, the assumption here is that result array is initialized to valid
// values.
void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                 int v_size,
                                                 const float* batch_vector,
                                                 int n_batch, float* result);

// Compute "1.0f - elements of vector" (used in CIFG).
void NeonSub1Vector(const float* vector, int v_size, float* result);

// Clip elements of a vector using a abs_limit value.
void NeonClipVector(const float* vector, int v_size, float abs_limit,
                    float* result);

// Multiply all elements of vector with a scalar.
void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                              float* result);

// Check if all entries of a vector are zero.
bool NeonIsZeroVector(const float* vector, int v_size);

// Symmetric quantizer.
void NeonSymmetricQuantizeFloats(const float* values, const int size,
                                 int8_t* quantized_values, float* min,
                                 float* max, float* scaling_factor);

// Shift left a vector in place with v_size size.
void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);

// Reduce-sum on a float input vector:
// input_vector: float pointer to input vector.
// output_vector: float pointer to vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
void NeonReductionSumVector(const float* input_vector, float* output_vector,
                            int output_size, int reduction_size);

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
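For reference, the accumulate semantics shared by the Neon*/Portable* MatrixBatchVectorMultiplyAccumulate pairs declared above: result holds m_rows outputs per batch, consecutive outputs are separated by result_stride, and each output is incremented rather than overwritten. A plain C++ sketch of those assumed semantics, written for illustration rather than taken from this commit:

// Illustrative sketch only; the function name is made up, and the layout
// (batch-major results, result_stride between outputs) is an assumption
// drawn from the declarations above.
void MatrixBatchVectorMultiplyAccumulateSketch(const float* matrix, int m_rows,
                                               int m_cols, const float* vector,
                                               int n_batch, float* result,
                                               int result_stride) {
  float* out = result;
  for (int b = 0; b < n_batch; ++b) {
    const float* row = matrix;               // walk the m_rows x m_cols matrix
    for (int r = 0; r < m_rows; ++r) {
      const float* v = vector + b * m_cols;  // b-th input vector
      float dot = 0.0f;
      for (int c = 0; c < m_cols; ++c) dot += *row++ * *v++;
      *out += dot;           // accumulate into the existing result
      out += result_stride;  // stride between consecutive outputs
    }
  }
}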
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <cstring>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/activation_functor.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/round.h"
#include "tensorflow/lite/kernels/op_macros.h"

@@ -18,6 +18,7 @@ limitations under the License.
// TODO(ghodrat): Remove this header file and the dependency to internal data
// structure.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
@@ -26,126 +27,6 @@ limitations under the License.
namespace tflite {
namespace tensor_utils {

// Limit a float input f between +abs_limit and -abs_limit.
float PortableClip(float f, float abs_limit);

bool PortableIsZeroVector(const float* vector, int v_size);

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result,
                                                 int result_stride);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result, int result_stride);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

// Cwise product of two vectors.
void PortableVectorVectorCwiseProduct(const float* vector1,
                                      const float* vector2, int v_size,
                                      float* result);

// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
// assumption here is that result array is initialized to valid values.
void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
                                                const float* vector2,
                                                int v_size, float* result);

// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);

// Dot product of two batch vectors.
void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
                                              const float* vector2, int v_size,
                                              int n_batch, float* result,
                                              int result_stride);

// Cwise product of a vector and a batch-vector.
void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                           const float* batch_vector,
                                           int n_batch, float* result);

// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
// operation, the assumption here is that result array is initialized to valid
// values.
void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                     int v_size,
                                                     const float* batch_vector,
                                                     int n_batch,
                                                     float* result);

// Batch vector initialization with another vector.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                     int n_batch, float* batch_vector);

// Add another vector for each batch in the batch vector.
void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
                                  float* batch_vector);

// Apply sigmoid to elements of a vector.
void PortableApplySigmoidToVector(const float* vector, int v_size,
                                  float* result);

// Apply activation function to elements of a vector.
void PortableApplyActivationToVector(const float* vector, int v_size,
                                     TfLiteFusedActivation activation,
                                     float* result);

// Copy vector to another vector.
void PortableCopyVector(const float* vector, int v_size, float* result);

// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);

// Fill vector with 0.f.
void PortableZeroVector(float* vector, int v_size);

// Multiply all elements of vector with a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                  float* result);

// Clip elements of a vector using a abs_limit value.
void PortableClipVector(const float* vector, int v_size, float abs_limit,
                        float* result);

// Shift left a vector in place with v_size size.
void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);

// Reduce-sum on a float input vector:
// input_vector: float pointer to input vector.
// output_vector: float pointer to vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
void PortableReductionSumVector(const float* input_vector, float* output_vector,
                                int output_size, int reduction_size);

// Layer norm for each batch.
// normalization_epsilon is added to avoid divergence.
void PortableMeanStddevNormalization(const float* input_vector,
                                     float* output_vector, int v_size,
                                     int n_batch, float normalization_epsilon);

float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }

bool IsZeroVector(const float* vector, int v_size) {
@@ -1,4 +1,4 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_

#include <cstdint>

// TODO(ghodrat): Remove this header file and the dependency to internal data
// structure.
@@ -23,15 +25,18 @@ limitations under the License.
#define __restrict__ __restrict
#endif

#ifndef USE_NEON
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define USE_NEON
#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
#endif  // USE_NEON

namespace tflite {
namespace tensor_utils {

// Limit a float input f between +abs_limit and -abs_limit.
float PortableClip(float f, float abs_limit);

bool PortableIsZeroVector(const float* vector, int v_size);

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
@@ -39,84 +44,48 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 const float* vector,
                                                 int n_batch, float* result,
                                                 int result_stride);
void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                             int m_cols, const float* vector,
                                             int n_batch, float* result,
                                             int result_stride);

// Matrix multiplication for quantized values using symmetric quantization.
void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector. Sparse version.
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result, int result_stride);
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result, int result_stride);

// Matrix multiplication for quantized values using symmetric quantization.
// Sparse version.
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

// Cwise product of two vectors.
void PortableVectorVectorCwiseProduct(const float* vector1,
                                      const float* vector2, int v_size,
                                      float* result);
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                  int v_size, float* result);

// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
// assumption here is that result array is initialized to valid values.
void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
                                                const float* vector2,
                                                int v_size, float* result);
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
                                            const float* vector2, int v_size,
                                            float* result);

// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size);

// Dot product of two batch vectors.
void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
                                              const float* vector2, int v_size,
                                              int n_batch, float* result,
                                              int result_stride);
void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
                                          const float* vector2, int v_size,
                                          int n_batch, float* result,
                                          int result_stride);

// Cwise product of a vector and a batch-vector.
void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                           const float* batch_vector,
                                           int n_batch, float* result);
void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                       const float* batch_vector, int n_batch,
                                       float* result);

// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
// operation, the assumption here is that result array is initialized to valid
@@ -126,29 +95,15 @@ void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                     const float* batch_vector,
                                                     int n_batch,
                                                     float* result);
void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                 int v_size,
                                                 const float* batch_vector,
                                                 int n_batch, float* result);

// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);
void NeonSub1Vector(const float* vector, int v_size, float* result);

// Clip elements of a vector using a abs_limit value.
void PortableClipVector(const float* vector, int v_size, float abs_limit,
                        float* result);
void NeonClipVector(const float* vector, int v_size, float abs_limit,
                    float* result);

// Add another vector for each batch in the batch vector.
void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
                                  float* batch_vector);

// Batch vector initialization with another vector.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                     int n_batch, float* batch_vector);

// Add another vector for each batch in the batch vector.
void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
                                  float* batch_vector);

// Apply sigmoid to elements of a vector.
void PortableApplySigmoidToVector(const float* vector, int v_size,
                                  float* result);
@@ -161,33 +116,22 @@ void PortableApplyActivationToVector(const float* vector, int v_size,
// Copy vector to another vector.
void PortableCopyVector(const float* vector, int v_size, float* result);

// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);

// Fill vector with 0.f.
void PortableZeroVector(float* vector, int v_size);

// Multiply all elements of vector with a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                  float* result);
void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                              float* result);

// Limit a float input f between +abs_limit and -abs_limit.
float PortableClip(float f, float abs_limit);

// Check if all entries of a vector are zero.
bool PortableIsZeroVector(const float* vector, int v_size);
bool NeonIsZeroVector(const float* vector, int v_size);

// Symmetric quantizer.
void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min,
                                     float* max, float* scaling_factor);
void NeonSymmetricQuantizeFloats(const float* values, const int size,
                                 int8_t* quantized_values, float* min,
                                 float* max, float* scaling_factor);
// Clip elements of a vector using a abs_limit value.
void PortableClipVector(const float* vector, int v_size, float abs_limit,
                        float* result);

// Shift left a vector in place with v_size size.
void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);

// Reduce-sum on a float input vector:
// input_vector: float pointer to input vector.
@@ -197,9 +141,9 @@ void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
// added to get one element of output.
void PortableReductionSumVector(const float* input_vector, float* output_vector,
                                int output_size, int reduction_size);
void NeonReductionSumVector(const float* input_vector, float* output_vector,
                            int output_size, int reduction_size);

// Layer norm for each batch.
// normalization_epsilon is added to avoid divergence.
void PortableMeanStddevNormalization(const float* input_vector,
                                     float* output_vector, int v_size,
                                     int n_batch, float normalization_epsilon);
@@ -207,4 +151,4 @@ void PortableMeanStddevNormalization(const float* input_vector,
}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
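With the declarations split this way, the unprefixed entry points forward to exactly one backend per target, in the same style as the Clip/PortableClip forwarding visible in the portable_tensor_utils.h hunk above. A simplified sketch of the NEON-side dispatch; the actual tensor_utils.cc routes through the cpu_check.h helpers rather than a bare #ifdef, so treat this as an illustration of the idea, not the commit's exact code:

bool IsZeroVector(const float* vector, int v_size) {
#ifdef USE_NEON
  // NEON target: resolved against the declaration in neon_tensor_utils_impl.h.
  return NeonIsZeroVector(vector, v_size);
#else
  // All other targets fall back to the portable implementation.
  return PortableIsZeroVector(vector, v_size);
#endif
}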