From 731981feba0fe1aac0dbeea75716b9fea624aeb9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 17 Jun 2019 12:43:10 -0700
Subject: [PATCH] Refactor *_tensor_utils_impl.h files: Both portable and neon
 versions now only declare Portable* and Neon* functions, and
 *_tensor_utils.cc files include only the appropriate headers;
 *_tensor_utils.h includes all of them.

This allows us to easily add SSE (and other CPU-specific targets) without
polluting the compilation with Sse* functions on an ARM target.

PiperOrigin-RevId: 253638991
---
 tensorflow/lite/kernels/internal/BUILD        |   9 +-
 .../internal/optimized/neon_tensor_utils.cc   |   6 +-
 .../internal/optimized/neon_tensor_utils.h    |   3 +-
 .../optimized/neon_tensor_utils_impl.h        | 130 ++++++++++++++++++
 .../reference/portable_tensor_utils.cc        |   6 +-
 .../reference/portable_tensor_utils.h         | 121 +---------------
 .../portable_tensor_utils_impl.h}             | 112 ++++-----------
 7 files changed, 170 insertions(+), 217 deletions(-)
 create mode 100644 tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h
 rename tensorflow/lite/kernels/internal/{optimized/tensor_utils_impl.h => reference/portable_tensor_utils_impl.h} (62%)
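For context on the dispatch this layout enables: the architecture-neutral
wrappers can now pick an implementation purely through the preprocessor. A
minimal sketch of the pattern, using the IsZeroVector entry point that both
impl headers declare (illustrative only; the real wrappers live in
neon_tensor_utils.h / portable_tensor_utils.h and route through cpu_check.h):

    // Sketch: an arch-neutral wrapper forwarding to whichever impl header
    // the build pulled in. USE_NEON is defined when __ARM_NEON__/__ARM_NEON
    // is set (see neon_tensor_utils_impl.h below).
    #ifdef USE_NEON
    inline bool IsZeroVector(const float* vector, int v_size) {
      return NeonIsZeroVector(vector, v_size);
    }
    #else
    inline bool IsZeroVector(const float* vector, int v_size) {
      return PortableIsZeroVector(vector, v_size);
    }
    #endif

An SSE build would add its own Sse* impl header and another branch here, so
ARM translation units never see any Sse* declaration.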
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 63eefbf6ef2..b413992cce1 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -476,6 +476,7 @@ cc_library(
     ],
     hdrs = [
         "reference/portable_tensor_utils.h",
+        "reference/portable_tensor_utils_impl.h",
     ],
     deps = [
         ":compatibility",
@@ -491,18 +492,17 @@ cc_library(
     name = "neon_tensor_utils",
     srcs = [
         "optimized/neon_tensor_utils.cc",
-        "reference/portable_tensor_utils.cc",
-        "reference/portable_tensor_utils.h",
     ],
     hdrs = [
         "optimized/neon_tensor_utils.h",
-        "optimized/tensor_utils_impl.h",
+        "optimized/neon_tensor_utils_impl.h",
    ],
     copts = NEON_FLAGS_IF_APPLICABLE + HARD_FP_FLAGS_IF_APPLICABLE,
     deps = [
         ":common",
         ":compatibility",
         ":cpu_check",
+        ":portable_tensor_utils",
         ":round",
         ":types",
         "//tensorflow/lite/c:c_api_internal",
@@ -551,9 +551,6 @@ cc_library(
         "tensor_utils.cc",
     ],
     hdrs = [
-        "optimized/neon_tensor_utils.h",
-        "optimized/tensor_utils_impl.h",
-        "reference/portable_tensor_utils.h",
         "tensor_utils.h",
     ],
     copts = NEON_FLAGS_IF_APPLICABLE,
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
index 54585cc0dee..389f25ccf9e 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
 #include 
-#include 
-#include 
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
-#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 
 #ifdef USE_NEON
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
index 71d67903347..a74d544a79d 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -19,7 +19,8 @@ limitations under the License.
 // structure.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
-#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
 
 namespace tflite {
 namespace tensor_utils {
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h
new file mode 100644
index 00000000000..71ac15556ee
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h
@@ -0,0 +1,130 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
+
+// TODO(ghodrat): Remove this header file and the dependency to internal data
+// structure.
+#include "tensorflow/lite/c/builtin_op_data.h"
+
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif  // USE_NEON
+
+namespace tflite {
+namespace tensor_utils {
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector.
+void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                             int m_cols, const float* vector,
+                                             int n_batch, float* result,
+                                             int result_stride);
+
+// Matrix multiplication for quantized values using symmetric quantization.
+void NeonMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors, const float* scaling_factors,
+    int n_batch, float* __restrict__ result, int result_stride);
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector. Sparse version.
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
+    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
+    float* __restrict__ result, int result_stride);
+
+// Matrix multiplication for quantized values using symmetric quantization.
+// Sparse version.
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
+// Cwise product of two vectors.
+void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                                  int v_size, float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that result array is initialized to valid values.
+void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
+                                            const float* vector2, int v_size,
+                                            float* result);
+
+// Dot product of two vectors.
+float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                 int v_size);
+
+// Dot product of two batch vectors.
+void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
+                                          const float* vector2, int v_size,
+                                          int n_batch, float* result,
+                                          int result_stride);
+
+// Cwise product of a vector and a batch-vector.
+void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
+                                       const float* batch_vector, int n_batch,
+                                       float* result);
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a
+// MAC operation, the assumption here is that result array is initialized to
+// valid values.
+void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                                 int v_size,
+                                                 const float* batch_vector,
+                                                 int n_batch, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void NeonSub1Vector(const float* vector, int v_size, float* result);
+
+// Clip elements of a vector using a abs_limit value.
+void NeonClipVector(const float* vector, int v_size, float abs_limit,
+                    float* result);
+
+// Multiply all elements of vector with a scalar.
+void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
+                              float* result);
+
+// Check if all entries of a vector are zero.
+bool NeonIsZeroVector(const float* vector, int v_size);
+
+// Symmetric quantizer.
+void NeonSymmetricQuantizeFloats(const float* values, const int size,
+                                 int8_t* quantized_values, float* min,
+                                 float* max, float* scaling_factor);
+
+// Shift left a vector in place with v_size size.
+void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void NeonReductionSumVector(const float* input_vector, float* output_vector,
+                            int output_size, int reduction_size);
+
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
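The reduce-sum contract documented at the end of the new header is the one
every backend has to match. A minimal portable sketch of the behavior the
comment describes (illustrative only; assumes the input holds
output_size * reduction_size elements):

    // Each output element is the sum of `reduction_size` consecutive input
    // elements, per the comment on NeonReductionSumVector above.
    void ReductionSumVectorSketch(const float* input_vector,
                                  float* output_vector, int output_size,
                                  int reduction_size) {
      for (int o = 0; o < output_size; ++o) {
        float sum = 0.f;
        for (int i = 0; i < reduction_size; ++i) {
          sum += input_vector[o * reduction_size + i];
        }
        output_vector[o] = sum;
      }
    }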
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
index d0d97aa06e6..472425e8e0a 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include 
-#include 
-
 #include 
 #include 
+#include 
+#include 
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 #include "tensorflow/lite/kernels/op_macros.h"
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
index fa737ecae69..28ca98160cd 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -18,6 +18,7 @@ limitations under the License.
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
 
 #if defined(_MSC_VER)
 #define __restrict__ __restrict
@@ -26,126 +27,6 @@ limitations under the License.
 namespace tflite {
 namespace tensor_utils {
 
-// Limit a float input f between +abs_limit and -abs_limit.
-float PortableClip(float f, float abs_limit);
-
-bool PortableIsZeroVector(const float* vector, int v_size);
-
-void PortableSymmetricQuantizeFloats(const float* values, const int size,
-                                     int8_t* quantized_values, float* min_value,
-                                     float* max_value, float* scaling_factor);
-
-// Multiply a matrix by a batch vector, and store results in a batch-size
-// vector.
-void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
-                                                 int m_rows, int m_cols,
-                                                 const float* vector,
-                                                 int n_batch, float* result,
-                                                 int result_stride);
-
-void PortableMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
-    const int8_t* __restrict__ vectors, const float* scaling_factors,
-    int n_batch, float* __restrict__ result, int result_stride);
-
-void PortableSparseMatrixBatchVectorMultiplyAccumulate(
-    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
-    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
-    float* __restrict__ result, int result_stride);
-
-void PortableSparseMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
-    const int m_cols, const int8_t* __restrict__ vectors,
-    const float* scaling_factors, int n_batch, float* __restrict__ result,
-    int result_stride);
-
-// Cwise product of two vectors.
-void PortableVectorVectorCwiseProduct(const float* vector1,
-                                      const float* vector2, int v_size,
-                                      float* result);
-
-// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
-// assumption here is that result array is initialized to valid values.
-void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
-                                                const float* vector2,
-                                                int v_size, float* result);
-
-// Dot product of two vectors.
-float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
-                                     int v_size);
-
-// Dot product of two batch vectors.
-void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
-                                              const float* vector2, int v_size,
-                                              int n_batch, float* result,
-                                              int result_stride);
-
-// Cwise product of a vector and a batch-vector.
-void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
-                                           const float* batch_vector,
-                                           int n_batch, float* result);
-
-// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
-// operation, the assumption here is that result array is initialized to valid
-// values.
-void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
-                                                     int v_size,
-                                                     const float* batch_vector,
-                                                     int n_batch,
-                                                     float* result);
-
-// Batch vector initialization with another vector.
-void PortableVectorBatchVectorAssign(const float* vector, int v_size,
-                                     int n_batch, float* batch_vector);
-
-// Add another vector for each batch in the batch vector.
-void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
-                                  float* batch_vector);
-
-// Apply sigmoid to elements of a vector.
-void PortableApplySigmoidToVector(const float* vector, int v_size,
-                                  float* result);
-
-// Apply activation function to elements of a vector.
-void PortableApplyActivationToVector(const float* vector, int v_size,
-                                     TfLiteFusedActivation activation,
-                                     float* result);
-
-// Copy vector to another vector.
-void PortableCopyVector(const float* vector, int v_size, float* result);
-
-// Compute "1.0f - elements of vector" (used in CIFG).
-void PortableSub1Vector(const float* vector, int v_size, float* result);
-
-// Fill vector with 0.f.
-void PortableZeroVector(float* vector, int v_size);
-
-// Multiply all elements of vector with a scalar.
-void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
-                                  float* result);
-
-// Clip elements of a vector using a abs_limit value.
-void PortableClipVector(const float* vector, int v_size, float abs_limit,
-                        float* result);
-
-// Shift left a vector in place with v_size size.
-void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
-
-// Reduce-sum on a float input vector:
-// input_vector: float pointer to input vector.
-// output_vector: float pointer to vector.
-// output_size: output vector size.
-// reduction_size: number of consecutive elements from input vector which are
-// added to get one element of output.
-void PortableReductionSumVector(const float* input_vector, float* output_vector,
-                                int output_size, int reduction_size);
-
-// Layer norm for each batch.
-// normalization_epsilon is added to avoid divergence.
-void PortableMeanStddevNormalization(const float* input_vector,
-                                     float* output_vector, int v_size,
-                                     int n_batch, float normalization_epsilon);
-
 float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }
 
 bool IsZeroVector(const float* vector, int v_size) {
diff --git a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
similarity index 62%
rename from tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
rename to tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
index 3a6061b6080..80503d7f6cd 100644
--- a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
-#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
+
+#include <cstdint>
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
@@ -23,15 +25,18 @@ limitations under the License.
 #define __restrict__ __restrict
 #endif
 
-#ifndef USE_NEON
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define USE_NEON
-#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
-#endif  // USE_NEON
-
 namespace tflite {
 namespace tensor_utils {
 
+// Limit a float input f between +abs_limit and -abs_limit.
+float PortableClip(float f, float abs_limit);
+
+bool PortableIsZeroVector(const float* vector, int v_size);
+
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float* min_value,
+                                     float* max_value, float* scaling_factor);
+
 // Multiply a matrix by a batch vector, and store results in a batch-size
 // vector.
 void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
@@ -39,84 +44,48 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                  const float* vector,
                                                  int n_batch, float* result,
                                                  int result_stride);
-void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
-                                             int m_cols, const float* vector,
-                                             int n_batch, float* result,
-                                             int result_stride);
 
-// Matrix multiplication for quantized values using symmetric quantization.
 void PortableMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);
-void NeonMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
-    const int8_t* __restrict__ vectors, const float* scaling_factors,
-    int n_batch, float* __restrict__ result, int result_stride);
 
-// Multiply a matrix by a batch vector, and store results in a batch-size
-// vector. Sparse version.
 void PortableSparseMatrixBatchVectorMultiplyAccumulate(
     const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
     int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
     float* __restrict__ result, int result_stride);
-void NeonSparseMatrixBatchVectorMultiplyAccumulate(
-    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
-    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
-    float* __restrict__ result, int result_stride);
 
-// Matrix multiplication for quantized values using symmetric quantization.
-// Sparse version.
 void PortableSparseMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
     const int m_cols, const int8_t* __restrict__ vectors,
     const float* scaling_factors, int n_batch, float* __restrict__ result,
     int result_stride);
-void NeonSparseMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
-    const int m_cols, const int8_t* __restrict__ vectors,
-    const float* scaling_factors, int n_batch, float* __restrict__ result,
-    int result_stride);
 
 // Cwise product of two vectors.
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
                                       float* result);
-void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                                  int v_size, float* result);
 
-// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
 // assumption here is that result array is initialized to valid values.
 void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
                                                 const float* vector2,
                                                 int v_size, float* result);
-void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
-                                            const float* vector2, int v_size,
-                                            float* result);
 
 // Dot product of two vectors.
 float PortableVectorVectorDotProduct(const float* vector1,
                                      const float* vector2, int v_size);
-float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
-                                 int v_size);
 
 // Dot product of two batch vectors.
 void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
                                               const float* vector2, int v_size,
                                               int n_batch, float* result,
                                               int result_stride);
-void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
-                                          const float* vector2, int v_size,
-                                          int n_batch, float* result,
-                                          int result_stride);
 
 // Cwise product of a vector and a batch-vector.
 void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                            const float* batch_vector,
                                            int n_batch, float* result);
-void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
-                                       const float* batch_vector, int n_batch,
-                                       float* result);
 
 // Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
 // operation, the assumption here is that result array is initialized to valid
@@ -126,29 +95,15 @@ void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                      const float* batch_vector,
                                                      int n_batch,
                                                      float* result);
-void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
-                                                 int v_size,
-                                                 const float* batch_vector,
-                                                 int n_batch, float* result);
-
-// Compute "1.0f - elements of vector" (used in CIFG).
-void PortableSub1Vector(const float* vector, int v_size, float* result);
-void NeonSub1Vector(const float* vector, int v_size, float* result);
-
-// Clip elements of a vector using a abs_limit value.
-void PortableClipVector(const float* vector, int v_size, float abs_limit,
-                        float* result);
-void NeonClipVector(const float* vector, int v_size, float abs_limit,
-                    float* result);
-
-// Add another vector for each batch in the batch vector.
-void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
-                                  float* batch_vector);
 
 // Batch vector initialization with another vector.
 void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                      int n_batch, float* batch_vector);
 
+// Add another vector for each batch in the batch vector.
+void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
+                                  float* batch_vector);
+
 // Apply sigmoid to elements of a vector.
 void PortableApplySigmoidToVector(const float* vector, int v_size,
                                   float* result);
@@ -161,33 +116,22 @@ void PortableApplyActivationToVector(const float* vector, int v_size,
 // Copy vector to another vector.
 void PortableCopyVector(const float* vector, int v_size, float* result);
 
+// Compute "1.0f - elements of vector" (used in CIFG).
+void PortableSub1Vector(const float* vector, int v_size, float* result);
+
 // Fill vector with 0.f.
 void PortableZeroVector(float* vector, int v_size);
 
 // Multiply all elements of vector with a scalar.
 void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                   float* result);
-void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
-                              float* result);
-
-// Limit a float input f between +abs_limit and -abs_limit.
-float PortableClip(float f, float abs_limit);
-
-// Check if all entries of a vector are zero.
-bool PortableIsZeroVector(const float* vector, int v_size);
-bool NeonIsZeroVector(const float* vector, int v_size);
-
-// Symmetric quantizer.
-void PortableSymmetricQuantizeFloats(const float* values, const int size,
-                                     int8_t* quantized_values, float* min,
-                                     float* max, float* scaling_factor);
-void NeonSymmetricQuantizeFloats(const float* values, const int size,
-                                 int8_t* quantized_values, float* min,
-                                 float* max, float* scaling_factor);
+
+// Clip elements of a vector using a abs_limit value.
+void PortableClipVector(const float* vector, int v_size, float abs_limit,
+                        float* result);
 
 // Shift left a vector in place with v_size size.
 void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
-void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
 
 // Reduce-sum on a float input vector:
 // input_vector: float pointer to input vector.
@@ -197,9 +141,9 @@ void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
 // added to get one element of output.
 void PortableReductionSumVector(const float* input_vector, float* output_vector,
                                 int output_size, int reduction_size);
-void NeonReductionSumVector(const float* input_vector, float* output_vector,
-                            int output_size, int reduction_size);
 
+// Layer norm for each batch.
+// normalization_epsilon is added to avoid divergence.
 void PortableMeanStddevNormalization(const float* input_vector,
                                      float* output_vector, int v_size,
                                      int n_batch, float normalization_epsilon);
@@ -207,4 +151,4 @@ void PortableMeanStddevNormalization(const float* input_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
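As the commit message notes, the payoff of this split is that a new backend
only needs its own impl header (plus a BUILD target); nothing ARM-specific
changes. A hypothetical sketch of an SSE counterpart (the file name, guard,
and signature merely mirror neon_tensor_utils_impl.h above; none of this is
part of the patch):

    // Hypothetical optimized/sse_tensor_utils_impl.h: declarations only,
    // included solely in x86 builds, mirroring the Neon header.
    #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_
    #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_

    #include <cstdint>

    namespace tflite {
    namespace tensor_utils {

    // Matrix multiplication for quantized values using symmetric quantization.
    void SseMatrixBatchVectorMultiplyAccumulate(
        const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
        const int8_t* __restrict__ vectors, const float* scaling_factors,
        int n_batch, float* __restrict__ result, int result_stride);

    }  // namespace tensor_utils
    }  // namespace tflite

    #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_

A matching cc_library(name = "sse_tensor_utils", ...) would then depend on
:portable_tensor_utils exactly as the neon_tensor_utils target does above.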