Refactor *_tensor_utils_impl.h files: the portable and NEON versions now declare only Portable* and Neon* functions, respectively, and the *_tensor_utils.cc files include only the headers they need; tensor_utils.h includes all of them. This lets us add SSE (and other CPU-specific targets) without polluting the compilation with Sse* functions on an ARM target.

PiperOrigin-RevId: 253638991
This commit is contained in:
parent bfa58c30dd
commit 731981feba
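For orientation before the diff: after this change, each per-CPU implementation header declares only functions with its own prefix, and each consumer includes only what it can link on its target. A rough sketch of the resulting layout (file paths are from the diff below; contents abbreviated):

// reference/portable_tensor_utils_impl.h -> declares Portable* only, e.g.
//   float PortableClip(float f, float abs_limit);
// optimized/neon_tensor_utils_impl.h     -> declares Neon* only, e.g.
//   bool NeonIsZeroVector(const float* vector, int v_size);
// reference/portable_tensor_utils.h      -> includes the portable impl header
// optimized/neon_tensor_utils.h          -> includes both impl headers and
//                                           dispatches per call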
tensorflow/lite/kernels/internal/BUILD
@@ -476,6 +476,7 @@ cc_library(
     ],
     hdrs = [
         "reference/portable_tensor_utils.h",
+        "reference/portable_tensor_utils_impl.h",
    ],
    deps = [
        ":compatibility",
@@ -491,18 +492,17 @@ cc_library(
     name = "neon_tensor_utils",
     srcs = [
         "optimized/neon_tensor_utils.cc",
-        "reference/portable_tensor_utils.cc",
-        "reference/portable_tensor_utils.h",
     ],
     hdrs = [
         "optimized/neon_tensor_utils.h",
-        "optimized/tensor_utils_impl.h",
+        "optimized/neon_tensor_utils_impl.h",
     ],
     copts = NEON_FLAGS_IF_APPLICABLE + HARD_FP_FLAGS_IF_APPLICABLE,
     deps = [
         ":common",
         ":compatibility",
         ":cpu_check",
+        ":portable_tensor_utils",
         ":round",
         ":types",
         "//tensorflow/lite/c:c_api_internal",
@@ -551,9 +551,6 @@ cc_library(
         "tensor_utils.cc",
     ],
     hdrs = [
-        "optimized/neon_tensor_utils.h",
-        "optimized/tensor_utils_impl.h",
-        "reference/portable_tensor_utils.h",
         "tensor_utils.h",
     ],
     copts = NEON_FLAGS_IF_APPLICABLE,
tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <fcntl.h>
-#include <stdlib.h>
-#include <string.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 
+#include <cstdlib>
+#include <cstring>
 #include <vector>
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
-#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 
 #ifdef USE_NEON
tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -19,7 +19,8 @@ limitations under the License.
 // structure.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
-#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
 
 namespace tflite {
 namespace tensor_utils {
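neon_tensor_utils.h now needs both impl headers because its wrappers pick an implementation per call. For context, a simplified sketch of the NEON_OR_PORTABLE dispatch that optimized/cpu_check.h provides (details of the real macro may differ):

#ifdef USE_NEON
// With NEON compiled in, check for it at runtime and prefer the Neon* symbol.
#define NEON_OR_PORTABLE(funcname, ...)               \
  (TestCPUFeatureNeon() ? Neon##funcname(__VA_ARGS__) \
                        : Portable##funcname(__VA_ARGS__))
#else
// Without NEON, always fall through to the portable implementation.
#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)
#endif

// Typical wrapper shape in neon_tensor_utils.h:
//   bool IsZeroVector(const float* vector, int v_size) {
//     return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
//   }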
tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h (new file)
@@ -0,0 +1,130 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
+
+// TODO(ghodrat): Remove this header file and the dependency to internal data
+// structure.
+#include "tensorflow/lite/c/builtin_op_data.h"
+
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif  // USE_NEON
+
+namespace tflite {
+namespace tensor_utils {
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector.
+void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                             int m_cols, const float* vector,
+                                             int n_batch, float* result,
+                                             int result_stride);
+
+// Matrix multiplication for quantized values using symmetric quantization.
+void NeonMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors, const float* scaling_factors,
+    int n_batch, float* __restrict__ result, int result_stride);
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector. Sparse version.
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
+    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
+    float* __restrict__ result, int result_stride);
+
+// Matrix multiplication for quantized values using symmetric quantization.
+// Sparse version.
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
+// Cwise product of two vectors.
+void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                                  int v_size, float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that result array is initialized to valid values.
+void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
+                                            const float* vector2, int v_size,
+                                            float* result);
+
+// Dot product of two vectors.
+float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                 int v_size);
+
+// Dot product of two batch vectors.
+void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
+                                          const float* vector2, int v_size,
+                                          int n_batch, float* result,
+                                          int result_stride);
+
+// Cwise product of a vector and a batch-vector.
+void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
+                                       const float* batch_vector, int n_batch,
+                                       float* result);
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
+// operation, the assumption here is that result array is initialized to valid
+// values.
+void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                                 int v_size,
+                                                 const float* batch_vector,
+                                                 int n_batch, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void NeonSub1Vector(const float* vector, int v_size, float* result);
+
+// Clip elements of a vector using a abs_limit value.
+void NeonClipVector(const float* vector, int v_size, float abs_limit,
+                    float* result);
+
+// Multiply all elements of vector with a scalar.
+void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
+                              float* result);
+
+// Check if all entries of a vector are zero.
+bool NeonIsZeroVector(const float* vector, int v_size);
+
+// Symmetric quantizer.
+void NeonSymmetricQuantizeFloats(const float* values, const int size,
+                                 int8_t* quantized_values, float* min,
+                                 float* max, float* scaling_factor);
+
+// Shift left a vector in place with v_size size.
+void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void NeonReductionSumVector(const float* input_vector, float* output_vector,
+                            int output_size, int reduction_size);
+
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
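One recurring parameter worth calling out in the declarations above: result_stride spaces out the writes, so callers can interleave per-row outputs into a larger buffer. A minimal, hypothetical call (values invented for illustration; the function accumulates, so result must start from valid values):

#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"

void ExampleCall() {
  // y += A * x for one batch of a 4x8 row-major matrix, contiguous output.
  float matrix[4 * 8] = {};  // weights (zero here; illustration only)
  float vector[8] = {};      // input
  float result[4] = {};      // accumulated in place
  tflite::tensor_utils::NeonMatrixBatchVectorMultiplyAccumulate(
      matrix, /*m_rows=*/4, /*m_cols=*/8, vector, /*n_batch=*/1, result,
      /*result_stride=*/1);
}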
tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <stdlib.h>
-#include <string.h>
 
 #include <algorithm>
 #include <cmath>
+#include <cstdlib>
+#include <cstring>
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 
tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -18,6 +18,7 @@ limitations under the License.
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
 
 #if defined(_MSC_VER)
 #define __restrict__ __restrict
@@ -26,126 +27,6 @@ limitations under the License.
 namespace tflite {
 namespace tensor_utils {
 
-// Limit a float input f between +abs_limit and -abs_limit.
-float PortableClip(float f, float abs_limit);
-
-bool PortableIsZeroVector(const float* vector, int v_size);
-
-void PortableSymmetricQuantizeFloats(const float* values, const int size,
-                                     int8_t* quantized_values, float* min_value,
-                                     float* max_value, float* scaling_factor);
-
-// Multiply a matrix by a batch vector, and store results in a batch-size
-// vector.
-void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
-                                                 int m_rows, int m_cols,
-                                                 const float* vector,
-                                                 int n_batch, float* result,
-                                                 int result_stride);
-
-void PortableMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
-    const int8_t* __restrict__ vectors, const float* scaling_factors,
-    int n_batch, float* __restrict__ result, int result_stride);
-
-void PortableSparseMatrixBatchVectorMultiplyAccumulate(
-    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
-    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
-    float* __restrict__ result, int result_stride);
-
-void PortableSparseMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
-    const int m_cols, const int8_t* __restrict__ vectors,
-    const float* scaling_factors, int n_batch, float* __restrict__ result,
-    int result_stride);
-
-// Cwise product of two vectors.
-void PortableVectorVectorCwiseProduct(const float* vector1,
-                                      const float* vector2, int v_size,
-                                      float* result);
-
-// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
-// assumption here is that result array is initialized to valid values.
-void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
-                                                const float* vector2,
-                                                int v_size, float* result);
-
-// Dot product of two vectors.
-float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
-                                     int v_size);
-
-// Dot product of two batch vectors.
-void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
-                                              const float* vector2, int v_size,
-                                              int n_batch, float* result,
-                                              int result_stride);
-
-// Cwise product of a vector and a batch-vector.
-void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
-                                           const float* batch_vector,
-                                           int n_batch, float* result);
-
-// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
-// operation, the assumption here is that result array is initialized to valid
-// values.
-void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
-                                                     int v_size,
-                                                     const float* batch_vector,
-                                                     int n_batch,
-                                                     float* result);
-
-// Batch vector initialization with another vector.
-void PortableVectorBatchVectorAssign(const float* vector, int v_size,
-                                     int n_batch, float* batch_vector);
-
-// Add another vector for each batch in the batch vector.
-void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
-                                  float* batch_vector);
-
-// Apply sigmoid to elements of a vector.
-void PortableApplySigmoidToVector(const float* vector, int v_size,
-                                  float* result);
-
-// Apply activation function to elements of a vector.
-void PortableApplyActivationToVector(const float* vector, int v_size,
-                                     TfLiteFusedActivation activation,
-                                     float* result);
-
-// Copy vector to another vector.
-void PortableCopyVector(const float* vector, int v_size, float* result);
-
-// Compute "1.0f - elements of vector" (used in CIFG).
-void PortableSub1Vector(const float* vector, int v_size, float* result);
-
-// Fill vector with 0.f.
-void PortableZeroVector(float* vector, int v_size);
-
-// Multiply all elements of vector with a scalar.
-void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
-                                  float* result);
-
-// Clip elements of a vector using a abs_limit value.
-void PortableClipVector(const float* vector, int v_size, float abs_limit,
-                        float* result);
-
-// Shift left a vector in place with v_size size.
-void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
-
-// Reduce-sum on a float input vector:
-// input_vector: float pointer to input vector.
-// output_vector: float pointer to vector.
-// output_size: output vector size.
-// reduction_size: number of consecutive elements from input vector which are
-// added to get one element of output.
-void PortableReductionSumVector(const float* input_vector, float* output_vector,
-                                int output_size, int reduction_size);
-
-// Layer norm for each batch.
-// normalization_epsilon is added to avoid divergence.
-void PortableMeanStddevNormalization(const float* input_vector,
-                                     float* output_vector, int v_size,
-                                     int n_batch, float normalization_epsilon);
-
 float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }
 
 bool IsZeroVector(const float* vector, int v_size) {
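The surviving context lines above show the pattern this header keeps: the public tensor_utils entry points are thin forwarders to the Portable* declarations that now live in portable_tensor_utils_impl.h. The Clip wrapper is verbatim from the diff context; the IsZeroVector body below is the presumed continuation of the same pattern, not shown in the diff:

float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }

bool IsZeroVector(const float* vector, int v_size) {
  return PortableIsZeroVector(vector, v_size);  // presumed forwarding body
}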
tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h → tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h (renamed)
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
-#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
+
+#include <cstdint>
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
@@ -23,15 +25,18 @@ limitations under the License.
 #define __restrict__ __restrict
 #endif
 
-#ifndef USE_NEON
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define USE_NEON
-#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
-#endif  // USE_NEON
-
 namespace tflite {
 namespace tensor_utils {
 
+// Limit a float input f between +abs_limit and -abs_limit.
+float PortableClip(float f, float abs_limit);
+
+bool PortableIsZeroVector(const float* vector, int v_size);
+
+void PortableSymmetricQuantizeFloats(const float* values, const int size,
+                                     int8_t* quantized_values, float* min_value,
+                                     float* max_value, float* scaling_factor);
+
 // Multiply a matrix by a batch vector, and store results in a batch-size
 // vector.
 void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
@@ -39,84 +44,48 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                  const float* vector,
                                                  int n_batch, float* result,
                                                  int result_stride);
-void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
-                                             int m_cols, const float* vector,
-                                             int n_batch, float* result,
-                                             int result_stride);
 
-// Matrix multiplication for quantized values using symmetric quantization.
 void PortableMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
-void NeonMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
-    const int8_t* __restrict__ vectors, const float* scaling_factors,
-    int n_batch, float* __restrict__ result, int result_stride);
 
-// Multiply a matrix by a batch vector, and store results in a batch-size
-// vector. Sparse version.
 void PortableSparseMatrixBatchVectorMultiplyAccumulate(
     const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
     int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
     float* __restrict__ result, int result_stride);
-void NeonSparseMatrixBatchVectorMultiplyAccumulate(
-    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
-    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
-    float* __restrict__ result, int result_stride);
 
-// Matrix multiplication for quantized values using symmetric quantization.
-// Sparse version.
 void PortableSparseMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
     const int m_cols, const int8_t* __restrict__ vectors,
     const float* scaling_factors, int n_batch, float* __restrict__ result,
     int result_stride);
-void NeonSparseMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
-    const int m_cols, const int8_t* __restrict__ vectors,
-    const float* scaling_factors, int n_batch, float* __restrict__ result,
-    int result_stride);
 
 // Cwise product of two vectors.
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
                                       float* result);
-void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
-                                  int v_size, float* result);
 
 // Cwise product and accumulate of two vectors. Since it's a MAC operation, the
 // assumption here is that result array is initialized to valid values.
 void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
                                                 const float* vector2,
                                                 int v_size, float* result);
-void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
-                                            const float* vector2, int v_size,
-                                            float* result);
 
 // Dot product of two vectors.
 float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                      int v_size);
-float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
-                                 int v_size);
 
 // Dot product of two batch vectors.
 void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
                                               const float* vector2, int v_size,
                                               int n_batch, float* result,
                                               int result_stride);
-void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
-                                          const float* vector2, int v_size,
-                                          int n_batch, float* result,
-                                          int result_stride);
 
 // Cwise product of a vector and a batch-vector.
 void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                            const float* batch_vector,
                                            int n_batch, float* result);
-void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
-                                       const float* batch_vector, int n_batch,
-                                       float* result);
 
 // Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
 // operation, the assumption here is that result array is initialized to valid
@@ -126,29 +95,15 @@ void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                      const float* batch_vector,
                                                      int n_batch,
                                                      float* result);
-void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
-                                                 int v_size,
-                                                 const float* batch_vector,
-                                                 int n_batch, float* result);
-
-// Compute "1.0f - elements of vector" (used in CIFG).
-void PortableSub1Vector(const float* vector, int v_size, float* result);
-void NeonSub1Vector(const float* vector, int v_size, float* result);
-
-// Clip elements of a vector using a abs_limit value.
-void PortableClipVector(const float* vector, int v_size, float abs_limit,
-                        float* result);
-void NeonClipVector(const float* vector, int v_size, float abs_limit,
-                    float* result);
-
-// Add another vector for each batch in the batch vector.
-void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
-                                  float* batch_vector);
 
 // Batch vector initialization with another vector.
 void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                      int n_batch, float* batch_vector);
 
+// Add another vector for each batch in the batch vector.
+void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
+                                  float* batch_vector);
+
 // Apply sigmoid to elements of a vector.
 void PortableApplySigmoidToVector(const float* vector, int v_size,
                                   float* result);
@@ -161,33 +116,22 @@ void PortableApplyActivationToVector(const float* vector, int v_size,
 // Copy vector to another vector.
 void PortableCopyVector(const float* vector, int v_size, float* result);
 
+// Compute "1.0f - elements of vector" (used in CIFG).
+void PortableSub1Vector(const float* vector, int v_size, float* result);
+
 // Fill vector with 0.f.
 void PortableZeroVector(float* vector, int v_size);
 
 // Multiply all elements of vector with a scalar.
 void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                   float* result);
-void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
-                              float* result);
 
-// Limit a float input f between +abs_limit and -abs_limit.
-float PortableClip(float f, float abs_limit);
-
-// Check if all entries of a vector are zero.
-bool PortableIsZeroVector(const float* vector, int v_size);
-bool NeonIsZeroVector(const float* vector, int v_size);
-
-// Symmetric quantizer.
-void PortableSymmetricQuantizeFloats(const float* values, const int size,
-                                     int8_t* quantized_values, float* min,
-                                     float* max, float* scaling_factor);
-void NeonSymmetricQuantizeFloats(const float* values, const int size,
-                                 int8_t* quantized_values, float* min,
-                                 float* max, float* scaling_factor);
+// Clip elements of a vector using a abs_limit value.
+void PortableClipVector(const float* vector, int v_size, float abs_limit,
+                        float* result);
 
 // Shift left a vector in place with v_size size.
 void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
-void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
 
 // Reduce-sum on a float input vector:
 // input_vector: float pointer to input vector.
@@ -197,9 +141,9 @@ void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
 // added to get one element of output.
 void PortableReductionSumVector(const float* input_vector, float* output_vector,
                                 int output_size, int reduction_size);
-void NeonReductionSumVector(const float* input_vector, float* output_vector,
-                            int output_size, int reduction_size);
 
+// Layer norm for each batch.
+// normalization_epsilon is added to avoid divergence.
 void PortableMeanStddevNormalization(const float* input_vector,
                                      float* output_vector, int v_size,
                                      int n_batch, float normalization_epsilon);
@@ -207,4 +151,4 @@ void PortableMeanStddevNormalization(const float* input_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
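To make the motivation in the commit message concrete: with this layout, an SSE target would add its own impl header declaring only Sse* functions, which only x86 builds ever include. A hypothetical sketch, since no SSE file is part of this commit:

/* Hypothetical optimized/sse_tensor_utils_impl.h (not part of this commit). */
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_

#include <cstdint>

namespace tflite {
namespace tensor_utils {

// Mirrors the quantized Neon/Portable declarations above; only x86 builds
// include this header, so Sse* symbols never pollute an ARM compilation.
void SseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SSE_TENSOR_UTILS_IMPL_H_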