Refactor *_tensor_utils_impl.h files: both the portable and NEON versions now declare only Portable* and Neon* functions, and the *_tensor_utils.cc files include only the appropriate headers, while *_tensor_utils.h includes them all.
This allows us to easily add SSE (and other CPU-specific targets) without polluting the compilation with Sse* functions on an ARM target.

PiperOrigin-RevId: 253638991
parent bfa58c30dd
commit 731981feba
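The layout this establishes, in brief: each backend's *_impl.h declares only that backend's prefixed functions, so a build only ever sees the symbols of backends it actually compiles. A minimal sketch, assuming a future SSE target; the Sse* names and the sse_tensor_utils_impl.h path are illustrative, not part of this commit (namespaces elided for brevity):

// reference/portable_tensor_utils_impl.h: declares Portable* functions only.
bool PortableIsZeroVector(const float* vector, int v_size);

// optimized/neon_tensor_utils_impl.h: declares Neon* functions only, and is
// listed only in the neon_tensor_utils BUILD target shown below.
bool NeonIsZeroVector(const float* vector, int v_size);

// optimized/sse_tensor_utils_impl.h (hypothetical): an x86 target would add
// its Sse* declarations here; no ARM target includes this header, so Sse*
// symbols never enter an ARM compilation.
bool SseIsZeroVector(const float* vector, int v_size);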
@@ -476,6 +476,7 @@ cc_library(
    ],
    hdrs = [
        "reference/portable_tensor_utils.h",
        "reference/portable_tensor_utils_impl.h",
    ],
    deps = [
        ":compatibility",
@@ -491,18 +492,17 @@ cc_library(
    name = "neon_tensor_utils",
    srcs = [
        "optimized/neon_tensor_utils.cc",
        "reference/portable_tensor_utils.cc",
        "reference/portable_tensor_utils.h",
    ],
    hdrs = [
        "optimized/neon_tensor_utils.h",
        "optimized/tensor_utils_impl.h",
        "optimized/neon_tensor_utils_impl.h",
    ],
    copts = NEON_FLAGS_IF_APPLICABLE + HARD_FP_FLAGS_IF_APPLICABLE,
    deps = [
        ":common",
        ":compatibility",
        ":cpu_check",
        ":portable_tensor_utils",
        ":round",
        ":types",
        "//tensorflow/lite/c:c_api_internal",
@@ -551,9 +551,6 @@ cc_library(
        "tensor_utils.cc",
    ],
    hdrs = [
        "optimized/neon_tensor_utils.h",
        "optimized/tensor_utils_impl.h",
        "reference/portable_tensor_utils.h",
        "tensor_utils.h",
    ],
    copts = NEON_FLAGS_IF_APPLICABLE,
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <cstdlib>
#include <cstring>
#include <vector>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/activation_functor.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/round.h"

#ifdef USE_NEON
@@ -19,7 +19,8 @@ limitations under the License.
// structure.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

namespace tflite {
namespace tensor_utils {
@@ -0,0 +1,130 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_

// TODO(ghodrat): Remove this header file and the dependency to internal data
// structure.
#include "tensorflow/lite/c/builtin_op_data.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

#ifndef USE_NEON
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define USE_NEON
#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
#endif  // USE_NEON

namespace tflite {
namespace tensor_utils {

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                             int m_cols, const float* vector,
                                             int n_batch, float* result,
                                             int result_stride);

// Matrix multiplication for quantized values using symmetric quantization.
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector. Sparse version.
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result, int result_stride);

// Matrix multiplication for quantized values using symmetric quantization.
// Sparse version.
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

// Cwise product of two vectors.
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                  int v_size, float* result);

// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// assumption here is that result array is initialized to valid values.
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
                                            const float* vector2, int v_size,
                                            float* result);

// Dot product of two vectors.
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size);

// Dot product of two batch vectors.
void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
                                          const float* vector2, int v_size,
                                          int n_batch, float* result,
                                          int result_stride);

// Cwise product of a vector and a batch-vector.
void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                       const float* batch_vector, int n_batch,
                                       float* result);

// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
// operation, the assumption here is that result array is initialized to valid
// values.
void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                 int v_size,
                                                 const float* batch_vector,
                                                 int n_batch, float* result);

// Compute "1.0f - elements of vector" (used in CIFG).
void NeonSub1Vector(const float* vector, int v_size, float* result);

// Clip elements of a vector using a abs_limit value.
void NeonClipVector(const float* vector, int v_size, float abs_limit,
                    float* result);

// Multiply all elements of vector with a scalar.
void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                              float* result);

// Check if all entries of a vector are zero.
bool NeonIsZeroVector(const float* vector, int v_size);

// Symmetric quantizer.
void NeonSymmetricQuantizeFloats(const float* values, const int size,
                                 int8_t* quantized_values, float* min,
                                 float* max, float* scaling_factor);

// Shift left a vector in place with v_size size.
void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);

// Reduce-sum on a float input vector:
// input_vector: float pointer to input vector.
// output_vector: float pointer to vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
void NeonReductionSumVector(const float* input_vector, float* output_vector,
                            int output_size, int reduction_size);

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
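For reference, the accumulate semantics shared by the Neon*/Portable* MatrixBatchVectorMultiplyAccumulate pairs declared above: result holds m_rows outputs per batch, consecutive outputs are separated by result_stride, and each output is incremented rather than overwritten. A plain C++ sketch of those assumed semantics, written for illustration rather than taken from this commit:

// Illustrative sketch only; the function name is made up, and the layout
// (batch-major results, result_stride between outputs) is an assumption
// drawn from the declarations above.
void MatrixBatchVectorMultiplyAccumulateSketch(const float* matrix, int m_rows,
                                               int m_cols, const float* vector,
                                               int n_batch, float* result,
                                               int result_stride) {
  float* out = result;
  for (int b = 0; b < n_batch; ++b) {
    const float* row = matrix;               // walk the m_rows x m_cols matrix
    for (int r = 0; r < m_rows; ++r) {
      const float* v = vector + b * m_cols;  // b-th input vector
      float dot = 0.0f;
      for (int c = 0; c < m_cols; ++c) dot += *row++ * *v++;
      *out += dot;           // accumulate into the existing result
      out += result_stride;  // stride between consecutive outputs
    }
  }
}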
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <cstring>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/activation_functor.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/round.h"
#include "tensorflow/lite/kernels/op_macros.h"

@@ -18,6 +18,7 @@ limitations under the License.
// TODO(ghodrat): Remove this header file and the dependency to internal data
// structure.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
@@ -26,126 +27,6 @@ limitations under the License.
namespace tflite {
namespace tensor_utils {

// Limit a float input f between +abs_limit and -abs_limit.
float PortableClip(float f, float abs_limit);

bool PortableIsZeroVector(const float* vector, int v_size);

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result,
                                                 int result_stride);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result, int result_stride);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

// Cwise product of two vectors.
void PortableVectorVectorCwiseProduct(const float* vector1,
                                      const float* vector2, int v_size,
                                      float* result);

// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
// assumption here is that result array is initialized to valid values.
void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
                                                const float* vector2,
                                                int v_size, float* result);

// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);

// Dot product of two batch vectors.
void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
                                              const float* vector2, int v_size,
                                              int n_batch, float* result,
                                              int result_stride);

// Cwise product of a vector and a batch-vector.
void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                           const float* batch_vector,
                                           int n_batch, float* result);

// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
// operation, the assumption here is that result array is initialized to valid
// values.
void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                     int v_size,
                                                     const float* batch_vector,
                                                     int n_batch,
                                                     float* result);

// Batch vector initialization with another vector.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                     int n_batch, float* batch_vector);

// Add another vector for each batch in the batch vector.
void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
                                  float* batch_vector);

// Apply sigmoid to elements of a vector.
void PortableApplySigmoidToVector(const float* vector, int v_size,
                                  float* result);

// Apply activation function to elements of a vector.
void PortableApplyActivationToVector(const float* vector, int v_size,
                                     TfLiteFusedActivation activation,
                                     float* result);

// Copy vector to another vector.
void PortableCopyVector(const float* vector, int v_size, float* result);

// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);

// Fill vector with 0.f.
void PortableZeroVector(float* vector, int v_size);

// Multiply all elements of vector with a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                  float* result);

// Clip elements of a vector using a abs_limit value.
void PortableClipVector(const float* vector, int v_size, float abs_limit,
                        float* result);

// Shift left a vector in place with v_size size.
void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);

// Reduce-sum on a float input vector:
// input_vector: float pointer to input vector.
// output_vector: float pointer to vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
void PortableReductionSumVector(const float* input_vector, float* output_vector,
                                int output_size, int reduction_size);

// Layer norm for each batch.
// normalization_epsilon is added to avoid divergence.
void PortableMeanStddevNormalization(const float* input_vector,
                                     float* output_vector, int v_size,
                                     int n_batch, float normalization_epsilon);

float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }

bool IsZeroVector(const float* vector, int v_size) {
@@ -1,4 +1,4 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_

#include <cstdint>

// TODO(ghodrat): Remove this header file and the dependency to internal data
// structure.
@@ -23,15 +25,18 @@ limitations under the License.
#define __restrict__ __restrict
#endif

#ifndef USE_NEON
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define USE_NEON
#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
#endif  // USE_NEON

namespace tflite {
namespace tensor_utils {

// Limit a float input f between +abs_limit and -abs_limit.
float PortableClip(float f, float abs_limit);

bool PortableIsZeroVector(const float* vector, int v_size);

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
@@ -39,84 +44,48 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 const float* vector,
                                                 int n_batch, float* result,
                                                 int result_stride);
void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                             int m_cols, const float* vector,
                                             int n_batch, float* result,
                                             int result_stride);

// Matrix multiplication for quantized values using symmetric quantization.
void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, int result_stride);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector. Sparse version.
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result, int result_stride);
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result, int result_stride);

// Matrix multiplication for quantized values using symmetric quantization.
// Sparse version.
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

// Cwise product of two vectors.
void PortableVectorVectorCwiseProduct(const float* vector1,
                                      const float* vector2, int v_size,
                                      float* result);
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                  int v_size, float* result);

// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
// assumption here is that result array is initialized to valid values.
void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
                                                const float* vector2,
                                                int v_size, float* result);
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
                                            const float* vector2, int v_size,
                                            float* result);

// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size);

// Dot product of two batch vectors.
void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
                                              const float* vector2, int v_size,
                                              int n_batch, float* result,
                                              int result_stride);
void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
                                          const float* vector2, int v_size,
                                          int n_batch, float* result,
                                          int result_stride);

// Cwise product of a vector and a batch-vector.
void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                           const float* batch_vector,
                                           int n_batch, float* result);
void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size,
                                       const float* batch_vector, int n_batch,
                                       float* result);

// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
// operation, the assumption here is that result array is initialized to valid
@@ -126,29 +95,15 @@ void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                     const float* batch_vector,
                                                     int n_batch,
                                                     float* result);
void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
                                                 int v_size,
                                                 const float* batch_vector,
                                                 int n_batch, float* result);

// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);
void NeonSub1Vector(const float* vector, int v_size, float* result);

// Clip elements of a vector using a abs_limit value.
void PortableClipVector(const float* vector, int v_size, float abs_limit,
                        float* result);
void NeonClipVector(const float* vector, int v_size, float abs_limit,
                    float* result);

// Add another vector for each batch in the batch vector.
void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
                                  float* batch_vector);

// Batch vector initialization with another vector.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                     int n_batch, float* batch_vector);

// Add another vector for each batch in the batch vector.
void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch,
                                  float* batch_vector);

// Apply sigmoid to elements of a vector.
void PortableApplySigmoidToVector(const float* vector, int v_size,
                                  float* result);
@@ -161,33 +116,22 @@ void PortableApplyActivationToVector(const float* vector, int v_size,
// Copy vector to another vector.
void PortableCopyVector(const float* vector, int v_size, float* result);

// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);

// Fill vector with 0.f.
void PortableZeroVector(float* vector, int v_size);

// Multiply all elements of vector with a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                  float* result);
void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                              float* result);

// Limit a float input f between +abs_limit and -abs_limit.
float PortableClip(float f, float abs_limit);

// Check if all entries of a vector are zero.
bool PortableIsZeroVector(const float* vector, int v_size);
bool NeonIsZeroVector(const float* vector, int v_size);

// Symmetric quantizer.
void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min,
                                     float* max, float* scaling_factor);
void NeonSymmetricQuantizeFloats(const float* values, const int size,
                                 int8_t* quantized_values, float* min,
                                 float* max, float* scaling_factor);
// Clip elements of a vector using a abs_limit value.
void PortableClipVector(const float* vector, int v_size, float abs_limit,
                        float* result);

// Shift left a vector in place with v_size size.
void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);

// Reduce-sum on a float input vector:
// input_vector: float pointer to input vector.
@@ -197,9 +141,9 @@ void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
// added to get one element of output.
void PortableReductionSumVector(const float* input_vector, float* output_vector,
                                int output_size, int reduction_size);
void NeonReductionSumVector(const float* input_vector, float* output_vector,
                            int output_size, int reduction_size);

// Layer norm for each batch.
// normalization_epsilon is added to avoid divergence.
void PortableMeanStddevNormalization(const float* input_vector,
                                     float* output_vector, int v_size,
                                     int n_batch, float normalization_epsilon);
@@ -207,4 +151,4 @@ void PortableMeanStddevNormalization(const float* input_vector,
}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
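With the declarations split this way, the unprefixed entry points forward to exactly one backend per target, in the same style as the Clip/PortableClip forwarding visible in the portable_tensor_utils.h hunk above. A simplified sketch of the NEON-side dispatch; the actual tensor_utils.cc routes through the cpu_check.h helpers rather than a bare #ifdef, so treat this as an illustration of the idea, not the commit's exact code:

bool IsZeroVector(const float* vector, int v_size) {
#ifdef USE_NEON
  // NEON target: resolved against the declaration in neon_tensor_utils_impl.h.
  return NeonIsZeroVector(vector, v_size);
#else
  // All other targets fall back to the portable implementation.
  return PortableIsZeroVector(vector, v_size);
#endif
}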