Remove handwritten NEON code to allow the compiler to generate better NEON code.
Change VectorCwiseProduct functions to a template so they can work with any
type. Clang generates a slightly faster binary for ARM small cores (~1% faster
LSTM op performance on ARM Cortex-A55, no measurable difference for
Cortex-A76); GCC generates assembly similar to the handwritten NEON.

PiperOrigin-RevId: 286954815
Change-Id: I9bc4898f18e9a94538432ae550facaef40a4656e
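To illustrate the "work with any type" claim, here is a minimal caller sketch.
The example function ExampleCwiseProducts, the include path, and the
tflite::tensor_utils namespace are assumptions for illustration only and are
not part of this change:

    #include <cstdint>

    #include "tensorflow/lite/kernels/internal/tensor_utils.h"

    // Hypothetical caller: the same templated VectorVectorCwiseProduct entry
    // point (added in the tensor_utils.h hunk below) is instantiated for two
    // different element types; its plain loop is left to the compiler to
    // auto-vectorize (e.g. into NEON on ARM targets).
    void ExampleCwiseProducts(const float* fa, const float* fb, float* f_out,
                              const int16_t* qa, const int16_t* qb,
                              int16_t* q_out, int n) {
      tflite::tensor_utils::VectorVectorCwiseProduct(fa, fb, n, f_out);
      tflite::tensor_utils::VectorVectorCwiseProduct(qa, qb, n, q_out);
    }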
commit f92fc5d442
parent 734dfee862
tensorflow/lite/kernels/internal
@@ -1817,54 +1817,6 @@ void NeonSparseMatrixBatchVectorMultiplyAccumulate(
  free(aligned_vec_free);
}

void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                  int v_size, float* result) {
  // If v_size is not divisible by the vector size, then we need to process the
  // final few elements sequentially. postamble_start shows the start index
  // where this should happen.
  const int postamble_start =
      RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
  int v = 0;
  for (; v < postamble_start; v += kFloatValuesPerNeonVector) {
    // Load 4 float values from vector1 and vector2.
    const float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    const float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    // Vector multiply 4 float
    const float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
    // Save to result array.
    vst1q_f32(result + v, mul_32x4);
  }
#pragma clang loop vectorize(disable) unroll(disable)
  for (; v < v_size; v++) {
    result[v] = vector1[v] * vector2[v];
  }
}

void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
                                            const float* vector2, int v_size,
                                            float* result) {
  // If v_size is not divisible by the vector size, then we need to process the
  // final few elements sequentially. postamble_start shows the start index
  // where this should happen.
  const int postamble_start =
      RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
  int v = 0;
  for (; v < postamble_start; v += kFloatValuesPerNeonVector) {
    // Load 4 float values from vector1 and vector2 and accumulator.
    const float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
    const float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
    float32x4_t acc_32x4 = vld1q_f32(result + v);
    // Vector multiply-accumulate 4 float
    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
    // Save to result array.
    vst1q_f32(result + v, acc_32x4);
  }
#pragma clang loop vectorize(disable) unroll(disable)
  for (; v < v_size; v++) {
    result[v] += vector1[v] * vector2[v];
  }
}

void NeonSub1Vector(const float* vector, int v_size, float* result) {
  // If v_size is not divisible by the vector size, then we need to process the
  // final few elements sequentially. postamble_start shows the start index
@@ -142,11 +142,6 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
  NEON_OR_PORTABLE(CwiseClipping, input, clipping_value, n_batch, n_input);
}

void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                              int v_size, float* result) {
  NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
}

void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result,
@@ -155,13 +150,6 @@ void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
      vector1, vector2, v_size, n_batch, result, result_stride);
}

void VectorVectorCwiseProductAccumulate(const float* vector1,
                                        const float* vector2, int v_size,
                                        float* result) {
  NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
                   result);
}

void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
@@ -107,16 +107,6 @@ void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

// Cwise product of two vectors.
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                  int v_size, float* result);

// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// assumption here is that result array is initialized to valid values.
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
                                            const float* vector2, int v_size,
                                            float* result);

// Dot product of two vectors.
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size);
@@ -152,11 +152,6 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
  PortableCwiseClipping(input, clipping_value, n_batch, n_input);
}

void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                              int v_size, float* result) {
  NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
}

void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result,
@@ -165,13 +160,6 @@ void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
      vector1, vector2, v_size, n_batch, result, result_stride);
}

void VectorVectorCwiseProductAccumulate(const float* vector1,
                                        const float* vector2, int v_size,
                                        float* result) {
  NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
                   result);
}

void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
@@ -504,14 +504,6 @@ void PortableCwiseClipping(int8_t* input, const int8_t clipping_value,
  }
}

void PortableVectorVectorCwiseProduct(const float* vector1,
                                      const float* vector2, int v_size,
                                      float* result) {
  for (int v = 0; v < v_size; v++) {
    result[v] = vector1[v] * vector2[v];
  }
}

float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size) {
  float result = 0.0;
@@ -545,14 +537,6 @@ void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
  }
}

void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
                                                const float* vector2,
                                                int v_size, float* result) {
  for (int v = 0; v < v_size; v++) {
    result[v] += vector1[v] * vector2[v];
  }
}

void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result) {
@@ -176,17 +176,6 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
  PortableCwiseClipping(input, clipping_value, n_batch, n_input);
}

void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                              int v_size, float* result) {
  PortableVectorVectorCwiseProduct(vector1, vector2, v_size, result);
}

void VectorVectorCwiseProductAccumulate(const float* vector1,
                                        const float* vector2, int v_size,
                                        float* result) {
  PortableVectorVectorCwiseProductAccumulate(vector1, vector2, v_size, result);
}

void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
@@ -78,17 +78,6 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* scaling_factors, int n_batch, float* __restrict__ result,
    int result_stride);

// Cwise product of two vectors.
void PortableVectorVectorCwiseProduct(const float* vector1,
                                      const float* vector2, int v_size,
                                      float* result);

// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// assumption here is that result array is initialized to valid values.
void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
                                                const float* vector2,
                                                int v_size, float* result);

// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);
@@ -314,14 +314,26 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
                   int32_t n_input);

// Cwise product of two vectors.
void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                              int v_size, float* result);
template <typename T>
inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
                                     const T* __restrict__ vector2, int v_size,
                                     T* __restrict__ result) {
  for (int v = 0; v < v_size; v++) {
    *result++ = *vector1++ * *vector2++;
  }
}

// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// assumption here is that result array is initialized to valid values.
void VectorVectorCwiseProductAccumulate(const float* vector1,
                                        const float* vector2, int v_size,
                                        float* result);
template <typename T>
inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
                                               const T* __restrict__ vector2,
                                               int v_size,
                                               T* __restrict__ result) {
  for (int v = 0; v < v_size; v++) {
    *result++ += *vector1++ * *vector2++;
  }
}

// Dot product of two vectors.
float VectorVectorDotProduct(const float* vector1, const float* vector2,