Remove handwritten NEON code to allow the compiler to generate better NEON code.

Change VectorCwiseProduct functions to a template so they can work with any type.

Clang generates a slightly faster binary for ARM small cores (~1% faster LSTM op performance on ARM Cortex A55, no measurable difference for Cortex A76), GCC generates similar assembly to handwritten NEON.

PiperOrigin-RevId: 286954815
Change-Id: I9bc4898f18e9a94538432ae550facaef40a4656e
This commit is contained in:
Robert David 2019-12-23 16:02:52 -08:00 committed by TensorFlower Gardener
parent 734dfee862
commit f92fc5d442
8 changed files with 17 additions and 125 deletions

View File

@ -1817,54 +1817,6 @@ void NeonSparseMatrixBatchVectorMultiplyAccumulate(
free(aligned_vec_free);
}
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
int v_size, float* result) {
// If v_size is not divisible by the vector size, then we need to process the
// final few elements sequentially. postamble_start shows the start index
// where this should happen.
const int postamble_start =
RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
int v = 0;
for (; v < postamble_start; v += kFloatValuesPerNeonVector) {
// Load 4 float values from vector1 and vector2.
const float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
const float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
// Vector multiply 4 float
const float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
// Save to result array.
vst1q_f32(result + v, mul_32x4);
}
#pragma clang loop vectorize(disable) unroll(disable)
for (; v < v_size; v++) {
result[v] = vector1[v] * vector2[v];
}
}
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
const float* vector2, int v_size,
float* result) {
// If v_size is not divisible by the vector size, then we need to process the
// final few elements sequentially. postamble_start shows the start index
// where this should happen.
const int postamble_start =
RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
int v = 0;
for (; v < postamble_start; v += kFloatValuesPerNeonVector) {
// Load 4 float values from vector1 and vector2 and accumulator.
const float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
const float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
float32x4_t acc_32x4 = vld1q_f32(result + v);
// Vector multiply-accumulate 4 float
acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
// Save to result array.
vst1q_f32(result + v, acc_32x4);
}
#pragma clang loop vectorize(disable) unroll(disable)
for (; v < v_size; v++) {
result[v] += vector1[v] * vector2[v];
}
}
void NeonSub1Vector(const float* vector, int v_size, float* result) {
// If v_size is not divisible by the vector size, then we need to process the
// final few elements sequentially. postamble_start shows the start index

View File

@ -142,11 +142,6 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
NEON_OR_PORTABLE(CwiseClipping, input, clipping_value, n_batch, n_input);
}
void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
int v_size, float* result) {
NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
}
void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
const int16_t* vector2, int v_size,
int n_batch, int32_t* result,
@ -155,13 +150,6 @@ void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
vector1, vector2, v_size, n_batch, result, result_stride);
}
void VectorVectorCwiseProductAccumulate(const float* vector1,
const float* vector2, int v_size,
float* result) {
NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
result);
}
void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
const int16_t* batch_vector,
int n_batch, int32_t multiplier,

View File

@ -107,16 +107,6 @@ void NeonSparseMatrixBatchVectorMultiplyAccumulate(
const float* scaling_factors, int n_batch, float* __restrict__ result,
int result_stride);
// Cwise product of two vectors.
void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
int v_size, float* result);
// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
// assumption here is that result array is initialized to valid values.
void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
const float* vector2, int v_size,
float* result);
// Dot product of two vectors.
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size);

View File

@ -152,11 +152,6 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
PortableCwiseClipping(input, clipping_value, n_batch, n_input);
}
void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
int v_size, float* result) {
NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
}
void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
const int16_t* vector2, int v_size,
int n_batch, int32_t* result,
@ -165,13 +160,6 @@ void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
vector1, vector2, v_size, n_batch, result, result_stride);
}
void VectorVectorCwiseProductAccumulate(const float* vector1,
const float* vector2, int v_size,
float* result) {
NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
result);
}
void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
const int16_t* batch_vector,
int n_batch, int32_t multiplier,

View File

@ -504,14 +504,6 @@ void PortableCwiseClipping(int8_t* input, const int8_t clipping_value,
}
}
void PortableVectorVectorCwiseProduct(const float* vector1,
const float* vector2, int v_size,
float* result) {
for (int v = 0; v < v_size; v++) {
result[v] = vector1[v] * vector2[v];
}
}
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size) {
float result = 0.0;
@ -545,14 +537,6 @@ void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
}
}
void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
const float* vector2,
int v_size, float* result) {
for (int v = 0; v < v_size; v++) {
result[v] += vector1[v] * vector2[v];
}
}
void PortableVectorBatchVectorCwiseProductAccumulate(
const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
int32_t multiplier, int shift, int16_t* result) {

View File

@ -176,17 +176,6 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
PortableCwiseClipping(input, clipping_value, n_batch, n_input);
}
void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
int v_size, float* result) {
PortableVectorVectorCwiseProduct(vector1, vector2, v_size, result);
}
void VectorVectorCwiseProductAccumulate(const float* vector1,
const float* vector2, int v_size,
float* result) {
PortableVectorVectorCwiseProductAccumulate(vector1, vector2, v_size, result);
}
void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
const int16_t* batch_vector,
int n_batch, int32_t multiplier,

View File

@ -78,17 +78,6 @@ void PortableSparseMatrixBatchVectorMultiplyAccumulate(
const float* scaling_factors, int n_batch, float* __restrict__ result,
int result_stride);
// Cwise product of two vectors.
void PortableVectorVectorCwiseProduct(const float* vector1,
const float* vector2, int v_size,
float* result);
// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
// assumption here is that result array is initialized to valid values.
void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
const float* vector2,
int v_size, float* result);
// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
int v_size);

View File

@ -314,14 +314,26 @@ void CwiseClipping(int8_t* input, const int8_t clipping_value, int32_t n_batch,
int32_t n_input);
// Cwise product of two vectors.
void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
int v_size, float* result);
template <typename T>
inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
const T* __restrict__ vector2, int v_size,
T* __restrict__ result) {
for (int v = 0; v < v_size; v++) {
*result++ = *vector1++ * *vector2++;
}
}
// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
// assumption here is that result array is initialized to valid values.
void VectorVectorCwiseProductAccumulate(const float* vector1,
const float* vector2, int v_size,
float* result);
template <typename T>
inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
const T* __restrict__ vector2,
int v_size,
T* __restrict__ result) {
for (int v = 0; v < v_size; v++) {
*result++ += *vector1++ * *vector2++;
}
}
// Dot product of two vectors.
float VectorVectorDotProduct(const float* vector1, const float* vector2,