Neon acceleration applied on kernel module.
PiperOrigin-RevId: 347588687 Change-Id: Ieda280ecf7cb872feea481140856485fde0a9ca7
This commit is contained in:
parent
203f9256a5
commit
d29c99439d
@ -26,7 +26,6 @@ limitations under the License.
|
||||
#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
|
||||
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
|
||||
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
|
||||
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
|
||||
@ -145,11 +144,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
||||
}
|
||||
}
|
||||
|
||||
inline void AveragePool16(const PoolParams& params,
|
||||
const RuntimeShape& input_shape,
|
||||
const int8* input_data,
|
||||
const RuntimeShape& output_shape, int8* output_data) {
|
||||
ruy::profiler::ScopeLabel label("AveragePool/8bitWith16bitAccumulator");
|
||||
inline void AveragePool(const PoolParams& params,
|
||||
const RuntimeShape& input_shape, const int8* input_data,
|
||||
const RuntimeShape& output_shape, int8* output_data) {
|
||||
ruy::profiler::ScopeLabel label("AveragePool/8bitWith32bitAccumulator");
|
||||
|
||||
// Here, and in other pooling ops, in order to maintain locality of reference,
|
||||
// to minimize some recalculations, and to load into NEON vector registers, we
|
||||
@ -171,7 +169,7 @@ inline void AveragePool16(const PoolParams& params,
|
||||
const int stride_height = params.stride_height;
|
||||
const int stride_width = params.stride_width;
|
||||
|
||||
int16 acc[kPoolingAccTrancheSize];
|
||||
int32 acc[kPoolingAccTrancheSize];
|
||||
for (int batch = 0; batch < batches; ++batch) {
|
||||
// We proceed through the depth in tranches (see comment above). The
|
||||
// depth_base is the depth at the beginning of the tranche. The
|
||||
@ -207,24 +205,30 @@ inline void AveragePool16(const PoolParams& params,
|
||||
int channel = 0;
|
||||
#ifdef USE_NEON
|
||||
for (; channel <= tranche_depth - 16; channel += 16) {
|
||||
int16x8_t acc_reg[2];
|
||||
for (int i = 0; i < 2; i++) {
|
||||
acc_reg[i] = vld1q_s16(acc + channel + 8 * i);
|
||||
}
|
||||
int16x4_t acc_reg[4];
|
||||
int8x16_t input_reg = vld1q_s8(input_channel_ptr);
|
||||
input_channel_ptr += 16;
|
||||
acc_reg[0] = vaddw_s8(acc_reg[0], vget_low_s8(input_reg));
|
||||
acc_reg[1] = vaddw_s8(acc_reg[1], vget_high_s8(input_reg));
|
||||
for (int i = 0; i < 2; i++) {
|
||||
vst1q_s16(acc + channel + 8 * i, acc_reg[i]);
|
||||
acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg)));
|
||||
acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg)));
|
||||
acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg)));
|
||||
acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg)));
|
||||
for (int i = 0; i < 4; i++) {
|
||||
vst1q_s32(
|
||||
acc + channel + 4 * i,
|
||||
vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
|
||||
}
|
||||
}
|
||||
for (; channel <= tranche_depth - 8; channel += 8) {
|
||||
int16x8_t acc_reg = vld1q_s16(acc + channel);
|
||||
int8x8_t input_reg = vld1_s8(input_channel_ptr);
|
||||
int16x4_t acc_reg[2];
|
||||
int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr));
|
||||
input_channel_ptr += 8;
|
||||
acc_reg = vaddw_s8(acc_reg, input_reg);
|
||||
vst1q_s16(acc + channel, acc_reg);
|
||||
acc_reg[0] = vget_low_s16(input_reg);
|
||||
acc_reg[1] = vget_high_s16(input_reg);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
vst1q_s32(
|
||||
acc + channel + 4 * i,
|
||||
vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (; channel < tranche_depth; ++channel) {
|
||||
@ -237,24 +241,6 @@ inline void AveragePool16(const PoolParams& params,
|
||||
out_x, depth_base);
|
||||
int channel = 0;
|
||||
#ifdef USE_NEON
|
||||
#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
|
||||
if (filter_count == FILTER_COUNT) { \
|
||||
for (; channel <= tranche_depth - 8; channel += 8) { \
|
||||
int16 buf[8]; \
|
||||
for (int i = 0; i < 8; i++) { \
|
||||
buf[i] = acc[channel + i] > 0 \
|
||||
? (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT \
|
||||
: (acc[channel + i] - FILTER_COUNT / 2) / FILTER_COUNT; \
|
||||
} \
|
||||
int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf)); \
|
||||
buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max)); \
|
||||
buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min)); \
|
||||
vst1_s8(output_ptr + channel, buf8); \
|
||||
} \
|
||||
}
|
||||
AVGPOOL_DIVIDING_BY(9)
|
||||
AVGPOOL_DIVIDING_BY(15)
|
||||
#undef AVGPOOL_DIVIDING_BY
|
||||
for (; channel <= tranche_depth - 8; channel += 8) {
|
||||
int16 buf[8];
|
||||
for (int i = 0; i < 8; i++) {
|
||||
@ -283,17 +269,6 @@ inline void AveragePool16(const PoolParams& params,
|
||||
}
|
||||
}
|
||||
|
||||
inline void AveragePool(const PoolParams& params,
|
||||
const RuntimeShape& input_shape, const int8* input_data,
|
||||
const RuntimeShape& output_shape, int8* output_data) {
|
||||
if (params.filter_height * params.filter_width > 16 * 16) {
|
||||
reference_integer_ops::AveragePool(params, input_shape, input_data,
|
||||
output_shape, output_data);
|
||||
} else {
|
||||
AveragePool16(params, input_shape, input_data, output_shape, output_data);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace optimized_integer_ops
|
||||
} // namespace tflite
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user