NEON acceleration applied to the int8 AveragePool kernel.

PiperOrigin-RevId: 347588687
Change-Id: Ieda280ecf7cb872feea481140856485fde0a9ca7
Hyeonjong Ryu 2020-12-15 04:58:29 -08:00 committed by TensorFlower Gardener
parent 203f9256a5
commit d29c99439d


@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
 #include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
@@ -145,11 +144,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
   }
 }
 
-inline void AveragePool16(const PoolParams& params,
-                          const RuntimeShape& input_shape,
-                          const int8* input_data,
-                          const RuntimeShape& output_shape, int8* output_data) {
-  ruy::profiler::ScopeLabel label("AveragePool/8bitWith16bitAccumulator");
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape, const int8* input_data,
+                        const RuntimeShape& output_shape, int8* output_data) {
+  ruy::profiler::ScopeLabel label("AveragePool/8bitWith32bitAccumulator");
 
   // Here, and in other pooling ops, in order to maintain locality of reference,
   // to minimize some recalculations, and to load into NEON vector registers, we
@@ -171,7 +169,7 @@ inline void AveragePool16(const PoolParams& params,
   const int stride_height = params.stride_height;
   const int stride_width = params.stride_width;
 
-  int16 acc[kPoolingAccTrancheSize];
+  int32 acc[kPoolingAccTrancheSize];
   for (int batch = 0; batch < batches; ++batch) {
     // We proceed through the depth in tranches (see comment above). The
     // depth_base is the depth at the beginning of the tranche. The
@@ -207,24 +205,30 @@ inline void AveragePool16(const PoolParams& params,
               int channel = 0;
 #ifdef USE_NEON
               for (; channel <= tranche_depth - 16; channel += 16) {
-                int16x8_t acc_reg[2];
-                for (int i = 0; i < 2; i++) {
-                  acc_reg[i] = vld1q_s16(acc + channel + 8 * i);
-                }
+                int16x4_t acc_reg[4];
                 int8x16_t input_reg = vld1q_s8(input_channel_ptr);
                 input_channel_ptr += 16;
-                acc_reg[0] = vaddw_s8(acc_reg[0], vget_low_s8(input_reg));
-                acc_reg[1] = vaddw_s8(acc_reg[1], vget_high_s8(input_reg));
-                for (int i = 0; i < 2; i++) {
-                  vst1q_s16(acc + channel + 8 * i, acc_reg[i]);
+                acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg)));
+                acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg)));
+                acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg)));
+                acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg)));
+                for (int i = 0; i < 4; i++) {
+                  vst1q_s32(
+                      acc + channel + 4 * i,
+                      vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
                 }
               }
               for (; channel <= tranche_depth - 8; channel += 8) {
-                int16x8_t acc_reg = vld1q_s16(acc + channel);
-                int8x8_t input_reg = vld1_s8(input_channel_ptr);
+                int16x4_t acc_reg[2];
+                int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr));
                 input_channel_ptr += 8;
-                acc_reg = vaddw_s8(acc_reg, input_reg);
-                vst1q_s16(acc + channel, acc_reg);
+                acc_reg[0] = vget_low_s16(input_reg);
+                acc_reg[1] = vget_high_s16(input_reg);
+                for (int i = 0; i < 2; i++) {
+                  vst1q_s32(
+                      acc + channel + 4 * i,
+                      vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
+                }
               }
 #endif
               for (; channel < tranche_depth; ++channel) {
@@ -237,24 +241,6 @@ inline void AveragePool16(const PoolParams& params,
                                                   out_x, depth_base);
           int channel = 0;
 #ifdef USE_NEON
-#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
-  if (filter_count == FILTER_COUNT) { \
-    for (; channel <= tranche_depth - 8; channel += 8) { \
-      int16 buf[8]; \
-      for (int i = 0; i < 8; i++) { \
-        buf[i] = acc[channel + i] > 0 \
-                     ? (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT \
-                     : (acc[channel + i] - FILTER_COUNT / 2) / FILTER_COUNT; \
-      } \
-      int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf)); \
-      buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max)); \
-      buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min)); \
-      vst1_s8(output_ptr + channel, buf8); \
-    } \
-  }
-          AVGPOOL_DIVIDING_BY(9)
-          AVGPOOL_DIVIDING_BY(15)
-#undef AVGPOOL_DIVIDING_BY
           for (; channel <= tranche_depth - 8; channel += 8) {
             int16 buf[8];
             for (int i = 0; i < 8; i++) {
@@ -283,17 +269,6 @@ inline void AveragePool16(const PoolParams& params,
   }
 }
 
-inline void AveragePool(const PoolParams& params,
-                        const RuntimeShape& input_shape, const int8* input_data,
-                        const RuntimeShape& output_shape, int8* output_data) {
-  if (params.filter_height * params.filter_width > 16 * 16) {
-    reference_integer_ops::AveragePool(params, input_shape, input_data,
-                                       output_shape, output_data);
-  } else {
-    AveragePool16(params, input_shape, input_data, output_shape, output_data);
-  }
-}
-
 }  // namespace optimized_integer_ops
 }  // namespace tflite
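
The net effect of the change: int8 inputs are now widened in two steps (int8 to int16 to int32) into a 32-bit accumulator, so the accumulator no longer risks overflow for large pooling windows, which is why the wrapper that fell back to reference_integer_ops::AveragePool for filters larger than 16x16, together with its include, is removed. Below is a minimal standalone sketch of that widening accumulation pattern, assuming a flat input buffer; the helper name AccumulateInt8ToInt32 and the loop bounds are illustrative only and are not part of the commit.

#include <cstdint>
#if defined(__ARM_NEON)
#include <arm_neon.h>
#endif

// Illustrative helper (not from the commit): accumulates `size` int8 values
// from `input` into the int32 accumulators in `acc`, 16 lanes at a time when
// NEON is available, mirroring the widening scheme used in the patched
// AveragePool kernel.
inline void AccumulateInt8ToInt32(const int8_t* input, int32_t* acc,
                                  int size) {
  int i = 0;
#if defined(__ARM_NEON)
  for (; i <= size - 16; i += 16) {
    const int8x16_t in = vld1q_s8(input + i);
    // Widen the low and high halves from int8 to int16.
    const int16x8_t lo = vmovl_s8(vget_low_s8(in));
    const int16x8_t hi = vmovl_s8(vget_high_s8(in));
    // Widen from int16 to int32 while adding into the existing accumulators.
    vst1q_s32(acc + i + 0,
              vaddw_s16(vld1q_s32(acc + i + 0), vget_low_s16(lo)));
    vst1q_s32(acc + i + 4,
              vaddw_s16(vld1q_s32(acc + i + 4), vget_high_s16(lo)));
    vst1q_s32(acc + i + 8,
              vaddw_s16(vld1q_s32(acc + i + 8), vget_low_s16(hi)));
    vst1q_s32(acc + i + 12,
              vaddw_s16(vld1q_s32(acc + i + 12), vget_high_s16(hi)));
  }
#endif
  // Scalar tail, and portable fallback when NEON is unavailable.
  for (; i < size; ++i) {
    acc[i] += input[i];
  }
}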