NEON acceleration applied to the int8 AveragePool kernel.

PiperOrigin-RevId: 347588687
Change-Id: Ieda280ecf7cb872feea481140856485fde0a9ca7
Hyeonjong Ryu 2020-12-15 04:58:29 -08:00 committed by TensorFlower Gardener
parent 203f9256a5
commit d29c99439d


@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
 #include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
@@ -145,11 +144,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
   }
 }
 
-inline void AveragePool16(const PoolParams& params,
-                          const RuntimeShape& input_shape,
-                          const int8* input_data,
-                          const RuntimeShape& output_shape, int8* output_data) {
-  ruy::profiler::ScopeLabel label("AveragePool/8bitWith16bitAccumulator");
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape, const int8* input_data,
+                        const RuntimeShape& output_shape, int8* output_data) {
+  ruy::profiler::ScopeLabel label("AveragePool/8bitWith32bitAccumulator");
 
   // Here, and in other pooling ops, in order to maintain locality of reference,
   // to minimize some recalculations, and to load into NEON vector registers, we
@@ -171,7 +169,7 @@ inline void AveragePool16(const PoolParams& params,
   const int stride_height = params.stride_height;
   const int stride_width = params.stride_width;
 
-  int16 acc[kPoolingAccTrancheSize];
+  int32 acc[kPoolingAccTrancheSize];
   for (int batch = 0; batch < batches; ++batch) {
     // We proceed through the depth in tranches (see comment above). The
     // depth_base is the depth at the beginning of the tranche. The
@@ -207,24 +205,30 @@ inline void AveragePool16(const PoolParams& params,
               int channel = 0;
 #ifdef USE_NEON
               for (; channel <= tranche_depth - 16; channel += 16) {
-                int16x8_t acc_reg[2];
-                for (int i = 0; i < 2; i++) {
-                  acc_reg[i] = vld1q_s16(acc + channel + 8 * i);
-                }
+                int16x4_t acc_reg[4];
                 int8x16_t input_reg = vld1q_s8(input_channel_ptr);
                 input_channel_ptr += 16;
-                acc_reg[0] = vaddw_s8(acc_reg[0], vget_low_s8(input_reg));
-                acc_reg[1] = vaddw_s8(acc_reg[1], vget_high_s8(input_reg));
-                for (int i = 0; i < 2; i++) {
-                  vst1q_s16(acc + channel + 8 * i, acc_reg[i]);
+                acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg)));
+                acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg)));
+                acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg)));
+                acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg)));
+                for (int i = 0; i < 4; i++) {
+                  vst1q_s32(
+                      acc + channel + 4 * i,
+                      vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
                 }
               }
               for (; channel <= tranche_depth - 8; channel += 8) {
-                int16x8_t acc_reg = vld1q_s16(acc + channel);
-                int8x8_t input_reg = vld1_s8(input_channel_ptr);
+                int16x4_t acc_reg[2];
+                int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr));
                 input_channel_ptr += 8;
-                acc_reg = vaddw_s8(acc_reg, input_reg);
-                vst1q_s16(acc + channel, acc_reg);
+                acc_reg[0] = vget_low_s16(input_reg);
+                acc_reg[1] = vget_high_s16(input_reg);
+                for (int i = 0; i < 2; i++) {
+                  vst1q_s32(
+                      acc + channel + 4 * i,
+                      vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
+                }
               }
 #endif
               for (; channel < tranche_depth; ++channel) {
@@ -237,24 +241,6 @@ inline void AveragePool16(const PoolParams& params,
                                                   out_x, depth_base);
           int channel = 0;
 #ifdef USE_NEON
-#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
-  if (filter_count == FILTER_COUNT) { \
-    for (; channel <= tranche_depth - 8; channel += 8) { \
-      int16 buf[8]; \
-      for (int i = 0; i < 8; i++) { \
-        buf[i] = acc[channel + i] > 0 \
-                     ? (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT \
-                     : (acc[channel + i] - FILTER_COUNT / 2) / FILTER_COUNT; \
-      } \
-      int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf)); \
-      buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max)); \
-      buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min)); \
-      vst1_s8(output_ptr + channel, buf8); \
-    } \
-  }
-          AVGPOOL_DIVIDING_BY(9)
-          AVGPOOL_DIVIDING_BY(15)
-#undef AVGPOOL_DIVIDING_BY
           for (; channel <= tranche_depth - 8; channel += 8) {
             int16 buf[8];
             for (int i = 0; i < 8; i++) {
@@ -283,17 +269,6 @@ inline void AveragePool16(const PoolParams& params,
   }
 }
 
-inline void AveragePool(const PoolParams& params,
-                        const RuntimeShape& input_shape, const int8* input_data,
-                        const RuntimeShape& output_shape, int8* output_data) {
-  if (params.filter_height * params.filter_width > 16 * 16) {
-    reference_integer_ops::AveragePool(params, input_shape, input_data,
-                                       output_shape, output_data);
-  } else {
-    AveragePool16(params, input_shape, input_data, output_shape, output_data);
-  }
-}
-
 }  // namespace optimized_integer_ops
 }  // namespace tflite
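
The net effect of the change: int8 inputs are now widened in two steps (int8 to int16 to int32) into a 32-bit accumulator, so the accumulator no longer risks overflow for large pooling windows, which is why the wrapper that fell back to reference_integer_ops::AveragePool for filters larger than 16x16, together with its include, is removed. Below is a minimal standalone sketch of that widening accumulation pattern, assuming a flat input buffer; the helper name AccumulateInt8ToInt32 and the loop bounds are illustrative only and are not part of the commit.

#include <cstdint>
#if defined(__ARM_NEON)
#include <arm_neon.h>
#endif

// Illustrative helper (not from the commit): accumulates `size` int8 values
// from `input` into the int32 accumulators in `acc`, 16 lanes at a time when
// NEON is available, mirroring the widening scheme used in the patched
// AveragePool kernel.
inline void AccumulateInt8ToInt32(const int8_t* input, int32_t* acc,
                                  int size) {
  int i = 0;
#if defined(__ARM_NEON)
  for (; i <= size - 16; i += 16) {
    const int8x16_t in = vld1q_s8(input + i);
    // Widen the low and high halves from int8 to int16.
    const int16x8_t lo = vmovl_s8(vget_low_s8(in));
    const int16x8_t hi = vmovl_s8(vget_high_s8(in));
    // Widen from int16 to int32 while adding into the existing accumulators.
    vst1q_s32(acc + i + 0,
              vaddw_s16(vld1q_s32(acc + i + 0), vget_low_s16(lo)));
    vst1q_s32(acc + i + 4,
              vaddw_s16(vld1q_s32(acc + i + 4), vget_high_s16(lo)));
    vst1q_s32(acc + i + 8,
              vaddw_s16(vld1q_s32(acc + i + 8), vget_low_s16(hi)));
    vst1q_s32(acc + i + 12,
              vaddw_s16(vld1q_s32(acc + i + 12), vget_high_s16(hi)));
  }
#endif
  // Scalar tail, and portable fallback when NEON is unavailable.
  for (; i < size; ++i) {
    acc[i] += input[i];
  }
}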