Neon acceleration applied on kernel module.
PiperOrigin-RevId: 347588687 Change-Id: Ieda280ecf7cb872feea481140856485fde0a9ca7
This commit is contained in:
		
							parent
							
								
									203f9256a5
								
							
						
					
					
						commit
						d29c99439d
					
				| @ -26,7 +26,6 @@ limitations under the License. | ||||
| #include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h" | ||||
| #include "tensorflow/lite/kernels/internal/optimized/neon_check.h" | ||||
| #include "tensorflow/lite/kernels/internal/quantization_util.h" | ||||
| #include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h" | ||||
| #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" | ||||
| #include "tensorflow/lite/kernels/internal/strided_slice_logic.h" | ||||
| #include "tensorflow/lite/kernels/internal/tensor_utils.h" | ||||
| @ -145,11 +144,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| inline void AveragePool16(const PoolParams& params, | ||||
|                           const RuntimeShape& input_shape, | ||||
|                           const int8* input_data, | ||||
| inline void AveragePool(const PoolParams& params, | ||||
|                         const RuntimeShape& input_shape, const int8* input_data, | ||||
|                         const RuntimeShape& output_shape, int8* output_data) { | ||||
|   ruy::profiler::ScopeLabel label("AveragePool/8bitWith16bitAccumulator"); | ||||
|   ruy::profiler::ScopeLabel label("AveragePool/8bitWith32bitAccumulator"); | ||||
| 
 | ||||
|   // Here, and in other pooling ops, in order to maintain locality of reference,
 | ||||
|   // to minimize some recalculations, and to load into NEON vector registers, we
 | ||||
| @ -171,7 +169,7 @@ inline void AveragePool16(const PoolParams& params, | ||||
|   const int stride_height = params.stride_height; | ||||
|   const int stride_width = params.stride_width; | ||||
| 
 | ||||
|   int16 acc[kPoolingAccTrancheSize]; | ||||
|   int32 acc[kPoolingAccTrancheSize]; | ||||
|   for (int batch = 0; batch < batches; ++batch) { | ||||
|     // We proceed through the depth in tranches (see comment above). The
 | ||||
|     // depth_base is the depth at the beginning of the tranche. The
 | ||||
| @ -207,24 +205,30 @@ inline void AveragePool16(const PoolParams& params, | ||||
|               int channel = 0; | ||||
| #ifdef USE_NEON | ||||
|               for (; channel <= tranche_depth - 16; channel += 16) { | ||||
|                 int16x8_t acc_reg[2]; | ||||
|                 for (int i = 0; i < 2; i++) { | ||||
|                   acc_reg[i] = vld1q_s16(acc + channel + 8 * i); | ||||
|                 } | ||||
|                 int16x4_t acc_reg[4]; | ||||
|                 int8x16_t input_reg = vld1q_s8(input_channel_ptr); | ||||
|                 input_channel_ptr += 16; | ||||
|                 acc_reg[0] = vaddw_s8(acc_reg[0], vget_low_s8(input_reg)); | ||||
|                 acc_reg[1] = vaddw_s8(acc_reg[1], vget_high_s8(input_reg)); | ||||
|                 for (int i = 0; i < 2; i++) { | ||||
|                   vst1q_s16(acc + channel + 8 * i, acc_reg[i]); | ||||
|                 acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg))); | ||||
|                 acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg))); | ||||
|                 acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg))); | ||||
|                 acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg))); | ||||
|                 for (int i = 0; i < 4; i++) { | ||||
|                   vst1q_s32( | ||||
|                       acc + channel + 4 * i, | ||||
|                       vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); | ||||
|                 } | ||||
|               } | ||||
|               for (; channel <= tranche_depth - 8; channel += 8) { | ||||
|                 int16x8_t acc_reg = vld1q_s16(acc + channel); | ||||
|                 int8x8_t input_reg = vld1_s8(input_channel_ptr); | ||||
|                 int16x4_t acc_reg[2]; | ||||
|                 int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr)); | ||||
|                 input_channel_ptr += 8; | ||||
|                 acc_reg = vaddw_s8(acc_reg, input_reg); | ||||
|                 vst1q_s16(acc + channel, acc_reg); | ||||
|                 acc_reg[0] = vget_low_s16(input_reg); | ||||
|                 acc_reg[1] = vget_high_s16(input_reg); | ||||
|                 for (int i = 0; i < 2; i++) { | ||||
|                   vst1q_s32( | ||||
|                       acc + channel + 4 * i, | ||||
|                       vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); | ||||
|                 } | ||||
|               } | ||||
| #endif | ||||
|               for (; channel < tranche_depth; ++channel) { | ||||
| @ -237,24 +241,6 @@ inline void AveragePool16(const PoolParams& params, | ||||
|                                                   out_x, depth_base); | ||||
|           int channel = 0; | ||||
| #ifdef USE_NEON | ||||
| #define AVGPOOL_DIVIDING_BY(FILTER_COUNT)                                    \ | ||||
|   if (filter_count == FILTER_COUNT) {                                        \ | ||||
|     for (; channel <= tranche_depth - 8; channel += 8) {                     \ | ||||
|       int16 buf[8];                                                          \ | ||||
|       for (int i = 0; i < 8; i++) {                                          \ | ||||
|         buf[i] = acc[channel + i] > 0                                        \ | ||||
|                      ? (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT  \ | ||||
|                      : (acc[channel + i] - FILTER_COUNT / 2) / FILTER_COUNT; \ | ||||
|       }                                                                      \ | ||||
|       int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf));                            \ | ||||
|       buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max));      \ | ||||
|       buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min));      \ | ||||
|       vst1_s8(output_ptr + channel, buf8);                                   \ | ||||
|     }                                                                        \ | ||||
|   } | ||||
|           AVGPOOL_DIVIDING_BY(9) | ||||
|           AVGPOOL_DIVIDING_BY(15) | ||||
| #undef AVGPOOL_DIVIDING_BY | ||||
|           for (; channel <= tranche_depth - 8; channel += 8) { | ||||
|             int16 buf[8]; | ||||
|             for (int i = 0; i < 8; i++) { | ||||
| @ -283,17 +269,6 @@ inline void AveragePool16(const PoolParams& params, | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| inline void AveragePool(const PoolParams& params, | ||||
|                         const RuntimeShape& input_shape, const int8* input_data, | ||||
|                         const RuntimeShape& output_shape, int8* output_data) { | ||||
|   if (params.filter_height * params.filter_width > 16 * 16) { | ||||
|     reference_integer_ops::AveragePool(params, input_shape, input_data, | ||||
|                                        output_shape, output_data); | ||||
|   } else { | ||||
|     AveragePool16(params, input_shape, input_data, output_shape, output_data); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| }  // namespace optimized_integer_ops
 | ||||
| }  // namespace tflite
 | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user