diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 6f246e7a169..2ef61e52b65 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -5125,9 +5125,10 @@ inline void Requantize(const input_type* input_data, int32_t size,
 
 #ifdef USE_NEON
 inline void MultiplyByQuantizedMultiplier4Rows(
-    int32x4_t input_val_1, int32x4_t input_val_2, int32x4_t input_val_3,
-    int32x4_t input_val_4, int32_t multiplier, int32_t left_shifted_one,
-    int32_t right_shift, int32x4_t* result_val_1, int32x4_t* result_val_2,
+    const int32x4_t input_val_1, const int32x4_t input_val_2,
+    const int32x4_t input_val_3, const int32x4_t input_val_4,
+    const int32_t multiplier, const int32_t left_shifted_one,
+    const int32_t right_shift, int32x4_t* result_val_1, int32x4_t* result_val_2,
     int32x4_t* result_val_3, int32x4_t* result_val_4) {
   using gemmlowp::RoundingDivideByPOT;
   using gemmlowp::SaturatingRoundingDoublingHighMul;
@@ -5167,20 +5168,21 @@ inline void Requantize<int8_t, uint8_t>(const int8_t* input_data, int32_t size,
   int i = 0;
 #ifdef USE_NEON
   // Constants.
-  int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
-  int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
-  int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
-  int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
+  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
+  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
+  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
+  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
 
   // Left shift & right shift unconditionally.
-  int32_t left_shifted_one =
+  const int32_t left_shifted_one =
       effective_scale_shift > 0 ? 1 << effective_scale_shift : 1;
-  int32_t right_shift = effective_scale_shift > 0 ? 0 : -effective_scale_shift;
+  const int32_t right_shift =
+      effective_scale_shift > 0 ? 0 : -effective_scale_shift;
   for (; i <= size - 16; i += 16) {
-    int8x16_t input_vec = vld1q_s8(input_data + i);
-    int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
-    int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
+    const int8x16_t input_vec = vld1q_s8(input_data + i);
+    const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
+    const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
     int32x4_t input_val_1 = vmovl_s16(vget_low_s16(first_half));
     int32x4_t input_val_2 = vmovl_s16(vget_high_s16(first_half));
     int32x4_t input_val_3 = vmovl_s16(vget_low_s16(second_half));
     int32x4_t input_val_4 = vmovl_s16(vget_high_s16(second_half));
@@ -5205,21 +5207,27 @@ inline void Requantize<int8_t, uint8_t>(const int8_t* input_data, int32_t size,
     result_val_3 = vmaxq_s32(vminq_s32(result_val_3, max_val_dup), min_val_dup);
     result_val_4 = vmaxq_s32(vminq_s32(result_val_4, max_val_dup), min_val_dup);
 
-    uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result_val_1);
-    uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result_val_2);
-    uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result_val_3);
-    uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result_val_4);
+    const uint32x4_t result_val_1_unsigned =
+        vreinterpretq_u32_s32(result_val_1);
+    const uint32x4_t result_val_2_unsigned =
+        vreinterpretq_u32_s32(result_val_2);
+    const uint32x4_t result_val_3_unsigned =
+        vreinterpretq_u32_s32(result_val_3);
+    const uint32x4_t result_val_4_unsigned =
+        vreinterpretq_u32_s32(result_val_4);
 
-    uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
-    uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
-    uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
-    uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
-    uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2);
-    uint16x8_t output_second_half =
+    const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
+    const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
+    const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
+    const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
+    const uint16x8_t output_first_half =
+        vcombine_u16(narrowed_val_1, narrowed_val_2);
+    const uint16x8_t output_second_half =
         vcombine_u16(narrowed_val_3, narrowed_val_4);
-    uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
-    uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
-    uint8x16_t result = vcombine_u8(narrowed_first_half, narrowed_second_half);
+    const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
+    const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
+    const uint8x16_t result =
+        vcombine_u8(narrowed_first_half, narrowed_second_half);
     vst1q_u8(output_data + i, result);
   }
 
@@ -5243,7 +5251,7 @@ inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
                        int32_t input_zeropoint, int32_t output_zeropoint,
                        int8_t* output_data) {
-  gemmlowp::ScopedProfilingLabel label("Requantize/UInt8ToInt8");
+  gemmlowp::ScopedProfilingLabel label("Requantize/Uint8ToInt8");
 
   static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
   static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();
 
@@ -5251,20 +5259,21 @@ inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
   int i = 0;
 #ifdef USE_NEON
   // Constants.
-  int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
-  int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
-  int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
-  int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
+  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
+  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
+  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
+  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
 
   // Left shift & right shift unconditionally.
-  int32_t left_shifted_one =
+  const int32_t left_shifted_one =
      effective_scale_shift > 0 ? 1 << effective_scale_shift : 1;
-  int32_t right_shift = effective_scale_shift > 0 ? 0 : -effective_scale_shift;
+  const int32_t right_shift =
+      effective_scale_shift > 0 ? 0 : -effective_scale_shift;
   for (; i <= size - 16; i += 16) {
-    uint8x16_t input_vec = vld1q_u8(input_data + i);
-    uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
-    uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
+    const uint8x16_t input_vec = vld1q_u8(input_data + i);
+    const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
+    const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
     int32x4_t input_val_1 =
         vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
     int32x4_t input_val_2 =
@@ -5293,15 +5302,18 @@ inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
     result_val_3 = vmaxq_s32(vminq_s32(result_val_3, max_val_dup), min_val_dup);
     result_val_4 = vmaxq_s32(vminq_s32(result_val_4, max_val_dup), min_val_dup);
 
-    int16x4_t narrowed_val_1 = vqmovn_s32(result_val_1);
-    int16x4_t narrowed_val_2 = vqmovn_s32(result_val_2);
-    int16x4_t narrowed_val_3 = vqmovn_s32(result_val_3);
-    int16x4_t narrowed_val_4 = vqmovn_s32(result_val_4);
-    int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2);
-    int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4);
-    int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
-    int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
-    int8x16_t result = vcombine_s8(narrowed_first_half, narrowed_second_half);
+    const int16x4_t narrowed_val_1 = vqmovn_s32(result_val_1);
+    const int16x4_t narrowed_val_2 = vqmovn_s32(result_val_2);
+    const int16x4_t narrowed_val_3 = vqmovn_s32(result_val_3);
+    const int16x4_t narrowed_val_4 = vqmovn_s32(result_val_4);
+    const int16x8_t output_first_half =
+        vcombine_s16(narrowed_val_1, narrowed_val_2);
+    const int16x8_t output_second_half =
+        vcombine_s16(narrowed_val_3, narrowed_val_4);
+    const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
+    const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
+    const int8x16_t result =
+        vcombine_s8(narrowed_first_half, narrowed_second_half);
     vst1q_s8(output_data + i, result);
   }
 
@@ -5318,6 +5330,180 @@ inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
   }
 }
 
+template <>
+inline void Requantize<int8_t, int8_t>(const int8_t* input_data, int32_t size,
+                                       int32_t effective_scale_multiplier,
+                                       int32_t effective_scale_shift,
+                                       int32_t input_zeropoint,
+                                       int32_t output_zeropoint,
+                                       int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Requantize/Int8ToInt8");
+
+  static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
+  static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();
+
+  int i = 0;
+#ifdef USE_NEON
+  // Constants.
+  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
+  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
+  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
+  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
+
+  // Left shift & right shift unconditionally.
+  int32_t left_shifted_one =
+      effective_scale_shift > 0 ? 1 << effective_scale_shift : 1;
+  int32_t right_shift = effective_scale_shift > 0 ? 0 : -effective_scale_shift;
+
+  for (; i <= size - 16; i += 16) {
+    const int8x16_t input_vec = vld1q_s8(input_data + i);
+    const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
+    const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
+    int32x4_t input_val_1 = vmovl_s16(vget_low_s16(first_half));
+    int32x4_t input_val_2 = vmovl_s16(vget_high_s16(first_half));
+    int32x4_t input_val_3 = vmovl_s16(vget_low_s16(second_half));
+    int32x4_t input_val_4 = vmovl_s16(vget_high_s16(second_half));
+
+    input_val_1 = vaddq_s32(input_val_1, input_zero_point_dup);
+    input_val_2 = vaddq_s32(input_val_2, input_zero_point_dup);
+    input_val_3 = vaddq_s32(input_val_3, input_zero_point_dup);
+    input_val_4 = vaddq_s32(input_val_4, input_zero_point_dup);
+
+    int32x4_t result_val_1, result_val_2, result_val_3, result_val_4;
+    MultiplyByQuantizedMultiplier4Rows(
+        input_val_1, input_val_2, input_val_3, input_val_4,
+        effective_scale_multiplier, left_shifted_one, right_shift,
+        &result_val_1, &result_val_2, &result_val_3, &result_val_4);
+
+    result_val_1 = vaddq_s32(result_val_1, output_zero_point_dup);
+    result_val_2 = vaddq_s32(result_val_2, output_zero_point_dup);
+    result_val_3 = vaddq_s32(result_val_3, output_zero_point_dup);
+    result_val_4 = vaddq_s32(result_val_4, output_zero_point_dup);
+    result_val_1 = vmaxq_s32(vminq_s32(result_val_1, max_val_dup), min_val_dup);
+    result_val_2 = vmaxq_s32(vminq_s32(result_val_2, max_val_dup), min_val_dup);
+    result_val_3 = vmaxq_s32(vminq_s32(result_val_3, max_val_dup), min_val_dup);
+    result_val_4 = vmaxq_s32(vminq_s32(result_val_4, max_val_dup), min_val_dup);
+
+    const int16x4_t narrowed_val_1 = vqmovn_s32(result_val_1);
+    const int16x4_t narrowed_val_2 = vqmovn_s32(result_val_2);
+    const int16x4_t narrowed_val_3 = vqmovn_s32(result_val_3);
+    const int16x4_t narrowed_val_4 = vqmovn_s32(result_val_4);
+    const int16x8_t output_first_half =
+        vcombine_s16(narrowed_val_1, narrowed_val_2);
+    const int16x8_t output_second_half =
+        vcombine_s16(narrowed_val_3, narrowed_val_4);
+    const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
+    const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
+    const int8x16_t result =
+        vcombine_s8(narrowed_first_half, narrowed_second_half);
+    vst1q_s8(output_data + i, result);
+  }
+
+#endif
+  for (; i < size; ++i) {
+    const int32_t input = input_data[i] - input_zeropoint;
+    const int32_t output =
+        MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
+                                      effective_scale_shift) +
+        output_zeropoint;
+    const int32_t clamped_output =
+        std::max(std::min(output, kMaxOutput), kMinOutput);
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+template <>
+inline void Requantize<uint8_t, uint8_t>(
+    const uint8_t* input_data, int32_t size, int32_t effective_scale_multiplier,
+    int32_t effective_scale_shift, int32_t input_zeropoint,
+    int32_t output_zeropoint, uint8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Requantize/Uint8ToUint8");
+
+  static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
+  static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();
+
+  int i = 0;
+#ifdef USE_NEON
+  // Constants.
+  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
+  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
+  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
+  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
+
+  // Left shift & right shift unconditionally.
+  int32_t left_shifted_one =
+      effective_scale_shift > 0 ? 1 << effective_scale_shift : 1;
+  int32_t right_shift = effective_scale_shift > 0 ? 0 : -effective_scale_shift;
+
+  for (; i <= size - 16; i += 16) {
+    const uint8x16_t input_vec = vld1q_u8(input_data + i);
+    const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
+    const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
+    int32x4_t input_val_1 =
+        vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
+    int32x4_t input_val_2 =
+        vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half)));
+    int32x4_t input_val_3 =
+        vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half)));
+    int32x4_t input_val_4 =
+        vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half)));
+    input_val_1 = vaddq_s32(input_val_1, input_zero_point_dup);
+    input_val_2 = vaddq_s32(input_val_2, input_zero_point_dup);
+    input_val_3 = vaddq_s32(input_val_3, input_zero_point_dup);
+    input_val_4 = vaddq_s32(input_val_4, input_zero_point_dup);
+
+    int32x4_t result_val_1, result_val_2, result_val_3, result_val_4;
+    MultiplyByQuantizedMultiplier4Rows(
+        input_val_1, input_val_2, input_val_3, input_val_4,
+        effective_scale_multiplier, left_shifted_one, right_shift,
+        &result_val_1, &result_val_2, &result_val_3, &result_val_4);
+
+    result_val_1 = vaddq_s32(result_val_1, output_zero_point_dup);
+    result_val_2 = vaddq_s32(result_val_2, output_zero_point_dup);
+    result_val_3 = vaddq_s32(result_val_3, output_zero_point_dup);
+    result_val_4 = vaddq_s32(result_val_4, output_zero_point_dup);
+    result_val_1 = vmaxq_s32(vminq_s32(result_val_1, max_val_dup), min_val_dup);
+    result_val_2 = vmaxq_s32(vminq_s32(result_val_2, max_val_dup), min_val_dup);
+    result_val_3 = vmaxq_s32(vminq_s32(result_val_3, max_val_dup), min_val_dup);
+    result_val_4 = vmaxq_s32(vminq_s32(result_val_4, max_val_dup), min_val_dup);
+
+    const uint32x4_t result_val_1_unsigned =
+        vreinterpretq_u32_s32(result_val_1);
+    const uint32x4_t result_val_2_unsigned =
+        vreinterpretq_u32_s32(result_val_2);
+    const uint32x4_t result_val_3_unsigned =
+        vreinterpretq_u32_s32(result_val_3);
+    const uint32x4_t result_val_4_unsigned =
+        vreinterpretq_u32_s32(result_val_4);
+
+    const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
+    const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
+    const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
+    const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
+    const uint16x8_t output_first_half =
+        vcombine_u16(narrowed_val_1, narrowed_val_2);
+    const uint16x8_t output_second_half =
+        vcombine_u16(narrowed_val_3, narrowed_val_4);
+    const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
+    const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
+    const uint8x16_t result =
+        vcombine_u8(narrowed_first_half, narrowed_second_half);
+    vst1q_u8(output_data + i, result);
+  }
+
+#endif
+  for (; i < size; ++i) {
+    const int32_t input = input_data[i] - input_zeropoint;
+    const int32_t output =
+        MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
+                                      effective_scale_shift) +
+        output_zeropoint;
+    const int32_t clamped_output =
+        std::max(std::min(output, kMaxOutput), kMinOutput);
+    output_data[i] = static_cast<uint8_t>(clamped_output);
+  }
+}
+
 inline void HardSwish(const RuntimeShape& input_shape, const float* input_data,
                       const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("HardSwish/Float");
diff --git a/tensorflow/lite/kernels/quantize_test.cc b/tensorflow/lite/kernels/quantize_test.cc
index e720f74728e..69b6f7dbc26 100644
--- a/tensorflow/lite/kernels/quantize_test.cc
+++ b/tensorflow/lite/kernels/quantize_test.cc
@@ -129,6 +129,20 @@ TEST(QuantizeOpTest, Int8Int8SmallerScale) {
       ElementsAreArray({1, 3, 5, 7, 9, 11, 13, 15, 17, 19}));
 }
 
+// Same as the previous test, except with more data, to hit the NEON path.
+TEST(QuantizeOpTest, Int8Int8SmallerScaleNeonPath) {
+  QuantizeOpModel m({TensorType_INT8, {1, 1, 4, 5}, -127, 128},
+                    {TensorType_INT8, {1, 1, 4, 5}, -63.5, 64});
+
+  // Input will be quantized to {0,1,2,3,4,5,6,7,8,9,9,8,7,6,5,4,3,2,1,0}.
+  m.SetInputAndQuantize<int8_t>(
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+                                19, 17, 15, 13, 11, 9, 7, 5, 3, 1}));
+}
+
 // Input scale 0.500000, output scale 0.500000, input zeropoint 127, output
 // zeropoint 127
 TEST(QuantizeOpTest, UInt8UInt8SameScale) {
@@ -171,6 +185,22 @@ TEST(QuantizeOpTest, Uint8Uint8SmallerScale) {
       ElementsAreArray({129, 131, 133, 135, 137, 139, 141, 143, 145, 147}));
 }
 
+// Same as the previous test, except with more data, to hit the NEON path.
+TEST(QuantizeOpTest, Uint8Uint8SmallerScaleNeonPath) {
+  QuantizeOpModel m({TensorType_UINT8, {1, 1, 4, 5}, -127, 128},
+                    {TensorType_UINT8, {1, 1, 4, 5}, -63.5, 64});
+
+  // Input will be quantized to {128, 129, 130, 131, 132, 133, 134, 135, 136,
+  // 137, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128}.
+  m.SetInputAndQuantize<uint8_t>(
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutput<uint8_t>(),
+      ElementsAreArray({129, 131, 133, 135, 137, 139, 141, 143, 145, 147,
+                        147, 145, 143, 141, 139, 137, 135, 133, 131, 129}));
+}
+
 // Input scale 1.000000, output scale 1.000000, input zeropoint -1, output
 // zeropoint 127
 TEST(QuantizeOpTest, Int8Uint8SameScale) {
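
Note (not part of the patch): the NEON loops above vectorize the scalar helper
MultiplyByQuantizedMultiplier by splitting its shift into an unconditional
pre-multiply by `left_shifted_one` and an unconditional rounding right shift,
so the sign of `effective_scale_shift` never has to be tested per lane. Below
is a minimal standalone scalar sketch of that decomposition, assuming
gemmlowp's usual rounding semantics; the helper re-implementations and the
name MultiplyByQuantizedMultiplierScalar are illustrative, not the gemmlowp
sources, and the multiplier/shift/zero-point values in main() are derived by
hand from the int8 tests above (effective scale 1.0 / 0.5 = 2.0).

// requantize_sketch.cc -- scalar model of the requantize arithmetic.
#include <cstdint>
#include <cstdio>
#include <limits>

// Q31 doubling high multiply, round-to-nearest: (a * b * 2) >> 31.
// Saturates only for a == b == INT32_MIN, the single overflowing pair.
int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) {
  if (a == b && a == std::numeric_limits<int32_t>::min()) {
    return std::numeric_limits<int32_t>::max();
  }
  const int64_t ab_64 = static_cast<int64_t>(a) * b;
  const int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
  return static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
}

// Arithmetic right shift with round-to-nearest, ties away from zero.
int32_t RoundingDivideByPOT(int32_t x, int32_t exponent) {
  const int32_t mask = static_cast<int32_t>((1ll << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// Scalar equivalent of the "left shift & right shift unconditionally"
// trick: exactly one of the two shifts is a no-op, so no branch on the
// sign of `shift` is needed per element.
int32_t MultiplyByQuantizedMultiplierScalar(int32_t x, int32_t multiplier,
                                            int32_t shift) {
  const int32_t left_shifted_one = shift > 0 ? 1 << shift : 1;
  const int32_t right_shift = shift > 0 ? 0 : -shift;
  return RoundingDivideByPOT(
      SaturatingRoundingDoublingHighMul(x * left_shifted_one, multiplier),
      right_shift);
}

int main() {
  // Effective scale 2.0 encoded as multiplier 0.5 in Q31 (1 << 30) and
  // shift +2; both zero points are -1, consistent with the int8 tests.
  const int32_t multiplier = 1 << 30;
  const int32_t shift = 2;
  const int32_t input = 9 - (-1);  // quantized input minus input zero point
  const int32_t scaled =
      MultiplyByQuantizedMultiplierScalar(input, multiplier, shift);
  const int32_t output = scaled + (-1);  // add output zero point
  std::printf("%d\n", output);           // prints 19, the test's expectation
  return 0;
}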