Optimize requantize methods for int8 to int8 and uint8 to uint8

PiperOrigin-RevId: 261618865
This commit is contained in:
Jaesung Chung 2019-08-04 22:05:12 -07:00 committed by TensorFlower Gardener
parent fd2d5a3739
commit e920181a6f
2 changed files with 260 additions and 44 deletions

View File

@ -5125,9 +5125,10 @@ inline void Requantize(const input_type* input_data, int32_t size,
#ifdef USE_NEON
inline void MultiplyByQuantizedMultiplier4Rows(
int32x4_t input_val_1, int32x4_t input_val_2, int32x4_t input_val_3,
int32x4_t input_val_4, int32_t multiplier, int32_t left_shifted_one,
int32_t right_shift, int32x4_t* result_val_1, int32x4_t* result_val_2,
const int32x4_t input_val_1, const int32x4_t input_val_2,
const int32x4_t input_val_3, const int32x4_t input_val_4,
const int32_t multiplier, const int32_t left_shifted_one,
const int32_t right_shift, int32x4_t* result_val_1, int32x4_t* result_val_2,
int32x4_t* result_val_3, int32x4_t* result_val_4) {
using gemmlowp::RoundingDivideByPOT;
using gemmlowp::SaturatingRoundingDoublingHighMul;
@ -5167,20 +5168,21 @@ inline void Requantize<int8_t, uint8_t>(const int8_t* input_data, int32_t size,
int i = 0;
#ifdef USE_NEON
// Constants.
int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
// Left shift & right shift unconditionally.
int32_t left_shifted_one =
const int32_t left_shifted_one =
effective_scale_shift > 0 ? 1 << effective_scale_shift : 1;
int32_t right_shift = effective_scale_shift > 0 ? 0 : -effective_scale_shift;
const int32_t right_shift =
effective_scale_shift > 0 ? 0 : -effective_scale_shift;
for (; i <= size - 16; i += 16) {
int8x16_t input_vec = vld1q_s8(input_data + i);
int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
const int8x16_t input_vec = vld1q_s8(input_data + i);
const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
int32x4_t input_val_1 = vmovl_s16(vget_low_s16(first_half));
int32x4_t input_val_2 = vmovl_s16(vget_high_s16(first_half));
int32x4_t input_val_3 = vmovl_s16(vget_low_s16(second_half));
@ -5205,21 +5207,27 @@ inline void Requantize<int8_t, uint8_t>(const int8_t* input_data, int32_t size,
result_val_3 = vmaxq_s32(vminq_s32(result_val_3, max_val_dup), min_val_dup);
result_val_4 = vmaxq_s32(vminq_s32(result_val_4, max_val_dup), min_val_dup);
uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result_val_1);
uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result_val_2);
uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result_val_3);
uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result_val_4);
const uint32x4_t result_val_1_unsigned =
vreinterpretq_u32_s32(result_val_1);
const uint32x4_t result_val_2_unsigned =
vreinterpretq_u32_s32(result_val_2);
const uint32x4_t result_val_3_unsigned =
vreinterpretq_u32_s32(result_val_3);
const uint32x4_t result_val_4_unsigned =
vreinterpretq_u32_s32(result_val_4);
uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2);
uint16x8_t output_second_half =
const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
const uint16x8_t output_first_half =
vcombine_u16(narrowed_val_1, narrowed_val_2);
const uint16x8_t output_second_half =
vcombine_u16(narrowed_val_3, narrowed_val_4);
uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
uint8x16_t result = vcombine_u8(narrowed_first_half, narrowed_second_half);
const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
const uint8x16_t result =
vcombine_u8(narrowed_first_half, narrowed_second_half);
vst1q_u8(output_data + i, result);
}
@ -5243,7 +5251,7 @@ inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
int32_t input_zeropoint,
int32_t output_zeropoint,
int8_t* output_data) {
gemmlowp::ScopedProfilingLabel label("Requantize/UInt8ToInt8");
gemmlowp::ScopedProfilingLabel label("Requantize/Uint8ToInt8");
static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();
@ -5251,20 +5259,21 @@ inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
int i = 0;
#ifdef USE_NEON
// Constants.
int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
// Left shift & right shift unconditionally.
int32_t left_shifted_one =
const int32_t left_shifted_one =
effective_scale_shift > 0 ? 1 << effective_scale_shift : 1;
int32_t right_shift = effective_scale_shift > 0 ? 0 : -effective_scale_shift;
const int32_t right_shift =
effective_scale_shift > 0 ? 0 : -effective_scale_shift;
for (; i <= size - 16; i += 16) {
uint8x16_t input_vec = vld1q_u8(input_data + i);
uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
const uint8x16_t input_vec = vld1q_u8(input_data + i);
const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
int32x4_t input_val_1 =
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
int32x4_t input_val_2 =
@ -5293,15 +5302,18 @@ inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
result_val_3 = vmaxq_s32(vminq_s32(result_val_3, max_val_dup), min_val_dup);
result_val_4 = vmaxq_s32(vminq_s32(result_val_4, max_val_dup), min_val_dup);
int16x4_t narrowed_val_1 = vqmovn_s32(result_val_1);
int16x4_t narrowed_val_2 = vqmovn_s32(result_val_2);
int16x4_t narrowed_val_3 = vqmovn_s32(result_val_3);
int16x4_t narrowed_val_4 = vqmovn_s32(result_val_4);
int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2);
int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4);
int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
int8x16_t result = vcombine_s8(narrowed_first_half, narrowed_second_half);
const int16x4_t narrowed_val_1 = vqmovn_s32(result_val_1);
const int16x4_t narrowed_val_2 = vqmovn_s32(result_val_2);
const int16x4_t narrowed_val_3 = vqmovn_s32(result_val_3);
const int16x4_t narrowed_val_4 = vqmovn_s32(result_val_4);
const int16x8_t output_first_half =
vcombine_s16(narrowed_val_1, narrowed_val_2);
const int16x8_t output_second_half =
vcombine_s16(narrowed_val_3, narrowed_val_4);
const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
const int8x16_t result =
vcombine_s8(narrowed_first_half, narrowed_second_half);
vst1q_s8(output_data + i, result);
}
@ -5318,6 +5330,180 @@ inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
}
}
template <>
inline void Requantize<int8_t, int8_t>(const int8_t* input_data, int32_t size,
                                       int32_t effective_scale_multiplier,
                                       int32_t effective_scale_shift,
                                       int32_t input_zeropoint,
                                       int32_t output_zeropoint,
                                       int8_t* output_data) {
  // Requantizes int8 values to int8 with a different scale/zero-point:
  //   out = clamp((in - input_zeropoint) * scale + output_zeropoint)
  // where the multiply uses the fixed-point (multiplier, shift) pair.
  gemmlowp::ScopedProfilingLabel label("Requantize/Int8ToInt8");
  // Output saturates to the full int8 range.
  static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
  static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();
  int i = 0;
#ifdef USE_NEON
  // Constants broadcast across all four lanes.
  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
  // Left shift & right shift unconditionally; exactly one of them is a
  // no-op depending on the sign of effective_scale_shift. Const-qualified
  // for consistency with the other Requantize specializations.
  const int32_t left_shifted_one =
      effective_scale_shift > 0 ? 1 << effective_scale_shift : 1;
  const int32_t right_shift =
      effective_scale_shift > 0 ? 0 : -effective_scale_shift;
  // Process 16 int8 values per iteration: widen to 4x int32x4, requantize,
  // then saturating-narrow back down to int8x16.
  for (; i <= size - 16; i += 16) {
    const int8x16_t input_vec = vld1q_s8(input_data + i);
    const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
    const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
    int32x4_t input_val_1 = vmovl_s16(vget_low_s16(first_half));
    int32x4_t input_val_2 = vmovl_s16(vget_high_s16(first_half));
    int32x4_t input_val_3 = vmovl_s16(vget_low_s16(second_half));
    int32x4_t input_val_4 = vmovl_s16(vget_high_s16(second_half));
    // Subtract the input zero-point (added as its negation).
    input_val_1 = vaddq_s32(input_val_1, input_zero_point_dup);
    input_val_2 = vaddq_s32(input_val_2, input_zero_point_dup);
    input_val_3 = vaddq_s32(input_val_3, input_zero_point_dup);
    input_val_4 = vaddq_s32(input_val_4, input_zero_point_dup);

    int32x4_t result_val_1, result_val_2, result_val_3, result_val_4;
    MultiplyByQuantizedMultiplier4Rows(
        input_val_1, input_val_2, input_val_3, input_val_4,
        effective_scale_multiplier, left_shifted_one, right_shift,
        &result_val_1, &result_val_2, &result_val_3, &result_val_4);

    result_val_1 = vaddq_s32(result_val_1, output_zero_point_dup);
    result_val_2 = vaddq_s32(result_val_2, output_zero_point_dup);
    result_val_3 = vaddq_s32(result_val_3, output_zero_point_dup);
    result_val_4 = vaddq_s32(result_val_4, output_zero_point_dup);
    // Clamp to [kMinOutput, kMaxOutput] before narrowing.
    result_val_1 = vmaxq_s32(vminq_s32(result_val_1, max_val_dup), min_val_dup);
    result_val_2 = vmaxq_s32(vminq_s32(result_val_2, max_val_dup), min_val_dup);
    result_val_3 = vmaxq_s32(vminq_s32(result_val_3, max_val_dup), min_val_dup);
    result_val_4 = vmaxq_s32(vminq_s32(result_val_4, max_val_dup), min_val_dup);

    const int16x4_t narrowed_val_1 = vqmovn_s32(result_val_1);
    const int16x4_t narrowed_val_2 = vqmovn_s32(result_val_2);
    const int16x4_t narrowed_val_3 = vqmovn_s32(result_val_3);
    const int16x4_t narrowed_val_4 = vqmovn_s32(result_val_4);
    const int16x8_t output_first_half =
        vcombine_s16(narrowed_val_1, narrowed_val_2);
    const int16x8_t output_second_half =
        vcombine_s16(narrowed_val_3, narrowed_val_4);
    const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
    const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
    const int8x16_t result =
        vcombine_s8(narrowed_first_half, narrowed_second_half);
    vst1q_s8(output_data + i, result);
  }
#endif
  // Scalar tail (and the whole loop when NEON is unavailable).
  for (; i < size; ++i) {
    const int32_t input = input_data[i] - input_zeropoint;
    const int32_t output =
        MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
                                      effective_scale_shift) +
        output_zeropoint;
    const int32_t clamped_output =
        std::max(std::min(output, kMaxOutput), kMinOutput);
    output_data[i] = static_cast<int8_t>(clamped_output);
  }
}
template <>
inline void Requantize<uint8_t, uint8_t>(
    const uint8_t* input_data, int32_t size, int32_t effective_scale_multiplier,
    int32_t effective_scale_shift, int32_t input_zeropoint,
    int32_t output_zeropoint, uint8_t* output_data) {
  // Requantizes uint8 values to uint8 with a different scale/zero-point:
  //   out = clamp((in - input_zeropoint) * scale + output_zeropoint)
  // where the multiply uses the fixed-point (multiplier, shift) pair.
  gemmlowp::ScopedProfilingLabel label("Requantize/Uint8ToUint8");
  // Output saturates to the full uint8 range.
  static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
  static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();
  int i = 0;
#ifdef USE_NEON
  // Constants broadcast across all four lanes.
  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
  // Left shift & right shift unconditionally; exactly one of them is a
  // no-op depending on the sign of effective_scale_shift. Const-qualified
  // for consistency with the other Requantize specializations.
  const int32_t left_shifted_one =
      effective_scale_shift > 0 ? 1 << effective_scale_shift : 1;
  const int32_t right_shift =
      effective_scale_shift > 0 ? 0 : -effective_scale_shift;
  // Process 16 uint8 values per iteration: widen to 4x int32x4 (the values
  // fit in int32 so the reinterpret is safe), requantize, then
  // saturating-narrow back down to uint8x16.
  for (; i <= size - 16; i += 16) {
    const uint8x16_t input_vec = vld1q_u8(input_data + i);
    const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
    const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
    int32x4_t input_val_1 =
        vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
    int32x4_t input_val_2 =
        vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half)));
    int32x4_t input_val_3 =
        vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half)));
    int32x4_t input_val_4 =
        vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half)));
    // Subtract the input zero-point (added as its negation).
    input_val_1 = vaddq_s32(input_val_1, input_zero_point_dup);
    input_val_2 = vaddq_s32(input_val_2, input_zero_point_dup);
    input_val_3 = vaddq_s32(input_val_3, input_zero_point_dup);
    input_val_4 = vaddq_s32(input_val_4, input_zero_point_dup);

    int32x4_t result_val_1, result_val_2, result_val_3, result_val_4;
    MultiplyByQuantizedMultiplier4Rows(
        input_val_1, input_val_2, input_val_3, input_val_4,
        effective_scale_multiplier, left_shifted_one, right_shift,
        &result_val_1, &result_val_2, &result_val_3, &result_val_4);

    result_val_1 = vaddq_s32(result_val_1, output_zero_point_dup);
    result_val_2 = vaddq_s32(result_val_2, output_zero_point_dup);
    result_val_3 = vaddq_s32(result_val_3, output_zero_point_dup);
    result_val_4 = vaddq_s32(result_val_4, output_zero_point_dup);
    // Clamp to [kMinOutput, kMaxOutput]; values are then non-negative, so
    // the signed->unsigned reinterpret below is value-preserving.
    result_val_1 = vmaxq_s32(vminq_s32(result_val_1, max_val_dup), min_val_dup);
    result_val_2 = vmaxq_s32(vminq_s32(result_val_2, max_val_dup), min_val_dup);
    result_val_3 = vmaxq_s32(vminq_s32(result_val_3, max_val_dup), min_val_dup);
    result_val_4 = vmaxq_s32(vminq_s32(result_val_4, max_val_dup), min_val_dup);

    const uint32x4_t result_val_1_unsigned =
        vreinterpretq_u32_s32(result_val_1);
    const uint32x4_t result_val_2_unsigned =
        vreinterpretq_u32_s32(result_val_2);
    const uint32x4_t result_val_3_unsigned =
        vreinterpretq_u32_s32(result_val_3);
    const uint32x4_t result_val_4_unsigned =
        vreinterpretq_u32_s32(result_val_4);
    const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
    const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
    const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
    const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
    const uint16x8_t output_first_half =
        vcombine_u16(narrowed_val_1, narrowed_val_2);
    const uint16x8_t output_second_half =
        vcombine_u16(narrowed_val_3, narrowed_val_4);
    const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
    const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
    const uint8x16_t result =
        vcombine_u8(narrowed_first_half, narrowed_second_half);
    vst1q_u8(output_data + i, result);
  }
#endif
  // Scalar tail (and the whole loop when NEON is unavailable).
  for (; i < size; ++i) {
    const int32_t input = input_data[i] - input_zeropoint;
    const int32_t output =
        MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
                                      effective_scale_shift) +
        output_zeropoint;
    const int32_t clamped_output =
        std::max(std::min(output, kMaxOutput), kMinOutput);
    output_data[i] = static_cast<uint8_t>(clamped_output);
  }
}
inline void HardSwish(const RuntimeShape& input_shape, const float* input_data,
const RuntimeShape& output_shape, float* output_data) {
gemmlowp::ScopedProfilingLabel label("HardSwish/Float");

View File

@ -129,6 +129,20 @@ TEST(QuantizeOpTest, Int8Int8SmallerScale) {
ElementsAreArray({1, 3, 5, 7, 9, 11, 13, 15, 17, 19}));
}
// Same as previous test, except more data to hit the neon path.
TEST(QuantizeOpTest, Int8Int8SmallerScaleNeonPath) {
QuantizeOpModel m({TensorType_INT8, {1, 1, 4, 5}, -127, 128},
{TensorType_INT8, {1, 1, 4, 5}, -63.5, 64});
// Input will be quantized to {0,1,2,3,4,5,6,7,8,9,9,8,7,6,5,4,3,2,1,0}.
m.SetInputAndQuantize<int8_t>(
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
m.Invoke();
EXPECT_THAT(m.GetOutput<int8_t>(),
ElementsAreArray({1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
19, 17, 15, 13, 11, 9, 7, 5, 3, 1}));
}
// Input scale 0.500000, output scale 0.500000, input zeropoint 127, output
// zeropoint 127
TEST(QuantizeOpTest, UInt8UInt8SameScale) {
@ -171,6 +185,22 @@ TEST(QuantizeOpTest, Uint8Uint8SmallerScale) {
ElementsAreArray({129, 131, 133, 135, 137, 139, 141, 143, 145, 147}));
}
// Same as previous test, except more data to hit the neon path.
TEST(QuantizeOpTest, Uint8Uint8SmallerScaleNeonPath) {
QuantizeOpModel m({TensorType_UINT8, {1, 1, 4, 5}, -127, 128},
{TensorType_UINT8, {1, 1, 4, 5}, -63.5, 64});
// Input will be quantized to {128, 129, 130, 131, 132, 133, 134, 135, 136,
// 137, 137, 136, 135, 134, 133, 132, 131, 130, 129, 128}.
m.SetInputAndQuantize<uint8_t>(
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1});
m.Invoke();
EXPECT_THAT(
m.GetOutput<uint8_t>(),
ElementsAreArray({129, 131, 133, 135, 137, 139, 141, 143, 145, 147,
147, 145, 143, 141, 139, 137, 135, 133, 131, 129}));
}
// Input scale 1.000000, output scale 1.000000, input zeropoint -1, output
// zeropoint 127
TEST(QuantizeOpTest, Int8Uint8SameScale) {