From ff2019a216aed7bbb1e30432b47abcfe5567f0b4 Mon Sep 17 00:00:00 2001
From: Renjie Liu
Date: Mon, 18 May 2020 07:06:15 -0700
Subject: [PATCH] Optimize multiply by quantize multiplier.

PiperOrigin-RevId: 312072311
Change-Id: I7d01be9aa8f1a238c6887d4770a1090899337383
---
 .../internal/optimized/optimized_ops.h        | 82 ++++++-------------
 1 file changed, 27 insertions(+), 55 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index b18f0f4bb5a..64598d70ee3 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -201,63 +201,35 @@ MatrixMap MapAsMatrixWithGivenNumberOfRows(Scalar* data,
 // MultiplyByQuantizedMultipler.
 #ifdef USE_NEON
 inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
-    int32x4x4_t input_val, int32 quantized_multiplier, int shift) {
-  using gemmlowp::RoundingDivideByPOT;
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  const int left_shift = shift > 0 ? shift : 0;
-  const int right_shift = shift > 0 ? 0 : -shift;
+    int32x4x4_t input_val, int32 quantized_multiplier, int32 shift) {
+  const int left_shift = std::max(shift, 0);
+  const int right_shift = std::min(shift, 0);
   int32x4x4_t result;
 
-  // The vector type support for SaturatingRoundingDoublingHighMulth in gemmlowp
-  // is limited to NEON.
-#ifdef GEMMLOWP_NEON
-  const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift);
-  result.val[0] =
-      RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
-                              vmulq_s32(input_val.val[0], left_shifted_one_dup),
-                              quantized_multiplier),
-                          right_shift);
-  result.val[1] =
-      RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
-                              vmulq_s32(input_val.val[1], left_shifted_one_dup),
-                              quantized_multiplier),
-                          right_shift);
-  result.val[2] =
-      RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
-                              vmulq_s32(input_val.val[2], left_shifted_one_dup),
-                              quantized_multiplier),
-                          right_shift);
-  result.val[3] =
-      RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
-                              vmulq_s32(input_val.val[3], left_shifted_one_dup),
-                              quantized_multiplier),
-                          right_shift);
-#else
-  for (int i = 0; i < 4; ++i) {
-    int32_t vals[4];
-    vals[0] = RoundingDivideByPOT(
-        SaturatingRoundingDoublingHighMul(
-            vgetq_lane_s32(input_val.val[i], 0) * (1 << left_shift),
-            quantized_multiplier),
-        right_shift);
-    vals[1] = RoundingDivideByPOT(
-        SaturatingRoundingDoublingHighMul(
-            vgetq_lane_s32(input_val.val[i], 1) * (1 << left_shift),
-            quantized_multiplier),
-        right_shift);
-    vals[2] = RoundingDivideByPOT(
-        SaturatingRoundingDoublingHighMul(
-            vgetq_lane_s32(input_val.val[i], 2) * (1 << left_shift),
-            quantized_multiplier),
-        right_shift);
-    vals[3] = RoundingDivideByPOT(
-        SaturatingRoundingDoublingHighMul(
-            vgetq_lane_s32(input_val.val[i], 3) * (1 << left_shift),
-            quantized_multiplier),
-        right_shift);
-    result.val[i] = vld1q_s32(reinterpret_cast<int32_t*>(&vals));
-  }
-#endif
+  int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
+  int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
+  int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
+
+  result.val[0] =
+      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup),
+                               multiplier_dup),
+                 right_shift_dup);
+
+  result.val[1] =
+      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup),
+                               multiplier_dup),
+                 right_shift_dup);
+
+  result.val[2] =
+      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup),
+                               multiplier_dup),
+                 right_shift_dup);
+
+  result.val[3] =
+      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup),
+                               multiplier_dup),
+                 right_shift_dup);
+
   return result;
 }
 #endif