Optimize multiplication by the quantized multiplier.

PiperOrigin-RevId: 312072311
Change-Id: I7d01be9aa8f1a238c6887d4770a1090899337383
This commit is contained in:
Renjie Liu 2020-05-18 07:06:15 -07:00 committed by TensorFlower Gardener
parent fb416f16e2
commit ff2019a216

View File

@ -201,63 +201,35 @@ MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
// MultiplyByQuantizedMultiplier.
#ifdef USE_NEON
// Scales every lane of `input_val` (four int32x4_t rows, i.e. 16 int32 values)
// by a fixed-point factor expressed as `quantized_multiplier` together with a
// signed power-of-two `shift`, and returns the four scaled rows.
//
// A positive `shift` is applied as a left shift before the multiply; a
// non-positive `shift` is applied as a rounding right shift after it.
//
// NOTE(review): this listing looks like a rendered diff whose +/- markers were
// stripped, so two parameter lists and two bodies are interleaved below. The
// gemmlowp-based variant is presumably the code being removed and the
// intrinsic-based variant the code replacing it -- confirm against the
// original commit before treating this text as compilable source.
inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
int32x4x4_t input_val, int32 quantized_multiplier, int shift) {
// --- gemmlowp-based variant: helpers and shift split ---
using gemmlowp::RoundingDivideByPOT;
using gemmlowp::SaturatingRoundingDoublingHighMul;
// Split the signed shift into two non-negative amounts: at most one of them
// is non-zero.
const int left_shift = shift > 0 ? shift : 0;
const int right_shift = shift > 0 ? 0 : -shift;
// --- intrinsic-based variant: parameter list and shift split ---
// Here `shift` is declared as int32 and right_shift keeps its sign (it is
// always <= 0), because it is later used directly as the shift-count vector
// of vrshlq_s32.
int32x4x4_t input_val, int32 quantized_multiplier, int32 shift) {
const int left_shift = std::max(shift, 0);
const int right_shift = std::min(shift, 0);
int32x4x4_t result;
// The vector type support for SaturatingRoundingDoublingHighMul in gemmlowp
// is limited to NEON.
#ifdef GEMMLOWP_NEON
// Vector path of the gemmlowp-based variant: the left shift is performed as a
// lane-wise multiply by (1 << left_shift), then each row goes through
// SaturatingRoundingDoublingHighMul followed by RoundingDivideByPOT.
const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift);
result.val[0] =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
vmulq_s32(input_val.val[0], left_shifted_one_dup),
quantized_multiplier),
right_shift);
result.val[1] =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
vmulq_s32(input_val.val[1], left_shifted_one_dup),
quantized_multiplier),
right_shift);
result.val[2] =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
vmulq_s32(input_val.val[2], left_shifted_one_dup),
quantized_multiplier),
right_shift);
result.val[3] =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
vmulq_s32(input_val.val[3], left_shifted_one_dup),
quantized_multiplier),
right_shift);
#else
// Scalar fallback of the gemmlowp-based variant: each of the four lanes of a
// row is extracted, scaled with the scalar gemmlowp helpers, and the four
// results are reloaded as one vector.
for (int i = 0; i < 4; ++i) {
int32_t vals[4];
vals[0] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 0) * (1 << left_shift),
quantized_multiplier),
right_shift);
vals[1] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 1) * (1 << left_shift),
quantized_multiplier),
right_shift);
vals[2] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 2) * (1 << left_shift),
quantized_multiplier),
right_shift);
vals[3] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 3) * (1 << left_shift),
quantized_multiplier),
right_shift);
// The address of the whole array is reinterpreted as a pointer to its first
// element for the vector load.
result.val[i] = vld1q_s32(reinterpret_cast<int32_t*>(&vals));
}
#endif
// --- intrinsic-based variant: body ---
// Broadcast the multiplier and both shift amounts to all four lanes once, so
// the same vectors can be reused for every row.
int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
// Per row: vshlq_s32 applies the left shift, vqrdmulhq_s32 multiplies by the
// multiplier, and vrshlq_s32 applies the (non-positive) right_shift count.
// NOTE(review): this relies on vrshlq_s32 treating a negative count as a
// rounding right shift; its handling of exact ties may differ from gemmlowp's
// RoundingDivideByPOT for negative inputs -- confirm against the Arm
// intrinsics reference if bit-exactness with the other variant matters.
result.val[0] =
vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup),
multiplier_dup),
right_shift_dup);
result.val[1] =
vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup),
multiplier_dup),
right_shift_dup);
result.val[2] =
vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup),
multiplier_dup),
right_shift_dup);
result.val[3] =
vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup),
multiplier_dup),
right_shift_dup);
return result;
}
#endif