Optimize multiplication by the quantized multiplier.
PiperOrigin-RevId: 312072311 Change-Id: I7d01be9aa8f1a238c6887d4770a1090899337383
This commit is contained in:
parent
fb416f16e2
commit
ff2019a216
@ -201,63 +201,35 @@ MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
|
||||
// MultiplyByQuantizedMultiplier.
|
||||
#ifdef USE_NEON
|
||||
// Rescales all four int32x4 rows of `input_val` by a fixed-point multiplier.
//
// `shift` is split by sign into two stages:
//   * shift > 0: the input is shifted left by `shift` before the multiply;
//   * shift < 0: the product is shifted right by `-shift`, with rounding,
//     after the multiply.
// Per row the computation is
//   vrshlq_s32(vqrdmulhq_s32(vshlq_s32(row, left_shift), multiplier),
//              right_shift)
//
// Args:
//   input_val:            four NEON registers of four int32 lanes each.
//   quantized_multiplier: fixed-point multiplier broadcast to every lane.
//   shift:                signed exponent; positive = left, negative = right.
// Returns:
//   The four rescaled rows, in the same order as `input_val`.
inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
    int32x4x4_t input_val, int32 quantized_multiplier, int32 shift) {
  const int left_shift = std::max(shift, 0);
  // Deliberately kept non-positive: vrshlq_s32 interprets a negative per-lane
  // shift count as a rounding shift to the right, so no negation is needed.
  const int right_shift = std::min(shift, 0);
  int32x4x4_t result;

  // Broadcast the scalars once so the same registers are reused for each row.
  const int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
  const int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
  const int32x4_t right_shift_dup = vdupq_n_s32(right_shift);

  result.val[0] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  result.val[1] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  result.val[2] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  result.val[3] =
      vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup),
                               multiplier_dup),
                 right_shift_dup);

  return result;
}
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user