Fix AVX2 build

The vector type support for SaturatingRoundingDoublingHighMul in gemmlowp is limited to NEON only.

PiperOrigin-RevId: 272334570
This commit is contained in:
Jaesung Chung 2019-10-01 17:43:23 -07:00 committed by TensorFlower Gardener
parent 0240b7c101
commit 496cc4a74c
2 changed files with 140 additions and 2 deletions
tensorflow/lite/kernels/internal/optimized

View File

@ -105,8 +105,11 @@ inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
using gemmlowp::SaturatingRoundingDoublingHighMul;
const int left_shift = shift > 0 ? shift : 0;
const int right_shift = shift > 0 ? 0 : -shift;
const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift);
int32x4x4_t result;
// The vector type support for SaturatingRoundingDoublingHighMul in gemmlowp
// is limited to NEON.
#ifdef GEMMLOWP_NEON
const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift);
result.val[0] =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
vmulq_s32(input_val.val[0], left_shifted_one_dup),
@ -127,6 +130,33 @@ inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
vmulq_s32(input_val.val[3], left_shifted_one_dup),
quantized_multiplier),
right_shift);
#else
for (int i = 0; i < 4; ++i) {
int32_t vals[4];
vals[0] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 0) * (1 << left_shift),
quantized_multiplier),
right_shift);
vals[1] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 1) * (1 << left_shift),
quantized_multiplier),
right_shift);
vals[2] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 2) * (1 << left_shift),
quantized_multiplier),
right_shift);
vals[3] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 3) * (1 << left_shift),
quantized_multiplier),
right_shift);
result.val[i] = vld1q_s32(reinterpret_cast<int32_t*>(&vals));
}
#endif
return result;
}
@ -136,8 +166,11 @@ inline int32x4x2_t MultiplyByQuantizedMultiplier2Rows(
using gemmlowp::SaturatingRoundingDoublingHighMul;
const int left_shift = shift > 0 ? shift : 0;
const int right_shift = shift > 0 ? 0 : -shift;
const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift);
int32x4x2_t result;
// The vector type support for SaturatingRoundingDoublingHighMul in gemmlowp
// is limited to NEON.
#ifdef GEMMLOWP_NEON
const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift);
result.val[0] =
RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
vmulq_s32(input_val.val[0], left_shifted_one_dup),
@ -148,6 +181,33 @@ inline int32x4x2_t MultiplyByQuantizedMultiplier2Rows(
vmulq_s32(input_val.val[1], left_shifted_one_dup),
quantized_multiplier),
right_shift);
#else
for (int i = 0; i < 2; ++i) {
int32_t vals[4];
vals[0] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 0) * (1 << left_shift),
quantized_multiplier),
right_shift);
vals[1] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 1) * (1 << left_shift),
quantized_multiplier),
right_shift);
vals[2] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 2) * (1 << left_shift),
quantized_multiplier),
right_shift);
vals[3] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val.val[i], 3) * (1 << left_shift),
quantized_multiplier),
right_shift);
result.val[i] = vld1q_s32(reinterpret_cast<int32_t*>(&vals));
}
#endif
return result;
}

View File

@ -5147,6 +5147,10 @@ inline void MultiplyByQuantizedMultiplier4Rows(
int32x4_t* result_val_3, int32x4_t* result_val_4) {
using gemmlowp::RoundingDivideByPOT;
using gemmlowp::SaturatingRoundingDoublingHighMul;
// The vector type support for SaturatingRoundingDoublingHighMul in gemmlowp
// is limited to NEON.
#ifdef GEMMLOWP_NEON
int32x4_t left_shifted_one_dup = vdupq_n_s32(left_shifted_one);
*result_val_1 = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
@ -5164,6 +5168,80 @@ inline void MultiplyByQuantizedMultiplier4Rows(
SaturatingRoundingDoublingHighMul(
vmulq_s32(input_val_4, left_shifted_one_dup), multiplier),
right_shift);
#else
int32_t vals[4];
vals[0] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_1, 0) * left_shifted_one, multiplier),
right_shift);
vals[1] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_1, 1) * left_shifted_one, multiplier),
right_shift);
vals[2] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_1, 2) * left_shifted_one, multiplier),
right_shift);
vals[3] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_1, 3) * left_shifted_one, multiplier),
right_shift);
*result_val_1 = vld1q_s32(reinterpret_cast<int32_t*>(&vals));
vals[0] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_2, 0) * left_shifted_one, multiplier),
right_shift);
vals[1] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_2, 1) * left_shifted_one, multiplier),
right_shift);
vals[2] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_2, 2) * left_shifted_one, multiplier),
right_shift);
vals[3] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_2, 3) * left_shifted_one, multiplier),
right_shift);
*result_val_2 = vld1q_s32(reinterpret_cast<int32_t*>(&vals));
vals[0] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_3, 0) * left_shifted_one, multiplier),
right_shift);
vals[1] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_3, 1) * left_shifted_one, multiplier),
right_shift);
vals[2] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_3, 2) * left_shifted_one, multiplier),
right_shift);
vals[3] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_3, 3) * left_shifted_one, multiplier),
right_shift);
*result_val_3 = vld1q_s32(reinterpret_cast<int32_t*>(&vals));
vals[0] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_4, 0) * left_shifted_one, multiplier),
right_shift);
vals[1] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_4, 1) * left_shifted_one, multiplier),
right_shift);
vals[2] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_4, 2) * left_shifted_one, multiplier),
right_shift);
vals[3] = RoundingDivideByPOT(
SaturatingRoundingDoublingHighMul(
vgetq_lane_s32(input_val_4, 3) * left_shifted_one, multiplier),
right_shift);
*result_val_4 = vld1q_s32(reinterpret_cast<int32_t*>(&vals));
#endif
}
#endif