Merge pull request #42134 from Tessil:toupstream/fix_64_bit_multiply_by_quantized_multiplier_overflow
PiperOrigin-RevId: 346877418 Change-Id: I82190beaa201c24472b604a43edf59f79b59718a
This commit is contained in:
commit
d79e6fa523
tensorflow/lite/kernels/internal
@ -385,6 +385,10 @@ cc_library(
|
||||
hdrs = ["quantization_util.h"],
|
||||
compatible_with = get_compatible_with_portable(),
|
||||
copts = tflite_copts() + micro_copts(),
|
||||
linkopts = select({
|
||||
"//tensorflow:windows": [],
|
||||
"//conditions:default": ["-lm"],
|
||||
}),
|
||||
deps = [
|
||||
":compatibility",
|
||||
":cppmath",
|
||||
|
@ -178,8 +178,12 @@ inline int32_t MultiplyByQuantizedMultiplier(int64_t x,
|
||||
// - input x is in the range -(1<<47) <= x < (1<<47)
|
||||
assert(quantized_multiplier >= 0);
|
||||
assert(shift >= -31 && shift < 8);
|
||||
assert(x >= -(static_cast<int64_t>(1) << 47) &&
|
||||
x < (static_cast<int64_t>(1) << 47));
|
||||
|
||||
int32_t reduced_multiplier = (quantized_multiplier + (1 << 15)) >> 16;
|
||||
int32_t reduced_multiplier = (quantized_multiplier < 0x7FFF0000)
|
||||
? ((quantized_multiplier + (1 << 15)) >> 16)
|
||||
: 0x7FFF;
|
||||
int total_shift = 15 - shift;
|
||||
x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1));
|
||||
int32_t result = x >> total_shift;
|
||||
|
@ -422,6 +422,72 @@ TEST(QuantizationUtilTest, GetInvSqrtQuantizedMultiplierExp) {
|
||||
EXPECT_THAT(inv_sqrt(kInt32Max), Pair(189812531, 12));
|
||||
}
|
||||
|
||||
// Checks MultiplyByQuantizedMultiplier on int32_t inputs, first through a
// double multiplier quantized by QuantizeMultiplier, then with an explicit
// (quantized_multiplier, shift) pair at the int32_t maximum.
TEST(QuantizationUtilTest, MultiplyByQuantizedMultiplierInt32) {
|
||||
// Helper: quantizes `multiplier` into (quantized_multiplier, shift) and
// returns x scaled by that quantized pair.
auto quant_and_multiply = [](int32_t x, double multiplier) {
|
||||
int32_t quantized_multiplier;
|
||||
int shift;
|
||||
QuantizeMultiplier(multiplier, &quantized_multiplier, &shift);
|
||||
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
|
||||
};
|
||||
|
||||
// A zero input or a zero multiplier must give 0.
EXPECT_EQ(quant_and_multiply(0, 0.1), 0);
|
||||
EXPECT_EQ(quant_and_multiply(1, 0), 0);
|
||||
// 0.00097656 is roughly 1/1024, so |10000 * multiplier| rounds to 10; all
// four sign combinations of input and multiplier are covered.
EXPECT_EQ(quant_and_multiply(10000, 0.00097656), 10);
|
||||
EXPECT_EQ(quant_and_multiply(10000, -0.00097656), -10);
|
||||
EXPECT_EQ(quant_and_multiply(-10000, 0.00097656), -10);
|
||||
EXPECT_EQ(quant_and_multiply(-10000, -0.00097656), 10);
|
||||
// Extremes of int32_t scaled by +/-0.00001.
EXPECT_EQ(quant_and_multiply(std::numeric_limits<int32_t>::min(), 0.00001),
|
||||
-21475);
|
||||
EXPECT_EQ(quant_and_multiply(std::numeric_limits<int32_t>::min(), -0.00001),
|
||||
21475);
|
||||
EXPECT_EQ(quant_and_multiply(std::numeric_limits<int32_t>::max(), 0.00001),
|
||||
21475);
|
||||
EXPECT_EQ(quant_and_multiply(std::numeric_limits<int32_t>::max(), -0.00001),
|
||||
-21475);
|
||||
|
||||
// Test with maximum possible x and quantized_multiplier
|
||||
const int32_t x = std::numeric_limits<int32_t>::max();
|
||||
const int32_t quantized_multiplier = std::numeric_limits<int32_t>::max();
|
||||
const int shift = -3;
|
||||
// Reference value: x * quantized_multiplier is formed in int64_t, divided by
// 2^(31 - shift) as a double, then passed through TfLiteRound.
const int32_t expected = static_cast<int32_t>(
|
||||
TfLiteRound(static_cast<int64_t>(x) * quantized_multiplier /
|
||||
static_cast<double>(1ll << (31 - shift))));
|
||||
// The same magnitude is expected for x and -x, with the sign flipped.
EXPECT_EQ(MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift),
|
||||
expected);
|
||||
EXPECT_EQ(MultiplyByQuantizedMultiplier(-x, quantized_multiplier, shift),
|
||||
-expected);
|
||||
}
|
||||
|
||||
// Checks MultiplyByQuantizedMultiplier on int64_t inputs, first through a
// double multiplier quantized by QuantizeMultiplier, then with an explicit
// (quantized_multiplier, shift) pair at the largest x and multiplier used here.
TEST(QuantizationUtilTest, MultiplyByQuantizedMultiplierInt64) {
|
||||
// Helper: quantizes `multiplier` into (quantized_multiplier, shift) and
// returns the 64-bit x scaled by that quantized pair.
auto quant_and_multiply = [](int64_t x, double multiplier) {
|
||||
int32_t quantized_multiplier;
|
||||
int shift;
|
||||
QuantizeMultiplier(multiplier, &quantized_multiplier, &shift);
|
||||
return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
|
||||
};
|
||||
|
||||
// Negative multipliers are not supported by the 64-bit
|
||||
// MultiplyByQuantizedMultiplier, only use >= 0 multipliers.
|
||||
EXPECT_EQ(quant_and_multiply(0, 0.1), 0);
|
||||
EXPECT_EQ(quant_and_multiply(1, 0), 0);
|
||||
EXPECT_EQ(quant_and_multiply(10000, 0.00097656), 10);
|
||||
EXPECT_EQ(quant_and_multiply(-10000, 0.00097656), -10);
|
||||
// Boundary inputs -(2^47) and 2^47 - 1 scaled by 0.00001.
EXPECT_EQ(quant_and_multiply(-(1ll << 47), 0.00001), -1407385600);
|
||||
EXPECT_EQ(quant_and_multiply((1ll << 47) - 1, 0.00001), 1407385600);
|
||||
|
||||
// Test with maximum possible x and quantized_multiplier
|
||||
const int64_t x = (1ll << 47) - 1;
|
||||
const int32_t quantized_multiplier = std::numeric_limits<int32_t>::max();
|
||||
const int shift = -31;
|
||||
// Expected is around 'x * quantized_multiplier / 2**(31 - shift)' ~= 65536
|
||||
// As there is some rounding error, expected is a bit smaller.
|
||||
const int32_t expected = 65534;
|
||||
// The same magnitude is expected for x and -x, with the sign flipped.
EXPECT_EQ(MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift),
|
||||
expected);
|
||||
EXPECT_EQ(MultiplyByQuantizedMultiplier(-x, quantized_multiplier, shift),
|
||||
-expected);
|
||||
}
|
||||
|
||||
TEST(QuantizationUtilTest, PreprocessSoftmaxScaling) {
|
||||
auto quantize = [](double beta, double scale, int integer_bits) {
|
||||
int32_t q;
|
||||
|
Loading…
Reference in New Issue
Block a user