Depending on the libstdc++ version, cmath functions beyond round() may be missing from the std:: namespace. This CL adds an indirection layer, gated by a new TF_MICRO_USE_GLOBAL_CMATH_FUNCTIONS flag, that selects between the std:: and global versions of these functions, and converts TfLiteRound() to use it. Similar indirections for other functions will follow in future CLs.

PiperOrigin-RevId: 303425632
Change-Id: Idfe14924329827f71742ab6aedfd6ead9bcb0dbb
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/kernels/internal/quantization_util.h"

#include <algorithm>
#include <cmath>
#include <limits>

#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
namespace tflite {

namespace {
// These constants are used to manipulate the binary representation of doubles.
// Double-precision binary64 floating point format is:
// Bit |  63  |  62-52   |   51-0   |
//     | Sign | Exponent | Fraction |
// To avoid 64-bit integers as much as possible, I break this into high and
// low 32-bit chunks. High is:
// Bit |  31  |  30-20   |      19-0     |
//     | Sign | Exponent | High Fraction |
// Low is:
// Bit |     31-0     |
//     | Low Fraction |
// We then access the components through logical bit-wise operations to
// extract the parts needed, with the positions and masks derived from the
// layout shown above.
constexpr uint64_t kSignMask = 0x8000000000000000LL;
constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
constexpr int32_t kExponentShift = 52;
constexpr int32_t kExponentBias = 1023;
constexpr uint32_t kExponentIsBadNum = 0x7ff;
constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
constexpr uint32_t kFractionShift = 22;
constexpr uint32_t kFractionRoundingMask = 0x003fffff;
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
}  // namespace
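// Worked example (illustrative): the double 1.0 is encoded as
// 0x3ff0000000000000. Applying the masks above gives a sign bit of 0, a
// biased exponent of 0x3ff (1023, i.e. an unbiased exponent of 0), and a
// zero stored fraction; the leading 1 bit of the significand is implicit in
// the binary64 format.
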
void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
                        int* shift) {
  if (double_multiplier == 0.) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
#ifdef TFLITE_EMULATE_FLOAT
  // If we're trying to avoid the use of floating-point instructions (for
  // example on microcontrollers) then use an alternative implementation
  // that only requires integer and bitwise operations. To enable this, you
  // need to set the define during the build process for your platform.
  int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
#else   // TFLITE_EMULATE_FLOAT
  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
#endif  // TFLITE_EMULATE_FLOAT
  TFLITE_CHECK(q_fixed <= (1ll << 31));
  if (q_fixed == (1ll << 31)) {
    q_fixed /= 2;
    ++*shift;
  }
  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
  // A shift amount smaller than -31 would cause all bits to be shifted out
  // and thus all results would be zero. We implement that instead with
  // q_fixed==0, so as to avoid hitting issues with right-shift
  // operations with shift amounts greater than 31. Note that this happens
  // roughly when abs(double_multiplier) < 2^-31 and the present handling means
  // that we're effectively flushing tiny double_multiplier's to zero.
  // We could conceivably handle values in the range (roughly) [32, 63]
  // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
  // the present handling is just doing 'flush denormals to zero'. We could
  // reconsider and actually generate nonzero denormals if a need arises.
  if (*shift < -31) {
    *shift = 0;
    q_fixed = 0;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
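// Worked example (illustrative): quantizing 0.5. std::frexp decomposes it as
// 0.5 * 2^0, so q_fixed == TfLiteRound(0.5 * 2^31) == 1 << 30 and the shift
// stays 0; the encoded value is (1 << 30) * 2^(0 - 31) == 0.5. Hypothetical
// caller sketch:
//   int32_t quantized_multiplier;
//   int shift;
//   QuantizeMultiplier(0.5, &quantized_multiplier, &shift);
//   // quantized_multiplier == 1 << 30, shift == 0.
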
void QuantizeMultiplierGreaterThanOne(double double_multiplier,
                                      int32_t* quantized_multiplier,
                                      int* left_shift) {
  TFLITE_CHECK_GT(double_multiplier, 1.);
  QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
  TFLITE_CHECK_GE(*left_shift, 0);
}
void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
                                         int32_t* quantized_multiplier,
                                         int* left_shift) {
  TFLITE_CHECK_LT(double_multiplier, 1.);
  TFLITE_CHECK_GT(double_multiplier, 0.);
  int shift;
  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
  TFLITE_CHECK_LE(shift, 0);
  *left_shift = shift;
}
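// Worked example (illustrative): a multiplier of 0.25 decomposes as
// 0.5 * 2^-1, so QuantizeMultiplierSmallerThanOneExp yields
// *quantized_multiplier == 1 << 30 and *left_shift == -1.
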
int64_t IntegerFrExp(double input, int* shift) {
  // Make sure our assumptions about the double layout hold.
  TFLITE_CHECK_EQ(8, sizeof(double));

  // We want to access the bits of the input double value directly, which is
  // tricky to do safely, so use a union to handle the casting.
  union {
    double double_value;
    uint64_t double_as_uint;
  } cast_union;
  cast_union.double_value = input;
  const uint64_t u = cast_union.double_as_uint;

  // If the bitfield is all zeros apart from the sign bit, this is a normalized
  // zero value, so return standard values for this special case.
  if ((u & ~kSignMask) == 0) {
    *shift = 0;
    return 0;
  }

  // Deal with NaNs and Infs, which are always indicated with a fixed pattern
  // in the exponent, and distinguished by whether the fractions are zero or
  // non-zero.
  const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
  if (exponent_part == kExponentIsBadNum) {
    *shift = std::numeric_limits<int>::max();
    if (u & kFractionMask) {
      // NaN, so just return zero (with the exponent set to INT_MAX).
      return 0;
    } else {
      // Infinity, so return +/- INT_MAX.
      if (u & kSignMask) {
        return std::numeric_limits<int64_t>::min();
      } else {
        return std::numeric_limits<int64_t>::max();
      }
    }
  }

  // The shift is fairly easy to extract from the high bits of the double
  // value, just by masking it out and applying a bias. The std::frexp()
  // implementation always returns values between 0.5 and 1.0 though, whereas
  // the exponent assumes 1.0 to 2.0 is the standard range, so I add on one to
  // match that interface.
  *shift = (exponent_part - kExponentBias) + 1;

  // There's an implicit high bit in the double format definition, so make sure
  // we include that at the top, and then reconstruct the rest of the
  // fractional value from the remaining fragments.
  int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);

  // We're cutting off some bits at the bottom, so to exactly match the
  // standard frexp implementation here we'll apply rounding by adding one to
  // the least significant bit of the result if the discarded portion is over
  // half of the maximum.
  if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
    fraction += 1;
  }
  // Negate the fraction if the sign bit was set.
  if (u & kSignMask) {
    fraction *= -1;
  }

  return fraction;
}
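// Worked example (illustrative): for input == 1.0 the bits are
// 0x3ff0000000000000, so exponent_part == 0x3ff == 1023 and *shift ==
// (1023 - 1023) + 1 == 1; the returned fraction is 0x40000000 (0.5 scaled by
// 2^31), matching std::frexp's decomposition 1.0 == 0.5 * 2^1.
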
double DoubleFromFractionAndShift(int64_t fraction, int shift) {
  union {
    double double_value;
    uint64_t double_as_uint;
  } result;

  // Detect NaNs and infinities.
  if (shift == std::numeric_limits<int>::max()) {
    if (fraction == 0) {
      return std::numeric_limits<double>::quiet_NaN();
    } else if (fraction > 0) {
      return std::numeric_limits<double>::infinity();
    } else {
      return -std::numeric_limits<double>::infinity();
    }
  }

  // Return a normalized zero for a zero fraction.
  if (fraction == 0) {
    result.double_as_uint = 0;
    return result.double_value;
  }

  bool is_negative = (fraction < 0);
  int64_t encoded_fraction = is_negative ? -fraction : fraction;
  int64_t encoded_shift = (shift - 1);
  while (encoded_fraction < 0x40000000) {
    encoded_fraction *= 2;
    encoded_shift -= 1;
  }
  while (encoded_fraction > 0x80000000) {
    encoded_fraction /= 2;
    encoded_shift += 1;
  }
  encoded_fraction -= 0x40000000;
  if (encoded_shift < -1022) {
    encoded_shift = -1023;
  } else if (encoded_shift > 1022) {
    encoded_shift = 1023;
  }
  encoded_shift += kExponentBias;
  uint64_t encoded_sign = is_negative ? kSignMask : 0;
  result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
                          (encoded_fraction << kFractionShift);
  return result.double_value;
}
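// Worked example (illustrative): DoubleFromFractionAndShift(1 << 30, 1)
// finds the fraction already normalized, computes encoded_shift ==
// (1 - 1) + kExponentBias == 1023, and reassembles the bits
// 0x3ff0000000000000, i.e. 1.0 -- the inverse of the IntegerFrExp example
// above.
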
double IntegerDoubleMultiply(double a, double b) {
  int a_shift;
  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
  int b_shift;
  const int64_t b_fraction = IntegerFrExp(b, &b_shift);
  // Detect NaNs and infinities.
  if (a_shift == std::numeric_limits<int>::max() ||
      (b_shift == std::numeric_limits<int>::max())) {
    return std::numeric_limits<double>::quiet_NaN();
  }
  const int result_shift = a_shift + b_shift + 1;
  const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
  return DoubleFromFractionAndShift(result_fraction, result_shift);
}
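// A note on the arithmetic above (illustrative derivation): each operand is
// fraction * 2^(shift - 31), so the exact product is
// (a_fraction * b_fraction) * 2^(a_shift + b_shift - 62). Shifting the
// product right by 32 while adding 1 to the combined shift preserves the
// value: (p >> 32) * 2^(a_shift + b_shift + 1 - 31) ==
// p * 2^(a_shift + b_shift - 62).
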
int IntegerDoubleCompare(double a, double b) {
  int a_shift;
  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
  int b_shift;
  const int64_t b_fraction = IntegerFrExp(b, &b_shift);

  // Detect NaNs and infinities.
  if (a_shift == std::numeric_limits<int>::max() ||
      (b_shift == std::numeric_limits<int>::max())) {
    return 1;
  }

  if ((a_fraction == 0) && (b_fraction < 0)) {
    return 1;
  } else if ((a_fraction < 0) && (b_fraction == 0)) {
    return -1;
  } else if (a_shift < b_shift) {
    return -1;
  } else if (a_shift > b_shift) {
    return 1;
  } else if (a_fraction < b_fraction) {
    return -1;
  } else if (a_fraction > b_fraction) {
    return 1;
  } else {
    return 0;
  }
}
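// Worked example (illustrative): comparing a == 0.5 (fraction 1 << 30,
// shift 0) against b == 2.0 (fraction 1 << 30, shift 2): both fractions are
// positive and a_shift < b_shift, so the function returns -1, i.e. a < b.
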
void PreprocessSoftmaxScaling(double beta, double input_scale,
                              int input_integer_bits,
                              int32_t* quantized_multiplier, int* left_shift) {
  // If the overall multiplier (input and beta) is large, then exp() of an
  // input difference of 1 scaled by this will be large. In other words, we
  // can cap the multiplier and know that, when it is used, the output will be
  // (round to) zero wherever the input is not at the maximum value.

  // If the overall scale is less than one, and input_integer_bits=0, then the
  // result is double equivalent of Q0.31 (actually with more precision). Thus
  // this generates a Q(input_integer_bits).(31-input_integer_bits)
  // representation.
#ifdef TFLITE_EMULATE_FLOAT
  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
  int shift;
  int64_t fraction = IntegerFrExp(input_beta, &shift);
  shift += (31 - input_integer_bits);
  double input_beta_real_multiplier =
      DoubleFromFractionAndShift(fraction, shift);
  if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
    input_beta_real_multiplier = (1ll << 31) - 1.0;
  }
#else   // TFLITE_EMULATE_FLOAT
  const double input_beta_real_multiplier = std::min(
      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
#endif  // TFLITE_EMULATE_FLOAT

  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
                                   quantized_multiplier, left_shift);
}
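// Worked example (illustrative, with hypothetical parameters): for beta ==
// 1.0, input_scale == 0.5, and input_integer_bits == 5, the real multiplier
// is 0.5 * 2^(31 - 5) == 2^25, well under the (1 << 31) - 1 cap, and
// QuantizeMultiplierGreaterThanOne encodes it as quantized_multiplier ==
// 1 << 30 with left_shift == 26.
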
void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
                                    int input_integer_bits,
                                    int32_t* quantized_multiplier,
                                    int* left_shift,
                                    int32_t* reverse_scaling_divisor,
                                    int* reverse_scaling_left_shift) {
  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
                           quantized_multiplier, left_shift);

  // Also calculate what amounts to the inverse scaling factor for the input.
  const double real_reverse_scaling_divisor =
      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
                                              reverse_scaling_divisor,
                                              reverse_scaling_left_shift);
}
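// Continuing the illustrative softmax example above (quantized_multiplier ==
// 1 << 30, left_shift == 26): real_reverse_scaling_divisor ==
// (1 << (31 - 26)) / 2^30 == 2^-25, which is exactly 1 / (the real
// multiplier 2^25) and is then re-quantized by
// QuantizeMultiplierSmallerThanOneExp.
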
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
                         int total_signed_bits) {
#ifdef TFLITE_EMULATE_FLOAT
  int64_t result = (1 << input_integer_bits) - 1;
  result <<= (total_signed_bits - input_integer_bits);
  result >>= input_left_shift;
  return result;
#else   // TFLITE_EMULATE_FLOAT
  const double max_input_rescaled =
      1.0 * ((1 << input_integer_bits) - 1) *
      (1ll << (total_signed_bits - input_integer_bits)) /
      (1ll << input_left_shift);
  // Tighten bound using floor. Suppose that we could use the exact value.
  // After scaling the difference, the result would be at the maximum. Thus we
  // must ensure that our value has lower magnitude.
  return static_cast<int>(std::floor(max_input_rescaled));
#endif  // TFLITE_EMULATE_FLOAT
}
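// Worked example (illustrative): CalculateInputRadius(4, 20, 31) computes
// ((1 << 4) - 1) * 2^(31 - 4) / 2^20 == 15 * 2^7, so the returned radius is
// 1920; the emulated and floating-point paths agree here because the value
// is an exact integer.
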
void NudgeQuantizationRange(const float min, const float max,
                            const int quant_min, const int quant_max,
                            float* nudged_min, float* nudged_max,
                            float* nudged_scale) {
  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  const float quant_min_float = static_cast<float>(quant_min);
  const float quant_max_float = static_cast<float>(quant_max);
  *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
  const float zero_point_from_min = quant_min_float - min / *nudged_scale;
  uint16 nudged_zero_point;
  if (zero_point_from_min < quant_min_float) {
    nudged_zero_point = static_cast<uint16>(quant_min);
  } else if (zero_point_from_min > quant_max_float) {
    nudged_zero_point = static_cast<uint16>(quant_max);
  } else {
    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
  }
  *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
  *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
}
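// Worked example (illustrative): for min == -25.5f, max == 229.5f,
// quant_min == 0, quant_max == 255, the scale is exactly 1.0f and
// zero_point_from_min == 25.5f, which TfLiteRound rounds (half away from
// zero) to a nudged zero point of 26. The nudged range becomes
// [-26.0f, 229.0f], so the float 0.0 lands exactly on quantized value 26.
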
void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
                       const float nudged_max, const float* input_data,
                       float* output_data, const float size) {
  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
  const float inv_nudged_scale = 1.0f / nudged_scale;

  for (int i = 0; i < size; i++) {
    const float src_val = input_data[i];
    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
    const float clamped_shifted = clamped - nudged_min;
    const float dst_val =
        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
        nudged_min;
    output_data[i] = dst_val;
  }
}
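// Worked example (illustrative, with hypothetical nudged parameters): given
// nudged_scale == 0.25f, nudged_min == -1.f, nudged_max == 1.f, an input of
// 0.3f is shifted to 1.3f, scaled to 5.2f, rounded to 5.f, and mapped back
// to 5 * 0.25f - 1.f == 0.25f -- the nearest quantization step.
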
bool CheckedLog2(const float x, int* log2_result) {
  // Using TfLiteRound instead of std::round and std::log instead of
  // std::log2 to work around these functions being missing in a toolchain
  // used in some TensorFlow tests as of May 2018.
  const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
  const float x_log2_rounded = TfLiteRound(x_log2);
  const float x_log2_fracpart = x_log2 - x_log2_rounded;

  *log2_result = static_cast<int>(x_log2_rounded);
  return std::abs(x_log2_fracpart) < 1e-3f;
}
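// Worked example (illustrative): CheckedLog2(8.0f, &result) sets result to 3
// and returns true; CheckedLog2(10.0f, &result) returns false because
// log2(10) ~= 3.32 is not within 1e-3 of an integer.
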
void QuantizeMultiplierArray(const double* effective_scales, size_t size,
                             int32_t* effective_scale_significand,
                             int* effective_shift) {
  for (size_t i = 0; i < size; ++i) {
    QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
                       &effective_shift[i]);
  }
}
}  // namespace tflite