Refactor some quantization methods

Refactored the bias (int32) and int16 quantization methods to factor the core quantization logic out into separate functions (sketched below). These functions will be used in the MLIR quantizer's legacy mode.

PiperOrigin-RevId: 352176296
Change-Id: I54c975ad3ba348f2b2cf77290772aff345865b63
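
For reference, here is a minimal, self-contained sketch of the int16 logic this change factors out. It mirrors the new standalone SymmetricQuantizeFloatsToInt16 overload in the diff below (raw float data, element count, one scale), but substitutes std::round and an explicit clamp for TfLiteRound and the TFLite buffer plumbing, so it illustrates the math rather than reproducing the library code.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Sketch of the factored-out helper: quantize raw floats to int16 with a
// single symmetric scale. std::round stands in for TfLiteRound here.
std::vector<int16_t> SymmetricQuantizeFloatsToInt16(const float* data,
                                                    uint64_t num_elements,
                                                    float scaling_factor) {
  // Compute the inverse of scale, guarding against a zero scale.
  const float scaling_factor_inv =
      (scaling_factor == 0) ? 0 : 1.0f / scaling_factor;
  const int32_t kScale = std::numeric_limits<int16_t>::max();
  std::vector<int16_t> buffer(num_elements);
  for (uint64_t i = 0; i < num_elements; ++i) {
    const int32_t quantized_value =
        static_cast<int32_t>(std::round(data[i] * scaling_factor_inv));
    // Clamp symmetrically to [-32767, 32767].
    buffer[i] = static_cast<int16_t>(
        std::min(kScale, std::max(-kScale, quantized_value)));
  }
  return buffer;
}

int main() {
  const std::vector<float> values = {-0.5f, 0.0f, 0.25f, 0.5f};
  // A power-of-two scale keeps the example exact; +/-0.5 saturates to the
  // int16 extremes.
  const float scale = 1.0f / 65536.0f;
  for (int16_t q : SymmetricQuantizeFloatsToInt16(values.data(),
                                                  values.size(), scale)) {
    std::cout << q << " ";  // Prints: -32767 0 16384 32767
  }
  std::cout << "\n";
  return 0;
}

Working on raw pointers and returning a plain std::vector is what makes the helper reusable outside the ModelT/TensorT path, e.g. by the MLIR quantizer's legacy mode mentioned above.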
Taehee Jeong 2021-01-16 07:03:51 -08:00 committed by TensorFlower Gardener
parent 85bf96f508
commit a7c1a23ebb
3 changed files with 57 additions and 40 deletions


@@ -147,7 +147,6 @@ cc_library(
"//tensorflow/lite/schema:schema_fbs",
"//third_party/eigen3",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
],
)


@@ -16,6 +16,7 @@ limitations under the License.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <memory>
#include <string>
@@ -329,27 +330,33 @@ TfLiteStatus SymmetricPerChannelQuantization(TensorT* tensor,
return kTfLiteOk;
}
TfLiteStatus SymmetricQuantizeFloatsToInt16(ModelT* model, TensorT* tensor,
float scaling_factor,
ErrorReporter* error_reporter) {
std::vector<int16_t> SymmetricQuantizeFloatsToInt16(const float* data,
uint64_t num_elements,
float scaling_factor) {
// Compute the inverse of scale.
const float scaling_factor_inv =
(scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
std::vector<int16_t> buffer(num_elements);
const int32_t kScale = std::numeric_limits<int16_t>::max();
for (size_t i = 0; i < num_elements; i++) {
const int32_t quantized_value =
static_cast<int32_t>(TfLiteRound(data[i] * scaling_factor_inv));
buffer[i] = std::min(kScale, std::max(-kScale, quantized_value));
}
return buffer;
}
TfLiteStatus SymmetricQuantizeFloatsToInt16(ModelT* model, TensorT* tensor,
float scaling_factor,
ErrorReporter* error_reporter) {
const BufferT* buffer = model->buffers[tensor->buffer].get();
const float* float_data = reinterpret_cast<const float*>(buffer->data.data());
uint64_t num_elements;
TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));
std::vector<int16_t> final_buffer(num_elements);
const int32_t kScale = std::numeric_limits<int16_t>::max();
for (size_t i = 0; i < num_elements; i++) {
const int32_t quantized_value =
static_cast<int32_t>(TfLiteRound(float_data[i] * scaling_factor_inv));
final_buffer[i] = std::min(kScale, std::max(-kScale, quantized_value));
}
auto final_buffer =
SymmetricQuantizeFloatsToInt16(float_data, num_elements, scaling_factor);
// Set the buffers and output type.
uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
size_t buffer_size = num_elements * sizeof(int16_t);
@@ -589,27 +596,39 @@ TfLiteStatus SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor,
model, tensor, error_reporter);
}
template <class BiasType>
std::vector<BiasType> SymmetricBiasQuantize(const float* data,
uint64_t num_elements,
const std::vector<float>& scales) {
std::vector<BiasType> buffer(num_elements);
const BiasType kScale = std::numeric_limits<BiasType>::max();
float scaling_factor_inv_per_layer = (scales[0] == 0) ? 0 : 1.0 / scales[0];
for (int32_t idx = 0; idx < num_elements; idx++) {
float scaling_factor_inv =
scales.size() == 1 ? scaling_factor_inv_per_layer
: ((scales[idx] == 0) ? 0 : 1.0 / scales[idx]);
const BiasType quantized_value =
tflite::SafeCast<BiasType>(TfLiteRound(data[idx] * scaling_factor_inv));
buffer[idx] = std::min(kScale, std::max(-kScale, quantized_value));
}
return buffer;
}
template std::vector<std::int32_t> SymmetricBiasQuantize<std::int32_t>(
const float* data, uint64_t num_elements, const std::vector<float>& scales);
template <class BiasType>
TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor,
float scaling_factor,
ErrorReporter* error_reporter) {
// Compute the inverse of scale.
const float scaling_factor_inv =
(scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
const BufferT* buffer = model->buffers[tensor->buffer].get();
const float* float_data = reinterpret_cast<const float*>(buffer->data.data());
uint64_t num_elements;
TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));
std::vector<BiasType> final_buffer(num_elements);
const BiasType kScale = std::numeric_limits<BiasType>::max();
for (size_t i = 0; i < num_elements; i++) {
const BiasType quantized_value = tflite::SafeCast<BiasType>(
TfLiteRound(float_data[i] * scaling_factor_inv));
final_buffer[i] = std::min(kScale, std::max(-kScale, quantized_value));
}
auto final_buffer = SymmetricBiasQuantize<BiasType>(float_data, num_elements,
{scaling_factor});
// Set the buffers and output type.
uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
@@ -650,18 +669,8 @@ TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor,
uint64_t num_elements;
TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));
std::vector<BiasType> final_buffer(num_elements);
const BiasType kScale = std::numeric_limits<BiasType>::max();
for (int32_t channel_idx = 0; channel_idx < number_of_dimension;
channel_idx++) {
float scaling_factor = scales[channel_idx];
float scaling_factor_inv = (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
const BiasType quantized_value = tflite::SafeCast<BiasType>(
TfLiteRound(float_data[channel_idx] * scaling_factor_inv));
final_buffer[channel_idx] =
std::min(kScale, std::max(-kScale, quantized_value));
}
auto final_buffer =
SymmetricBiasQuantize<BiasType>(float_data, num_elements, scales);
// Set the buffers and output type.
uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());


@@ -102,17 +102,21 @@ TfLiteStatus AdjustWeightsForBiasScale(QuantizationParametersT* quant_params,
const float input_scale,
ErrorReporter* error_reporter);
// Quantize tensor with per channel.
// Quantizes tensor with per channel.
TfLiteStatus SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor,
int32_t channel_dim_index,
ErrorReporter* error_reporter);
// Symmetrically quantized float to 16bits.
// Symmetrically quantizes float to 16bits.
TfLiteStatus SymmetricQuantizeFloatsToInt16(ModelT* model, TensorT* tensor,
float scaling_factor,
ErrorReporter* error_reporter);
// Symmetrically quantized the bias for per-layer ops (i.e. FullyConnected).
std::vector<int16_t> SymmetricQuantizeFloatsToInt16(const float* data,
uint64_t num_elements,
float scaling_factor);
// Symmetrically quantizes the bias for per-layer ops (i.e. FullyConnected).
template <typename BiasType>
TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor,
float scaling_factor,
@@ -127,6 +131,11 @@ TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor,
int number_of_dimension,
ErrorReporter* error_reporter);
template <typename BiasType>
std::vector<BiasType> SymmetricBiasQuantize(const float* data,
uint64_t num_elements,
const std::vector<float>& scales);
// Quantize weight with or without per channel.
TfLiteStatus QuantizeWeight(ModelT* model, TensorT* tensor, bool per_channel,
int per_axis_index, ErrorReporter* error_reporter);
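
To round out the declarations above, here is a matching self-contained sketch of SymmetricBiasQuantize: a single scale covers the per-layer case, while one scale per element covers the per-channel case, as the scales.size() == 1 check in the diff distinguishes them. std::round and a plain clamp stand in for TfLiteRound and tflite::SafeCast, so this is an illustration, not the TFLite implementation.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

// Sketch of the factored-out bias helper: symmetric quantization of raw
// floats with either a single per-layer scale or one scale per element.
// std::round and the clamp stand in for TfLiteRound and tflite::SafeCast.
template <class BiasType>
std::vector<BiasType> SymmetricBiasQuantize(const float* data,
                                            uint64_t num_elements,
                                            const std::vector<float>& scales) {
  const BiasType kScale = std::numeric_limits<BiasType>::max();
  std::vector<BiasType> buffer(num_elements);
  for (uint64_t i = 0; i < num_elements; ++i) {
    // One scale means per-layer; otherwise pick the scale for this channel.
    const float scale = (scales.size() == 1) ? scales[0] : scales[i];
    const float scale_inv = (scale == 0) ? 0 : 1.0f / scale;
    const BiasType quantized_value =
        static_cast<BiasType>(std::round(data[i] * scale_inv));
    buffer[i] = std::min(kScale, std::max(-kScale, quantized_value));
  }
  return buffer;
}

int main() {
  const std::vector<float> bias = {0.002f, -0.004f, 0.008f};
  // Per-layer: every element shares one scale.
  for (int32_t q : SymmetricBiasQuantize<int32_t>(bias.data(), bias.size(),
                                                  {0.001f})) {
    std::cout << q << " ";  // Prints: 2 -4 8
  }
  std::cout << "\n";
  // Per-channel: one scale per bias element.
  for (int32_t q : SymmetricBiasQuantize<int32_t>(bias.data(), bias.size(),
                                                  {0.001f, 0.002f, 0.004f})) {
    std::cout << q << " ";  // Prints: 2 -2 2
  }
  std::cout << "\n";
  return 0;
}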