Add utility methods to quantization utils for model quantization.
PiperOrigin-RevId: 243201849
commit 7e4c43bbe8
parent 6d43308884
tensorflow/lite/tools/optimize/BUILD
@@ -78,6 +78,7 @@ tf_cc_test(
        "//tensorflow/core:lib",
        "//tensorflow/lite:framework",
        "//tensorflow/lite/schema:schema_fbs",
        "@com_google_absl//absl/memory",
        "@com_google_googletest//:gtest",
        "@flatbuffers",
    ],

tensorflow/lite/tools/optimize/quantization_utils.cc
@@ -171,7 +171,7 @@ TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
  }
  float* float_data = reinterpret_cast<float*>(buffer->data.data());
  uint64_t num_elements;
- TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
+ TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));

  std::vector<int8_t> quantized_buffer;
  quantized_buffer.resize(num_elements);
@@ -197,6 +197,116 @@ TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
  return kTfLiteOk;
}

TfLiteStatus AddQuantizationParams(const std::vector<float>& scales,
                                   const std::vector<int64_t>& zero_point,
                                   int quantized_dimension,
                                   const uint8_t* buffer_data,
                                   size_t buffer_size, TensorType output_type,
                                   ModelT* model, TensorT* tensor) {
  tensor->quantization = absl::make_unique<QuantizationParametersT>();
  tensor->quantization->scale.assign(scales.begin(), scales.end());
  if (zero_point.size() != scales.size()) {
    return kTfLiteError;
  }
  tensor->quantization->zero_point.assign(zero_point.begin(), zero_point.end());
  tensor->quantization->quantized_dimension = quantized_dimension;
  model->buffers[tensor->buffer]->data.assign(buffer_data,
                                              buffer_data + buffer_size);

  // Update the tensor type.
  tensor->type = output_type;
  return kTfLiteOk;
}
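
AddQuantizationParams is the common sink for the quantization paths below. A minimal call sketch, mirroring the AddQuantizationParams unit test added further down in this diff; `model` and the tensor wiring are assumed to be set up as in that test:

    // Sketch only: attach three per-channel scales/zero points along
    // dimension 3 and replace the tensor's payload with four raw bytes.
    // `model` is a std::unique_ptr<ModelT> wired as in the unit test below.
    const std::vector<float> scales = {0.5f, 1.0f, 1.5f};
    const std::vector<int64_t> zero_points = {5, 10, 15};
    const std::vector<uint8_t> payload = {1, 2, 3, 4};
    TF_LITE_ENSURE_STATUS(AddQuantizationParams(
        scales, zero_points, /*quantized_dimension=*/3, payload.data(),
        payload.size(), TensorType_INT8, model.get(),
        model->subgraphs[0]->tensors[0].get()));
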
TfLiteStatus SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor,
                                               int32_t channel_dim_index) {
  if (tensor->shape.size() != 4) {
    return kTfLiteError;
  }

  // Get dimensions.
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));
  const int32_t channel_dim_size = tensor->shape[channel_dim_index];

  // Get input float data.
  BufferT* buffer = model->buffers[tensor->buffer].get();
  float* float_input_data = reinterpret_cast<float*>(buffer->data.data());

  // Create containers for the output scales and output data.
  std::vector<float> scales(channel_dim_size);
  std::vector<int8_t> final_buffer(num_elements);

  // Quantize the input data with respect to channel_dim_index.
  const std::vector<int> tensor_dims = {tensor->shape[0], tensor->shape[1],
                                        tensor->shape[2], tensor->shape[3]};
  SymmetricPerChannelQuantization(float_input_data, tensor_dims,
                                  channel_dim_index, &scales, &final_buffer);

  // Set the buffers and output type.
  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
  const size_t buffer_size = num_elements * sizeof(int8_t);
  std::vector<int64_t> zero_point(scales.size(), 0);
  return AddQuantizationParams(scales, zero_point, channel_dim_index,
                               uint8_buffer, buffer_size, TensorType_INT8,
                               model, tensor);
}
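
SymmetricQuantizeTensorPerChannel delegates the actual math to SymmetricPerChannelQuantization, which is not part of this diff. A self-contained sketch of the usual symmetric per-channel rule, under the assumptions that scale[c] = max|x over channel c| / 127, values are clamped to [-127, 127], and channels are laid out contiguously; the helper name is hypothetical, not TFLite API:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Hypothetical standalone helper. Assumes contiguous channels for
    // simplicity; the real code quantizes along an arbitrary dimension.
    void PerChannelSymmetricQuantize(const std::vector<float>& input,
                                     int num_channels,
                                     std::vector<float>* scales,
                                     std::vector<int8_t>* quantized) {
      const int per_channel = static_cast<int>(input.size()) / num_channels;
      scales->assign(num_channels, 0.0f);
      quantized->resize(input.size());
      for (int c = 0; c < num_channels; ++c) {
        // Scale is chosen so the largest magnitude in the channel maps to 127.
        float max_abs = 0.0f;
        for (int i = 0; i < per_channel; ++i) {
          max_abs = std::max(max_abs, std::abs(input[c * per_channel + i]));
        }
        const float scale = max_abs / 127.0f;
        (*scales)[c] = scale;
        for (int i = 0; i < per_channel; ++i) {
          const float q =
              scale == 0.0f ? 0.0f
                            : std::round(input[c * per_channel + i] / scale);
          (*quantized)[c * per_channel + i] = static_cast<int8_t>(
              std::min(127.0f, std::max(-127.0f, q)));
        }
      }
    }
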
TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor,
                                             float input_scale,
                                             const float* weight_scales,
                                             int number_of_dimension,
                                             int dimension_index) {
  // Compute scales.
  std::vector<float> scales(number_of_dimension);
  for (size_t i = 0; i < number_of_dimension; i++) {
    scales[i] = input_scale * weight_scales[i];
  }

  BufferT* buffer = model->buffers[tensor->buffer].get();
  float* float_data = reinterpret_cast<float*>(buffer->data.data());
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));

  std::vector<int32_t> final_buffer(num_elements);
  const int32_t kScale = std::numeric_limits<int32_t>::max();

  for (int32_t channel_idx = 0; channel_idx < number_of_dimension;
       channel_idx++) {
    float scaling_factor = scales[channel_idx];
    float scaling_factor_inv = (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
    const int32_t quantized_value = static_cast<int32_t>(
        TfLiteRound(float_data[channel_idx] * scaling_factor_inv));
    final_buffer[channel_idx] =
        std::min(kScale, std::max(-kScale, quantized_value));
  }

  // Set the buffers and output type.
  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
  size_t buffer_size = num_elements * sizeof(int32_t);
  std::vector<int64_t> zero_point(scales.size(), 0);
  return AddQuantizationParams(scales, zero_point, dimension_index,
                               uint8_buffer, buffer_size, TensorType_INT32,
                               model, tensor);
}
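
The bias rule above is easy to check by hand: the effective per-channel scale is input_scale * weight_scales[c], and each float bias becomes round(bias[c] / scale) as an int32. A standalone check using the values from the SymmetricPerChannelBiasQuantize unit test at the bottom of this diff:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Values from the SymmetricPerChannelBiasQuantize unit test below.
      const float input_scale = 0.5f;
      const float weight_scales[] = {0.5f, 1.0f};  // one per output channel
      const float bias[] = {4.0f, 1.0f};
      for (int c = 0; c < 2; ++c) {
        const float scale = input_scale * weight_scales[c];
        const int32_t q = static_cast<int32_t>(std::round(bias[c] / scale));
        // Prints 16 and 2 -- the int32 values behind the expected
        // little-endian bytes {16, 0, 0, 0, 2, 0, 0, 0} in the test.
        std::printf("channel %d: scale=%.2f quantized=%d\n", c, scale, q);
      }
      return 0;
    }
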
TfLiteStatus QuantizeWeight(ModelT* model, TensorT* tensor, bool per_channel,
                            int per_axis_index) {
  if (per_channel) {
    return SymmetricQuantizeTensorPerChannel(model, tensor, per_axis_index);
  } else {
    return SymmetricQuantizeTensor(model, tensor);
  }
}

void QuantizeActivation(TensorT* tensor) {
  GetAsymmetricQuantizationParams(
      tensor->quantization->min[0], tensor->quantization->max[0],
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
      tensor->quantization.get());
  tensor->type = TensorType_INT8;
}

}  // namespace utils
}  // namespace optimize
}  // namespace tflite
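
QuantizeActivation defers the parameter derivation to GetAsymmetricQuantizationParams, whose body is not part of this diff. A rough, hedged sketch of the standard asymmetric affine rule such a helper follows for an int8 target range; the real helper may differ in detail, e.g. in how it nudges the range so that zero is exactly representable:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct AffineParams {
      float scale;
      int32_t zero_point;
    };

    // Standard asymmetric affine quantization for int8 [-128, 127],
    // sketched independently of the TFLite helper.
    AffineParams AsymmetricParams(float rmin, float rmax) {
      rmin = std::min(rmin, 0.0f);  // representable range must contain zero
      rmax = std::max(rmax, 0.0f);
      const float qmin = -128.0f;
      const float qmax = 127.0f;
      const float scale = (rmax - rmin) / (qmax - qmin);
      if (scale == 0.0f) return {1.0f, 0};  // degenerate all-zero range
      return {scale, static_cast<int32_t>(std::round(qmin - rmin / scale))};
    }
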
tensorflow/lite/tools/optimize/quantization_utils.h
@@ -64,6 +64,33 @@ void SymmetricPerChannelQuantizeValues(const float* const input,
// of the tensor.
TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor);

// Adds quantization parameters to the tensor and replaces its buffer contents.
TfLiteStatus AddQuantizationParams(const std::vector<float>& scales,
                                   const std::vector<int64_t>& zero_point,
                                   int quantized_dimension,
                                   const uint8_t* buffer_data,
                                   size_t buffer_size, TensorType output_type,
                                   ModelT* model, TensorT* tensor);

// Symmetrically quantizes the tensor per channel.
TfLiteStatus SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor,
                                               int32_t channel_dim_index);

// Symmetrically quantizes the bias for ops like Conv and DepthwiseConv.
// The scale of the bias is weight_per_channel_scale[channel] * input_scale.
TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor,
                                             float input_scale,
                                             const float* weight_scales,
                                             int number_of_dimension,
                                             int dimension_index);

// Quantizes the weight tensor, either per channel or per tensor.
TfLiteStatus QuantizeWeight(ModelT* model, TensorT* tensor, bool per_channel,
                            int per_axis_index);

// Asymmetrically quantizes the activation tensor.
void QuantizeActivation(TensorT* tensor);

}  // namespace utils
}  // namespace optimize
}  // namespace tflite
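
Putting the new header API together: a caller quantizing convolution weights would typically take the per-channel path along the output-channel axis (dimension 0 in TFLite's [output, height, width, input] conv weight layout). Illustrative fragment only; `weight_tensor` is an assumed pointer to a float weight tensor in `model`:

    // Per-channel along the output-channel axis for conv weights; pass
    // per_channel=false to fall back to per-tensor symmetric quantization.
    TF_LITE_ENSURE_STATUS(utils::QuantizeWeight(model.get(), weight_tensor,
                                                /*per_channel=*/true,
                                                /*per_axis_index=*/0));
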
tensorflow/lite/tools/optimize/quantization_utils_test.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/tools/optimize/quantization_utils.h"

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/memory/memory.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/util/command_line_flags.h"
@@ -256,6 +258,70 @@ TEST(QuantizationUtilsTest, SymmetricQuantizeTensor) {
  EXPECT_EQ(quant_buffer_size * 4, float_buffer_size);
}

TEST(QuantizationUtilsTest, AddQuantizationParams) {
  // Create data.
  auto model = absl::make_unique<ModelT>();
  auto subgraph = absl::make_unique<tflite::SubGraphT>();
  auto tensor = absl::make_unique<TensorT>();
  auto buffer = absl::make_unique<tflite::BufferT>();
  const std::vector<float> scales = {0.5, 1.0, 1.5};
  const std::vector<int64_t> zero_points = {5, 10, 15};
  const int32_t quantized_dimension = 3;
  const std::vector<uint8_t> buffer_data = {1, 2, 3, 4};
  const int32_t buffer_size = 4;
  tensor->buffer = 0;

  // Wire the model.
  model->subgraphs.push_back(std::move(subgraph));
  model->subgraphs[0]->tensors.push_back(std::move(tensor));
  model->buffers.push_back(std::move(buffer));

  // Call and verify.
  EXPECT_EQ(
      AddQuantizationParams(scales, zero_points, quantized_dimension,
                            buffer_data.data(), buffer_size, TensorType_INT8,
                            model.get(), model->subgraphs[0]->tensors[0].get()),
      kTfLiteOk);
  EXPECT_THAT(model->subgraphs[0]->tensors[0]->quantization->scale,
              ElementsAreArray(scales));
  EXPECT_THAT(model->subgraphs[0]->tensors[0]->quantization->zero_point,
              ElementsAreArray(zero_points));
  EXPECT_THAT(model->buffers[model->subgraphs[0]->tensors[0]->buffer]->data,
              ElementsAreArray(buffer_data));
  EXPECT_EQ(model->subgraphs[0]->tensors[0]->type, TensorType_INT8);
}

TEST(QuantizationUtilsTest, SymmetricPerChannelBiasQuantize) {
  // Create data.
  auto model = absl::make_unique<ModelT>();
  auto subgraph = absl::make_unique<tflite::SubGraphT>();
  auto tensor = absl::make_unique<TensorT>();
  auto buffer = absl::make_unique<tflite::BufferT>();
  const std::vector<float> weight_scales = {0.5, 1.0};
  const float input_scale = 0.5;
  std::vector<float> bias_data = {4.0, 1.0};
  auto bias_reinterpreted_data =
      reinterpret_cast<const unsigned char*>(bias_data.data());
  // Copy exactly the two float bias values (2 * sizeof(float) = 8 bytes).
  buffer->data.assign(bias_reinterpreted_data,
                      bias_reinterpreted_data + 2 * sizeof(float));
  tensor->buffer = 0;
  tensor->shape = {2, 1, 1, 1};
  tensor->quantization = absl::make_unique<QuantizationParametersT>();

  // Wire the model.
  model->subgraphs.push_back(std::move(subgraph));
  model->subgraphs[0]->tensors.push_back(std::move(tensor));
  model->buffers.push_back(std::move(buffer));

  // Call and verify.
  EXPECT_EQ(SymmetricPerChannelBiasQuantize(
                model.get(), model->subgraphs[0]->tensors[0].get(), input_scale,
                weight_scales.data(), 2, 0),
            kTfLiteOk);
  EXPECT_THAT(model->buffers[model->subgraphs[0]->tensors[0]->buffer]->data,
              ElementsAreArray({16, 0, 0, 0, 2, 0, 0, 0}));
  EXPECT_EQ(model->subgraphs[0]->tensors[0]->type, TensorType_INT32);
}

}  // namespace
}  // namespace utils
}  // namespace optimize