Support quantized FC layer (with biases).
This required adding support for quantizing per-layer biases.

PiperOrigin-RevId: 243709827
Parent: 66008c9031
Commit: 0ef2e0526f
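Background for the change below: with symmetric quantization the bias is added into the int32 accumulator of input x weight products, so it is stored as int32 with scale input_scale * weight_scale and zero point 0. The snippet that follows is a minimal standalone sketch of that per-layer (single-scale) conversion written for this note; it is not code from the diff, and the name QuantizeBiasPerLayer is made up.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

// Illustration only: per-layer symmetric bias quantization.
// scale = input_scale * weight_scale, zero point = 0.
std::vector<int32_t> QuantizeBiasPerLayer(const std::vector<float>& bias,
                                          float input_scale,
                                          float weight_scale) {
  const float scale = input_scale * weight_scale;
  const float inv_scale = (scale == 0.f) ? 0.f : 1.f / scale;
  const int32_t kMax = std::numeric_limits<int32_t>::max();
  std::vector<int32_t> q(bias.size());
  for (size_t i = 0; i < bias.size(); ++i) {
    const int32_t v = static_cast<int32_t>(std::round(bias[i] * inv_scale));
    q[i] = std::min(kMax, std::max(-kMax, v));  // symmetric clamp to int32 range
  }
  return q;
}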
@@ -178,6 +178,7 @@ tf_cc_test(
         "//tensorflow/lite/tools/optimize:testdata/add_with_const_input.bin",
         "//tensorflow/lite/tools/optimize:testdata/argmax.bin",
         "//tensorflow/lite/tools/optimize:testdata/concat.bin",
+        "//tensorflow/lite/tools/optimize:testdata/fc.bin",
         "//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
@@ -252,6 +252,37 @@ TfLiteStatus SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor,
                                model, tensor);
 }
 
+TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor,
+                                           float input_scale,
+                                           float weight_scale) {
+  // Compute scales.
+  float scaling_factor = input_scale * weight_scale;
+
+  BufferT* buffer = model->buffers[tensor->buffer].get();
+  float* float_data = reinterpret_cast<float*>(buffer->data.data());
+  int32_t float_data_size = buffer->data.size() / sizeof(float);
+  uint64_t num_elements;
+  TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements));
+
+  std::vector<int32_t> final_buffer(num_elements);
+  const int32_t kScale = std::numeric_limits<int32_t>::max();
+
+  for (int32_t i = 0; i < float_data_size; i++) {
+    float scaling_factor_inv = (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
+    const int32_t quantized_value =
+        static_cast<int32_t>(TfLiteRound(float_data[i] * scaling_factor_inv));
+    final_buffer[i] = std::min(kScale, std::max(-kScale, quantized_value));
+  }
+
+  // Set the buffers and output type.
+  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
+  size_t buffer_size = num_elements * sizeof(int32_t);
+  std::vector<float> scales(1, scaling_factor);
+  std::vector<int64_t> zero_points(1, 0);
+  return AddQuantizationParams(scales, zero_points, 0, uint8_buffer,
+                               buffer_size, TensorType_INT32, model, tensor);
+}
+
 TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor,
                                              float input_scale,
                                              const float* weight_scales,
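To make the arithmetic in SymmetricPerLayerBiasQuantize concrete, here is a tiny worked example using the same values as the unit test further down (input_scale = weight_scale = 0.5, bias = {4.0, 1.0}). It is illustrative only and not part of the change.

#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const float bias_scale = 0.5f * 0.5f;  // input_scale * weight_scale = 0.25
  const int32_t q0 = static_cast<int32_t>(std::round(4.0f / bias_scale));
  const int32_t q1 = static_cast<int32_t>(std::round(1.0f / bias_scale));
  assert(q0 == 16 && q1 == 4);  // the values the new unit test expects
  return 0;
}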
@@ -76,6 +76,11 @@ TfLiteStatus AddQuantizationParams(const std::vector<float>& scales,
 TfLiteStatus SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor,
                                                int32_t channel_dim_index);
 
+// Symmetrically quantized the bias for per-layer ops (i.e. FullyConnected).
+TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor,
+                                           float input_scale,
+                                           float weight_scale);
+
 // Symmetrically quantizes the bias for ops like Conv and DepthwiseConv.
 // The scale of bias if weight_per_channel_scale[channel] * input_scale
 TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor,
@@ -291,6 +291,43 @@ TEST(QuantizationUtilsTest, AddQuantizationParams) {
   EXPECT_EQ(model->subgraphs[0]->tensors[0]->type, TensorType_INT8);
 }
 
+TEST(QuantizationUtilsTest, SymmetricPerLayerBiasQuantize) {
+  // Create data.
+  auto model = absl::make_unique<ModelT>();
+  auto subgraph = absl::make_unique<tflite::SubGraphT>();
+  auto tensor = absl::make_unique<TensorT>();
+  auto buffer = absl::make_unique<tflite::BufferT>();
+  const float weight_scale = 0.5;
+  const float input_scale = 0.5;
+  std::vector<float> bias_data = {4.0, 1.0};
+  auto bias_reinterpreted_data =
+      reinterpret_cast<const unsigned char*>(bias_data.data());
+  buffer->data.assign(bias_reinterpreted_data,
+                      bias_reinterpreted_data + bias_data.size() * 4);
+  tensor->buffer = 0;
+  tensor->shape = {2, 1, 1, 1};
+  tensor->quantization = absl::make_unique<QuantizationParametersT>();
+
+  // Wire the model.
+  model->subgraphs.push_back(std::move(subgraph));
+  model->subgraphs[0]->tensors.push_back(std::move(tensor));
+  model->buffers.push_back(std::move(buffer));
+
+  // Call and verify.
+  EXPECT_EQ(SymmetricPerLayerBiasQuantize(model.get(),
+                                          model->subgraphs[0]->tensors[0].get(),
+                                          input_scale, weight_scale),
+            kTfLiteOk);
+
+  EXPECT_THAT(model->subgraphs[0]->tensors[0]->quantization->scale[0],
+              weight_scale * input_scale);
+  EXPECT_THAT(model->subgraphs[0]->tensors[0]->quantization->zero_point[0], 0);
+
+  EXPECT_THAT(model->buffers[model->subgraphs[0]->tensors[0]->buffer]->data,
+              ElementsAreArray({16, 0, 0, 0, 4, 0, 0, 0}));
+  EXPECT_EQ(model->subgraphs[0]->tensors[0]->type, TensorType_INT32);
+}
+
 TEST(QuantizationUtilsTest, SymmetricPerChannelBiasQuantize) {
   // Create data.
   auto model = absl::make_unique<ModelT>();
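The ElementsAreArray({16, 0, 0, 0, 4, 0, 0, 0}) expectation above inspects the raw flatbuffer byte buffer, so the two int32 results 16 and 4 appear as little-endian byte runs. A small illustrative check of that layout (it assumes a little-endian host, as the test itself does; not part of the diff):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
  const int32_t quantized[2] = {16, 4};
  std::vector<uint8_t> bytes(sizeof(quantized));
  std::memcpy(bytes.data(), quantized, sizeof(quantized));  // view as raw bytes
  const std::vector<uint8_t> expected = {16, 0, 0, 0, 4, 0, 0, 0};
  assert(bytes == expected);  // little-endian int32 layout
  return 0;
}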
@@ -37,36 +37,50 @@ namespace optimize {
 namespace {
 TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor,
                           const TensorT* weight_tensor, TensorT* bias_tensor,
-                          int channel_dim_index,
+                          bool is_per_channel, int channel_dim_index,
                           ErrorReporter* error_reporter) {
   if (bias_tensor->shape.size() != 1) {
     error_reporter->Report("Expected bias tensor shape to be 1.");
     return kTfLiteError;
   }
 
-  if (bias_tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
-    error_reporter->Report(
-        "Channel mismatch between bias and weight tensors %d vs %d",
-        bias_tensor->shape[0], weight_tensor->shape[channel_dim_index]);
-    return kTfLiteError;
-  }
   int32_t channel_dim_size = bias_tensor->shape[0];
-  if (!input_tensor->quantization ||
-      input_tensor->quantization->scale.size() != 1) {
-    error_reporter->Report("Input tensor missing quantization information");
-    return kTfLiteError;
-  }
   TF_LITE_ENSURE(error_reporter, weight_tensor->quantization);
-  const std::vector<float>& weight_scales = weight_tensor->quantization->scale;
+  std::vector<float> weight_scales = weight_tensor->quantization->scale;
 
-  if (weight_scales.size() != channel_dim_size) {
-    error_reporter->Report("Mismatch weight scale dimension: %d",
-                           weight_scales.size());
-    return kTfLiteError;
+  if (is_per_channel) {
+    if (bias_tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
+      error_reporter->Report(
+          "Channel mismatch between bias and weight tensors %d vs %d",
+          bias_tensor->shape[0], weight_tensor->shape[channel_dim_index]);
+      return kTfLiteError;
+    }
+    if (!input_tensor->quantization ||
+        input_tensor->quantization->scale.size() != 1) {
+      error_reporter->Report("Input tensor missing quantization information");
+      return kTfLiteError;
+    }
+
+    if (weight_scales.size() != channel_dim_size) {
+      error_reporter->Report("Mismatch weight scale dimension: %d",
+                             weight_scales.size());
+      return kTfLiteError;
+    }
+    return utils::SymmetricPerChannelBiasQuantize(
+        model, bias_tensor, input_tensor->quantization->scale[0],
+        weight_scales.data(), channel_dim_size, channel_dim_index);
+  } else {
+    if (weight_scales.size() != 1) {
+      error_reporter->Report(
+          "Expected per-layer weight scale dimension size 1, got %d",
+          weight_scales.size());
+      return kTfLiteError;
+    }
+    return utils::SymmetricPerLayerBiasQuantize(
+        model, bias_tensor, input_tensor->quantization->scale[0],
+        weight_scales[0]);
   }
-  return utils::SymmetricPerChannelBiasQuantize(
-      model, bias_tensor, input_tensor->quantization->scale[0],
-      weight_scales.data(), channel_dim_size, channel_dim_index);
+  return kTfLiteError;
 }
 
 // True if the tensor type has to be modified.
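For contrast with the new per-layer path above, the per-channel branch gives every output channel its own bias scale, input_scale * weight_scales[c], while the per-layer branch collapses this to a single scale. A hypothetical standalone sketch of the per-channel idea (QuantizeBiasPerChannelSketch is a made-up name, and clamping is omitted for brevity; this is not the TFLite helper itself):

#include <cmath>
#include <cstdint>
#include <vector>

// Illustration only: per-channel bias scales vs. a single per-layer scale.
std::vector<int32_t> QuantizeBiasPerChannelSketch(
    const std::vector<float>& bias, const std::vector<float>& weight_scales,
    float input_scale) {
  std::vector<int32_t> q(bias.size());
  for (size_t c = 0; c < bias.size(); ++c) {
    const float scale = input_scale * weight_scales[c];  // one scale per channel
    q[c] = static_cast<int32_t>(
        std::round(scale == 0.f ? 0.f : bias[c] / scale));
  }
  return q;
}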
@@ -478,7 +492,7 @@ TfLiteStatus QuantizeBiases(flatbuffers::FlatBufferBuilder* builder,
     for (const int bias_idx : property.biases) {
       if (bias_idx >= op->inputs.size()) {
         error_reporter->Report(
-            "Requaired input index %d is larger than the input length of "
+            "Required input index %d is larger than the input length of "
            "op %s at index %d in subgraph %d",
            bias_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code),
            op_idx, subgraph_idx);
|
|||||||
subgraph->tensors[op->inputs[property.input_indexes[0]]].get();
|
subgraph->tensors[op->inputs[property.input_indexes[0]]].get();
|
||||||
TensorT* weight_tensor =
|
TensorT* weight_tensor =
|
||||||
subgraph->tensors[op->inputs[property.input_indexes[1]]].get();
|
subgraph->tensors[op->inputs[property.input_indexes[1]]].get();
|
||||||
QuantizeBias(model, input_tensor, weight_tensor, bias_tensor,
|
TF_LITE_ENSURE_STATUS(QuantizeBias(
|
||||||
property.per_axis_index, error_reporter);
|
model, input_tensor, weight_tensor, bias_tensor,
|
||||||
|
property.per_axis, property.per_axis_index, error_reporter));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -827,6 +827,48 @@ TEST_F(QuantizeArgMaxTest, VerifyArgMax) {
             subgraph->tensors[op->outputs[0]].get()->type);
 }
 
+class QuantizeFCTest : public QuantizeModelTest {
+ protected:
+  QuantizeFCTest() {
+    input_model_ = ReadModel(internal::kModelWithFCOp);
+    readonly_model_ = input_model_->GetModel();
+    readonly_model_->UnPackTo(&model_);
+  }
+};
+
+TEST_F(QuantizeFCTest, VerifyFC) {
+  auto status = QuantizeModel(&builder_, &model_, TensorType_INT8,
+                              TensorType_INT8, &error_reporter_);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  const auto& subgraph = model_.subgraphs[0];
+  auto op = subgraph->operators[0].get();
+  ASSERT_EQ(model_.operator_codes[op->opcode_index].get()->builtin_code,
+            BuiltinOperator_FULLY_CONNECTED);
+
+  ASSERT_EQ(op->inputs.size(), 3);
+  ASSERT_EQ(op->outputs.size(), 1);
+
+  auto float_graph = readonly_model_->subgraphs()->Get(0);
+  // Verify FC input and weight is quantized.
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(),
+            TensorType_FLOAT32);
+  EXPECT_EQ(subgraph->tensors[op->inputs[0]].get()->type, TensorType_INT8);
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[1])->type(),
+            TensorType_FLOAT32);
+  EXPECT_EQ(subgraph->tensors[op->inputs[1]].get()->type, TensorType_INT8);
+
+  // Verify FC bias should be int32 quantized.
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[2])->type(),
+            TensorType_FLOAT32);
+  EXPECT_EQ(subgraph->tensors[op->inputs[2]].get()->type, TensorType_INT32);
+
+  // The output of FC should be quantized.
+  ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
+            TensorType_FLOAT32);
+  EXPECT_EQ(subgraph->tensors[op->outputs[0]].get()->type, TensorType_INT8);
+}
+
 } // namespace
 } // namespace optimize
 } // namespace tflite
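The type expectations in VerifyFC (int8 input and weights, int32 bias, int8 output) follow from how a quantized FullyConnected kernel accumulates: the int32 accumulator of input * weight products already carries the combined scale input_scale * weight_scale, so a bias quantized at that same scale can be added to it directly before requantizing to the output scale. A simplified single-output-unit sketch of that flow (illustrative only, ignores input zero points for brevity, and is not TFLite kernel code):

#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t FullyConnectedOneUnitSketch(const int8_t* input, const int8_t* weights,
                                   int n, int32_t quantized_bias,
                                   float input_scale, float weight_scale,
                                   float output_scale) {
  int32_t acc = 0;
  for (int i = 0; i < n; ++i) {
    acc += static_cast<int32_t>(input[i]) * static_cast<int32_t>(weights[i]);
  }
  acc += quantized_bias;  // bias scale matches the accumulator scale
  const float real_value = acc * input_scale * weight_scale;  // dequantize
  int32_t out = static_cast<int32_t>(std::round(real_value / output_scale));
  out = std::min<int32_t>(127, std::max<int32_t>(-128, out));  // clamp to int8
  return static_cast<int8_t>(out);
}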
@@ -43,6 +43,8 @@ const char* kModelWithCustomOp = "custom_op.bin";
 
 const char* kModelWithArgMaxOp = "argmax.bin";
 
+const char* kModelWithFCOp = "fc.bin";
+
 int FailOnErrorReporter::Report(const char* format, va_list args) {
   char buf[1024];
   vsnprintf(buf, sizeof(buf), format, args);
@@ -66,6 +66,9 @@ extern const char* kModelWithCustomOp;
 // Test model with a argmax op.
 extern const char* kModelWithArgMaxOp;
 
+// Test model with a argmax op.
+extern const char* kModelWithFCOp;
+
 // An error reporter that fails on testing.
 class FailOnErrorReporter : public ErrorReporter {
  public:
BIN  tensorflow/lite/tools/optimize/testdata/fc.bin (new binary file, not shown)