Add LSTM quantization spec and quantizer support for LSTM.

Test data are generated manually and populated with dummy values.

PiperOrigin-RevId: 279857476
Change-Id: I2ac46bf3836d5eb89688b5127c23b9b0ec5cbafd
Jian Li 2019-11-11 17:57:32 -08:00 committed by TensorFlower Gardener
parent 09d58c6012
commit fd4ad2ebdc
11 changed files with 368 additions and 10 deletions

View File

@@ -219,6 +219,8 @@ tf_cc_test(
"//tensorflow/lite/tools/optimize:testdata/argmax.bin",
"//tensorflow/lite/tools/optimize:testdata/concat.bin",
"//tensorflow/lite/tools/optimize:testdata/fc.bin",
"//tensorflow/lite/tools/optimize:testdata/lstm_calibrated.bin",
"//tensorflow/lite/tools/optimize:testdata/lstm_quantized.bin",
"//tensorflow/lite/tools/optimize:testdata/mixed.bin",
"//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin",
"//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",

View File

@@ -168,9 +168,73 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index,
case BuiltinOperator_LSTM: {
// TODO(jianlijianli): extend LSTM op spec to include input, bias, etc.
// TODO(jianlijianli): extend this to other variants of LSTM.
// LSTM needs 5 intermediate tensors. This agrees with the fully quantized
// kernels in lstm_eval.cc
static const float alpha = static_cast<float>(std::pow(2, -10));
TensorProperty tensor_property_12;
tensor_property_12.use_derived_scale = true;
tensor_property_12.number_of_bits = 32;
tensor_property_12.derived_scale = {{20}, {}, {alpha}};
TensorProperty tensor_property_13;
tensor_property_13.use_derived_scale = true;
tensor_property_13.number_of_bits = 32;
tensor_property_13.derived_scale = {{21}, {}, {alpha}};
TensorProperty tensor_property_14;
tensor_property_14.use_derived_scale = true;
tensor_property_14.number_of_bits = 32;
tensor_property_14.derived_scale = {{22}, {}, {alpha}};
TensorProperty tensor_property_15;
tensor_property_15.use_derived_scale = true;
tensor_property_15.number_of_bits = 32;
tensor_property_15.derived_scale = {{23}, {}, {alpha}};
TensorProperty tensor_property_17;
tensor_property_17.use_derived_scale = true;
tensor_property_17.number_of_bits = 32;
tensor_property_17.derived_scale = {{16}, {4}, {}};
TensorProperty tensor_property_19;
tensor_property_19.extend_to_power_of_two = true;
tensor_property_19.number_of_bits = 16;
tensor_property_19.state_tensor = true;
tensor_property_19.symmetric = true;
TensorProperty tensor_property_20;
tensor_property_20.number_of_bits = 16;
tensor_property_20.symmetric = true;
property.inputs = {
{0, {}},
{1, {}},
{2, {}},
{3, {}},
{4, {}},
{5, {}},
{6, {}},
{7, {}},
{8, {}},
{9, {}},
{10, {}},
{11, {}},
{16, {}},
{19, tensor_property_19},
{20, tensor_property_20},
{21, tensor_property_20},
{22, tensor_property_20},
{23, tensor_property_20},
{12, tensor_property_12},
{13, tensor_property_13},
{14, tensor_property_14},
{15, tensor_property_15},
{17, tensor_property_17},
};
property.outputs = {{0, {}}};
property.intermediates = {
{0, tensor_property_20},
{1, tensor_property_20},
{2, tensor_property_20},
{3, tensor_property_20},
{4, {}},
};
property.restrict_scale = {{18, 0}};
property.version = 2;
break;
}
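A note for readers of the derived-scale entries above: GetEffectiveScale (shown further down) computes a derived scale as the product of the scales of the listed input tensors, the scales of the listed intermediate tensors, and the literal factors. Here is a minimal standalone sketch of that rule for LSTM tensors 12 and 17; the calibrated scale values are invented and the snippet is not part of the change.

#include <cmath>
#include <cstdio>

int main() {
  const float alpha = static_cast<float>(std::pow(2, -10));
  // Invented calibrated scales for LSTM input tensors 20 and 16 and for
  // intermediate tensor 4.
  const float scale_input_20 = 0.0015f;
  const float scale_input_16 = 0.0020f;
  const float scale_intermediate_4 = 0.0001f;
  // Tensor 12: derived_scale = {{20}, {}, {alpha}}  ->  scale_20 * alpha.
  const float scale_12 = scale_input_20 * alpha;
  // Tensor 17: derived_scale = {{16}, {4}, {}}  ->  scale_16 * scale_int_4.
  const float scale_17 = scale_input_16 * scale_intermediate_4;
  std::printf("scale_12=%g scale_17=%g\n", scale_12, scale_17);
  return 0;
}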

View File

@@ -44,6 +44,21 @@ struct TensorProperty {
bool restriction = false;
// scale/zero_point hardcoded.
std::pair<float, int> restricted_value = {0.0, 0};
// Use derived scale.
bool use_derived_scale = false;
// The derived scale.
DerivedScale derived_scale;
// The number of bits for this tensor. It can be 8, 16, 32, or even a value
// that is not a power of two.
int number_of_bits = 8;
// Extend the range to power of two.
bool extend_to_power_of_two = false;
// State tensor.
bool state_tensor = false;
};
struct OperatorProperty {
@@ -55,10 +70,13 @@ struct OperatorProperty {
// Op has arbitrary number of outputs, such as slice.
bool arbitrary_outputs = false;
// Input indexes -> input tensor property.
// Must be topologically sorted since there are derived scales.
std::vector<std::pair<int, TensorProperty>> inputs = {};
// Output indexes -> output tensor property.
std::vector<std::pair<int, TensorProperty>> outputs = {};
// Bias indexes.
// TODO(jianlijianli): remove this by putting biases into inputs as well, since
// we can now model "derived scale".
std::vector<int> biases = {};
// Intermediate indexes -> intermediate tensor property.
@@ -67,6 +85,12 @@
// Force output to reuse the same scale and zero point of input.
bool restrict_same_input_output_scale = false;
// Use the same min of mins and max of maxes for each group.
// Incompatible with restrict_same_input_output_scale and restricted_value.
// TODO(jianlijianli): make it compatible with other restrictions when there
// is a use case.
std::vector<std::vector<int>> restrict_scale = {};
// Op version.
int version = 1;
};
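As a quick illustration of how the new TensorProperty and OperatorProperty fields compose, here is a hedged sketch for a hypothetical op whose bias scale is derived from its two other inputs. It assumes the operator_property.h include path and the tflite::optimize::operator_property namespace; the op, its indices, and the values are made up.

#include "tensorflow/lite/tools/optimize/operator_property.h"

namespace tflite {
namespace optimize {
namespace operator_property {

// Hypothetical op: inputs 0 (activation), 1 (weights), 2 (32-bit bias).
OperatorProperty ExampleProperty() {
  TensorProperty bias_property;
  bias_property.use_derived_scale = true;
  bias_property.number_of_bits = 32;
  // Bias scale = scale(input 0) * scale(input 1); no intermediates or factors.
  bias_property.derived_scale = {{0, 1}, {}, {}};

  OperatorProperty property;
  // Inputs are listed in topological order because of the derived scale.
  property.inputs = {{0, {}}, {1, {}}, {2, bias_property}};
  property.outputs = {{0, {}}};
  // Input 0 and output 0 must end up with the same scale and zero point.
  property.restrict_scale = {{0, 0}};
  return property;
}

}  // namespace operator_property
}  // namespace optimize
}  // namespace tflite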

View File

@@ -625,11 +625,15 @@ float GetEffectiveScale(ModelT* model, SubGraphT* subgraph, int op_idx,
float scale = 1.0f;
OperatorT* op = subgraph->operators[op_idx].get();
for (int i = 0; i < input_index.size(); ++i) {
const int index_local = input_index[i];
const int index_global = op->inputs[index_local];
const TensorT* tensor = subgraph->tensors[index_global].get();
scale *= tensor->quantization->scale[0];
}
for (int i = 0; i < intermediate_index.size(); ++i) {
const int index_local = intermediate_index[i];
const int index_global = op->intermediates[index_local];
const TensorT* tensor = subgraph->tensors[index_global].get();
scale *= tensor->quantization->scale[0];
}
for (int i = 0; i < factors.size(); ++i) {
@@ -646,6 +650,15 @@ void QuantizeActivation(TensorT* tensor) {
tensor->type = TensorType_INT8;
}
TfLiteStatus QuantizeActivationToInt16(TensorT* tensor, float scale) {
const int32 zero_point = 0;
tensor->quantization = absl::make_unique<QuantizationParametersT>();
tensor->quantization->scale.push_back(scale);
tensor->quantization->zero_point.push_back(zero_point);
tensor->type = TensorType_INT16;
return kTfLiteOk;
}
int GetPowerOfTwoScale(float min, float max) {
const float range = std::max(std::abs(min), std::abs(max));
int pot = 0;

View File

@@ -138,6 +138,9 @@ float GetEffectiveScale(ModelT* model, SubGraphT* subgraph, int op_idx,
// Quantize activation.
void QuantizeActivation(TensorT* tensor);
// Quantize activation to 16bit.
TfLiteStatus QuantizeActivationToInt16(TensorT* tensor, float scale);
// Get the power of two scale for min and max for symmetric quantization case.
int GetPowerOfTwoScale(float min, float max);
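A rough usage sketch for the two declarations just added, mirroring how the quantizer handles a 16-bit state tensor whose range is extended to a power of two. The include path and the surrounding namespace are assumptions; the calibrated min/max are expected to come from calibration.

#include <cmath>

#include "tensorflow/lite/tools/optimize/quantization_utils.h"

namespace tflite {
namespace optimize {

// Sketch only: quantize a calibrated state tensor to symmetric int16.
TfLiteStatus QuantizeStateTensorSketch(TensorT* tensor) {
  const float min = tensor->quantization->min[0];
  const float max = tensor->quantization->max[0];
  // Round the symmetric range up to a power of two, e.g. [-5.1, 5.1] -> 8.
  const int pot = utils::GetPowerOfTwoScale(min, max);
  const float range = static_cast<float>(std::pow(2, pot));
  // Activations use the full int16 range: scale = range / 32768.
  const float scale = range / 32768.0f;
  return utils::QuantizeActivationToInt16(tensor, scale);
}

}  // namespace optimize
}  // namespace tflite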

View File

@@ -421,6 +421,10 @@ TfLiteStatus QuantizeOpInput(
return kTfLiteError;
}
const int32_t tensor_idx = op->inputs[input_idx];
if (tensor_idx == -1) {
// Skip optional tensor.
return kTfLiteOk;
}
TensorT* tensor = subgraph->tensors[tensor_idx].get();
// Assumes op is quantized to int8.
const bool is_input_quantized = utils::QuantizationParametersExist(tensor);
@@ -429,9 +433,59 @@ TfLiteStatus QuantizeOpInput(
if (utils::HasBuffer(model, subgraph, tensor_idx)) {
// TODO(suharshs): Look at consumers, throw error if one consumer is
// per-channel and one per-layer.
if (tensor_property.number_of_bits == 8) {
if (tensor_property.use_derived_scale) {
// Currently, 8-bit input tensors do not accept a derived scale.
return kTfLiteError;
}
if (utils::QuantizeWeight(model, tensor, tensor_property.per_axis,
tensor_property.per_axis_index,
error_reporter) == kTfLiteError) {
error_reporter->Report(
"Unable to quantize buffer or min/max value for input %d "
"in op %s in subgraph %d, node: %d",
input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
*op_idx);
return kTfLiteError;
}
} else if (tensor_property.number_of_bits == 16) {
if (tensor_property.use_derived_scale) {
// Currently, 16-bit input tensors do not accept a derived scale.
return kTfLiteError;
}
TensorT* tensor = subgraph->tensors[tensor_idx].get();
int total_size = 1;
for (int i = 0; i < tensor->shape.size(); ++i) {
total_size *= tensor->shape[i];
}
BufferT* buffer = model->buffers[tensor->buffer].get();
float* float_data = reinterpret_cast<float*>(buffer->data.data());
auto minmax = std::minmax_element(float_data, float_data + total_size);
const float min = *minmax.first;
const float max = *minmax.second;
const float range = std::max(std::abs(min), std::abs(max));
// The narrow range quantized value for int16.
const float quantize_range = 32767.0;
const float scale = range / quantize_range;
return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
error_reporter);
} else if (tensor_property.number_of_bits == 32) {
if (!tensor_property.use_derived_scale) {
// Currently, 32-bit input tensors only accept a derived scale.
return kTfLiteError;
}
TensorT* tensor = subgraph->tensors[tensor_idx].get();
const float scale = utils::GetEffectiveScale(
model, subgraph, *op_idx,
tensor_property.derived_scale.input_tensors,
tensor_property.derived_scale.intermediate_tensors,
tensor_property.derived_scale.factors);
return utils::SymmetricPerLayerBiasQuantize(model, tensor, scale,
error_reporter);
} else {
// Only 8, 16, 32 are supported.
// TODO(jianlijianli): extend this to support arbitrary bits.
error_reporter->Report(
"Unable to quantize buffer or min/max value for input %d "
"in op %s in subgraph %d, node: %d",
@@ -439,9 +493,27 @@ TfLiteStatus QuantizeOpInput(
return kTfLiteError;
}
} else if (utils::HasMinMax(tensor)) {
// TODO(suharshs): Handle per-channel dynamic tensor.
if (IsSubgraphInput(subgraph, tensor_idx) ||
tensor_property.state_tensor) {
if (tensor_property.number_of_bits == 8) {
if (tensor_property.use_derived_scale) {
// Currently, 8-bit input tensors do not accept a derived scale.
return kTfLiteError;
}
utils::QuantizeActivation(tensor);
} else if (tensor_property.number_of_bits == 16) {
TensorT* tensor = subgraph->tensors[tensor_idx].get();
float range = std::max(std::abs(tensor->quantization->min[0]),
std::abs(tensor->quantization->max[0]));
if (tensor_property.extend_to_power_of_two) {
const int power_of_two_scale = utils::GetPowerOfTwoScale(
tensor->quantization->min[0], tensor->quantization->max[0]);
range = std::pow(2, power_of_two_scale);
}
const float quantized_range = 32768.0;
const float scale = range / quantized_range;
utils::QuantizeActivationToInt16(tensor, scale);
}
} else {
// If the tensor is not a model input, we need to add a Quantize
// operation since the preceding op may require a float output.
@@ -515,6 +587,10 @@ TfLiteStatus QuantizeOpOutput(
}
TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
if (utils::QuantizationParametersExist(output_tensor)) {
// Skip output if it has been quantized.
return kTfLiteOk;
}
if (ShouldRestrictSameInputOutputScale(property)) {
// Copy quantization parameters. For average pool, max pool, etc.,
// min/max can be different but we want them to be the same.
@@ -576,6 +652,122 @@ TfLiteStatus QuantizeOpOutput(
return kTfLiteOk;
}
TfLiteStatus QuantizeIntemediateTensors(ModelT* model,
ErrorReporter* error_reporter) {
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
subgraph_idx++) {
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
operator_property::OperatorProperty property =
operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
if (!property.intermediates.empty()) {
OperatorT* op = subgraph->operators[op_idx].get();
const BuiltinOperator op_code =
model->operator_codes[op->opcode_index]->builtin_code;
for (const std::pair<int, operator_property::TensorProperty>& input :
property.intermediates) {
const int index_local = input.first;
const int index_global = op->intermediates[index_local];
if (index_global == -1) {
// Skip optional tensor.
continue;
}
if (input.second.number_of_bits == 8 &&
input.second.symmetric == false) {
TensorT* tensor = subgraph->tensors[index_global].get();
if (utils::HasMinMax(tensor)) {
utils::QuantizeActivation(tensor);
} else {
error_reporter->Report(
"Unable to find min/max value for intermediate %d in %s in "
"subgraph %d, node: %d",
index_global, EnumNameBuiltinOperator(op_code), subgraph_idx,
op_idx);
return kTfLiteError;
}
} else if (input.second.number_of_bits == 16 &&
input.second.symmetric == true) {
TensorT* tensor = subgraph->tensors[index_global].get();
if (tensor->quantization == nullptr) {
continue;
}
const float min = tensor->quantization->min[0];
const float max = tensor->quantization->max[0];
const float range = std::max(std::abs(min), std::abs(max));
if (range < 1e-8) {
return kTfLiteError;
}
// Get scale and zero point.
const float quantized_range = 32767.0;
const float scale = range / quantized_range;
utils::QuantizeActivationToInt16(tensor, scale);
} else {
return kTfLiteError;
}
}
}
}
}
return kTfLiteOk;
}
// Quantize tensors that have a shared range. For example, in LSTM, the output
// tensor and input state tensor should share the same range because they are
// using the same scale and zero point.
TfLiteStatus QuantizeSharedRange(ModelT* model, ErrorReporter* error_reporter) {
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
subgraph_idx++) {
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
operator_property::OperatorProperty property =
operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
if (!property.intermediates.empty()) {
OperatorT* op = subgraph->operators[op_idx].get();
for (const std::vector<int>& input : property.restrict_scale) {
if (input.empty()) {
continue;
}
// Currently only pairs of two tensors are supported.
// TODO(jianlijianli): extend to arbitrary number of tensors.
if (input.size() != 2) {
return kTfLiteError;
}
const int index_1 = input[0];
const int index_2 = input[1];
// TODO(jianlijianli): model input/output.
TensorT* tensor_1 = subgraph->tensors[op->inputs[index_1]].get();
TensorT* tensor_2 = subgraph->tensors[op->outputs[index_2]].get();
const float min_of_min = std::min(tensor_1->quantization->min[0],
tensor_2->quantization->min[0]);
const float max_of_max = std::max(tensor_1->quantization->max[0],
tensor_2->quantization->max[0]);
if (min_of_min == 0.0 && max_of_max == 0.0) {
return kTfLiteError;
}
// Asymmetric quantization to 8 bits.
auto quantization_params =
absl::make_unique<QuantizationParametersT>();
utils::GetAsymmetricQuantizationParams(
min_of_min, max_of_max, -128, 127, quantization_params.get());
// Populate both tensors with the same parameters.
const float scale = quantization_params->scale[0];
const int32 zero_point = quantization_params->zero_point[0];
for (TensorT* tensor : {tensor_1, tensor_2}) {
tensor->quantization = absl::make_unique<QuantizationParametersT>();
tensor->quantization->scale.push_back(scale);
tensor->quantization->zero_point.push_back(zero_point);
tensor->type = TensorType_INT8;
}
}
}
}
}
return kTfLiteOk;
}
// Quantize inputs and weights.
// Because of ops such as LSTM, this still needs to be done per op rather than
// per weight.
TfLiteStatus QuantizeWeightsInputOutput(
@@ -713,6 +905,10 @@ TfLiteStatus FillQuantizationParams(
// Get tensor.
const int32_t input_idx = input.first;
const int32_t tensor_idx = op->inputs[input_idx];
if (tensor_idx == -1) {
// Skip optional tensor.
continue;
}
TensorT* tensor = subgraph->tensors[tensor_idx].get();
// Static tensor.
@@ -918,6 +1114,8 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
FillQuantizationParams(model, operator_names, error_reporter));
TF_LITE_ENSURE_STATUS(
EnsureBiasScaleCompatibility(model, operator_names, error_reporter));
TF_LITE_ENSURE_STATUS(QuantizeIntemediateTensors(model, error_reporter));
TF_LITE_ENSURE_STATUS(QuantizeSharedRange(model, error_reporter));
TF_LITE_ENSURE_STATUS(QuantizeWeightsInputOutput(
model, allow_float, operator_names, error_reporter));
TF_LITE_ENSURE_STATUS(

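For a concrete feel of what QuantizeSharedRange writes into the paired tensors, here is a minimal sketch of the standard int8 asymmetric parameter computation. It is only an approximation of utils::GetAsymmetricQuantizationParams (which may additionally nudge the zero point onto the integer grid), and the calibrated min/max values are invented.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // Invented calibrated ranges for the two tensors in a restrict_scale group.
  const float min_of_min = std::min(-1.0f, -0.5f);  // -1.0
  const float max_of_max = std::max(2.5f, 3.0f);    //  3.0
  const int qmin = -128;
  const int qmax = 127;
  // Make sure the representable range contains zero.
  const float rmin = std::min(min_of_min, 0.0f);
  const float rmax = std::max(max_of_max, 0.0f);
  const float scale = (rmax - rmin) / static_cast<float>(qmax - qmin);
  const int zero_point = static_cast<int>(std::round(qmin - rmin / scale));
  // scale ~= 0.0157, zero_point ~= -64; both tensors get the same parameters.
  std::printf("scale=%f zero_point=%d\n", scale, zero_point);
  return 0;
}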
View File

@@ -979,6 +979,53 @@ TEST_F(QuantizeArgMaxTest, VerifyArgMax) {
EXPECT_EQ(model_.operator_codes[0]->version, 2);
}
class QuantizeLSTMTest : public QuantizeModelTest {
protected:
QuantizeLSTMTest() {
input_model_ = ReadModel(internal::kLstmCalibrated);
readonly_model_ = input_model_->GetModel();
readonly_model_->UnPackTo(&model_);
}
};
TEST_F(QuantizeLSTMTest, VerifyLSTM) {
// Quantize model.
auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32,
TensorType_FLOAT32, &error_reporter_);
ASSERT_EQ(kTfLiteOk, status);
// Read expected model.
auto expected_fb_model = ReadModel(internal::kLstmQuantized);
auto expected_read_only_model = expected_fb_model->GetModel();
ModelT expected_model;
expected_read_only_model->UnPackTo(&expected_model);
// Comparison.
ASSERT_EQ(model_.subgraphs.size(), expected_model.subgraphs.size());
for (size_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size();
subgraph_idx++) {
const auto graph = model_.subgraphs[subgraph_idx].get();
const auto expected_graph = expected_model.subgraphs[subgraph_idx].get();
ASSERT_EQ(graph->tensors.size(), expected_graph->tensors.size());
for (size_t i = 0; i < graph->tensors.size(); i++) {
const auto tensor = graph->tensors[i].get();
const auto expected_tensor = expected_graph->tensors[i].get();
EXPECT_EQ(tensor->buffer, expected_tensor->buffer);
EXPECT_EQ(tensor->is_variable, expected_tensor->is_variable);
EXPECT_EQ(tensor->shape, expected_tensor->shape);
EXPECT_EQ(tensor->name, expected_tensor->name);
EXPECT_EQ(tensor->type, expected_tensor->type);
}
}
ASSERT_EQ(model_.buffers.size(), expected_model.buffers.size());
for (size_t buffer_idx = 0; buffer_idx < model_.buffers.size();
++buffer_idx) {
const auto buffer = model_.buffers[buffer_idx].get()->data;
const auto expected_buffer = expected_model.buffers[buffer_idx].get()->data;
EXPECT_EQ(buffer, expected_buffer);
}
}
class QuantizeFCTest : public QuantizeModelTest {
protected:
QuantizeFCTest() {

View File

@@ -49,6 +49,9 @@ const char* kModelMixed = "mixed.bin";
const char* kModelSplit = "split.bin";
const char* kLstmCalibrated = "lstm_calibrated.bin";
const char* kLstmQuantized = "lstm_quantized.bin";
int FailOnErrorReporter::Report(const char* format, va_list args) {
char buf[1024];
vsnprintf(buf, sizeof(buf), format, args);

View File

@@ -76,6 +76,10 @@ extern const char* kModelMixed;
// Test model with split op.
extern const char* kModelSplit;
// Test model with LSTM op.
extern const char* kLstmCalibrated;
extern const char* kLstmQuantized;
// An error reporter that fails on testing.
class FailOnErrorReporter : public ErrorReporter {
public:

Binary file not shown.

Binary file not shown.