Add support for partially quantizing models.
PiperOrigin-RevId: 245479701
This commit is contained in: parent ed7ceaa7c9 · commit 9e668b3d69
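For orientation, here is a minimal sketch of how a caller drives the new partial-quantization entry point this commit adds; the wrapper function and the already-unpacked ModelT are hypothetical scaffolding, and it assumes the headers in this tree.

#include "flatbuffers/flatbuffers.h"
#include "tensorflow/lite/tools/optimize/quantize_model.h"

// Hypothetical wrapper, not part of the commit: quantize a model but leave
// ops without a quantized implementation in float (allow_float=true).
TfLiteStatus RunPartialQuantization(tflite::ModelT* model,
                                    tflite::ErrorReporter* error_reporter) {
  flatbuffers::FlatBufferBuilder builder;
  return tflite::optimize::QuantizeModel(
      &builder, model, tflite::TensorType_INT8, tflite::TensorType_INT8,
      /*allow_float=*/true, error_reporter);
}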
@@ -180,6 +180,7 @@ tf_cc_test(
         "//tensorflow/lite/tools/optimize:testdata/argmax.bin",
         "//tensorflow/lite/tools/optimize:testdata/concat.bin",
         "//tensorflow/lite/tools/optimize:testdata/fc.bin",
+        "//tensorflow/lite/tools/optimize:testdata/mixed.bin",
         "//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
@@ -123,16 +123,17 @@ bool HasMinMax(const TensorT* tensor) {
          !tensor->quantization->max.empty();
 }
 
-TfLiteStatus SetOperatorCodeVersion(ModelT* model) {
+void SetOperatorCodeVersion(ModelT* model) {
   for (int i = 0; i < model->operator_codes.size(); ++i) {
     OperatorCodeT* op_code = model->operator_codes[i].get();
     const BuiltinOperator op_buildin_code = op_code->builtin_code;
-    operator_property::OperatorProperty property;
-    TF_LITE_ENSURE_STATUS(
-        operator_property::GetOperatorProperty(op_buildin_code, &property));
-    op_code->version = property.version;
+    operator_property::OperatorProperty property =
+        operator_property::GetOperatorProperty(op_buildin_code);
+    if (property.quantizable) {
+      // Only update the versions of quantizable operations.
+      op_code->version = property.version;
+    }
   }
-  return kTfLiteOk;
 }
 
 }  // namespace utils
@@ -53,8 +53,9 @@ bool IsQuantized(const SubGraphT* subgraph, int tensor_index);
 
 bool HasMinMax(const TensorT* tensor);
 
-// Set version of OperatorCode.
-TfLiteStatus SetOperatorCodeVersion(ModelT* model);
+// Set version of OperatorCode. The version will only be applied for operations
+// that have been quantized.
+void SetOperatorCodeVersion(ModelT* model);
 
 }  // namespace utils
 }  // namespace optimize
@@ -17,123 +17,113 @@ limitations under the License.
 namespace tflite {
 namespace optimize {
 namespace operator_property {
-TfLiteStatus GetOperatorProperty(const BuiltinOperator& op,
-                                 OperatorProperty* property) {
-  // Set up default values.
-  property->per_axis = false;
-  property->per_axis_index = 0;
-  property->arbitrary_inputs = false;
-  property->input_indexes = {};
-  property->output_indexes = {};
-  property->biases = {};
-  property->restrict_same_input_output_scale = false;
-  property->restriction_on_output = false;
-  property->restricted_value_on_output = {0.0, 0.0};
-  property->version = 0;
+OperatorProperty GetOperatorProperty(const BuiltinOperator& op) {
+  OperatorProperty property;
   switch (op) {
     case BuiltinOperator_ADD:
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_ARG_MAX:
-      property->input_indexes = {0};
+      property.input_indexes = {0};
       // ArgMax has no quantizable output.
-      property->version = 2;
-      return kTfLiteOk;
+      property.version = 2;
+      break;
     case BuiltinOperator_AVERAGE_POOL_2D:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
     case BuiltinOperator_CONCATENATION:
-      property->arbitrary_inputs = true;
-      property->input_indexes = {};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 2;
-      return kTfLiteOk;
+      property.arbitrary_inputs = true;
+      property.input_indexes = {};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
     case BuiltinOperator_CONV_2D:
-      property->per_axis = true;
-      property->per_axis_index = 0;
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->biases = {2};
-      property->version = 2;
-      return kTfLiteOk;
+      property.per_axis = true;
+      property.per_axis_index = 0;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 2;
+      break;
     case BuiltinOperator_DEPTHWISE_CONV_2D:
-      property->per_axis = true;
-      property->per_axis_index = 3;
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->biases = {2};
-      property->version = 3;
-      return kTfLiteOk;
+      property.per_axis = true;
+      property.per_axis_index = 3;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 3;
+      break;
     case BuiltinOperator_FULLY_CONNECTED:
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->biases = {2};
-      property->version = 4;
-      return kTfLiteOk;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 4;
+      break;
     case BuiltinOperator_MAX_POOL_2D:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
     case BuiltinOperator_MEAN:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
    case BuiltinOperator_MUL:
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_PAD:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_QUANTIZE:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->version = 1;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 1;
+      break;
     case BuiltinOperator_RESHAPE:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 1;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 1;
+      break;
     case BuiltinOperator_SQUEEZE:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 1;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 1;
+      break;
     case BuiltinOperator_SOFTMAX:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
+      property.input_indexes = {0};
+      property.output_indexes = {0};
       // Softmax requires output with 1/256 as scale and -128 as zero point.
-      property->restriction_on_output = true;
-      property->restricted_value_on_output = {1 / 256.0, -128};
-      property->version = 2;
-      return kTfLiteOk;
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 256.0, -128};
+      property.version = 2;
+      break;
     case BuiltinOperator_TANH:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
+      property.input_indexes = {0};
+      property.output_indexes = {0};
       // Tanh requires output with 1/128 as scale and 0 as zero point.
-      property->restriction_on_output = true;
-      property->restricted_value_on_output = {1 / 128.0, 0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 128.0, 0};
+      property.version = 2;
+      break;
     default:
-      return kTfLiteError;
+      // No quantized implementation exists for this operation.
+      property.quantizable = false;
   }
-  return kTfLiteError;
+  return property;
 }
 
 }  // namespace operator_property
@@ -23,34 +23,35 @@ namespace optimize {
 namespace operator_property {
 
 struct OperatorProperty {
+  // Is a quantized operation currently supported.
+  bool quantizable = true;
   // Per axis.
-  bool per_axis;
+  bool per_axis = false;
   // TODO(jianlijianli): remove dimension index and read it from tensor instead.
-  int per_axis_index;
+  int per_axis_index = 0;
 
   // Op has arbitrary number of inputs, such as concat.
-  bool arbitrary_inputs;
+  bool arbitrary_inputs = false;
   // Input and weight indexes. Unable to separate the two because of ops such as
   // ADD.
-  std::vector<int> input_indexes;
+  std::vector<int> input_indexes = {};
 
   // Output indexes
-  std::vector<int> output_indexes;
+  std::vector<int> output_indexes = {};
 
   // Bias indexes.
-  std::vector<int> biases;
+  std::vector<int> biases = {};
 
   // Constraints.
-  bool restrict_same_input_output_scale;
-  bool restriction_on_output;
-  std::pair<float, float> restricted_value_on_output;
+  bool restrict_same_input_output_scale = false;
+  bool restriction_on_output = false;
+  std::pair<float, float> restricted_value_on_output = {0.0, 0.0};
 
   // Op version.
-  int version;
+  int version = 1;
 };
 
-TfLiteStatus GetOperatorProperty(const BuiltinOperator& op,
-                                 OperatorProperty* property);
+OperatorProperty GetOperatorProperty(const BuiltinOperator& op);
 
 }  // namespace operator_property
 }  // namespace optimize
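A short sketch of the new lookup contract defined above (assuming the operator_property.h in this tree): unsupported ops no longer surface as an error status; they come back with quantizable == false and every other member at its defaulted value.

#include "tensorflow/lite/tools/optimize/operator_property.h"

void DemoOperatorProperty() {
  namespace op = tflite::optimize::operator_property;
  // A supported op: conv is per-axis quantizable with a bias at input 2.
  op::OperatorProperty conv =
      op::GetOperatorProperty(tflite::BuiltinOperator_CONV_2D);
  // conv.quantizable == true, conv.per_axis == true, conv.biases == {2}.

  // An unsupported op falls through the switch: only quantizable flips.
  op::OperatorProperty custom =
      op::GetOperatorProperty(tflite::BuiltinOperator_CUSTOM);
  // custom.quantizable == false; other fields keep their default member
  // initializers (e.g. custom.version == 1).
}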
@@ -210,13 +210,23 @@ int32_t SetOutputType(ModelT* model, SubGraphT* subgraph,
 // For Uint8 input and output, leading op is Quantize (uint8 to
 // int8, can be thought as "requant") and tailing op is also Quantize (int8 to
 // uint8, can be thought as "requant").
-void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
-                            const TensorType& output_type) {
+TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
+                                    const TensorType& output_type,
+                                    ErrorReporter* error_reporter) {
   for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
        subgraph_idx++) {
     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
 
     for (int i = 0; i < subgraph->inputs.size(); ++i) {
       TensorT* tensor = subgraph->tensors[subgraph->inputs[i]].get();
+      // TODO(suharshs): Add support for this case if it ever comes up.
+      if (tensor->type == TensorType_FLOAT32 && input_type != tensor->type) {
+        error_reporter->Report(
+            "Unsupported input type %s for input tensor %d of type %s.",
+            EnumNameTensorType(input_type), subgraph->inputs[i],
+            EnumNameTensorType(tensor->type));
+        return kTfLiteError;
+      }
       const int32_t input_idx =
           SetInputType(model, subgraph, subgraph->inputs[i], input_type);
       if (input_idx < 0) {
@@ -225,6 +235,15 @@ void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
       subgraph->inputs[i] = input_idx;
     }
     for (int i = 0; i < subgraph->outputs.size(); ++i) {
       TensorT* tensor = subgraph->tensors[subgraph->outputs[i]].get();
+      // TODO(suharshs): Add support for this case if it ever comes up.
+      if (tensor->type == TensorType_FLOAT32 && output_type != tensor->type) {
+        error_reporter->Report(
+            "Unsupported output type %s for output tensor %d of type %s.",
+            EnumNameTensorType(output_type), subgraph->outputs[i],
+            EnumNameTensorType(tensor->type));
+        return kTfLiteError;
+      }
       const int32_t output_idx =
           SetOutputType(model, subgraph, subgraph->outputs[i], output_type);
       if (output_idx < 0) {
@@ -233,6 +252,7 @@ void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
       subgraph->outputs[i] = output_idx;
     }
   }
+  return kTfLiteOk;
 }
 
 // Apply constraints to ops if they have any.
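The "requant" described at the top of the SetInputAndOutputTypes hunk above is a pure zero-point shift: uint8 and int8 quantization use the same scale, offset by 128. A standalone illustration of the arithmetic (my own example, not code from this commit):

#include <cstdint>

// real_value = scale * (q - zero_point); moving from uint8 to int8 keeps the
// scale and subtracts 128 from both the value and the zero point, so the
// represented real value is unchanged.
int8_t RequantUint8ToInt8(uint8_t q) {
  return static_cast<int8_t>(static_cast<int32_t>(q) - 128);
}

int32_t Int8ZeroPoint(int32_t uint8_zero_point) {
  return uint8_zero_point - 128;  // e.g. a uint8 zero point of 128 maps to 0
}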
@@ -250,9 +270,11 @@ TfLiteStatus ApplyConstraints(flatbuffers::FlatBufferBuilder* builder,
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      TF_LITE_ENSURE_STATUS(
-          operator_property::GetOperatorProperty(op_code, &property));
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+      if (!property.quantizable) {
+        continue;
+      }
       // Basically only Concat passes this check.
       if (!property.restrict_same_input_output_scale ||
           (property.input_indexes.size() == 1 &&
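For intuition on the constraint this loop enforces (a worked example of mine, not from the commit): concatenation copies raw quantized bytes into one output buffer, so every input and the output must agree on a single (scale, zero_point) for the copied values to keep their real meaning.

#include <cstdint>

// Recover the real value of one quantized element. If input A used scale 0.5
// and input B used scale 0.25, a single concatenated output could not
// dequantize both correctly, hence the same-scale restriction on CONCATENATION.
float Dequantize(int8_t q, float scale, int32_t zero_point) {
  return scale * static_cast<float>(q - zero_point);
}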
@@ -311,10 +333,215 @@ TfLiteStatus ApplyConstraints(flatbuffers::FlatBufferBuilder* builder,
   return kTfLiteOk;
 }
 
+std::vector<int> GetInputIndexes(const OperatorT* op,
+                                 operator_property::OperatorProperty property) {
+  std::vector<int> input_indexes;
+  if (property.arbitrary_inputs || !property.quantizable) {
+    for (int i = 0; i < op->inputs.size(); ++i) {
+      input_indexes.push_back(i);
+    }
+  } else {
+    input_indexes = property.input_indexes;
+  }
+  return input_indexes;
+}
+
+bool ShouldRestrictSameInputOutputScale(
+    operator_property::OperatorProperty property) {
+  return (property.input_indexes.size() == 1 &&
+          property.output_indexes.size() == 1 && property.biases.empty() &&
+          property.restrict_same_input_output_scale);
+}
+
+bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
+  for (const int32_t input_idx : subgraph->inputs) {
+    if (index == input_idx) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Quantize the op input. Will increment op_idx if ops are added.
+TfLiteStatus QuantizeOpInput(ModelT* model, int32_t subgraph_idx,
+                             size_t* op_idx,
+                             operator_property::OperatorProperty property,
+                             int32_t input_idx, ErrorReporter* error_reporter) {
+  SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+  OperatorT* op = subgraph->operators[*op_idx].get();
+  const BuiltinOperator op_code =
+      model->operator_codes[op->opcode_index]->builtin_code;
+  const int32_t tensor_idx = op->inputs[input_idx];
+  TensorT* tensor = subgraph->tensors[tensor_idx].get();
+  const bool is_input_quantized = utils::IsQuantized(subgraph, tensor_idx);
+  if (input_idx >= op->inputs.size()) {
+    error_reporter->Report(
+        "Required input index %d is larger than the input length of op "
+        "%s at index %d in subgraph %d",
+        input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code), *op_idx,
+        subgraph_idx);
+    return kTfLiteError;
+  }
+  if (property.quantizable && !is_input_quantized) {
+    // The operation is quantizable, but the input isn't yet quantized.
+    if (utils::HasBuffer(model, subgraph, tensor_idx)) {
+      if (utils::QuantizeWeight(model, tensor, property.per_axis,
+                                property.per_axis_index) == kTfLiteError) {
+        error_reporter->Report(
+            "Unable to quantize buffer or min/max value for input %d "
+            "in op %s in subgraph %d, node: %d",
+            input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
+        return kTfLiteError;
+      }
+    } else if (utils::HasMinMax(tensor)) {
+      if (IsSubgraphInput(subgraph, tensor_idx)) {
+        utils::QuantizeActivation(tensor);
+      } else {
+        // If the tensor is not a model input, we need to add a Quantize
+        // operation since the preceding op may require a float output.
+        std::unique_ptr<TensorT> op_output;
+        utils::MakeTensor(tensor->name + "_int8", tensor->shape,
+                          TensorType_INT8, &op_output);
+        op_output->quantization = absl::make_unique<QuantizationParametersT>();
+        op_output->quantization->min.push_back(tensor->quantization->min[0]);
+        op_output->quantization->max.push_back(tensor->quantization->max[0]);
+        utils::QuantizeActivation(op_output.get());
+        const int32_t quant_op_output_idx = subgraph->tensors.size();
+        subgraph->tensors.push_back(std::move(op_output));
+        std::unique_ptr<OperatorT> quant_op;
+        utils::MakeQuantizeOperator(model, &quant_op, tensor_idx,
+                                    quant_op_output_idx);
+        subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
+                                   std::move(quant_op));
+        op->inputs[input_idx] = quant_op_output_idx;
+        *op_idx += 1;
+      }
+    } else {
+      error_reporter->Report(
+          "Unable to find buffer or min/max value for input activation "
+          "%d "
+          "in %s in subgraph %d, node: %d",
+          input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
+      return kTfLiteError;
+    }
+  } else if (!property.quantizable && is_input_quantized) {
+    // If the tensor is quantized, we have to add a Dequantize op after
+    // since this op is not quantizable.
+    std::unique_ptr<TensorT> op_output;
+    utils::MakeTensor(tensor->name + "_float", tensor->shape,
+                      TensorType_FLOAT32, &op_output);
+    const int32_t dequant_op_output_idx = subgraph->tensors.size();
+    subgraph->tensors.push_back(std::move(op_output));
+    std::unique_ptr<OperatorT> dequant_op;
+    utils::MakeDequantizeOperator(model, &dequant_op, tensor_idx,
+                                  dequant_op_output_idx);
+    subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
+                               std::move(dequant_op));
+    op->inputs[input_idx] = dequant_op_output_idx;
+    *op_idx += 1;
+  }
+  return kTfLiteOk;
+}
+
+// Quantize the op output.
+TfLiteStatus QuantizeOpOutput(ModelT* model, int32_t subgraph_idx,
+                              int32_t op_idx,
+                              operator_property::OperatorProperty property,
+                              int32_t output_idx,
+                              ErrorReporter* error_reporter) {
+  // If the operator is not quantizable, we don't need to do anything for the
+  // output.
+  if (!property.quantizable) {
+    return kTfLiteOk;
+  }
+  SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+  OperatorT* op = subgraph->operators[op_idx].get();
+  const BuiltinOperator op_code =
+      model->operator_codes[op->opcode_index]->builtin_code;
+  if (output_idx >= op->outputs.size()) {
+    error_reporter->Report(
+        "Required output index %d is larger than the output length of "
+        "op %s at index %d in subgraph %d",
+        output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
+        op_idx, subgraph_idx);
+    return kTfLiteError;
+  }
+
+  TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
+  if (ShouldRestrictSameInputOutputScale(property)) {
+    // Copy quantization parameter. For average pool, max pool, etc
+    // min/max can be different but we want them to be the same.
+    // Get scale and zero point of input.
+    if (property.input_indexes[0] >= op->inputs.size()) {
+      error_reporter->Report(
+          "Required input index %d is larger than the input length of "
+          "op %s at index %d in subgraph %d",
+          property.input_indexes[0], op->inputs.size(),
+          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      return kTfLiteError;
+    }
+    const int input_index = op->inputs[property.input_indexes[0]];
+    TensorT* input_tensor = subgraph->tensors[input_index].get();
+    if (input_tensor->quantization->scale.size() != 1 ||
+        input_tensor->quantization->zero_point.size() != 1 ||
+        input_tensor->quantization->min.size() != 1 ||
+        input_tensor->quantization->max.size() != 1) {
+      error_reporter->Report(
+          "Invalid quantization params for op %s at index %d "
+          "in subgraph %d",
+          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      return kTfLiteError;
+    }
+
+    const float input_scale = input_tensor->quantization->scale[0];
+    const int32_t input_zero_point = input_tensor->quantization->zero_point[0];
+
+    const float min = input_tensor->quantization->min[0];
+    const float max = input_tensor->quantization->max[0];
+    if (utils::HasMinMax(output_tensor)) {
+      if (output_tensor->quantization->min[0] != min ||
+          output_tensor->quantization->max[0] != max) {
+        printf(
+            "Note the output min/max is different from the input min/max "
+            "for op %s at index %d in subgraph %d. This is legal but "
+            "should happen rarely.",
+            EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      }
+    }
+
+    // Apply to output.
+    output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
+    output_tensor->quantization->scale.push_back(input_scale);
+    output_tensor->quantization->zero_point.push_back(input_zero_point);
+    output_tensor->quantization->min.push_back(min);
+    output_tensor->quantization->max.push_back(max);
+    output_tensor->type = TensorType_INT8;
+  } else if (property.restriction_on_output) {
+    const auto scale_and_zp = property.restricted_value_on_output;
+    // Apply to output.
+    output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
+    output_tensor->quantization->scale.push_back(scale_and_zp.first);
+    output_tensor->quantization->zero_point.push_back(scale_and_zp.second);
+    output_tensor->type = TensorType_INT8;
+  } else {
+    // Process regular output that doesn't have any restrictions.
+    if (utils::HasMinMax(output_tensor)) {
+      utils::QuantizeActivation(output_tensor);
+    } else {
+      error_reporter->Report(
+          "Unable to find min/max value for output %d in %s in "
+          "subgraph %d, node: %d",
+          output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
 // Quantize inputs and weights.
 // Because of ops such as lstm, still need to do per op, instead of weights.
 TfLiteStatus QuantizeWeightsInputOutput(flatbuffers::FlatBufferBuilder* builder,
-                                        ModelT* model,
+                                        ModelT* model, bool allow_float,
                                         ErrorReporter* error_reporter) {
   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
        subgraph_idx++) {
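One subtlety in QuantizeOpInput above is the op_idx bookkeeping: splicing a Quantize or Dequantize operator in front of the current one shifts the current operator right by a slot, so the index must be advanced. A stripped-down, self-contained sketch of the pattern (hypothetical element type, not the commit's code):

#include <memory>
#include <string>
#include <vector>

// Insert new_node before position *idx and keep *idx pointing at the node we
// were visiting, mirroring the *op_idx += 1 in QuantizeOpInput.
void SpliceBefore(std::vector<std::unique_ptr<std::string>>* nodes, size_t* idx,
                  std::unique_ptr<std::string> new_node) {
  nodes->insert(nodes->begin() + *idx, std::move(new_node));
  *idx += 1;  // the visited node moved one slot to the right
}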
@@ -323,154 +550,25 @@ TfLiteStatus QuantizeWeightsInputOutput(flatbuffers::FlatBufferBuilder* builder,
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      if (operator_property::GetOperatorProperty(op_code, &property) ==
-          kTfLiteError) {
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+
+      if (!property.quantizable && !allow_float) {
         error_reporter->Report("Quantization not yet supported for op: %s",
                                EnumNameBuiltinOperator(op_code));
         return kTfLiteError;
       }
-      // Quantize weight and inputs.
-      std::vector<int> input_indexes;
-      if (property.arbitrary_inputs) {
-        for (int i = 0; i < op->inputs.size(); ++i) {
-          input_indexes.push_back(i);
-        }
-      } else {
-        input_indexes = property.input_indexes;
+
+      // Quantize operator inputs/weights.
+      for (const int input_idx : GetInputIndexes(op, property)) {
+        TF_LITE_ENSURE_STATUS(QuantizeOpInput(
+            model, subgraph_idx, &op_idx, property, input_idx, error_reporter));
       }
-      for (const int input_idx : input_indexes) {
-        if (input_idx >= op->inputs.size()) {
-          error_reporter->Report(
-              "Required input index %d is larger than the input length of op "
-              "%s at index %d in subgraph %d",
-              input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code),
-              op_idx, subgraph_idx);
-          return kTfLiteError;
-        }
-        TensorT* tensor = subgraph->tensors[op->inputs[input_idx]].get();
-        // Quantize if it is not quantized already as the output of
-        // another op or input of another op.
-        if (!utils::IsQuantized(subgraph, op->inputs[input_idx])) {
-          if (utils::HasBuffer(model, subgraph, op->inputs[input_idx])) {
-            TensorT* tensor = subgraph->tensors[op->inputs[input_idx]].get();
-            utils::QuantizeWeight(model, tensor, property.per_axis,
-                                  property.per_axis_index);
-            continue;
-          }
-          if (utils::HasMinMax(tensor)) {
-            utils::QuantizeActivation(tensor);
-            continue;
-          }
-          // TODO(jianlijianli): Eventually we can insert a dequantize operation
-          // for all inputs and weights here, in the case that min/max is
-          // missing.
-          error_reporter->Report(
-              "Unable to find buffer or min/max value for input activation %d "
-              "in %s in subgraph %d, node: %d",
-              input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
-              op_idx);
-          return kTfLiteError;
-        }
-      }
-      // Quantize output.
+
+      // Quantize operator outputs.
       for (const int output_idx : property.output_indexes) {
-        if (output_idx >= op->outputs.size()) {
-          error_reporter->Report(
-              "Required output index %d is larger than the output length of "
-              "op %s at index %d in subgraph %d",
-              output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
-              op_idx, subgraph_idx);
-          return kTfLiteError;
-        }
-        if (property.input_indexes.size() == 1 &&
-            property.output_indexes.size() == 1 && property.biases.empty() &&
-            property.restrict_same_input_output_scale) {
-          // Copy quantization parameter. For average pool, max pool, etc
-          // min/max can be different but we want them to be the same.
-          // Get scale and zero point of input.
-          if (property.input_indexes[0] >= op->inputs.size()) {
-            error_reporter->Report(
-                "Required input index %d is larger than the input length of "
-                "op %s at index %d in subgraph %d",
-                property.input_indexes[0], op->inputs.size(),
-                EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
-            return kTfLiteError;
-          }
-          const int input_index = op->inputs[property.input_indexes[0]];
-          TensorT* input_tensor = subgraph->tensors[input_index].get();
-          if (input_tensor->quantization->scale.size() != 1 ||
-              input_tensor->quantization->min.size() != 1 ||
-              input_tensor->quantization->max.size() != 1) {
-            error_reporter->Report(
-                "Quantization dimension is not 1 for op %s at index %d in "
-                "subgraph %d",
-                EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
-            return kTfLiteError;
-          }
-          const float input_scale = input_tensor->quantization->scale[0];
-          const float input_zero_point =
-              input_tensor->quantization->zero_point[0];
-          const float min = input_tensor->quantization->min[0];
-          const float max = input_tensor->quantization->max[0];
-
-          // Log a warning when we have to override the min/max (scale and zero
-          // point) of output using input.
-          TensorT* output_tensor =
-              subgraph->tensors[op->outputs[output_idx]].get();
-          if (utils::HasMinMax(output_tensor)) {
-            if (output_tensor->quantization->min[0] != min ||
-                output_tensor->quantization->max[0] != max) {
-              printf(
-                  "Note the output min/max is different from the input min/max "
-                  "for op %s at index %d in subgraph %d. This is legal but "
-                  "should happen rarely. ",
-                  EnumNameBuiltinOperator(op_code), static_cast<int>(op_idx),
-                  static_cast<int>(subgraph_idx));
-            }
-          }
-
-          // Apply to output.
-          output_tensor->quantization =
-              absl::make_unique<QuantizationParametersT>();
-          output_tensor->quantization->scale.push_back(input_scale);
-          output_tensor->quantization->zero_point.push_back(input_zero_point);
-          output_tensor->quantization->min.push_back(min);
-          output_tensor->quantization->max.push_back(max);
-          output_tensor->type = TensorType_INT8;
-          continue;
-        }
-        if (property.restriction_on_output) {
-          const std::pair<float, float> scale_and_zp =
-              property.restricted_value_on_output;
-          // Copy scale and zero point since they are fixed.
-          // Applies to softmax, tanh etc.
-          TensorT* output_tensor =
-              subgraph->tensors[op->outputs[output_idx]].get();
-          output_tensor->quantization =
-              absl::make_unique<QuantizationParametersT>();
-          output_tensor->quantization->scale.push_back(scale_and_zp.first);
-          output_tensor->quantization->zero_point.push_back(
-              scale_and_zp.second);
-          output_tensor->type = TensorType_INT8;
-          continue;
-        }
-
-        // Process regular output that doesn't have any restrictions.
-        TensorT* output_tensor =
-            subgraph->tensors[op->outputs[output_idx]].get();
-        if (utils::HasMinMax(output_tensor)) {
-          utils::QuantizeActivation(output_tensor);
-        } else {
-          // TODO(jianlijianli): Eventually we can insert a dequantize operation
-          // for output here, in the case that min/max is missing.
-          error_reporter->Report(
-              "Unable to find min/max value for output activation %d in %s in "
-              "subgraph %d, node: %d",
-              output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
-              op_idx);
-          return kTfLiteError;
-        }
+        TF_LITE_ENSURE_STATUS(QuantizeOpOutput(
+            model, subgraph_idx, op_idx, property, output_idx, error_reporter));
       }
     }
   }
@@ -487,9 +585,11 @@ TfLiteStatus QuantizeBiases(flatbuffers::FlatBufferBuilder* builder,
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      TF_LITE_ENSURE_STATUS(
-          operator_property::GetOperatorProperty(op_code, &property));
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+      if (!property.quantizable) {
+        continue;
+      }
       for (const int bias_idx : property.biases) {
         if (bias_idx >= op->inputs.size()) {
           error_reporter->Report(
@@ -533,14 +633,15 @@ TfLiteStatus QuantizeBiases(flatbuffers::FlatBufferBuilder* builder,
 // Assumes that the operators in the model have been topologically sorted.
 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            ModelT* model, const TensorType& input_type,
-                           const TensorType& output_type,
+                           const TensorType& output_type, bool allow_float,
                            ErrorReporter* error_reporter) {
   TF_LITE_ENSURE_STATUS(
-      QuantizeWeightsInputOutput(builder, model, error_reporter));
+      QuantizeWeightsInputOutput(builder, model, allow_float, error_reporter));
   TF_LITE_ENSURE_STATUS(ApplyConstraints(builder, model, error_reporter));
   TF_LITE_ENSURE_STATUS(QuantizeBiases(builder, model, error_reporter));
-  TF_LITE_ENSURE_STATUS(utils::SetOperatorCodeVersion(model));
-  SetInputAndOutputTypes(model, input_type, output_type);
+  utils::SetOperatorCodeVersion(model);
+  TF_LITE_ENSURE_STATUS(
+      SetInputAndOutputTypes(model, input_type, output_type, error_reporter));
 
   flatbuffers::Offset<Model> output_model_location =
       Model::Pack(*builder, model);
@@ -549,10 +650,18 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
   return kTfLiteOk;
 }
 
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* model, const TensorType& input_type,
+                           const TensorType& output_type,
+                           ErrorReporter* error_reporter) {
+  return QuantizeModel(builder, model, input_type, output_type,
+                       /*allow_float=*/false, error_reporter);
+}
+
 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            ModelT* model, ErrorReporter* error_reporter) {
   return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32,
-                       error_reporter);
+                       /*allow_float=*/false, error_reporter);
 }
 
 }  // namespace optimize
@@ -44,6 +44,15 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            const TensorType& output_type,
                            ErrorReporter* error_reporter);
 
+// Same as above, but enables allowing float intermediate operations for ops
+// that do not yet support quantization.
+//
+// Note: This is a private API, subject to change.
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* input_model, const TensorType& input_type,
+                           const TensorType& output_type, bool allow_float,
+                           ErrorReporter* error_reporter);
+
 }  // namespace optimize
 }  // namespace tflite
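The pre-existing overloads delegate with allow_float=false, so existing callers keep the strict behavior; a quick sketch of that default path (same hypothetical scaffolding as the earlier sketch):

#include "flatbuffers/flatbuffers.h"
#include "tensorflow/lite/tools/optimize/quantize_model.h"

// Hypothetical wrapper: with the strict overloads, a model containing an op
// without a quantized implementation fails with
// "Quantization not yet supported for op: ...".
TfLiteStatus RunStrictQuantization(tflite::ModelT* model,
                                   tflite::ErrorReporter* error_reporter) {
  flatbuffers::FlatBufferBuilder builder;
  return tflite::optimize::QuantizeModel(&builder, model, error_reporter);
}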
@@ -934,6 +934,42 @@ TEST_F(QuantizeFCTest, VerifyFC) {
   EXPECT_EQ(model_.operator_codes[1]->version, 1);
 }
 
+class QuantizeCustomOpTest : public QuantizeModelTest {
+ protected:
+  QuantizeCustomOpTest() {
+    input_model_ = ReadModel(internal::kModelMixed);
+    readonly_model_ = input_model_->GetModel();
+    readonly_model_->UnPackTo(&model_);
+  }
+};
+
+TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) {
+  auto status =
+      QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8,
+                    /*allow_float=*/true, &error_reporter_);
+  ASSERT_EQ(kTfLiteOk, status);
+  const auto& subgraph = model_.subgraphs[0];
+  auto float_graph = readonly_model_->subgraphs()->Get(0);
+  // The original model: reshape->custom->custom->squeeze.
+  ASSERT_EQ(float_graph->operators()->size(), 4);
+  // The resulting model should be:
+  // reshape->dequantize->custom->custom->quantize->squeeze.
+  ASSERT_EQ(subgraph->operators.size(), 6);
+  const std::vector<BuiltinOperator> op_codes = {
+      BuiltinOperator_RESHAPE, BuiltinOperator_DEQUANTIZE,
+      BuiltinOperator_CUSTOM,  BuiltinOperator_CUSTOM,
+      BuiltinOperator_QUANTIZE, BuiltinOperator_SQUEEZE};
+  const std::vector<TensorType> op_input_types = {
+      TensorType_INT8,    TensorType_INT8,    TensorType_FLOAT32,
+      TensorType_FLOAT32, TensorType_FLOAT32, TensorType_INT8};
+  for (int i = 0; i < subgraph->operators.size(); ++i) {
+    OperatorT* op = subgraph->operators[i].get();
+    ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code,
+              op_codes[i]);
+    ASSERT_EQ(subgraph->tensors[op->inputs[0]]->type, op_input_types[i]);
+  }
+}
+
 }  // namespace
 }  // namespace optimize
 }  // namespace tflite
@@ -45,6 +45,8 @@ const char* kModelWithArgMaxOp = "argmax.bin";
 
 const char* kModelWithFCOp = "fc.bin";
 
+const char* kModelMixed = "mixed.bin";
+
 int FailOnErrorReporter::Report(const char* format, va_list args) {
   char buf[1024];
   vsnprintf(buf, sizeof(buf), format, args);
|
||||
// Test model with a argmax op.
|
||||
extern const char* kModelWithFCOp;
|
||||
|
||||
// Test model with mixed quantizable and un-quantizable ops.
|
||||
// reshape->custom->custom->squeeze.
|
||||
extern const char* kModelMixed;
|
||||
|
||||
// An error reporter that fails on testing.
|
||||
class FailOnErrorReporter : public ErrorReporter {
|
||||
public:
|
||||
|
BIN tensorflow/lite/tools/optimize/testdata/mixed.bin (vendored, new file; binary file not shown)