diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD
index 4db1eee439a..22a473bf9dc 100644
--- a/tensorflow/lite/tools/optimize/BUILD
+++ b/tensorflow/lite/tools/optimize/BUILD
@@ -180,6 +180,7 @@ tf_cc_test(
         "//tensorflow/lite/tools/optimize:testdata/argmax.bin",
         "//tensorflow/lite/tools/optimize:testdata/concat.bin",
         "//tensorflow/lite/tools/optimize:testdata/fc.bin",
+        "//tensorflow/lite/tools/optimize:testdata/mixed.bin",
         "//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
diff --git a/tensorflow/lite/tools/optimize/model_utils.cc b/tensorflow/lite/tools/optimize/model_utils.cc
index 96e42eafb32..e0755e45c96 100644
--- a/tensorflow/lite/tools/optimize/model_utils.cc
+++ b/tensorflow/lite/tools/optimize/model_utils.cc
@@ -123,16 +123,17 @@ bool HasMinMax(const TensorT* tensor) {
          !tensor->quantization->max.empty();
 }
 
-TfLiteStatus SetOperatorCodeVersion(ModelT* model) {
+void SetOperatorCodeVersion(ModelT* model) {
   for (int i = 0; i < model->operator_codes.size(); ++i) {
     OperatorCodeT* op_code = model->operator_codes[i].get();
     const BuiltinOperator op_buildin_code = op_code->builtin_code;
-    operator_property::OperatorProperty property;
-    TF_LITE_ENSURE_STATUS(
-        operator_property::GetOperatorProperty(op_buildin_code, &property));
-    op_code->version = property.version;
+    operator_property::OperatorProperty property =
+        operator_property::GetOperatorProperty(op_buildin_code);
+    if (property.quantizable) {
+      // Only update the versions of quantizable operations.
+      op_code->version = property.version;
+    }
   }
-  return kTfLiteOk;
 }
 
 }  // namespace utils
diff --git a/tensorflow/lite/tools/optimize/model_utils.h b/tensorflow/lite/tools/optimize/model_utils.h
index 23380fc3492..6583d6a10db 100644
--- a/tensorflow/lite/tools/optimize/model_utils.h
+++ b/tensorflow/lite/tools/optimize/model_utils.h
@@ -53,8 +53,9 @@ bool IsQuantized(const SubGraphT* subgraph, int tensor_index);
 
 bool HasMinMax(const TensorT* tensor);
 
-// Set version of OperatorCode.
-TfLiteStatus SetOperatorCodeVersion(ModelT* model);
+// Set version of OperatorCode. The version is only updated for operations
+// that are quantizable.
+void SetOperatorCodeVersion(ModelT* model);
 
 }  // namespace utils
 }  // namespace optimize
diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc
index 940912f0d25..3703ffdaef8 100644
--- a/tensorflow/lite/tools/optimize/operator_property.cc
+++ b/tensorflow/lite/tools/optimize/operator_property.cc
@@ -17,123 +17,113 @@ limitations under the License.
 namespace tflite {
 namespace optimize {
 namespace operator_property {
-TfLiteStatus GetOperatorProperty(const BuiltinOperator& op,
-                                 OperatorProperty* property) {
-  // Set up default values.
-  property->per_axis = false;
-  property->per_axis_index = 0;
-  property->arbitrary_inputs = false;
-  property->input_indexes = {};
-  property->output_indexes = {};
-  property->biases = {};
-  property->restrict_same_input_output_scale = false;
-  property->restriction_on_output = false;
-  property->restricted_value_on_output = {0.0, 0.0};
-  property->version = 0;
+OperatorProperty GetOperatorProperty(const BuiltinOperator& op) {
+  OperatorProperty property;
   switch (op) {
     case BuiltinOperator_ADD:
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_ARG_MAX:
-      property->input_indexes = {0};
+      property.input_indexes = {0};
       // ArgMax has no quantizable output.
-      property->version = 2;
-      return kTfLiteOk;
+      property.version = 2;
+      break;
    case BuiltinOperator_AVERAGE_POOL_2D:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
     case BuiltinOperator_CONCATENATION:
-      property->arbitrary_inputs = true;
-      property->input_indexes = {};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 2;
-      return kTfLiteOk;
+      property.arbitrary_inputs = true;
+      property.input_indexes = {};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
     case BuiltinOperator_CONV_2D:
-      property->per_axis = true;
-      property->per_axis_index = 0;
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->biases = {2};
-      property->version = 2;
-      return kTfLiteOk;
+      property.per_axis = true;
+      property.per_axis_index = 0;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 2;
+      break;
     case BuiltinOperator_DEPTHWISE_CONV_2D:
-      property->per_axis = true;
-      property->per_axis_index = 3;
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->biases = {2};
-      property->version = 3;
-      return kTfLiteOk;
+      property.per_axis = true;
+      property.per_axis_index = 3;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 3;
+      break;
     case BuiltinOperator_FULLY_CONNECTED:
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->biases = {2};
-      property->version = 4;
-      return kTfLiteOk;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 4;
+      break;
     case BuiltinOperator_MAX_POOL_2D:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
     case BuiltinOperator_MEAN:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_MUL:
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_PAD:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_QUANTIZE:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->version = 1;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 1;
+      break;
     case BuiltinOperator_RESHAPE:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 1;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 1;
+      break;
     case BuiltinOperator_SQUEEZE:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 1;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 1;
+      break;
     case BuiltinOperator_SOFTMAX:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
+      property.input_indexes = {0};
+      property.output_indexes = {0};
       // Softmax requires output with 1/256 as scale and -128 as zero point.
-      property->restriction_on_output = true;
-      property->restricted_value_on_output = {1 / 256.0, -128};
-      property->version = 2;
-      return kTfLiteOk;
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 256.0, -128};
+      property.version = 2;
+      break;
     case BuiltinOperator_TANH:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
+      property.input_indexes = {0};
+      property.output_indexes = {0};
       // Tanh requires output with 1/128 as scale and 0 as zero point.
-      property->restriction_on_output = true;
-      property->restricted_value_on_output = {1 / 128.0, 0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 128.0, 0};
+      property.version = 2;
+      break;
     default:
-      return kTfLiteError;
+      // No quantized implementation exists for this operation.
+      property.quantizable = false;
   }
-  return kTfLiteError;
+  return property;
 }
 
 }  // namespace operator_property
diff --git a/tensorflow/lite/tools/optimize/operator_property.h b/tensorflow/lite/tools/optimize/operator_property.h
index 7de9545b765..7d01ab42ea6 100644
--- a/tensorflow/lite/tools/optimize/operator_property.h
+++ b/tensorflow/lite/tools/optimize/operator_property.h
@@ -23,34 +23,35 @@ namespace optimize {
 namespace operator_property {
 
 struct OperatorProperty {
+  // Whether a quantized operation is currently supported.
+  bool quantizable = true;
   // Per axis.
-  bool per_axis;
+  bool per_axis = false;
   // TODO(jianlijianli): remove dimension index and read it from tensor instead.
-  int per_axis_index;
+  int per_axis_index = 0;
   // Op has arbitrary number of inputs, such as concat.
-  bool arbitrary_inputs;
+  bool arbitrary_inputs = false;
   // Input and weight indexes. Unable to separate the two because of ops such as
   // ADD.
-  std::vector<int> input_indexes;
+  std::vector<int> input_indexes = {};
   // Output indexes
-  std::vector<int> output_indexes;
+  std::vector<int> output_indexes = {};
   // Bias indexes.
-  std::vector<int> biases;
+  std::vector<int> biases = {};
   // Constraints.
-  bool restrict_same_input_output_scale;
-  bool restriction_on_output;
-  std::pair<float, int> restricted_value_on_output;
+  bool restrict_same_input_output_scale = false;
+  bool restriction_on_output = false;
+  std::pair<float, int> restricted_value_on_output = {0.0, 0.0};
   // Op version.
-  int version;
+  int version = 1;
 };
 
-TfLiteStatus GetOperatorProperty(const BuiltinOperator& op,
-                                 OperatorProperty* property);
+OperatorProperty GetOperatorProperty(const BuiltinOperator& op);
 
 }  // namespace operator_property
 }  // namespace optimize
diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc
index 4e50950d696..058dc316d2f 100644
--- a/tensorflow/lite/tools/optimize/quantize_model.cc
+++ b/tensorflow/lite/tools/optimize/quantize_model.cc
@@ -210,13 +210,23 @@ int32_t SetOutputType(ModelT* model, SubGraphT* subgraph,
 // For Uint8 input and output, leading op is Quantize (uint8 to
 // int8, can be thought as "requant") and tailing op is also Quantize (int8 to
 // uint8, can be thought as "requant").
-void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
-                            const TensorType& output_type) {
+TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
+                                    const TensorType& output_type,
+                                    ErrorReporter* error_reporter) {
   for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
        subgraph_idx++) {
     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
 
     for (int i = 0; i < subgraph->inputs.size(); ++i) {
+      TensorT* tensor = subgraph->tensors[subgraph->inputs[i]].get();
+      // TODO(suharshs): Add support for this case if it ever comes up.
+      if (tensor->type == TensorType_FLOAT32 && input_type != tensor->type) {
+        error_reporter->Report(
+            "Unsupported input type %s for input tensor %d of type %s.",
+            EnumNameTensorType(input_type), subgraph->inputs[i],
+            EnumNameTensorType(tensor->type));
+        return kTfLiteError;
+      }
       const int32_t input_idx =
           SetInputType(model, subgraph, subgraph->inputs[i], input_type);
       if (input_idx < 0) {
@@ -225,6 +235,15 @@ void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
       subgraph->inputs[i] = input_idx;
     }
     for (int i = 0; i < subgraph->outputs.size(); ++i) {
+      TensorT* tensor = subgraph->tensors[subgraph->outputs[i]].get();
+      // TODO(suharshs): Add support for this case if it ever comes up.
+      if (tensor->type == TensorType_FLOAT32 && output_type != tensor->type) {
+        error_reporter->Report(
+            "Unsupported output type %s for output tensor %d of type %s.",
+            EnumNameTensorType(output_type), subgraph->outputs[i],
+            EnumNameTensorType(tensor->type));
+        return kTfLiteError;
+      }
       const int32_t output_idx =
           SetOutputType(model, subgraph, subgraph->outputs[i], output_type);
       if (output_idx < 0) {
@@ -233,6 +252,7 @@ void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
       subgraph->outputs[i] = output_idx;
     }
   }
+  return kTfLiteOk;
 }
 
 // Apply constraints to ops if they have any.
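A note on the `operator_property` change above: `GetOperatorProperty` now returns the struct by value, with `quantizable` defaulting to true and flipped to false in the `default:` case, so call sites test a flag instead of propagating a status. A minimal sketch of the new call-site pattern follows; the surrounding function and its `ops` list are hypothetical, for illustration only:

```cpp
#include <vector>

#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/tools/optimize/operator_property.h"

namespace {

// Hypothetical pass skeleton: skip ops that have no quantized kernel yet.
void VisitOps(const std::vector<tflite::BuiltinOperator>& ops) {
  for (const tflite::BuiltinOperator op_code : ops) {
    tflite::optimize::operator_property::OperatorProperty property =
        tflite::optimize::operator_property::GetOperatorProperty(op_code);
    if (!property.quantizable) {
      continue;  // Before this patch, GetOperatorProperty returned an error.
    }
    // Defaults now come from the in-class initializers (e.g. version = 1),
    // so only fields a switch case sets differ from the defaults.
    (void)property.version;
  }
}

}  // namespace
```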
@@ -250,9 +270,11 @@ TfLiteStatus ApplyConstraints(flatbuffers::FlatBufferBuilder* builder,
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      TF_LITE_ENSURE_STATUS(
-          operator_property::GetOperatorProperty(op_code, &property));
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+      if (!property.quantizable) {
+        continue;
+      }
       // Basically only Concat passes this check.
       if (!property.restrict_same_input_output_scale ||
           (property.input_indexes.size() == 1 &&
@@ -311,10 +333,215 @@ TfLiteStatus ApplyConstraints(flatbuffers::FlatBufferBuilder* builder,
   return kTfLiteOk;
 }
 
+std::vector<int> GetInputIndexes(const OperatorT* op,
+                                 operator_property::OperatorProperty property) {
+  std::vector<int> input_indexes;
+  if (property.arbitrary_inputs || !property.quantizable) {
+    for (int i = 0; i < op->inputs.size(); ++i) {
+      input_indexes.push_back(i);
+    }
+  } else {
+    input_indexes = property.input_indexes;
+  }
+  return input_indexes;
+}
+
+bool ShouldRestrictSameInputOutputScale(
+    operator_property::OperatorProperty property) {
+  return (property.input_indexes.size() == 1 &&
+          property.output_indexes.size() == 1 && property.biases.empty() &&
+          property.restrict_same_input_output_scale);
+}
+
+bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
+  for (const int32_t input_idx : subgraph->inputs) {
+    if (index == input_idx) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Quantize the op input. Will increment op_idx if ops are added.
+TfLiteStatus QuantizeOpInput(ModelT* model, int32_t subgraph_idx,
+                             size_t* op_idx,
+                             operator_property::OperatorProperty property,
+                             int32_t input_idx, ErrorReporter* error_reporter) {
+  SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+  OperatorT* op = subgraph->operators[*op_idx].get();
+  const BuiltinOperator op_code =
+      model->operator_codes[op->opcode_index]->builtin_code;
+  // Validate the index before dereferencing the input tensor.
+  if (input_idx >= op->inputs.size()) {
+    error_reporter->Report(
+        "Required input index %d is larger than the input length %d of op "
+        "%s at index %d in subgraph %d",
+        input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code),
+        *op_idx, subgraph_idx);
+    return kTfLiteError;
+  }
+  const int32_t tensor_idx = op->inputs[input_idx];
+  TensorT* tensor = subgraph->tensors[tensor_idx].get();
+  const bool is_input_quantized = utils::IsQuantized(subgraph, tensor_idx);
+  if (property.quantizable && !is_input_quantized) {
+    // The operation is quantizable, but the input isn't yet quantized.
+    if (utils::HasBuffer(model, subgraph, tensor_idx)) {
+      if (utils::QuantizeWeight(model, tensor, property.per_axis,
+                                property.per_axis_index) == kTfLiteError) {
+        error_reporter->Report(
+            "Unable to quantize buffer or min/max value for input %d "
+            "in op %s in subgraph %d, node: %d",
+            input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
+            *op_idx);
+        return kTfLiteError;
+      }
+    } else if (utils::HasMinMax(tensor)) {
+      if (IsSubgraphInput(subgraph, tensor_idx)) {
+        utils::QuantizeActivation(tensor);
+      } else {
+        // If the tensor is not a model input, we need to add a Quantize
+        // operation since the preceding op may require a float output.
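+        // The new Quantize op requantizes the float activation to int8: its
+        // output tensor is created below and wired in as this op's input.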
+        std::unique_ptr<TensorT> op_output;
+        utils::MakeTensor(tensor->name + "_int8", tensor->shape,
+                          TensorType_INT8, &op_output);
+        op_output->quantization = absl::make_unique<QuantizationParametersT>();
+        op_output->quantization->min.push_back(tensor->quantization->min[0]);
+        op_output->quantization->max.push_back(tensor->quantization->max[0]);
+        utils::QuantizeActivation(op_output.get());
+        const int32_t quant_op_output_idx = subgraph->tensors.size();
+        subgraph->tensors.push_back(std::move(op_output));
+        std::unique_ptr<OperatorT> quant_op;
+        utils::MakeQuantizeOperator(model, &quant_op, tensor_idx,
+                                    quant_op_output_idx);
+        subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
+                                   std::move(quant_op));
+        op->inputs[input_idx] = quant_op_output_idx;
+        *op_idx += 1;
+      }
+    } else {
+      error_reporter->Report(
+          "Unable to find buffer or min/max value for input activation %d "
+          "in %s in subgraph %d, node: %d",
+          input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
+      return kTfLiteError;
+    }
+  } else if (!property.quantizable && is_input_quantized) {
+    // If the tensor is quantized, we have to add a Dequantize op after
+    // since this op is not quantizable.
+    std::unique_ptr<TensorT> op_output;
+    utils::MakeTensor(tensor->name + "_float", tensor->shape,
+                      TensorType_FLOAT32, &op_output);
+    const int32_t dequant_op_output_idx = subgraph->tensors.size();
+    subgraph->tensors.push_back(std::move(op_output));
+    std::unique_ptr<OperatorT> dequant_op;
+    utils::MakeDequantizeOperator(model, &dequant_op, tensor_idx,
+                                  dequant_op_output_idx);
+    subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
+                               std::move(dequant_op));
+    op->inputs[input_idx] = dequant_op_output_idx;
+    *op_idx += 1;
+  }
+  return kTfLiteOk;
+}
+
+// Quantize the op output.
+TfLiteStatus QuantizeOpOutput(ModelT* model, int32_t subgraph_idx,
+                              int32_t op_idx,
+                              operator_property::OperatorProperty property,
+                              int32_t output_idx,
+                              ErrorReporter* error_reporter) {
+  // If the operator is not quantizable, we don't need to do anything for the
+  // output.
+  if (!property.quantizable) {
+    return kTfLiteOk;
+  }
+  SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+  OperatorT* op = subgraph->operators[op_idx].get();
+  const BuiltinOperator op_code =
+      model->operator_codes[op->opcode_index]->builtin_code;
+  if (output_idx >= op->outputs.size()) {
+    error_reporter->Report(
+        "Required output index %d is larger than the output length %d of "
+        "op %s at index %d in subgraph %d",
+        output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
+        op_idx, subgraph_idx);
+    return kTfLiteError;
+  }
+
+  TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
+  if (ShouldRestrictSameInputOutputScale(property)) {
+    // Copy the quantization parameters. For average pool, max pool, etc. the
+    // min/max can be different, but we want them to be the same.
+    // Get scale and zero point of input.
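+    // Validate the single input below, then copy its scale/zero point and
+    // min/max verbatim onto the output tensor.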
+    if (property.input_indexes[0] >= op->inputs.size()) {
+      error_reporter->Report(
+          "Required input index %d is larger than the input length %d of "
+          "op %s at index %d in subgraph %d",
+          property.input_indexes[0], op->inputs.size(),
+          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      return kTfLiteError;
+    }
+    const int input_index = op->inputs[property.input_indexes[0]];
+    TensorT* input_tensor = subgraph->tensors[input_index].get();
+    if (input_tensor->quantization->scale.size() != 1 ||
+        input_tensor->quantization->zero_point.size() != 1 ||
+        input_tensor->quantization->min.size() != 1 ||
+        input_tensor->quantization->max.size() != 1) {
+      error_reporter->Report(
+          "Invalid quantization params for op %s at index %d "
+          "in subgraph %d",
+          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      return kTfLiteError;
+    }
+
+    const float input_scale = input_tensor->quantization->scale[0];
+    const int32_t input_zero_point = input_tensor->quantization->zero_point[0];
+
+    const float min = input_tensor->quantization->min[0];
+    const float max = input_tensor->quantization->max[0];
+    if (utils::HasMinMax(output_tensor)) {
+      if (output_tensor->quantization->min[0] != min ||
+          output_tensor->quantization->max[0] != max) {
+        printf(
+            "Note the output min/max is different from the input min/max "
+            "for op %s at index %d in subgraph %d. This is legal but "
+            "should happen rarely.",
+            EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      }
+    }
+
+    // Apply to output.
+    output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
+    output_tensor->quantization->scale.push_back(input_scale);
+    output_tensor->quantization->zero_point.push_back(input_zero_point);
+    output_tensor->quantization->min.push_back(min);
+    output_tensor->quantization->max.push_back(max);
+    output_tensor->type = TensorType_INT8;
+  } else if (property.restriction_on_output) {
+    const auto scale_and_zp = property.restricted_value_on_output;
+    // Apply to output.
+    output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
+    output_tensor->quantization->scale.push_back(scale_and_zp.first);
+    output_tensor->quantization->zero_point.push_back(scale_and_zp.second);
+    output_tensor->type = TensorType_INT8;
+  } else {
+    // Process regular output that doesn't have any restrictions.
+    if (utils::HasMinMax(output_tensor)) {
+      utils::QuantizeActivation(output_tensor);
+    } else {
+      error_reporter->Report(
+          "Unable to find min/max value for output %d in %s in "
+          "subgraph %d, node: %d",
+          output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
 // Quantize inputs and weights.
 // Because of ops such as lstm, still need to do per op, instead of weights.
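+// When allow_float is false, an op without a quantized implementation is an
+// error; when true, it is kept in float and bracketed by the Dequantize ops
+// inserted in QuantizeOpInput above.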
 TfLiteStatus QuantizeWeightsInputOutput(flatbuffers::FlatBufferBuilder* builder,
-                                        ModelT* model,
+                                        ModelT* model, bool allow_float,
                                         ErrorReporter* error_reporter) {
   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
        subgraph_idx++) {
@@ -323,154 +550,25 @@ TfLiteStatus QuantizeWeightsInputOutput(flatbuffers::FlatBufferBuilder* builder,
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      if (operator_property::GetOperatorProperty(op_code, &property) ==
-          kTfLiteError) {
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+
+      if (!property.quantizable && !allow_float) {
         error_reporter->Report("Quantization not yet supported for op: %s",
                                EnumNameBuiltinOperator(op_code));
         return kTfLiteError;
       }
-      // Quantize weight and inputs.
-      std::vector<int> input_indexes;
-      if (property.arbitrary_inputs) {
-        for (int i = 0; i < op->inputs.size(); ++i) {
-          input_indexes.push_back(i);
-        }
-      } else {
-        input_indexes = property.input_indexes;
+
+      // Quantize operator inputs/weights.
+      for (const int input_idx : GetInputIndexes(op, property)) {
+        TF_LITE_ENSURE_STATUS(QuantizeOpInput(
+            model, subgraph_idx, &op_idx, property, input_idx, error_reporter));
       }
-      for (const int input_idx : input_indexes) {
-        if (input_idx >= op->inputs.size()) {
-          error_reporter->Report(
-              "Required input index %d is larger than the input length of op "
-              "%s at index %d in subgraph %d",
-              input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code),
-              op_idx, subgraph_idx);
-          return kTfLiteError;
-        }
-        TensorT* tensor = subgraph->tensors[op->inputs[input_idx]].get();
-        // Quantize if it is not quantized already as the output of
-        // another op or input of another op.
-        if (!utils::IsQuantized(subgraph, op->inputs[input_idx])) {
-          if (utils::HasBuffer(model, subgraph, op->inputs[input_idx])) {
-            TensorT* tensor = subgraph->tensors[op->inputs[input_idx]].get();
-            utils::QuantizeWeight(model, tensor, property.per_axis,
-                                  property.per_axis_index);
-            continue;
-          }
-          if (utils::HasMinMax(tensor)) {
-            utils::QuantizeActivation(tensor);
-            continue;
-          }
-          // TODO(jianlijianli): Eventually we can insert a dequantize operation
-          // for all inputs and weights here, in the case that min/max is
-          // missing.
-          error_reporter->Report(
-              "Unable to find buffer or min/max value for input activation %d "
-              "in %s in subgraph %d, node: %d",
-              input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
-              op_idx);
-          return kTfLiteError;
-        }
-      }
-      // Quantize output.
+
+      // Quantize operator outputs.
       for (const int output_idx : property.output_indexes) {
-        if (output_idx >= op->outputs.size()) {
-          error_reporter->Report(
-              "Requaired output index %d is larger than the output length of "
-              "op %s at index %d in subgraph %d",
-              output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
-              op_idx, subgraph_idx);
-          return kTfLiteError;
-        }
-        if (property.input_indexes.size() == 1 &&
-            property.output_indexes.size() == 1 && property.biases.empty() &&
-            property.restrict_same_input_output_scale) {
-          // Copy quantization parameter. For average pool, max pool, etc
-          // min/max can be different but we want them to be the same.
-          // Get scale and zero point of input.
-          if (property.input_indexes[0] >= op->inputs.size()) {
-            error_reporter->Report(
-                "Requaired input index %d is larger than the input length of "
-                "op %s at index %d in subgraph %d",
-                property.input_indexes[0], op->inputs.size(),
-                EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
-            return kTfLiteError;
-          }
-          const int input_index = op->inputs[property.input_indexes[0]];
-          TensorT* input_tensor = subgraph->tensors[input_index].get();
-          if (input_tensor->quantization->scale.size() != 1 ||
-              input_tensor->quantization->min.size() != 1 ||
-              input_tensor->quantization->max.size() != 1) {
-            error_reporter->Report(
-                "Quantization dimension is not 1 for op %s at index %d in "
-                "subgraph %d",
-                EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
-            return kTfLiteError;
-          }
-          const float input_scale = input_tensor->quantization->scale[0];
-          const float input_zero_point =
-              input_tensor->quantization->zero_point[0];
-          const float min = input_tensor->quantization->min[0];
-          const float max = input_tensor->quantization->max[0];
-
-          // Log a warning when we have to override the min/max (scale and zero
-          // point) of output using input.
-          TensorT* output_tensor =
-              subgraph->tensors[op->outputs[output_idx]].get();
-          if (utils::HasMinMax(output_tensor)) {
-            if (output_tensor->quantization->min[0] != min ||
-                output_tensor->quantization->max[0] != max) {
-              printf(
-                  "Note the output min/max is different from the input min/max "
-                  "for op %s at index %d in subgraph %d. This is legal but "
-                  "should happens rarely. ",
-                  EnumNameBuiltinOperator(op_code), static_cast<int>(op_idx),
-                  static_cast<int>(subgraph_idx));
-            }
-          }
-
-          // Apply to output.
-          output_tensor->quantization =
-              absl::make_unique<QuantizationParametersT>();
-          output_tensor->quantization->scale.push_back(input_scale);
-          output_tensor->quantization->zero_point.push_back(input_zero_point);
-          output_tensor->quantization->min.push_back(min);
-          output_tensor->quantization->max.push_back(max);
-          output_tensor->type = TensorType_INT8;
-          continue;
-        }
-        if (property.restriction_on_output) {
-          const std::pair<float, int> scale_and_zp =
-              property.restricted_value_on_output;
-          // Copy scale and zero point since they are fixed.
-          // Applies to softmax, tanh etc.
-          TensorT* output_tensor =
-              subgraph->tensors[op->outputs[output_idx]].get();
-          output_tensor->quantization =
-              absl::make_unique<QuantizationParametersT>();
-          output_tensor->quantization->scale.push_back(scale_and_zp.first);
-          output_tensor->quantization->zero_point.push_back(
-              scale_and_zp.second);
-          output_tensor->type = TensorType_INT8;
-          continue;
-        }
-
-        // Process regular output that doesn't have any restrictions.
-        TensorT* output_tensor =
-            subgraph->tensors[op->outputs[output_idx]].get();
-        if (utils::HasMinMax(output_tensor)) {
-          utils::QuantizeActivation(output_tensor);
-        } else {
-          // TODO(jianlijianli): Eventually we can insert a dequantize operation
-          // for output here, in the case that min/max is missing.
-          error_reporter->Report(
-              "Unable to find min/max value for output activation %d in %s in "
-              "subgraph %d, node: %d",
-              output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
-              op_idx);
-          return kTfLiteError;
-        }
+        TF_LITE_ENSURE_STATUS(QuantizeOpOutput(
+            model, subgraph_idx, op_idx, property, output_idx, error_reporter));
       }
     }
   }
@@ -487,9 +585,11 @@ TfLiteStatus QuantizeBiases(flatbuffers::FlatBufferBuilder* builder,
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      TF_LITE_ENSURE_STATUS(
-          operator_property::GetOperatorProperty(op_code, &property));
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+      if (!property.quantizable) {
+        continue;
+      }
       for (const int bias_idx : property.biases) {
         if (bias_idx >= op->inputs.size()) {
           error_reporter->Report(
@@ -533,14 +633,15 @@ TfLiteStatus QuantizeBiases(flatbuffers::FlatBufferBuilder* builder,
 // Assumes that the operators in the model have been topologically sorted.
 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            ModelT* model, const TensorType& input_type,
-                           const TensorType& output_type,
+                           const TensorType& output_type, bool allow_float,
                            ErrorReporter* error_reporter) {
   TF_LITE_ENSURE_STATUS(
-      QuantizeWeightsInputOutput(builder, model, error_reporter));
+      QuantizeWeightsInputOutput(builder, model, allow_float, error_reporter));
   TF_LITE_ENSURE_STATUS(ApplyConstraints(builder, model, error_reporter));
   TF_LITE_ENSURE_STATUS(QuantizeBiases(builder, model, error_reporter));
-  TF_LITE_ENSURE_STATUS(utils::SetOperatorCodeVersion(model));
-  SetInputAndOutputTypes(model, input_type, output_type);
+  utils::SetOperatorCodeVersion(model);
+  TF_LITE_ENSURE_STATUS(
+      SetInputAndOutputTypes(model, input_type, output_type, error_reporter));
 
   flatbuffers::Offset<Model> output_model_location =
       Model::Pack(*builder, model);
@@ -549,10 +650,18 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
   return kTfLiteOk;
 }
 
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* model, const TensorType& input_type,
+                           const TensorType& output_type,
+                           ErrorReporter* error_reporter) {
+  return QuantizeModel(builder, model, input_type, output_type,
+                       /*allow_float=*/false, error_reporter);
+}
+
 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            ModelT* model, ErrorReporter* error_reporter) {
   return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32,
-                       error_reporter);
+                       /*allow_float=*/false, error_reporter);
 }
 
 }  // namespace optimize
diff --git a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h
index 58e5899afad..d6519797c26 100644
--- a/tensorflow/lite/tools/optimize/quantize_model.h
+++ b/tensorflow/lite/tools/optimize/quantize_model.h
@@ -44,6 +44,15 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            const TensorType& output_type,
                            ErrorReporter* error_reporter);
 
+// Same as above, but allows float intermediate operations for ops that do
+// not yet support quantization.
+//
+// Note: This is a private API, subject to change.
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* input_model, const TensorType& input_type,
+                           const TensorType& output_type, bool allow_float,
+                           ErrorReporter* error_reporter);
+
 }  // namespace optimize
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc
index c6ef2d4792a..f41bf077cd3 100644
--- a/tensorflow/lite/tools/optimize/quantize_model_test.cc
+++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc
@@ -934,6 +934,42 @@ TEST_F(QuantizeFCTest, VerifyFC) {
   EXPECT_EQ(model_.operator_codes[1]->version, 1);
 }
 
+class QuantizeCustomOpTest : public QuantizeModelTest {
+ protected:
+  QuantizeCustomOpTest() {
+    input_model_ = ReadModel(internal::kModelMixed);
+    readonly_model_ = input_model_->GetModel();
+    readonly_model_->UnPackTo(&model_);
+  }
+};
+
+TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) {
+  auto status =
+      QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8,
+                    /*allow_float=*/true, &error_reporter_);
+  ASSERT_EQ(kTfLiteOk, status);
+  const auto& subgraph = model_.subgraphs[0];
+  auto float_graph = readonly_model_->subgraphs()->Get(0);
+  // The original model is reshape->custom->custom->squeeze.
+  ASSERT_EQ(float_graph->operators()->size(), 4);
+  // The resulting model should be:
+  // reshape->dequantize->custom->custom->quantize->squeeze.
+  ASSERT_EQ(subgraph->operators.size(), 6);
+  const std::vector<BuiltinOperator> op_codes = {
+      BuiltinOperator_RESHAPE,  BuiltinOperator_DEQUANTIZE,
+      BuiltinOperator_CUSTOM,   BuiltinOperator_CUSTOM,
+      BuiltinOperator_QUANTIZE, BuiltinOperator_SQUEEZE};
+  const std::vector<TensorType> op_input_types = {
+      TensorType_INT8,    TensorType_INT8,    TensorType_FLOAT32,
+      TensorType_FLOAT32, TensorType_FLOAT32, TensorType_INT8};
+  for (int i = 0; i < subgraph->operators.size(); ++i) {
+    OperatorT* op = subgraph->operators[i].get();
+    ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code,
+              op_codes[i]);
+    ASSERT_EQ(subgraph->tensors[op->inputs[0]]->type, op_input_types[i]);
+  }
+}
+
 }  // namespace
 }  // namespace optimize
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/test_util.cc b/tensorflow/lite/tools/optimize/test_util.cc
index 9c5a4799818..5f38d9a949e 100644
--- a/tensorflow/lite/tools/optimize/test_util.cc
+++ b/tensorflow/lite/tools/optimize/test_util.cc
@@ -45,6 +45,8 @@ const char* kModelWithArgMaxOp = "argmax.bin";
 
 const char* kModelWithFCOp = "fc.bin";
 
+const char* kModelMixed = "mixed.bin";
+
 int FailOnErrorReporter::Report(const char* format, va_list args) {
   char buf[1024];
   vsnprintf(buf, sizeof(buf), format, args);
diff --git a/tensorflow/lite/tools/optimize/test_util.h b/tensorflow/lite/tools/optimize/test_util.h
index 4fcc7283957..1e7e14cf457 100644
--- a/tensorflow/lite/tools/optimize/test_util.h
+++ b/tensorflow/lite/tools/optimize/test_util.h
@@ -69,6 +69,10 @@ extern const char* kModelWithArgMaxOp;
 // Test model with a argmax op.
 extern const char* kModelWithFCOp;
 
+// Test model with mixed quantizable and un-quantizable ops.
+// reshape->custom->custom->squeeze.
+extern const char* kModelMixed;
+
 // An error reporter that fails on testing.
 class FailOnErrorReporter : public ErrorReporter {
  public:
diff --git a/tensorflow/lite/tools/optimize/testdata/mixed.bin b/tensorflow/lite/tools/optimize/testdata/mixed.bin
new file mode 100644
index 00000000000..b2eeba0ffd0
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/mixed.bin differ
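To close, a usage sketch of the new `allow_float` overload, mirroring what the test above exercises. The wrapper function and its name are illustrative only; the `QuantizeModel` call itself matches the declaration added in quantize_model.h:

```cpp
#include "flatbuffers/flatbuffers.h"
#include "tensorflow/lite/core/api/error_reporter.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/tools/optimize/quantize_model.h"

// Illustrative wrapper (not part of this patch): quantize what we can and
// keep unsupported ops in float, bracketed by Dequantize/Quantize ops.
TfLiteStatus QuantizeWithFloatFallback(tflite::ModelT* model,
                                       tflite::ErrorReporter* error_reporter) {
  flatbuffers::FlatBufferBuilder builder;
  // With /*allow_float=*/false this call would fail on the first op that has
  // no quantized implementation ("Quantization not yet supported for op").
  // On success, `builder` holds the serialized quantized model.
  return tflite::optimize::QuantizeModel(
      &builder, model, tflite::TensorType_INT8, tflite::TensorType_INT8,
      /*allow_float=*/true, error_reporter);
}
```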