Add support for partially quantizing models.
PiperOrigin-RevId: 245479701
This commit is contained in: parent ed7ceaa7c9 · commit 9e668b3d69
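For orientation, here is a minimal sketch of how a caller drives the new partial-quantization entry point this commit adds; the wrapper function and the already-unpacked ModelT are hypothetical scaffolding, and it assumes the headers in this tree.

#include "flatbuffers/flatbuffers.h"
#include "tensorflow/lite/tools/optimize/quantize_model.h"

// Hypothetical wrapper, not part of the commit: quantize a model but leave
// ops without a quantized implementation in float (allow_float=true).
TfLiteStatus RunPartialQuantization(tflite::ModelT* model,
                                    tflite::ErrorReporter* error_reporter) {
  flatbuffers::FlatBufferBuilder builder;
  return tflite::optimize::QuantizeModel(
      &builder, model, tflite::TensorType_INT8, tflite::TensorType_INT8,
      /*allow_float=*/true, error_reporter);
}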
@@ -180,6 +180,7 @@ tf_cc_test(
         "//tensorflow/lite/tools/optimize:testdata/argmax.bin",
         "//tensorflow/lite/tools/optimize:testdata/concat.bin",
         "//tensorflow/lite/tools/optimize:testdata/fc.bin",
+        "//tensorflow/lite/tools/optimize:testdata/mixed.bin",
         "//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",
         "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
@@ -123,16 +123,17 @@ bool HasMinMax(const TensorT* tensor) {
          !tensor->quantization->max.empty();
 }
 
-TfLiteStatus SetOperatorCodeVersion(ModelT* model) {
+void SetOperatorCodeVersion(ModelT* model) {
   for (int i = 0; i < model->operator_codes.size(); ++i) {
     OperatorCodeT* op_code = model->operator_codes[i].get();
     const BuiltinOperator op_buildin_code = op_code->builtin_code;
-    operator_property::OperatorProperty property;
-    TF_LITE_ENSURE_STATUS(
-        operator_property::GetOperatorProperty(op_buildin_code, &property));
-    op_code->version = property.version;
+    operator_property::OperatorProperty property =
+        operator_property::GetOperatorProperty(op_buildin_code);
+    if (property.quantizable) {
+      // Only update the versions of quantizable operations.
+      op_code->version = property.version;
+    }
   }
-  return kTfLiteOk;
 }
 
 }  // namespace utils
@@ -53,8 +53,9 @@ bool IsQuantized(const SubGraphT* subgraph, int tensor_index);
 
 bool HasMinMax(const TensorT* tensor);
 
-// Set version of OperatorCode.
-TfLiteStatus SetOperatorCodeVersion(ModelT* model);
+// Set version of OperatorCode. The version will only be applied for operations
+// that have been quantized.
+void SetOperatorCodeVersion(ModelT* model);
 
 }  // namespace utils
 }  // namespace optimize
@@ -17,123 +17,113 @@ limitations under the License.
 namespace tflite {
 namespace optimize {
 namespace operator_property {
-TfLiteStatus GetOperatorProperty(const BuiltinOperator& op,
-                                 OperatorProperty* property) {
-  // Set up default values.
-  property->per_axis = false;
-  property->per_axis_index = 0;
-  property->arbitrary_inputs = false;
-  property->input_indexes = {};
-  property->output_indexes = {};
-  property->biases = {};
-  property->restrict_same_input_output_scale = false;
-  property->restriction_on_output = false;
-  property->restricted_value_on_output = {0.0, 0.0};
-  property->version = 0;
+OperatorProperty GetOperatorProperty(const BuiltinOperator& op) {
+  OperatorProperty property;
   switch (op) {
     case BuiltinOperator_ADD:
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_ARG_MAX:
-      property->input_indexes = {0};
+      property.input_indexes = {0};
       // ArgMax has no quantizable output.
-      property->version = 2;
-      return kTfLiteOk;
+      property.version = 2;
+      break;
     case BuiltinOperator_AVERAGE_POOL_2D:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
     case BuiltinOperator_CONCATENATION:
-      property->arbitrary_inputs = true;
-      property->input_indexes = {};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 2;
-      return kTfLiteOk;
+      property.arbitrary_inputs = true;
+      property.input_indexes = {};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
     case BuiltinOperator_CONV_2D:
-      property->per_axis = true;
-      property->per_axis_index = 0;
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->biases = {2};
-      property->version = 2;
-      return kTfLiteOk;
+      property.per_axis = true;
+      property.per_axis_index = 0;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 2;
+      break;
     case BuiltinOperator_DEPTHWISE_CONV_2D:
-      property->per_axis = true;
-      property->per_axis_index = 3;
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->biases = {2};
-      property->version = 3;
-      return kTfLiteOk;
+      property.per_axis = true;
+      property.per_axis_index = 3;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 3;
+      break;
     case BuiltinOperator_FULLY_CONNECTED:
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->biases = {2};
-      property->version = 4;
-      return kTfLiteOk;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.biases = {2};
+      property.version = 4;
+      break;
     case BuiltinOperator_MAX_POOL_2D:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 2;
+      break;
     case BuiltinOperator_MEAN:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
    case BuiltinOperator_MUL:
-      property->input_indexes = {0, 1};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0, 1};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_PAD:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 2;
+      break;
     case BuiltinOperator_QUANTIZE:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->version = 1;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.version = 1;
+      break;
     case BuiltinOperator_RESHAPE:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 1;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 1;
+      break;
     case BuiltinOperator_SQUEEZE:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
-      property->restrict_same_input_output_scale = true;
-      property->version = 1;
-      return kTfLiteOk;
+      property.input_indexes = {0};
+      property.output_indexes = {0};
+      property.restrict_same_input_output_scale = true;
+      property.version = 1;
+      break;
     case BuiltinOperator_SOFTMAX:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
+      property.input_indexes = {0};
+      property.output_indexes = {0};
       // Softmax requires output with 1/256 as scale and -128 as zero point.
-      property->restriction_on_output = true;
-      property->restricted_value_on_output = {1 / 256.0, -128};
-      property->version = 2;
-      return kTfLiteOk;
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 256.0, -128};
+      property.version = 2;
+      break;
     case BuiltinOperator_TANH:
-      property->input_indexes = {0};
-      property->output_indexes = {0};
+      property.input_indexes = {0};
+      property.output_indexes = {0};
       // Tanh requires output with 1/128 as scale and 0 as zero point.
-      property->restriction_on_output = true;
-      property->restricted_value_on_output = {1 / 128.0, 0};
-      property->version = 2;
-      return kTfLiteOk;
+      property.restriction_on_output = true;
+      property.restricted_value_on_output = {1 / 128.0, 0};
+      property.version = 2;
+      break;
     default:
-      return kTfLiteError;
+      // No quantized implementation exists for this operation.
+      property.quantizable = false;
   }
-  return kTfLiteError;
+  return property;
 }
 
 }  // namespace operator_property
@@ -23,34 +23,35 @@ namespace optimize {
 namespace operator_property {
 
 struct OperatorProperty {
+  // Is a quantized operation currently supported.
+  bool quantizable = true;
   // Per axis.
-  bool per_axis;
+  bool per_axis = false;
   // TODO(jianlijianli): remove dimension index and read it from tensor instead.
-  int per_axis_index;
+  int per_axis_index = 0;
 
   // Op has arbitrary number of inputs, such as concat.
-  bool arbitrary_inputs;
+  bool arbitrary_inputs = false;
   // Input and weight indexes. Unable to separate the two because of ops such as
   // ADD.
-  std::vector<int> input_indexes;
+  std::vector<int> input_indexes = {};
 
   // Output indexes
-  std::vector<int> output_indexes;
+  std::vector<int> output_indexes = {};
 
   // Bias indexes.
-  std::vector<int> biases;
+  std::vector<int> biases = {};
 
   // Constraints.
-  bool restrict_same_input_output_scale;
-  bool restriction_on_output;
-  std::pair<float, float> restricted_value_on_output;
+  bool restrict_same_input_output_scale = false;
+  bool restriction_on_output = false;
+  std::pair<float, float> restricted_value_on_output = {0.0, 0.0};
 
   // Op version.
-  int version;
+  int version = 1;
 };
 
-TfLiteStatus GetOperatorProperty(const BuiltinOperator& op,
-                                 OperatorProperty* property);
+OperatorProperty GetOperatorProperty(const BuiltinOperator& op);
 
 }  // namespace operator_property
 }  // namespace optimize
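A short sketch of the new lookup contract defined above (assuming the operator_property.h in this tree): unsupported ops no longer surface as an error status; they come back with quantizable == false and every other member at its defaulted value.

#include "tensorflow/lite/tools/optimize/operator_property.h"

void DemoOperatorProperty() {
  namespace op = tflite::optimize::operator_property;
  // A supported op: conv is per-axis quantizable with a bias at input 2.
  op::OperatorProperty conv =
      op::GetOperatorProperty(tflite::BuiltinOperator_CONV_2D);
  // conv.quantizable == true, conv.per_axis == true, conv.biases == {2}.

  // An unsupported op falls through the switch: only quantizable flips.
  op::OperatorProperty custom =
      op::GetOperatorProperty(tflite::BuiltinOperator_CUSTOM);
  // custom.quantizable == false; other fields keep their default member
  // initializers (e.g. custom.version == 1).
}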
@@ -210,13 +210,23 @@ int32_t SetOutputType(ModelT* model, SubGraphT* subgraph,
 // For Uint8 input and output, leading op is Quantize (uint8 to
 // int8, can be thought as "requant") and tailing op is also Quantize (int8 to
 // uint8, can be thought as "requant").
-void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
-                            const TensorType& output_type) {
+TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
+                                    const TensorType& output_type,
+                                    ErrorReporter* error_reporter) {
   for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
        subgraph_idx++) {
     SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
 
     for (int i = 0; i < subgraph->inputs.size(); ++i) {
       TensorT* tensor = subgraph->tensors[subgraph->inputs[i]].get();
+      // TODO(suharshs): Add support for this case if it ever comes up.
+      if (tensor->type == TensorType_FLOAT32 && input_type != tensor->type) {
+        error_reporter->Report(
+            "Unsupported input type %s for input tensor %d of type %s.",
+            EnumNameTensorType(input_type), subgraph->inputs[i],
+            EnumNameTensorType(tensor->type));
+        return kTfLiteError;
+      }
       const int32_t input_idx =
           SetInputType(model, subgraph, subgraph->inputs[i], input_type);
       if (input_idx < 0) {
@@ -225,6 +235,15 @@ void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
       subgraph->inputs[i] = input_idx;
     }
     for (int i = 0; i < subgraph->outputs.size(); ++i) {
       TensorT* tensor = subgraph->tensors[subgraph->outputs[i]].get();
+      // TODO(suharshs): Add support for this case if it ever comes up.
+      if (tensor->type == TensorType_FLOAT32 && output_type != tensor->type) {
+        error_reporter->Report(
+            "Unsupported output type %s for output tensor %d of type %s.",
+            EnumNameTensorType(output_type), subgraph->outputs[i],
+            EnumNameTensorType(tensor->type));
+        return kTfLiteError;
+      }
       const int32_t output_idx =
           SetOutputType(model, subgraph, subgraph->outputs[i], output_type);
       if (output_idx < 0) {
@@ -233,6 +252,7 @@ void SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
       subgraph->outputs[i] = output_idx;
     }
   }
+  return kTfLiteOk;
 }
 
 // Apply constraints to ops if they have any.
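The "requant" described at the top of the SetInputAndOutputTypes hunk above is a pure zero-point shift: uint8 and int8 quantization use the same scale, offset by 128. A standalone illustration of the arithmetic (my own example, not code from this commit):

#include <cstdint>

// real_value = scale * (q - zero_point); moving from uint8 to int8 keeps the
// scale and subtracts 128 from both the value and the zero point, so the
// represented real value is unchanged.
int8_t RequantUint8ToInt8(uint8_t q) {
  return static_cast<int8_t>(static_cast<int32_t>(q) - 128);
}

int32_t Int8ZeroPoint(int32_t uint8_zero_point) {
  return uint8_zero_point - 128;  // e.g. a uint8 zero point of 128 maps to 0
}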
@@ -250,9 +270,11 @@ TfLiteStatus ApplyConstraints(flatbuffers::FlatBufferBuilder* builder,
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      TF_LITE_ENSURE_STATUS(
-          operator_property::GetOperatorProperty(op_code, &property));
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+      if (!property.quantizable) {
+        continue;
+      }
       // Basically only Concat passes this check.
       if (!property.restrict_same_input_output_scale ||
           (property.input_indexes.size() == 1 &&
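For intuition on the constraint this loop enforces (a worked example of mine, not from the commit): concatenation copies raw quantized bytes into one output buffer, so every input and the output must agree on a single (scale, zero_point) for the copied values to keep their real meaning.

#include <cstdint>

// Recover the real value of one quantized element. If input A used scale 0.5
// and input B used scale 0.25, a single concatenated output could not
// dequantize both correctly, hence the same-scale restriction on CONCATENATION.
float Dequantize(int8_t q, float scale, int32_t zero_point) {
  return scale * static_cast<float>(q - zero_point);
}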
@@ -311,10 +333,215 @@ TfLiteStatus ApplyConstraints(flatbuffers::FlatBufferBuilder* builder,
   return kTfLiteOk;
 }
 
+std::vector<int> GetInputIndexes(const OperatorT* op,
+                                 operator_property::OperatorProperty property) {
+  std::vector<int> input_indexes;
+  if (property.arbitrary_inputs || !property.quantizable) {
+    for (int i = 0; i < op->inputs.size(); ++i) {
+      input_indexes.push_back(i);
+    }
+  } else {
+    input_indexes = property.input_indexes;
+  }
+  return input_indexes;
+}
+
+bool ShouldRestrictSameInputOutputScale(
+    operator_property::OperatorProperty property) {
+  return (property.input_indexes.size() == 1 &&
+          property.output_indexes.size() == 1 && property.biases.empty() &&
+          property.restrict_same_input_output_scale);
+}
+
+bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
+  for (const int32_t input_idx : subgraph->inputs) {
+    if (index == input_idx) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Quantize the op input. Will increment op_idx if ops are added.
+TfLiteStatus QuantizeOpInput(ModelT* model, int32_t subgraph_idx,
+                             size_t* op_idx,
+                             operator_property::OperatorProperty property,
+                             int32_t input_idx, ErrorReporter* error_reporter) {
+  SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+  OperatorT* op = subgraph->operators[*op_idx].get();
+  const BuiltinOperator op_code =
+      model->operator_codes[op->opcode_index]->builtin_code;
+  const int32_t tensor_idx = op->inputs[input_idx];
+  TensorT* tensor = subgraph->tensors[tensor_idx].get();
+  const bool is_input_quantized = utils::IsQuantized(subgraph, tensor_idx);
+  if (input_idx >= op->inputs.size()) {
+    error_reporter->Report(
+        "Required input index %d is larger than the input length of op "
+        "%s at index %d in subgraph %d",
+        input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code), *op_idx,
+        subgraph_idx);
+    return kTfLiteError;
+  }
+  if (property.quantizable && !is_input_quantized) {
+    // The operation is quantizable, but the input isn't yet quantized.
+    if (utils::HasBuffer(model, subgraph, tensor_idx)) {
+      if (utils::QuantizeWeight(model, tensor, property.per_axis,
+                                property.per_axis_index) == kTfLiteError) {
+        error_reporter->Report(
+            "Unable to quantize buffer or min/max value for input %d "
+            "in op %s in subgraph %d, node: %d",
+            input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
+        return kTfLiteError;
+      }
+    } else if (utils::HasMinMax(tensor)) {
+      if (IsSubgraphInput(subgraph, tensor_idx)) {
+        utils::QuantizeActivation(tensor);
+      } else {
+        // If the tensor is not a model input, we need to add a Quantize
+        // operation since the preceding op may require a float output.
+        std::unique_ptr<TensorT> op_output;
+        utils::MakeTensor(tensor->name + "_int8", tensor->shape,
+                          TensorType_INT8, &op_output);
+        op_output->quantization = absl::make_unique<QuantizationParametersT>();
+        op_output->quantization->min.push_back(tensor->quantization->min[0]);
+        op_output->quantization->max.push_back(tensor->quantization->max[0]);
+        utils::QuantizeActivation(op_output.get());
+        const int32_t quant_op_output_idx = subgraph->tensors.size();
+        subgraph->tensors.push_back(std::move(op_output));
+        std::unique_ptr<OperatorT> quant_op;
+        utils::MakeQuantizeOperator(model, &quant_op, tensor_idx,
+                                    quant_op_output_idx);
+        subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
+                                   std::move(quant_op));
+        op->inputs[input_idx] = quant_op_output_idx;
+        *op_idx += 1;
+      }
+    } else {
+      error_reporter->Report(
+          "Unable to find buffer or min/max value for input activation "
+          "%d "
+          "in %s in subgraph %d, node: %d",
+          input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
+      return kTfLiteError;
+    }
+  } else if (!property.quantizable && is_input_quantized) {
+    // If the tensor is quantized, we have to add a Dequantize op after
+    // since this op is not quantizable.
+    std::unique_ptr<TensorT> op_output;
+    utils::MakeTensor(tensor->name + "_float", tensor->shape,
+                      TensorType_FLOAT32, &op_output);
+    const int32_t dequant_op_output_idx = subgraph->tensors.size();
+    subgraph->tensors.push_back(std::move(op_output));
+    std::unique_ptr<OperatorT> dequant_op;
+    utils::MakeDequantizeOperator(model, &dequant_op, tensor_idx,
+                                  dequant_op_output_idx);
+    subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
+                               std::move(dequant_op));
+    op->inputs[input_idx] = dequant_op_output_idx;
+    *op_idx += 1;
+  }
+  return kTfLiteOk;
+}
+
+// Quantize the op output.
+TfLiteStatus QuantizeOpOutput(ModelT* model, int32_t subgraph_idx,
+                              int32_t op_idx,
+                              operator_property::OperatorProperty property,
+                              int32_t output_idx,
+                              ErrorReporter* error_reporter) {
+  // If the operator is not quantizable, we don't need to do anything for the
+  // output.
+  if (!property.quantizable) {
+    return kTfLiteOk;
+  }
+  SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+  OperatorT* op = subgraph->operators[op_idx].get();
+  const BuiltinOperator op_code =
+      model->operator_codes[op->opcode_index]->builtin_code;
+  if (output_idx >= op->outputs.size()) {
+    error_reporter->Report(
+        "Required output index %d is larger than the output length of "
+        "op %s at index %d in subgraph %d",
+        output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
+        op_idx, subgraph_idx);
+    return kTfLiteError;
+  }
+
+  TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
+  if (ShouldRestrictSameInputOutputScale(property)) {
+    // Copy quantization parameter. For average pool, max pool, etc
+    // min/max can be different but we want them to be the same.
+    // Get scale and zero point of input.
+    if (property.input_indexes[0] >= op->inputs.size()) {
+      error_reporter->Report(
+          "Required input index %d is larger than the input length of "
+          "op %s at index %d in subgraph %d",
+          property.input_indexes[0], op->inputs.size(),
+          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      return kTfLiteError;
+    }
+    const int input_index = op->inputs[property.input_indexes[0]];
+    TensorT* input_tensor = subgraph->tensors[input_index].get();
+    if (input_tensor->quantization->scale.size() != 1 ||
+        input_tensor->quantization->zero_point.size() != 1 ||
+        input_tensor->quantization->min.size() != 1 ||
+        input_tensor->quantization->max.size() != 1) {
+      error_reporter->Report(
+          "Invalid quantization params for op %s at index %d "
+          "in subgraph %d",
+          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      return kTfLiteError;
+    }
+
+    const float input_scale = input_tensor->quantization->scale[0];
+    const int32_t input_zero_point = input_tensor->quantization->zero_point[0];
+
+    const float min = input_tensor->quantization->min[0];
+    const float max = input_tensor->quantization->max[0];
+    if (utils::HasMinMax(output_tensor)) {
+      if (output_tensor->quantization->min[0] != min ||
+          output_tensor->quantization->max[0] != max) {
+        printf(
+            "Note the output min/max is different from the input min/max "
+            "for op %s at index %d in subgraph %d. This is legal but "
+            "should happen rarely.",
+            EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
+      }
+    }
+
+    // Apply to output.
+    output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
+    output_tensor->quantization->scale.push_back(input_scale);
+    output_tensor->quantization->zero_point.push_back(input_zero_point);
+    output_tensor->quantization->min.push_back(min);
+    output_tensor->quantization->max.push_back(max);
+    output_tensor->type = TensorType_INT8;
+  } else if (property.restriction_on_output) {
+    const auto scale_and_zp = property.restricted_value_on_output;
+    // Apply to output.
+    output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
+    output_tensor->quantization->scale.push_back(scale_and_zp.first);
+    output_tensor->quantization->zero_point.push_back(scale_and_zp.second);
+    output_tensor->type = TensorType_INT8;
+  } else {
+    // Process regular output that doesn't have any restrictions.
+    if (utils::HasMinMax(output_tensor)) {
+      utils::QuantizeActivation(output_tensor);
+    } else {
+      error_reporter->Report(
+          "Unable to find min/max value for output %d in %s in "
+          "subgraph %d, node: %d",
+          output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
 // Quantize inputs and weights.
 // Because of ops such as lstm, still need to do per op, instead of weights.
 TfLiteStatus QuantizeWeightsInputOutput(flatbuffers::FlatBufferBuilder* builder,
-                                        ModelT* model,
+                                        ModelT* model, bool allow_float,
                                         ErrorReporter* error_reporter) {
   for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
        subgraph_idx++) {
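One subtlety in QuantizeOpInput above is the op_idx bookkeeping: splicing a Quantize or Dequantize operator in front of the current one shifts the current operator right by a slot, so the index must be advanced. A stripped-down, self-contained sketch of the pattern (hypothetical element type, not the commit's code):

#include <memory>
#include <string>
#include <vector>

// Insert new_node before position *idx and keep *idx pointing at the node we
// were visiting, mirroring the *op_idx += 1 in QuantizeOpInput.
void SpliceBefore(std::vector<std::unique_ptr<std::string>>* nodes, size_t* idx,
                  std::unique_ptr<std::string> new_node) {
  nodes->insert(nodes->begin() + *idx, std::move(new_node));
  *idx += 1;  // the visited node moved one slot to the right
}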
@@ -323,154 +550,25 @@ TfLiteStatus QuantizeWeightsInputOutput(flatbuffers::FlatBufferBuilder* builder,
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      if (operator_property::GetOperatorProperty(op_code, &property) ==
-          kTfLiteError) {
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+
+      if (!property.quantizable && !allow_float) {
         error_reporter->Report("Quantization not yet supported for op: %s",
                                EnumNameBuiltinOperator(op_code));
         return kTfLiteError;
       }
-      // Quantize weight and inputs.
-      std::vector<int> input_indexes;
-      if (property.arbitrary_inputs) {
-        for (int i = 0; i < op->inputs.size(); ++i) {
-          input_indexes.push_back(i);
-        }
-      } else {
-        input_indexes = property.input_indexes;
+
+      // Quantize operator inputs/weights.
+      for (const int input_idx : GetInputIndexes(op, property)) {
+        TF_LITE_ENSURE_STATUS(QuantizeOpInput(
+            model, subgraph_idx, &op_idx, property, input_idx, error_reporter));
       }
-      for (const int input_idx : input_indexes) {
-        if (input_idx >= op->inputs.size()) {
-          error_reporter->Report(
-              "Required input index %d is larger than the input length of op "
-              "%s at index %d in subgraph %d",
-              input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code),
-              op_idx, subgraph_idx);
-          return kTfLiteError;
-        }
-        TensorT* tensor = subgraph->tensors[op->inputs[input_idx]].get();
-        // Quantize if it is not quantized already as the output of
-        // another op or input of another op.
-        if (!utils::IsQuantized(subgraph, op->inputs[input_idx])) {
-          if (utils::HasBuffer(model, subgraph, op->inputs[input_idx])) {
-            TensorT* tensor = subgraph->tensors[op->inputs[input_idx]].get();
-            utils::QuantizeWeight(model, tensor, property.per_axis,
-                                  property.per_axis_index);
-            continue;
-          }
-          if (utils::HasMinMax(tensor)) {
-            utils::QuantizeActivation(tensor);
-            continue;
-          }
-          // TODO(jianlijianli): Eventually we can insert a dequantize operation
-          // for all inputs and weights here, in the case that min/max is
-          // missing.
-          error_reporter->Report(
-              "Unable to find buffer or min/max value for input activation %d "
-              "in %s in subgraph %d, node: %d",
-              input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
-              op_idx);
-          return kTfLiteError;
-        }
-      }
-      // Quantize output.
+
+      // Quantize operator outputs.
       for (const int output_idx : property.output_indexes) {
-        if (output_idx >= op->outputs.size()) {
-          error_reporter->Report(
-              "Required output index %d is larger than the output length of "
-              "op %s at index %d in subgraph %d",
-              output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
-              op_idx, subgraph_idx);
-          return kTfLiteError;
-        }
-        if (property.input_indexes.size() == 1 &&
-            property.output_indexes.size() == 1 && property.biases.empty() &&
-            property.restrict_same_input_output_scale) {
-          // Copy quantization parameter. For average pool, max pool, etc
-          // min/max can be different but we want them to be the same.
-          // Get scale and zero point of input.
-          if (property.input_indexes[0] >= op->inputs.size()) {
-            error_reporter->Report(
-                "Required input index %d is larger than the input length of "
-                "op %s at index %d in subgraph %d",
-                property.input_indexes[0], op->inputs.size(),
-                EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
-            return kTfLiteError;
-          }
-          const int input_index = op->inputs[property.input_indexes[0]];
-          TensorT* input_tensor = subgraph->tensors[input_index].get();
-          if (input_tensor->quantization->scale.size() != 1 ||
-              input_tensor->quantization->min.size() != 1 ||
-              input_tensor->quantization->max.size() != 1) {
-            error_reporter->Report(
-                "Quantization dimension is not 1 for op %s at index %d in "
-                "subgraph %d",
-                EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
-            return kTfLiteError;
-          }
-          const float input_scale = input_tensor->quantization->scale[0];
-          const float input_zero_point =
-              input_tensor->quantization->zero_point[0];
-          const float min = input_tensor->quantization->min[0];
-          const float max = input_tensor->quantization->max[0];
-
-          // Log a warning when we have to override the min/max (scale and zero
-          // point) of output using input.
-          TensorT* output_tensor =
-              subgraph->tensors[op->outputs[output_idx]].get();
-          if (utils::HasMinMax(output_tensor)) {
-            if (output_tensor->quantization->min[0] != min ||
-                output_tensor->quantization->max[0] != max) {
-              printf(
-                  "Note the output min/max is different from the input min/max "
-                  "for op %s at index %d in subgraph %d. This is legal but "
-                  "should happen rarely. ",
-                  EnumNameBuiltinOperator(op_code), static_cast<int>(op_idx),
-                  static_cast<int>(subgraph_idx));
-            }
-          }
-
-          // Apply to output.
-          output_tensor->quantization =
-              absl::make_unique<QuantizationParametersT>();
-          output_tensor->quantization->scale.push_back(input_scale);
-          output_tensor->quantization->zero_point.push_back(input_zero_point);
-          output_tensor->quantization->min.push_back(min);
-          output_tensor->quantization->max.push_back(max);
-          output_tensor->type = TensorType_INT8;
-          continue;
-        }
-        if (property.restriction_on_output) {
-          const std::pair<float, float> scale_and_zp =
-              property.restricted_value_on_output;
-          // Copy scale and zero point since they are fixed.
-          // Applies to softmax, tanh etc.
-          TensorT* output_tensor =
-              subgraph->tensors[op->outputs[output_idx]].get();
-          output_tensor->quantization =
-              absl::make_unique<QuantizationParametersT>();
-          output_tensor->quantization->scale.push_back(scale_and_zp.first);
-          output_tensor->quantization->zero_point.push_back(
-              scale_and_zp.second);
-          output_tensor->type = TensorType_INT8;
-          continue;
-        }
-
-        // Process regular output that doesn't have any restrictions.
-        TensorT* output_tensor =
-            subgraph->tensors[op->outputs[output_idx]].get();
-        if (utils::HasMinMax(output_tensor)) {
-          utils::QuantizeActivation(output_tensor);
-        } else {
-          // TODO(jianlijianli): Eventually we can insert a dequantize operation
-          // for output here, in the case that min/max is missing.
-          error_reporter->Report(
-              "Unable to find min/max value for output activation %d in %s in "
-              "subgraph %d, node: %d",
-              output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
-              op_idx);
-          return kTfLiteError;
-        }
+        TF_LITE_ENSURE_STATUS(QuantizeOpOutput(
+            model, subgraph_idx, op_idx, property, output_idx, error_reporter));
       }
     }
   }
@@ -487,9 +585,11 @@ TfLiteStatus QuantizeBiases(flatbuffers::FlatBufferBuilder* builder,
       OperatorT* op = subgraph->operators[op_idx].get();
       const BuiltinOperator op_code =
           model->operator_codes[op->opcode_index]->builtin_code;
-      operator_property::OperatorProperty property;
-      TF_LITE_ENSURE_STATUS(
-          operator_property::GetOperatorProperty(op_code, &property));
+      operator_property::OperatorProperty property =
+          operator_property::GetOperatorProperty(op_code);
+      if (!property.quantizable) {
+        continue;
+      }
       for (const int bias_idx : property.biases) {
         if (bias_idx >= op->inputs.size()) {
           error_reporter->Report(
@@ -533,14 +633,15 @@ TfLiteStatus QuantizeBiases(flatbuffers::FlatBufferBuilder* builder,
 // Assumes that the operators in the model have been topologically sorted.
 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            ModelT* model, const TensorType& input_type,
-                           const TensorType& output_type,
+                           const TensorType& output_type, bool allow_float,
                            ErrorReporter* error_reporter) {
   TF_LITE_ENSURE_STATUS(
-      QuantizeWeightsInputOutput(builder, model, error_reporter));
+      QuantizeWeightsInputOutput(builder, model, allow_float, error_reporter));
   TF_LITE_ENSURE_STATUS(ApplyConstraints(builder, model, error_reporter));
   TF_LITE_ENSURE_STATUS(QuantizeBiases(builder, model, error_reporter));
-  TF_LITE_ENSURE_STATUS(utils::SetOperatorCodeVersion(model));
-  SetInputAndOutputTypes(model, input_type, output_type);
+  utils::SetOperatorCodeVersion(model);
+  TF_LITE_ENSURE_STATUS(
+      SetInputAndOutputTypes(model, input_type, output_type, error_reporter));
 
   flatbuffers::Offset<Model> output_model_location =
       Model::Pack(*builder, model);
@@ -549,10 +650,18 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
   return kTfLiteOk;
 }
 
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* model, const TensorType& input_type,
+                           const TensorType& output_type,
+                           ErrorReporter* error_reporter) {
+  return QuantizeModel(builder, model, input_type, output_type,
+                       /*allow_float=*/false, error_reporter);
+}
+
 TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            ModelT* model, ErrorReporter* error_reporter) {
   return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32,
-                       error_reporter);
+                       /*allow_float=*/false, error_reporter);
 }
 
 }  // namespace optimize
@@ -44,6 +44,15 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
                            const TensorType& output_type,
                            ErrorReporter* error_reporter);
 
+// Same as above, but enables allowing float intermediate operations for ops
+// that do not yet support quantization.
+//
+// Note: This is a private API, subject to change.
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* input_model, const TensorType& input_type,
+                           const TensorType& output_type, bool allow_float,
+                           ErrorReporter* error_reporter);
+
 }  // namespace optimize
 }  // namespace tflite
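The pre-existing overloads delegate with allow_float=false, so existing callers keep the strict behavior; a quick sketch of that default path (same hypothetical scaffolding as the earlier sketch):

#include "flatbuffers/flatbuffers.h"
#include "tensorflow/lite/tools/optimize/quantize_model.h"

// Hypothetical wrapper: with the strict overloads, a model containing an op
// without a quantized implementation fails with
// "Quantization not yet supported for op: ...".
TfLiteStatus RunStrictQuantization(tflite::ModelT* model,
                                   tflite::ErrorReporter* error_reporter) {
  flatbuffers::FlatBufferBuilder builder;
  return tflite::optimize::QuantizeModel(&builder, model, error_reporter);
}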
@@ -934,6 +934,42 @@ TEST_F(QuantizeFCTest, VerifyFC) {
   EXPECT_EQ(model_.operator_codes[1]->version, 1);
 }
 
+class QuantizeCustomOpTest : public QuantizeModelTest {
+ protected:
+  QuantizeCustomOpTest() {
+    input_model_ = ReadModel(internal::kModelMixed);
+    readonly_model_ = input_model_->GetModel();
+    readonly_model_->UnPackTo(&model_);
+  }
+};
+
+TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) {
+  auto status =
+      QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8,
+                    /*allow_float=*/true, &error_reporter_);
+  ASSERT_EQ(kTfLiteOk, status);
+  const auto& subgraph = model_.subgraphs[0];
+  auto float_graph = readonly_model_->subgraphs()->Get(0);
+  // The original model: reshape->custom->custom->squeeze.
+  ASSERT_EQ(float_graph->operators()->size(), 4);
+  // The resulting model should be:
+  // reshape->dequantize->custom->custom->quantize->squeeze.
+  ASSERT_EQ(subgraph->operators.size(), 6);
+  const std::vector<BuiltinOperator> op_codes = {
+      BuiltinOperator_RESHAPE, BuiltinOperator_DEQUANTIZE,
+      BuiltinOperator_CUSTOM,  BuiltinOperator_CUSTOM,
+      BuiltinOperator_QUANTIZE, BuiltinOperator_SQUEEZE};
+  const std::vector<TensorType> op_input_types = {
+      TensorType_INT8,    TensorType_INT8,    TensorType_FLOAT32,
+      TensorType_FLOAT32, TensorType_FLOAT32, TensorType_INT8};
+  for (int i = 0; i < subgraph->operators.size(); ++i) {
+    OperatorT* op = subgraph->operators[i].get();
+    ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code,
+              op_codes[i]);
+    ASSERT_EQ(subgraph->tensors[op->inputs[0]]->type, op_input_types[i]);
+  }
+}
+
 }  // namespace
 }  // namespace optimize
 }  // namespace tflite
@@ -45,6 +45,8 @@ const char* kModelWithArgMaxOp = "argmax.bin";
 
 const char* kModelWithFCOp = "fc.bin";
 
+const char* kModelMixed = "mixed.bin";
+
 int FailOnErrorReporter::Report(const char* format, va_list args) {
   char buf[1024];
   vsnprintf(buf, sizeof(buf), format, args);
|
||||
// Test model with a argmax op.
|
||||
extern const char* kModelWithFCOp;
|
||||
|
||||
// Test model with mixed quantizable and un-quantizable ops.
|
||||
// reshape->custom->custom->squeeze.
|
||||
extern const char* kModelMixed;
|
||||
|
||||
// An error reporter that fails on testing.
|
||||
class FailOnErrorReporter : public ErrorReporter {
|
||||
public:
|
||||
|
BIN tensorflow/lite/tools/optimize/testdata/mixed.bin (vendored, new file; binary file not shown)