/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/tools/optimize/quantize_model.h"
|
|
|
|
#include <algorithm>
|
|
#include <cstdint>
|
|
#include <limits>
|
|
#include <memory>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "flatbuffers/flexbuffers.h"
|
|
#include "tensorflow/lite/context.h"
|
|
#include "tensorflow/lite/core/api/error_reporter.h"
|
|
#include "tensorflow/lite/model.h"
|
|
#include "tensorflow/lite/schema/schema_generated.h"
|
|
#include "tensorflow/lite/tools/optimize/model_utils.h"
|
|
#include "tensorflow/lite/tools/optimize/operator_property.h"
|
|
#include "tensorflow/lite/tools/optimize/quantization_utils.h"
|
|
|
|
namespace tflite {
namespace optimize {

namespace {

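// Returns true if the tensor at `tensor_idx` in the subgraph is a real-valued
// (float32) tensor. Non-float tensors are skipped by the quantization passes.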
bool IsFloatTensor(const SubGraphT* subgraph, int32_t tensor_idx) {
  TensorT* tensor = subgraph->tensors[tensor_idx].get();
  if (tensor->type != TensorType_FLOAT32) {
    // Skip non-real-valued tensor.
    return false;
  }
  return true;
}

// Gets the operator property from the operator_property list and additionally
|
|
// modifies the quantizable parameter based on the user's specified
|
|
// operator_names.
|
|
operator_property::OperatorProperty GetOperatorProperty(
|
|
const std::unordered_set<string>& operator_names, const ModelT* model,
|
|
int subgraph_index, int op_idx, const string& operator_name,
|
|
const TensorType& activations_type) {
|
|
operator_property::OperatorProperty property =
|
|
operator_property::GetOperatorProperty(model, subgraph_index, op_idx);
|
|
const SubGraphT* subgraph = model->subgraphs[subgraph_index].get();
|
|
const OperatorT* op = subgraph->operators[op_idx].get();
|
|
const BuiltinOperator op_code =
|
|
model->operator_codes[op->opcode_index]->builtin_code;
|
|
if (activations_type == TensorType_INT16 && !property.quantizable_int16) {
|
|
property.quantizable = false;
|
|
}
|
|
// The algorithm adds Dequantize and Quantize, so we don't require them to be
|
|
// in the operator_names.
|
|
if (op_code != BuiltinOperator_DEQUANTIZE &&
|
|
op_code != BuiltinOperator_QUANTIZE) {
|
|
property.quantizable =
|
|
property.quantizable &&
|
|
(operator_names.find(operator_name) != operator_names.end());
|
|
}
|
|
return property;
|
|
}
|
|
|
|
bool IsRealValueOp(const std::unordered_set<string>& real_value_op_set,
|
|
const string& operator_name) {
|
|
return real_value_op_set.find(operator_name) != real_value_op_set.end();
|
|
}
|
|
|
|
// Creates a set with the names of all ops in the source graph that operate on
// real (float) values: ops that are not quantizable, plus quantizable ops that
// have at least one float32 input or output tensor.
std::unordered_set<string> PopulateRealValueOpSet(
|
|
ModelT* model, const std::unordered_set<string>& operator_names,
|
|
const TensorType& activations_type) {
|
|
std::unordered_set<string> real_value_op_set;
|
|
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
|
|
OperatorT* op = subgraph->operators[op_idx].get();
|
|
const string operator_name = subgraph->tensors[op->outputs[0]]->name;
|
|
operator_property::OperatorProperty property =
|
|
GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
|
|
operator_name, activations_type);
|
|
|
|
if (!property.quantizable) {
|
|
real_value_op_set.insert(operator_name);
|
|
continue;
|
|
}
|
|
|
|
for (const std::pair<int, operator_property::TensorProperty>& input :
|
|
property.inputs) {
|
|
const int32_t input_idx = input.first;
|
|
const int32_t tensor_idx = op->inputs[input_idx];
|
|
if (IsFloatTensor(subgraph, tensor_idx)) {
|
|
real_value_op_set.insert(operator_name);
|
|
break;
|
|
}
|
|
}
|
|
for (const std::pair<int, operator_property::TensorProperty>& output :
|
|
property.outputs) {
|
|
const int32_t output_idx = output.first;
|
|
const int32_t tensor_idx = op->outputs[output_idx];
|
|
if (IsFloatTensor(subgraph, tensor_idx)) {
|
|
real_value_op_set.insert(operator_name);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (property.arbitrary_inputs) {
|
|
const int32_t tensor_idx = op->inputs[0];
|
|
if (IsFloatTensor(subgraph, tensor_idx)) {
|
|
real_value_op_set.insert(operator_name);
|
|
}
|
|
}
|
|
|
|
if (property.arbitrary_outputs) {
|
|
const int32_t tensor_idx = op->outputs[0];
|
|
if (IsFloatTensor(subgraph, tensor_idx)) {
|
|
real_value_op_set.insert(operator_name);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return real_value_op_set;
|
|
}
|
|
|
|
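// Quantizes a bias tensor given its corresponding input and weight tensors.
// The bias scale is input_scale * weight_scale (per channel when
// is_per_channel is true); biases are stored as int64 when activations are
// int16 and as int32 otherwise.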
TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor,
|
|
const TensorT* weight_tensor, TensorT* bias_tensor,
|
|
bool is_per_channel, int channel_dim_index,
|
|
const TensorType& activations_type,
|
|
ErrorReporter* error_reporter) {
|
|
if (bias_tensor->shape.size() != 1) {
|
|
TF_LITE_REPORT_ERROR(error_reporter, "Expected bias tensor shape to be 1.");
|
|
return kTfLiteError;
|
|
}
|
|
|
|
int32_t channel_dim_size = bias_tensor->shape[0];
|
|
TF_LITE_ENSURE(error_reporter, weight_tensor->quantization);
|
|
std::vector<float> weight_scales = weight_tensor->quantization->scale;
|
|
|
|
if (is_per_channel) {
|
|
if (bias_tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Channel mismatch between bias and weight tensors %d vs %d",
|
|
bias_tensor->shape[0], weight_tensor->shape[channel_dim_index]);
|
|
return kTfLiteError;
|
|
}
|
|
if (!input_tensor->quantization ||
|
|
input_tensor->quantization->scale.size() != 1) {
|
|
TF_LITE_REPORT_ERROR(error_reporter,
|
|
"Input tensor missing quantization information");
|
|
return kTfLiteError;
|
|
}
|
|
|
|
if (weight_scales.size() != channel_dim_size) {
|
|
TF_LITE_REPORT_ERROR(error_reporter,
|
|
"Mismatch weight scale dimension: %d",
|
|
weight_scales.size());
|
|
return kTfLiteError;
|
|
}
|
|
if (activations_type == tflite::TensorType_INT16) {
|
|
return utils::SymmetricPerChannelBiasQuantize<std::int64_t>(
|
|
model, bias_tensor, input_tensor->quantization->scale[0],
|
|
weight_scales.data(), channel_dim_size, error_reporter);
|
|
} else {
|
|
return utils::SymmetricPerChannelBiasQuantize<std::int32_t>(
|
|
model, bias_tensor, input_tensor->quantization->scale[0],
|
|
weight_scales.data(), channel_dim_size, error_reporter);
|
|
}
|
|
} else {
|
|
if (weight_scales.size() != 1) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Expected per-layer weight scale dimension size 1, got %d",
|
|
weight_scales.size());
|
|
return kTfLiteError;
|
|
}
|
|
if (activations_type == tflite::TensorType_INT16) {
|
|
return utils::SymmetricPerLayerBiasQuantize<std::int64_t>(
|
|
model, bias_tensor,
|
|
input_tensor->quantization->scale[0] * weight_scales[0],
|
|
error_reporter);
|
|
} else {
|
|
return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
|
|
model, bias_tensor,
|
|
input_tensor->quantization->scale[0] * weight_scales[0],
|
|
error_reporter);
|
|
}
|
|
}
|
|
return kTfLiteError;
|
|
}
|
|
|
|
// True if the tensor type has to be modified.
|
|
bool TensorTypeChangeRequired(const TensorT* tensor, const TensorType& type) {
|
|
// The quantized model is type INT8/INT16, so if the user provided type is
|
|
// INT8/INT16, we do not have to do any custom logic. Additionally, if the
|
|
// current tensor isn't INT8/INT16 quantized, the custom type doesn't apply.
|
|
bool int8check = type != TensorType_INT8 && tensor->type == TensorType_INT8 &&
|
|
!tensor->quantization->scale.empty();
|
|
bool int16check = type != TensorType_INT16 &&
|
|
tensor->type == TensorType_INT16 &&
|
|
!tensor->quantization->scale.empty();
|
|
return (int8check || int16check);
|
|
}
|
|
|
|
// Sets the input type, adding a Leading Op node at the start of the model if
|
|
// necessary.
|
|
// Returns the new input tensor index.
|
|
int32_t SetInputType(ModelT* model, SubGraphT* subgraph,
|
|
const int32_t tensor_idx, const TensorType& input_type,
|
|
const TensorType& activations_type) {
|
|
TensorT* tensor = subgraph->tensors[tensor_idx].get();
|
|
if (!TensorTypeChangeRequired(tensor, input_type)) {
|
|
return -1;
|
|
}
|
|
if (input_type == TensorType_FLOAT32 || input_type == TensorType_UINT8) {
|
|
std::string type_string =
|
|
activations_type == TensorType_INT16 ? "int16" : "int8";
|
|
// Create a new tensor to be the input of the leading Op.
|
|
std::unique_ptr<TensorT> leading_op_input;
|
|
if (input_type == TensorType_FLOAT32) {
|
|
// Add tensor for quantize operator. Scales and zero points are not
|
|
// needed.
|
|
const string leading_op_name = tensor->name;
|
|
const string new_name_original_input = tensor->name + "_" + type_string;
|
|
tensor->name = new_name_original_input;
|
|
utils::MakeTensor(leading_op_name, tensor->shape, tensor->shape_signature,
|
|
input_type, &leading_op_input);
|
|
} else {
|
|
// Get scale and zero point from the first tensor.
|
|
const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
|
|
const int64_t zero_point =
|
|
subgraph->tensors[tensor_idx]->quantization->zero_point[0];
|
|
|
|
// Add tensor for requantize operator. Scale is the existing scale and
|
|
// zero point is shifted by +128.
|
|
TFLITE_DCHECK_GE(zero_point, -128);
|
|
TFLITE_DCHECK_LE(zero_point, 127);
|
|
const string leading_op_name = tensor->name;
|
|
const string new_name_original_input = tensor->name + "_" + type_string;
|
|
tensor->name = new_name_original_input;
|
|
utils::MakeTensorWithQuantParam(
|
|
leading_op_name, tensor->shape, tensor->shape_signature, input_type,
|
|
scale, zero_point + 128, &leading_op_input);
|
|
}
|
|
const int32_t leading_op_input_idx = subgraph->tensors.size();
|
|
subgraph->tensors.push_back(std::move(leading_op_input));
|
|
|
|
    // Create the leading op, which is a Quantize op that quantizes or
    // requantizes the input.
|
|
std::unique_ptr<OperatorT> leading_op;
|
|
utils::MakeQuantizeOperator(model, &leading_op, leading_op_input_idx,
|
|
tensor_idx);
|
|
|
|
// Insert the new op at the start of the model.
|
|
subgraph->operators.insert(subgraph->operators.begin(),
|
|
std::move(leading_op));
|
|
return leading_op_input_idx;
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
// Sets the output type, adding a Tailing Op node at the end of the model if
|
|
// necessary.
|
|
// Returns the new output tensor index.
|
|
int32_t SetOutputType(ModelT* model, SubGraphT* subgraph,
|
|
const int32_t tensor_idx, const TensorType& output_type,
|
|
const TensorType& activations_type) {
|
|
TensorT* tensor = subgraph->tensors[tensor_idx].get();
|
|
if (!TensorTypeChangeRequired(tensor, output_type)) {
|
|
return -1;
|
|
}
|
|
if (output_type == TensorType_FLOAT32 || output_type == TensorType_UINT8) {
|
|
std::string type_string =
|
|
activations_type == TensorType_INT16 ? "int16" : "int8";
|
|
// Create a new tensor to be the output of the tailing op.
|
|
std::unique_ptr<TensorT> tailing_op_output;
|
|
if (output_type == TensorType_FLOAT32) {
|
|
const string tailing_op_name = tensor->name;
|
|
const string new_name_original_output = tensor->name + "_" + type_string;
|
|
tensor->name = new_name_original_output;
|
|
utils::MakeTensor(tailing_op_name, tensor->shape, tensor->shape_signature,
|
|
output_type, &tailing_op_output);
|
|
} else {
|
|
// Get scale and zero point from the last tensor.
|
|
const float scale = subgraph->tensors[tensor_idx]->quantization->scale[0];
|
|
const int64_t zero_point =
|
|
subgraph->tensors[tensor_idx]->quantization->zero_point[0];
|
|
|
|
// Add tensor for requantize operator. Scale is the existing scale and
|
|
// zero point is shifted by +128.
|
|
TFLITE_DCHECK_GE(zero_point, -128);
|
|
TFLITE_DCHECK_LE(zero_point, 127);
|
|
const string tailing_op_name = tensor->name;
|
|
const string new_name_original_output = tensor->name + "_" + type_string;
|
|
tensor->name = new_name_original_output;
|
|
utils::MakeTensorWithQuantParam(
|
|
tailing_op_name, tensor->shape, tensor->shape_signature, output_type,
|
|
scale, zero_point + 128, &tailing_op_output);
|
|
}
|
|
const int32_t tailing_op_output_idx = subgraph->tensors.size();
|
|
subgraph->tensors.push_back(std::move(tailing_op_output));
|
|
|
|
// Create the tailing operation.
|
|
std::unique_ptr<OperatorT> tailing_op;
|
|
if (output_type == TensorType_FLOAT32) {
|
|
// Tailing Op is Dequantize Op.
|
|
utils::MakeDequantizeOperator(model, &tailing_op, tensor_idx,
|
|
tailing_op_output_idx);
|
|
} else {
|
|
// Tailing Op is Quantize Op that does requantization.
|
|
utils::MakeQuantizeOperator(model, &tailing_op, tensor_idx,
|
|
tailing_op_output_idx);
|
|
}
|
|
// Add the operator at the end of the model.
|
|
subgraph->operators.push_back(std::move(tailing_op));
|
|
return tailing_op_output_idx;
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
// Sets the input and output types to the provided types. Leading and
|
|
// tailing operations will be added if needed.
|
|
// For Float input and output, leading op is Quantize and tailing op is
|
|
// Dequantize.
|
|
// For Uint8 input and output, leading op is Quantize (uint8 to
|
|
// int8, can be thought as "requant") and tailing op is also Quantize (int8 to
|
|
// uint8, can be thought as "requant").
|
|
TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type,
|
|
const TensorType& output_type,
|
|
const TensorType& activations_type,
|
|
ErrorReporter* error_reporter) {
|
|
for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
|
|
for (int i = 0; i < subgraph->inputs.size(); ++i) {
|
|
TensorT* tensor = subgraph->tensors[subgraph->inputs[i]].get();
|
|
// TODO(suharshs): Add support for this case if it ever comes up.
|
|
if (tensor->type == TensorType_FLOAT32 && input_type != tensor->type) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Unsupported input type %s for input tensor %d of type %s.",
|
|
EnumNameTensorType(input_type), subgraph->inputs[i],
|
|
EnumNameTensorType(tensor->type));
|
|
return kTfLiteError;
|
|
}
|
|
const int32_t input_idx = SetInputType(
|
|
model, subgraph, subgraph->inputs[i], input_type, activations_type);
|
|
if (input_idx < 0) {
|
|
continue;
|
|
}
|
|
subgraph->inputs[i] = input_idx;
|
|
}
|
|
for (int i = 0; i < subgraph->outputs.size(); ++i) {
|
|
TensorT* tensor = subgraph->tensors[subgraph->outputs[i]].get();
|
|
// TODO(suharshs): Add support for this case if it ever comes up.
|
|
if (tensor->type == TensorType_FLOAT32 && output_type != tensor->type) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Unsupported output type %s for output tensor '%s' of type %s.",
|
|
EnumNameTensorType(output_type), tensor->name.c_str(),
|
|
EnumNameTensorType(tensor->type));
|
|
return kTfLiteError;
|
|
}
|
|
const int32_t output_idx = SetOutputType(
|
|
model, subgraph, subgraph->outputs[i], output_type, activations_type);
|
|
if (output_idx < 0) {
|
|
continue;
|
|
}
|
|
subgraph->outputs[i] = output_idx;
|
|
}
|
|
}
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
// Apply constraints to ops if they have any.
|
|
// We have made the restriction that for int8 quantized concat, minimum, and
|
|
// maximum, the inputs and outputs must have the same scale and zero point.
|
|
// The other ones with constraints are handled in QuantizeWeightsAndInput.
|
|
TfLiteStatus ApplyConstraints(
|
|
ModelT* model, const std::unordered_set<string>& operator_names,
|
|
const std::unordered_set<string>& real_value_op_set,
|
|
TensorType activations_type, ErrorReporter* error_reporter) {
|
|
for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
// Iterate backward to avoid messing with index.
|
|
for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) {
|
|
OperatorT* op = subgraph->operators[op_idx].get();
|
|
const string operator_name = subgraph->tensors[op->outputs[0]]->name;
|
|
operator_property::OperatorProperty property =
|
|
GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
|
|
operator_name, activations_type);
|
|
if (!property.quantizable ||
|
|
!IsRealValueOp(real_value_op_set, operator_name)) {
|
|
continue;
|
|
}
|
|
if (!property.arbitrary_inputs ||
|
|
!property.restrict_same_input_output_scale) {
|
|
continue;
|
|
}
|
|
      // If requantization is needed, use the min of the mins and the max of
      // the maxes, which amounts to using the scale and zero point of the
      // output tensor.
|
|
TensorT* output_tensor = subgraph->tensors[op->outputs[0]].get();
|
|
if (!utils::QuantizationParametersExist(output_tensor)) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Unable to get scale or zero point from the tensor at %d.",
|
|
op->outputs[0]);
|
|
return kTfLiteError;
|
|
}
|
|
const float output_scale = output_tensor->quantization->scale[0];
|
|
const float output_zp = output_tensor->quantization->zero_point[0];
|
|
for (size_t input_idx = 0; input_idx < op->inputs.size(); ++input_idx) {
|
|
TensorT* input_tensor = subgraph->tensors[op->inputs[input_idx]].get();
|
|
if (!utils::QuantizationParametersExist(input_tensor)) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Unable to get scale or zero point from tensor at %d.",
|
|
op->inputs[input_idx]);
|
|
return kTfLiteError;
|
|
}
|
|
if (input_tensor->quantization->scale[0] == output_scale &&
|
|
input_tensor->quantization->zero_point[0] == output_zp) {
|
|
// This input does not need to be requantized.
|
|
continue;
|
|
}
|
|
|
|
std::unique_ptr<TensorT> additional_tensor;
|
|
const string requant_tensor_name = input_tensor->name + "_requantized";
|
|
utils::MakeTensorWithQuantParam(
|
|
requant_tensor_name, input_tensor->shape,
|
|
input_tensor->shape_signature, activations_type, output_scale,
|
|
output_zp, &additional_tensor);
|
|
const int32_t additional_tensor_idx = subgraph->tensors.size();
|
|
subgraph->tensors.push_back(std::move(additional_tensor));
|
|
|
|
        // Add a requant op before this input.
        // There are better ways to handle this, e.g. trying to push the
        // rescale upstream recursively in the hope that all upstream ops can
        // absorb it, and only adding the requant op when there is no other
        // way.
|
|
std::unique_ptr<OperatorT> requant_op;
|
|
utils::MakeQuantizeOperator(model, &requant_op, op->inputs[input_idx],
|
|
additional_tensor_idx);
|
|
op->inputs[input_idx] = additional_tensor_idx;
|
|
|
|
subgraph->operators.insert(subgraph->operators.begin() + op_idx,
|
|
std::move(requant_op));
|
|
}
|
|
}
|
|
}
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
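// Returns the (input index, tensor property) pairs that should be considered
// for quantization. If the op takes an arbitrary number of inputs, or is not
// quantizable, every input is returned with a default property.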
std::vector<std::pair<int, operator_property::TensorProperty>> GetInputs(
|
|
const OperatorT* op, operator_property::OperatorProperty property) {
|
|
std::vector<std::pair<int, operator_property::TensorProperty>> inputs;
|
|
if (property.arbitrary_inputs || !property.quantizable) {
|
|
for (int i = 0; i < op->inputs.size(); ++i) {
|
|
inputs.push_back({i, {}});
|
|
}
|
|
} else {
|
|
inputs = property.inputs;
|
|
}
|
|
return inputs;
|
|
}
|
|
|
|
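// Returns the (output index, tensor property) pairs that should be considered
// for quantization. If the op produces an arbitrary number of outputs, every
// output is returned with a default property.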
std::vector<std::pair<int, operator_property::TensorProperty>> GetOutputs(
|
|
const OperatorT* op, operator_property::OperatorProperty property) {
|
|
std::vector<std::pair<int, operator_property::TensorProperty>> outputs;
|
|
if (property.arbitrary_outputs) {
|
|
for (int i = 0; i < op->outputs.size(); ++i) {
|
|
outputs.push_back({i, {}});
|
|
}
|
|
} else {
|
|
outputs = property.outputs;
|
|
}
|
|
return outputs;
|
|
}
|
|
|
|
bool ShouldRestrictSameInputOutputScale(
|
|
operator_property::OperatorProperty property) {
|
|
  // Ops with arbitrarily many inputs (e.g. concat, maximum, and minimum) are
  // restricted in ApplyConstraints instead.
|
|
return (!property.arbitrary_inputs &&
|
|
property.restrict_same_input_output_scale);
|
|
}
|
|
|
|
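// Returns true if the tensor at `index` is listed as an input of the
// subgraph.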
bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
|
|
for (const int32_t input_idx : subgraph->inputs) {
|
|
if (index == input_idx) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Quantize the op input. Will increment op_idx if ops are added.
|
|
TfLiteStatus QuantizeOpInput(
|
|
ModelT* model, int32_t subgraph_idx, size_t* op_idx,
|
|
operator_property::OperatorProperty property,
|
|
const std::pair<int32_t, operator_property::TensorProperty>& input,
|
|
const TensorType& activations_type, ErrorReporter* error_reporter) {
|
|
int32_t input_idx = input.first;
|
|
operator_property::TensorProperty tensor_property = input.second;
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
OperatorT* op = subgraph->operators[*op_idx].get();
|
|
const BuiltinOperator op_code =
|
|
model->operator_codes[op->opcode_index]->builtin_code;
|
|
if (input_idx >= op->inputs.size()) {
|
|
    TF_LITE_REPORT_ERROR(
        error_reporter,
        "Required input index %d is larger than the input length %d of op "
        "%s at index %d in subgraph %d",
        input_idx, op->inputs.size(), EnumNameBuiltinOperator(op_code), *op_idx,
        subgraph_idx);
|
|
return kTfLiteError;
|
|
}
|
|
const int32_t tensor_idx = op->inputs[input_idx];
|
|
if (tensor_idx == -1) {
|
|
// Skip optional tensor.
|
|
return kTfLiteOk;
|
|
}
|
|
TensorT* tensor = subgraph->tensors[tensor_idx].get();
|
|
// Assumes op is quantized to int8.
|
|
const bool is_input_quantized = utils::QuantizationParametersExist(tensor);
|
|
if (property.quantizable && !is_input_quantized) {
|
|
// The operation is quantizable, but the input isn't yet quantized.
|
|
if (utils::HasBuffer(model, subgraph, tensor_idx)) {
|
|
// TODO(suharshs): Look at consumers, throw error if one consumer is
|
|
// per-channel and one per-layer.
|
|
bool quantize_const_input = property.quantize_input_as_activations &&
|
|
activations_type == TensorType_INT16;
|
|
if (tensor_property.number_of_bits == 8 && !quantize_const_input) {
|
|
if (tensor_property.use_derived_scale) {
|
|
// Currently 8bit tensors in input do not accept derived scale.
|
|
return kTfLiteError;
|
|
}
|
|
if (utils::QuantizeWeight(model, tensor, tensor_property.per_axis,
|
|
tensor_property.per_axis_index,
|
|
error_reporter) != kTfLiteOk) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Unable to quantize buffer or min/max value for input %d "
|
|
"in op %s in subgraph %d, node: %d",
|
|
input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
|
|
*op_idx);
|
|
return kTfLiteError;
|
|
}
|
|
} else if (tensor_property.number_of_bits == 16 || quantize_const_input) {
|
|
if (tensor_property.use_derived_scale) {
|
|
// Currently 16bit tensors in input do not accept derived scale.
|
|
return kTfLiteError;
|
|
}
|
|
TensorT* tensor = subgraph->tensors[tensor_idx].get();
|
|
int total_size = 1;
|
|
for (int i = 0; i < tensor->shape.size(); ++i) {
|
|
total_size *= tensor->shape[i];
|
|
}
|
|
BufferT* buffer = model->buffers[tensor->buffer].get();
|
|
float* float_data = reinterpret_cast<float*>(buffer->data.data());
|
|
auto minmax = std::minmax_element(float_data, float_data + total_size);
|
|
const float min = *minmax.first;
|
|
const float max = *minmax.second;
|
|
const float range = std::max(std::abs(min), std::abs(max));
|
|
// The narrow range quantized value for int16.
|
|
const float quantize_range = 32767.0;
|
|
const float scale = range / quantize_range;
|
|
return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
|
|
error_reporter);
|
|
} else if (tensor_property.number_of_bits == 32) {
|
|
if (!tensor_property.use_derived_scale) {
|
|
// Currently 32 bit tensors in input only accept derived scale.
|
|
return kTfLiteError;
|
|
}
|
|
TensorT* tensor = subgraph->tensors[tensor_idx].get();
|
|
const float scale = utils::GetEffectiveScale(
|
|
model, subgraph, *op_idx,
|
|
tensor_property.derived_scale.input_tensors,
|
|
tensor_property.derived_scale.intermediate_tensors,
|
|
tensor_property.derived_scale.factors);
|
|
return utils::SymmetricPerLayerBiasQuantize<std::int32_t>(
|
|
model, tensor, scale, error_reporter);
|
|
|
|
} else if (tensor_property.number_of_bits == 10) {
|
|
// When the number of bits is 10 (instead of 16), quantize the tensor to
|
|
// [-512, 512], instead of [-32767, 32767].
|
|
TensorT* tensor = subgraph->tensors[tensor_idx].get();
|
|
int total_size = 1;
|
|
for (int i = 0; i < tensor->shape.size(); ++i) {
|
|
total_size *= tensor->shape[i];
|
|
}
|
|
BufferT* buffer = model->buffers[tensor->buffer].get();
|
|
float* buffer_data = reinterpret_cast<float*>(buffer->data.data());
|
|
auto minmax =
|
|
std::minmax_element(buffer_data, buffer_data + total_size);
|
|
const float range =
|
|
std::max(std::abs(*minmax.first), std::abs(*minmax.second));
|
|
const float quantized_range = 512.0;
|
|
const float scale = range / quantized_range;
|
|
return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
|
|
error_reporter);
|
|
} else {
|
|
// Only 8, 16, 32, 10 are supported.
|
|
// TODO(jianlijianli): extend this to support arbitrary bits.
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Unable to quantize buffer or min/max value for input %d "
|
|
"in op %s in subgraph %d, node: %d",
|
|
input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, *op_idx);
|
|
return kTfLiteError;
|
|
}
|
|
} else if (utils::HasMinMax(tensor)) {
|
|
if (IsSubgraphInput(subgraph, tensor_idx) ||
|
|
tensor_property.state_tensor) {
|
|
if (tensor_property.number_of_bits == 8) {
|
|
if (tensor_property.use_derived_scale) {
|
|
// Currently 8bit tensors in input do not accept derived scale.
|
|
return kTfLiteError;
|
|
}
|
|
TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
|
|
tensor, activations_type, error_reporter));
|
|
} else if (tensor_property.number_of_bits == 16) {
|
|
TensorT* tensor = subgraph->tensors[tensor_idx].get();
|
|
float quantized_range = 32767.0;
|
|
float range = std::max(std::abs(tensor->quantization->min[0]),
|
|
std::abs(tensor->quantization->max[0]));
|
|
if (tensor_property.extend_to_power_of_two) {
|
|
const int power_of_two_scale = utils::GetPowerOfTwoScale(
|
|
tensor->quantization->min[0], tensor->quantization->max[0]);
|
|
range = std::pow(2, power_of_two_scale);
|
|
quantized_range = 32768.0;
|
|
}
|
|
const float scale = range / quantized_range;
|
|
utils::QuantizeActivationToInt16(tensor, scale);
|
|
}
|
|
} else {
|
|
        // If the tensor is not a model input, we need to add a Quantize
        // operation rather than quantizing this tensor in place, since the
        // preceding op may still need to produce a float output.
|
|
std::string type_string =
|
|
activations_type == TensorType_INT16 ? "int16" : "int8";
|
|
std::unique_ptr<TensorT> op_output;
|
|
utils::MakeTensor(tensor->name + "_" + type_string, tensor->shape,
|
|
tensor->shape_signature, activations_type,
|
|
&op_output);
|
|
op_output->quantization = absl::make_unique<QuantizationParametersT>();
|
|
op_output->quantization->min.push_back(tensor->quantization->min[0]);
|
|
op_output->quantization->max.push_back(tensor->quantization->max[0]);
|
|
TF_LITE_ENSURE_STATUS(utils::QuantizeActivation(
|
|
op_output.get(), activations_type, error_reporter));
|
|
const int32_t quant_op_output_idx = subgraph->tensors.size();
|
|
subgraph->tensors.push_back(std::move(op_output));
|
|
std::unique_ptr<OperatorT> quant_op;
|
|
utils::MakeQuantizeOperator(model, &quant_op, tensor_idx,
|
|
quant_op_output_idx);
|
|
subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
|
|
std::move(quant_op));
|
|
op->inputs[input_idx] = quant_op_output_idx;
|
|
*op_idx += 1;
|
|
}
|
|
} else {
|
|
TF_LITE_REPORT_ERROR(error_reporter,
|
|
"Unable to find buffer or min/max value for input "
|
|
"%d in %s in subgraph %d, node: %d",
|
|
input_idx, EnumNameBuiltinOperator(op_code),
|
|
subgraph_idx, *op_idx);
|
|
return kTfLiteError;
|
|
}
|
|
} else if (!property.quantizable && is_input_quantized) {
|
|
    // If the input tensor is already quantized but this op is not
    // quantizable, insert a Dequantize op in front of this input so the op
    // receives float data.
|
|
std::unique_ptr<TensorT> op_output;
|
|
utils::MakeTensor(tensor->name + "_float", tensor->shape,
|
|
tensor->shape_signature, TensorType_FLOAT32, &op_output);
|
|
const int32_t dequant_op_output_idx = subgraph->tensors.size();
|
|
subgraph->tensors.push_back(std::move(op_output));
|
|
std::unique_ptr<OperatorT> dequant_op;
|
|
utils::MakeDequantizeOperator(model, &dequant_op, tensor_idx,
|
|
dequant_op_output_idx);
|
|
subgraph->operators.insert(subgraph->operators.begin() + *op_idx,
|
|
std::move(dequant_op));
|
|
op->inputs[input_idx] = dequant_op_output_idx;
|
|
*op_idx += 1;
|
|
}
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
// Quantize the op output.
|
|
TfLiteStatus QuantizeOpOutput(
|
|
ModelT* model, int32_t subgraph_idx, int32_t op_idx,
|
|
operator_property::OperatorProperty property,
|
|
const std::pair<int32_t, operator_property::TensorProperty>& output,
|
|
TensorType activations_type, ErrorReporter* error_reporter) {
|
|
int32_t output_idx = output.first;
|
|
operator_property::TensorProperty tensor_property = output.second;
|
|
// If the operator is not quantizable, we don't need to do anything for the
|
|
// output.
|
|
if (!property.quantizable) {
|
|
return kTfLiteOk;
|
|
}
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
OperatorT* op = subgraph->operators[op_idx].get();
|
|
const BuiltinOperator op_code =
|
|
model->operator_codes[op->opcode_index]->builtin_code;
|
|
if (output_idx >= op->outputs.size()) {
|
|
    TF_LITE_REPORT_ERROR(
        error_reporter,
        "Required output index %d is larger than the output length %d of "
        "op %s at index %d in subgraph %d",
        output_idx, op->outputs.size(), EnumNameBuiltinOperator(op_code),
        op_idx, subgraph_idx);
|
|
return kTfLiteError;
|
|
}
|
|
|
|
TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
|
|
if (utils::QuantizationParametersExist(output_tensor)) {
|
|
// Skip output if it has been quantized.
|
|
return kTfLiteOk;
|
|
}
|
|
if (ShouldRestrictSameInputOutputScale(property)) {
|
|
    // Copy the input's quantization parameters. For average pool, max pool,
    // etc., the input and output min/max can differ, but we want them to
    // share the same scale and zero point.
|
|
// Get scale and zero point of input.
|
|
if (property.inputs[0].first >= op->inputs.size()) {
|
|
      TF_LITE_REPORT_ERROR(
          error_reporter,
          "Required input index %d is larger than the input length %d of "
          "op %s at index %d in subgraph %d",
          property.inputs[0].first, op->inputs.size(),
          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
|
|
return kTfLiteError;
|
|
}
|
|
const int input_tensor_idx = op->inputs[property.inputs[0].first];
|
|
TensorT* input_tensor = subgraph->tensors[input_tensor_idx].get();
|
|
if (input_tensor->quantization->scale.size() != 1 ||
|
|
input_tensor->quantization->zero_point.size() != 1) {
|
|
TF_LITE_REPORT_ERROR(error_reporter,
|
|
"Invalid quantization params for op %s at index %d "
|
|
"in subgraph %d",
|
|
EnumNameBuiltinOperator(op_code), op_idx,
|
|
subgraph_idx);
|
|
return kTfLiteError;
|
|
}
|
|
|
|
const float input_scale = input_tensor->quantization->scale[0];
|
|
const int32_t input_zero_point = input_tensor->quantization->zero_point[0];
|
|
|
|
// Apply to output.
|
|
output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
|
|
output_tensor->quantization->scale.push_back(input_scale);
|
|
output_tensor->quantization->zero_point.push_back(input_zero_point);
|
|
if (!input_tensor->quantization->min.empty()) {
|
|
const float min = input_tensor->quantization->min[0];
|
|
output_tensor->quantization->min = {min};
|
|
}
|
|
if (!input_tensor->quantization->max.empty()) {
|
|
const float max = input_tensor->quantization->max[0];
|
|
output_tensor->quantization->max = {max};
|
|
}
|
|
output_tensor->type = activations_type;
|
|
} else if (tensor_property.restriction) {
|
|
const auto scale_and_zp = activations_type == TensorType_INT16
|
|
? tensor_property.restricted_value_int16
|
|
: tensor_property.restricted_value_int8;
|
|
|
|
// Apply to output.
|
|
output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
|
|
output_tensor->quantization->scale.push_back(scale_and_zp.first);
|
|
output_tensor->quantization->zero_point.push_back(scale_and_zp.second);
|
|
output_tensor->type = activations_type;
|
|
} else {
|
|
// Process regular output that doesn't have any restrictions.
|
|
if (utils::HasMinMax(output_tensor)) {
|
|
utils::QuantizeActivation(output_tensor, activations_type,
|
|
error_reporter);
|
|
} else {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Unable to find min/max value for output %d in %s in "
|
|
"subgraph %d, node: %d",
|
|
output_idx, EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
|
|
return kTfLiteError;
|
|
}
|
|
}
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
TfLiteStatus QuantizeIntemediateTensors(ModelT* model,
|
|
TensorType activations_type,
|
|
ErrorReporter* error_reporter) {
|
|
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
|
|
operator_property::OperatorProperty property =
|
|
operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
|
|
if (!property.intermediates.empty()) {
|
|
OperatorT* op = subgraph->operators[op_idx].get();
|
|
const BuiltinOperator op_code =
|
|
model->operator_codes[op->opcode_index]->builtin_code;
|
|
for (const std::pair<int, operator_property::TensorProperty>& input :
|
|
property.intermediates) {
|
|
const int index_local = input.first;
|
|
const int index_global = op->intermediates[index_local];
|
|
if (index_global == -1) {
|
|
// Skip optional tensor.
|
|
continue;
|
|
}
|
|
if (input.second.number_of_bits == 8 &&
|
|
input.second.symmetric == false) {
|
|
TensorT* tensor = subgraph->tensors[index_global].get();
|
|
if (utils::HasMinMax(tensor)) {
|
|
utils::QuantizeActivation(tensor, activations_type,
|
|
error_reporter);
|
|
} else {
|
|
              TF_LITE_REPORT_ERROR(
                  error_reporter,
                  "Unable to find min/max value for intermediate tensor %d in "
                  "%s in subgraph %d, node: %d",
                  index_global, EnumNameBuiltinOperator(op_code), subgraph_idx,
                  op_idx);
|
|
return kTfLiteError;
|
|
}
|
|
} else if (input.second.number_of_bits == 16 &&
|
|
input.second.symmetric == true) {
|
|
TensorT* tensor = subgraph->tensors[index_global].get();
|
|
if (tensor->quantization == nullptr) {
|
|
continue;
|
|
}
|
|
const float min = tensor->quantization->min[0];
|
|
const float max = tensor->quantization->max[0];
|
|
const float range = std::max(std::abs(min), std::abs(max));
|
|
if (range < 1e-8) {
|
|
return kTfLiteError;
|
|
}
|
|
|
|
// Get scale and zero point.
|
|
const float quantized_range = 32767.0;
|
|
const float scale = range / quantized_range;
|
|
utils::QuantizeActivationToInt16(tensor, scale);
|
|
} else {
|
|
return kTfLiteError;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
// Quantizes tensors that share a range. For example, in LSTM the output
// tensor and the input state tensor should share the same range because they
// use the same scale and zero point.
// We have to model this explicitly because the output is modeled as an extra
// tensor in LSTM. In the calibrator, state tensors are logged both before and
// after the inference, so their range is fully captured. The output, although
// identical to the activation, is not a state tensor, so the input value
// (range) of the very first inference is not captured for it.
|
|
TfLiteStatus QuantizeSharedRange(ModelT* model, ErrorReporter* error_reporter) {
|
|
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
|
|
operator_property::OperatorProperty property =
|
|
operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
|
|
if (!property.intermediates.empty()) {
|
|
OperatorT* op = subgraph->operators[op_idx].get();
|
|
for (const std::vector<int>& input : property.restrict_scale) {
|
|
if (input.empty()) {
|
|
continue;
|
|
}
|
|
          // Currently only pairs of two tensors are supported.
          // TODO(jianlijianli): extend to an arbitrary number of tensors.
|
|
if (input.size() != 2) {
|
|
return kTfLiteError;
|
|
}
|
|
const int index_1 = input[0];
|
|
const int index_2 = input[1];
|
|
// TODO(jianlijianli): model input/output.
|
|
TensorT* tensor_1 = subgraph->tensors[op->inputs[index_1]].get();
|
|
TensorT* tensor_2 = subgraph->tensors[op->outputs[index_2]].get();
|
|
const float min_of_min = std::min(tensor_1->quantization->min[0],
|
|
tensor_2->quantization->min[0]);
|
|
const float max_of_max = std::max(tensor_1->quantization->max[0],
|
|
tensor_2->quantization->max[0]);
|
|
if (min_of_min == 0.0 && max_of_max == 0.0) {
|
|
return kTfLiteError;
|
|
}
|
|
|
|
          // Asymmetric quantization to 8 bits.
|
|
auto quantization_params =
|
|
absl::make_unique<QuantizationParametersT>();
|
|
utils::GetAsymmetricQuantizationParams(
|
|
min_of_min, max_of_max, -128, 127, quantization_params.get());
|
|
|
|
// Populate both tensors with the same parameters.
|
|
const float scale = quantization_params->scale[0];
|
|
const int32 zero_point = quantization_params->zero_point[0];
|
|
for (TensorT* tensor : {tensor_1, tensor_2}) {
|
|
tensor->quantization = absl::make_unique<QuantizationParametersT>();
|
|
tensor->quantization->scale.push_back(scale);
|
|
tensor->quantization->zero_point.push_back(zero_point);
|
|
tensor->type = TensorType_INT8;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
// Quantizes inputs and weights.
// Because of ops such as LSTM, this still needs to be done per op rather than
// simply iterating over all weight tensors.
|
|
TfLiteStatus QuantizeWeightsInputOutput(
|
|
ModelT* model, bool allow_float,
|
|
const std::unordered_set<string>& operator_names,
|
|
const std::unordered_set<string>& real_value_op_set,
|
|
const TensorType& activations_type, ErrorReporter* error_reporter) {
|
|
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
|
|
OperatorT* op = subgraph->operators[op_idx].get();
|
|
const BuiltinOperator op_code =
|
|
model->operator_codes[op->opcode_index]->builtin_code;
|
|
const string operator_name = subgraph->tensors[op->outputs[0]]->name;
|
|
operator_property::OperatorProperty property =
|
|
GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
|
|
operator_name, activations_type);
|
|
if (!IsRealValueOp(real_value_op_set, operator_name)) {
|
|
continue;
|
|
}
|
|
|
|
if (activations_type == TensorType_INT16 && !property.quantizable &&
|
|
!allow_float) {
|
|
        TF_LITE_REPORT_ERROR(
            error_reporter,
            "Quantization to 16x8-bit not yet supported for op: %s",
            EnumNameBuiltinOperator(op_code));
|
|
return kTfLiteError;
|
|
} else if (!property.quantizable && !allow_float) {
|
|
        TF_LITE_REPORT_ERROR(error_reporter,
                             "Quantization not yet supported for op: %s",
                             EnumNameBuiltinOperator(op_code));
|
|
return kTfLiteError;
|
|
}
|
|
|
|
// Quantize operator inputs/weights.
|
|
for (const std::pair<int, operator_property::TensorProperty>& input :
|
|
GetInputs(op, property)) {
|
|
TF_LITE_ENSURE_STATUS(QuantizeOpInput(model, subgraph_idx, &op_idx,
|
|
property, input, activations_type,
|
|
error_reporter));
|
|
}
|
|
|
|
// Quantize operator outputs.
|
|
for (const std::pair<int, operator_property::TensorProperty>& output :
|
|
GetOutputs(op, property)) {
|
|
TF_LITE_ENSURE_STATUS(
|
|
QuantizeOpOutput(model, subgraph_idx, op_idx, property, output,
|
|
activations_type, error_reporter));
|
|
}
|
|
}
|
|
}
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
// Quantizes constant bias tensors using the scales of the corresponding
// input and weight tensors.
|
|
TfLiteStatus QuantizeBiases(ModelT* model,
|
|
const std::unordered_set<string>& operator_names,
|
|
const std::unordered_set<string>& real_value_op_set,
|
|
const TensorType& activations_type,
|
|
ErrorReporter* error_reporter) {
|
|
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
|
|
OperatorT* op = subgraph->operators[op_idx].get();
|
|
const BuiltinOperator op_code =
|
|
model->operator_codes[op->opcode_index]->builtin_code;
|
|
const string operator_name = subgraph->tensors[op->outputs[0]]->name;
|
|
operator_property::OperatorProperty property =
|
|
GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
|
|
operator_name, activations_type);
|
|
if (!property.quantizable ||
|
|
!IsRealValueOp(real_value_op_set, operator_name)) {
|
|
continue;
|
|
}
|
|
for (const int bias_idx : property.biases) {
|
|
if (bias_idx >= op->inputs.size() ||
|
|
op->inputs[bias_idx] == kTfLiteOptionalTensor) {
|
|
continue;
|
|
}
|
|
// Quantize if it is not quantized already as the
|
|
// output of another op or input of another op.
|
|
TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
|
|
if (!utils::QuantizationParametersExist(bias_tensor)) {
|
|
if (utils::HasBuffer(model, subgraph, op->inputs[bias_idx])) {
|
|
if (property.inputs.size() != 2) {
|
|
              TF_LITE_REPORT_ERROR(
                  error_reporter,
                  "Expect the input length of op %s at index %d in subgraph "
                  "%d to be 2",
                  EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
|
|
return kTfLiteError;
|
|
}
|
|
TensorT* input_tensor =
|
|
subgraph->tensors[op->inputs[property.inputs[0].first]].get();
|
|
TensorT* weight_tensor =
|
|
subgraph->tensors[op->inputs[property.inputs[1].first]].get();
|
|
operator_property::TensorProperty weight_property =
|
|
property.inputs[1].second;
|
|
TF_LITE_ENSURE_STATUS(QuantizeBias(
|
|
model, input_tensor, weight_tensor, bias_tensor,
|
|
weight_property.per_axis, weight_property.per_axis_index,
|
|
activations_type, error_reporter));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
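// Returns the names of all tensors in the model. Used as the operator_names
// set when every operator should be quantized.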
std::unordered_set<string> GetAllOperatorOutputs(ModelT* model) {
|
|
std::unordered_set<string> operator_names;
|
|
for (int32_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
for (int32_t tensor_idx = 0; tensor_idx < subgraph->tensors.size();
|
|
tensor_idx++) {
|
|
operator_names.insert(subgraph->tensors[tensor_idx]->name);
|
|
}
|
|
}
|
|
return operator_names;
|
|
}

// Populates the max and min quantization parameters for input tensors.
// Assumes that dynamic tensors already have stored min/max values and throws
// an error if a tensor does not have min/max quantization parameters or a
// buffer.
// If a static tensor is not an input to any operation, its max/min values
// will not be filled by this function.
TfLiteStatus FillQuantizationParams(
|
|
ModelT* model, const std::unordered_set<string>& operator_names,
|
|
const std::unordered_set<string>& real_value_op_set,
|
|
const TensorType& activations_type, ErrorReporter* error_reporter) {
|
|
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
|
|
OperatorT* op = subgraph->operators[op_idx].get();
|
|
const string operator_name = subgraph->tensors[op->outputs[0]]->name;
|
|
operator_property::OperatorProperty property =
|
|
GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
|
|
operator_name, activations_type);
|
|
if (!IsRealValueOp(real_value_op_set, operator_name)) {
|
|
continue;
|
|
}
|
|
|
|
// Populate max, min for each input tensor.
|
|
for (const std::pair<int, operator_property::TensorProperty>& input :
|
|
property.inputs) {
|
|
// Get tensor.
|
|
const int32_t input_idx = input.first;
|
|
const int32_t tensor_idx = op->inputs[input_idx];
|
|
if (tensor_idx == -1) {
|
|
// Skip optional tensor.
|
|
continue;
|
|
}
|
|
TensorT* tensor = subgraph->tensors[tensor_idx].get();
|
|
|
|
// Static tensor.
|
|
if (!utils::HasMinMax(tensor) &&
|
|
utils::HasBuffer(model, subgraph, tensor_idx)) {
|
|
// Get input float data and tensor dimensions.
|
|
const BufferT* buffer = model->buffers[tensor->buffer].get();
|
|
const float* float_input_data =
|
|
reinterpret_cast<const float*>(buffer->data.data());
|
|
|
|
if (tensor->quantization == nullptr) {
|
|
tensor->quantization = absl::make_unique<QuantizationParametersT>();
|
|
}
|
|
|
|
// Fill per channel max and min with respect to channel_dim_index.
|
|
if (input.second.per_axis) {
|
|
if (tensor->shape.size() == 4) {
|
|
int32_t channel_dim_index = input.second.per_axis_index;
|
|
TF_LITE_ENSURE_STATUS(utils::FillPerChannelMinMax(
|
|
float_input_data, tensor->shape, channel_dim_index,
|
|
tensor->quantization.get(), error_reporter));
|
|
} else {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Could not fill max min for tensor as the dimension is %d "
|
|
"and not 4 as expected.",
|
|
tensor->shape.size());
|
|
return kTfLiteError;
|
|
}
|
|
|
|
// Fill per layer max and min.
|
|
} else if (!utils::HasMinMax(tensor) && !input.second.per_axis &&
|
|
utils::HasBuffer(model, subgraph, tensor_idx)) {
|
|
uint64_t input_size;
|
|
TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &input_size));
|
|
utils::FillSingleMinMax(float_input_data, input_size,
|
|
tensor->quantization.get());
|
|
}
|
|
if (tensor->quantization->quantized_dimension !=
|
|
input.second.per_axis_index) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Quantized dimension for tensor property and quantization "
|
|
"parameters do not match. Got %d and %d respectively.",
|
|
input.second.per_axis_index,
|
|
tensor->quantization->quantized_dimension);
|
|
return kTfLiteError;
|
|
}
|
|
|
|
// Dynamic tensor.
|
|
} else if (!utils::HasMinMax(tensor) &&
|
|
!utils::HasBuffer(model, subgraph, tensor_idx)) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Max and min for dynamic tensors should be"
|
|
" recorded during calibration: Failed for tensor %s\n",
|
|
tensor->name.c_str());
|
|
if (tensor->quantization == nullptr) {
|
|
TF_LITE_REPORT_ERROR(error_reporter,
|
|
"No quantization params for tensor %s",
|
|
tensor->name.c_str());
|
|
} else if (tensor->quantization->min.empty() ||
|
|
tensor->quantization->max.empty()) {
|
|
TF_LITE_REPORT_ERROR(error_reporter, "Empty min/max for tensor %s",
|
|
tensor->name.c_str());
|
|
}
|
|
return kTfLiteError;
|
|
}
|
|
|
|
if (utils::QuantizationParametersExist(tensor)) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Scale and zero points should not be recorded before "
|
|
"quantization.");
|
|
return kTfLiteError;
|
|
}
|
|
} // loop over op inputs
|
|
} // loop over ops
|
|
} // loop over subgraphs
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
// Check compatibility of activation, weight and bias scales. Adjust if needed.
|
|
TfLiteStatus EnsureBiasScaleCompatibility(
|
|
ModelT* model, const std::unordered_set<string>& operator_names,
|
|
const std::unordered_set<string>& real_value_op_set,
|
|
const TensorType& activations_type, ErrorReporter* error_reporter) {
|
|
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
|
|
subgraph_idx++) {
|
|
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
|
|
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
|
|
OperatorT* op = subgraph->operators[op_idx].get();
|
|
const string operator_name = subgraph->tensors[op->outputs[0]]->name;
|
|
operator_property::OperatorProperty property =
|
|
GetOperatorProperty(operator_names, model, subgraph_idx, op_idx,
|
|
operator_name, activations_type);
|
|
if (!IsRealValueOp(real_value_op_set, operator_name)) {
|
|
continue;
|
|
}
|
|
|
|
// Loop over all bias tensors.
|
|
for (const int bias_idx : property.biases) {
|
|
if (bias_idx >= op->inputs.size() ||
|
|
op->inputs[bias_idx] == kTfLiteOptionalTensor) {
|
|
continue;
|
|
}
|
|
TensorT* bias_tensor = subgraph->tensors[op->inputs[bias_idx]].get();
|
|
int32_t channel_dim_size = bias_tensor->shape[0];
|
|
if (bias_tensor->shape.size() != 1) {
|
|
TF_LITE_REPORT_ERROR(error_reporter,
|
|
"Expected bias tensor to be a vector.");
|
|
return kTfLiteError;
|
|
}
|
|
|
|
if (property.inputs.size() != 2) { // Only works for two input tensors.
|
|
          TF_LITE_REPORT_ERROR(
              error_reporter,
              "Expect the number of inputs %d for op at index %d in subgraph "
              "%d to be 2",
              property.inputs.size(), op_idx, subgraph_idx);
|
|
return kTfLiteError;
|
|
}
|
|
|
|
if (!property.arbitrary_inputs && property.quantizable) {
|
|
// Get input and weight tensors.
|
|
TensorT* input_tensor =
|
|
subgraph->tensors[op->inputs[property.inputs[0].first]].get();
|
|
TensorT* weight_tensor =
|
|
subgraph->tensors[op->inputs[property.inputs[1].first]].get();
|
|
operator_property::TensorProperty weight_property =
|
|
property.inputs[1].second;
|
|
TF_LITE_ENSURE(error_reporter, input_tensor->quantization);
|
|
|
|
// Check quantization parameters exist for input.
|
|
if (!utils::HasMinMax(input_tensor)) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Input tensor missing quantization information. Should be "
|
|
"populated during calibration.");
|
|
return kTfLiteError;
|
|
}
|
|
|
|
// Get input scale for asymmetric quantization.
|
|
QuantizationParametersT temp_quant_params = QuantizationParametersT();
|
|
TF_LITE_ENSURE_STATUS(
|
|
utils::GetQuantizationParams(input_tensor, activations_type,
|
|
&temp_quant_params, error_reporter));
|
|
if (temp_quant_params.scale.size() != 1) {
|
|
TF_LITE_REPORT_ERROR(error_reporter,
|
|
"Unexpected input quantization scale size.");
|
|
return kTfLiteError;
|
|
}
|
|
float input_scale = temp_quant_params.scale[0];
|
|
|
|
// Check that max/min values have been filled for weights.
|
|
if (!utils::HasMinMax(weight_tensor)) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Min and/or max values have not been recorded for weight "
|
|
"tensor. This should have happened in FillQuantizationParams.");
|
|
return kTfLiteError;
|
|
}
|
|
|
|
// Ensure the tensor dimensions are compatible.
|
|
if (weight_property.per_axis) {
|
|
if (bias_tensor->shape[0] !=
|
|
weight_tensor->shape[weight_property.per_axis_index]) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Channel mismatch between bias and weight tensors %d vs %d",
|
|
bias_tensor->shape[0],
|
|
weight_tensor->shape[weight_property.per_axis_index]);
|
|
return kTfLiteError;
|
|
}
|
|
// Ensure that the number of max/mins matches the channel_dim_size.
|
|
if (weight_tensor->quantization->max.size() != channel_dim_size) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Mismatch between number of weight maxs and channels: %d vs "
|
|
"%d",
|
|
weight_tensor->quantization->max.size(), channel_dim_size);
|
|
return kTfLiteError;
|
|
}
|
|
if (weight_tensor->quantization->min.size() != channel_dim_size) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Mismatch between number of weight mins and channels: %d",
|
|
weight_tensor->quantization->min.size());
|
|
return kTfLiteError;
|
|
}
|
|
}
|
|
|
|
// Get data and size of bias tensor.
|
|
const BufferT* buffer = model->buffers[bias_tensor->buffer].get();
|
|
const float* bias_data =
|
|
reinterpret_cast<const float*>(buffer->data.data());
|
|
uint64_t bias_size;
|
|
TF_LITE_ENSURE_STATUS(utils::NumElements(*bias_tensor, &bias_size));
|
|
|
|
// Adjust weight scales if needed.
|
|
TF_LITE_ENSURE_STATUS(utils::AdjustWeightsForBiasScale(
|
|
weight_tensor->quantization.get(), bias_data, bias_size,
|
|
input_scale, error_reporter));
|
|
|
|
if (utils::QuantizationParametersExist(weight_tensor)) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Scale and zero points should not be recorded for the weight "
|
|
"tensor before quantization.");
|
|
return kTfLiteError;
|
|
}
|
|
if (utils::QuantizationParametersExist(input_tensor)) {
|
|
TF_LITE_REPORT_ERROR(
|
|
error_reporter,
|
|
"Scale and zero points should not be recorded for the input "
|
|
"tensor before quantization.");
|
|
return kTfLiteError;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
// Assumes that the operators in the model have been topologically sorted.
|
|
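// The quantization passes run in this order: PopulateRealValueOpSet,
// FillQuantizationParams, EnsureBiasScaleCompatibility,
// QuantizeIntemediateTensors, QuantizeSharedRange,
// QuantizeWeightsInputOutput, ApplyConstraints, QuantizeBiases,
// SetOperatorCodeVersion, and SetInputAndOutputTypes; the resulting model is
// then re-serialized into `builder`.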
TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
|
|
ModelT* model, const TensorType& input_type,
|
|
const TensorType& output_type, bool allow_float,
|
|
const std::unordered_set<string>& operator_names,
|
|
const TensorType& activations_type,
|
|
ErrorReporter* error_reporter) {
|
|
auto real_value_op_set =
|
|
PopulateRealValueOpSet(model, operator_names, activations_type);
|
|
TF_LITE_ENSURE_STATUS(
|
|
FillQuantizationParams(model, operator_names, real_value_op_set,
|
|
activations_type, error_reporter));
|
|
TF_LITE_ENSURE_STATUS(
|
|
EnsureBiasScaleCompatibility(model, operator_names, real_value_op_set,
|
|
activations_type, error_reporter));
|
|
TF_LITE_ENSURE_STATUS(
|
|
QuantizeIntemediateTensors(model, activations_type, error_reporter));
|
|
TF_LITE_ENSURE_STATUS(QuantizeSharedRange(model, error_reporter));
|
|
TF_LITE_ENSURE_STATUS(QuantizeWeightsInputOutput(
|
|
model, allow_float, operator_names, real_value_op_set, activations_type,
|
|
error_reporter));
|
|
TF_LITE_ENSURE_STATUS(ApplyConstraints(model, operator_names,
|
|
real_value_op_set, activations_type,
|
|
error_reporter));
|
|
TF_LITE_ENSURE_STATUS(QuantizeBiases(model, operator_names, real_value_op_set,
|
|
activations_type, error_reporter));
|
|
utils::SetOperatorCodeVersion(model);
|
|
TF_LITE_ENSURE_STATUS(SetInputAndOutputTypes(
|
|
model, input_type, output_type, activations_type, error_reporter));
|
|
|
|
flatbuffers::Offset<Model> output_model_location =
|
|
Model::Pack(*builder, model);
|
|
FinishModelBuffer(*builder, output_model_location);
|
|
|
|
return kTfLiteOk;
|
|
}
|
|
|
|
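// Same as QuantizeModel above, but quantizes every operator in the model by
// passing the names of all tensors as the selected operator outputs.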
TfLiteStatus QuantizeModelAllOperators(flatbuffers::FlatBufferBuilder* builder,
|
|
ModelT* model,
|
|
const TensorType& input_type,
|
|
const TensorType& output_type,
|
|
bool allow_float,
|
|
const TensorType& activations_type,
|
|
ErrorReporter* error_reporter) {
|
|
return QuantizeModel(builder, model, input_type, output_type, allow_float,
|
|
GetAllOperatorOutputs(model), activations_type,
|
|
error_reporter);
|
|
}
|
|
|
|
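// Convenience overload that quantizes activations to int8.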
TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
|
|
ModelT* model, const TensorType& input_type,
|
|
const TensorType& output_type, bool allow_float,
|
|
ErrorReporter* error_reporter) {
|
|
return QuantizeModel(builder, model, input_type, output_type, allow_float,
|
|
GetAllOperatorOutputs(model), TensorType_INT8,
|
|
error_reporter);
|
|
}
|
|
|
|
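// Convenience overload that disallows falling back to float for unsupported
// ops.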
TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
|
|
ModelT* model, const TensorType& input_type,
|
|
const TensorType& output_type,
|
|
ErrorReporter* error_reporter) {
|
|
return QuantizeModel(builder, model, input_type, output_type,
|
|
/*allow_float=*/false, error_reporter);
|
|
}
|
|
|
|
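// Convenience overload that keeps float32 model inputs and outputs.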
TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
|
|
ModelT* model, ErrorReporter* error_reporter) {
|
|
return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32,
|
|
/*allow_float=*/false, error_reporter);
|
|
}
|
|
|
|
} // namespace optimize
|
|
} // namespace tflite
|