Make quantization properties per-tensor.

PiperOrigin-RevId: 251876877
2019-06-06 10:24:51 -07:00 · 2019-06-06 10:24:51 -07:00 · a372bb0e9d
commit a372bb0e9d
parent 651201e51a
4 changed files with 173 additions and 132 deletions
--- a/tensorflow/lite/tools/optimize/operator_property.cc
+++ b/tensorflow/lite/tools/optimize/operator_property.cc
@ -21,18 +21,18 @@ OperatorProperty GetOperatorProperty(const BuiltinOperator& op) {
  OperatorProperty property;
  switch (op) {
    case BuiltinOperator_ADD:
-      property.input_indexes = {0, 1};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}, {1, {}}};
+      property.outputs = {{0, {}}};
      property.version = 2;
      break;
    case BuiltinOperator_ARG_MAX:
-      property.input_indexes = {0};
+      property.inputs = {{0, {}}};
      // ArgMax has no quantizable output.
      property.version = 2;
      break;
    case BuiltinOperator_AVERAGE_POOL_2D:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
@ -40,175 +40,196 @@ OperatorProperty GetOperatorProperty(const BuiltinOperator& op) {
    case BuiltinOperator_SPACE_TO_BATCH_ND:
    case BuiltinOperator_SPACE_TO_DEPTH:
      // We skip inputs 1 and 2 since they aren't real valued (they are shapes).
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
    case BuiltinOperator_CONCATENATION:
      property.arbitrary_inputs = true;
-      property.input_indexes = {};
-      property.output_indexes = {0};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
-    case BuiltinOperator_CONV_2D:
-      property.per_axis = true;
-      property.per_axis_index = 0;
-      property.input_indexes = {0, 1};
-      property.output_indexes = {0};
+    case BuiltinOperator_CONV_2D: {
+      TensorProperty tensor_property;
+      tensor_property.per_axis = true;
+      tensor_property.per_axis_index = 0;
+      tensor_property.symmetric = true;
+      property.inputs = {{0, {}}, {1, tensor_property}};
+      property.outputs = {{0, {}}};
      property.biases = {2};
      property.version = 2;
      break;
-    case BuiltinOperator_DEPTHWISE_CONV_2D:
-      property.per_axis = true;
-      property.per_axis_index = 3;
-      property.input_indexes = {0, 1};
-      property.output_indexes = {0};
+    }
+    case BuiltinOperator_DEPTHWISE_CONV_2D: {
+      TensorProperty tensor_property;
+      tensor_property.per_axis = true;
+      tensor_property.per_axis_index = 3;
+      tensor_property.symmetric = true;
+      property.inputs = {
+          {0, {}},
+          {1, tensor_property},
+      };
+      property.outputs = {{0, {}}};
      property.biases = {2};
      property.version = 3;
      break;
+    }
    case BuiltinOperator_EQUAL:
    case BuiltinOperator_NOT_EQUAL:
    case BuiltinOperator_GREATER:
    case BuiltinOperator_GREATER_EQUAL:
    case BuiltinOperator_LESS:
    case BuiltinOperator_LESS_EQUAL:
-      property.input_indexes = {0, 1};
+      property.inputs = {{0, {}}, {1, {}}};
      // Comparisons have no quantizable outputs.
      property.version = 2;
      break;
-    case BuiltinOperator_FULLY_CONNECTED:
-      property.input_indexes = {0, 1};
-      property.output_indexes = {0};
+    case BuiltinOperator_FULLY_CONNECTED: {
+      TensorProperty tensor_property;
+      tensor_property.symmetric = true;
+      property.inputs = {{0, {}}, {1, tensor_property}};
+      property.outputs = {{0, {}}};
      property.biases = {2};
      property.version = 4;
      break;
+    }
    case BuiltinOperator_GATHER:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
-    case BuiltinOperator_LOG_SOFTMAX:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+    case BuiltinOperator_LOG_SOFTMAX: {
+      property.inputs = {{0, {}}};
      // LogSoftmax requires output with 16/256 as scale and 127 as zero point.
-      property.restriction_on_output = true;
-      property.restricted_value_on_output = {16.0 / 256.0, 127};
+      TensorProperty tensor_property;
+      tensor_property.restriction = true;
+      tensor_property.restricted_value = {16.0 / 256.0, 127};
+      property.outputs = {{0, tensor_property}};
      property.version = 2;
      break;
-    case BuiltinOperator_LOGISTIC:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+    }
+    case BuiltinOperator_LOGISTIC: {
+      property.inputs = {{0, {}}};
      // Logistic requires output with 1/256 as scale and -128 as zero point.
-      property.restriction_on_output = true;
-      property.restricted_value_on_output = {1 / 256.0, -128};
+      TensorProperty tensor_property;
+      tensor_property.restriction = true;
+      tensor_property.restricted_value = {1 / 256.0, -128};
+      property.outputs = {{0, tensor_property}};
      property.version = 2;
      break;
-    case BuiltinOperator_L2_NORMALIZATION:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+    }
+    case BuiltinOperator_L2_NORMALIZATION: {
+      property.inputs = {{0, {}}};
      // L2 Norm requires output with 1/128 as scale and 0 as zero point.
-      property.restriction_on_output = true;
-      property.restricted_value_on_output = {1 / 128.0, 0};
+      TensorProperty tensor_property;
+      tensor_property.restriction = true;
+      tensor_property.restricted_value = {1 / 128.0, 0};
+      property.outputs = {{0, tensor_property}};
      property.version = 2;
      break;
+    }
    case BuiltinOperator_MAX_POOL_2D:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
    case BuiltinOperator_MAXIMUM:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
    case BuiltinOperator_MEAN:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.version = 2;
      break;
    case BuiltinOperator_MINIMUM:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
    case BuiltinOperator_MUL:
-      property.input_indexes = {0, 1};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}, {1, {}}};  // Both operands, as before.
+      property.outputs = {{0, {}}};
      property.version = 2;
      break;
    case BuiltinOperator_PAD:
    case BuiltinOperator_PADV2:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
    case BuiltinOperator_QUANTIZE:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.version = 1;
      break;
    case BuiltinOperator_RESHAPE:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 1;
      break;
    case BuiltinOperator_RESIZE_BILINEAR:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
    case BuiltinOperator_SHAPE:
-      property.input_indexes = {0};
+      property.inputs = {{0, {}}};
      // Shape has no quantizable output.
      property.version = 1;
      break;
    case BuiltinOperator_SLICE:
      // We skip inputs 1 and 2 since they aren't real valued (they are the
      // index and size).
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
    case BuiltinOperator_SQUEEZE:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 1;
      break;
-    case BuiltinOperator_SOFTMAX:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+    case BuiltinOperator_SOFTMAX: {
+      property.inputs = {{0, {}}};
      // Softmax requires output with 1/256 as scale and -128 as zero point.
-      property.restriction_on_output = true;
-      property.restricted_value_on_output = {1 / 256.0, -128};
+      TensorProperty tensor_property;
+      tensor_property.restriction = true;
+      tensor_property.restricted_value = {1 / 256.0, -128};
+      property.outputs = {{0, tensor_property}};
      property.version = 2;
      break;
+    }
    case BuiltinOperator_SUB:
-      property.input_indexes = {0, 1};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}, {1, {}}};
+      property.outputs = {{0, {}}};
      property.version = 2;
      break;
-    case BuiltinOperator_TANH:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+    case BuiltinOperator_TANH: {
+      property.inputs = {{0, {}}};
      // Tanh requires output with 1/128 as scale and 0 as zero point.
-      property.restriction_on_output = true;
-      property.restricted_value_on_output = {1 / 128.0, 0};
+      TensorProperty tensor_property;
+      tensor_property.restriction = true;
+      tensor_property.restricted_value = {1 / 128.0, 0};
+      property.outputs = {{0, tensor_property}};
      property.version = 2;
      break;
+    }
    case BuiltinOperator_TRANSPOSE:
-      property.input_indexes = {0};
-      property.output_indexes = {0};
+      property.inputs = {{0, {}}};
+      property.outputs = {{0, {}}};
      property.restrict_same_input_output_scale = true;
      property.version = 2;
      break;
--- a/tensorflow/lite/tools/optimize/operator_property.h
+++ b/tensorflow/lite/tools/optimize/operator_property.h
@ -22,30 +22,34 @@ namespace tflite {
 namespace optimize {
 namespace operator_property {

-struct OperatorProperty {
-  // Is a quantized operations currently supported.
-  bool quantizable = true;
-  // Per axis.
+struct TensorProperty {
+  // Quantize per axis; per_axis also implies symmetric currently.
+  bool per_axis = false;
+  // TODO(jianlijianli): remove dimension index and read it from tensor instead.
+  int per_axis_index = 0;
+  bool symmetric = false;
+
+  // Constraints.
+  bool restriction = false;
+  // Hardcoded {scale, zero_point} used when restriction is true.
+  std::pair<float, int> restricted_value = {0.0, 0};
+};
+
+struct OperatorProperty {
+  // Whether a quantized version of the operator is currently supported.
+  bool quantizable = true;

  // Op has arbitrary number of inputs, such as concat.
  bool arbitrary_inputs = false;
-  // Input and weight indexes. Unable to separate the two because of ops such as
-  // ADD.
-  std::vector<int> input_indexes = {};
-
-  // Output indexes
-  std::vector<int> output_indexes = {};
-
+  // Input indexes -> input tensor property.
+  std::vector<std::pair<int, TensorProperty>> inputs = {};
+  // Output indexes -> output tensor property.
+  std::vector<std::pair<int, TensorProperty>> outputs = {};
  // Bias indexes.
  std::vector<int> biases = {};

  // Constraints.
  bool restrict_same_input_output_scale = false;
-  bool restriction_on_output = false;
-  std::pair<float, float> restricted_value_on_output = {0.0, 0.0};

  // Op version.
  int version = 1;
--- a/tensorflow/lite/tools/optimize/quantization_utils.cc
+++ b/tensorflow/lite/tools/optimize/quantization_utils.cc
@ -357,6 +357,9 @@ TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor,

 TfLiteStatus QuantizeWeight(ModelT* model, TensorT* tensor, bool per_channel,
                            int per_axis_index) {
+  // TODO(suharshs): Currently we conflate quantizing weights and constants. It
+  // is possible that the right thing to do is to asymmetrically quantize the
+  // weight. Add support for this.
  if (per_channel) {
    return SymmetricQuantizeTensorPerChannel(model, tensor, per_axis_index);
  } else {
--- a/tensorflow/lite/tools/optimize/quantize_model.cc
+++ b/tensorflow/lite/tools/optimize/quantize_model.cc
@ -276,12 +276,12 @@ TfLiteStatus ApplyConstraints(ModelT* model, ErrorReporter* error_reporter) {
      }
      // Basically only Concat passes this check.
      if (!property.restrict_same_input_output_scale ||
-          (property.input_indexes.size() == 1 &&
-           property.output_indexes.size() == 1 && property.biases.empty())) {
+          (property.inputs.size() == 1 && property.outputs.size() == 1 &&
+           property.biases.empty())) {
        continue;
      }
-      // If ApplyConstraintsnd requant is needed, use the min of min and max of
-      // max, which means using the scale and zero point of output.
+      // If a requant is needed, use the min of min and max of max, which means
+      // using the scale and zero point of the output.
      TensorT* output_tensor = subgraph->tensors[op->outputs[0]].get();
      if (!utils::QuantizationParametersExist(output_tensor)) {
        error_reporter->Report(
@ -332,24 +332,23 @@ TfLiteStatus ApplyConstraints(ModelT* model, ErrorReporter* error_reporter) {
  return kTfLiteOk;
 }

-std::vector<int> GetInputIndexes(const OperatorT* op,
-                                 operator_property::OperatorProperty property) {
-  std::vector<int> input_indexes;
+std::vector<std::pair<int, operator_property::TensorProperty>> GetInputs(
+    const OperatorT* op, operator_property::OperatorProperty property) {
+  std::vector<std::pair<int, operator_property::TensorProperty>> inputs;
  if (property.arbitrary_inputs || !property.quantizable) {
    for (int i = 0; i < op->inputs.size(); ++i) {
-      input_indexes.push_back(i);
+      inputs.push_back({i, {}});
    }
  } else {
-    input_indexes = property.input_indexes;
+    inputs = property.inputs;
  }
-  return input_indexes;
+  return inputs;
 }

 bool ShouldRestrictSameInputOutputScale(
    operator_property::OperatorProperty property) {
-  return (property.input_indexes.size() == 1 &&
-          property.output_indexes.size() == 1 && property.biases.empty() &&
-          property.restrict_same_input_output_scale);
+  return (property.inputs.size() == 1 && property.outputs.size() == 1 &&
+          property.biases.empty() && property.restrict_same_input_output_scale);
 }

 bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
@ -362,10 +361,13 @@ bool IsSubgraphInput(SubGraphT* subgraph, int32_t index) {
 }

 // Quantize the op input. Will increment op_idx if ops are added.
-TfLiteStatus QuantizeOpInput(ModelT* model, int32_t subgraph_idx,
-                             size_t* op_idx,
+TfLiteStatus QuantizeOpInput(
+    ModelT* model, int32_t subgraph_idx, size_t* op_idx,
    operator_property::OperatorProperty property,
-                             int32_t input_idx, ErrorReporter* error_reporter) {
+    const std::pair<int32_t, operator_property::TensorProperty>& input,
+    ErrorReporter* error_reporter) {
+  int32_t input_idx = input.first;
+  operator_property::TensorProperty tensor_property = input.second;
  SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
  OperatorT* op = subgraph->operators[*op_idx].get();
  const BuiltinOperator op_code =
@ -384,8 +386,11 @@ TfLiteStatus QuantizeOpInput(ModelT* model, int32_t subgraph_idx,
  if (property.quantizable && !is_input_quantized) {
    // The operation is quantizable, but the input isn't yet quantized.
    if (utils::HasBuffer(model, subgraph, tensor_idx)) {
-      if (utils::QuantizeWeight(model, tensor, property.per_axis,
-                                property.per_axis_index) == kTfLiteError) {
+      // TODO(suharshs): Look at consumers, throw error if one consumer is
+      // per-channel and one per-layer.
+      if (utils::QuantizeWeight(model, tensor, tensor_property.per_axis,
+                                tensor_property.per_axis_index) ==
+          kTfLiteError) {
        error_reporter->Report(
            "Unable to quantize buffer or min/max value for input %d "
            "in op %s in subgraph %d, node: %d",
@ -393,6 +398,7 @@ TfLiteStatus QuantizeOpInput(ModelT* model, int32_t subgraph_idx,
        return kTfLiteError;
      }
    } else if (utils::HasMinMax(tensor)) {
+      // TODO(suharshs): Handle per-channel dynamic tensor.
      if (IsSubgraphInput(subgraph, tensor_idx)) {
        utils::QuantizeActivation(tensor);
      } else {
@ -442,11 +448,13 @@ TfLiteStatus QuantizeOpInput(ModelT* model, int32_t subgraph_idx,
 }

 // Quantize the op output.
-TfLiteStatus QuantizeOpOutput(ModelT* model, int32_t subgraph_idx,
-                              int32_t op_idx,
+TfLiteStatus QuantizeOpOutput(
+    ModelT* model, int32_t subgraph_idx, int32_t op_idx,
    operator_property::OperatorProperty property,
-                              int32_t output_idx,
+    const std::pair<int32_t, operator_property::TensorProperty>& output,
    ErrorReporter* error_reporter) {
+  int32_t output_idx = output.first;
+  operator_property::TensorProperty tensor_property = output.second;
  // If the operator is not quantizable, we don't need to do anything for the
  // output.
  if (!property.quantizable) {
@ -470,16 +478,16 @@ TfLiteStatus QuantizeOpOutput(ModelT* model, int32_t subgraph_idx,
    // Copy quantization parameter. For average pool, max pool, etc
    // min/max can be different but we want them to be the same.
    // Get scale and zero point of input.
-    if (property.input_indexes[0] >= op->inputs.size()) {
+    if (property.inputs[0].first >= op->inputs.size()) {
      error_reporter->Report(
          "Required input index %d is larger than the input length of "
          "op %s at index %d in subgraph %d",
-          property.input_indexes[0], op->inputs.size(),
+          property.inputs[0].first, op->inputs.size(),
          EnumNameBuiltinOperator(op_code), op_idx, subgraph_idx);
      return kTfLiteError;
    }
-    const int input_index = op->inputs[property.input_indexes[0]];
-    TensorT* input_tensor = subgraph->tensors[input_index].get();
+    const int input_tensor_idx = op->inputs[property.inputs[0].first];
+    TensorT* input_tensor = subgraph->tensors[input_tensor_idx].get();
    if (input_tensor->quantization->scale.size() != 1 ||
        input_tensor->quantization->zero_point.size() != 1 ||
        input_tensor->quantization->min.size() != 1 ||
@ -514,8 +522,8 @@ TfLiteStatus QuantizeOpOutput(ModelT* model, int32_t subgraph_idx,
    output_tensor->quantization->min.push_back(min);
    output_tensor->quantization->max.push_back(max);
    output_tensor->type = TensorType_INT8;
-  } else if (property.restriction_on_output) {
-    const auto scale_and_zp = property.restricted_value_on_output;
+  } else if (tensor_property.restriction) {
+    const auto scale_and_zp = tensor_property.restricted_value;
    // Apply to output.
    output_tensor->quantization = absl::make_unique<QuantizationParametersT>();
    output_tensor->quantization->scale.push_back(scale_and_zp.first);
@ -557,15 +565,17 @@ TfLiteStatus QuantizeWeightsInputOutput(ModelT* model, bool allow_float,
      }

      // Quantize operator inputs/weights.
-      for (const int input_idx : GetInputIndexes(op, property)) {
-        TF_LITE_ENSURE_STATUS(QuantizeOpInput(
-            model, subgraph_idx, &op_idx, property, input_idx, error_reporter));
+      for (const std::pair<int, operator_property::TensorProperty>& input :
+           GetInputs(op, property)) {
+        TF_LITE_ENSURE_STATUS(QuantizeOpInput(model, subgraph_idx, &op_idx,
+                                              property, input, error_reporter));
      }

      // Quantize operator outputs.
-      for (const int output_idx : property.output_indexes) {
+      for (const std::pair<int, operator_property::TensorProperty>& output :
+           property.outputs) {
        TF_LITE_ENSURE_STATUS(QuantizeOpOutput(
-            model, subgraph_idx, op_idx, property, output_idx, error_reporter));
+            model, subgraph_idx, op_idx, property, output, error_reporter));
      }
    }
  }
@ -601,7 +611,7 @@ TfLiteStatus QuantizeBiases(ModelT* model, ErrorReporter* error_reporter) {
          if (utils::HasBuffer(model, subgraph, op->inputs[bias_idx])) {
            TensorT* bias_tensor =
                subgraph->tensors[op->inputs[bias_idx]].get();
-            if (property.input_indexes.size() != 2) {
+            if (property.inputs.size() != 2) {
              error_reporter->Report(
                  "Expect the input length of "
                  "op %s at index %d in subgraph %d to be 2",
@ -610,12 +620,15 @@ TfLiteStatus QuantizeBiases(ModelT* model, ErrorReporter* error_reporter) {
              return kTfLiteError;
            }
            TensorT* input_tensor =
-                subgraph->tensors[op->inputs[property.input_indexes[0]]].get();
+                subgraph->tensors[op->inputs[property.inputs[0].first]].get();
            TensorT* weight_tensor =
-                subgraph->tensors[op->inputs[property.input_indexes[1]]].get();
-            TF_LITE_ENSURE_STATUS(QuantizeBias(
-                model, input_tensor, weight_tensor, bias_tensor,
-                property.per_axis, property.per_axis_index, error_reporter));
+                subgraph->tensors[op->inputs[property.inputs[1].first]].get();
+            operator_property::TensorProperty weight_property =
+                property.inputs[1].second;
+            TF_LITE_ENSURE_STATUS(
+                QuantizeBias(model, input_tensor, weight_tensor, bias_tensor,
+                             weight_property.per_axis,
+                             weight_property.per_axis_index, error_reporter));
          }
        }
      }