Add LSTM quantization spec and quantizer support for LSTM.

Test data are generated manually and populated with dummy values.

PiperOrigin-RevId: 279857476
Change-Id: I2ac46bf3836d5eb89688b5127c23b9b0ec5cbafd
Jian Li 2019-11-11 17:57:32 -08:00 committed by TensorFlower Gardener
parent 09d58c6012
commit fd4ad2ebdc
11 changed files with 368 additions and 10 deletions

View File

@@ -219,6 +219,8 @@ tf_cc_test(
"//tensorflow/lite/tools/optimize:testdata/argmax.bin",
"//tensorflow/lite/tools/optimize:testdata/concat.bin",
"//tensorflow/lite/tools/optimize:testdata/fc.bin",
"//tensorflow/lite/tools/optimize:testdata/lstm_calibrated.bin",
"//tensorflow/lite/tools/optimize:testdata/lstm_quantized.bin",
"//tensorflow/lite/tools/optimize:testdata/mixed.bin",
"//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin",
"//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",

View File

@@ -168,9 +168,73 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index,
case BuiltinOperator_LSTM: {
// TODO(jianlijianli): extend LSTM op spec to include input, bias, etc.
// TODO(jianlijianli): extend this to other variants of LSTM.
// LSTM needs 5 intermediate tensors. This agrees with the fully quantized
// kernels in lstm_eval.cc
static const float alpha = static_cast<float>(std::pow(2, -10));
TensorProperty tensor_property_12;
tensor_property_12.use_derived_scale = true;
tensor_property_12.number_of_bits = 32;
tensor_property_12.derived_scale = {{20}, {}, {alpha}};
TensorProperty tensor_property_13;
tensor_property_13.use_derived_scale = true;
tensor_property_13.number_of_bits = 32;
tensor_property_13.derived_scale = {{21}, {}, {alpha}};
TensorProperty tensor_property_14;
tensor_property_14.use_derived_scale = true;
tensor_property_14.number_of_bits = 32;
tensor_property_14.derived_scale = {{22}, {}, {alpha}};
TensorProperty tensor_property_15;
tensor_property_15.use_derived_scale = true;
tensor_property_15.number_of_bits = 32;
tensor_property_15.derived_scale = {{23}, {}, {alpha}};
TensorProperty tensor_property_17;
tensor_property_17.use_derived_scale = true;
tensor_property_17.number_of_bits = 32;
tensor_property_17.derived_scale = {{16}, {4}, {}};
TensorProperty tensor_property_19;
tensor_property_19.extend_to_power_of_two = true;
tensor_property_19.number_of_bits = 16;
tensor_property_19.state_tensor = true;
tensor_property_19.symmetric = true;
TensorProperty tensor_property_20;
tensor_property_20.number_of_bits = 16;
tensor_property_20.symmetric = true;
property.inputs = {
{0, {}},
{1, {}},
{2, {}},
{3, {}},
{4, {}},
{5, {}},
{6, {}},
{7, {}},
{8, {}},
{9, {}},
{10, {}},
{11, {}},
{16, {}},
{19, tensor_property_19},
{20, tensor_property_20},
{21, tensor_property_20},
{22, tensor_property_20},
{23, tensor_property_20},
{12, tensor_property_12},
{13, tensor_property_13},
{14, tensor_property_14},
{15, tensor_property_15},
{17, tensor_property_17},
};
property.outputs = {{0, {}}};
property.intermediates = {
{0, tensor_property_20},
{1, tensor_property_20},
{2, tensor_property_20},
{3, tensor_property_20},
{4, {}},
};
property.restrict_scale = {{18, 0}};
property.version = 2;
break;
}
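A note for readers of the derived-scale entries above: GetEffectiveScale (shown further down) computes a derived scale as the product of the scales of the listed input tensors, the scales of the listed intermediate tensors, and the literal factors. Here is a minimal standalone sketch of that rule for LSTM tensors 12 and 17; the calibrated scale values are invented and the snippet is not part of the change.

#include <cmath>
#include <cstdio>

int main() {
  const float alpha = static_cast<float>(std::pow(2, -10));
  // Invented calibrated scales for LSTM input tensors 20 and 16 and for
  // intermediate tensor 4.
  const float scale_input_20 = 0.0015f;
  const float scale_input_16 = 0.0020f;
  const float scale_intermediate_4 = 0.0001f;
  // Tensor 12: derived_scale = {{20}, {}, {alpha}}  ->  scale_20 * alpha.
  const float scale_12 = scale_input_20 * alpha;
  // Tensor 17: derived_scale = {{16}, {4}, {}}  ->  scale_16 * scale_int_4.
  const float scale_17 = scale_input_16 * scale_intermediate_4;
  std::printf("scale_12=%g scale_17=%g\n", scale_12, scale_17);
  return 0;
}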

View File

@@ -44,6 +44,21 @@ struct TensorProperty {
bool restriction = false;
// scale/zero_point hardcoded.
std::pair<float, int> restricted_value = {0.0, 0};
// Use derived scale.
bool use_derived_scale = false;
// The derived scale.
DerivedScale derived_scale;
// The number of bits for this tensor. It can be 8, 16, 32, or even a value
// that is not a power of two.
int number_of_bits = 8;
// Extend the range to power of two.
bool extend_to_power_of_two = false;
// State tensor.
bool state_tensor = false;
};
struct OperatorProperty {
@@ -55,10 +70,13 @@ struct OperatorProperty {
// Op has arbitrary number of outputs, such as slice.
bool arbitrary_outputs = false;
// Input indexes -> input tensor property.
// Must be topologically sorted since there are derived scales.
std::vector<std::pair<int, TensorProperty>> inputs = {};
// Output indexes -> output tensor property.
std::vector<std::pair<int, TensorProperty>> outputs = {};
// Bias indexes.
// TODO(jianlijianli): remove this by putting biases into inputs as well, since
// we can now model "derived scale".
std::vector<int> biases = {};
// Intermediate indexes -> intermediate tensor property.
@@ -67,6 +85,12 @@
// Force output to reuse the same scale and zero point of input.
bool restrict_same_input_output_scale = false;
// Use the same min of mins and max of maxes for each group.
// Incompatible with restrict_same_input_output_scale and restricted_value.
// TODO(jianlijianli): make it compatible with other restrictions when there
// is a use case.
std::vector<std::vector<int>> restrict_scale = {};
// Op version.
int version = 1;
};
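As a quick illustration of how the new TensorProperty and OperatorProperty fields compose, here is a hedged sketch for a hypothetical op whose bias scale is derived from its two other inputs. It assumes the operator_property.h include path and the tflite::optimize::operator_property namespace; the op, its indices, and the values are made up.

#include "tensorflow/lite/tools/optimize/operator_property.h"

namespace tflite {
namespace optimize {
namespace operator_property {

// Hypothetical op: inputs 0 (activation), 1 (weights), 2 (32-bit bias).
OperatorProperty ExampleProperty() {
  TensorProperty bias_property;
  bias_property.use_derived_scale = true;
  bias_property.number_of_bits = 32;
  // Bias scale = scale(input 0) * scale(input 1); no intermediates or factors.
  bias_property.derived_scale = {{0, 1}, {}, {}};

  OperatorProperty property;
  // Inputs are listed in topological order because of the derived scale.
  property.inputs = {{0, {}}, {1, {}}, {2, bias_property}};
  property.outputs = {{0, {}}};
  // Input 0 and output 0 must end up with the same scale and zero point.
  property.restrict_scale = {{0, 0}};
  return property;
}

}  // namespace operator_property
}  // namespace optimize
}  // namespace tflite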

View File

@@ -625,11 +625,15 @@ float GetEffectiveScale(ModelT* model, SubGraphT* subgraph, int op_idx,
float scale = 1.0f;
OperatorT* op = subgraph->operators[op_idx].get();
for (int i = 0; i < input_index.size(); ++i) {
const int index_local = input_index[i];
const int index_global = op->inputs[index_local];
const TensorT* tensor = subgraph->tensors[index_global].get();
scale *= tensor->quantization->scale[0];
}
for (int i = 0; i < intermediate_index.size(); ++i) {
const int index_local = intermediate_index[i];
const int index_global = op->intermediates[index_local];
const TensorT* tensor = subgraph->tensors[index_global].get();
scale *= tensor->quantization->scale[0];
}
for (int i = 0; i < factors.size(); ++i) {
@@ -646,6 +650,15 @@ void QuantizeActivation(TensorT* tensor) {
tensor->type = TensorType_INT8;
}
TfLiteStatus QuantizeActivationToInt16(TensorT* tensor, float scale) {
const int32 zero_point = 0;
tensor->quantization = absl::make_unique<QuantizationParametersT>();
tensor->quantization->scale.push_back(scale);
tensor->quantization->zero_point.push_back(zero_point);
tensor->type = TensorType_INT16;
return kTfLiteOk;
}
int GetPowerOfTwoScale(float min, float max) {
const float range = std::max(std::abs(min), std::abs(max));
int pot = 0;

View File

@@ -138,6 +138,9 @@ float GetEffectiveScale(ModelT* model, SubGraphT* subgraph, int op_idx,
// Quantize activation.
void QuantizeActivation(TensorT* tensor);
// Quantize activation to 16bit.
TfLiteStatus QuantizeActivationToInt16(TensorT* tensor, float scale);
// Get the power of two scale for min and max for symmetric quantization case.
int GetPowerOfTwoScale(float min, float max);
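A rough usage sketch for the two declarations just added, mirroring how the quantizer handles a 16-bit state tensor whose range is extended to a power of two. The include path and the surrounding namespace are assumptions; the calibrated min/max are expected to come from calibration.

#include <cmath>

#include "tensorflow/lite/tools/optimize/quantization_utils.h"

namespace tflite {
namespace optimize {

// Sketch only: quantize a calibrated state tensor to symmetric int16.
TfLiteStatus QuantizeStateTensorSketch(TensorT* tensor) {
  const float min = tensor->quantization->min[0];
  const float max = tensor->quantization->max[0];
  // Round the symmetric range up to a power of two, e.g. [-5.1, 5.1] -> 8.
  const int pot = utils::GetPowerOfTwoScale(min, max);
  const float range = static_cast<float>(std::pow(2, pot));
  // Activations use the full int16 range: scale = range / 32768.
  const float scale = range / 32768.0f;
  return utils::QuantizeActivationToInt16(tensor, scale);
}

}  // namespace optimize
}  // namespace tflite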

View File

@@ -421,6 +421,10 @@ TfLiteStatus QuantizeOpInput(
return kTfLiteError;
}
const int32_t tensor_idx = op->inputs[input_idx];
if (tensor_idx == -1) {
// Skip optional tensor.
return kTfLiteOk;
}
TensorT* tensor = subgraph->tensors[tensor_idx].get();
// Assumes op is quantized to int8.
const bool is_input_quantized = utils::QuantizationParametersExist(tensor);
@@ -429,9 +433,59 @@ TfLiteStatus QuantizeOpInput(
if (utils::HasBuffer(model, subgraph, tensor_idx)) {
// TODO(suharshs): Look at consumers, throw error if one consumer is
// per-channel and one per-layer.
if (tensor_property.number_of_bits == 8) {
if (tensor_property.use_derived_scale) {
// Currently, 8-bit input tensors do not accept a derived scale.
return kTfLiteError;
}
if (utils::QuantizeWeight(model, tensor, tensor_property.per_axis,
tensor_property.per_axis_index,
error_reporter) == kTfLiteError) {
error_reporter->Report(
"Unable to quantize buffer or min/max value for input %d "
"in op %s in subgraph %d, node: %d",
input_idx, EnumNameBuiltinOperator(op_code), subgraph_idx,
*op_idx);
return kTfLiteError;
}
} else if (tensor_property.number_of_bits == 16) {
if (tensor_property.use_derived_scale) {
// Currently, 16-bit input tensors do not accept a derived scale.
return kTfLiteError;
}
TensorT* tensor = subgraph->tensors[tensor_idx].get();
int total_size = 1;
for (int i = 0; i < tensor->shape.size(); ++i) {
total_size *= tensor->shape[i];
}
BufferT* buffer = model->buffers[tensor->buffer].get();
float* float_data = reinterpret_cast<float*>(buffer->data.data());
auto minmax = std::minmax_element(float_data, float_data + total_size);
const float min = *minmax.first;
const float max = *minmax.second;
const float range = std::max(std::abs(min), std::abs(max));
// The narrow range quantized value for int16.
const float quantize_range = 32767.0;
const float scale = range / quantize_range;
return utils::SymmetricQuantizeFloatsToInt16(model, tensor, scale,
error_reporter);
} else if (tensor_property.number_of_bits == 32) {
if (!tensor_property.use_derived_scale) {
// Currently, 32-bit input tensors only accept a derived scale.
return kTfLiteError;
}
TensorT* tensor = subgraph->tensors[tensor_idx].get();
const float scale = utils::GetEffectiveScale(
model, subgraph, *op_idx,
tensor_property.derived_scale.input_tensors,
tensor_property.derived_scale.intermediate_tensors,
tensor_property.derived_scale.factors);
return utils::SymmetricPerLayerBiasQuantize(model, tensor, scale,
error_reporter);
} else {
// Only 8, 16, 32 are supported.
// TODO(jianlijianli): extend this to support arbitrary bits.
error_reporter->Report(
"Unable to quantize buffer or min/max value for input %d "
"in op %s in subgraph %d, node: %d",
@@ -439,9 +493,27 @@ TfLiteStatus QuantizeOpInput(
return kTfLiteError;
}
} else if (utils::HasMinMax(tensor)) {
// TODO(suharshs): Handle per-channel dynamic tensor.
if (IsSubgraphInput(subgraph, tensor_idx) ||
tensor_property.state_tensor) {
if (tensor_property.number_of_bits == 8) {
if (tensor_property.use_derived_scale) {
// Currently, 8-bit input tensors do not accept a derived scale.
return kTfLiteError;
}
utils::QuantizeActivation(tensor);
} else if (tensor_property.number_of_bits == 16) {
TensorT* tensor = subgraph->tensors[tensor_idx].get();
float range = std::max(std::abs(tensor->quantization->min[0]),
std::abs(tensor->quantization->max[0]));
if (tensor_property.extend_to_power_of_two) {
const int power_of_two_scale = utils::GetPowerOfTwoScale(
tensor->quantization->min[0], tensor->quantization->max[0]);
range = std::pow(2, power_of_two_scale);
}
const float quantized_range = 32768.0;
const float scale = range / quantized_range;
utils::QuantizeActivationToInt16(tensor, scale);
}
} else {
// If the tensor is not a model input, we need to add a Quantize
// operation since the preceding op may require a float output.
@@ -515,6 +587,10 @@ TfLiteStatus QuantizeOpOutput(
}
TensorT* output_tensor = subgraph->tensors[op->outputs[output_idx]].get();
if (utils::QuantizationParametersExist(output_tensor)) {
// Skip output if it has been quantized.
return kTfLiteOk;
}
if (ShouldRestrictSameInputOutputScale(property)) {
// Copy quantization parameters. For average pool, max pool, etc.,
// min/max can be different but we want them to be the same.
@@ -576,6 +652,122 @@ TfLiteStatus QuantizeOpOutput(
return kTfLiteOk;
}
TfLiteStatus QuantizeIntemediateTensors(ModelT* model,
ErrorReporter* error_reporter) {
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
subgraph_idx++) {
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
operator_property::OperatorProperty property =
operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
if (!property.intermediates.empty()) {
OperatorT* op = subgraph->operators[op_idx].get();
const BuiltinOperator op_code =
model->operator_codes[op->opcode_index]->builtin_code;
for (const std::pair<int, operator_property::TensorProperty>& input :
property.intermediates) {
const int index_local = input.first;
const int index_global = op->intermediates[index_local];
if (index_global == -1) {
// Skip optional tensor.
continue;
}
if (input.second.number_of_bits == 8 &&
input.second.symmetric == false) {
TensorT* tensor = subgraph->tensors[index_global].get();
if (utils::HasMinMax(tensor)) {
utils::QuantizeActivation(tensor);
} else {
error_reporter->Report(
"Unable to find min/max value for intermediate %d in %s in "
"subgraph %d, node: %d",
index_global, EnumNameBuiltinOperator(op_code), subgraph_idx,
op_idx);
return kTfLiteError;
}
} else if (input.second.number_of_bits == 16 &&
input.second.symmetric == true) {
TensorT* tensor = subgraph->tensors[index_global].get();
if (tensor->quantization == nullptr) {
continue;
}
const float min = tensor->quantization->min[0];
const float max = tensor->quantization->max[0];
const float range = std::max(std::abs(min), std::abs(max));
if (range < 1e-8) {
return kTfLiteError;
}
// Get scale and zero point.
const float quantized_range = 32767.0;
const float scale = range / quantized_range;
utils::QuantizeActivationToInt16(tensor, scale);
} else {
return kTfLiteError;
}
}
}
}
}
return kTfLiteOk;
}
// Quantize tensors that have a shared range. For example, in LSTM, the output
// tensor and input state tensor should share the same range because they are
// using the same scale and zero point.
TfLiteStatus QuantizeSharedRange(ModelT* model, ErrorReporter* error_reporter) {
for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
subgraph_idx++) {
SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
operator_property::OperatorProperty property =
operator_property::GetOperatorProperty(model, subgraph_idx, op_idx);
if (!property.intermediates.empty()) {
OperatorT* op = subgraph->operators[op_idx].get();
for (const std::vector<int>& input : property.restrict_scale) {
if (input.empty()) {
continue;
}
// Currently only pairs of two tensors are supported.
// TODO(jianlijianli): extend to arbitrary number of tensors.
if (input.size() != 2) {
return kTfLiteError;
}
const int index_1 = input[0];
const int index_2 = input[1];
// TODO(jianlijianli): model input/output.
TensorT* tensor_1 = subgraph->tensors[op->inputs[index_1]].get();
TensorT* tensor_2 = subgraph->tensors[op->outputs[index_2]].get();
const float min_of_min = std::min(tensor_1->quantization->min[0],
tensor_2->quantization->min[0]);
const float max_of_max = std::max(tensor_1->quantization->max[0],
tensor_2->quantization->max[0]);
if (min_of_min == 0.0 && max_of_max == 0.0) {
return kTfLiteError;
}
// Asymmetric quantization to 8 bits.
auto quantization_params =
absl::make_unique<QuantizationParametersT>();
utils::GetAsymmetricQuantizationParams(
min_of_min, max_of_max, -128, 127, quantization_params.get());
// Populate both tensors with the same parameters.
const float scale = quantization_params->scale[0];
const int32 zero_point = quantization_params->zero_point[0];
for (TensorT* tensor : {tensor_1, tensor_2}) {
tensor->quantization = absl::make_unique<QuantizationParametersT>();
tensor->quantization->scale.push_back(scale);
tensor->quantization->zero_point.push_back(zero_point);
tensor->type = TensorType_INT8;
}
}
}
}
}
return kTfLiteOk;
}
// Quantize inputs and weights.
// Because of ops such as LSTM, this still needs to be done per op rather than
// per weight.
TfLiteStatus QuantizeWeightsInputOutput(
@@ -713,6 +905,10 @@ TfLiteStatus FillQuantizationParams(
// Get tensor.
const int32_t input_idx = input.first;
const int32_t tensor_idx = op->inputs[input_idx];
if (tensor_idx == -1) {
// Skip optional tensor.
continue;
}
TensorT* tensor = subgraph->tensors[tensor_idx].get();
// Static tensor.
@@ -918,6 +1114,8 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
FillQuantizationParams(model, operator_names, error_reporter));
TF_LITE_ENSURE_STATUS(
EnsureBiasScaleCompatibility(model, operator_names, error_reporter));
TF_LITE_ENSURE_STATUS(QuantizeIntemediateTensors(model, error_reporter));
TF_LITE_ENSURE_STATUS(QuantizeSharedRange(model, error_reporter));
TF_LITE_ENSURE_STATUS(QuantizeWeightsInputOutput(
model, allow_float, operator_names, error_reporter));
TF_LITE_ENSURE_STATUS(

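For a concrete feel of what QuantizeSharedRange writes into the paired tensors, here is a minimal sketch of the standard int8 asymmetric parameter computation. It is only an approximation of utils::GetAsymmetricQuantizationParams (which may additionally nudge the zero point onto the integer grid), and the calibrated min/max values are invented.

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // Invented calibrated ranges for the two tensors in a restrict_scale group.
  const float min_of_min = std::min(-1.0f, -0.5f);  // -1.0
  const float max_of_max = std::max(2.5f, 3.0f);    //  3.0
  const int qmin = -128;
  const int qmax = 127;
  // Make sure the representable range contains zero.
  const float rmin = std::min(min_of_min, 0.0f);
  const float rmax = std::max(max_of_max, 0.0f);
  const float scale = (rmax - rmin) / static_cast<float>(qmax - qmin);
  const int zero_point = static_cast<int>(std::round(qmin - rmin / scale));
  // scale ~= 0.0157, zero_point ~= -64; both tensors get the same parameters.
  std::printf("scale=%f zero_point=%d\n", scale, zero_point);
  return 0;
}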
View File

@@ -979,6 +979,53 @@ TEST_F(QuantizeArgMaxTest, VerifyArgMax) {
EXPECT_EQ(model_.operator_codes[0]->version, 2);
}
class QuantizeLSTMTest : public QuantizeModelTest {
protected:
QuantizeLSTMTest() {
input_model_ = ReadModel(internal::kLstmCalibrated);
readonly_model_ = input_model_->GetModel();
readonly_model_->UnPackTo(&model_);
}
};
TEST_F(QuantizeLSTMTest, VerifyLSTM) {
// Quantize model.
auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32,
TensorType_FLOAT32, &error_reporter_);
ASSERT_EQ(kTfLiteOk, status);
// Read expected model.
auto expected_fb_model = ReadModel(internal::kLstmQuantized);
auto expected_read_only_model = expected_fb_model->GetModel();
ModelT expected_model;
expected_read_only_model->UnPackTo(&expected_model);
// Comparison.
ASSERT_EQ(model_.subgraphs.size(), expected_model.subgraphs.size());
for (size_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size();
subgraph_idx++) {
const auto graph = model_.subgraphs[subgraph_idx].get();
const auto expected_graph = expected_model.subgraphs[subgraph_idx].get();
ASSERT_EQ(graph->tensors.size(), expected_graph->tensors.size());
for (size_t i = 0; i < graph->tensors.size(); i++) {
const auto tensor = graph->tensors[i].get();
const auto expected_tensor = expected_graph->tensors[i].get();
EXPECT_EQ(tensor->buffer, expected_tensor->buffer);
EXPECT_EQ(tensor->is_variable, expected_tensor->is_variable);
EXPECT_EQ(tensor->shape, expected_tensor->shape);
EXPECT_EQ(tensor->name, expected_tensor->name);
EXPECT_EQ(tensor->type, expected_tensor->type);
}
}
ASSERT_EQ(model_.buffers.size(), expected_model.buffers.size());
for (size_t buffer_idx = 0; buffer_idx < model_.buffers.size();
++buffer_idx) {
const auto buffer = model_.buffers[buffer_idx].get()->data;
const auto expected_buffer = expected_model.buffers[buffer_idx].get()->data;
EXPECT_EQ(buffer, expected_buffer);
}
}
class QuantizeFCTest : public QuantizeModelTest {
protected:
QuantizeFCTest() {

View File

@@ -49,6 +49,9 @@ const char* kModelMixed = "mixed.bin";
const char* kModelSplit = "split.bin";
const char* kLstmCalibrated = "lstm_calibrated.bin";
const char* kLstmQuantized = "lstm_quantized.bin";
int FailOnErrorReporter::Report(const char* format, va_list args) {
char buf[1024];
vsnprintf(buf, sizeof(buf), format, args);

View File

@@ -76,6 +76,10 @@ extern const char* kModelMixed;
// Test model with split op.
extern const char* kModelSplit;
// Test model with LSTM op.
extern const char* kLstmCalibrated;
extern const char* kLstmQuantized;
// An error reporter that fails on testing.
class FailOnErrorReporter : public ErrorReporter {
public:

Binary file not shown.

Binary file not shown.