/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <cstddef>
#include <cstdint>
#include <memory>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/util/command_line_flags.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/tools/optimize/quantize_weights.h"
#include "tensorflow/lite/tools/optimize/test_util.h"

namespace {
tensorflow::string* g_test_model_dir = nullptr;
}  // namespace

namespace tflite {
namespace optimize {
namespace {

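// The helpers below load the pre-generated test models from the directory
// pointed to by *g_test_model_dir.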
std::unique_ptr<FlatBufferModel> ReadTestModel() {
  auto model_path = tensorflow::io::JoinPath(
      *g_test_model_dir, internal::kConvModelWith0Plus10Weights);
  return FlatBufferModel::BuildFromFile(model_path.c_str());
}

std::unique_ptr<FlatBufferModel> ReadSharedWeightsTestModel() {
  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir,
                                             internal::kModelWithSharedWeights);
  return FlatBufferModel::BuildFromFile(model_path.c_str());
}

std::unique_ptr<FlatBufferModel> ReadGatherTestModel() {
  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir,
                                             internal::kQuantizedWithGather);
  return FlatBufferModel::BuildFromFile(model_path.c_str());
}

std::unique_ptr<FlatBufferModel> ReadCustomOpTestModel() {
  auto model_path =
      tensorflow::io::JoinPath(*g_test_model_dir, internal::kModelWithCustomOp);
  return FlatBufferModel::BuildFromFile(model_path.c_str());
}

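// Copies a flatbuffers::Vector into a std::vector so it can be compared with
// EXPECT_EQ.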
template <typename T>
std::vector<T> GetAsVector(const flatbuffers::Vector<T>* vec) {
  return std::vector<T>(vec->begin(), vec->end());
}

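// Test fixture that loads one of the test models and provides helpers for
// inspecting the input and output graphs.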
class QuantizeWeightsTest : public testing::Test {
 protected:
  QuantizeWeightsTest() {}

  void LoadBasicModel() {
    input_model_ = ReadTestModel();
    model_ = input_model_->GetModel();
  }

  void LoadSharedWeightsModel() {
    input_model_ = ReadSharedWeightsTestModel();
    model_ = input_model_->GetModel();
  }

  void LoadGatherTestModel() {
    input_model_ = ReadGatherTestModel();
    model_ = input_model_->GetModel();
  }

  void LoadCustomOpTestModel() {
    input_model_ = ReadCustomOpTestModel();
    model_ = input_model_->GetModel();
  }

  std::unique_ptr<FlatBufferModel> input_model_;
  const Model* model_;

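  // Returns true if the tensor at tensor_idx is listed as an input or output
  // of any subgraph in the model.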
  bool IsModelInputOrOutput(const Model* model, uint32_t tensor_idx) {
    for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs()->size();
         ++subgraph_idx) {
      const auto subgraph = model->subgraphs()->Get(subgraph_idx);
      for (size_t i = 0; i < subgraph->inputs()->size(); ++i) {
        if (subgraph->inputs()->Get(i) == tensor_idx) {
          return true;
        }
      }
      for (size_t i = 0; i < subgraph->outputs()->size(); ++i) {
        if (subgraph->outputs()->Get(i) == tensor_idx) {
          return true;
        }
      }
    }
    return false;
  }

  // Looks up the builtin op code of the operator that produces tensor_idx in
  // the given subgraph. Returns false if no operator produces the tensor.
  bool GetProducerOpCode(const Model* model, uint32_t subgraph_idx,
                         uint32_t tensor_idx,
                         tflite::BuiltinOperator* op_code) {
    const auto subgraph = model->subgraphs()->Get(subgraph_idx);
    for (size_t op_idx = 0; op_idx < subgraph->operators()->size(); ++op_idx) {
      const auto op = subgraph->operators()->Get(op_idx);
      for (size_t i = 0; i < op->outputs()->size(); ++i) {
        if (op->outputs()->Get(i) == tensor_idx) {
          const uint32_t op_code_idx = op->opcode_index();
          *op_code = model->operator_codes()->Get(op_code_idx)->builtin_code();
          return true;
        }
      }
    }
    return false;
  }
};

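// Quantizing with weights_min_num_elements of 0 should succeed and produce a
// readable output model.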
TEST_F(QuantizeWeightsTest, QuantizationSucceeds) {
  LoadBasicModel();
  flatbuffers::FlatBufferBuilder builder;
  auto status = QuantizeWeights(&builder, model_, 0);
  EXPECT_EQ(status, kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);
}

TEST_F(QuantizeWeightsTest, WeightsMinNumElements) {
  LoadBasicModel();
  // Make weights_min_num_elements sufficiently large so that no quantization
  // happens, i.e. the output model is identical to the input model.
  flatbuffers::FlatBufferBuilder builder;
  const uint64_t kWeightsMinNumElements = 1000000;
  EXPECT_EQ(QuantizeWeights(&builder, model_, kWeightsMinNumElements),
            kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);

  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
       subgraph_idx++) {
    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
    ASSERT_EQ(quantized_graph->tensors()->size(),
              float_graph->tensors()->size());
    for (size_t i = 0; i < quantized_graph->tensors()->size(); i++) {
      const auto quant_tensor = quantized_graph->tensors()->Get(i);
      const auto float_tensor = float_graph->tensors()->Get(i);
      // Everything should remain equal between the two graphs.
      EXPECT_EQ(quant_tensor->buffer(), float_tensor->buffer());
      EXPECT_EQ(quant_tensor->is_variable(), float_tensor->is_variable());
      EXPECT_EQ(GetAsVector(quant_tensor->shape()),
                GetAsVector(float_tensor->shape()));
      EXPECT_EQ(quant_tensor->name()->str(), float_tensor->name()->str());
      EXPECT_EQ(quant_tensor->type(), float_tensor->type());
    }
  }
}

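// With hybrid evaluation (the default), the Conv weights are quantized to INT8
// in place and no Dequantize op is added, so the graph structure is unchanged.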
TEST_F(QuantizeWeightsTest, HybridConv) {
  LoadBasicModel();
  flatbuffers::FlatBufferBuilder builder;
  auto status = QuantizeWeights(&builder, model_, 0);
  EXPECT_EQ(status, kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);

  // Nothing should change.
  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
       subgraph_idx++) {
    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
    ASSERT_EQ(quantized_graph->tensors()->size(),
              float_graph->tensors()->size());
    // Make sure the graph only has one Conv operation.
    ASSERT_EQ(quantized_graph->operators()->size(), 1);
    const auto op = quantized_graph->operators()->Get(0);
    const uint32_t op_code_idx = op->opcode_index();
    ASSERT_EQ(output_model->operator_codes()->Get(op_code_idx)->builtin_code(),
              BuiltinOperator_CONV_2D);
    for (size_t i = 0; i < quantized_graph->tensors()->size(); i++) {
      const auto quant_tensor = quantized_graph->tensors()->Get(i);
      const auto float_tensor = float_graph->tensors()->Get(i);
      EXPECT_EQ(quant_tensor->buffer(), float_tensor->buffer());
      EXPECT_EQ(quant_tensor->is_variable(), float_tensor->is_variable());
      EXPECT_EQ(GetAsVector(quant_tensor->shape()),
                GetAsVector(float_tensor->shape()));
      EXPECT_EQ(quant_tensor->name()->str(), float_tensor->name()->str());
      // The weight should have type INT8; the bias, the model inputs/outputs,
      // and all other tensors should stay FLOAT32.
      if (quant_tensor->name()->str() == "conv_bias") {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      } else if (IsModelInputOrOutput(output_model, i)) {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      } else if (quant_tensor->buffer() != 0) {
        EXPECT_EQ(quant_tensor->type(), TensorType_INT8)
            << quant_tensor->name()->str();
      } else {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      }
    }
  }
}

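// With use_hybrid_evaluation=false, the weights are quantized to INT8 and a
// Dequantize op is inserted to convert them back to FLOAT32 for the Conv op,
// adding one tensor to the graph.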
TEST_F(QuantizeWeightsTest, DequantizeConv) {
  LoadBasicModel();
  flatbuffers::FlatBufferBuilder builder;
  auto status = internal::QuantizeWeights(&builder, model_, 0,
                                          /*use_hybrid_evaluation=*/false);
  EXPECT_EQ(status, kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);

  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
       ++subgraph_idx) {
    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
    // The output graph should have an extra tensor from the added dequantize
    // op.
    ASSERT_EQ(quantized_graph->tensors()->size(),
              float_graph->tensors()->size() + 1);
    // Check that a dequantize op exists.
    int32_t dequant_input_idx = -1;
    int32_t dequant_output_idx = -1;
    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
      const auto op = quantized_graph->operators()->Get(i);
      const uint32_t op_code_idx = op->opcode_index();
      if (output_model->operator_codes()->Get(op_code_idx)->builtin_code() ==
          BuiltinOperator_DEQUANTIZE) {
        dequant_input_idx = op->inputs()->Get(0);
        dequant_output_idx = op->outputs()->Get(0);
      }
    }
    ASSERT_GT(dequant_input_idx, -1);
    ASSERT_GT(dequant_output_idx, -1);
    for (size_t i = 0; i < quantized_graph->tensors()->size(); ++i) {
      const auto quant_tensor = quantized_graph->tensors()->Get(i);
      // The input to the dequantize op (the weight) should be INT8; the bias,
      // the model inputs/outputs, and all other tensors should be FLOAT32.
      if (i == dequant_input_idx) {
        EXPECT_EQ(quant_tensor->type(), TensorType_INT8);
      } else if (i == dequant_output_idx) {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      } else if (IsModelInputOrOutput(output_model, i)) {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      } else if (quant_tensor->name()->str() == "conv_bias") {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      } else if (quant_tensor->buffer() != 0) {
        // If it's a non-bias constant tensor, it must be the weight.
        EXPECT_EQ(quant_tensor->type(), TensorType_INT8);
      } else {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      }
    }
  }
}

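// Float16 quantization stores the weights and bias as FLOAT16 and dequantizes
// them back to FLOAT32, adding two tensors to the graph.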
TEST_F(QuantizeWeightsTest, DequantizeConvFloat16) {
  LoadBasicModel();
  flatbuffers::FlatBufferBuilder builder;
  auto status = tflite::optimize::QuantizeWeights(
      &builder, model_, BufferType::QUANTIZED_FLOAT16);
  EXPECT_EQ(status, kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);

  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
       ++subgraph_idx) {
    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
    // The output graph should have two extra tensors from the added dequantize
    // ops.
    ASSERT_EQ(quantized_graph->tensors()->size(),
              float_graph->tensors()->size() + 2);
    // Check that a dequantize op exists.
    int32_t dequant_input_idx = -1;
    int32_t dequant_output_idx = -1;
    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
      const auto op = quantized_graph->operators()->Get(i);
      const uint32_t op_code_idx = op->opcode_index();
      if (output_model->operator_codes()->Get(op_code_idx)->builtin_code() ==
          BuiltinOperator_DEQUANTIZE) {
        dequant_input_idx = op->inputs()->Get(0);
        dequant_output_idx = op->outputs()->Get(0);
      }
    }
    ASSERT_GT(dequant_input_idx, -1);
    ASSERT_GT(dequant_output_idx, -1);
    for (size_t i = 0; i < quantized_graph->tensors()->size(); ++i) {
      const auto quant_tensor = quantized_graph->tensors()->Get(i);
      // The weight and bias (the inputs to the dequantize ops) should be
      // FLOAT16; the model inputs/outputs and all other tensors should be
      // FLOAT32.
      if (i == dequant_input_idx) {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT16);
      } else if (i == dequant_output_idx) {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      } else if (IsModelInputOrOutput(output_model, i)) {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      } else if (quant_tensor->name()->str() == "conv_bias") {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT16);
      } else if (quant_tensor->buffer() != 0) {
        // If it's a non-bias constant tensor, it must be the weight.
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT16);
      } else {
        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
      }
    }
  }
}

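// When two convolutions share the same weights tensor, hybrid quantization
// should still quantize that tensor to INT8.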
TEST_F(QuantizeWeightsTest, SharedWeights_Hybrid) {
  LoadSharedWeightsModel();
  flatbuffers::FlatBufferBuilder builder;
  auto status = QuantizeWeights(&builder, model_, 0);
  EXPECT_EQ(status, kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);

  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
  uint32_t num_conv_ops = 0;
  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
       ++subgraph_idx) {
    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
      const auto op = quantized_graph->operators()->Get(i);
      const uint32_t op_code_idx = op->opcode_index();
      const auto op_code =
          output_model->operator_codes()->Get(op_code_idx)->builtin_code();
      if (op_code == BuiltinOperator_CONV_2D) {
        num_conv_ops++;
        // Ensure that each convolution's weights tensor is now INT8.
        const auto weights_tensor =
            quantized_graph->tensors()->Get(op->inputs()->Get(1));
        EXPECT_EQ(weights_tensor->type(), TensorType_INT8);
      }
    }
  }
  // Ensure that there were exactly two convolutions in the model.
  EXPECT_EQ(num_conv_ops, 2);
}

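// Without hybrid evaluation, the shared weights are quantized to INT8 and fed
// to the convolutions through a Dequantize op, so each Conv still consumes a
// FLOAT32 weights tensor.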
TEST_F(QuantizeWeightsTest, SharedWeights_Dequantize) {
  LoadSharedWeightsModel();
  flatbuffers::FlatBufferBuilder builder;
  auto status = internal::QuantizeWeights(&builder, model_, 0,
                                          /*use_hybrid_evaluation=*/false);
  EXPECT_EQ(status, kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);

  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
  uint32_t num_conv_ops = 0;
  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
       ++subgraph_idx) {
    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
      const auto op = quantized_graph->operators()->Get(i);
      const uint32_t op_code_idx = op->opcode_index();
      const auto op_code =
          output_model->operator_codes()->Get(op_code_idx)->builtin_code();
      if (op_code == BuiltinOperator_CONV_2D) {
        num_conv_ops++;
        // Ensure that each convolution's weights tensor is still FLOAT32
        // (the output of the dequantize).
        uint32_t weights_tensor_index = op->inputs()->Get(1);
        const auto weights_tensor =
            quantized_graph->tensors()->Get(weights_tensor_index);
        EXPECT_EQ(weights_tensor->type(), TensorType_FLOAT32);

        // Check that it comes from a dequantize operation.
        BuiltinOperator producer_op_code;
        ASSERT_TRUE(GetProducerOpCode(output_model, subgraph_idx,
                                      weights_tensor_index, &producer_op_code));
        EXPECT_EQ(producer_op_code, BuiltinOperator_DEQUANTIZE);
      }
    }
  }
  // Ensure that there were exactly two convolutions in the model.
  EXPECT_EQ(num_conv_ops, 2);
}

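// The constant that Gather looks up (its first input) should be quantized to
// INT8.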
TEST_F(QuantizeWeightsTest, VerifyGatherQuantization) {
  LoadGatherTestModel();
  flatbuffers::FlatBufferBuilder builder;
  auto status = QuantizeWeights(&builder, model_, 0);
  EXPECT_EQ(status, kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);

  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
       ++subgraph_idx) {
    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
      const auto op = quantized_graph->operators()->Get(i);
      const uint32_t op_code_idx = op->opcode_index();
      const auto op_code =
          output_model->operator_codes()->Get(op_code_idx)->builtin_code();
      if (op_code == BuiltinOperator_GATHER) {
        uint32_t input_tensor_index = op->inputs()->Get(0);
        const auto weights_tensor =
            quantized_graph->tensors()->Get(input_tensor_index);
        EXPECT_EQ(weights_tensor->type(), TensorType_INT8);
      }
    }
  }
}

TEST_F(QuantizeWeightsTest, VerifyCustomOpQuantizationDequantize) {
  LoadCustomOpTestModel();

  // The custom op is not hybrid, and the second input is a constant that can
  // be quantized.
  CustomOpMap custom_op_map;
  custom_op_map["CustomTestOp"] = {
      .quantizable_input_indices = {1},
      .is_hybrid = false,
  };

  flatbuffers::FlatBufferBuilder builder;
  auto status = QuantizeWeights(&builder, model_, 0, custom_op_map);
  ASSERT_EQ(status, kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);

  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
  const auto quantized_graph = output_model->subgraphs()->Get(0);
  // A dequantize op should be added.
  ASSERT_EQ(quantized_graph->operators()->size(),
            model_->subgraphs()->Get(0)->operators()->size() + 1);
  int num_custom_ops_found = 0;
  for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
    const auto op = quantized_graph->operators()->Get(i);
    const uint32_t op_code_idx = op->opcode_index();
    const auto op_code =
        output_model->operator_codes()->Get(op_code_idx)->builtin_code();
    if (op_code == BuiltinOperator_CUSTOM) {
      uint32_t weights_tensor_index = op->inputs()->Get(1);
      const auto weights_tensor =
          quantized_graph->tensors()->Get(weights_tensor_index);
      EXPECT_EQ(weights_tensor->type(), TensorType_FLOAT32);

      // Check that it comes from a dequantize operation.
      BuiltinOperator producer_op_code;
      ASSERT_TRUE(GetProducerOpCode(output_model, 0, weights_tensor_index,
                                    &producer_op_code));
      EXPECT_EQ(producer_op_code, BuiltinOperator_DEQUANTIZE);
      num_custom_ops_found++;
    }
  }
  EXPECT_EQ(num_custom_ops_found, 1);
}

TEST_F(QuantizeWeightsTest, VerifyCustomOpQuantizationHybrid) {
  LoadCustomOpTestModel();

  // The custom op is hybrid, and the second input is a constant that can
  // be quantized.
  CustomOpMap custom_op_map;
  custom_op_map["CustomTestOp"] = {
      .quantizable_input_indices = {1},
      .is_hybrid = true,
  };

  flatbuffers::FlatBufferBuilder builder;
  auto status = QuantizeWeights(&builder, model_, 0, custom_op_map);
  ASSERT_EQ(status, kTfLiteOk);

  const uint8_t* buffer = builder.GetBufferPointer();
  const Model* output_model = GetModel(buffer);
  ASSERT_TRUE(output_model);

  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
  const auto quantized_graph = output_model->subgraphs()->Get(0);
  // No dequantize op should be added, so the op count stays the same.
  ASSERT_EQ(quantized_graph->operators()->size(),
            model_->subgraphs()->Get(0)->operators()->size());
  int num_custom_ops_found = 0;
  for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
    const auto op = quantized_graph->operators()->Get(i);
    const uint32_t op_code_idx = op->opcode_index();
    const auto op_code =
        output_model->operator_codes()->Get(op_code_idx)->builtin_code();
    if (op_code == BuiltinOperator_CUSTOM) {
      uint32_t weights_tensor_index = op->inputs()->Get(1);
      const auto weights_tensor =
          quantized_graph->tensors()->Get(weights_tensor_index);
      EXPECT_EQ(weights_tensor->type(), TensorType_INT8);
      num_custom_ops_found++;
    }
  }
  EXPECT_EQ(num_custom_ops_found, 1);
}

}  // namespace
}  // namespace optimize
}  // namespace tflite

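// Test entry point. The --test_model_file flag must point at one of the
// generated test models; its directory is used to locate the other models.
// Example invocation (path is illustrative only):
//   quantize_weights_test --test_model_file=/path/to/test_model.tflite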
int main(int argc, char** argv) {
  tensorflow::string model_file;
  const std::vector<tensorflow::Flag> flag_list = {
      tensorflow::Flag("test_model_file", &model_file,
                       "Path to test tflite model file."),
  };

  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
  if (!parse_result) {
    std::cerr << "Required flag: --test_model_file\n";
    std::abort();
  }
  g_test_model_dir =
      new tensorflow::string(tensorflow::io::Dirname(model_file));
  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
  return RUN_ALL_TESTS();
}