Add a --enable_op_profiling flag for TFLite's benchmark_model

Use of `--copt=-DTFLITE_PROFILING_ENABLED` is no longer required to
enable per-op profiling when running TFLite's benchmark_model utility.
Simply use `--enable_op_profiling=true` to get this information.

PiperOrigin-RevId: 247236386
Jared Duke 2019-05-08 10:16:35 -07:00 committed by TensorFlower Gardener
parent e84d1e517d
commit bba56b4444
6 changed files with 72 additions and 56 deletions
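For context, a minimal host-side build-and-run sketch using the new flag is shown below; the model path is illustrative, and the commands are adapted from the README changes in this commit (no `-DTFLITE_PROFILING_ENABLED` copt is required anymore):

```
# Build the benchmark tool without any profiling-specific copt.
bazel build -c opt tensorflow/lite/tools/benchmark:benchmark_model

# Enable per-op profiling at run time (model path is illustrative).
bazel-bin/tensorflow/lite/tools/benchmark/benchmark_model \
  --graph=mobilenet_quant_v1_224.tflite \
  --enable_op_profiling=true
```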


@@ -89,9 +89,9 @@ cc_library(
":logging",
"//tensorflow/lite:framework",
"//tensorflow/lite:string_util",
"//tensorflow/lite/delegates/nnapi:nnapi_delegate",
"//tensorflow/lite/kernels:builtin_ops",
"//tensorflow/lite/profiling:profile_summarizer",
"//tensorflow/lite/profiling:profiler",
"//tensorflow/lite/tools/evaluation:utils",
"@gemmlowp",
],


@@ -45,6 +45,8 @@ and the following optional parameters:
* `use_gpu`: `bool` (default=false) \
Whether to use the [GPU accelerator delegate](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/delegates/gpu).
This option is currently only available on Android devices.
* `enable_op_profiling`: `bool` (default=false) \
Whether to enable per-operator profiling measurement.
## To build/install/run
@@ -129,19 +131,18 @@ where `f0` is the affinity mask for big cores on Pixel 2.
Note: The affinity mask varies with the device.
## Profiling model operators
The benchmark model binary also allows you to profile operators and give execution times of each operator. To do this,
compile the binary with a compiler flag that enables profiling to be compiled in. Pass **--copt=-DTFLITE_PROFILING_ENABLED**
to compile benchmark with profiling support.
For example, to compile with profiling support on Android, add this flag to the previous command:
The benchmark model binary also allows you to profile operators and give
execution times of each operator. To do this, pass the flag
`--enable_op_profiling=true` to `benchmark_model` during invocation, e.g.,
```
bazel build -c opt \
--config=android_arm \
--cxxopt='--std=c++11' \
--copt=-DTFLITE_PROFILING_ENABLED \
tensorflow/lite/tools/benchmark:benchmark_model
adb shell taskset f0 /data/local/tmp/benchmark_model \
--graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \
--enable_op_profiling=true
```
This compiles TFLite with profiling enabled, now you can run the benchmark binary like before. The binary will produce detailed statistics for each operation similar to those shown below:
When enabled, the `benchmark_model` binary will produce detailed statistics for
each operation similar to those shown below:
```
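With this change, the Android build command shown earlier in the README needs no profiling-specific option; a sketch of the simplified command, adapted from the lines removed above:

```
bazel build -c opt \
  --config=android_arm \
  --cxxopt='--std=c++11' \
  tensorflow/lite/tools/benchmark:benchmark_model
```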


@@ -48,6 +48,7 @@ BenchmarkParams CreateParams() {
params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f));
params.AddParam("use_legacy_nnapi", BenchmarkParam::Create<bool>(false));
params.AddParam("use_gpu", BenchmarkParam::Create<bool>(false));
params.AddParam("enable_op_profiling", BenchmarkParam::Create<bool>(false));
return params;
}


@@ -26,6 +26,8 @@ limitations under the License.
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/op_resolver.h"
#include "tensorflow/lite/profiling/buffered_profiler.h"
#include "tensorflow/lite/profiling/profile_summarizer.h"
#include "tensorflow/lite/string_util.h"
#include "tensorflow/lite/tools/benchmark/logging.h"
#include "tensorflow/lite/tools/evaluation/utils.h"
@@ -40,13 +42,45 @@ void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
namespace tflite {
namespace benchmark {
namespace {
void ProfilingListener::SetInterpreter(tflite::Interpreter* interpreter) {
// Backward compat with previous approach to enabling op profiling.
#if defined(TFLITE_PROFILING_ENABLED)
constexpr int kOpProfilingEnabledDefault = true;
#else
constexpr int kOpProfilingEnabledDefault = false;
#endif
// Dumps profiling events if profiling is enabled.
class ProfilingListener : public BenchmarkListener {
public:
explicit ProfilingListener(Interpreter* interpreter)
: interpreter_(interpreter), has_profiles_(false) {
TFLITE_BENCHMARK_CHECK(interpreter);
interpreter_ = interpreter;
interpreter_->SetProfiler(&profiler_);
}
void OnSingleRunStart(RunType run_type) override;
void OnSingleRunEnd() override;
void OnBenchmarkEnd(const BenchmarkResults& results) override;
private:
Interpreter* interpreter_;
profiling::BufferedProfiler profiler_;
profiling::ProfileSummarizer summarizer_;
bool has_profiles_;
};
// Dumps gemmlowp profiling events if gemmlowp profiling is enabled.
class GemmlowpProfilingListener : public BenchmarkListener {
public:
void OnBenchmarkStart(const BenchmarkParams& params) override;
void OnBenchmarkEnd(const BenchmarkResults& results) override;
};
void ProfilingListener::OnSingleRunStart(RunType run_type) {
if (run_type == REGULAR) {
profiler_.Reset();
@@ -82,8 +116,6 @@ void GemmlowpProfilingListener::OnBenchmarkEnd(
#endif
}
namespace {
std::vector<std::string> Split(const std::string& str, const char delim) {
std::istringstream input(str);
std::vector<std::string> results;
@@ -201,6 +233,9 @@ BenchmarkParams BenchmarkTfLiteModel::DefaultParams() {
BenchmarkParam::Create<bool>(false));
default_params.AddParam("use_gpu", BenchmarkParam::Create<bool>(false));
default_params.AddParam("allow_fp16", BenchmarkParam::Create<bool>(false));
default_params.AddParam(
"enable_op_profiling",
BenchmarkParam::Create<bool>(kOpProfilingEnabledDefault));
return default_params;
}
@@ -209,8 +244,6 @@ BenchmarkTfLiteModel::BenchmarkTfLiteModel()
BenchmarkTfLiteModel::BenchmarkTfLiteModel(BenchmarkParams params)
: BenchmarkModel(std::move(params)) {
AddListener(&profiling_listener_);
AddListener(&gemmlowp_profiling_listener_);
}
void BenchmarkTfLiteModel::CleanUp() {
@@ -236,7 +269,8 @@ std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
CreateFlag<bool>("use_nnapi", &params_, "use nnapi delegate api"),
CreateFlag<bool>("use_legacy_nnapi", &params_, "use legacy nnapi api"),
CreateFlag<bool>("use_gpu", &params_, "use gpu"),
CreateFlag<bool>("allow_fp16", &params_, "allow fp16")};
CreateFlag<bool>("allow_fp16", &params_, "allow fp16"),
CreateFlag<bool>("enable_op_profiling", &params_, "enable op profiling")};
flags.insert(flags.end(), specific_flags.begin(), specific_flags.end());
return flags;
@@ -255,6 +289,8 @@ void BenchmarkTfLiteModel::LogParams() {
TFLITE_LOG(INFO) << "Use gpu : [" << params_.Get<bool>("use_gpu") << "]";
TFLITE_LOG(INFO) << "Allow fp16 : [" << params_.Get<bool>("allow_fp16")
<< "]";
TFLITE_LOG(INFO) << "Enable op profiling: ["
<< params_.Get<bool>("enable_op_profiling") << "]";
}
bool BenchmarkTfLiteModel::ValidateParams() {
@@ -382,7 +418,6 @@ void BenchmarkTfLiteModel::Init() {
if (!interpreter) {
TFLITE_LOG(FATAL) << "Failed to construct interpreter";
}
profiling_listener_.SetInterpreter(interpreter.get());
interpreter->UseNNAPI(params_.Get<bool>("use_legacy_nnapi"));
@@ -433,6 +468,16 @@ void BenchmarkTfLiteModel::Init() {
if (delegates_.empty() && interpreter->AllocateTensors() != kTfLiteOk) {
TFLITE_LOG(FATAL) << "Failed to allocate tensors!";
}
// Install profilers if necessary.
if (params_.Get<bool>("enable_op_profiling")) {
profiling_listener_.reset(new ProfilingListener(interpreter.get()));
AddListener(profiling_listener_.get());
}
#ifdef GEMMLOWP_PROFILING
gemmlowp_profiling_listener_.reset(new GemmlowpProfilingListener());
AddListener(gemmlowp_profiling_listener_.get());
#endif
}
BenchmarkTfLiteModel::TfLiteDelegatePtrMap BenchmarkTfLiteModel::GetDelegates()
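Note the distinction preserved above: gemmlowp profiling is still gated by a compile-time define (`GEMMLOWP_PROFILING`), while op profiling is now purely a runtime flag. A minimal sketch of the build-time side:

```
# Gemmlowp profiling still requires a compile-time define:
bazel build -c opt --copt=-DGEMMLOWP_PROFILING \
  tensorflow/lite/tools/benchmark:benchmark_model
# Per-op profiling needs no special build; pass --enable_op_profiling=true at run time.
```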


@@ -22,42 +22,11 @@ limitations under the License.
#include <vector>
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/profiling/profile_summarizer.h"
#include "tensorflow/lite/tools/benchmark/benchmark_model.h"
namespace tflite {
namespace benchmark {
// Dumps profiling events if profiling is enabled.
class ProfilingListener : public BenchmarkListener {
public:
explicit ProfilingListener() : interpreter_(nullptr), has_profiles_(false) {}
void SetInterpreter(Interpreter* interpreter);
void OnSingleRunStart(RunType run_type) override;
void OnSingleRunEnd() override;
void OnBenchmarkEnd(const BenchmarkResults& results) override;
private:
Interpreter* interpreter_;
profiling::Profiler profiler_;
profiling::ProfileSummarizer summarizer_;
bool has_profiles_;
};
// Dumps gemmlowp profiling events if gemmlowp profiling is enabled.
class GemmlowpProfilingListener : public BenchmarkListener {
public:
virtual ~GemmlowpProfilingListener() {}
void OnBenchmarkStart(const BenchmarkParams& params) override;
void OnBenchmarkEnd(const BenchmarkResults& results) override;
};
// Benchmarks a TFLite model by running tflite interpreter.
class BenchmarkTfLiteModel : public BenchmarkModel {
public:
@@ -99,8 +68,8 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
};
std::vector<InputLayerInfo> inputs;
std::vector<InputTensorData> inputs_data_;
ProfilingListener profiling_listener_;
GemmlowpProfilingListener gemmlowp_profiling_listener_;
std::unique_ptr<BenchmarkListener> profiling_listener_;
std::unique_ptr<BenchmarkListener> gemmlowp_profiling_listener_;
TfLiteDelegatePtrMap delegates_;
};


@@ -32,7 +32,7 @@ BUILD_ARCHS="x86_64 armv7 armv7s arm64"
while getopts "a:p" opt_name; do
case "$opt_name" in
a) BUILD_ARCHS="${OPTARG}";;
p) profiling_args='-DGEMMLOWP_PROFILING,-DTFLITE_PROFILING_ENABLED';;
p) profiling_args='-DGEMMLOWP_PROFILING';;
*) usage;;
esac
done
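For the iOS build script above, the `-p` option now only adds the gemmlowp define. A hypothetical invocation (the script's file name is not shown in this diff, so the name below is assumed):

```
# Hypothetical invocation; the actual script name is not shown in this diff.
# -p adds -DGEMMLOWP_PROFILING; -a restricts the architectures to build.
sh build_benchmark_framework.sh -p -a "arm64"
```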