diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD
index d4428f4d498..c692b948692 100644
--- a/tensorflow/lite/tools/benchmark/BUILD
+++ b/tensorflow/lite/tools/benchmark/BUILD
@@ -89,9 +89,9 @@ cc_library(
         ":logging",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:string_util",
-        "//tensorflow/lite/delegates/nnapi:nnapi_delegate",
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/profiling:profile_summarizer",
+        "//tensorflow/lite/profiling:profiler",
        "//tensorflow/lite/tools/evaluation:utils",
         "@gemmlowp",
     ],
diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md
index e432d81f8ec..d5c89bd266b 100644
--- a/tensorflow/lite/tools/benchmark/README.md
+++ b/tensorflow/lite/tools/benchmark/README.md
@@ -45,6 +45,8 @@ and the following optional parameters:
 *   `use_gpu`: `bool` (default=false) \
     Whether to use the [GPU accelerator delegate](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/delegates/gpu).
     This option is currently only available on Android devices.
+*   `enable_op_profiling`: `bool` (default=false) \
+    Whether to enable per-operator profiling measurement.
 
 ## To build/install/run
@@ -129,19 +131,18 @@ where `f0` is the affinity mask for big cores on Pixel 2.
 Note: The affinity mask varies with the device.
 
 ## Profiling model operators
-The benchmark model binary also allows you to profile operators and give execution times of each operator. To do this,
-compile the binary with a compiler flag that enables profiling to be compiled in. Pass **--copt=-DTFLITE_PROFILING_ENABLED**
-to compile benchmark with profiling support.
-For example, to compile with profiling support on Android, add this flag to the previous command:
+The benchmark model binary also allows you to profile operators and give
+execution times of each operator. To do this, pass the flag
+`--enable_op_profiling=true` to `benchmark_model` during invocation, e.g.,
 
 ```
-bazel build -c opt \
-  --config=android_arm \
-  --cxxopt='--std=c++11' \
-  --copt=-DTFLITE_PROFILING_ENABLED \
-  tensorflow/lite/tools/benchmark:benchmark_model
+adb shell taskset f0 /data/local/tmp/benchmark_model \
+  --graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \
+  --enable_op_profiling=true
 ```
 
-This compiles TFLite with profiling enabled, now you can run the benchmark binary like before.
-The binary will produce detailed statistics for each operation similar to those shown below:
+
+When enabled, the `benchmark_model` binary will produce detailed statistics for
+each operation similar to those shown below:
 
 ```
diff --git a/tensorflow/lite/tools/benchmark/benchmark_test.cc b/tensorflow/lite/tools/benchmark/benchmark_test.cc
index c7fbc24a477..8fd625cf141 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_test.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_test.cc
@@ -48,6 +48,7 @@ BenchmarkParams CreateParams() {
   params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f));
   params.AddParam("use_legacy_nnapi", BenchmarkParam::Create<bool>(false));
   params.AddParam("use_gpu", BenchmarkParam::Create<bool>(false));
+  params.AddParam("enable_op_profiling", BenchmarkParam::Create<bool>(false));
   return params;
 }
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index 161ae1df034..10c9643e9b6 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -26,6 +26,8 @@ limitations under the License.
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/profiling/buffered_profiler.h"
+#include "tensorflow/lite/profiling/profile_summarizer.h"
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/tools/benchmark/logging.h"
 #include "tensorflow/lite/tools/evaluation/utils.h"
@@ -40,12 +42,44 @@ void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
 
 namespace tflite {
 namespace benchmark {
+namespace {
 
-void ProfilingListener::SetInterpreter(tflite::Interpreter* interpreter) {
-  TFLITE_BENCHMARK_CHECK(interpreter);
-  interpreter_ = interpreter;
-  interpreter_->SetProfiler(&profiler_);
-}
+// Backward compat with previous approach to enabling op profiling.
+#if defined(TFLITE_PROFILING_ENABLED)
+constexpr int kOpProfilingEnabledDefault = true;
+#else
+constexpr int kOpProfilingEnabledDefault = false;
+#endif
+
+// Dumps profiling events if profiling is enabled.
+class ProfilingListener : public BenchmarkListener {
+ public:
+  explicit ProfilingListener(Interpreter* interpreter)
+      : interpreter_(interpreter), has_profiles_(false) {
+    TFLITE_BENCHMARK_CHECK(interpreter);
+    interpreter_->SetProfiler(&profiler_);
+  }
+
+  void OnSingleRunStart(RunType run_type) override;
+
+  void OnSingleRunEnd() override;
+
+  void OnBenchmarkEnd(const BenchmarkResults& results) override;
+
+ private:
+  Interpreter* interpreter_;
+  profiling::BufferedProfiler profiler_;
+  profiling::ProfileSummarizer summarizer_;
+  bool has_profiles_;
+};
+
+// Dumps gemmlowp profiling events if gemmlowp profiling is enabled.
+class GemmlowpProfilingListener : public BenchmarkListener {
+ public:
+  void OnBenchmarkStart(const BenchmarkParams& params) override;
+
+  void OnBenchmarkEnd(const BenchmarkResults& results) override;
+};
 
 void ProfilingListener::OnSingleRunStart(RunType run_type) {
   if (run_type == REGULAR) {
@@ -82,8 +116,6 @@ void GemmlowpProfilingListener::OnBenchmarkEnd(
 #endif
 }
 
-namespace {
-
 std::vector<std::string> Split(const std::string& str, const char delim) {
   std::istringstream input(str);
   std::vector<std::string> results;
@@ -201,6 +233,9 @@ BenchmarkParams BenchmarkTfLiteModel::DefaultParams() {
                           BenchmarkParam::Create<bool>(false));
   default_params.AddParam("use_gpu", BenchmarkParam::Create<bool>(false));
   default_params.AddParam("allow_fp16", BenchmarkParam::Create<bool>(false));
+  default_params.AddParam(
+      "enable_op_profiling",
+      BenchmarkParam::Create<bool>(kOpProfilingEnabledDefault));
   return default_params;
 }
@@ -209,8 +244,6 @@ BenchmarkTfLiteModel::BenchmarkTfLiteModel()
 
 BenchmarkTfLiteModel::BenchmarkTfLiteModel(BenchmarkParams params)
     : BenchmarkModel(std::move(params)) {
-  AddListener(&profiling_listener_);
-  AddListener(&gemmlowp_profiling_listener_);
 }
 
 void BenchmarkTfLiteModel::CleanUp() {
@@ -236,7 +269,8 @@ std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
       CreateFlag<bool>("use_nnapi", &params_, "use nnapi delegate api"),
       CreateFlag<bool>("use_legacy_nnapi", &params_, "use legacy nnapi api"),
       CreateFlag<bool>("use_gpu", &params_, "use gpu"),
-      CreateFlag<bool>("allow_fp16", &params_, "allow fp16")};
+      CreateFlag<bool>("allow_fp16", &params_, "allow fp16"),
+      CreateFlag<bool>("enable_op_profiling", &params_, "enable op profiling")};
 
   flags.insert(flags.end(), specific_flags.begin(), specific_flags.end());
   return flags;
@@ -255,6 +289,8 @@ void BenchmarkTfLiteModel::LogParams() {
   TFLITE_LOG(INFO) << "Use gpu : [" << params_.Get<bool>("use_gpu") << "]";
   TFLITE_LOG(INFO) << "Allow fp16 : [" << params_.Get<bool>("allow_fp16")
                    << "]";
+  TFLITE_LOG(INFO) << "Enable op profiling: ["
+                   << params_.Get<bool>("enable_op_profiling") << "]";
 }
 
 bool BenchmarkTfLiteModel::ValidateParams() {
@@ -382,7 +418,6 @@ void BenchmarkTfLiteModel::Init() {
   if (!interpreter) {
     TFLITE_LOG(FATAL) << "Failed to construct interpreter";
   }
-  profiling_listener_.SetInterpreter(interpreter.get());
 
   interpreter->UseNNAPI(params_.Get<bool>("use_legacy_nnapi"));
 
@@ -433,6 +468,16 @@ void BenchmarkTfLiteModel::Init() {
   if (delegates_.empty() && interpreter->AllocateTensors() != kTfLiteOk) {
     TFLITE_LOG(FATAL) << "Failed to allocate tensors!";
   }
+
+  // Install profilers if necessary.
+  if (params_.Get<bool>("enable_op_profiling")) {
+    profiling_listener_.reset(new ProfilingListener(interpreter.get()));
+    AddListener(profiling_listener_.get());
+  }
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp_profiling_listener_.reset(new GemmlowpProfilingListener());
+  AddListener(gemmlowp_profiling_listener_.get());
+#endif
 }
 
 BenchmarkTfLiteModel::TfLiteDelegatePtrMap BenchmarkTfLiteModel::GetDelegates()
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
index 99b9ce35246..ab64effd0ec 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
@@ -22,42 +22,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/profiling/profile_summarizer.h"
 #include "tensorflow/lite/tools/benchmark/benchmark_model.h"
 
 namespace tflite {
 namespace benchmark {
 
-// Dumps profiling events if profiling is enabled.
-class ProfilingListener : public BenchmarkListener {
- public:
-  explicit ProfilingListener() : interpreter_(nullptr), has_profiles_(false) {}
-
-  void SetInterpreter(Interpreter* interpreter);
-
-  void OnSingleRunStart(RunType run_type) override;
-
-  void OnSingleRunEnd() override;
-
-  void OnBenchmarkEnd(const BenchmarkResults& results) override;
-
- private:
-  Interpreter* interpreter_;
-  profiling::Profiler profiler_;
-  profiling::ProfileSummarizer summarizer_;
-  bool has_profiles_;
-};
-
-// Dumps gemmlowp profiling events if gemmlowp profiling is enabled.
-class GemmlowpProfilingListener : public BenchmarkListener {
- public:
-  virtual ~GemmlowpProfilingListener() {}
-
-  void OnBenchmarkStart(const BenchmarkParams& params) override;
-
-  void OnBenchmarkEnd(const BenchmarkResults& results) override;
-};
-
 // Benchmarks a TFLite model by running tflite interpreter.
 class BenchmarkTfLiteModel : public BenchmarkModel {
  public:
@@ -99,8 +68,8 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   };
   std::vector<InputLayerInfo> inputs;
   std::vector<InputTensorData> inputs_data_;
-  ProfilingListener profiling_listener_;
-  GemmlowpProfilingListener gemmlowp_profiling_listener_;
+  std::unique_ptr<BenchmarkListener> profiling_listener_;
+  std::unique_ptr<BenchmarkListener> gemmlowp_profiling_listener_;
   TfLiteDelegatePtrMap delegates_;
 };
diff --git a/tensorflow/lite/tools/make/build_ios_universal_lib.sh b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
index 8b617ef5937..3678f554d08 100755
--- a/tensorflow/lite/tools/make/build_ios_universal_lib.sh
+++ b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
@@ -32,7 +32,7 @@ BUILD_ARCHS="x86_64 armv7 armv7s arm64"
 while getopts "a:p" opt_name; do
   case "$opt_name" in
     a) BUILD_ARCHS="${OPTARG}";;
-    p) profiling_args='-DGEMMLOWP_PROFILING,-DTFLITE_PROFILING_ENABLED';;
+    p) profiling_args='-DGEMMLOWP_PROFILING';;
    *) usage;;
   esac
 done
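
As context for the listener-based design this patch moves to: a minimal sketch of a custom `BenchmarkListener`, using only hooks that appear in the patch itself (`OnSingleRunStart`, `OnBenchmarkEnd`, `RunType`/`REGULAR`, `TFLITE_LOG`). The class name `RunCountListener` is hypothetical, for illustration only — it is not part of this change:

```
#include "tensorflow/lite/tools/benchmark/benchmark_model.h"
#include "tensorflow/lite/tools/benchmark/logging.h"

namespace tflite {
namespace benchmark {

// Hypothetical listener for illustration: counts measured (non-warmup) runs
// and logs the total when the benchmark finishes. Overrides only the hooks
// it needs; BenchmarkListener provides empty defaults for the rest.
class RunCountListener : public BenchmarkListener {
 public:
  void OnSingleRunStart(RunType run_type) override {
    // REGULAR distinguishes measured runs from warmup runs, mirroring the
    // check in ProfilingListener::OnSingleRunStart above.
    if (run_type == REGULAR) ++regular_runs_;
  }

  void OnBenchmarkEnd(const BenchmarkResults& results) override {
    TFLITE_LOG(INFO) << "Observed " << regular_runs_ << " regular runs.";
  }

 private:
  int regular_runs_ = 0;
};

}  // namespace benchmark
}  // namespace tflite
```

Such a listener would be wired up the same way the patch now installs `ProfilingListener` in `BenchmarkTfLiteModel::Init()`: hold it in a `std::unique_ptr<BenchmarkListener>` member and register it with `AddListener(listener.get())`.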