Internal tests cleanup.
PiperOrigin-RevId: 339390176
Change-Id: Ie0480a0d8d78bb1a50db434c7f456d407a72444c
parent e7715df2de, commit 5008bbbca4
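
All of the diffs below apply the same mechanical migration: benchmark functions that took an explicit "int iters" argument and reported counters through the old testing::ItemsProcessed / testing::BytesProcessed / testing::UseRealTime free functions are rewritten against ::testing::benchmark::State. Arguments move to state.range(...), graph benchmarks pass /*old_benchmark_api=*/false to test::Benchmark(...) and call .Run(state), counters are set on state after the run, and real-time mode is attached to the BENCHMARK(...) registration instead. A minimal sketch of the before/after pattern; BM_Example and MakeGraph are illustrative placeholders, not names from this change:

// Before: the framework passed the iteration count directly.
//
//   static void BM_Example(int iters, int num) {
//     testing::UseRealTime();
//     testing::ItemsProcessed(static_cast<int64>(iters) * num);
//     test::Benchmark("cpu", MakeGraph(num)).Run(iters);
//   }
//   BENCHMARK(BM_Example)->Arg(1 << 10);

// After: the State object drives the iteration loop and owns the counters.
// (Sketch only; assumes the test::Benchmark helper and int64 alias that these
// test files already include. MakeGraph is a hypothetical graph builder.)
static void BM_Example(::testing::benchmark::State& state) {
  const int num = state.range(0);
  test::Benchmark("cpu", MakeGraph(num), /*old_benchmark_api=*/false)
      .Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
}
BENCHMARK(BM_Example)->UseRealTime()->Arg(1 << 10);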
@@ -64,13 +64,16 @@ static void MulChain(int chain_length, Graph** init_g, Graph** run_g) {
// Benchmark a chain of simple multiplications.
// This emphasizes per-op overhead.
static void BM_MulChain(int iters, int chain_length) {
const int64 tot = static_cast<int64>(iters) * chain_length;
testing::ItemsProcessed(tot);
static void BM_MulChain(::testing::benchmark::State& state) {
const int chain_length = state.range(0);
Graph* init;
Graph* run;
MulChain(chain_length, &init, &run);
test::Benchmark("cpu", run, GetOptions(), init).Run(iters);
test::Benchmark("cpu", run, GetOptions(), init, nullptr, "",
                /*old_benchmark_api=*/false)
    .Run(state);
state.SetItemsProcessed(state.iterations());
}
BENCHMARK(BM_MulChain)->Arg(1 << 10);
@@ -115,7 +115,7 @@ class ThroughputBenchmark {
ThroughputBenchmark& operator=(const ThroughputBenchmark&) = delete;
// Perform the benchmark run, based on the parameters supplied to the ctor.
void RunBenchmark(int iters);
void RunBenchmark(::testing::benchmark::State& state);
private:
// Resets all mutable state, including the scheduler.
@@ -136,22 +136,18 @@ ThroughputBenchmark::ThroughputBenchmark(
const BasicBatchScheduler<BenchmarkBatchTask>::Options& scheduler_options)
: scheduler_options_(scheduler_options) {}
void ThroughputBenchmark::RunBenchmark(int iters) {
CHECK_GE(iters, 1);
void ThroughputBenchmark::RunBenchmark(::testing::benchmark::State& state) {
CHECK_GE(state.max_iterations, 1);
testing::StopTiming();
ResetState();
// Have each iteration issue a reasonably large number of tasks, to ensure our
// measurements reflect steady-state behavior.
const int kNumTasksPerIteration = 100 * 1000;
testing::ItemsProcessed(iters * kNumTasksPerIteration);
testing::UseRealTime();
testing::StartTiming();
// Schedule 'num_iterations_*kNumTasksPerIteration' tasks.
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
for (int j = 0; j < kNumTasksPerIteration; ++j) {
auto task = std::unique_ptr<BenchmarkBatchTask>(new BenchmarkBatchTask);
TF_CHECK_OK(scheduler_->Schedule(&task));
@@ -160,7 +156,7 @@ void ThroughputBenchmark::RunBenchmark(int iters) {
// Wait for the scheduler to process all tasks.
scheduler_.reset();
testing::StopTiming();
state.SetItemsProcessed(state.iterations() * kNumTasksPerIteration);
}
void ThroughputBenchmark::ResetState() {
@@ -338,7 +334,8 @@ void LatencyBenchmark::PerformBatchCpuWork() const {
CHECK_NE(dummy, 0);
}
static void RunThroughputBenchmark(int iters, int64 batch_timeout_micros,
static void RunThroughputBenchmark(::testing::benchmark::State& state,
                                   int64 batch_timeout_micros,
                                   int num_batch_threads) {
BasicBatchScheduler<BenchmarkBatchTask>::Options scheduler_options;
const int kMaxBatchSize = 100;
@@ -347,13 +344,14 @@ static void RunThroughputBenchmark(int iters, int64 batch_timeout_micros,
scheduler_options.num_batch_threads = num_batch_threads;
scheduler_options.max_enqueued_batches = INT_MAX;  // Unbounded queue.
ThroughputBenchmark benchmark(scheduler_options);
benchmark.RunBenchmark(iters);
benchmark.RunBenchmark(state);
}
static void ThroughputBM_ZeroTimeout(int iters, int num_batch_threads) {
RunThroughputBenchmark(iters, 0 /* 0 ms timeout */, num_batch_threads);
static void ThroughputBM_ZeroTimeout(::testing::benchmark::State& state) {
RunThroughputBenchmark(state, 0 /* 0 ms timeout */, state.range(0));
}
BENCHMARK(ThroughputBM_ZeroTimeout)
    ->UseRealTime()
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
@@ -362,10 +360,11 @@ BENCHMARK(ThroughputBM_ZeroTimeout)
    ->Arg(32)
    ->Arg(64);
static void ThroughputBM_SmallTimeout(int iters, int num_batch_threads) {
RunThroughputBenchmark(iters, 1 * 1000 /* 1 ms timeout */, num_batch_threads);
static void ThroughputBM_SmallTimeout(::testing::benchmark::State& state) {
RunThroughputBenchmark(state, 1 * 1000 /* 1 ms timeout */, state.range(0));
}
BENCHMARK(ThroughputBM_SmallTimeout)
    ->UseRealTime()
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
@@ -374,11 +373,11 @@ BENCHMARK(ThroughputBM_SmallTimeout)
    ->Arg(32)
    ->Arg(64);
static void ThroughputBM_LargeTimeout(int iters, int num_batch_threads) {
RunThroughputBenchmark(iters, 50 * 1000 /* 50 ms timeout */,
                       num_batch_threads);
static void ThroughputBM_LargeTimeout(::testing::benchmark::State& state) {
RunThroughputBenchmark(state, 50 * 1000 /* 50 ms timeout */, state.range(0));
}
BENCHMARK(ThroughputBM_LargeTimeout)
    ->UseRealTime()
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
@@ -43,22 +43,27 @@ static Graph* BiasAddGrad(int d0, int d1, int d2, int d3) {
return g;
}
#define BM_BiasAddNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
test::Benchmark(#DEVICE, BiasAdd(N, H, W, C)).Run(iters); \
} \
BENCHMARK(BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE);
#define BM_BiasAddNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \
    ::testing::benchmark::State& state) { \
test::Benchmark(#DEVICE, BiasAdd(N, H, W, C), /*old_benchmark_api=*/false) \
    .Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * H * \
                        W * C); \
} \
BENCHMARK(BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE)->UseRealTime();
#define BM_BiasAddGradNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \
    int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
test::Benchmark(#DEVICE, BiasAddGrad(N, H, W, C)).Run(iters); \
} \
BENCHMARK(BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE);
#define BM_BiasAddGradNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \
    ::testing::benchmark::State& state) { \
test::Benchmark(#DEVICE, BiasAddGrad(N, H, W, C), \
                /*old_benchmark_api=*/false) \
    .Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * H * \
                        W * C); \
} \
BENCHMARK(BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE) \
    ->UseRealTime();
// CPU
BM_BiasAddNHWC(32, 32, 32, 128, cpu);
@@ -45,11 +45,15 @@ static Graph* Bincount(int arr_size, int nbins) {
return g;
}
#define BM_BincountDev(K, NBINS, type) \
static void BM_Bincount##_##type##_##K##_##NBINS(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * K * 1024); \
test::Benchmark(#type, Bincount(K * 1024, NBINS)).Run(iters); \
} \
#define BM_BincountDev(K, NBINS, type) \
static void BM_Bincount##_##type##_##K##_##NBINS( \
    ::testing::benchmark::State& state) { \
test::Benchmark(#type, Bincount(K * 1024, NBINS), \
                /*old_benchmark_api=*/false) \
    .Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * K * \
                        1024); \
} \
BENCHMARK(BM_Bincount##_##type##_##K##_##NBINS);
BM_BincountDev(32, 1000, cpu);
@ -44,29 +44,35 @@ static Graph* BroadcastTo(int dim0, int dim1, InputShape input_shape) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_BroadcastTo_InnerDim(DIM0, DIM1, type) \
|
||||
static void BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1(int iters) { \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * DIM0 * DIM1); \
|
||||
test::Benchmark(#type, BroadcastTo(DIM0, DIM1, \
|
||||
[](int dim0, int dim1) { \
|
||||
return TensorShape({dim0, 1}); \
|
||||
})) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1);
|
||||
#define BM_BroadcastTo_InnerDim(DIM0, DIM1, type) \
|
||||
static void BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1( \
|
||||
::testing::benchmark::State& state) { \
|
||||
test::Benchmark(#type, \
|
||||
BroadcastTo(DIM0, DIM1, \
|
||||
[](int dim0, int dim1) { \
|
||||
return TensorShape({dim0, 1}); \
|
||||
}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * DIM0 * \
|
||||
DIM1); \
|
||||
} \
|
||||
BENCHMARK(BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1)->UseRealTime();
|
||||
|
||||
#define BM_BroadcastTo_OuterDim(DIM0, DIM1, type) \
|
||||
static void BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1(int iters) { \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * DIM0 * DIM1); \
|
||||
test::Benchmark(#type, BroadcastTo(DIM0, DIM1, \
|
||||
[](int dim0, int dim1) { \
|
||||
return TensorShape({1, dim1}); \
|
||||
})) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1);
|
||||
#define BM_BroadcastTo_OuterDim(DIM0, DIM1, type) \
|
||||
static void BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1( \
|
||||
::testing::benchmark::State& state) { \
|
||||
test::Benchmark(#type, \
|
||||
BroadcastTo(DIM0, DIM1, \
|
||||
[](int dim0, int dim1) { \
|
||||
return TensorShape({1, dim1}); \
|
||||
}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * DIM0 * \
|
||||
DIM1); \
|
||||
} \
|
||||
BENCHMARK(BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1)->UseRealTime();
|
||||
|
||||
BM_BroadcastTo_InnerDim(64, 64, cpu);
|
||||
BM_BroadcastTo_InnerDim(128, 128, cpu);
|
||||
|
@ -121,102 +121,127 @@ TEST_ALL_CASTS_FROM(quint16)
|
||||
|
||||
// TODO(wicke): check conversions from/to bool, and bfloat16
|
||||
|
||||
static void BM_cpu_float_int64(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_float_int64(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
test::Benchmark("cpu", Cast<float, int64>(num), /*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(int64)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<float, int64>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_float_int64)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_float_int64(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_gpu_float_int64(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<float, int64>(num), /*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(int64)));
|
||||
testing::UseRealTime();
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
}
|
||||
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_gpu_float_int64)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_bool_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_bool_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
test::Benchmark("cpu", Cast<bool, float>(num), /*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(bool) + sizeof(float)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<bool, float>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_bool_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_bool_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_gpu_bool_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<bool, float>(num), /*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(bool) + sizeof(float)));
|
||||
testing::UseRealTime();
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
}
|
||||
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_gpu_bool_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_float_bfloat16(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_float_bfloat16(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
test::Benchmark("cpu", Cast<float, bfloat16>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(bfloat16)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<float, bfloat16>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_float_bfloat16)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_float_bfloat16)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_bfloat16_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_bfloat16_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
test::Benchmark("cpu", Cast<bfloat16, float>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(bfloat16)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<bfloat16, float>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_bfloat16_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_bfloat16_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_float_half(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_float_half(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
test::Benchmark("cpu", Cast<float, Eigen::half>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<float, Eigen::half>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_float_half)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_half_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_half_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
test::Benchmark("cpu", Cast<Eigen::half, float>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<Eigen::half, float>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_half_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_float_half(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
testing::UseRealTime();
|
||||
static void BM_gpu_float_half(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
|
||||
test::Benchmark("gpu", Cast<float, Eigen::half>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
}
|
||||
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_half_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
testing::UseRealTime();
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
}
|
||||
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_gpu_float_half)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_half_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<Eigen::half, float>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
}
|
||||
BENCHMARK(BM_gpu_half_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
} // end namespace tensorflow
|
||||
|
@ -72,22 +72,21 @@ Graph* SetUpKmeansPlusPlusInitialization(int num_dims, int num_points,
|
||||
|
||||
template <int num_points, int num_to_sample, int num_dims,
|
||||
int retries_per_sample>
|
||||
void BM_KmeansPlusPlusInitialization(int iters) {
|
||||
testing::StopTiming();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_points * num_dims *
|
||||
num_to_sample);
|
||||
testing::UseRealTime();
|
||||
void BM_KmeansPlusPlusInitialization(::testing::benchmark::State& state) {
|
||||
Graph* g = SetUpKmeansPlusPlusInitialization(
|
||||
num_dims, num_points, num_to_sample, retries_per_sample);
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_points *
|
||||
num_dims * num_to_sample);
|
||||
}
|
||||
|
||||
#define BENCHMARK_KMEANS_PLUS_PLUS(p, c, d, r) \
|
||||
void BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r(int iters) { \
|
||||
BM_KmeansPlusPlusInitialization<p, c, d, r>(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r);
|
||||
#define BENCHMARK_KMEANS_PLUS_PLUS(p, c, d, r) \
|
||||
void BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r( \
|
||||
::testing::benchmark::State& state) { \
|
||||
BM_KmeansPlusPlusInitialization<p, c, d, r>(state); \
|
||||
} \
|
||||
BENCHMARK(BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r) \
|
||||
->UseRealTime();
|
||||
|
||||
#define RUN_BM_KmeansPlusPlusInitialization(retries) \
|
||||
BENCHMARK_KMEANS_PLUS_PLUS(k10Points, k2Centers, k100Dim, retries); \
|
||||
@ -132,20 +131,18 @@ Graph* SetUpKMC2Initialization(int num_points) {
|
||||
}
|
||||
|
||||
template <int num_points, int num_to_sample, int num_dims>
|
||||
void BM_KMC2Initialization(int iters) {
|
||||
testing::StopTiming();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_points * num_dims *
|
||||
num_to_sample);
|
||||
testing::UseRealTime();
|
||||
void BM_KMC2Initialization(::testing::benchmark::State& state) {
|
||||
Graph* g = SetUpKMC2Initialization(num_points);
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_points *
|
||||
num_dims * num_to_sample);
|
||||
}
|
||||
#define BENCHMARK_KMC2(p, c, d) \
|
||||
void BM_KMC2Initialization_##p##_##c##_##d(int iters) { \
|
||||
BM_KMC2Initialization<p, c, d>(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_KMC2Initialization_##p##_##c##_##d);
|
||||
#define BENCHMARK_KMC2(p, c, d) \
|
||||
void BM_KMC2Initialization_##p##_##c##_##d( \
|
||||
::testing::benchmark::State& state) { \
|
||||
BM_KMC2Initialization<p, c, d>(state); \
|
||||
} \
|
||||
BENCHMARK(BM_KMC2Initialization_##p##_##c##_##d)->UseRealTime();
|
||||
|
||||
#define RUN_BM_KMC2Initialization \
|
||||
BENCHMARK_KMC2(k10Points, k2Centers, k100Dim); \
|
||||
@ -191,14 +188,11 @@ Graph* SetUpNearestNeighbors(int num_dims, int num_points, int num_centers,
|
||||
}
|
||||
|
||||
template <int num_dims, int num_points, int num_centers, int k>
|
||||
void BM_NearestNeighbors(int iters) {
|
||||
testing::StopTiming();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_points * num_dims *
|
||||
num_centers);
|
||||
testing::UseRealTime();
|
||||
void BM_NearestNeighbors(::testing::benchmark::State& state) {
|
||||
Graph* g = SetUpNearestNeighbors(num_dims, num_points, num_centers, k);
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_points *
|
||||
num_dims * num_centers);
|
||||
}
|
||||
|
||||
constexpr int kTop1 = 1;
|
||||
@ -206,11 +200,12 @@ constexpr int kTop2 = 2;
|
||||
constexpr int kTop5 = 5;
|
||||
constexpr int kTop10 = 10;
|
||||
|
||||
#define BENCHMARK_NEAREST_NEIGHBORS(d, p, c, k) \
|
||||
void BM_NearestNeighbors##d##_##p##_##c##_##k(int iters) { \
|
||||
BM_NearestNeighbors<d, p, c, k>(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_NearestNeighbors##d##_##p##_##c##_##k);
|
||||
#define BENCHMARK_NEAREST_NEIGHBORS(d, p, c, k) \
|
||||
void BM_NearestNeighbors##d##_##p##_##c##_##k( \
|
||||
::testing::benchmark::State& state) { \
|
||||
BM_NearestNeighbors<d, p, c, k>(state); \
|
||||
} \
|
||||
BENCHMARK(BM_NearestNeighbors##d##_##p##_##c##_##k)->UseRealTime();
|
||||
|
||||
#define RUN_BM_NearestNeighbors(k) \
|
||||
BENCHMARK_NEAREST_NEIGHBORS(k100Dim, k1kPoints, k100Centers, k); \
|
||||
|
@ -57,9 +57,9 @@ void FillTensorWithRandomValues<tstring>(Tensor* t, int string_length,
|
||||
// std::string, then the length of individual strings in the tensors will be
|
||||
// of length "string_length".
|
||||
template <typename T>
|
||||
static void ConcatHelper(int iters, int concat_dimension, int dim2,
|
||||
static void ConcatHelper(::testing::benchmark::State& state,
|
||||
int concat_dimension, int dim2,
|
||||
int string_length = 0) {
|
||||
testing::StopTiming();
|
||||
Graph* g = new Graph(OpRegistry::Global());
|
||||
|
||||
DataType dt = DataTypeToEnum<T>::v();
|
||||
@ -81,49 +81,82 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2,
|
||||
.Attr("T", dt)
|
||||
.Finalize(g, &node));
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * (in0_bytes + in1_bytes));
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
|
||||
(in0_bytes + in1_bytes));
|
||||
}
|
||||
|
||||
static void BM_ConcatDim0Float(int iters, int dim2) {
|
||||
ConcatHelper<float>(iters, 0, dim2);
|
||||
void BM_ConcatDim0Float(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<float>(state, 0, dim2);
|
||||
}
|
||||
|
||||
static void BM_ConcatDim1Float(int iters, int dim2) {
|
||||
ConcatHelper<float>(iters, 1, dim2);
|
||||
void BM_ConcatDim1Float(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<float>(state, 1, dim2);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim0Float)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1Float)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
|
||||
static void BM_ConcatDim0String(int iters, int dim2, int string_length) {
|
||||
ConcatHelper<tstring>(iters, 0, dim2, string_length);
|
||||
void BM_ConcatDim0String(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
const int string_length = state.range(1);
|
||||
|
||||
ConcatHelper<tstring>(state, 0, dim2, string_length);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ConcatDim0String)
|
||||
->UseRealTime()
|
||||
->ArgPair(1, 16)
|
||||
->ArgPair(1, 10000)
|
||||
->ArgPair(100, 16);
|
||||
|
||||
static void BM_ConcatDim1uint8(int iters, int dim2) {
|
||||
ConcatHelper<uint8>(iters, 1, dim2);
|
||||
void BM_ConcatDim1uint8(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<uint8>(state, 1, dim2);
|
||||
}
|
||||
static void BM_ConcatDim1int16(int iters, int dim2) {
|
||||
ConcatHelper<int16>(iters, 1, dim2);
|
||||
void BM_ConcatDim1int16(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<int16>(state, 1, dim2);
|
||||
}
|
||||
static void BM_ConcatDim1bfloat16(int iters, int dim2) {
|
||||
ConcatHelper<bfloat16>(iters, 1, dim2);
|
||||
void BM_ConcatDim1bfloat16(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<bfloat16>(state, 1, dim2);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ConcatDim1uint8)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1uint8)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1int16)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1bfloat16)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
|
||||
template <typename T>
|
||||
static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
|
||||
testing::StopTiming();
|
||||
static void ConcatManyHelper(::testing::benchmark::State& state,
|
||||
int concat_dimension, int dim2) {
|
||||
Graph* g = new Graph(OpRegistry::Global());
|
||||
|
||||
DataType dt = DataTypeToEnum<T>::v();
|
||||
@ -146,30 +179,25 @@ static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
|
||||
.Attr("N", 64)
|
||||
.Attr("T", dt)
|
||||
.Finalize(g, &node));
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
|
||||
kNumInputs * sizeof(T));
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
|
||||
dim2 * kNumInputs * sizeof(T));
|
||||
}
|
||||
|
||||
static void BM_ConcatManyDim1bfloat16(int iters, int dim2) {
|
||||
ConcatManyHelper<bfloat16>(iters, 1, dim2);
|
||||
void BM_ConcatManyDim1bfloat16(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatManyHelper<bfloat16>(state, 1, dim2);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60);
|
||||
|
||||
static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
|
||||
testing::StopTiming();
|
||||
BENCHMARK(BM_ConcatManyDim1bfloat16)->UseRealTime()->Arg(18)->Arg(34)->Arg(60);
|
||||
|
||||
void MemcpyAlternativeHelper(::testing::benchmark::State& state, int dim2) {
|
||||
const int kDim1 = 100;
|
||||
std::vector<float> data1(kDim1 * dim2, 1.0f);
|
||||
std::vector<float> data2(kDim1 * dim2, 2.0f);
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) *
|
||||
((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
|
||||
testing::StartTiming();
|
||||
while (--iters > 0) {
|
||||
for (auto s : state) {
|
||||
const size_t n0 = data1.size();
|
||||
const size_t n1 = data2.size();
|
||||
float* result = new float[n0 + n1];
|
||||
@ -177,24 +205,37 @@ static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
|
||||
memcpy(&result[n0], &data2[0], n1 * sizeof(float));
|
||||
delete[] result;
|
||||
}
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
|
||||
((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
|
||||
}
|
||||
|
||||
static void BM_MemcpyAlternativeDim0(int iters, int dim2) {
|
||||
MemcpyAlternativeHelper(iters, 0, dim2);
|
||||
void BM_MemcpyAlternativeDim0(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
MemcpyAlternativeHelper(state, dim2);
|
||||
}
|
||||
static void BM_MemcpyAlternativeDim1(int iters, int dim2) {
|
||||
MemcpyAlternativeHelper(iters, 1, dim2);
|
||||
void BM_MemcpyAlternativeDim1(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
MemcpyAlternativeHelper(state, dim2);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_MemcpyAlternativeDim0)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
BENCHMARK(BM_MemcpyAlternativeDim1)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
|
||||
typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
|
||||
Eigen::Unaligned>
|
||||
EigenMap;
|
||||
static void MemcpyManyAlternative1(int iters, int dim2) {
|
||||
testing::StopTiming();
|
||||
|
||||
void MemcpyManyAlternative1(::testing::benchmark::State& state) {
|
||||
int dim2 = state.range(0);
|
||||
const int kDim1 = 40000;
|
||||
const int kNumCopies = 64;
|
||||
const int size = kDim1 * dim2 * kNumCopies;
|
||||
@ -202,10 +243,7 @@ static void MemcpyManyAlternative1(int iters, int dim2) {
|
||||
EigenMap map(data, size);
|
||||
map.setRandom();
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
|
||||
kNumCopies * sizeof(bfloat16));
|
||||
testing::StartTiming();
|
||||
while (iters-- > 0) {
|
||||
for (auto s : state) {
|
||||
std::vector<bfloat16*> inputs(kNumCopies);
|
||||
for (int i = 0; i < kNumCopies; ++i) {
|
||||
inputs[i] = &data[i * kDim1 * dim2];
|
||||
@ -225,11 +263,12 @@ static void MemcpyManyAlternative1(int iters, int dim2) {
|
||||
delete[] result;
|
||||
}
|
||||
delete[] data;
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
|
||||
dim2 * kNumCopies * sizeof(bfloat16));
|
||||
}
|
||||
|
||||
static void MemcpyManyAlternative2(int iters, int dim2) {
|
||||
testing::StopTiming();
|
||||
|
||||
void MemcpyManyAlternative2(::testing::benchmark::State& state) {
|
||||
int dim2 = state.range(0);
|
||||
const int kDim1 = 40000;
|
||||
const int kNumCopies = 64;
|
||||
const int size = kDim1 * dim2 * kNumCopies;
|
||||
@ -237,11 +276,8 @@ static void MemcpyManyAlternative2(int iters, int dim2) {
|
||||
EigenMap map(data, size);
|
||||
map.setRandom();
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
|
||||
kNumCopies * sizeof(bfloat16));
|
||||
testing::StartTiming();
|
||||
std::vector<bfloat16*> inputs(kNumCopies);
|
||||
while (--iters > 0) {
|
||||
for (auto s : state) {
|
||||
bfloat16* result = new bfloat16[size];
|
||||
for (int i = 0; i < kNumCopies; ++i) {
|
||||
inputs[i] = &data[i * kDim1 * dim2];
|
||||
@ -260,6 +296,9 @@ static void MemcpyManyAlternative2(int iters, int dim2) {
|
||||
delete[] result;
|
||||
}
|
||||
delete[] data;
|
||||
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
|
||||
dim2 * kNumCopies * sizeof(bfloat16));
|
||||
}
|
||||
|
||||
BENCHMARK(MemcpyManyAlternative1)
|
||||
|
@@ -114,15 +114,23 @@ static Graph* ManyConsts(int num, bool sequential) {
return g;
}
static void BM_ManyConsts_Parallel(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
test::Benchmark("cpu", ManyConsts(num, false /* !sequential */)).Run(iters);
static void BM_ManyConsts_Parallel(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", ManyConsts(num, false /* !sequential */),
                /*old_benchmark_api*/ false)
    .Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
}
BENCHMARK(BM_ManyConsts_Parallel)->Range(1, 1 << 10);
static void BM_ManyConsts_Sequential(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
test::Benchmark("cpu", ManyConsts(num, true /* sequential */)).Run(iters);
static void BM_ManyConsts_Sequential(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", ManyConsts(num, true /* sequential */),
                /*old_benchmark_api*/ false)
    .Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
}
BENCHMARK(BM_ManyConsts_Sequential)->Range(1, 1 << 10);
@ -309,104 +309,120 @@ static Graph* FusedConv2DWithBatchNorm(
|
||||
// The following benchmarks are always using 'float' data type with NHWC layout.
|
||||
// -------------------------------------------------------------------------- //
|
||||
|
||||
#define BM_SETUP(N, H, W, C, type, LABEL, NAME) \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * (C)); \
|
||||
testing::SetLabel(LABEL);
|
||||
#define BM_SET_INFO(N, H, W, C, type, LABEL, NAME) \
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * (N) * (H) * \
|
||||
(W) * (C)); \
|
||||
state.SetLabel(LABEL);
|
||||
|
||||
#define BM_NAME(name, type, N, H, W, C, FW, FH, FC) \
|
||||
name##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
|
||||
|
||||
#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, Conv2D<float>(N, H, W, C, FW, FH, FC).graph) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, Conv2D<float>(N, H, W, C, FW, FH, FC).graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_Conv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2DWithBias<float>(N, H, W, C, FW, FH, FC).graph) \
|
||||
.Run(iters); \
|
||||
Conv2DWithBias<float>(N, H, W, C, FW, FH, FC).graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, Conv2DWithBiasAndActivation<float>(N, H, W, C, FW, \
|
||||
FH, FC, "Relu") \
|
||||
.graph) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark( \
|
||||
#type, \
|
||||
Conv2DWithBiasAndActivation<float>(N, H, W, C, FW, FH, FC, "Relu") \
|
||||
.graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"BiasAdd"})) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark( \
|
||||
#type, \
|
||||
FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, {"BiasAdd"}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_FusedConv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"BiasAdd", "Relu"})) \
|
||||
.Run(iters); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"BiasAdd", "Relu"}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC).graph) \
|
||||
.Run(iters); \
|
||||
Conv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC).graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, Conv2DWithBatchNormAndActivation<float>( \
|
||||
N, H, W, C, FW, FH, FC, "Relu") \
|
||||
.graph) \
|
||||
.Run(iters); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2DWithBatchNormAndActivation<float>(N, H, W, C, FW, \
|
||||
FH, FC, "Relu") \
|
||||
.graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, FusedConv2DWithBatchNorm<float>( \
|
||||
N, H, W, C, FW, FH, FC, {"FusedBatchNorm"})) \
|
||||
.Run(iters); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
FusedConv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"FusedBatchNorm"}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_FusedConv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, \
|
||||
LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, \
|
||||
FW, FH, FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark( \
|
||||
#type, FusedConv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"FusedBatchNorm", "Relu"})) \
|
||||
.Run(iters); \
|
||||
FW, FH, FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
FusedConv2DWithBatchNorm<float>( \
|
||||
N, H, W, C, FW, FH, FC, {"FusedBatchNorm", "Relu"}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, FW, \
|
||||
FH, FC));
|
||||
@ -561,11 +577,12 @@ BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32");
|
||||
|
||||
#define BM_Conv2DFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type) \
|
||||
static void BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, "", Conv2D); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2D<T>(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph) \
|
||||
.Run(iters); \
|
||||
Conv2D<T>(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, "", Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, FC));
|
||||
|
||||
|
@ -42,15 +42,19 @@ int RowsAndColsArg(int r, int c) { return r * kRows + c; }
|
||||
int RowsFromArg(int arg) { return (arg / kRows); }
|
||||
int ColsFromArg(int arg) { return (arg % kRows); }
|
||||
|
||||
#define BM_UNARY(DEVICE, FUNC, T, TYPE) \
|
||||
void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) { \
|
||||
const int64 tot = static_cast<int64>(iters) * num; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(T)); \
|
||||
test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
|
||||
#define BM_UNARY(DEVICE, FUNC, T, TYPE) \
|
||||
void BM_##DEVICE##_##FUNC##_##TYPE(::testing::benchmark::State& state) { \
|
||||
const int num = state.range(0); \
|
||||
test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * num; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(T)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE) \
|
||||
->UseRealTime() \
|
||||
->Range(4 << 10, 1 << 20);
|
||||
|
||||
BM_UNARY(cpu, Floor, float, DT_FLOAT);
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
@ -101,27 +105,30 @@ Graph* BinaryScalar(int num, const string& func) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_BINARY_SCALAR(DEVICE, FUNC) \
|
||||
void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) { \
|
||||
const int64 tot = static_cast<int64>(iters) * num; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(float)); \
|
||||
test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \
|
||||
->Arg(1 << 12) /* must >= 4096 */ \
|
||||
->Arg(1 << 13) \
|
||||
->Arg(1 << 14) \
|
||||
->Arg((1 << 15) - (1 << 13)) \
|
||||
->Arg(1 << 15) \
|
||||
->Arg((1 << 15) + (1 << 14)) \
|
||||
->Arg(1 << 16) \
|
||||
->Arg((1 << 17) - (1 << 15)) \
|
||||
->Arg(1 << 17) \
|
||||
->Arg((1 << 17) + (1 << 16)) \
|
||||
->Arg(1 << 18) \
|
||||
->Arg(1 << 19) \
|
||||
#define BM_BINARY_SCALAR(DEVICE, FUNC) \
|
||||
void BM_##DEVICE##_##FUNC##_scalar(::testing::benchmark::State& state) { \
|
||||
const int num = state.range(0); \
|
||||
\
|
||||
test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * num; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \
|
||||
->Arg(1 << 12) /* must >= 4096 */ \
|
||||
->Arg(1 << 13) \
|
||||
->Arg(1 << 14) \
|
||||
->Arg((1 << 15) - (1 << 13)) \
|
||||
->Arg(1 << 15) \
|
||||
->Arg((1 << 15) + (1 << 14)) \
|
||||
->Arg(1 << 16) \
|
||||
->Arg((1 << 17) - (1 << 15)) \
|
||||
->Arg(1 << 17) \
|
||||
->Arg((1 << 17) + (1 << 16)) \
|
||||
->Arg(1 << 18) \
|
||||
->Arg(1 << 19) \
|
||||
->Arg(1 << 20);
|
||||
|
||||
BM_BINARY_SCALAR(cpu, Less);
|
||||
@ -173,17 +180,19 @@ Graph* CubeWithMulSquare(int num) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_CUBE(DEVICE, Impl) \
|
||||
void BM_##DEVICE##_Cube_##Impl(int iters, int num) { \
|
||||
const int64 tot = static_cast<int64>(iters) * num; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(float)); \
|
||||
test::Benchmark(#DEVICE, Impl(num)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_Cube_##Impl) \
|
||||
->Arg(1 << 12) /* must >= 4096 */ \
|
||||
->Arg(1 << 16) \
|
||||
#define BM_CUBE(DEVICE, Impl) \
|
||||
void BM_##DEVICE##_Cube_##Impl(::testing::benchmark::State& state) { \
|
||||
const int num = state.range(0); \
|
||||
\
|
||||
test::Benchmark(#DEVICE, Impl(num)).Run(state.iterations()); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * num; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_Cube_##Impl) \
|
||||
->UseRealTime() \
|
||||
->Arg(1 << 12) /* must >= 4096 */ \
|
||||
->Arg(1 << 16) \
|
||||
->Arg(1 << 20);
|
||||
|
||||
BM_CUBE(cpu, CubeWithPow3);
|
||||
@ -211,17 +220,21 @@ Graph* BiasAdd(int rows, int cols, DataType type) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C) \
|
||||
void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C(int iters, int arg) { \
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(iters) * rows * cols; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(C_TYPE)); \
|
||||
test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C) \
|
||||
#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C) \
|
||||
void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C( \
|
||||
::testing::benchmark::State& state) { \
|
||||
const int arg = state.range(0); \
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
|
||||
test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(C_TYPE)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C) \
|
||||
->UseRealTime() \
|
||||
->Arg(RowsAndColsArg(R, C));
|
||||
|
||||
#define BM_BIAS_ADD_ALL(DEVICE, C_TYPE, TF_TYPE) \
|
||||
@ -264,16 +277,21 @@ Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
|
||||
|
||||
#define BM_BIAS_ADD_GRAD(DEVICE, FMT, C_TYPE, TF_TYPE, R, C, CH) \
|
||||
void BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH( \
|
||||
int iters, int arg, int channels) { \
|
||||
::testing::benchmark::State& state) { \
|
||||
const int arg = state.range(0); \
|
||||
const int channels = state.range(1); \
|
||||
\
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(iters) * rows * cols * channels; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(C_TYPE)); \
|
||||
test::Benchmark(#DEVICE, BiasAddGrad<C_TYPE>(rows, cols, channels, \
|
||||
TF_TYPE, FORMAT_##FMT)) \
|
||||
.Run(iters); \
|
||||
test::Benchmark( \
|
||||
#DEVICE, \
|
||||
BiasAddGrad<C_TYPE>(rows, cols, channels, TF_TYPE, FORMAT_##FMT), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
const int64 tot = \
|
||||
static_cast<int64>(state.iterations()) * rows * cols * channels; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(C_TYPE)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH) \
|
||||
->ArgPair(RowsAndColsArg(R, C), CH);
|
||||
@ -326,16 +344,20 @@ Graph* BcastAdd(int rows, int cols, int dim) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_BCAST_ADD_ROW(DEVICE, R, C) \
|
||||
void BM_##DEVICE##_BcastAddRow_R##R##_C##C(int iters, int arg) { \
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(iters) * rows * cols; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(float)); \
|
||||
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters); \
|
||||
} \
|
||||
#define BM_BCAST_ADD_ROW(DEVICE, R, C) \
|
||||
void BM_##DEVICE##_BcastAddRow_R##R##_C##C( \
|
||||
::testing::benchmark::State& state) { \
|
||||
const int arg = state.range(0); \
|
||||
\
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
|
||||
|
||||
#define BM_BCAST_ADD_ROW_ALL(DEVICE) \
|
||||
@ -350,17 +372,24 @@ BM_BCAST_ADD_ROW_ALL(gpu);
|
||||
#undef BM_BCAST_ADD_ROW_ALL
|
||||
#undef BM_BCAST_ADD_ROW
|
||||
|
||||
#define BM_BCAST_ADD_COL(DEVICE, R, C) \
|
||||
void BM_##DEVICE##_BcastAddCol_R##R##_C##C(int iters, int arg) { \
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(iters) * rows * cols; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(float)); \
|
||||
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
|
||||
#define BM_BCAST_ADD_COL(DEVICE, R, C) \
|
||||
void BM_##DEVICE##_BcastAddCol_R##R##_C##C( \
|
||||
::testing::benchmark::State& state) { \
|
||||
const int arg = state.range(0); \
|
||||
\
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
|
||||
\
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C) \
|
||||
->UseRealTime() \
|
||||
->Arg(RowsAndColsArg(R, C));
|
||||
|
||||
#define BM_BCAST_ADD_COL_ALL(DEVICE) \
|
||||
BM_BCAST_ADD_COL(DEVICE, 512, 2048); \
|
||||
@ -374,17 +403,23 @@ BM_BCAST_ADD_COL_ALL(gpu);
|
||||
#undef BM_BCAST_ADD_COL_ALL
|
||||
#undef BM_BCAST_ADD_COL
|
||||
|
||||
#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C(int iters, int arg) { \
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C) \
#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C( \
::testing::benchmark::State& state) { \
const int arg = state.range(0); \
\
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2), \
/*old_benchmark_api=*/false) \
.Run(state); \
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
\
state.SetItemsProcessed(tot); \
state.SetBytesProcessed(tot * sizeof(float)); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C) \
->UseRealTime() \
->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_CROSS_RC_ALL(DEVICE) \
@ -399,17 +434,22 @@ BM_BCAST_ADD_CROSS_RC_ALL(gpu);
#undef BM_BCAST_ADD_CROSS_RC_ALL
#undef BM_BCAST_ADD_CROSS_RC

#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C(int iters, int arg) { \
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C) \
#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C( \
::testing::benchmark::State& state) { \
const int arg = state.range(0); \
\
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3), \
/*old_benchmark_api*/ false) \
.Run(state); \
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
state.SetItemsProcessed(tot); \
state.SetBytesProcessed(tot * sizeof(float)); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C) \
->UseRealTime() \
->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_CROSS_CR_ALL(DEVICE) \

@ -273,10 +273,10 @@ TEST_F(ExecutorTest, ControlDependenciesFromSpecialNodes) {
EXPECT_EQ(3.0, V(retvals[0])); // out = 1.0 + 2.0 = 3.0
}

static void BM_executor(int iters, int width, int depth) {
#ifdef PLATFORM_GOOGLE
BenchmarkUseRealTime();
#endif // PLATFORM_GOOGLE
void BM_executor(::testing::benchmark::State& state) {
const int width = state.range(0);
const int depth = state.range(1);

Graph* g = new Graph(OpRegistry::Global());
random::PhiloxRandom philox(1729, 17);
random::SimplePhilox rand(&philox);
@ -306,30 +306,28 @@ static void BM_executor(int iters, int width, int depth) {
}
}
FixupSourceAndSinkEdges(g);
#ifdef PLATFORM_GOOGLE
SetBenchmarkLabel(strings::StrCat("Nodes = ", cur));
SetBenchmarkItemsProcessed(cur * static_cast<int64>(iters));
#endif // PLATFORM_GOOGLE
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
"SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false)
.Run(state);
state.SetLabel(strings::StrCat("Nodes = ", cur));
state.SetItemsProcessed(cur * static_cast<int64>(state.iterations()));
}

// Tall skinny graphs
BENCHMARK(BM_executor)->ArgPair(16, 1024);
BENCHMARK(BM_executor)->ArgPair(32, 8192);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(16, 1024);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(32, 8192);

// Short fat graphs
BENCHMARK(BM_executor)->ArgPair(1024, 16);
BENCHMARK(BM_executor)->ArgPair(8192, 32);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 16);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(8192, 32);

// Tall fat graph
BENCHMARK(BM_executor)->ArgPair(1024, 1024);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 1024);

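The pattern above recurs in every file touched by this change: the old API passed iters (plus extra integer arguments) into the benchmark function and called BenchmarkUseRealTime()/SetBenchmarkItemsProcessed() inside the body, whereas the new API reads arguments from state.range(), requests wall-clock timing at registration time via ->UseRealTime(), and records labels and counters on the State after Run() returns. Below is a minimal, self-contained sketch of that pattern; BuildExampleGraph is a hypothetical helper and not part of this change.

// Sketch of the migrated benchmark pattern (hypothetical graph builder).
void BM_ExampleGraph(::testing::benchmark::State& state) {
  const int width = state.range(0);   // first value from ArgPair()
  const int depth = state.range(1);   // second value from ArgPair()
  Graph* g = BuildExampleGraph(width, depth);  // hypothetical helper
  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
  state.SetLabel(strings::StrCat("Nodes = ", width * depth));
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * width *
                          depth);
}
BENCHMARK(BM_ExampleGraph)->UseRealTime()->ArgPair(16, 1024);
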
void BM_const_identity(::testing::benchmark::State& state) {
const int width = state.range(0);
const int outputs_per_const = state.range(1);

static void BM_const_identity(int iters, int width, int outputs_per_const) {
#ifdef PLATFORM_GOOGLE
BenchmarkUseRealTime();
#endif // PLATFORM_GOOGLE
Graph* g = new Graph(OpRegistry::Global());
for (int i = 0; i < width; ++i) {
Tensor i_t(i);
@ -339,22 +337,20 @@ static void BM_const_identity(int iters, int width, int outputs_per_const) {
}
}
FixupSourceAndSinkEdges(g);
#ifdef PLATFORM_GOOGLE
SetBenchmarkLabel(
strings::StrCat("Nodes = ", (1 + outputs_per_const) * width));
SetBenchmarkItemsProcessed((1 + outputs_per_const) * width *
static_cast<int64>(iters));
#endif // PLATFORM_GOOGLE
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
"SINGLE_THREADED_EXECUTOR",
/*old_benchmark_api=*/false)
.Run(state);
state.SetLabel(strings::StrCat("Nodes = ", (1 + outputs_per_const) * width));
state.SetItemsProcessed((1 + outputs_per_const) * width *
static_cast<int64>(state.iterations()));
}

// Graph with actual op execution.
BENCHMARK(BM_const_identity)->ArgPair(1, 1);
BENCHMARK(BM_const_identity)->ArgPair(1, 100);
BENCHMARK(BM_const_identity)->ArgPair(100, 1);
BENCHMARK(BM_const_identity)->ArgPair(100, 100);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(1, 1);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(1, 100);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(100, 1);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(100, 100);

// TODO(mrry): This benchmark currently crashes with a use-after free, because
// test::Benchmark::RunWithArgs() assumes that the executor will take ownership
@ -368,7 +364,7 @@ BENCHMARK(BM_const_identity)->ArgPair(100, 100);
#define ALICE "/job:j/replica:0/task:0/cpu:0"
#define BOB "/job:j/replica:0/task:0/gpu:0"

static void BM_FeedInputFetchOutput(int iters) {
static void BM_FeedInputFetchOutput(::testing::benchmark::State& state) {
Graph* g = new Graph(OpRegistry::Global());
// z = x + y: x and y are provided as benchmark inputs. z is the
// output of the benchmark. Conceptually, the caller is ALICE, the
@ -380,10 +376,10 @@ static void BM_FeedInputFetchOutput(int iters) {
FixupSourceAndSinkEdges(g);
Tensor val(DT_FLOAT, TensorShape({}));
val.scalar<float>()() = 3.14;
SetBenchmarkItemsProcessed(static_cast<int64>(iters));
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.RunWithArgs({{x, val}, {y, val}}, {z}, iters);
"SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false)
.RunWithArgs({{x, val}, {y, val}}, {z}, state);
state.SetItemsProcessed(state.iterations());
}
BENCHMARK(BM_FeedInputFetchOutput);
#endif

@ -247,7 +247,7 @@ TEST_F(DequantizeOpTest, DequantizeScaledQint8Axis3) {
}

template <typename T>
static void BM_DequantizeMinCombinedCpu(int iters) {
static void BM_DequantizeMinCombinedCpu(::testing::benchmark::State& state) {
auto root = Scope::NewRootScope().ExitOnError();
const int64 num_values = 1500 * 250;
std::vector<T> inputs;
@ -262,25 +262,26 @@ static void BM_DequantizeMinCombinedCpu(int iters) {
Graph* g = new Graph(OpRegistry::Global());
TF_CHECK_OK(root.ToGraph(g));

test::Benchmark("cpu", g).Run(iters);
testing::BytesProcessed(iters * num_values * (sizeof(float) + sizeof(T)));
testing::ItemsProcessed(iters);
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
state.SetBytesProcessed(state.iterations() * num_values *
(sizeof(float) + sizeof(T)));
state.SetItemsProcessed(state.iterations());
}

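As a rough sanity check on the counters just set: num_values is 1500 * 250 = 375,000 elements per run, so for T = quint8 (assuming sizeof(quint8) == 1 and sizeof(float) == 4) each iteration is credited with 375,000 * (4 + 1) = 1,875,000 bytes, i.e. roughly 1.9 MB of dequantize traffic per iteration.
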
static void BM_DequantizeMinCombinedCpuQuint16(int iters) {
BM_DequantizeMinCombinedCpu<quint16>(iters);
void BM_DequantizeMinCombinedCpuQuint16(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<quint16>(state);
}

static void BM_DequantizeMinCombinedCpuQint16(int iters) {
BM_DequantizeMinCombinedCpu<qint16>(iters);
void BM_DequantizeMinCombinedCpuQint16(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<qint16>(state);
}

static void BM_DequantizeMinCombinedCpuQuint8(int iters) {
BM_DequantizeMinCombinedCpu<quint8>(iters);
void BM_DequantizeMinCombinedCpuQuint8(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<quint8>(state);
}

static void BM_DequantizeMinCombinedCpuQint8(int iters) {
BM_DequantizeMinCombinedCpu<qint8>(iters);
void BM_DequantizeMinCombinedCpuQint8(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<qint8>(state);
}

BENCHMARK(BM_DequantizeMinCombinedCpuQuint16);
@ -289,7 +290,8 @@ BENCHMARK(BM_DequantizeMinCombinedCpuQuint8);
BENCHMARK(BM_DequantizeMinCombinedCpuQint8);

template <typename T>
static void BM_DequantizeBfloat16MinCombinedCpu(int iters) {
static void BM_DequantizeBfloat16MinCombinedCpu(
::testing::benchmark::State& state) {
auto root = Scope::NewRootScope().ExitOnError();
const int64 num_values = 1500 * 250;
std::vector<T> inputs;
@ -304,25 +306,30 @@ static void BM_DequantizeBfloat16MinCombinedCpu(int iters) {
Graph* g = new Graph(OpRegistry::Global());
TF_CHECK_OK(root.ToGraph(g));

test::Benchmark("cpu", g).Run(iters);
testing::BytesProcessed(iters * num_values * (sizeof(bfloat16) + sizeof(T)));
testing::ItemsProcessed(iters);
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
state.SetBytesProcessed(state.iterations() * num_values *
(sizeof(bfloat16) + sizeof(T)));
state.SetItemsProcessed(state.iterations());
}

static void BM_DequantizeBfloat16MinCombinedCpuQuint16(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<quint16>(iters);
void BM_DequantizeBfloat16MinCombinedCpuQuint16(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<quint16>(state);
}

static void BM_DequantizeBfloat16MinCombinedCpuQint16(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<qint16>(iters);
void BM_DequantizeBfloat16MinCombinedCpuQint16(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<qint16>(state);
}

static void BM_DequantizeBfloat16MinCombinedCpuQuint8(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<quint8>(iters);
void BM_DequantizeBfloat16MinCombinedCpuQuint8(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<quint8>(state);
}

static void BM_DequantizeBfloat16MinCombinedCpuQint8(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<qint8>(iters);
void BM_DequantizeBfloat16MinCombinedCpuQint8(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<qint8>(state);
}

BENCHMARK(BM_DequantizeBfloat16MinCombinedCpuQuint16);

@ -30,12 +30,13 @@ static Graph* Diag(int n, DataType type) {
return g;
}

#define BM_DiagDev(N, T, TFTYPE, DEVICE) \
static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * N); \
test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE)).Run(iters); \
} \
#define BM_DiagDev(N, T, TFTYPE, DEVICE) \
static void BM_Diag##_##N##_##TFTYPE##_##DEVICE( \
::testing::benchmark::State& state) { \
test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE), /*old_benchmark_api=*/false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * N); \
} \
BENCHMARK(BM_Diag##_##N##_##TFTYPE##_##DEVICE);
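For reference, a hand-expanded sketch of the new BM_DiagDev macro with illustrative parameters (N = 128, T = float, TFTYPE = DT_FLOAT, DEVICE = cpu; the actual instantiations come from the BM_Diag wrapper below):

// Sketch only: roughly what BM_DiagDev(128, float, DT_FLOAT, cpu) expands to.
static void BM_Diag_128_DT_FLOAT_cpu(::testing::benchmark::State& state) {
  test::Benchmark("cpu", Diag<float>(128, DT_FLOAT),
                  /*old_benchmark_api=*/false)
      .Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * 128 * 128);
}
BENCHMARK(BM_Diag_128_DT_FLOAT_cpu);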

#define BM_Diag(N) \