Internal tests cleanup.

PiperOrigin-RevId: 339741501 Change-Id: Iaa532c63d5c653de8e6a76e78822014fbef51b28
2020-10-29 14:01:21 -07:00 · 2020-10-29 14:01:21 -07:00 · df70d68014
commit df70d68014
parent fb49d63afa
16 changed files with 518 additions and 369 deletions
--- a/tensorflow/core/kernels/random_op_test.cc
+++ b/tensorflow/core/kernels/random_op_test.cc
@ -58,11 +58,14 @@ Graph* TruncatedNormal(int64 n) {
  return g;
 }

-#define BM_RNG(DEVICE, RNG)                                   \
-  void BM_##DEVICE##_##RNG(int iters, int arg) {              \
-    testing::ItemsProcessed(static_cast<int64>(iters) * arg); \
-    test::Benchmark(#DEVICE, RNG(arg)).Run(iters);            \
-  }                                                           \
+#define BM_RNG(DEVICE, RNG)                                                \
+  void BM_##DEVICE##_##RNG(::testing::benchmark::State& state) {           \
+    const int arg = state.range(0);                                        \
+                                                                           \
+    test::Benchmark(#DEVICE, RNG(arg), /*old_benchmark_api*/ false)        \
+        .Run(state);                                                       \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * arg); \
+  }                                                                        \
  BENCHMARK(BM_##DEVICE##_##RNG)->Range(1 << 20, 8 << 20);

 BM_RNG(cpu, RandomUniform);
@ -84,60 +87,48 @@ Tensor VecAlphas(int64 n) {
  return alphas;
 }

-void BM_cpu_RandomGamma(int iters, int nsamp, int nalpha) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * nsamp * nalpha);
+void BM_cpu_RandomGamma(::testing::benchmark::State& state) {
+  const int nsamp = state.range(0);
+  const int nalpha = state.range(1);
+
  Graph* g = new Graph(OpRegistry::Global());
  test::graph::RandomGamma(g, test::graph::Constant(g, VecShape(nsamp)),
                           test::graph::Constant(g, VecAlphas(nalpha)));
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * nsamp *
+                          nalpha);
 }
 BENCHMARK(BM_cpu_RandomGamma)->RangePair(1 << 14, 4 << 15, 2, 50);

-void BM_PhiloxRandom(int iters) {
+void BM_PhiloxRandom(::testing::benchmark::State& state) {
  // Fill 2M random numbers
  int count = 2 << 20;
-
-  testing::ItemsProcessed(static_cast<int64>(iters) * count);
-
  random::PhiloxRandom gen(0x12345);

-  int val = 1;
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
    for (int j = 0; j < count; j += 4) {
      /// each invocation of gen() returns 128-bit samples
      auto samples = gen();
-
-      // use the result trivially so the compiler does not optimize it away
-      val ^= samples[0] ^ samples[1] ^ samples[2] ^ samples[3];
+      tensorflow::testing::DoNotOptimize(samples);
    }
  }
-
-  // A anchor point to make sure the compiler does not cut corners
-  CHECK(val) << val;
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * count);
 }
 BENCHMARK(BM_PhiloxRandom);

-void BM_StdMTRandom(int iters) {
+void BM_StdMTRandom(::testing::benchmark::State& state) {
  // Fill 2M random numbers
  int count = 2 << 20;
-
-  testing::ItemsProcessed(static_cast<int64>(iters) * count);
-
  std::mt19937 gen(0x12345);

-  uint_fast32_t val = 1;
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
    for (int j = 0; j < count; ++j) {
      /// each invocation of gen() returns 32-bit sample
      uint_fast32_t sample = gen();
-
-      // use the result trivially so the compiler does not optimize it away
-      val ^= sample;
+      tensorflow::testing::DoNotOptimize(sample);
    }
  }
-
-  // A anchor point to make sure the compiler does not cut corners
-  CHECK(val) << val;
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * count);
 }
 BENCHMARK(BM_StdMTRandom);

--- a/tensorflow/core/kernels/reduction_ops_test.cc
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@ -84,108 +84,167 @@ static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) {
 // Creates a bench which reduces a 3D tensor with total "num" floats
 // into a scalar on a "device". Runs the bench for "iters" times.
 template <typename T>
-static void ReduceToScalar(int iters, const string& device,
-                           const string& reduce, int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(T));
-  test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y)).Run(iters);
+static void ReduceToScalar(::testing::benchmark::State& state,
+                           const string& device, const string& reduce,
+                           int num_x, int num_y) {
+  test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(T));
 }

-static void DoRowReduce(int iters, const string& device, const string& reduce,
-                        int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, RowReduce(reduce, num_x, num_y)).Run(iters);
+static void DoRowReduce(::testing::benchmark::State& state,
+                        const string& device, const string& reduce, int num_x,
+                        int num_y) {
+  test::Benchmark(device, RowReduce(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void DoColReduce(int iters, const string& device, const string& reduce,
-                        int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ColReduce(reduce, num_x, num_y)).Run(iters);
+static void DoColReduce(::testing::benchmark::State& state,
+                        const string& device, const string& reduce, int num_x,
+                        int num_y) {
+  test::Benchmark(device, ColReduce(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void Do3DYReduce(int iters, const string& device, const string& reduce,
-                        int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y)).Run(iters);
+static void Do3DYReduce(::testing::benchmark::State& state,
+                        const string& device, const string& reduce, int num_x,
+                        int num_y) {
+  test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void Do3DXZReduce(int iters, const string& device, const string& reduce,
-                         int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y)).Run(iters);
+static void Do3DXZReduce(::testing::benchmark::State& state,
+                         const string& device, const string& reduce, int num_x,
+                         int num_y) {
+  test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void BM_Sum2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DToScalarGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DToScalarGPUComplex(int iters, int num_x, int num_y) {
-  ReduceToScalar<std::complex<float>>(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DToScalarGPUComplex(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<std::complex<float>>(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DToScalarGPUComplex)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DToScalarGPUHalf(int iters, int num_x, int num_y) {
-  ReduceToScalar<Eigen::half>(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DToScalarGPUHalf(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<Eigen::half>(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DToScalarGPUHalf)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DRowReduceGPU(int iters, int num_x, int num_y) {
-  DoRowReduce(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DRowReduceGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoRowReduce(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DRowReduceGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DColumnReduceGPU(int iters, int num_x, int num_y) {
-  DoColReduce(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DColumnReduceGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoColReduce(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DColumnReduceGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum3DYReduceGPU(int iters, int num_x, int num_y) {
-  Do3DYReduce(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum3DYReduceGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  Do3DYReduce(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum3DYReduceGPU)->RangePair(64, 4096, 64, 4096);

-static void BM_Sum3DXZReduceGPU(int iters, int num_x, int num_y) {
-  Do3DXZReduce(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum3DXZReduceGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  Do3DXZReduce(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum3DXZReduceGPU)->RangePair(64, 4096, 64, 4096);

-static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "Mean", num_x, num_y);
+static void BM_Mean2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "Mean", num_x, num_y);
 }
 BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

-static void BM_EuclideanNorm2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "EuclideanNorm", num_x, num_y);
+static void BM_EuclideanNorm2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "EuclideanNorm", num_x, num_y);
 }
 BENCHMARK(BM_EuclideanNorm2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

-static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "Max", num_x, num_y);
+static void BM_Max2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "Max", num_x, num_y);
 }
 BENCHMARK(BM_Max2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

-static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "Min", num_x, num_y);
+static void BM_Min2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "Min", num_x, num_y);
 }
 BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

-static void BM_Min2DToScalarGPUHalf(int iters, int num_x, int num_y) {
-  ReduceToScalar<Eigen::half>(iters, "gpu", "Min", num_x, num_y);
+static void BM_Min2DToScalarGPUHalf(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<Eigen::half>(state, "gpu", "Min", num_x, num_y);
 }
 BENCHMARK(BM_Min2DToScalarGPUHalf)->RangePair(2048, 8192, 2048, 8192);

-static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<bool>(iters, "gpu", "All", num_x, num_y);
+static void BM_Bool2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<bool>(state, "gpu", "All", num_x, num_y);
 }
 BENCHMARK(BM_Bool2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

--- a/tensorflow/core/kernels/regex_replace_op_test.cc
+++ b/tensorflow/core/kernels/regex_replace_op_test.cc
@ -84,17 +84,17 @@ Graph* SetupRegexReplaceGraph(const Tensor& input, const string& input_pattern,
  return g;
 }

-void BM_RegexReplace(int iters, int batch_size) {
-  testing::StopTiming();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  testing::UseRealTime();
+static void BM_RegexReplace(::testing::benchmark::State& state) {
+  const int batch_size = state.range(0);
+
  Tensor input = GetTestTensor(batch_size);
  Graph* g = SetupRegexReplaceGraph(input, kRegExPattern, kRewrite);
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }

 BENCHMARK(BM_RegexReplace)
+    ->UseRealTime()
    ->Arg(1)
    ->Arg(8)
    ->Arg(16)
@ -115,17 +115,17 @@ Graph* SetupStaticGraph(const Tensor& input, const string& input_pattern,
                  .Finalize(g, nullptr /* node */));
  return g;
 }
-void BM_StaticRegexReplace(int iters, int batch_size) {
-  testing::StopTiming();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  testing::UseRealTime();
+static void BM_StaticRegexReplace(::testing::benchmark::State& state) {
+  const int batch_size = state.range(0);
+
  Tensor input = GetTestTensor(batch_size);
  Graph* g = SetupStaticGraph(input, kRegExPattern, kRewrite);
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }

 BENCHMARK(BM_StaticRegexReplace)
+    ->UseRealTime()
    ->Arg(1)
    ->Arg(8)
    ->Arg(16)
--- a/tensorflow/core/kernels/requantization_range_op_test.cc
+++ b/tensorflow/core/kernels/requantization_range_op_test.cc
@ -67,56 +67,29 @@ TEST_F(RequantizationRangeTest, HandCrafted) {
  test::ExpectTensorEqual<float>(expected_max, *GetOutput(1));
 }

-static void BM_RequantizationRange(int iters, int size) {
-  testing::StopTiming();
-  testing::UseRealTime();
-  testing::ItemsProcessed(static_cast<int64>(iters) * size);
-  testing::ItemsProcessed(static_cast<int64>(iters) * size * 4);
+static void BM_RequantizationRange(::testing::benchmark::State& state) {
+  const int size = state.range(0);

  Tensor quantized_tensor(DT_QINT32, TensorShape({1, size}));
  test::FillFn<qint32>(&quantized_tensor, [](int n) { return qint32(n); });

  qint32 actual_min;
  qint32 actual_max;
-  testing::StartTiming();
-  for (int iter = 0; iter < iters; ++iter) {
+  for (auto s : state) {
    CalculateUsedRange(quantized_tensor, &actual_min, &actual_max);
  }
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * size);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * size * 4);
 }

-static void BM_RequantizationRange100(int iters) {
-  BM_RequantizationRange(100, iters);
-}
-BENCHMARK(BM_RequantizationRange100);
-
-static void BM_RequantizationRange1000(int iters) {
-  BM_RequantizationRange(1000, iters);
-}
-BENCHMARK(BM_RequantizationRange1000);
-
-static void BM_RequantizationRange10000(int iters) {
-  BM_RequantizationRange(10000, iters);
-}
-BENCHMARK(BM_RequantizationRange10000);
-
-static void BM_RequantizationRange100000(int iters) {
-  BM_RequantizationRange(100000, iters);
-}
-BENCHMARK(BM_RequantizationRange100000);
-
-static void BM_RequantizationRange1000000(int iters) {
-  BM_RequantizationRange(1000000, iters);
-}
-BENCHMARK(BM_RequantizationRange1000000);
-
-static void BM_RequantizationRange10000000(int iters) {
-  BM_RequantizationRange(10000000, iters);
-}
-BENCHMARK(BM_RequantizationRange10000000);
-
-static void BM_RequantizationRange100000000(int iters) {
-  BM_RequantizationRange(100000000, iters);
-}
-BENCHMARK(BM_RequantizationRange100000000);
+BENCHMARK(BM_RequantizationRange)
+    ->UseRealTime()
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000)
+    ->Arg(100000)
+    ->Arg(1000000)
+    ->Arg(10000000)
+    ->Arg(100000000);

 }  // end namespace tensorflow
--- a/tensorflow/core/kernels/reverse_op_test.cc
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@ -197,148 +197,187 @@ static Graph* Reverse(const TensorShape& shape, int reverse_axis) {
 }

 template <typename T>
-static void RunReverseRowsBenchmark(int iters, int outer_dim, int middle_dim,
+static void RunReverseRowsBenchmark(::testing::benchmark::State& state,
+                                    int outer_dim, int middle_dim,
                                    int intra_threads, int channels) {
  SessionOptions opts = GetOptions(intra_threads);
  TensorShape shape{outer_dim, middle_dim, channels};
-  const int64 num_items = static_cast<int64>(iters) * shape.num_elements();
-  testing::ItemsProcessed(num_items);
-  testing::BytesProcessed(num_items * sizeof(T));
-  testing::UseRealTime();
-  test::Benchmark("cpu", Reverse<T>(shape, 1), &opts).Run(iters);
+  test::Benchmark("cpu", Reverse<T>(shape, 1), &opts, nullptr, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  const int64 num_items =
+      static_cast<int64>(state.iterations()) * shape.num_elements();
+  state.SetItemsProcessed(num_items);
+  state.SetBytesProcessed(num_items * sizeof(T));
 }

-static void BM_ReverseRowsOf1Channel_1T_float(int iters, int outer_dim,
-                                              int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf1Channel_1T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                 1 /* intra_threads */, 1 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf1Channel_1T_float)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf1Channel_1T_uint8(int iters, int outer_dim,
-                                              int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf1Channel_1T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                 1 /* intra_threads */, 1 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf1Channel_1T_uint8)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf1Channel_4T_float(int iters, int outer_dim,
-                                              int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf1Channel_4T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                 4 /* intra_threads */, 1 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf1Channel_4T_float)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf1Channel_4T_uint8(int iters, int outer_dim,
-                                              int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf1Channel_4T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                 4 /* intra_threads */, 1 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf1Channel_4T_uint8)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf3Channels_1T_float(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf3Channels_1T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                 1 /* intra_threads */, 3 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf3Channels_1T_float)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(30, 30)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf3Channels_1T_uint8(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf3Channels_1T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                 1 /* intra_threads */, 3 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf3Channels_1T_uint8)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(30, 30)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf3Channels_4T_float(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf3Channels_4T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                 4 /* intra_threads */, 3 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf3Channels_4T_float)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(30, 30)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf3Channels_4T_uint8(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf3Channels_4T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                 4 /* intra_threads */, 3 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf3Channels_4T_uint8)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(30, 30)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf4Channels_1T_float(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf4Channels_1T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                 1 /* intra_threads */, 4 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf4Channels_1T_float)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf4Channels_1T_uint8(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf4Channels_1T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                 1 /* intra_threads */, 4 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf4Channels_1T_uint8)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf4Channels_4T_float(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf4Channels_4T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                 4 /* intra_threads */, 4 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf4Channels_4T_float)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf4Channels_4T_uint8(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf4Channels_4T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                 4 /* intra_threads */, 4 /* channels */);
 }

 BENCHMARK(BM_ReverseRowsOf4Channels_4T_uint8)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);
--- a/tensorflow/core/kernels/roll_op_test.cc
+++ b/tensorflow/core/kernels/roll_op_test.cc
@ -450,34 +450,44 @@ static Graph* RollGraph(const TensorShape& shape, int isd) {
  return g;
 }

-#define BM_ROLL_OUTER(DEVICE)                                                 \
-  static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) {    \
-    TensorShape shape{rows, columns};                                         \
-    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
-    testing::ItemsProcessed(num_items);                                       \
-    testing::BytesProcessed(num_items * sizeof(float));                       \
-    testing::UseRealTime();                                                   \
-    test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters);                 \
-  }                                                                           \
-  BENCHMARK(BM_##DEVICE##_roll_outer)                                         \
-      ->ArgPair(256, 256)                                                     \
-      ->ArgPair(512, 512)                                                     \
-      ->ArgPair(1024, 1024)                                                   \
+#define BM_ROLL_OUTER(DEVICE)                                                  \
+  static void BM_##DEVICE##_roll_outer(::testing::benchmark::State& state) {   \
+    const int rows = state.range(0);                                           \
+    const int columns = state.range(1);                                        \
+                                                                               \
+    TensorShape shape{rows, columns};                                          \
+    test::Benchmark(#DEVICE, RollGraph(shape, 0), /*old_benchmark_api*/ false) \
+        .Run(state);                                                           \
+    const int64 num_items =                                                    \
+        static_cast<int64>(state.iterations()) * shape.num_elements();         \
+    state.SetItemsProcessed(num_items);                                        \
+    state.SetBytesProcessed(num_items * sizeof(float));                        \
+  }                                                                            \
+  BENCHMARK(BM_##DEVICE##_roll_outer)                                          \
+      ->UseRealTime()                                                          \
+      ->ArgPair(256, 256)                                                      \
+      ->ArgPair(512, 512)                                                      \
+      ->ArgPair(1024, 1024)                                                    \
      ->ArgPair(2048, 2048)

-#define BM_ROLL_ALL(DEVICE)                                                   \
-  static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) {      \
-    TensorShape shape{rows, columns};                                         \
-    const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
-    testing::ItemsProcessed(num_items);                                       \
-    testing::BytesProcessed(num_items * sizeof(float));                       \
-    testing::UseRealTime();                                                   \
-    test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters);                 \
-  }                                                                           \
-  BENCHMARK(BM_##DEVICE##_roll_all)                                           \
-      ->ArgPair(256, 256)                                                     \
-      ->ArgPair(512, 512)                                                     \
-      ->ArgPair(1024, 1024)                                                   \
+#define BM_ROLL_ALL(DEVICE)                                                    \
+  static void BM_##DEVICE##_roll_all(::testing::benchmark::State& state) {     \
+    const int rows = state.range(0);                                           \
+    const int columns = state.range(1);                                        \
+                                                                               \
+    TensorShape shape{rows, columns};                                          \
+    test::Benchmark(#DEVICE, RollGraph(shape, 1), /*old_benchmark_api*/ false) \
+        .Run(state);                                                           \
+    const int64 num_items =                                                    \
+        static_cast<int64>(state.iterations()) * shape.num_elements();         \
+    state.SetItemsProcessed(num_items);                                        \
+    state.SetBytesProcessed(num_items * sizeof(float));                        \
+  }                                                                            \
+  BENCHMARK(BM_##DEVICE##_roll_all)                                            \
+      ->UseRealTime()                                                          \
+      ->ArgPair(256, 256)                                                      \
+      ->ArgPair(512, 512)                                                      \
+      ->ArgPair(1024, 1024)                                                    \
      ->ArgPair(2048, 2048)

 BM_ROLL_OUTER(cpu);
--- a/tensorflow/core/kernels/save_op_test.cc
+++ b/tensorflow/core/kernels/save_op_test.cc
@ -663,8 +663,8 @@ TEST_F(SaveOpSlices2Test, TwoSlices) {

 // Benchmark-related code below.

-static void BM_LargeTensorWrite(int iters, int num_elements) {
-  testing::StopTiming();
+void BM_LargeTensorWrite(::testing::benchmark::State& state) {
+  const int num_elements = state.range(0);

  // 4 * num_elements bytes total , since sizeof(float) == 4.
  Tensor tensor(DT_FLOAT, TensorShape({num_elements}));
@ -689,8 +689,9 @@ static void BM_LargeTensorWrite(int iters, int num_elements) {
  VLOG(1) << "Save op's output path: " << temp_filename;
  VLOG(1) << "# nodes in Graph: " << g->num_nodes();

-  testing::StartTiming();
-  test::Benchmark("cpu", g, &session_options).Run(iters);
+  test::Benchmark("cpu", g, &session_options, nullptr, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
 }
 BENCHMARK(BM_LargeTensorWrite)->Arg((1 << 30) / 4 /* 1GB float tensor */);

--- a/tensorflow/core/kernels/scan_ops_test.cc
+++ b/tensorflow/core/kernels/scan_ops_test.cc
@ -67,79 +67,120 @@ static Graph* ThreeDYCumsum(int num_y, int num_z, bool reverse = false) {
 }

 template <typename T>
-static void LargeOneDimensional(int iters, const string& device, int num_x,
+static void LargeOneDimensional(::testing::benchmark::State& state,
+                                const string& device, int num_x,
                                bool reverse = false) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * sizeof(T));
-  test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse)).Run(iters);
+  test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          sizeof(T));
 }

-static void DoRowCumsum(int iters, const string& device, int num_x, int num_y,
+static void DoRowCumsum(::testing::benchmark::State& state,
+                        const string& device, int num_x, int num_y,
                        bool reverse = false) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, RowCumsum(num_x, num_y, reverse)).Run(iters);
+  test::Benchmark(device, RowCumsum(num_x, num_y, reverse),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void DoColCumsum(int iters, const string& device, int num_x, int num_y,
+static void DoColCumsum(::testing::benchmark::State& state,
+                        const string& device, int num_x, int num_y,
                        bool reverse = false) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ColCumsum(num_x, num_y, reverse)).Run(iters);
+  test::Benchmark(device, ColCumsum(num_x, num_y, reverse),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void Do3DYCumsum(int iters, const string& device, int num_x, int num_y,
+static void Do3DYCumsum(::testing::benchmark::State& state,
+                        const string& device, int num_x, int num_y,
                        bool reverse = false) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse)).Run(iters);
+  test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void BM_OneDCumsumGPU(int iters, int num_x) {
-  LargeOneDimensional<float>(iters, "gpu", num_x);
+static void BM_OneDCumsumGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+
+  LargeOneDimensional<float>(state, "gpu", num_x);
 }
 BENCHMARK(BM_OneDCumsumGPU)->Range(1, 1 << 21);

-static void BM_OneDCumsumGPUHalf(int iters, int num_x) {
-  LargeOneDimensional<Eigen::half>(iters, "gpu", num_x);
+static void BM_OneDCumsumGPUHalf(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+
+  LargeOneDimensional<Eigen::half>(state, "gpu", num_x);
 }
 BENCHMARK(BM_OneDCumsumGPUHalf)->Range(1, 1 << 21);

-static void BM_Sum2DRowCumsumGPU(int iters, int num_x, int num_y) {
-  DoRowCumsum(iters, "gpu", num_x, num_y);
+static void BM_Sum2DRowCumsumGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoRowCumsum(state, "gpu", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DRowCumsumGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DColumnCumsumGPU(int iters, int num_x, int num_y) {
-  DoColCumsum(iters, "gpu", num_x, num_y);
+static void BM_Sum2DColumnCumsumGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoColCumsum(state, "gpu", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DColumnCumsumGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum3DYCumsumGPU(int iters, int num_x, int num_y) {
-  Do3DYCumsum(iters, "gpu", num_x, num_y);
+static void BM_Sum3DYCumsumGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  Do3DYCumsum(state, "gpu", num_x, num_y);
 }
 BENCHMARK(BM_Sum3DYCumsumGPU)->RangePair(64, 4096, 64, 4096);

-static void BM_OneDCumsumGPU_reverse(int iters, int num_x) {
-  LargeOneDimensional<float>(iters, "gpu", num_x, true);
+static void BM_OneDCumsumGPU_reverse(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+
+  LargeOneDimensional<float>(state, "gpu", num_x, true);
 }
 BENCHMARK(BM_OneDCumsumGPU_reverse)->Range(1, 1 << 21);

-static void BM_Sum2DRowCumsumGPU_reverse(int iters, int num_x, int num_y) {
-  DoRowCumsum(iters, "gpu", num_x, num_y, true);
+static void BM_Sum2DRowCumsumGPU_reverse(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoRowCumsum(state, "gpu", num_x, num_y, true);
 }
 BENCHMARK(BM_Sum2DRowCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DColumnCumsumGPU_reverse(int iters, int num_x, int num_y) {
-  DoColCumsum(iters, "gpu", num_x, num_y, true);
+static void BM_Sum2DColumnCumsumGPU_reverse(
+    ::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoColCumsum(state, "gpu", num_x, num_y, true);
 }
 BENCHMARK(BM_Sum2DColumnCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum3DYCumsumGPU_reverse(int iters, int num_x, int num_y) {
-  Do3DYCumsum(iters, "gpu", num_x, num_y, true);
+static void BM_Sum3DYCumsumGPU_reverse(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  Do3DYCumsum(state, "gpu", num_x, num_y, true);
 }
 BENCHMARK(BM_Sum3DYCumsumGPU_reverse)->RangePair(32, 2048, 32, 2048);

--- a/tensorflow/core/kernels/scatter_nd_op_test.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_test.cc
@ -254,8 +254,8 @@ class ScatterNdUpdateBM : public ScatterNdUpdateOpTest {
 };

 template <typename Index>
-static void BM_ScatterNdHelper(int iters, int embedding_size, const char* op) {
-  testing::StopTiming();
+void BM_ScatterNdHelper(::testing::benchmark::State& state, int embedding_size,
+                        const char* op) {
  const int kRows = 10000000 / embedding_size;
  std::vector<float> values;
  values.reserve(kRows);
@ -280,27 +280,33 @@ static void BM_ScatterNdHelper(int iters, int embedding_size, const char* op) {
  bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
  bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
                              updates);
-  testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
-                          iters);
-  testing::StartTiming();
-  while (iters-- > 0) {
+  for (auto i : state) {
    Status s = bm.RunOpKernel();
  }
-  testing::StopTiming();
+  state.SetItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
+                          state.iterations());
 }

-static void BM_ScatterNdUpdateInt32(int iters, int embedding_size) {
-  BM_ScatterNdHelper<int32>(iters, embedding_size, "ScatterNdUpdate");
+void BM_ScatterNdUpdateInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterNdHelper<int32>(state, embedding_size, "ScatterNdUpdate");
 }
-static void BM_ScatterNdUpdateInt64(int iters, int embedding_size) {
-  BM_ScatterNdHelper<int64>(iters, embedding_size, "ScatterNdUpdate");
+void BM_ScatterNdUpdateInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterNdHelper<int64>(state, embedding_size, "ScatterNdUpdate");
 }

-static void BM_ScatterNdAddInt32(int iters, int embedding_size) {
-  BM_ScatterNdHelper<int32>(iters, embedding_size, "ScatterNdAdd");
+void BM_ScatterNdAddInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterNdHelper<int32>(state, embedding_size, "ScatterNdAdd");
 }
-static void BM_ScatterNdAddInt64(int iters, int embedding_size) {
-  BM_ScatterNdHelper<int64>(iters, embedding_size, "ScatterNdAdd");
+void BM_ScatterNdAddInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterNdHelper<int64>(state, embedding_size, "ScatterNdAdd");
 }

 BENCHMARK(BM_ScatterNdUpdateInt32)
--- a/tensorflow/core/kernels/scatter_op_test.cc
+++ b/tensorflow/core/kernels/scatter_op_test.cc
@ -280,9 +280,8 @@ class ScatterUpdateBM : public ScatterUpdateOpTest {
 };

 template <typename Index>
-static void BM_ScatterHelper(int iters, int embedding_size, const char* op,
-                             bool big_num_updates = false) {
-  testing::StopTiming();
+void BM_ScatterHelper(::testing::benchmark::State& state, int embedding_size,
+                      const char* op, bool big_num_updates = false) {
  const int kRows = 10000000 / embedding_size;
  std::vector<float> values;
  values.reserve(kRows);
@ -307,59 +306,83 @@ static void BM_ScatterHelper(int iters, int embedding_size, const char* op,
  bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
  bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
                              updates);
-  testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
-                          iters);
-  testing::StartTiming();
-  while (iters-- > 0) {
+  for (auto i : state) {
    Status s = bm.RunOpKernel();
  }
-  testing::StopTiming();
+  state.SetItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
+                          state.iterations());
 }

-static void BM_ScatterUpdateInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterUpdate");
+void BM_ScatterUpdateInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterUpdate");
 }
-static void BM_ScatterUpdateInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterUpdate");
+void BM_ScatterUpdateInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterUpdate");
 }

-static void BM_ScatterAddInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd");
+void BM_ScatterAddInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterAdd");
 }

-static void BM_ScatterAddInt32Large(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd", true);
+void BM_ScatterAddInt32Large(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterAdd", true);
 }
-static void BM_ScatterAddInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterAdd");
+void BM_ScatterAddInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterAdd");
 }

-static void BM_ScatterMulInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMul");
+void BM_ScatterMulInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterMul");
 }
-static void BM_ScatterMulInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMul");
+void BM_ScatterMulInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterMul");
 }

-static void BM_ScatterDivInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterDiv");
+void BM_ScatterDivInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterDiv");
 }
-static void BM_ScatterDivInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterDiv");
+void BM_ScatterDivInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterDiv");
 }

-static void BM_ScatterMinInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMin");
+void BM_ScatterMinInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterMin");
 }
-static void BM_ScatterMinInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMin");
+void BM_ScatterMinInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterMin");
 }

-static void BM_ScatterMaxInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMax");
+void BM_ScatterMaxInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterMax");
 }
-static void BM_ScatterMaxInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMax");
+void BM_ScatterMaxInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterMax");
 }

 BENCHMARK(BM_ScatterUpdateInt32)
--- a/tensorflow/core/kernels/segment_reduction_ops_test.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc
@ -39,10 +39,9 @@ limitations under the License.
 namespace tensorflow {

 template <typename Index>
-static void BM_SegmentReduction(int iters, const string& reduction,
-                                Index num_rows, Index num_cols,
-                                Index segment_size) {
-  testing::StopTiming();
+static void BM_SegmentReduction(::testing::benchmark::State& state,
+                                const string& reduction, Index num_rows,
+                                Index num_cols, Index segment_size) {
  std::unique_ptr<Device> device(
      DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));

@ -81,24 +80,25 @@ static void BM_SegmentReduction(int iters, const string& reduction,

  reduction_op->Compute(reduction_context.get());
  TF_CHECK_OK(reduction_context->status());
-  testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
    delete reduction_context->release_output(0).tensor;
    reduction_op->Compute(reduction_context.get());
  }
  int64 bytes_per_iter =
      static_cast<int64>(num_rows * num_cols * sizeof(float));
-  testing::BytesProcessed(bytes_per_iter * iters);
+  state.SetBytesProcessed(bytes_per_iter * state.iterations());
 }

-#define BM_Reduce(O, R, C, S)                                      \
-  static void BM_Reduce_##O##_##R##_##C##_##S##_int32(int iters) { \
-    BM_SegmentReduction<int32>(iters, #O, R, C, S);                \
-  }                                                                \
-  static void BM_Reduce_##O##_##R##_##C##_##S##_int64(int iters) { \
-    BM_SegmentReduction<int64>(iters, #O, R, C, S);                \
-  }                                                                \
-  BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32);              \
+#define BM_Reduce(O, R, C, S)                          \
+  static void BM_Reduce_##O##_##R##_##C##_##S##_int32( \
+      ::testing::benchmark::State & state) {           \
+    BM_SegmentReduction<int32>(state, #O, R, C, S);    \
+  }                                                    \
+  static void BM_Reduce_##O##_##R##_##C##_##S##_int64( \
+      ::testing::benchmark::State & state) {           \
+    BM_SegmentReduction<int64>(state, #O, R, C, S);    \
+  }                                                    \
+  BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32);  \
  BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64);

 #define BM_Reduce_Arg(R, C, S)    \
@ -113,8 +113,8 @@ BM_Reduce_Arg(64, 32, 2);
 BM_Reduce_Arg(4096, 32, 2);
 BM_Reduce_Arg(4096, 128, 2);

-static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
-  testing::StopTiming();
+static void SparseSegmentMeanGradHelper(::testing::benchmark::State& state,
+                                        float uniqueness, int size) {
  Graph* g = new Graph(OpRegistry::Global());
  CHECK_LE(uniqueness, 1.0);
  CHECK_GT(uniqueness, 0.0);
@ -148,22 +148,24 @@ static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
                  .Attr("T", DT_FLOAT)
                  .Finalize(g, &node));

-  testing::UseRealTime();
-  testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) *
-                          sizeof(float));
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
+                          (kDim1 * kDim2) * sizeof(float));
 }

-static void BM_SparseSegmentMeanGrad_Low(int iters, int size) {
-  return SparseSegmentMeanGradHelper(iters, 1.0, size);
+static void BM_SparseSegmentMeanGrad_Low(::testing::benchmark::State& state) {
+  const int size = state.range(0);
+
+  return SparseSegmentMeanGradHelper(state, 1.0, size);
 }

-static void BM_SparseSegmentMeanGrad_High(int iters, int size) {
-  return SparseSegmentMeanGradHelper(iters, 0.01, size);
+static void BM_SparseSegmentMeanGrad_High(::testing::benchmark::State& state) {
+  const int size = state.range(0);
+
+  return SparseSegmentMeanGradHelper(state, 0.01, size);
 }

-BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000);
-BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_SparseSegmentMeanGrad_Low)->UseRealTime()->Arg(1000)->Arg(100000);
+BENCHMARK(BM_SparseSegmentMeanGrad_High)->UseRealTime()->Arg(1000)->Arg(100000);

 }  // namespace tensorflow
--- a/tensorflow/core/kernels/sendrecv_ops_test.cc
+++ b/tensorflow/core/kernels/sendrecv_ops_test.cc
@ -54,21 +54,21 @@ static Graph* Recv() {
  return g;
 }

-static void BM_Send(int iters) {
-  testing::UseRealTime();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous)
-      .Run(iters);
+void BM_Send(::testing::benchmark::State& state) {
+  test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }
-BENCHMARK(BM_Send);
+BENCHMARK(BM_Send)->UseRealTime();

-static void BM_Recv(int iters) {
-  testing::UseRealTime();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous)
-      .Run(iters);
+void BM_Recv(::testing::benchmark::State& state) {
+  test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }
-BENCHMARK(BM_Recv);
+BENCHMARK(BM_Recv)->UseRealTime();

 }  // namespace
 }  // namespace tensorflow
--- a/tensorflow/core/kernels/slice_op_test.cc
+++ b/tensorflow/core/kernels/slice_op_test.cc
@ -37,8 +37,8 @@ namespace {
 // For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
 // in size, and concat them together along "concat_dimension"
 template <typename T>
-static void SliceHelper(int iters, int size) {
-  testing::StopTiming();
+static void SliceHelper(::testing::benchmark::State& state) {
+  const int size = state.range(0);
  Graph* g = new Graph(OpRegistry::Global());
  DataType dt = DataTypeToEnum<T>::v();
  int kDim = 100;
@ -65,26 +65,24 @@ static void SliceHelper(int iters, int size) {
                  .Finalize(g, &node));
  FixupSourceAndSinkEdges(g);

-  testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
-  testing::StartTiming();
  test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
-                  "SINGLE_THREADED_EXECUTOR")
-      .Run(iters);
-
-  testing::UseRealTime();
+                  "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim * size *
+                          sizeof(T));
 }

-static void BM_SliceFloat(int iters, int dim2) {
-  SliceHelper<float>(iters, dim2);
+void BM_SliceFloat(::testing::benchmark::State& state) {
+  SliceHelper<float>(state);
 }

-BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
+BENCHMARK(BM_SliceFloat)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);

-static void BM_SliceBFloat16(int iters, int dim2) {
-  SliceHelper<bfloat16>(iters, dim2);
+void BM_SliceBFloat16(::testing::benchmark::State& state) {
+  SliceHelper<bfloat16>(state);
 }

-BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
+BENCHMARK(BM_SliceBFloat16)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);

 }  // namespace
 }  // namespace tensorflow
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@ -276,15 +276,18 @@ static ST MakeSparseTensor(Graph* g, int B, int M, int N, int nnz_inner) {

 // [8, 4, N{nnz}] cmul [8, 4, N]
 #define BM_SparseMatCMulDenseMatArgs(N, NNZ_INNER)                             \
-  static void BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER(int iters) {          \
+  static void BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER(                      \
+      ::testing::benchmark::State& state) {                                    \
    Graph* g = new Graph(OpRegistry::Global());                                \
    Node* dense = MakeTensor(g, 8, 4, N);                                      \
    ST sp = MakeSparseTensor(g, 8, 4, N, NNZ_INNER);                           \
                                                                               \
-    testing::ItemsProcessed(static_cast<int64>(iters * 8 * 4 * N * 2));        \
    test::Benchmark(                                                           \
-        "cpu", SparseMatCMulDenseMat(g, sp.indices, sp.vals, sp.shape, dense)) \
-        .Run(iters);                                                           \
+        "cpu", SparseMatCMulDenseMat(g, sp.indices, sp.vals, sp.shape, dense), \
+        /*old_benchmark_api*/ false)                                           \
+        .Run(state);                                                           \
+    state.SetItemsProcessed(                                                   \
+        static_cast<int64>(state.iterations() * 8 * 4 * N * 2));               \
  }                                                                            \
  BENCHMARK(BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER)

--- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
@ -198,9 +198,11 @@ TEST_F(SparseToDenseTest, ThreeD_MultValues) {

 }  // namespace

-static void BM_SparseToDense(int iters, int NDIM, int N) {
+static void BM_SparseToDense(::testing::benchmark::State& state) {
+  const int NDIM = state.range(0);
+  const int N = state.range(1);
+
  // TODO(zhifengc): Switch to use kernel_benchmark_testlib.h
-  tensorflow::testing::StopTiming();

  const int IndexDim = (NDIM == 1) ? 0 : 1;

@ -253,18 +255,15 @@ static void BM_SparseToDense(int iters, int NDIM, int N) {

  std::unique_ptr<OpKernelContext> sparse_context(new OpKernelContext(&params));
  op->Compute(sparse_context.get());
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
    delete sparse_context->release_output(0).tensor;
    op->Compute(sparse_context.get());
    TF_ASSERT_OK(sparse_context->status());
  }
-  tensorflow::testing::StopTiming();

  // processing input, mainly
  int64 bytes_per_iter = static_cast<int64>((N + N * NDIM) * sizeof(float));
-
-  tensorflow::testing::BytesProcessed(bytes_per_iter * iters);
+  state.SetBytesProcessed(bytes_per_iter * state.iterations());
 }

 BENCHMARK(BM_SparseToDense)
--- a/tensorflow/core/kernels/sparse_xent_op_test.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_test.cc
@ -41,11 +41,15 @@ static Graph* SparseXent(int batch_size, int num_classes) {
  return g;
 }

-#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                          \
-  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \
-    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
-    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);      \
-  }                                                                     \
+#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                               \
+  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(                  \
+      ::testing::benchmark::State& state) {                                  \
+    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS),                       \
+                    /*old_benchmark_api*/ false)                             \
+        .Run(state);                                                         \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * BATCH * \
+                            CLASS);                                          \
+  }                                                                          \
  BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE);

 /// The representative tests for ptb_word on GPU