Internal tests cleanup.
PiperOrigin-RevId: 339762896
Change-Id: I2e06cf0b409a1e621cd567060ba8670ce70d34c9
Parent: b6aa9f3368
Commit: 04f62ae3b1
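Every hunk below applies the same mechanical migration: benchmark functions that used to take an explicit `int iters` and drive measurement through the free functions in `tensorflow::testing` (StopTiming/StartTiming, ItemsProcessed, BytesProcessed, UseRealTime, SetLabel) now take a `::testing::benchmark::State&`, read their arguments from `state.range(...)`, pass `/*old_benchmark_api*/ false` to `test::Benchmark` so that `Run(state)` owns the timed loop, and report counters on `state` after the run, with `UseRealTime()` moving onto the `BENCHMARK(...)` registration. The sketch below is illustrative only and not part of the commit: `BM_Example`, `MakeExampleGraph`, and `kItemsPerIter` are hypothetical names, and it assumes the usual TensorFlow test utilities (`test::Benchmark`, the `BENCHMARK` macro, and the `int64` alias) are available.

// Illustrative sketch only -- not part of this commit. BM_Example,
// MakeExampleGraph, and kItemsPerIter are hypothetical names.
//
// Old style:
//   static void BM_Example(int iters) {
//     testing::StopTiming();
//     testing::ItemsProcessed(static_cast<int64>(iters) * kItemsPerIter);
//     testing::UseRealTime();
//     Graph* g = MakeExampleGraph(/*size=*/100);
//     testing::StartTiming();
//     test::Benchmark("cpu", g).Run(iters);
//   }
//   BENCHMARK(BM_Example)->Arg(100);
//
// New style:
constexpr int64 kItemsPerIter = 1024;  // hypothetical amount of work per iteration

static void BM_Example(::testing::benchmark::State& state) {
  const int size = state.range(0);    // benchmark arguments now come from state
  Graph* g = MakeExampleGraph(size);  // hypothetical graph builder
  // Passing old_benchmark_api = false lets Run(state) drive the timed loop.
  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
  // Counters are reported after the run from the measured iteration count.
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * kItemsPerIter);
}
BENCHMARK(BM_Example)->UseRealTime()->Arg(100);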
@@ -44,38 +44,34 @@ static Graph* MakeGraph(int split_dim, int num_split,
 }

 #define BM_SPLIT_1D(num_split, chunk_size) \
-  static void BM_Split_1d_##num_split##_##chunk_size(int iters) { \
-    testing::StopTiming(); \
-    testing::ItemsProcessed(static_cast<int64>(iters) * num_split * \
-                            chunk_size); \
+  static void BM_Split_1d_##num_split##_##chunk_size( \
+      ::testing::benchmark::State& state) { \
     auto label = \
         strings::Printf("1-D %d chunks of %d each", num_split, chunk_size); \
-    testing::SetLabel(label); \
-    testing::UseRealTime(); \
+    state.SetLabel(label); \
     auto g = MakeGraph(/* split_dim = */ 0, num_split, {chunk_size}); \
-    testing::StartTiming(); \
-    test::Benchmark("cpu", g).Run(iters); \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * \
+                            num_split * chunk_size); \
   } \
-  BENCHMARK(BM_Split_1d_##num_split##_##chunk_size);
+  BENCHMARK(BM_Split_1d_##num_split##_##chunk_size)->UseRealTime();

 #define BM_SPLIT_2D(split_dim, num_split, chunk_size0, chunk_size1) \
   static void \
   BM_Split_2d_##split_dim##_##num_split##_##chunk_size0##_##chunk_size1( \
-      int iters) { \
-    testing::StopTiming(); \
-    testing::ItemsProcessed(static_cast<int64>(iters) * num_split * \
-                            chunk_size0 * chunk_size1); \
+      ::testing::benchmark::State& state) { \
     auto label = \
         strings::Printf("2-D %d chunks in dim %d of (%d * %d) each", \
                         num_split, split_dim, chunk_size0, chunk_size1); \
-    testing::SetLabel(label); \
-    testing::UseRealTime(); \
+    state.SetLabel(label); \
     auto g = MakeGraph(split_dim, num_split, {chunk_size0, chunk_size1}); \
-    testing::StartTiming(); \
-    test::Benchmark("cpu", g).Run(iters); \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * \
+                            num_split * chunk_size0 * chunk_size1); \
   } \
   BENCHMARK( \
-      BM_Split_2d_##split_dim##_##num_split##_##chunk_size0##_##chunk_size1);
+      BM_Split_2d_##split_dim##_##num_split##_##chunk_size0##_##chunk_size1) \
+      ->UseRealTime();

 BM_SPLIT_1D(5, 1);
 BM_SPLIT_1D(262144, 1);

@@ -73,43 +73,40 @@ static Graph* MakeGraph(int split_dim, const std::vector<int64>& size_splits,
 }

 #define BM_SPLITV_1D(num_split, total_size) \
-  static void BM_SplitV_1d_##num_split##_##total_size(int iters) { \
-    testing::StopTiming(); \
-    testing::ItemsProcessed(static_cast<int64>(iters) * total_size); \
+  static void BM_SplitV_1d_##num_split##_##total_size( \
+      ::testing::benchmark::State& state) { \
     auto label = \
         strings::Printf("1-D %d chunks totaling %d", num_split, total_size); \
-    testing::SetLabel(label); \
-    testing::UseRealTime(); \
+    state.SetLabel(label); \
     auto g = MakeGraph(/* split_dim = */ 0, \
                        GenerateRandomIntsWithSum(total_size, num_split), \
                        {total_size}); \
-    testing::StartTiming(); \
-    test::Benchmark("cpu", g).Run(iters); \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * \
+                            total_size); \
   } \
-  BENCHMARK(BM_SplitV_1d_##num_split##_##total_size);
+  BENCHMARK(BM_SplitV_1d_##num_split##_##total_size)->UseRealTime();

 #define BM_SPLITV_2D(split_dim, num_split, total_size0, total_size1) \
   static void \
   BM_SplitV_2d_##split_dim##_##num_split##_##total_size0##_##total_size1( \
-      int iters) { \
-    testing::StopTiming(); \
+      ::testing::benchmark::State& state) { \
     std::vector<int64> total_size_vec{total_size0, total_size1}; \
-    testing::ItemsProcessed(static_cast<int64>(iters) * total_size0 * \
-                            total_size1); \
     auto label = \
         strings::Printf("2-D %d chunks in dim %d totaling (%d * %d)", \
                         num_split, split_dim, total_size0, total_size1); \
-    testing::SetLabel(label); \
-    testing::UseRealTime(); \
+    state.SetLabel(label); \
     auto g = MakeGraph( \
         split_dim, \
         GenerateRandomIntsWithSum(total_size_vec[split_dim], num_split), \
         {total_size0, total_size1}); \
-    testing::StartTiming(); \
-    test::Benchmark("cpu", g).Run(iters); \
+    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * \
+                            total_size0 * total_size1); \
   } \
   BENCHMARK( \
-      BM_SplitV_2d_##split_dim##_##num_split##_##total_size0##_##total_size1);
+      BM_SplitV_2d_##split_dim##_##num_split##_##total_size0##_##total_size1) \
+      ->UseRealTime();

 BM_SPLITV_1D(5, 20);
 BM_SPLITV_1D(262144, 1000000);
@@ -38,8 +38,8 @@ namespace {
 // For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
 // in size, and concat them together along "concat_dimension"
 template <typename T>
-static void SliceHelper(int iters, int size) {
-  testing::StopTiming();
+static void SliceHelper(::testing::benchmark::State& state) {
+  const int size = state.range(0);
   Graph* g = new Graph(OpRegistry::Global());
   DataType dt = DataTypeToEnum<T>::v();
   int kDim = 100;

@@ -70,32 +70,30 @@ static void SliceHelper(int iters, int size) {
           .Attr("T", dt)
           .Finalize(g, &node));

-  testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
-  testing::UseRealTime();
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim * size *
+                          sizeof(T));
 }

-static void BM_SliceFloat(int iters, int dim2) {
-  SliceHelper<float>(iters, dim2);
+void BM_SliceFloat(::testing::benchmark::State& state) {
+  SliceHelper<float>(state);
 }

-BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
+BENCHMARK(BM_SliceFloat)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);

-static void BM_SliceComplex64(int iters, int dim2) {
-  SliceHelper<std::complex<float>>(iters, dim2);
+void BM_SliceComplex64(::testing::benchmark::State& state) {
+  SliceHelper<std::complex<float>>(state);
 }

-BENCHMARK(BM_SliceComplex64)->Arg(100)->Arg(1000)->Arg(10000);
+BENCHMARK(BM_SliceComplex64)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);

-static void BM_SliceBFloat16(int iters, int dim2) {
-  SliceHelper<bfloat16>(iters, dim2);
+void BM_SliceBFloat16(::testing::benchmark::State& state) {
+  SliceHelper<bfloat16>(state);
 }

-BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
+BENCHMARK(BM_SliceBFloat16)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);

-static void BM_ValidateStridedSliceOp(int iters) {
-  testing::StopTiming();
+void BM_ValidateStridedSliceOp(::testing::benchmark::State& state) {
   int kDim = 100;
   int kMaxSize = 15000;
   int size = 100;

@@ -104,8 +102,7 @@ static void BM_ValidateStridedSliceOp(int iters) {
   Tensor strides = test::AsTensor<int32>({1, 1});
   TensorShape input_shape({2 * kDim, kMaxSize});

-  testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     TensorShape processing_shape, final_shape;
     bool is_identity = true, slice_dim0 = true, is_simple_slice = true;
     gtl::InlinedVector<int64, 4> begin_out, end_out, strides_out;
@@ -76,17 +76,17 @@ Graph* SetupStringSplitGraph(const Tensor& input) {
   return g;
 }

-void BM_StringSplit(int iters, int batch_size) {
-  testing::StopTiming();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  testing::UseRealTime();
+static void BM_StringSplit(::testing::benchmark::State& state) {
+  const int batch_size = state.range(0);
+
   Tensor input = GetTestTensor(batch_size);
   Graph* g = SetupStringSplitGraph(input);
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }

 BENCHMARK(BM_StringSplit)
+    ->UseRealTime()
     ->Arg(1)
     ->Arg(8)
     ->Arg(16)

@@ -107,17 +107,17 @@ Graph* SetupStringSplitV2Graph(const Tensor& input) {
   return g;
 }

-void BM_StringSplitV2(int iters, int batch_size) {
-  testing::StopTiming();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  testing::UseRealTime();
+static void BM_StringSplitV2(::testing::benchmark::State& state) {
+  const int batch_size = state.range(0);
+
   Tensor input = GetTestTensor(batch_size);
   Graph* g = SetupStringSplitV2Graph(input);
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }

 BENCHMARK(BM_StringSplitV2)
+    ->UseRealTime()
     ->Arg(1)
     ->Arg(8)
     ->Arg(16)
@@ -149,27 +149,26 @@ Graph* SetupSubstrGraph(const Tensor& input, const int32 pos, const int32 len,
   return g;
 }

-void BM_SubstrByte(int iters, int batch_size) {
-  testing::StopTiming();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  testing::UseRealTime();
+static void BM_SubstrByte(::testing::benchmark::State& state) {
+  const int batch_size = state.range(0);
+
   Tensor input = GetTestTensor(batch_size);
   Graph* g = SetupSubstrGraph(input, 3, 30, kByteUnit);
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(state.iterations());
 }

-void BM_SubstrUTF8(int iters, int batch_size) {
-  testing::StopTiming();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  testing::UseRealTime();
+static void BM_SubstrUTF8(::testing::benchmark::State& state) {
+  const int batch_size = state.range(0);
+
   Tensor input = GetTestUTF8Tensor(batch_size);
   Graph* g = SetupSubstrGraph(input, 3, 30, kUTF8Unit);
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(state.iterations());
 }

 BENCHMARK(BM_SubstrByte)
+    ->UseRealTime()
     ->Arg(1)
     ->Arg(8)
     ->Arg(16)

@@ -178,6 +177,7 @@ BENCHMARK(BM_SubstrByte)
     ->Arg(128)
     ->Arg(256);
 BENCHMARK(BM_SubstrUTF8)
+    ->UseRealTime()
     ->Arg(1)
     ->Arg(8)
     ->Arg(16)
@@ -103,14 +103,18 @@ static void SGD(int32 n, Graph** init_g, Graph** train_g) {
   }
 }

-static void BM_SGD(int iters, int params) {
-  const int64 tot = static_cast<int64>(iters) * params;
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
+static void BM_SGD(::testing::benchmark::State& state) {
+  const int params = state.range(0);
+
   Graph* init;
   Graph* train;
   SGD(params, &init, &train);
-  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+  test::Benchmark("cpu", train, GetOptions(), init, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  const int64 tot = static_cast<int64>(state.iterations()) * params;
+  state.SetItemsProcessed(tot);
+  state.SetBytesProcessed(tot * sizeof(float));
 }
 BENCHMARK(BM_SGD)->Arg(128 << 10)->Arg(256 << 10);

@@ -135,14 +139,18 @@ static void Adagrad(int32 n, Graph** init_g, Graph** train_g) {
   }
 }

-static void BM_Adagrad(int iters, int params) {
-  const int64 tot = static_cast<int64>(iters) * params;
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
+static void BM_Adagrad(::testing::benchmark::State& state) {
+  const int params = state.range(0);
+
   Graph* init;
   Graph* train;
   Adagrad(params, &init, &train);
-  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+  test::Benchmark("cpu", train, GetOptions(), init, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  const int64 tot = static_cast<int64>(state.iterations()) * params;
+  state.SetItemsProcessed(tot);
+  state.SetBytesProcessed(tot * sizeof(float));
 }
 BENCHMARK(BM_Adagrad)->Arg(128 << 10)->Arg(256 << 10);

@@ -168,17 +176,22 @@ static void SparseAdagrad(int32 m, int32 n, Graph** init_g, Graph** train_g) {
     *train_g = g;
   }
 }
-static void BM_SparseAdagrad(int iters, int m, int n) {
-  const int64 tot = static_cast<int64>(iters) * m * n;
-  testing::UseRealTime();
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
+static void BM_SparseAdagrad(::testing::benchmark::State& state) {
+  const int m = state.range(0);
+  const int n = state.range(1);
+
   Graph* init;
   Graph* train;
   SparseAdagrad(m, n, &init, &train);
-  test::Benchmark("cpu", train, GetMultiThreadedOptions(), init).Run(iters);
+  test::Benchmark("cpu", train, GetMultiThreadedOptions(), init, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  const int64 tot = static_cast<int64>(state.iterations()) * m * n;
+  state.SetItemsProcessed(tot);
+  state.SetBytesProcessed(tot * sizeof(float));
 }
 BENCHMARK(BM_SparseAdagrad)
+    ->UseRealTime()
     ->ArgPair(128, 1 << 10)
     ->ArgPair(128, 4 << 10)
     ->ArgPair(128, 8 << 10)

@@ -208,14 +221,18 @@ static void Momentum(int32 n, Graph** init_g, Graph** train_g) {
   }
 }

-static void BM_Momentum(int iters, int params) {
-  const int64 tot = static_cast<int64>(iters) * params;
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
+static void BM_Momentum(::testing::benchmark::State& state) {
+  const int params = state.range(0);
+
   Graph* init;
   Graph* train;
   Momentum(params, &init, &train);
-  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+  test::Benchmark("cpu", train, GetOptions(), init, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  const int64 tot = static_cast<int64>(state.iterations()) * params;
+  state.SetItemsProcessed(tot);
+  state.SetBytesProcessed(tot * sizeof(float));
 }
 BENCHMARK(BM_Momentum)->Arg(128 << 10)->Arg(256 << 10);
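In the training-op benchmarks above, the Benchmark object is built with an init graph and session options, so the trailing old_benchmark_api flag can only be reached by also spelling out the intermediate constructor arguments that previously used their defaults (a null pointer and an empty string). A sketch of that call shape follows; "train", "init", and GetOptions() stand in for helpers built elsewhere in the test file, and the parameter names in the comments are my reading of test::Benchmark's constructor rather than something shown in this diff.

// Sketch only; names in comments are assumed, not confirmed by this diff.
test::Benchmark("cpu", train, GetOptions(), init,
                /*rendezvous (assumed name)=*/nullptr,
                /*executor_type (assumed name)=*/"",
                /*old_benchmark_api*/ false)
    .Run(state);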
@@ -251,19 +268,26 @@ static void Adam(int32 n, Graph** init_g, Graph** train_g) {
   }
 }

-static void BM_Adam(int iters, int params, int is_multi_threaded) {
-  const int64 tot = static_cast<int64>(iters) * params;
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
+static void BM_Adam(::testing::benchmark::State& state) {
+  const int params = state.range(0);
+  const int is_multi_threaded = state.range(1);
+
   Graph* init;
   Graph* train;
   Adam(params, &init, &train);
   if (is_multi_threaded) {
     // Use max thread number if test performance.
-    test::Benchmark("cpu", train, nullptr, init).Run(iters);
+    test::Benchmark("cpu", train, nullptr, init, nullptr, "",
+                    /*old_benchmark_api*/ false)
+        .Run(state);
   } else {
-    test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+    test::Benchmark("cpu", train, GetOptions(), init, nullptr, "",
+                    /*old_benchmark_api*/ false)
+        .Run(state);
   }
+  const int64 tot = static_cast<int64>(state.iterations()) * params;
+  state.SetItemsProcessed(tot);
+  state.SetBytesProcessed(tot * sizeof(float));
 }
 BENCHMARK(BM_Adam)->ArgPair(128 << 10, 0)->ArgPair(256 << 10, 0);
 BENCHMARK(BM_Adam)->ArgPair(256 << 5, 1)->ArgPair(256 << 16, 1);
@@ -297,14 +321,18 @@ static void RMSProp(int32 n, Graph** init_g, Graph** train_g) {
   }
 }

-static void BM_RMSProp(int iters, int params) {
-  const int64 tot = static_cast<int64>(iters) * params;
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
+static void BM_RMSProp(::testing::benchmark::State& state) {
+  const int params = state.range(0);
+
   Graph* init;
   Graph* train;
   RMSProp(params, &init, &train);
-  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+  test::Benchmark("cpu", train, GetOptions(), init, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  const int64 tot = static_cast<int64>(state.iterations()) * params;
+  state.SetItemsProcessed(tot);
+  state.SetBytesProcessed(tot * sizeof(float));
 }
 BENCHMARK(BM_RMSProp)->Arg(128 << 10)->Arg(256 << 10);

@@ -334,14 +362,18 @@ static void AddSign(int32 n, Graph** init_g, Graph** train_g) {
   }
 }

-static void BM_AddSign(int iters, int params) {
-  const int64 tot = static_cast<int64>(iters) * params;
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
+static void BM_AddSign(::testing::benchmark::State& state) {
+  const int params = state.range(0);
+
   Graph* init;
   Graph* train;
   AddSign(params, &init, &train);
-  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+  test::Benchmark("cpu", train, GetOptions(), init, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  const int64 tot = static_cast<int64>(state.iterations()) * params;
+  state.SetItemsProcessed(tot);
+  state.SetBytesProcessed(tot * sizeof(float));
 }
 BENCHMARK(BM_AddSign)->Arg(128 << 10)->Arg(256 << 10);

@@ -371,14 +403,19 @@ static void PowerSign(int32 n, Graph** init_g, Graph** train_g) {
   }
 }

-static void BM_PowerSign(int iters, int params) {
-  const int64 tot = static_cast<int64>(iters) * params;
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
+static void BM_PowerSign(::testing::benchmark::State& state) {
+  const int params = state.range(0);
+
   Graph* init;
   Graph* train;
   PowerSign(params, &init, &train);
-  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+  test::Benchmark("cpu", train, GetOptions(), init, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+
+  const int64 tot = static_cast<int64>(state.iterations()) * params;
+  state.SetItemsProcessed(tot);
+  state.SetBytesProcessed(tot * sizeof(float));
 }
 BENCHMARK(BM_PowerSign)->Arg(128 << 10)->Arg(256 << 10);
@@ -108,11 +108,15 @@ static Graph* UnaryOpsChain(int tensor_size, int repeat_graph,
   return g;
 }

-#define BM_UnaryOpsChain(N, R, F, type) \
-  static void BM_UnaryOpsChain##_##type##_##N##_##R##_##F(int iters) { \
-    testing::ItemsProcessed(static_cast<int64>(iters) * N * R * F); \
-    test::Benchmark(#type, UnaryOpsChain(N, R, F)).Run(iters); \
-  } \
+#define BM_UnaryOpsChain(N, R, F, type) \
+  static void BM_UnaryOpsChain##_##type##_##N##_##R##_##F( \
+      ::testing::benchmark::State& state) { \
+    test::Benchmark(#type, UnaryOpsChain(N, R, F), \
+                    /*old_benchmark_api*/ false) \
+        .Run(state); \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * R * \
+                            F); \
+  } \
   BENCHMARK(BM_UnaryOpsChain##_##type##_##N##_##R##_##F);

 // Unary ops fused together.

@@ -140,11 +144,15 @@ static Graph* UnaryOpsCompo(int tensor_size, int repeat_graph,
   return g;
 }

-#define BM_UnaryOpsCompo(N, R, F, type) \
-  static void BM_UnaryOpsCompo##_##type##_##N##_##R##_##F(int iters) { \
-    testing::ItemsProcessed(static_cast<int64>(iters) * N * R * F); \
-    test::Benchmark(#type, UnaryOpsCompo(N, R, F)).Run(iters); \
-  } \
+#define BM_UnaryOpsCompo(N, R, F, type) \
+  static void BM_UnaryOpsCompo##_##type##_##N##_##R##_##F( \
+      ::testing::benchmark::State& state) { \
+    test::Benchmark(#type, UnaryOpsCompo(N, R, F), \
+                    /*old_benchmark_api*/ false) \
+        .Run(state); \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * R * \
+                            F); \
+  } \
   BENCHMARK(BM_UnaryOpsCompo##_##type##_##N##_##R##_##F);

 // BenchmarkName(tensor_size, repeat_graph, num_ops, type)
@@ -64,8 +64,10 @@ TensorProto GetRandomInt32TensorProtoWithRepeat(int dim, int repeat,
   return tensor_proto;
 }

-static void BM_Unique_INT32(int iters, int dim, int max_int) {
-  testing::StopTiming();
+void BM_Unique_INT32(::testing::benchmark::State& state) {
+  const int dim = state.range(0);
+  const int max_int = state.range(1);
+
   Graph* g = new Graph(OpRegistry::Global());

   Tensor input(DT_INT32, TensorShape({dim}));

@@ -78,16 +80,17 @@ static void BM_Unique_INT32(int iters, int dim, int max_int) {
           .Finalize(g, &node));
   FixupSourceAndSinkEdges(g);

-  testing::BytesProcessed(static_cast<int64>(iters) * dim * sizeof(int32));
-  testing::UseRealTime();
-  testing::StartTiming();
   test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
-                  "SINGLE_THREADED_EXECUTOR")
-      .Run(iters);
+                  "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * dim *
+                          sizeof(int32));
 }

-static void BM_Unique_INT32_Repeat(int iters, int dim, int max_int) {
-  testing::StopTiming();
+void BM_Unique_INT32_Repeat(::testing::benchmark::State& state) {
+  const int dim = state.range(0);
+  const int max_int = state.range(1);
+
   Graph* g = new Graph(OpRegistry::Global());

   Tensor input(DT_INT32, TensorShape({dim * 200}));

@@ -101,13 +104,11 @@ static void BM_Unique_INT32_Repeat(int iters, int dim, int max_int) {
           .Finalize(g, &node));
   FixupSourceAndSinkEdges(g);

-  testing::BytesProcessed(static_cast<int64>(iters) * dim * 200 *
-                          sizeof(int32));
-  testing::UseRealTime();
-  testing::StartTiming();
   test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
-                  "SINGLE_THREADED_EXECUTOR")
-      .Run(iters);
+                  "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * dim * 200 *
+                          sizeof(int32));
 }

 TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) {

@@ -127,8 +128,9 @@ TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) {
   return tensor_proto;
 }

-static void BM_Unique_STRING(int iters, int dim) {
-  testing::StopTiming();
+void BM_Unique_STRING(::testing::benchmark::State& state) {
+  const int dim = state.range(0);
+
   Graph* g = new Graph(OpRegistry::Global());

   Tensor input(DT_STRING, TensorShape({dim}));

@@ -140,16 +142,15 @@ static void BM_Unique_STRING(int iters, int dim) {
           .Attr("T", DT_STRING)
           .Finalize(g, &node));
   FixupSourceAndSinkEdges(g);

-  testing::BytesProcessed(static_cast<int64>(iters) * dim * sizeof(tstring));
-  testing::UseRealTime();
-  testing::StartTiming();
   test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
-                  "SINGLE_THREADED_EXECUTOR")
-      .Run(iters);
+                  "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * dim *
+                          sizeof(tstring));
 }

 BENCHMARK(BM_Unique_INT32)
+    ->UseRealTime()
     ->ArgPair(32, 1024 * 1024)
     ->ArgPair(256, 1024 * 1024)
     ->ArgPair(1024, 1024 * 1024)

@@ -168,6 +169,7 @@ BENCHMARK(BM_Unique_INT32)
     ->ArgPair(4 * 1024 * 1024, 64 * 1024 * 1024);

 BENCHMARK(BM_Unique_INT32_Repeat)
+    ->UseRealTime()
     ->ArgPair(32, 1024 * 1024)
     ->ArgPair(256, 1024 * 1024)
     ->ArgPair(1024, 1024 * 1024)

@@ -192,6 +194,7 @@ BENCHMARK(BM_Unique_INT32_Repeat)
     ->ArgPair(1024 * 1024, 64 * 1024 * 1024);

 BENCHMARK(BM_Unique_STRING)
+    ->UseRealTime()
     ->Arg(32)
     ->Arg(256)
     ->Arg(1024)
@@ -28,8 +28,8 @@ namespace {
 // Benchmark to simulate the overhead in training and serving workloads from too
 // many threads grabbing the ResourceMgr lock at the same time because of the
 // variable and queue ops.
-void ManyManyVariablesHelper(int threads, int variables, int iters) {
-  testing::StopTiming();
+void ManyManyVariablesHelper(int threads, int variables,
+                             ::testing::benchmark::State& state) {
   Graph g(OpRegistry::Global());
   std::vector<string> targets;
   for (int i = 0; i < variables; ++i) {

@@ -50,16 +50,16 @@ void ManyManyVariablesHelper(int threads, int variables, int iters) {
   Session* sess = NewSession(opts);
   TF_CHECK_OK(sess->Create(gd));
   TF_CHECK_OK(sess->Run({}, {}, targets, nullptr));
-  testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     TF_CHECK_OK(sess->Run({}, {}, targets, nullptr));
   }
-  testing::StopTiming();
   delete sess;
 }

-void BM_ManyManyVariablesManyThreads(int iters, int threads) {
-  ManyManyVariablesHelper(threads, 1000, iters);
+void BM_ManyManyVariablesManyThreads(::testing::benchmark::State& state) {
+  const int threads = state.range(0);
+
+  ManyManyVariablesHelper(threads, 1000, state);
 }

 BENCHMARK(BM_ManyManyVariablesManyThreads)->Arg(50);
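Benchmarks that drive the work themselves, such as BM_ValidateStridedSliceOp and ManyManyVariablesHelper above, replace the explicit StartTiming / counted loop / StopTiming sequence with a range-for over the State object, which starts and stops the timer around the loop body automatically. A minimal sketch of that sub-pattern, using the session call from ManyManyVariablesHelper as the per-iteration work:

// Old:
//   testing::StartTiming();
//   for (int i = 0; i < iters; ++i) {
//     TF_CHECK_OK(sess->Run({}, {}, targets, nullptr));
//   }
//   testing::StopTiming();
// New: iterating the state times exactly the body of the loop.
for (auto s : state) {
  TF_CHECK_OK(sess->Run({}, {}, targets, nullptr));
}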
@@ -33,11 +33,14 @@ static Graph* Xent(int batch_size, int num_classes) {
   return g;
 }

-#define BM_XentDev(BATCH, CLASS, DEVICE) \
-  static void BM_Xent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \
-    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
-    test::Benchmark(#DEVICE, Xent(BATCH, CLASS)).Run(iters); \
-  } \
+#define BM_XentDev(BATCH, CLASS, DEVICE) \
+  static void BM_Xent##_##BATCH##_##CLASS##_##DEVICE( \
+      ::testing::benchmark::State& state) { \
+    test::Benchmark(#DEVICE, Xent(BATCH, CLASS), /*old_benchmark_api*/ false) \
+        .Run(state); \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * BATCH * \
+                            CLASS); \
+  } \
   BENCHMARK(BM_Xent##_##BATCH##_##CLASS##_##DEVICE);

 /// The representative tests for ptb_word on GPU