Internal tests cleanup.

PiperOrigin-RevId: 339820260 Change-Id: Ic704d3ec6c1d5e0a4155b6a49c14b977bf264716
2020-10-29 23:39:35 -07:00 · 2020-10-29 23:39:35 -07:00 · b62ccde60d
commit b62ccde60d
parent 92f946352c
9 changed files with 206 additions and 208 deletions
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@ -2587,11 +2587,9 @@ TEST(DirectSessionTest,
 // A simple benchmark for the overhead of `DirectSession::Run()` calls
 // with varying numbers of feeds/fetches.
-void FeedFetchBenchmarkHelper(int iters, int num_feeds, bool use_make_callable,
+void FeedFetchBenchmarkHelper(::testing::benchmark::State& state, int num_feeds,
-                              int inter_op_threads,
+                              bool use_make_callable, int inter_op_threads,
                              bool use_single_threaded_executor) {
  testing::StopTiming();
  Tensor value(DT_FLOAT, TensorShape());
  value.flat<float>()(0) = 37.0;
@ -2643,13 +2641,11 @@ void FeedFetchBenchmarkHelper(int iters, int num_feeds, bool use_make_callable,
    }
    TF_CHECK_OK(session->MakeCallable(callable_options, &handle));
-    testing::StartTiming();
+    for (auto s : state) {
    for (int i = 0; i < iters; ++i) {
      std::vector<Tensor> output_values;
      TF_CHECK_OK(
          session->RunCallable(handle, input_tensors, &output_values, nullptr));
    }
    testing::StopTiming();
  } else {
    {
      // NOTE(mrry): Ignore the first run, which will incur the graph
@ -2661,32 +2657,40 @@ void FeedFetchBenchmarkHelper(int iters, int num_feeds, bool use_make_callable,
      std::vector<Tensor> output_values;
      TF_CHECK_OK(session->Run(inputs, outputs, {}, &output_values));
    }
-    testing::StartTiming();
+
-    for (int i = 0; i < iters; ++i) {
+    for (auto s : state) {
      std::vector<Tensor> output_values;
      TF_CHECK_OK(session->Run(inputs, outputs, {}, &output_values));
    }
    testing::StopTiming();
  }
 }
-void BM_FeedFetch(int iters, int num_feeds) {
+void BM_FeedFetch(::testing::benchmark::State& state) {
-  FeedFetchBenchmarkHelper(iters, num_feeds, /* use_make_callable */ false,
+  const int num_feeds = state.range(0);
  FeedFetchBenchmarkHelper(state, num_feeds, /* use_make_callable */ false,
                           /* inter_op_threads */ 0,
                           /* use_single_threaded_executor */ false);
 }
-void BM_FeedFetchCallable(int iters, int num_feeds) {
+void BM_FeedFetchCallable(::testing::benchmark::State& state) {
-  FeedFetchBenchmarkHelper(iters, num_feeds, /* use_make_callable */ true,
+  const int num_feeds = state.range(0);
  FeedFetchBenchmarkHelper(state, num_feeds, /* use_make_callable */ true,
                           /* inter_op_threads */ 0,
                           /* use_single_threaded_executor */ false);
 }
-void BM_FeedFetchCallableSingleThread(int iters, int num_feeds) {
+void BM_FeedFetchCallableSingleThread(::testing::benchmark::State& state) {
-  FeedFetchBenchmarkHelper(iters, num_feeds, /* use_make_callable */ true,
+  const int num_feeds = state.range(0);
  FeedFetchBenchmarkHelper(state, num_feeds, /* use_make_callable */ true,
                           /* inter_op_threads */ -1,
                           /* use_single_threaded_executor */ false);
 }
-void BM_FeedFetchCallableSingleThreadExecutor(int iters, int num_feeds) {
+void BM_FeedFetchCallableSingleThreadExecutor(
-  FeedFetchBenchmarkHelper(iters, num_feeds, /* use_make_callable */ true,
+    ::testing::benchmark::State& state) {
  const int num_feeds = state.range(0);
  FeedFetchBenchmarkHelper(state, num_feeds, /* use_make_callable */ true,
                           /* inter_op_threads */ -1,
                           /* use_single_threaded_executor */ true);
 }
--- a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
@ -69,8 +69,8 @@ class TestEnv {
  Device* cpu_device_;
 };
-void BM_CreateGraph(int iters) {
+void BM_CreateGraph(::testing::benchmark::State& state) {
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
    Scope root = Scope::NewRootScope();
    auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
    auto M = ops::MatMul(root, C, C);
@ -79,8 +79,7 @@ void BM_CreateGraph(int iters) {
 }
 BENCHMARK(BM_CreateGraph);
-void BM_RunGraph(int iters) {
+void BM_RunGraph(::testing::benchmark::State& state) {
  tensorflow::testing::StopTiming();
  Scope root = Scope::NewRootScope();
  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
  auto M = ops::MatMul(root, C, C);
@ -89,28 +88,24 @@ void BM_RunGraph(int iters) {
  opts.config.set_intra_op_parallelism_threads(1);
  ClientSession sess(root, opts);
  std::vector<Tensor> outputs;
-  tensorflow::testing::StartTiming();
+  for (auto s : state) {
  for (int i = 0; i < iters; ++i) {
    outputs.clear();
    TF_CHECK_OK(sess.Run({M}, &outputs));
  }
 }
 BENCHMARK(BM_RunGraph);
-void BM_CreateAndDestroySession(int iters) {
+void BM_CreateAndDestroySession(::testing::benchmark::State& state) {
  tensorflow::testing::StopTiming();
  Scope root = Scope::NewRootScope();
  auto C = ops::Const(root, {{1.0, 2.0}, {3.0, 4.0}});
  auto M = ops::MatMul(root, C, C);
-  tensorflow::testing::StartTiming();
+  for (auto s : state) {
  for (int i = 0; i < iters; ++i) {
    ClientSession sess(root);
  }
 }
 BENCHMARK(BM_CreateAndDestroySession);
-void BM_KernelAndDeviceInit(int iters) {
+void BM_KernelAndDeviceInit(::testing::benchmark::State& state) {
  tensorflow::testing::StopTiming();
  NodeDef ndef(AttrBuilder("MatMul")
                   .Set("T", DT_FLOAT)
                   .Set("transpose_a", false)
@ -120,15 +115,13 @@ void BM_KernelAndDeviceInit(int iters) {
  TestEnv env;
  KernelAndDeviceOp k(nullptr, false, env.function_library_runtime(), nullptr,
                      nullptr, env.cpu_device());
-  tensorflow::testing::StartTiming();
+  for (auto s : state) {
  for (int i = 0; i < iters; ++i) {
    TF_CHECK_OK(k.Init({}, ndef, nullptr));
  }
 }
 BENCHMARK(BM_KernelAndDeviceInit);
-void BM_KernelAndDeviceRun(int iters) {
+void BM_KernelAndDeviceRun(::testing::benchmark::State& state) {
  tensorflow::testing::StopTiming();
  Tensor t(Input({{1.0f, 2.0f}, {3.0f, 4.0f}}).tensor());
  gtl::InlinedVector<TensorValue, 4> inputs;
  inputs.push_back(TensorValue(&t));
@ -145,8 +138,7 @@ void BM_KernelAndDeviceRun(int iters) {
                      nullptr, env.cpu_device());
  TF_CHECK_OK(k.Init({}, ndef, nullptr));
  const EagerKernelArgs args(std::move(inputs));
-  tensorflow::testing::StartTiming();
+  for (auto s : state) {
  for (int i = 0; i < iters; ++i) {
    TF_CHECK_OK(k.Run(nullptr, args, &outputs, nullptr, absl::nullopt));
  }
 }
--- a/tensorflow/core/common_runtime/executor_test.cc
+++ b/tensorflow/core/common_runtime/executor_test.cc
@ -433,11 +433,10 @@ TEST_F(ExecutorTest, NoInputTensors) {
 // Create a graph that is 'depth' deep. At each level, fan-in and fan-out a
 // maximum of 'width' nodes. All nodes are no-ops and all dependencies are
 // control dependencies.
-static void BM_executor(int iters, int width, int depth) {
+static void BM_executor(::testing::benchmark::State& state) {
-  testing::StopTiming();
+  const int width = state.range(0);
-#ifdef PLATFORM_GOOGLE
+  const int depth = state.range(1);
-  BenchmarkUseRealTime();
+
 #endif  // PLATFORM_GOOGLE
  Graph* g = new Graph(OpRegistry::Global());
  random::PhiloxRandom philox(1729, 17);
  random::SimplePhilox rand(&philox);
@ -466,30 +465,29 @@ static void BM_executor(int iters, int width, int depth) {
      ++cur;
    }
  }
-#ifdef PLATFORM_GOOGLE
+
  SetBenchmarkLabel(strings::StrCat("Nodes = ", cur));
  SetBenchmarkItemsProcessed(cur * static_cast<int64>(iters));
 #endif  // PLATFORM_GOOGLE
  FixupSourceAndSinkEdges(g);
-  testing::StartTiming();
+  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
-  test::Benchmark("cpu", g).Run(iters);
+
  state.SetLabel(strings::StrCat("Nodes = ", cur));
  state.SetItemsProcessed(cur * static_cast<int64>(state.iterations()));
 }
 // Tall skinny graphs
-BENCHMARK(BM_executor)->ArgPair(16, 1024);
+BENCHMARK(BM_executor)->UseRealTime()->ArgPair(16, 1024);
-BENCHMARK(BM_executor)->ArgPair(32, 8192);
+BENCHMARK(BM_executor)->UseRealTime()->ArgPair(32, 8192);
 // Short fat graphs
-BENCHMARK(BM_executor)->ArgPair(1024, 16);
+BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 16);
-BENCHMARK(BM_executor)->ArgPair(8192, 32);
+BENCHMARK(BM_executor)->UseRealTime()->ArgPair(8192, 32);
 // Tall fat graph
-BENCHMARK(BM_executor)->ArgPair(1024, 1024);
+BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 1024);
 static void BM_const_identity(::testing::benchmark::State& state) {
  const int width = state.range(0);
  const int outputs_per_const = state.range(1);
 static void BM_const_identity(int iters, int width, int outputs_per_const) {
 #ifdef PLATFORM_GOOGL
  BenchmarkUseRealTime();
 #endif  // PLATFORM_GOOGLE
  Graph* g = new Graph(OpRegistry::Global());
  for (int i = 0; i < width; ++i) {
    Tensor i_t(i);
@ -499,23 +497,21 @@ static void BM_const_identity(int iters, int width, int outputs_per_const) {
    }
  }
  FixupSourceAndSinkEdges(g);
-#ifdef PLATFORM_GOOGLE
+  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
-  SetBenchmarkLabel(
+  state.SetLabel(strings::StrCat("Nodes = ", (1 + outputs_per_const) * width));
-      strings::StrCat("Nodes = ", (1 + outputs_per_const) * width));
+  state.SetItemsProcessed((1 + outputs_per_const) * width *
-  SetBenchmarkItemsProcessed((1 + outputs_per_const) * width *
+                          static_cast<int64>(state.iterations()));
                             static_cast<int64>(iters));
 #endif  // PLATFORM_GOOGLE
  test::Benchmark("cpu", g).Run(iters);
 }
 // Graph with actual op execution.
-BENCHMARK(BM_const_identity)->ArgPair(1, 1);
+BENCHMARK(BM_const_identity)
-BENCHMARK(BM_const_identity)->ArgPair(1, 100);
+    ->UseRealTime()
-BENCHMARK(BM_const_identity)->ArgPair(100, 1);
+    ->ArgPair(1, 1)
-BENCHMARK(BM_const_identity)->ArgPair(100, 100);
+    ->ArgPair(1, 100)
    ->ArgPair(100, 1)
    ->ArgPair(100, 100);
-static void BM_FeedInputFetchOutput(int iters) {
+static void BM_FeedInputFetchOutput(::testing::benchmark::State& state) {
  testing::StopTiming();
  Graph* g = new Graph(OpRegistry::Global());
  // z = x + y: x and y are provided as benchmark inputs.  z is the
  // output of the benchmark.  Conceptually, the caller is ALICE, the
@ -531,13 +527,10 @@ static void BM_FeedInputFetchOutput(int iters) {
  Tensor val(DT_FLOAT, TensorShape({}));
  val.scalar<float>()() = 3.14;
 #ifdef PLATFORM_GOOGLE
  SetBenchmarkItemsProcessed(static_cast<int64>(iters));
 #endif  // PLATFORM_GOOGLE
  FixupSourceAndSinkEdges(g);
-  testing::StartTiming();
+  test::Benchmark("cpu", g, /*old_benchmark_api=*/false)
-  test::Benchmark("cpu", g).RunWithRendezvousArgs({{x_key, val}, {y_key, val}},
+      .RunWithRendezvousArgs({{x_key, val}, {y_key, val}}, {z_key}, state);
-                                                  {z_key}, iters);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }
 BENCHMARK(BM_FeedInputFetchOutput);
@ -549,9 +542,8 @@ BENCHMARK(BM_FeedInputFetchOutput);
 //
 // ...using the functional `WhileOp` (if `lower` is false) or the
 // `Switch`/`Merge`-style of control flow (if `lower` is true).
-static void BM_WhileLoopHelper(int iters, int loop_iters, int loop_vars,
+static void BM_WhileLoopHelper(::testing::benchmark::State& state,
-                               bool lower) {
+                               int loop_iters, int loop_vars, bool lower) {
  testing::StopTiming();
  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
  // Add test functions for cond and body.
@ -661,12 +653,15 @@ static void BM_WhileLoopHelper(int iters, int loop_iters, int loop_vars,
  }
  FixupSourceAndSinkEdges(graph.get());
-  testing::StartTiming();
+  test::Benchmark("cpu", graph.release(), /*old_benchmark_api=*/false)
-  test::Benchmark("cpu", graph.release()).Run(iters);
+      .Run(state);
 }
-static void BM_LoweredWhileLoop(int iters, int loop_iters, int loop_vars) {
+static void BM_LoweredWhileLoop(::testing::benchmark::State& state) {
-  BM_WhileLoopHelper(iters, loop_iters, loop_vars, /* lower= */ true);
+  const int loop_iters = state.range(0);
  const int loop_vars = state.range(1);
  BM_WhileLoopHelper(state, loop_iters, loop_vars, /* lower= */ true);
 }
 BENCHMARK(BM_LoweredWhileLoop)
    ->ArgPair(0, 1)
@ -680,8 +675,11 @@ BENCHMARK(BM_LoweredWhileLoop)
    ->ArgPair(100, 100)
    ->ArgPair(1000, 100);
-static void BM_FunctionalWhileLoop(int iters, int loop_iters, int loop_vars) {
+static void BM_FunctionalWhileLoop(::testing::benchmark::State& state) {
-  BM_WhileLoopHelper(iters, loop_iters, loop_vars, /* lower= */ false);
+  const int loop_iters = state.range(0);
  const int loop_vars = state.range(1);
  BM_WhileLoopHelper(state, loop_iters, loop_vars, /* lower= */ false);
 }
 BENCHMARK(BM_FunctionalWhileLoop)
    ->ArgPair(0, 1)
--- a/tensorflow/core/framework/allocator_test.cc
+++ b/tensorflow/core/framework/allocator_test.cc
@ -221,14 +221,16 @@ TEST(CustomAllocatorAttributes, TestSetterAndGetter) {
  EXPECT_FALSE(HasDeviceAllocatorAttribute(AllocatorAttributes()));
 }
-static void BM_Allocation(int iters, int arg) {
+static void BM_Allocation(::testing::benchmark::State& state) {
  const int arg = state.range(0);
  Allocator* a = cpu_allocator();
  // Exercise a few different allocation sizes
  std::vector<int> sizes = {256, 4096, 16384, 524288, 512, 1048576};
  int size_index = 0;
  if (arg) EnableCPUAllocatorStats();
-  while (--iters > 0) {
+  for (auto s : state) {
    int bytes = sizes[size_index++ % sizes.size()];
    void* p = a->AllocateRaw(1, bytes);
    a->DeallocateRaw(p);
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@ -39,60 +39,60 @@ TEST(Bfloat16Test, Conversion) {
  }
 }
-static void BM_FloatToBFloat16(int iters) {
+void BM_FloatToBFloat16(::testing::benchmark::State& state) {
  testing::StopTiming();
  static const int N = 32 << 20;
  const int64 tot = static_cast<int64>(iters) * N;
  testing::ItemsProcessed(tot);
  testing::BytesProcessed(tot * (sizeof(float) + sizeof(bfloat16)));
  float* inp = new float[N];
  bfloat16* out = new bfloat16[N];
-  testing::StartTiming();
+  for (auto s : state) {
  while (iters--) {
    FloatToBFloat16(inp, out, N);
  }
  const int64 tot = static_cast<int64>(state.iterations()) * N;
  state.SetItemsProcessed(tot);
  state.SetBytesProcessed(tot * (sizeof(float) + sizeof(bfloat16)));
  delete[] inp;
  delete[] out;
 }
 BENCHMARK(BM_FloatToBFloat16);
-static void BM_RoundFloatToBFloat16(int iters) {
+void BM_RoundFloatToBFloat16(::testing::benchmark::State& state) {
  testing::StopTiming();
  static const int N = 32 << 20;
  const int64 tot = static_cast<int64>(iters) * N;
  testing::ItemsProcessed(tot);
  testing::BytesProcessed(tot * (sizeof(float) + sizeof(bfloat16)));
  float* inp = new float[N];
  bfloat16* out = new bfloat16[N];
-  testing::StartTiming();
+  for (auto s : state) {
  while (iters--) {
    RoundFloatToBFloat16(inp, out, N);
    tensorflow::testing::DoNotOptimize(inp);
    tensorflow::testing::DoNotOptimize(out);
  }
  const int64 tot = static_cast<int64>(state.iterations()) * N;
  state.SetItemsProcessed(tot);
  state.SetBytesProcessed(tot * (sizeof(float) + sizeof(bfloat16)));
  delete[] inp;
  delete[] out;
 }
 BENCHMARK(BM_RoundFloatToBFloat16);
-static void BM_BFloat16ToFloat(int iters) {
+void BM_BFloat16ToFloat(::testing::benchmark::State& state) {
  testing::StopTiming();
  static const int N = 32 << 20;
  const int64 tot = static_cast<int64>(iters) * N;
  testing::ItemsProcessed(tot);
  testing::BytesProcessed(tot * (sizeof(float) + sizeof(bfloat16)));
  bfloat16* inp = new bfloat16[N];
  float* out = new float[N];
-  testing::StartTiming();
+  for (auto s : state) {
  while (iters--) {
    BFloat16ToFloat(inp, out, N);
  }
  const int64 tot = static_cast<int64>(state.iterations()) * N;
  state.SetItemsProcessed(tot);
  state.SetBytesProcessed(tot * (sizeof(float) + sizeof(bfloat16)));
  delete[] inp;
  delete[] out;
 }
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@ -1002,9 +1002,9 @@ TEST_F(LabelTest, Duplicate) {
                error::INVALID_ARGUMENT);
 }
-void BM_InputRangeHelper(int iters, const NodeDef& node_def,
+void BM_InputRangeHelper(::testing::benchmark::State& state,
-                         const char* input_name, int expected_start,
+                         const NodeDef& node_def, const char* input_name,
-                         int expected_stop) {
+                         int expected_start, int expected_stop) {
  Status status;
  auto device = absl::make_unique<DummyDevice>(Env::Default());
@ -1013,24 +1013,20 @@ void BM_InputRangeHelper(int iters, const NodeDef& node_def,
                                              TF_GRAPH_DEF_VERSION, &status));
  TF_CHECK_OK(status);
-  testing::StartTiming();
+  for (auto s : state) {
  for (int i = 0; i < iters; ++i) {
    int start;
    int stop;
    TF_CHECK_OK(op->InputRange(input_name, &start, &stop));
    EXPECT_EQ(expected_start, start);
    EXPECT_EQ(expected_stop, stop);
  }
  testing::StopTiming();
 }
 REGISTER_KERNEL_BUILDER(Name("ConcatV2").Device(DEVICE_CPU), DummyKernel);
 REGISTER_KERNEL_BUILDER(Name("Select").Device(DEVICE_CPU), DummyKernel);
 REGISTER_KERNEL_BUILDER(Name("MatMul").Device(DEVICE_CPU), DummyKernel);
-void BM_ConcatInputRange(int iters) {
+void BM_ConcatInputRange(::testing::benchmark::State& state) {
  testing::StopTiming();
  // Create a ConcatV2 NodeDef with 4 inputs (plus the axis).
  NodeDef node_def;
  node_def.set_name("concat-op");
@ -1048,12 +1044,10 @@ void BM_ConcatInputRange(int iters) {
    node_def.add_input(strings::StrCat("a:", i));
  }
-  BM_InputRangeHelper(iters, node_def, "values", 0, 4);
+  BM_InputRangeHelper(state, node_def, "values", 0, 4);
 }
-void BM_SelectInputRange(int iters) {
+void BM_SelectInputRange(::testing::benchmark::State& state) {
  testing::StopTiming();
  // Create a Select NodeDef with 3 inputs.
  NodeDef node_def;
  node_def.set_name("select-op");
@ -1065,11 +1059,11 @@ void BM_SelectInputRange(int iters) {
    node_def.add_input(strings::StrCat("a:", i));
  }
-  BM_InputRangeHelper(iters, node_def, "condition", 0, 1);
+  BM_InputRangeHelper(state, node_def, "condition", 0, 1);
 }
-void BM_TraceString(const int iters, const int verbose) {
+void BM_TraceString(::testing::benchmark::State& state) {
-  testing::StopTiming();
+  const int verbose = state.range(0);
  // Create a MatMul NodeDef with 2 inputs.
  NodeDef node_def;
@ -1103,11 +1097,9 @@ void BM_TraceString(const int iters, const int verbose) {
  params.inputs = &inputs;
  auto ctx = absl::make_unique<OpKernelContext>(&params);
-  testing::StartTiming();
+  for (auto s : state) {
  for (int i = 0; i < iters; ++i) {
    auto trace = op->TraceString(*ctx, verbose);
  }
  testing::StopTiming();
 }
 BENCHMARK(BM_ConcatInputRange);
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@ -434,83 +434,89 @@ TEST_F(LocalRendezvousTest, TransferDummyDeviceContext) {
  args1.device_context->Unref();
 }
-void BM_SendRecv(int iters) {
+void BM_SendRecv(::testing::benchmark::State& state) {
  Rendezvous* rendez = NewLocalRendezvous();
  Tensor orig = V("val");
  Tensor val(DT_STRING, TensorShape({}));
  bool is_dead = false;
  Rendezvous::Args args;
-  if (iters > 0) {
+
-    while (iters--) {
+  for (auto s : state) {
-      TF_CHECK_OK(rendez->Send(KeyFoo(), args, orig, is_dead));
+    TF_CHECK_OK(rendez->Send(KeyFoo(), args, orig, is_dead));
-      TF_CHECK_OK(rendez->Recv(KeyFoo(), args, &val, &is_dead));
+    TF_CHECK_OK(rendez->Recv(KeyFoo(), args, &val, &is_dead));
    }
    CHECK_EQ(V(val), V(orig));
  }
  CHECK_EQ(V(val), V(orig));
  rendez->Unref();
 }
 BENCHMARK(BM_SendRecv);
-void BM_RecvSend(int iters) {
+void BM_RecvSend(::testing::benchmark::State& state) {
  Rendezvous* rendez = NewLocalRendezvous();
  Tensor orig = V("val");
  Tensor val(DT_STRING, TensorShape({}));
  bool is_dead = false;
  Rendezvous::Args args;
-  if (iters > 0) {
+
-    while (iters--) {
+  for (auto s : state) {
-      bool received = false;
+    bool received = false;
-      rendez->RecvAsync(
+    rendez->RecvAsync(
-          KeyFoo(), args,
+        KeyFoo(), args,
-          [&val, &received](const Status& s, const Rendezvous::Args& send_args,
+        [&val, &received](const Status& /*s*/,
-                            const Rendezvous::Args& recv_args,
+                          const Rendezvous::Args& /*send_args*/,
-                            const Tensor& tensor, bool is_dead) {
+                          const Rendezvous::Args& /*recv_args*/,
-            val = tensor;
+                          const Tensor& tensor, bool /*is_dead*/) {
-            received = true;
+          val = tensor;
-          });
+          received = true;
-      TF_CHECK_OK(rendez->Send(KeyFoo(), args, orig, is_dead));
+        });
-      CHECK(received);
+    TF_CHECK_OK(rendez->Send(KeyFoo(), args, orig, is_dead));
-    }
+    CHECK(received);
    CHECK_EQ(V(val), V(orig));
  }
  CHECK_EQ(V(val), V(orig));
  rendez->Unref();
 }
 BENCHMARK(BM_RecvSend);
-void BM_PingPong(int iters) {
+void BM_PingPong(::testing::benchmark::State& state) {
-  CHECK_GT(iters, 0);
+  const int messages_count = state.range(0);
  auto* cm = new CancellationManager();
  thread::ThreadPool* pool = new thread::ThreadPool(Env::Default(), "test", 1);
-  // The main thread sends "foo" for iters times and receives "bar"
+  // Benchmark loop
-  // for iters times.  The other thread sends "bar" for iters times
+  // In each iteration:
-  // and receives "foo" for iters times.
+  // The main thread sends "foo" for messages_count times and receives "bar"
-  Rendezvous* rendez = NewLocalRendezvous();
+  // for messages_count times.  The other thread sends "bar" for
-  pool->Schedule([rendez, iters]() {
+  // messages_count times and receives "foo" for messages_count times.
-    Tensor bar = V("bar");
+  for (auto s : state) {
-    Tensor foo(DT_STRING, TensorShape({}));
+    Rendezvous* rendez = NewLocalRendezvous();
    pool->Schedule([rendez, messages_count]() {
      Tensor bar = V("bar");
      Tensor foo(DT_STRING, TensorShape({}));
      bool is_dead = false;
      Rendezvous::Args args;
      for (int i = 0; i < messages_count; ++i) {
        TF_CHECK_OK(rendez->Recv(KeyFoo(), args, &foo, &is_dead));
        TF_CHECK_OK(rendez->Send(KeyBar(), args, bar, is_dead));
      }
      CHECK_EQ("foo", V(foo));
    });
    Tensor foo = V("foo");
    Tensor bar(DT_STRING, TensorShape({}));
    bool is_dead = false;
    Rendezvous::Args args;
-    for (int i = 0; i < iters; ++i) {
+    args.cancellation_manager = cm;
-      TF_CHECK_OK(rendez->Recv(KeyFoo(), args, &foo, &is_dead));
+    for (int i = 0; i < messages_count; ++i) {
-      TF_CHECK_OK(rendez->Send(KeyBar(), args, bar, is_dead));
+      TF_CHECK_OK(rendez->Send(KeyFoo(), args, foo, is_dead));
      TF_CHECK_OK(rendez->Recv(KeyBar(), args, &bar, &is_dead));
    }
-    CHECK_EQ("foo", V(foo));
+    CHECK_EQ("bar", V(bar));
  });
  Tensor foo = V("foo");
  Tensor bar(DT_STRING, TensorShape({}));
  bool is_dead = false;
  Rendezvous::Args args;
  args.cancellation_manager = cm;
  for (int i = 0; i < iters; ++i) {
    TF_CHECK_OK(rendez->Send(KeyFoo(), args, foo, is_dead));
    TF_CHECK_OK(rendez->Recv(KeyBar(), args, &bar, &is_dead));
  }
-  CHECK_EQ("bar", V(bar));
+  state.SetItemsProcessed(messages_count * state.iterations());
  delete pool;
  delete cm;
 }
-BENCHMARK(BM_PingPong);
+BENCHMARK(BM_PingPong)->Arg(100)->Arg(200)->Arg(300);
 }  // namespace
 }  // namespace tensorflow
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@ -684,19 +684,24 @@ static std::vector<int64> MakeSizes(int arg) {
  return sizes;
 }
-static void BM_TensorShape_Init(int iters, int arg) {
+void BM_TensorShape_Init(::testing::benchmark::State& state) {
  const int arg = state.range(0);
  auto sizes = MakeSizes(arg);
-  while (--iters > 0) {
+  for (auto s : state) {
    TensorShape shape(sizes);
    tensorflow::testing::DoNotOptimize(shape.num_elements());
  }
 }
 BENCHMARK(BM_TensorShape_Init)->Arg(0)->Arg(1)->Arg(2)->Arg(3)->Arg(4);
-static void BM_TensorShape_Assign(int iters, int arg) {
+void BM_TensorShape_Assign(::testing::benchmark::State& state) {
-  TensorShape s(MakeSizes(arg));
+  const int arg = state.range(0);
-  while (--iters > 0) {
+
-    TensorShape s2 = s;
+  TensorShape shape(MakeSizes(arg));
  for (auto s : state) {
    const TensorShape s2 = shape;
    tensorflow::testing::DoNotOptimize(s2);
  }
 }
 BENCHMARK(BM_TensorShape_Assign)->Arg(0)->Arg(1)->Arg(2)->Arg(3)->Arg(4);
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@ -1468,19 +1468,19 @@ TEST(SummarizeValue, STRING_PRINT_V2) {
            x.SummarizeValue(16, true));
 }
-void BM_CreateAndDestroy(int iters) {
+void BM_CreateAndDestroy(::testing::benchmark::State& state) {
  TensorShape shape({10, 20});
-  while (--iters) {
+  for (auto s : state) {
    Tensor t(DT_FLOAT, shape);
  }
 }
 BENCHMARK(BM_CreateAndDestroy);
-void BM_Assign(int iters) {
+void BM_Assign(::testing::benchmark::State& state) {
  Tensor a(DT_FLOAT, TensorShape({10, 20}));
  Tensor b(DT_FLOAT, TensorShape({10, 20}));
  bool a_to_b = true;
-  while (--iters) {
+  for (auto s : state) {
    if (a_to_b) {
      b = a;
    } else {
@ -1498,20 +1498,20 @@ TEST(Tensor, EmptyTensorData) {
 }
 // Benchmark create and destroy a tensor, with an allocated buffer.
-void BM_CreateAndDestroyWithBuf(int iters) {
+void BM_CreateAndDestroyWithBuf(::testing::benchmark::State& state) {
  TensorShape shape({10, 20});
  Allocator* allocator = cpu_allocator();
-  while (--iters) {
+  for (auto s : state) {
    Tensor a(allocator, DT_FLOAT, shape);
  }
 }
 BENCHMARK(BM_CreateAndDestroyWithBuf);
 // Benchmark create+copy a tensor, with an allocated buffer.
-void BM_CreateAndCopyCtrWithBuf(int iters) {
+void BM_CreateAndCopyCtrWithBuf(::testing::benchmark::State& state) {
  TensorShape shape({10, 20});
  Allocator* allocator = cpu_allocator();
-  while (--iters) {
+  for (auto s : state) {
    Tensor a(allocator, DT_FLOAT, shape);
    Tensor b(a);
  }
@ -1519,10 +1519,10 @@ void BM_CreateAndCopyCtrWithBuf(int iters) {
 BENCHMARK(BM_CreateAndCopyCtrWithBuf);
 // Benchmark create+move a tensor, with an allocated buffer.
-void BM_CreateAndMoveCtrWithBuf(int iters) {
+void BM_CreateAndMoveCtrWithBuf(::testing::benchmark::State& state) {
  TensorShape shape({10, 20});
  Allocator* allocator = cpu_allocator();
-  while (--iters) {
+  for (auto s : state) {
    Tensor a(allocator, DT_FLOAT, shape);
    Tensor b(std::move(a));
  }
@ -1531,10 +1531,11 @@ BENCHMARK(BM_CreateAndMoveCtrWithBuf);
 // Benchmark creating and destroy a host-scalar tensor, using the allocator
 // interface.
-void BM_CreateAndDestroyHostScalarNonOptimized(int iters) {
+void BM_CreateAndDestroyHostScalarNonOptimized(
    ::testing::benchmark::State& state) {
  TensorShape shape({});
  Allocator* allocator = cpu_allocator();
-  while (--iters) {
+  for (auto s : state) {
    Tensor a(allocator, DT_FLOAT, shape);
    a.scalar<float>()() = 37.0;
  }
@ -1543,32 +1544,33 @@ BENCHMARK(BM_CreateAndDestroyHostScalarNonOptimized);
 // Benchmark creating and destroy a host-scalar tensor, using the specialized
 // constructor.
-void BM_CreateAndDestroyHostScalarOptimized(int iters) {
+void BM_CreateAndDestroyHostScalarOptimized(
-  while (--iters) {
+    ::testing::benchmark::State& state) {
  for (auto s : state) {
    Tensor a(37.0);
  }
 }
 BENCHMARK(BM_CreateAndDestroyHostScalarOptimized);
-static void BM_FromProto(int iters, int size) {
+void BM_FromProto(::testing::benchmark::State& state) {
-  testing::StopTiming();
+  const int size = state.range(0);
  TensorShape shape({size});
  Allocator* allocator = cpu_allocator();
  Tensor a(allocator, DT_FLOAT, shape);
  std::fill_n(a.flat<float>().data(), size, 42.0);
  TensorProto p;
  a.AsProtoField(&p);
-  testing::StartTiming();
+  for (auto s : state) {
  while (--iters) {
    Tensor b;
    ASSERT_TRUE(b.FromProto(p));
  }
  testing::StopTiming();
 }
 BENCHMARK(BM_FromProto)->Range(1, 1 << 20);
-static void BM_FromProtoCompressed(int iters, int size) {
+void BM_FromProtoCompressed(::testing::benchmark::State& state) {
-  testing::StopTiming();
+  const int size = state.range(0);
  TensorShape shape({size});
  Allocator* allocator = cpu_allocator();
  Tensor a(allocator, DT_FLOAT, shape);
@ -1576,17 +1578,16 @@ static void BM_FromProtoCompressed(int iters, int size) {
  TensorProto p;
  a.AsProtoField(&p);
  tensor::CompressTensorProtoInPlace(&p);
-  testing::StartTiming();
+  for (auto s : state) {
  while (--iters) {
    Tensor b;
    ASSERT_TRUE(b.FromProto(p));
  }
  testing::StopTiming();
 }
 BENCHMARK(BM_FromProtoCompressed)->Range(1, 1 << 20);
-static void BM_FromProtoCompressedZero(int iters, int size) {
+void BM_FromProtoCompressedZero(::testing::benchmark::State& state) {
-  testing::StopTiming();
+  const int size = state.range(0);
  TensorShape shape({size});
  Allocator* allocator = cpu_allocator();
  Tensor a(allocator, DT_FLOAT, shape);
@ -1595,12 +1596,10 @@ static void BM_FromProtoCompressedZero(int iters, int size) {
  TensorProto p;
  a.AsProtoField(&p);
  tensor::CompressTensorProtoInPlace(&p);
-  testing::StartTiming();
+  for (auto s : state) {
  while (--iters) {
    Tensor b;
    ASSERT_TRUE(b.FromProto(p));
  }
  testing::StopTiming();
 }
 BENCHMARK(BM_FromProtoCompressedZero)->Range(1, 1 << 20);