Internal tests cleanup.
PiperOrigin-RevId: 339456235
Change-Id: Ia960a93771ef371256dc10078a39421ca1faeb14

parent 08af2ba27c
commit fbac0a99f7
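The hunks below apply one pattern across several XLA test files: microbenchmarks move from the old TensorFlow signature void BM_Foo(int num_iters, ...) with manual tensorflow::testing::StartTiming() / StopTiming() calls to the Google-Benchmark-style ::testing::benchmark::State& interface, where only the body of for (auto s : state) is timed and extra numeric arguments are read with state.range(n). For orientation, a minimal before/after sketch of that pattern; the BM_Sum* functions, the vector workload, and the header path are illustrative assumptions, not code from this commit.

// Before/after sketch of the migration (illustrative only; assumes the
// TensorFlow benchmark shim tensorflow/core/platform/test_benchmark.h,
// which supplies ::testing::benchmark::State and the BENCHMARK macro).
#include <numeric>
#include <vector>

#include "tensorflow/core/platform/test_benchmark.h"

// Old style: the framework hands in an iteration count and the benchmark
// brackets the measured region with StartTiming()/StopTiming() by hand.
void BM_SumOld(int num_iters, int size) {
  tensorflow::testing::StopTiming();
  std::vector<int> v(size, 1);  // setup, excluded from timing
  tensorflow::testing::StartTiming();
  for (int i = 0; i < num_iters; ++i) {
    tensorflow::testing::DoNotOptimize(std::accumulate(v.begin(), v.end(), 0));
  }
  tensorflow::testing::StopTiming();
}

// New style: a State& parameter; only the body of the state loop is timed,
// and numeric arguments arrive through state.range(n).
void BM_SumNew(::testing::benchmark::State& state) {
  const int size = state.range(0);
  std::vector<int> v(size, 1);  // setup before the loop is not timed
  for (auto s : state) {
    tensorflow::testing::DoNotOptimize(std::accumulate(v.begin(), v.end(), 0));
  }
}
BENCHMARK(BM_SumNew)->Arg(1 << 10);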
@@ -2100,10 +2100,14 @@ std::unique_ptr<HloComputation> MakeBenchmarkWhileBody() {
   return builder.Build();
 }
 
-void BM_SequentialWhiles(int num_iters, int num_whiles) {
+void BM_SequentialWhiles(::testing::benchmark::State& state) {
+  const int num_whiles = state.range(0);
+
   // This benchmark constructs a chain of sequential while instructions.
-  tensorflow::testing::StopTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  // Timer starts automatically at the first iteration of this loop
+  // and ends after the last one.
+  for (auto s : state) {
+    state.PauseTiming();
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
@@ -2131,19 +2135,22 @@ void BM_SequentialWhiles(int num_iters, int num_whiles) {
 
     CopyInsertion copy_insertion;
 
-    tensorflow::testing::StartTiming();
+    state.ResumeTiming();
     ASSERT_IS_OK(copy_insertion.Run(&module).status());
-    tensorflow::testing::StopTiming();
+    state.PauseTiming();
 
     // The entry computation should have three copies, and each body has one.
     ASSERT_EQ(CountCopies(module), 3 + num_whiles);
+    state.ResumeTiming();
   }
 }
 
-void BM_ParallelWhiles(int num_iters, int num_whiles) {
+void BM_ParallelWhiles(::testing::benchmark::State& state) {
+  const int num_whiles = state.range(0);
+
   // This benchmark constructs a fan-out of parallel while instructions.
-  tensorflow::testing::StopTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
+    state.PauseTiming();
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
@@ -2182,9 +2189,9 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) {
 
     CopyInsertion copy_insertion;
 
-    tensorflow::testing::StartTiming();
+    state.ResumeTiming();
     ASSERT_IS_OK(copy_insertion.Run(&module).status());
-    tensorflow::testing::StopTiming();
+    state.PauseTiming();
 
     // Each body receives of copy of two of the parameters (the corresponding
     // elements in the body are modified), and there is one copy in each body.
@@ -2209,14 +2216,15 @@ std::unique_ptr<HloComputation> MakeBenchmarkWhileBody(
   return builder.Build();
 }
 
-void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
-  tensorflow::testing::StopTiming();
+void BM_ManyElementTuple(::testing::benchmark::State& state) {
+  const int num_tuple_inputs = state.range(0);
   HloModuleConfig config;
   config.set_debug_options(GetDebugOptionsFromFlags());
   CopyInsertion copy_insertion;
   const Shape element_shape = ShapeUtil::MakeShape(F32, {});
   std::vector<HloInstruction*> tuple_params(num_tuple_inputs);
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
+    state.PauseTiming();
     auto builder = HloComputation::Builder("BM_ParallelWhiles");
     HloModule module("BM_ManyElementTuple", config);
     for (int j = 0; j < num_tuple_inputs; ++j) {
@@ -2234,9 +2242,8 @@ void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
     builder.AddInstruction(HloInstruction::CreateGetTupleElement(
         ShapeUtil::MakeShape(F32, {}), xla_while, 0));
     module.AddEntryComputation(builder.Build());
-    tensorflow::testing::StartTiming();
+    state.ResumeTiming();
     ASSERT_IS_OK(copy_insertion.Run(&module).status());
-    tensorflow::testing::StopTiming();
   }
 }
 
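In BM_SequentialWhiles, BM_ParallelWhiles, and BM_ManyElementTuple above, each iteration rebuilds an HloModule and then checks CountCopies, so the conversion brackets that setup and verification with state.PauseTiming() / state.ResumeTiming() in place of the old StopTiming()/StartTiming() pairs, leaving only the copy-insertion pass itself on the clock. A self-contained sketch of the same structure, with std::sort standing in for copy_insertion.Run(...):

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

// Per-iteration setup and verification are excluded from the measurement;
// only the sort (the stand-in for the pass under test) is timed.
void BM_SortFreshInput(::testing::benchmark::State& state) {
  const int n = state.range(0);
  for (auto s : state) {
    state.PauseTiming();                 // stop the clock for setup
    std::vector<int> v(n);
    std::iota(v.rbegin(), v.rend(), 0);  // build a reverse-sorted input
    state.ResumeTiming();                // measure only the work below

    std::sort(v.begin(), v.end());

    state.PauseTiming();                 // checks should not skew the timing
    assert(std::is_sorted(v.begin(), v.end()));
    state.ResumeTiming();                // timer must be running at loop end
  }
}
BENCHMARK(BM_SortFreshInput)->Arg(1 << 12);

The trailing ResumeTiming() mirrors the one BM_SequentialWhiles adds just before its loop's closing brace: the state loop expects the timer to be running when an iteration finishes, so a benchmark that pauses for verification has to resume before falling off the end of the body.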
@@ -2545,8 +2545,7 @@ TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
 
 // Reducing many numbers should be fast because it doesn't create
 // intermediate Literals; the microbenchmark should finish in < 1 msec.
-void BM_ReducePrecisely(int num_iters) {
-  tensorflow::testing::StopTiming();
+void BM_ReducePrecisely(::testing::benchmark::State& state) {
  HloComputation::Builder b("BM_ReducePrecisely");
  HloModuleConfig config;
  config.set_debug_options(GetDebugOptionsFromFlags());
@@ -2574,10 +2573,11 @@ void BM_ReducePrecisely(int num_iters) {
                               /*dimensions_to_reduce=*/{0}, add_func));
   module.AddEntryComputation(b.Build());
 
-  HloEvaluator hlo_eval;
-  tensorflow::testing::StartTiming();
-  hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
-  tensorflow::testing::StopTiming();
+  // Benchmark loop
+  for (auto s : state) {
+    HloEvaluator hlo_eval;
+    hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  }
 }
 
 BENCHMARK(BM_ReducePrecisely);
@@ -173,8 +173,10 @@ TEST(ScopedShapedBufferTest, TestSubShapeTree) {
 
 // Test TakeSubTree with different depths (depth of ShapeTree) and fan-outs
 // (cardinality of each non-leaf node's children).
-void BM_TakeSubTree(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_TakeSubTree(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   TestAllocator allocator;
   xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
@@ -183,13 +185,11 @@ void BM_TakeSubTree(int iters, int depth, int fan_out) {
   }
   xla::ScopedShapedBuffer shaped_buffer(shape, /*allocator=*/&allocator,
                                         /*device_ordinal=*/0);
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     // Extract a buffer from approximately the middle of the first level of the
     // tree.
     (void)shaped_buffer.TakeSubTree(/*index=*/{fan_out / 2}).release();
   }
-  tensorflow::testing::StopTiming();
 }
 
 BENCHMARK(BM_TakeSubTree)
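BM_TakeSubTree now pulls depth and fan_out from state.range(0) and state.range(1) instead of taking them as extra int parameters, so its registration has to supply the values as argument pairs; the continuation of the BENCHMARK(BM_TakeSubTree) line falls outside this hunk, so the concrete pairs below are assumptions that only illustrate the mechanism.

// Hypothetical two-argument benchmark under the State-based API; the
// depth/fan-out values here are invented, the wiring is the point.
void BM_TwoRanges(::testing::benchmark::State& state) {
  const int depth = state.range(0);    // first registered argument
  const int fan_out = state.range(1);  // second registered argument
  for (auto s : state) {
    tensorflow::testing::DoNotOptimize(depth * fan_out);  // stand-in work
  }
}
// Each ArgPair() produces one benchmark instance, reported as
// BM_TwoRanges/2/8, BM_TwoRanges/4/4, and so on.
BENCHMARK(BM_TwoRanges)->ArgPair(2, 8)->ArgPair(4, 4);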
@@ -535,94 +535,100 @@ TEST_F(ShapeTreeTest, ReverseIterateOrderLeaves) {
   }));
 }
 
-void BM_Construct(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Construct(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> shape_tree(shape);
   }
 }
 
-void BM_ConstructUnowned(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_ConstructUnowned(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> shape_tree(&shape);
   }
 }
 
-void BM_Copy(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Copy(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> copy = shape_tree;
     tensorflow::testing::DoNotOptimize(copy);
   }
 }
 
-void BM_Move(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Move(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     ShapeTree<int> copy = std::move(shape_tree);
     shape_tree = std::move(copy);
   }
 }
 
-void BM_ForEach(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_ForEach(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     shape_tree.ForEachMutableElement([](const ShapeIndex& index, int* data) {
       tensorflow::testing::DoNotOptimize(index);
     });
   }
 }
 
-void BM_Iterate(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
+void BM_Iterate(::testing::benchmark::State& state) {
+  const int depth = state.range(0);
+  const int fan_out = state.range(1);
+
   Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128});
   for (int i = 0; i < depth; ++i) {
     std::vector<xla::Shape> shapes(fan_out, shape);
     shape = ShapeUtil::MakeTupleShape(shapes);
   }
-  tensorflow::testing::StartTiming();
 
   ShapeTree<int> shape_tree(shape);
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     for (auto& iter : shape_tree) {
       tensorflow::testing::DoNotOptimize(iter.second);
     }
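The ShapeTree benchmarks above (BM_Copy, BM_ForEach, BM_Iterate) wrap their per-iteration result in tensorflow::testing::DoNotOptimize(...). With the timed region now reduced to an otherwise side-effect-free loop body, that call is what forces the compiler to actually materialize the copy or the visited element instead of deleting the work and timing an empty loop. A compilable illustration; the BM_CopyVector name and workload are invented for this note:

#include <vector>

// Without the DoNotOptimize call the copy has no observable effect and may
// be optimized away entirely; with it, each iteration must perform the copy.
void BM_CopyVector(::testing::benchmark::State& state) {
  std::vector<int> source(state.range(0), 42);
  for (auto s : state) {
    std::vector<int> copy = source;            // the work being measured
    tensorflow::testing::DoNotOptimize(copy);  // keep it from being elided
  }
}
BENCHMARK(BM_CopyVector)->Arg(1 << 16);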
@@ -824,9 +824,8 @@ XLA_TEST_F(FusionClientLibraryTest, ManyLayoutTransformations) {
   ComputeAndCompare(&b, {});
 }
 
-void BM_ParallelFusion(int num_iters) {
+void BM_ParallelFusion(::testing::benchmark::State& state) {
   // Simple element-wise computation to benchmark parallel task partitioning.
-  tensorflow::testing::StopTiming();
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
@@ -915,17 +914,16 @@ void BM_ParallelFusion(int num_iters) {
   const int64 total_bytes = param0_dim0 * param0_dim0 +
                             param1_dim0 * param1_dim0 +
                             param2_dim0 * param2_dim0;
-  tensorflow::testing::BytesProcessed(static_cast<int64>(num_iters) *
-                                      total_bytes * sizeof(float));
-  tensorflow::testing::UseRealTime();
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+
+  for (auto s : state) {
     auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options);
     ASSERT_TRUE(result.ok());
   }
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * total_bytes *
+                          sizeof(float));
 }
 
-BENCHMARK(BM_ParallelFusion);
+BENCHMARK(BM_ParallelFusion)->UseRealTime();
 
 }  // namespace
 }  // namespace xla
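Two details of the BM_ParallelFusion conversion are easy to miss: throughput is now reported after the loop with state.SetBytesProcessed(state.iterations() * ...), since state.iterations() is only final once the loop has exited, and the wall-clock request moves out of the benchmark body into the registration as BENCHMARK(BM_ParallelFusion)->UseRealTime(). A self-contained sketch of the same reporting pattern; the memcpy workload is illustrative:

#include <cstdint>
#include <cstring>
#include <vector>

// Report bytes/second for a copy loop. SetBytesProcessed is called once,
// after the loop, because state.iterations() is only known at that point.
void BM_CopyBytes(::testing::benchmark::State& state) {
  const int64_t bytes = state.range(0);
  std::vector<char> src(bytes, 'x');
  std::vector<char> dst(bytes);
  for (auto s : state) {
    std::memcpy(dst.data(), src.data(), bytes);
    tensorflow::testing::DoNotOptimize(dst.data());
  }
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * bytes);
}
// Real (wall-clock) time is requested at registration, not inside the body.
BENCHMARK(BM_CopyBytes)->Arg(1 << 20)->UseRealTime();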
@@ -750,9 +750,7 @@ XLA_TEST_F(HloTestBase, AddOfDUS) {
   EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0}));
 }
 
-void BM_DynamicSlice(int num_iters) {
-  tensorflow::testing::StopTiming();
-
+void BM_DynamicSlice(::testing::benchmark::State& state) {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
   se::StreamExecutorMemoryAllocator allocator(platform, executors);
@@ -817,8 +815,7 @@ void BM_DynamicSlice(int num_iters) {
   }
 
   // Run benchmark.
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
     auto result = executable->Run(shaped_buffer_ptrs, options);
     ASSERT_TRUE(result.ok());
   }
@@ -946,9 +946,7 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedOutfeedTest)) {
 
 // Benchmark that measures the overhead of the LocalClient API when running a
 // trivial computation
-void BM_LocalClientOverhead(int num_iters) {
-  tensorflow::testing::StopTiming();
-
+void BM_LocalClientOverhead(benchmark::State& state) {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
   se::StreamExecutorMemoryAllocator allocator(platform, executors);
@@ -990,8 +988,7 @@ void BM_LocalClientOverhead(int num_iters) {
     ASSERT_IS_OK(result);
   }
 
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
     auto result = executable->Run({&buffer}, run_options);
     ASSERT_IS_OK(result);
   }
@@ -357,8 +357,8 @@ class TransferDeviceToHostBenchmark : public TransferManagerTest {
   using TransferManagerTest::TransferManagerTest;
   ~TransferDeviceToHostBenchmark() override {}
 
-  void Run(int iters, int num_tuple_elements, int array_size) {
-    tensorflow::testing::StopTiming();
+  void Run(::testing::benchmark::State& state, int num_tuple_elements,
+           int array_size) {
     SetUp();
 
     std::vector<Literal> tuple_elements;
@@ -370,13 +370,11 @@ class TransferDeviceToHostBenchmark : public TransferManagerTest {
     auto device_buffer = AllocateDeviceBuffer(literal.shape());
     TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal,
                                                            device_buffer));
-    tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters; ++i) {
+    for (auto s : state) {
       TF_ASSERT_OK_AND_ASSIGN(
           Literal result,
           transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer));
     }
-    tensorflow::testing::StopTiming();
     TearDown();
   }
 
@@ -388,7 +386,8 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest {
   using TransferManagerTest::TransferManagerTest;
   ~TransferHostToDeviceBenchmark() override {}
 
-  void Run(int iters, int num_tuple_elements, int array_size) {
+  void Run(::testing::benchmark::State& state, int num_tuple_elements,
+           int array_size) {
     tensorflow::testing::StopTiming();
     SetUp();
 
@@ -400,7 +399,7 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest {
     Literal literal = LiteralUtil::MakeTupleOwned(std::move(tuple_elements));
     auto device_buffer = AllocateDeviceBuffer(literal.shape());
     tensorflow::testing::StartTiming();
-    for (int i = 0; i < iters; ++i) {
+    for (auto s : state) {
       TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal,
                                                              device_buffer));
     }
@@ -411,16 +410,20 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest {
   void TestBody() override {}
 };
 
-void BM_TransferDeviceToHost(int iters, int num_tuple_elements,
-                             int array_size) {
+void BM_TransferDeviceToHost(::testing::benchmark::State& state) {
+  const int num_tuple_elements = state.range(0);
+  const int array_size = state.range(1);
+
   TransferDeviceToHostBenchmark bm;
-  bm.Run(iters, num_tuple_elements, array_size);
+  bm.Run(state, num_tuple_elements, array_size);
 }
 
-void BM_TransferHostToDevice(int iters, int num_tuple_elements,
-                             int array_size) {
+void BM_TransferHostToDevice(::testing::benchmark::State& state) {
+  const int num_tuple_elements = state.range(0);
+  const int array_size = state.range(1);
+
   TransferHostToDeviceBenchmark bm;
-  bm.Run(iters, num_tuple_elements, array_size);
+  bm.Run(state, num_tuple_elements, array_size);
 }
 
 BENCHMARK(BM_TransferHostToDevice)
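The TransferManager benchmarks keep their test-fixture classes and simply thread the State& through a Run(state, ...) method; the BM_TransferDeviceToHost / BM_TransferHostToDevice free functions shrink to thin wrappers that unpack state.range(0) / state.range(1) and forward. That keeps SetUp()/TearDown() on the fixture while giving the BENCHMARK macro the State-based signature it now expects. A minimal sketch of the wrapper shape; FooBenchmark and its workload are hypothetical:

#include <vector>

// Made-up fixture standing in for TransferDeviceToHostBenchmark above.
class FooBenchmark {
 public:
  void SetUp() { data_.assign(1 << 12, 1); }
  void TearDown() { data_.clear(); }

  // The fixture method owns setup/teardown and runs the timed loop.
  void Run(::testing::benchmark::State& state, int repeats, int stride) {
    SetUp();
    for (auto s : state) {
      int sum = 0;
      for (int r = 0; r < repeats; ++r) {
        for (size_t i = 0; i < data_.size(); i += stride) sum += data_[i];
      }
      tensorflow::testing::DoNotOptimize(sum);
    }
    TearDown();
  }

 private:
  std::vector<int> data_;
};

// The free function the BENCHMARK macro sees only unpacks the ranges.
void BM_Foo(::testing::benchmark::State& state) {
  const int repeats = state.range(0);
  const int stride = state.range(1);
  FooBenchmark bm;
  bm.Run(state, repeats, stride);
}
BENCHMARK(BM_Foo)->ArgPair(4, 1)->ArgPair(4, 8);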
@@ -1259,9 +1259,8 @@ XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileInfeedCondition)) {
   ComputeAndCompareR0<int32>(&builder, 2, {});
 }
 
-void BM_WhileLoop(int num_iters) {
+void BM_WhileLoop(::testing::benchmark::State& state) {
   // Benchmark a simple kernel to measure while loop overheads.
-  tensorflow::testing::StopTiming();
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
   auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
@@ -1330,8 +1329,7 @@ void BM_WhileLoop(int num_iters) {
   }
 
   // Run benchmark.
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < num_iters; ++i) {
+  for (auto s : state) {
     auto result =
         executable->Run(absl::Span<const ShapedBuffer* const>(), options);
     ASSERT_TRUE(result.ok());