diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index 78730cbdcb8..74f2e38b8ab 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -2100,10 +2100,14 @@ std::unique_ptr MakeBenchmarkWhileBody() { return builder.Build(); } -void BM_SequentialWhiles(int num_iters, int num_whiles) { +void BM_SequentialWhiles(::testing::benchmark::State& state) { + const int num_whiles = state.range(0); + // This benchmark constructs a chain of sequential while instructions. - tensorflow::testing::StopTiming(); - for (int i = 0; i < num_iters; ++i) { + // Timer starts automatically at the first iteration of this loop + // and ends after the last one. + for (auto s : state) { + state.PauseTiming(); HloModuleConfig config; config.set_debug_options(GetDebugOptionsFromFlags()); HloModule module("BM_SequentialWhiles", config); @@ -2131,19 +2135,22 @@ void BM_SequentialWhiles(int num_iters, int num_whiles) { CopyInsertion copy_insertion; - tensorflow::testing::StartTiming(); + state.ResumeTiming(); ASSERT_IS_OK(copy_insertion.Run(&module).status()); - tensorflow::testing::StopTiming(); + state.PauseTiming(); // The entry computation should have three copies, and each body has one. ASSERT_EQ(CountCopies(module), 3 + num_whiles); + state.ResumeTiming(); } } -void BM_ParallelWhiles(int num_iters, int num_whiles) { +void BM_ParallelWhiles(::testing::benchmark::State& state) { + const int num_whiles = state.range(0); + // This benchmark constructs a fan-out of parallel while instructions. - tensorflow::testing::StopTiming(); - for (int i = 0; i < num_iters; ++i) { + for (auto s : state) { + state.PauseTiming(); HloModuleConfig config; config.set_debug_options(GetDebugOptionsFromFlags()); HloModule module("BM_SequentialWhiles", config); @@ -2182,9 +2189,9 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) { CopyInsertion copy_insertion; - tensorflow::testing::StartTiming(); + state.ResumeTiming(); ASSERT_IS_OK(copy_insertion.Run(&module).status()); - tensorflow::testing::StopTiming(); + state.PauseTiming(); // Each body receives of copy of two of the parameters (the corresponding // elements in the body are modified), and there is one copy in each body. @@ -2209,14 +2216,15 @@ std::unique_ptr MakeBenchmarkWhileBody( return builder.Build(); } -void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) { - tensorflow::testing::StopTiming(); +void BM_ManyElementTuple(::testing::benchmark::State& state) { + const int num_tuple_inputs = state.range(0); HloModuleConfig config; config.set_debug_options(GetDebugOptionsFromFlags()); CopyInsertion copy_insertion; const Shape element_shape = ShapeUtil::MakeShape(F32, {}); std::vector tuple_params(num_tuple_inputs); - for (int i = 0; i < num_iters; ++i) { + for (auto s : state) { + state.PauseTiming(); auto builder = HloComputation::Builder("BM_ParallelWhiles"); HloModule module("BM_ManyElementTuple", config); for (int j = 0; j < num_tuple_inputs; ++j) { @@ -2234,9 +2242,8 @@ void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) { builder.AddInstruction(HloInstruction::CreateGetTupleElement( ShapeUtil::MakeShape(F32, {}), xla_while, 0)); module.AddEntryComputation(builder.Build()); - tensorflow::testing::StartTiming(); + state.ResumeTiming(); ASSERT_IS_OK(copy_insertion.Run(&module).status()); - tensorflow::testing::StopTiming(); } } diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index b04635dda03..b01caca0b1d 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -2545,8 +2545,7 @@ TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) { // Reducing many numbers should be fast because it doesn't create // intermediate Literals; the microbenchmark should finish in < 1 msec. -void BM_ReducePrecisely(int num_iters) { - tensorflow::testing::StopTiming(); +void BM_ReducePrecisely(::testing::benchmark::State& state) { HloComputation::Builder b("BM_ReducePrecisely"); HloModuleConfig config; config.set_debug_options(GetDebugOptionsFromFlags()); @@ -2574,10 +2573,11 @@ void BM_ReducePrecisely(int num_iters) { /*dimensions_to_reduce=*/{0}, add_func)); module.AddEntryComputation(b.Build()); - HloEvaluator hlo_eval; - tensorflow::testing::StartTiming(); - hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie(); - tensorflow::testing::StopTiming(); + // Benchmark loop + for (auto s : state) { + HloEvaluator hlo_eval; + hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie(); + } } BENCHMARK(BM_ReducePrecisely); diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc index 49751d10c5a..763d89e57fa 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc @@ -173,8 +173,10 @@ TEST(ScopedShapedBufferTest, TestSubShapeTree) { // Test TakeSubTree with different depths (depth of ShapeTree) and fan-outs // (cardinality of each non-leaf node's children). -void BM_TakeSubTree(int iters, int depth, int fan_out) { - tensorflow::testing::StopTiming(); +void BM_TakeSubTree(::testing::benchmark::State& state) { + const int depth = state.range(0); + const int fan_out = state.range(1); + TestAllocator allocator; xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128}); for (int i = 0; i < depth; ++i) { @@ -183,13 +185,11 @@ void BM_TakeSubTree(int iters, int depth, int fan_out) { } xla::ScopedShapedBuffer shaped_buffer(shape, /*allocator=*/&allocator, /*device_ordinal=*/0); - tensorflow::testing::StartTiming(); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { // Extract a buffer from approximately the middle of the first level of the // tree. (void)shaped_buffer.TakeSubTree(/*index=*/{fan_out / 2}).release(); } - tensorflow::testing::StopTiming(); } BENCHMARK(BM_TakeSubTree) diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc index c294355e269..9078f674fa0 100644 --- a/tensorflow/compiler/xla/shape_tree_test.cc +++ b/tensorflow/compiler/xla/shape_tree_test.cc @@ -535,94 +535,100 @@ TEST_F(ShapeTreeTest, ReverseIterateOrderLeaves) { })); } -void BM_Construct(int iters, int depth, int fan_out) { - tensorflow::testing::StopTiming(); +void BM_Construct(::testing::benchmark::State& state) { + const int depth = state.range(0); + const int fan_out = state.range(1); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); for (int i = 0; i < depth; ++i) { std::vector shapes(fan_out, shape); shape = ShapeUtil::MakeTupleShape(shapes); } - tensorflow::testing::StartTiming(); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { ShapeTree shape_tree(shape); } } -void BM_ConstructUnowned(int iters, int depth, int fan_out) { - tensorflow::testing::StopTiming(); +void BM_ConstructUnowned(::testing::benchmark::State& state) { + const int depth = state.range(0); + const int fan_out = state.range(1); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); for (int i = 0; i < depth; ++i) { std::vector shapes(fan_out, shape); shape = ShapeUtil::MakeTupleShape(shapes); } - tensorflow::testing::StartTiming(); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { ShapeTree shape_tree(&shape); } } -void BM_Copy(int iters, int depth, int fan_out) { - tensorflow::testing::StopTiming(); +void BM_Copy(::testing::benchmark::State& state) { + const int depth = state.range(0); + const int fan_out = state.range(1); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); for (int i = 0; i < depth; ++i) { std::vector shapes(fan_out, shape); shape = ShapeUtil::MakeTupleShape(shapes); } - tensorflow::testing::StartTiming(); ShapeTree shape_tree(shape); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { ShapeTree copy = shape_tree; tensorflow::testing::DoNotOptimize(copy); } } -void BM_Move(int iters, int depth, int fan_out) { - tensorflow::testing::StopTiming(); +void BM_Move(::testing::benchmark::State& state) { + const int depth = state.range(0); + const int fan_out = state.range(1); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); for (int i = 0; i < depth; ++i) { std::vector shapes(fan_out, shape); shape = ShapeUtil::MakeTupleShape(shapes); } - tensorflow::testing::StartTiming(); ShapeTree shape_tree(shape); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { ShapeTree copy = std::move(shape_tree); shape_tree = std::move(copy); } } -void BM_ForEach(int iters, int depth, int fan_out) { - tensorflow::testing::StopTiming(); +void BM_ForEach(::testing::benchmark::State& state) { + const int depth = state.range(0); + const int fan_out = state.range(1); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); for (int i = 0; i < depth; ++i) { std::vector shapes(fan_out, shape); shape = ShapeUtil::MakeTupleShape(shapes); } - tensorflow::testing::StartTiming(); ShapeTree shape_tree(shape); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { shape_tree.ForEachMutableElement([](const ShapeIndex& index, int* data) { tensorflow::testing::DoNotOptimize(index); }); } } -void BM_Iterate(int iters, int depth, int fan_out) { - tensorflow::testing::StopTiming(); +void BM_Iterate(::testing::benchmark::State& state) { + const int depth = state.range(0); + const int fan_out = state.range(1); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); for (int i = 0; i < depth; ++i) { std::vector shapes(fan_out, shape); shape = ShapeUtil::MakeTupleShape(shapes); } - tensorflow::testing::StartTiming(); ShapeTree shape_tree(shape); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { for (auto& iter : shape_tree) { tensorflow::testing::DoNotOptimize(iter.second); } diff --git a/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc b/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc index 2a1eed7c7a7..c884fcca25b 100644 --- a/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc +++ b/tensorflow/compiler/xla/tests/cpu_gpu_fusion_test.cc @@ -824,9 +824,8 @@ XLA_TEST_F(FusionClientLibraryTest, ManyLayoutTransformations) { ComputeAndCompare(&b, {}); } -void BM_ParallelFusion(int num_iters) { +void BM_ParallelFusion(::testing::benchmark::State& state) { // Simple element-wise computation to benchmark parallel task partitioning. - tensorflow::testing::StopTiming(); se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); @@ -915,17 +914,16 @@ void BM_ParallelFusion(int num_iters) { const int64 total_bytes = param0_dim0 * param0_dim0 + param1_dim0 * param1_dim0 + param2_dim0 * param2_dim0; - tensorflow::testing::BytesProcessed(static_cast(num_iters) * - total_bytes * sizeof(float)); - tensorflow::testing::UseRealTime(); - tensorflow::testing::StartTiming(); - for (int i = 0; i < num_iters; ++i) { + + for (auto s : state) { auto result = executable->Run({&buffer0, &buffer1, &buffer2}, options); ASSERT_TRUE(result.ok()); } + state.SetBytesProcessed(static_cast(state.iterations()) * total_bytes * + sizeof(float)); } -BENCHMARK(BM_ParallelFusion); +BENCHMARK(BM_ParallelFusion)->UseRealTime(); } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index 0974d37779e..0362d5fe1a5 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -750,9 +750,7 @@ XLA_TEST_F(HloTestBase, AddOfDUS) { EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{0, 0})); } -void BM_DynamicSlice(int num_iters) { - tensorflow::testing::StopTiming(); - +void BM_DynamicSlice(::testing::benchmark::State& state) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); se::StreamExecutorMemoryAllocator allocator(platform, executors); @@ -817,8 +815,7 @@ void BM_DynamicSlice(int num_iters) { } // Run benchmark. - tensorflow::testing::StartTiming(); - for (int i = 0; i < num_iters; ++i) { + for (auto s : state) { auto result = executable->Run(shaped_buffer_ptrs, options); ASSERT_TRUE(result.ok()); } diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc index fab1a53611f..e5fe7793769 100644 --- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc @@ -946,9 +946,7 @@ XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedOutfeedTest)) { // Benchmark that measures the overhead of the LocalClient API when running a // trivial computation -void BM_LocalClientOverhead(int num_iters) { - tensorflow::testing::StopTiming(); - +void BM_LocalClientOverhead(benchmark::State& state) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); se::StreamExecutorMemoryAllocator allocator(platform, executors); @@ -990,8 +988,7 @@ void BM_LocalClientOverhead(int num_iters) { ASSERT_IS_OK(result); } - tensorflow::testing::StartTiming(); - for (int i = 0; i < num_iters; ++i) { + for (auto s : state) { auto result = executable->Run({&buffer}, run_options); ASSERT_IS_OK(result); } diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc index 697c24e6587..a343184d66e 100644 --- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc +++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc @@ -357,8 +357,8 @@ class TransferDeviceToHostBenchmark : public TransferManagerTest { using TransferManagerTest::TransferManagerTest; ~TransferDeviceToHostBenchmark() override {} - void Run(int iters, int num_tuple_elements, int array_size) { - tensorflow::testing::StopTiming(); + void Run(::testing::benchmark::State& state, int num_tuple_elements, + int array_size) { SetUp(); std::vector tuple_elements; @@ -370,13 +370,11 @@ class TransferDeviceToHostBenchmark : public TransferManagerTest { auto device_buffer = AllocateDeviceBuffer(literal.shape()); TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal, device_buffer)); - tensorflow::testing::StartTiming(); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { TF_ASSERT_OK_AND_ASSIGN( Literal result, transfer_manager_->TransferLiteralFromDevice(stream_, device_buffer)); } - tensorflow::testing::StopTiming(); TearDown(); } @@ -388,7 +386,8 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest { using TransferManagerTest::TransferManagerTest; ~TransferHostToDeviceBenchmark() override {} - void Run(int iters, int num_tuple_elements, int array_size) { + void Run(::testing::benchmark::State& state, int num_tuple_elements, + int array_size) { tensorflow::testing::StopTiming(); SetUp(); @@ -400,7 +399,7 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest { Literal literal = LiteralUtil::MakeTupleOwned(std::move(tuple_elements)); auto device_buffer = AllocateDeviceBuffer(literal.shape()); tensorflow::testing::StartTiming(); - for (int i = 0; i < iters; ++i) { + for (auto s : state) { TF_CHECK_OK(transfer_manager_->TransferLiteralToDevice(stream_, literal, device_buffer)); } @@ -411,16 +410,20 @@ class TransferHostToDeviceBenchmark : public TransferManagerTest { void TestBody() override {} }; -void BM_TransferDeviceToHost(int iters, int num_tuple_elements, - int array_size) { +void BM_TransferDeviceToHost(::testing::benchmark::State& state) { + const int num_tuple_elements = state.range(0); + const int array_size = state.range(1); + TransferDeviceToHostBenchmark bm; - bm.Run(iters, num_tuple_elements, array_size); + bm.Run(state, num_tuple_elements, array_size); } -void BM_TransferHostToDevice(int iters, int num_tuple_elements, - int array_size) { +void BM_TransferHostToDevice(::testing::benchmark::State& state) { + const int num_tuple_elements = state.range(0); + const int array_size = state.range(1); + TransferHostToDeviceBenchmark bm; - bm.Run(iters, num_tuple_elements, array_size); + bm.Run(state, num_tuple_elements, array_size); } BENCHMARK(BM_TransferHostToDevice) diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 8e8c3605cc7..73bb30f46d0 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1259,9 +1259,8 @@ XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileInfeedCondition)) { ComputeAndCompareR0(&builder, 2, {}); } -void BM_WhileLoop(int num_iters) { +void BM_WhileLoop(::testing::benchmark::State& state) { // Benchmark a simple kernel to measure while loop overheads. - tensorflow::testing::StopTiming(); se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); @@ -1330,8 +1329,7 @@ void BM_WhileLoop(int num_iters) { } // Run benchmark. - tensorflow::testing::StartTiming(); - for (int i = 0; i < num_iters; ++i) { + for (auto s : state) { auto result = executable->Run(absl::Span(), options); ASSERT_TRUE(result.ok());