From 5008bbbca44ef43db8c92be8516badf041f6edba Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 27 Oct 2020 21:24:10 -0700 Subject: [PATCH] Internal tests cleanup. PiperOrigin-RevId: 339390176 Change-Id: Ie0480a0d8d78bb1a50db434c7f456d407a72444c --- .../core/kernels/basic_ops_benchmark_test.cc | 11 +- .../basic_batch_scheduler_benchmark_test.cc | 35 ++- tensorflow/core/kernels/bias_op_test.cc | 35 +-- tensorflow/core/kernels/bincount_op_test.cc | 14 +- .../core/kernels/broadcast_to_op_test.cc | 50 ++-- tensorflow/core/kernels/cast_op_test.cc | 161 +++++++----- .../core/kernels/clustering_ops_test.cc | 67 +++-- tensorflow/core/kernels/concat_op_test.cc | 159 ++++++----- tensorflow/core/kernels/constant_op_test.cc | 20 +- .../core/kernels/conv_ops_benchmark_test.cc | 137 +++++----- tensorflow/core/kernels/cwise_ops_test.cc | 246 ++++++++++-------- .../data/single_threaded_executor_test.cc | 66 +++-- tensorflow/core/kernels/dequantize_op_test.cc | 55 ++-- tensorflow/core/kernels/diag_op_test.cc | 13 +- 14 files changed, 607 insertions(+), 462 deletions(-) diff --git a/tensorflow/core/kernels/basic_ops_benchmark_test.cc b/tensorflow/core/kernels/basic_ops_benchmark_test.cc index 5726062938b..171bf220466 100644 --- a/tensorflow/core/kernels/basic_ops_benchmark_test.cc +++ b/tensorflow/core/kernels/basic_ops_benchmark_test.cc @@ -64,13 +64,16 @@ static void MulChain(int chain_length, Graph** init_g, Graph** run_g) { // Benchmark a chain of simple multiplications. // This emphasizes per-op overhead. -static void BM_MulChain(int iters, int chain_length) { - const int64 tot = static_cast(iters) * chain_length; - testing::ItemsProcessed(tot); +static void BM_MulChain(::testing::benchmark::State& state) { + const int chain_length = state.range(0); + Graph* init; Graph* run; MulChain(chain_length, &init, &run); - test::Benchmark("cpu", run, GetOptions(), init).Run(iters); + test::Benchmark("cpu", run, GetOptions(), init, nullptr, "", + /*old_benchmark_api=*/false) + .Run(state); + state.SetItemsProcessed(state.iterations()); } BENCHMARK(BM_MulChain)->Arg(1 << 10); diff --git a/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc index af93a3ec9a6..c0f22ab1098 100644 --- a/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc +++ b/tensorflow/core/kernels/batching_util/basic_batch_scheduler_benchmark_test.cc @@ -115,7 +115,7 @@ class ThroughputBenchmark { ThroughputBenchmark& operator=(const ThroughputBenchmark&) = delete; // Perform the benchmark run, based on the parameters supplied to the ctor. - void RunBenchmark(int iters); + void RunBenchmark(::testing::benchmark::State& state); private: // Resets all mutable state, including the scheduler. @@ -136,22 +136,18 @@ ThroughputBenchmark::ThroughputBenchmark( const BasicBatchScheduler::Options& scheduler_options) : scheduler_options_(scheduler_options) {} -void ThroughputBenchmark::RunBenchmark(int iters) { - CHECK_GE(iters, 1); +void ThroughputBenchmark::RunBenchmark(::testing::benchmark::State& state) { + CHECK_GE(state.max_iterations, 1); - testing::StopTiming(); ResetState(); // Have each iteration issue a reasonably large number of tasks, to ensure our // measurements reflect steady-state behavior. const int kNumTasksPerIteration = 100 * 1000; - - testing::ItemsProcessed(iters * kNumTasksPerIteration); testing::UseRealTime(); - testing::StartTiming(); // Schedule 'num_iterations_*kNumTasksPerIteration' tasks. - for (int i = 0; i < iters; ++i) { + for (auto s : state) { for (int j = 0; j < kNumTasksPerIteration; ++j) { auto task = std::unique_ptr(new BenchmarkBatchTask); TF_CHECK_OK(scheduler_->Schedule(&task)); @@ -160,7 +156,7 @@ void ThroughputBenchmark::RunBenchmark(int iters) { // Wait for the scheduler to process all tasks. scheduler_.reset(); - testing::StopTiming(); + state.SetItemsProcessed(state.iterations() * kNumTasksPerIteration); } void ThroughputBenchmark::ResetState() { @@ -338,7 +334,8 @@ void LatencyBenchmark::PerformBatchCpuWork() const { CHECK_NE(dummy, 0); } -static void RunThroughputBenchmark(int iters, int64 batch_timeout_micros, +static void RunThroughputBenchmark(::testing::benchmark::State& state, + int64 batch_timeout_micros, int num_batch_threads) { BasicBatchScheduler::Options scheduler_options; const int kMaxBatchSize = 100; @@ -347,13 +344,14 @@ static void RunThroughputBenchmark(int iters, int64 batch_timeout_micros, scheduler_options.num_batch_threads = num_batch_threads; scheduler_options.max_enqueued_batches = INT_MAX; // Unbounded queue. ThroughputBenchmark benchmark(scheduler_options); - benchmark.RunBenchmark(iters); + benchmark.RunBenchmark(state); } -static void ThroughputBM_ZeroTimeout(int iters, int num_batch_threads) { - RunThroughputBenchmark(iters, 0 /* 0 ms timeout */, num_batch_threads); +static void ThroughputBM_ZeroTimeout(::testing::benchmark::State& state) { + RunThroughputBenchmark(state, 0 /* 0 ms timeout */, state.range(0)); } BENCHMARK(ThroughputBM_ZeroTimeout) + ->UseRealTime() ->Arg(1) ->Arg(2) ->Arg(4) @@ -362,10 +360,11 @@ BENCHMARK(ThroughputBM_ZeroTimeout) ->Arg(32) ->Arg(64); -static void ThroughputBM_SmallTimeout(int iters, int num_batch_threads) { - RunThroughputBenchmark(iters, 1 * 1000 /* 1 ms timeout */, num_batch_threads); +static void ThroughputBM_SmallTimeout(::testing::benchmark::State& state) { + RunThroughputBenchmark(state, 1 * 1000 /* 1 ms timeout */, state.range(0)); } BENCHMARK(ThroughputBM_SmallTimeout) + ->UseRealTime() ->Arg(1) ->Arg(2) ->Arg(4) @@ -374,11 +373,11 @@ BENCHMARK(ThroughputBM_SmallTimeout) ->Arg(32) ->Arg(64); -static void ThroughputBM_LargeTimeout(int iters, int num_batch_threads) { - RunThroughputBenchmark(iters, 50 * 1000 /* 50 ms timeout */, - num_batch_threads); +static void ThroughputBM_LargeTimeout(::testing::benchmark::State& state) { + RunThroughputBenchmark(state, 50 * 1000 /* 50 ms timeout */, state.range(0)); } BENCHMARK(ThroughputBM_LargeTimeout) + ->UseRealTime() ->Arg(1) ->Arg(2) ->Arg(4) diff --git a/tensorflow/core/kernels/bias_op_test.cc b/tensorflow/core/kernels/bias_op_test.cc index 2da219f7e45..6119b52bc01 100644 --- a/tensorflow/core/kernels/bias_op_test.cc +++ b/tensorflow/core/kernels/bias_op_test.cc @@ -43,22 +43,27 @@ static Graph* BiasAddGrad(int d0, int d1, int d2, int d3) { return g; } -#define BM_BiasAddNHWC(N, W, H, C, DEVICE) \ - static void BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE(int iters) { \ - testing::UseRealTime(); \ - testing::ItemsProcessed(static_cast(iters) * N * H * W * C); \ - test::Benchmark(#DEVICE, BiasAdd(N, H, W, C)).Run(iters); \ - } \ - BENCHMARK(BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE); +#define BM_BiasAddNHWC(N, W, H, C, DEVICE) \ + static void BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \ + ::testing::benchmark::State& state) { \ + test::Benchmark(#DEVICE, BiasAdd(N, H, W, C), /*old_benchmark_api=*/false) \ + .Run(state); \ + state.SetItemsProcessed(static_cast(state.iterations()) * N * H * \ + W * C); \ + } \ + BENCHMARK(BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE)->UseRealTime(); -#define BM_BiasAddGradNHWC(N, W, H, C, DEVICE) \ - static void BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \ - int iters) { \ - testing::UseRealTime(); \ - testing::ItemsProcessed(static_cast(iters) * N * H * W * C); \ - test::Benchmark(#DEVICE, BiasAddGrad(N, H, W, C)).Run(iters); \ - } \ - BENCHMARK(BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE); +#define BM_BiasAddGradNHWC(N, W, H, C, DEVICE) \ + static void BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \ + ::testing::benchmark::State& state) { \ + test::Benchmark(#DEVICE, BiasAddGrad(N, H, W, C), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + state.SetItemsProcessed(static_cast(state.iterations()) * N * H * \ + W * C); \ + } \ + BENCHMARK(BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE) \ + ->UseRealTime(); // CPU BM_BiasAddNHWC(32, 32, 32, 128, cpu); diff --git a/tensorflow/core/kernels/bincount_op_test.cc b/tensorflow/core/kernels/bincount_op_test.cc index cb04b40637a..80257fb435d 100644 --- a/tensorflow/core/kernels/bincount_op_test.cc +++ b/tensorflow/core/kernels/bincount_op_test.cc @@ -45,11 +45,15 @@ static Graph* Bincount(int arr_size, int nbins) { return g; } -#define BM_BincountDev(K, NBINS, type) \ - static void BM_Bincount##_##type##_##K##_##NBINS(int iters) { \ - testing::ItemsProcessed(static_cast(iters) * K * 1024); \ - test::Benchmark(#type, Bincount(K * 1024, NBINS)).Run(iters); \ - } \ +#define BM_BincountDev(K, NBINS, type) \ + static void BM_Bincount##_##type##_##K##_##NBINS( \ + ::testing::benchmark::State& state) { \ + test::Benchmark(#type, Bincount(K * 1024, NBINS), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + state.SetItemsProcessed(static_cast(state.iterations()) * K * \ + 1024); \ + } \ BENCHMARK(BM_Bincount##_##type##_##K##_##NBINS); BM_BincountDev(32, 1000, cpu); diff --git a/tensorflow/core/kernels/broadcast_to_op_test.cc b/tensorflow/core/kernels/broadcast_to_op_test.cc index c8cb7ddc1a8..d0b74472565 100644 --- a/tensorflow/core/kernels/broadcast_to_op_test.cc +++ b/tensorflow/core/kernels/broadcast_to_op_test.cc @@ -44,29 +44,35 @@ static Graph* BroadcastTo(int dim0, int dim1, InputShape input_shape) { return g; } -#define BM_BroadcastTo_InnerDim(DIM0, DIM1, type) \ - static void BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1(int iters) { \ - testing::UseRealTime(); \ - testing::ItemsProcessed(static_cast(iters) * DIM0 * DIM1); \ - test::Benchmark(#type, BroadcastTo(DIM0, DIM1, \ - [](int dim0, int dim1) { \ - return TensorShape({dim0, 1}); \ - })) \ - .Run(iters); \ - } \ - BENCHMARK(BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1); +#define BM_BroadcastTo_InnerDim(DIM0, DIM1, type) \ + static void BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1( \ + ::testing::benchmark::State& state) { \ + test::Benchmark(#type, \ + BroadcastTo(DIM0, DIM1, \ + [](int dim0, int dim1) { \ + return TensorShape({dim0, 1}); \ + }), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + state.SetItemsProcessed(static_cast(state.iterations()) * DIM0 * \ + DIM1); \ + } \ + BENCHMARK(BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1)->UseRealTime(); -#define BM_BroadcastTo_OuterDim(DIM0, DIM1, type) \ - static void BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1(int iters) { \ - testing::UseRealTime(); \ - testing::ItemsProcessed(static_cast(iters) * DIM0 * DIM1); \ - test::Benchmark(#type, BroadcastTo(DIM0, DIM1, \ - [](int dim0, int dim1) { \ - return TensorShape({1, dim1}); \ - })) \ - .Run(iters); \ - } \ - BENCHMARK(BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1); +#define BM_BroadcastTo_OuterDim(DIM0, DIM1, type) \ + static void BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1( \ + ::testing::benchmark::State& state) { \ + test::Benchmark(#type, \ + BroadcastTo(DIM0, DIM1, \ + [](int dim0, int dim1) { \ + return TensorShape({1, dim1}); \ + }), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + state.SetItemsProcessed(static_cast(state.iterations()) * DIM0 * \ + DIM1); \ + } \ + BENCHMARK(BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1)->UseRealTime(); BM_BroadcastTo_InnerDim(64, 64, cpu); BM_BroadcastTo_InnerDim(128, 128, cpu); diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc index 11550be4874..a7579c0705e 100644 --- a/tensorflow/core/kernels/cast_op_test.cc +++ b/tensorflow/core/kernels/cast_op_test.cc @@ -121,102 +121,127 @@ TEST_ALL_CASTS_FROM(quint16) // TODO(wicke): check conversions from/to bool, and bfloat16 -static void BM_cpu_float_int64(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * +static void BM_cpu_float_int64(::testing::benchmark::State& state) { + const int num = state.range(0); + test::Benchmark("cpu", Cast(num), /*old_benchmark_api=*/false) + .Run(state); + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * (sizeof(float) + sizeof(int64))); - testing::UseRealTime(); - test::Benchmark("cpu", Cast(num)).Run(iters); } -BENCHMARK(BM_cpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); +BENCHMARK(BM_cpu_float_int64)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); -static void BM_gpu_float_int64(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * +static void BM_gpu_float_int64(::testing::benchmark::State& state) { + const int num = state.range(0); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + test::Benchmark("gpu", Cast(num), /*old_benchmark_api=*/false) + .Run(state); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * (sizeof(float) + sizeof(int64))); - testing::UseRealTime(); -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } -BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20); +BENCHMARK(BM_gpu_float_int64)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); -static void BM_cpu_bool_float(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * +static void BM_cpu_bool_float(::testing::benchmark::State& state) { + const int num = state.range(0); + + test::Benchmark("cpu", Cast(num), /*old_benchmark_api=*/false) + .Run(state); + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * (sizeof(bool) + sizeof(float))); - testing::UseRealTime(); - test::Benchmark("cpu", Cast(num)).Run(iters); } -BENCHMARK(BM_cpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); +BENCHMARK(BM_cpu_bool_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); -static void BM_gpu_bool_float(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * +static void BM_gpu_bool_float(::testing::benchmark::State& state) { + const int num = state.range(0); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + test::Benchmark("gpu", Cast(num), /*old_benchmark_api=*/false) + .Run(state); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * (sizeof(bool) + sizeof(float))); - testing::UseRealTime(); -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } -BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20); +BENCHMARK(BM_gpu_bool_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); -static void BM_cpu_float_bfloat16(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * +static void BM_cpu_float_bfloat16(::testing::benchmark::State& state) { + const int num = state.range(0); + test::Benchmark("cpu", Cast(num), + /*old_benchmark_api=*/false) + .Run(state); + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * (sizeof(float) + sizeof(bfloat16))); - testing::UseRealTime(); - test::Benchmark("cpu", Cast(num)).Run(iters); } -BENCHMARK(BM_cpu_float_bfloat16)->Arg(64 << 10)->Arg(32 << 20); +BENCHMARK(BM_cpu_float_bfloat16)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); -static void BM_cpu_bfloat16_float(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * +static void BM_cpu_bfloat16_float(::testing::benchmark::State& state) { + const int num = state.range(0); + test::Benchmark("cpu", Cast(num), + /*old_benchmark_api=*/false) + .Run(state); + + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * (sizeof(float) + sizeof(bfloat16))); - testing::UseRealTime(); - test::Benchmark("cpu", Cast(num)).Run(iters); } -BENCHMARK(BM_cpu_bfloat16_float)->Arg(64 << 10)->Arg(32 << 20); +BENCHMARK(BM_cpu_bfloat16_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); -static void BM_cpu_float_half(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * +static void BM_cpu_float_half(::testing::benchmark::State& state) { + const int num = state.range(0); + + test::Benchmark("cpu", Cast(num), + /*old_benchmark_api=*/false) + .Run(state); + + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * (sizeof(float) + sizeof(Eigen::half))); - testing::UseRealTime(); - test::Benchmark("cpu", Cast(num)).Run(iters); } -BENCHMARK(BM_cpu_float_half)->Arg(64 << 10)->Arg(32 << 20); +BENCHMARK(BM_cpu_float_half)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); -static void BM_cpu_half_float(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * +static void BM_cpu_half_float(::testing::benchmark::State& state) { + const int num = state.range(0); + + test::Benchmark("cpu", Cast(num), + /*old_benchmark_api=*/false) + .Run(state); + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * (sizeof(float) + sizeof(Eigen::half))); - testing::UseRealTime(); - test::Benchmark("cpu", Cast(num)).Run(iters); } -BENCHMARK(BM_cpu_half_float)->Arg(64 << 10)->Arg(32 << 20); +BENCHMARK(BM_cpu_half_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); -static void BM_gpu_float_half(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * - (sizeof(float) + sizeof(Eigen::half))); - testing::UseRealTime(); +static void BM_gpu_float_half(::testing::benchmark::State& state) { + const int num = state.range(0); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - test::Benchmark("gpu", Cast(num)).Run(iters); + test::Benchmark("gpu", Cast(num), + /*old_benchmark_api=*/false) + .Run(state); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM -} -BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20); -static void BM_gpu_half_float(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - testing::BytesProcessed(static_cast(iters) * num * + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * (sizeof(float) + sizeof(Eigen::half))); - testing::UseRealTime(); -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - test::Benchmark("gpu", Cast(num)).Run(iters); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } -BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20); +BENCHMARK(BM_gpu_float_half)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); + +static void BM_gpu_half_float(::testing::benchmark::State& state) { + const int num = state.range(0); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + test::Benchmark("gpu", Cast(num), + /*old_benchmark_api=*/false) + .Run(state); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + state.SetItemsProcessed(static_cast(state.iterations()) * num); + state.SetBytesProcessed(static_cast(state.iterations()) * num * + (sizeof(float) + sizeof(Eigen::half))); +} +BENCHMARK(BM_gpu_half_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20); } // end namespace tensorflow diff --git a/tensorflow/core/kernels/clustering_ops_test.cc b/tensorflow/core/kernels/clustering_ops_test.cc index 8172a7cebb8..5b5d7472296 100644 --- a/tensorflow/core/kernels/clustering_ops_test.cc +++ b/tensorflow/core/kernels/clustering_ops_test.cc @@ -72,22 +72,21 @@ Graph* SetUpKmeansPlusPlusInitialization(int num_dims, int num_points, template -void BM_KmeansPlusPlusInitialization(int iters) { - testing::StopTiming(); - testing::ItemsProcessed(static_cast(iters) * num_points * num_dims * - num_to_sample); - testing::UseRealTime(); +void BM_KmeansPlusPlusInitialization(::testing::benchmark::State& state) { Graph* g = SetUpKmeansPlusPlusInitialization( num_dims, num_points, num_to_sample, retries_per_sample); - testing::StartTiming(); - test::Benchmark("cpu", g).Run(iters); + test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state); + state.SetItemsProcessed(static_cast(state.iterations()) * num_points * + num_dims * num_to_sample); } -#define BENCHMARK_KMEANS_PLUS_PLUS(p, c, d, r) \ - void BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r(int iters) { \ - BM_KmeansPlusPlusInitialization(iters); \ - } \ - BENCHMARK(BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r); +#define BENCHMARK_KMEANS_PLUS_PLUS(p, c, d, r) \ + void BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r( \ + ::testing::benchmark::State& state) { \ + BM_KmeansPlusPlusInitialization(state); \ + } \ + BENCHMARK(BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r) \ + ->UseRealTime(); #define RUN_BM_KmeansPlusPlusInitialization(retries) \ BENCHMARK_KMEANS_PLUS_PLUS(k10Points, k2Centers, k100Dim, retries); \ @@ -132,20 +131,18 @@ Graph* SetUpKMC2Initialization(int num_points) { } template -void BM_KMC2Initialization(int iters) { - testing::StopTiming(); - testing::ItemsProcessed(static_cast(iters) * num_points * num_dims * - num_to_sample); - testing::UseRealTime(); +void BM_KMC2Initialization(::testing::benchmark::State& state) { Graph* g = SetUpKMC2Initialization(num_points); - testing::StartTiming(); - test::Benchmark("cpu", g).Run(iters); + test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state); + state.SetItemsProcessed(static_cast(state.iterations()) * num_points * + num_dims * num_to_sample); } -#define BENCHMARK_KMC2(p, c, d) \ - void BM_KMC2Initialization_##p##_##c##_##d(int iters) { \ - BM_KMC2Initialization(iters); \ - } \ - BENCHMARK(BM_KMC2Initialization_##p##_##c##_##d); +#define BENCHMARK_KMC2(p, c, d) \ + void BM_KMC2Initialization_##p##_##c##_##d( \ + ::testing::benchmark::State& state) { \ + BM_KMC2Initialization(state); \ + } \ + BENCHMARK(BM_KMC2Initialization_##p##_##c##_##d)->UseRealTime(); #define RUN_BM_KMC2Initialization \ BENCHMARK_KMC2(k10Points, k2Centers, k100Dim); \ @@ -191,14 +188,11 @@ Graph* SetUpNearestNeighbors(int num_dims, int num_points, int num_centers, } template -void BM_NearestNeighbors(int iters) { - testing::StopTiming(); - testing::ItemsProcessed(static_cast(iters) * num_points * num_dims * - num_centers); - testing::UseRealTime(); +void BM_NearestNeighbors(::testing::benchmark::State& state) { Graph* g = SetUpNearestNeighbors(num_dims, num_points, num_centers, k); - testing::StartTiming(); - test::Benchmark("cpu", g).Run(iters); + test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state); + state.SetItemsProcessed(static_cast(state.iterations()) * num_points * + num_dims * num_centers); } constexpr int kTop1 = 1; @@ -206,11 +200,12 @@ constexpr int kTop2 = 2; constexpr int kTop5 = 5; constexpr int kTop10 = 10; -#define BENCHMARK_NEAREST_NEIGHBORS(d, p, c, k) \ - void BM_NearestNeighbors##d##_##p##_##c##_##k(int iters) { \ - BM_NearestNeighbors(iters); \ - } \ - BENCHMARK(BM_NearestNeighbors##d##_##p##_##c##_##k); +#define BENCHMARK_NEAREST_NEIGHBORS(d, p, c, k) \ + void BM_NearestNeighbors##d##_##p##_##c##_##k( \ + ::testing::benchmark::State& state) { \ + BM_NearestNeighbors(state); \ + } \ + BENCHMARK(BM_NearestNeighbors##d##_##p##_##c##_##k)->UseRealTime(); #define RUN_BM_NearestNeighbors(k) \ BENCHMARK_NEAREST_NEIGHBORS(k100Dim, k1kPoints, k100Centers, k); \ diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc index 5dffe76130d..66263a1d812 100644 --- a/tensorflow/core/kernels/concat_op_test.cc +++ b/tensorflow/core/kernels/concat_op_test.cc @@ -57,9 +57,9 @@ void FillTensorWithRandomValues(Tensor* t, int string_length, // std::string, then the length of individual strings in the tensors will be // of length "string_length". template -static void ConcatHelper(int iters, int concat_dimension, int dim2, +static void ConcatHelper(::testing::benchmark::State& state, + int concat_dimension, int dim2, int string_length = 0) { - testing::StopTiming(); Graph* g = new Graph(OpRegistry::Global()); DataType dt = DataTypeToEnum::v(); @@ -81,49 +81,82 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2, .Attr("T", dt) .Finalize(g, &node)); - testing::BytesProcessed(static_cast(iters) * (in0_bytes + in1_bytes)); - testing::StartTiming(); - test::Benchmark("cpu", g).Run(iters); - testing::UseRealTime(); + test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state); + state.SetBytesProcessed(static_cast(state.iterations()) * + (in0_bytes + in1_bytes)); } -static void BM_ConcatDim0Float(int iters, int dim2) { - ConcatHelper(iters, 0, dim2); +void BM_ConcatDim0Float(::testing::benchmark::State& state) { + const int dim2 = state.range(0); + + ConcatHelper(state, 0, dim2); } -static void BM_ConcatDim1Float(int iters, int dim2) { - ConcatHelper(iters, 1, dim2); +void BM_ConcatDim1Float(::testing::benchmark::State& state) { + const int dim2 = state.range(0); + + ConcatHelper(state, 1, dim2); } -BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000); -BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000); +BENCHMARK(BM_ConcatDim0Float) + ->UseRealTime() + ->Arg(1000) + ->Arg(100000) + ->Arg(1000000); +BENCHMARK(BM_ConcatDim1Float) + ->UseRealTime() + ->Arg(1000) + ->Arg(100000) + ->Arg(1000000); -static void BM_ConcatDim0String(int iters, int dim2, int string_length) { - ConcatHelper(iters, 0, dim2, string_length); +void BM_ConcatDim0String(::testing::benchmark::State& state) { + const int dim2 = state.range(0); + const int string_length = state.range(1); + + ConcatHelper(state, 0, dim2, string_length); } BENCHMARK(BM_ConcatDim0String) + ->UseRealTime() ->ArgPair(1, 16) ->ArgPair(1, 10000) ->ArgPair(100, 16); -static void BM_ConcatDim1uint8(int iters, int dim2) { - ConcatHelper(iters, 1, dim2); +void BM_ConcatDim1uint8(::testing::benchmark::State& state) { + const int dim2 = state.range(0); + + ConcatHelper(state, 1, dim2); } -static void BM_ConcatDim1int16(int iters, int dim2) { - ConcatHelper(iters, 1, dim2); +void BM_ConcatDim1int16(::testing::benchmark::State& state) { + const int dim2 = state.range(0); + + ConcatHelper(state, 1, dim2); } -static void BM_ConcatDim1bfloat16(int iters, int dim2) { - ConcatHelper(iters, 1, dim2); +void BM_ConcatDim1bfloat16(::testing::benchmark::State& state) { + const int dim2 = state.range(0); + + ConcatHelper(state, 1, dim2); } -BENCHMARK(BM_ConcatDim1uint8)->Arg(1000)->Arg(100000)->Arg(1000000); -BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000); -BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000); +BENCHMARK(BM_ConcatDim1uint8) + ->UseRealTime() + ->Arg(1000) + ->Arg(100000) + ->Arg(1000000); +BENCHMARK(BM_ConcatDim1int16) + ->UseRealTime() + ->Arg(1000) + ->Arg(100000) + ->Arg(1000000); +BENCHMARK(BM_ConcatDim1bfloat16) + ->UseRealTime() + ->Arg(1000) + ->Arg(100000) + ->Arg(1000000); template -static void ConcatManyHelper(int iters, int concat_dimension, int dim2) { - testing::StopTiming(); +static void ConcatManyHelper(::testing::benchmark::State& state, + int concat_dimension, int dim2) { Graph* g = new Graph(OpRegistry::Global()); DataType dt = DataTypeToEnum::v(); @@ -146,30 +179,25 @@ static void ConcatManyHelper(int iters, int concat_dimension, int dim2) { .Attr("N", 64) .Attr("T", dt) .Finalize(g, &node)); - testing::BytesProcessed(static_cast(iters) * kDim1 * dim2 * - kNumInputs * sizeof(T)); - testing::StartTiming(); - test::Benchmark("cpu", g).Run(iters); - testing::UseRealTime(); + test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); + state.SetBytesProcessed(static_cast(state.iterations()) * kDim1 * + dim2 * kNumInputs * sizeof(T)); } -static void BM_ConcatManyDim1bfloat16(int iters, int dim2) { - ConcatManyHelper(iters, 1, dim2); +void BM_ConcatManyDim1bfloat16(::testing::benchmark::State& state) { + const int dim2 = state.range(0); + + ConcatManyHelper(state, 1, dim2); } -BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60); - -static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) { - testing::StopTiming(); +BENCHMARK(BM_ConcatManyDim1bfloat16)->UseRealTime()->Arg(18)->Arg(34)->Arg(60); +void MemcpyAlternativeHelper(::testing::benchmark::State& state, int dim2) { const int kDim1 = 100; std::vector data1(kDim1 * dim2, 1.0f); std::vector data2(kDim1 * dim2, 2.0f); - testing::BytesProcessed(static_cast(iters) * - ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float)); - testing::StartTiming(); - while (--iters > 0) { + for (auto s : state) { const size_t n0 = data1.size(); const size_t n1 = data2.size(); float* result = new float[n0 + n1]; @@ -177,24 +205,37 @@ static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) { memcpy(&result[n0], &data2[0], n1 * sizeof(float)); delete[] result; } + state.SetBytesProcessed(static_cast(state.iterations()) * + ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float)); } -static void BM_MemcpyAlternativeDim0(int iters, int dim2) { - MemcpyAlternativeHelper(iters, 0, dim2); +void BM_MemcpyAlternativeDim0(::testing::benchmark::State& state) { + const int dim2 = state.range(0); + + MemcpyAlternativeHelper(state, dim2); } -static void BM_MemcpyAlternativeDim1(int iters, int dim2) { - MemcpyAlternativeHelper(iters, 1, dim2); +void BM_MemcpyAlternativeDim1(::testing::benchmark::State& state) { + const int dim2 = state.range(0); + + MemcpyAlternativeHelper(state, dim2); } -BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000); -BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); +BENCHMARK(BM_MemcpyAlternativeDim0) + ->UseRealTime() + ->Arg(1000) + ->Arg(100000) + ->Arg(1000000); +BENCHMARK(BM_MemcpyAlternativeDim1) + ->UseRealTime() + ->Arg(1000) + ->Arg(100000) + ->Arg(1000000); typedef Eigen::TensorMap, Eigen::Unaligned> EigenMap; -static void MemcpyManyAlternative1(int iters, int dim2) { - testing::StopTiming(); - +void MemcpyManyAlternative1(::testing::benchmark::State& state) { + int dim2 = state.range(0); const int kDim1 = 40000; const int kNumCopies = 64; const int size = kDim1 * dim2 * kNumCopies; @@ -202,10 +243,7 @@ static void MemcpyManyAlternative1(int iters, int dim2) { EigenMap map(data, size); map.setRandom(); - testing::BytesProcessed(static_cast(iters) * kDim1 * dim2 * - kNumCopies * sizeof(bfloat16)); - testing::StartTiming(); - while (iters-- > 0) { + for (auto s : state) { std::vector inputs(kNumCopies); for (int i = 0; i < kNumCopies; ++i) { inputs[i] = &data[i * kDim1 * dim2]; @@ -225,11 +263,12 @@ static void MemcpyManyAlternative1(int iters, int dim2) { delete[] result; } delete[] data; + state.SetBytesProcessed(static_cast(state.iterations()) * kDim1 * + dim2 * kNumCopies * sizeof(bfloat16)); } -static void MemcpyManyAlternative2(int iters, int dim2) { - testing::StopTiming(); - +void MemcpyManyAlternative2(::testing::benchmark::State& state) { + int dim2 = state.range(0); const int kDim1 = 40000; const int kNumCopies = 64; const int size = kDim1 * dim2 * kNumCopies; @@ -237,11 +276,8 @@ static void MemcpyManyAlternative2(int iters, int dim2) { EigenMap map(data, size); map.setRandom(); - testing::BytesProcessed(static_cast(iters) * kDim1 * dim2 * - kNumCopies * sizeof(bfloat16)); - testing::StartTiming(); std::vector inputs(kNumCopies); - while (--iters > 0) { + for (auto s : state) { bfloat16* result = new bfloat16[size]; for (int i = 0; i < kNumCopies; ++i) { inputs[i] = &data[i * kDim1 * dim2]; @@ -260,6 +296,9 @@ static void MemcpyManyAlternative2(int iters, int dim2) { delete[] result; } delete[] data; + + state.SetBytesProcessed(static_cast(state.iterations()) * kDim1 * + dim2 * kNumCopies * sizeof(bfloat16)); } BENCHMARK(MemcpyManyAlternative1) diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc index 7f424b49994..12372202914 100644 --- a/tensorflow/core/kernels/constant_op_test.cc +++ b/tensorflow/core/kernels/constant_op_test.cc @@ -114,15 +114,23 @@ static Graph* ManyConsts(int num, bool sequential) { return g; } -static void BM_ManyConsts_Parallel(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - test::Benchmark("cpu", ManyConsts(num, false /* !sequential */)).Run(iters); +static void BM_ManyConsts_Parallel(::testing::benchmark::State& state) { + const int num = state.range(0); + + test::Benchmark("cpu", ManyConsts(num, false /* !sequential */), + /*old_benchmark_api*/ false) + .Run(state); + state.SetItemsProcessed(static_cast(state.iterations()) * num); } BENCHMARK(BM_ManyConsts_Parallel)->Range(1, 1 << 10); -static void BM_ManyConsts_Sequential(int iters, int num) { - testing::ItemsProcessed(static_cast(iters) * num); - test::Benchmark("cpu", ManyConsts(num, true /* sequential */)).Run(iters); +static void BM_ManyConsts_Sequential(::testing::benchmark::State& state) { + const int num = state.range(0); + + test::Benchmark("cpu", ManyConsts(num, true /* sequential */), + /*old_benchmark_api*/ false) + .Run(state); + state.SetItemsProcessed(static_cast(state.iterations()) * num); } BENCHMARK(BM_ManyConsts_Sequential)->Range(1, 1 << 10); diff --git a/tensorflow/core/kernels/conv_ops_benchmark_test.cc b/tensorflow/core/kernels/conv_ops_benchmark_test.cc index 339c8e2dda6..8408c8b7ec5 100644 --- a/tensorflow/core/kernels/conv_ops_benchmark_test.cc +++ b/tensorflow/core/kernels/conv_ops_benchmark_test.cc @@ -309,104 +309,120 @@ static Graph* FusedConv2DWithBatchNorm( // The following benchmarks are always using 'float' data type with NHWC layout. // -------------------------------------------------------------------------- // -#define BM_SETUP(N, H, W, C, type, LABEL, NAME) \ - testing::ItemsProcessed(static_cast(iters) * (N) * (H) * (W) * (C)); \ - testing::SetLabel(LABEL); +#define BM_SET_INFO(N, H, W, C, type, LABEL, NAME) \ + state.SetItemsProcessed(static_cast(state.iterations()) * (N) * (H) * \ + (W) * (C)); \ + state.SetLabel(LABEL); #define BM_NAME(name, type, N, H, W, C, FW, FH, FC) \ name##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC -#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \ - static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, Conv2D(N, H, W, C, FW, FH, FC).graph) \ - .Run(iters); \ - } \ +#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \ + static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, \ + FC)(::testing::benchmark::State & state) { \ + test::Benchmark(#type, Conv2D(N, H, W, C, FW, FH, FC).graph, \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \ + } \ BENCHMARK(BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)); #define BM_Conv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \ static void BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, \ - FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ + FC)(::testing::benchmark::State & state) { \ test::Benchmark(#type, \ - Conv2DWithBias(N, H, W, C, FW, FH, FC).graph) \ - .Run(iters); \ + Conv2DWithBias(N, H, W, C, FW, FH, FC).graph, \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \ } \ BENCHMARK(BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, FC)); -#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \ - static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \ - FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, Conv2DWithBiasAndActivation(N, H, W, C, FW, \ - FH, FC, "Relu") \ - .graph) \ - .Run(iters); \ - } \ +#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \ + static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \ + FC)(::testing::benchmark::State & state) { \ + test::Benchmark( \ + #type, \ + Conv2DWithBiasAndActivation(N, H, W, C, FW, FH, FC, "Relu") \ + .graph, \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \ + } \ BENCHMARK(BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC)); -#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \ - static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, \ - FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, FusedConv2DWithBias(N, H, W, C, FW, FH, FC, \ - {"BiasAdd"})) \ - .Run(iters); \ - } \ +#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \ + static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, \ + FC)(::testing::benchmark::State & state) { \ + test::Benchmark( \ + #type, \ + FusedConv2DWithBias(N, H, W, C, FW, FH, FC, {"BiasAdd"}), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \ + } \ BENCHMARK(BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, FC)); #define BM_FusedConv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \ static void BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \ - FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, FusedConv2DWithBias(N, H, W, C, FW, FH, FC, \ - {"BiasAdd", "Relu"})) \ - .Run(iters); \ + FC)(::testing::benchmark::State & state) { \ + test::Benchmark(#type, \ + FusedConv2DWithBias(N, H, W, C, FW, FH, FC, \ + {"BiasAdd", "Relu"}), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \ } \ BENCHMARK( \ BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC)); #define BM_Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL) \ static void BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, \ - FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ + FC)(::testing::benchmark::State & state) { \ test::Benchmark(#type, \ - Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC).graph) \ - .Run(iters); \ + Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC).graph, \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \ } \ BENCHMARK(BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC)); #define BM_Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \ static void BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, \ - FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, Conv2DWithBatchNormAndActivation( \ - N, H, W, C, FW, FH, FC, "Relu") \ - .graph) \ - .Run(iters); \ + FC)(::testing::benchmark::State & state) { \ + test::Benchmark(#type, \ + Conv2DWithBatchNormAndActivation(N, H, W, C, FW, \ + FH, FC, "Relu") \ + .graph, \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \ } \ BENCHMARK( \ BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, FC)); #define BM_FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL) \ static void BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, \ - FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark(#type, FusedConv2DWithBatchNorm( \ - N, H, W, C, FW, FH, FC, {"FusedBatchNorm"})) \ - .Run(iters); \ + FC)(::testing::benchmark::State & state) { \ + test::Benchmark(#type, \ + FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, \ + {"FusedBatchNorm"}), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \ } \ BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC)); #define BM_FusedConv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, \ LABEL) \ static void BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, \ - FW, FH, FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \ - test::Benchmark( \ - #type, FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, \ - {"FusedBatchNorm", "Relu"})) \ - .Run(iters); \ + FW, FH, FC)(::testing::benchmark::State & state) { \ + test::Benchmark(#type, \ + FusedConv2DWithBatchNorm( \ + N, H, W, C, FW, FH, FC, {"FusedBatchNorm", "Relu"}), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \ } \ BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, FW, \ FH, FC)); @@ -561,11 +577,12 @@ BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32"); #define BM_Conv2DFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type) \ static void BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, \ - FC)(int iters) { \ - BM_SETUP(N, H, W, C, type, "", Conv2D); \ + FC)(::testing::benchmark::State & state) { \ test::Benchmark(#type, \ - Conv2D(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph) \ - .Run(iters); \ + Conv2D(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph, \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + BM_SET_INFO(N, H, W, C, type, "", Conv2D); \ } \ BENCHMARK(BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, FC)); diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc index 61f4b89535a..f0a0d7bccbd 100644 --- a/tensorflow/core/kernels/cwise_ops_test.cc +++ b/tensorflow/core/kernels/cwise_ops_test.cc @@ -42,15 +42,19 @@ int RowsAndColsArg(int r, int c) { return r * kRows + c; } int RowsFromArg(int arg) { return (arg / kRows); } int ColsFromArg(int arg) { return (arg % kRows); } -#define BM_UNARY(DEVICE, FUNC, T, TYPE) \ - void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) { \ - const int64 tot = static_cast(iters) * num; \ - testing::UseRealTime(); \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(T)); \ - test::Benchmark(#DEVICE, Unary(#FUNC, num, TYPE)).Run(iters); \ - } \ - BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20); +#define BM_UNARY(DEVICE, FUNC, T, TYPE) \ + void BM_##DEVICE##_##FUNC##_##TYPE(::testing::benchmark::State& state) { \ + const int num = state.range(0); \ + test::Benchmark(#DEVICE, Unary(#FUNC, num, TYPE), \ + /*old_benchmark_api*/ false) \ + .Run(state); \ + const int64 tot = static_cast(state.iterations()) * num; \ + state.SetItemsProcessed(tot); \ + state.SetBytesProcessed(tot * sizeof(T)); \ + } \ + BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE) \ + ->UseRealTime() \ + ->Range(4 << 10, 1 << 20); BM_UNARY(cpu, Floor, float, DT_FLOAT); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -101,27 +105,30 @@ Graph* BinaryScalar(int num, const string& func) { return g; } -#define BM_BINARY_SCALAR(DEVICE, FUNC) \ - void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) { \ - const int64 tot = static_cast(iters) * num; \ - testing::UseRealTime(); \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(float)); \ - test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \ - } \ - BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \ - ->Arg(1 << 12) /* must >= 4096 */ \ - ->Arg(1 << 13) \ - ->Arg(1 << 14) \ - ->Arg((1 << 15) - (1 << 13)) \ - ->Arg(1 << 15) \ - ->Arg((1 << 15) + (1 << 14)) \ - ->Arg(1 << 16) \ - ->Arg((1 << 17) - (1 << 15)) \ - ->Arg(1 << 17) \ - ->Arg((1 << 17) + (1 << 16)) \ - ->Arg(1 << 18) \ - ->Arg(1 << 19) \ +#define BM_BINARY_SCALAR(DEVICE, FUNC) \ + void BM_##DEVICE##_##FUNC##_scalar(::testing::benchmark::State& state) { \ + const int num = state.range(0); \ + \ + test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + const int64 tot = static_cast(state.iterations()) * num; \ + state.SetItemsProcessed(tot); \ + state.SetBytesProcessed(tot * sizeof(float)); \ + } \ + BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \ + ->Arg(1 << 12) /* must >= 4096 */ \ + ->Arg(1 << 13) \ + ->Arg(1 << 14) \ + ->Arg((1 << 15) - (1 << 13)) \ + ->Arg(1 << 15) \ + ->Arg((1 << 15) + (1 << 14)) \ + ->Arg(1 << 16) \ + ->Arg((1 << 17) - (1 << 15)) \ + ->Arg(1 << 17) \ + ->Arg((1 << 17) + (1 << 16)) \ + ->Arg(1 << 18) \ + ->Arg(1 << 19) \ ->Arg(1 << 20); BM_BINARY_SCALAR(cpu, Less); @@ -173,17 +180,19 @@ Graph* CubeWithMulSquare(int num) { return g; } -#define BM_CUBE(DEVICE, Impl) \ - void BM_##DEVICE##_Cube_##Impl(int iters, int num) { \ - const int64 tot = static_cast(iters) * num; \ - testing::UseRealTime(); \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(float)); \ - test::Benchmark(#DEVICE, Impl(num)).Run(iters); \ - } \ - BENCHMARK(BM_##DEVICE##_Cube_##Impl) \ - ->Arg(1 << 12) /* must >= 4096 */ \ - ->Arg(1 << 16) \ +#define BM_CUBE(DEVICE, Impl) \ + void BM_##DEVICE##_Cube_##Impl(::testing::benchmark::State& state) { \ + const int num = state.range(0); \ + \ + test::Benchmark(#DEVICE, Impl(num)).Run(state.iterations()); \ + const int64 tot = static_cast(state.iterations()) * num; \ + state.SetItemsProcessed(tot); \ + state.SetBytesProcessed(tot * sizeof(float)); \ + } \ + BENCHMARK(BM_##DEVICE##_Cube_##Impl) \ + ->UseRealTime() \ + ->Arg(1 << 12) /* must >= 4096 */ \ + ->Arg(1 << 16) \ ->Arg(1 << 20); BM_CUBE(cpu, CubeWithPow3); @@ -211,17 +220,21 @@ Graph* BiasAdd(int rows, int cols, DataType type) { return g; } -#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C) \ - void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C(int iters, int arg) { \ - const int rows = RowsFromArg(arg); \ - const int cols = ColsFromArg(arg); \ - const int64 tot = static_cast(iters) * rows * cols; \ - testing::UseRealTime(); \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(C_TYPE)); \ - test::Benchmark(#DEVICE, BiasAdd(rows, cols, TF_TYPE)).Run(iters); \ - } \ - BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C) \ +#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C) \ + void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C( \ + ::testing::benchmark::State& state) { \ + const int arg = state.range(0); \ + const int rows = RowsFromArg(arg); \ + const int cols = ColsFromArg(arg); \ + const int64 tot = static_cast(state.iterations()) * rows * cols; \ + test::Benchmark(#DEVICE, BiasAdd(rows, cols, TF_TYPE), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + state.SetItemsProcessed(tot); \ + state.SetBytesProcessed(tot * sizeof(C_TYPE)); \ + } \ + BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C) \ + ->UseRealTime() \ ->Arg(RowsAndColsArg(R, C)); #define BM_BIAS_ADD_ALL(DEVICE, C_TYPE, TF_TYPE) \ @@ -264,16 +277,21 @@ Graph* BiasAddGrad(int rows, int cols, int channels, DataType type, #define BM_BIAS_ADD_GRAD(DEVICE, FMT, C_TYPE, TF_TYPE, R, C, CH) \ void BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH( \ - int iters, int arg, int channels) { \ + ::testing::benchmark::State& state) { \ + const int arg = state.range(0); \ + const int channels = state.range(1); \ + \ const int rows = RowsFromArg(arg); \ const int cols = ColsFromArg(arg); \ - const int64 tot = static_cast(iters) * rows * cols * channels; \ - testing::UseRealTime(); \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(C_TYPE)); \ - test::Benchmark(#DEVICE, BiasAddGrad(rows, cols, channels, \ - TF_TYPE, FORMAT_##FMT)) \ - .Run(iters); \ + test::Benchmark( \ + #DEVICE, \ + BiasAddGrad(rows, cols, channels, TF_TYPE, FORMAT_##FMT), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + const int64 tot = \ + static_cast(state.iterations()) * rows * cols * channels; \ + state.SetItemsProcessed(tot); \ + state.SetBytesProcessed(tot * sizeof(C_TYPE)); \ } \ BENCHMARK(BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH) \ ->ArgPair(RowsAndColsArg(R, C), CH); @@ -326,16 +344,20 @@ Graph* BcastAdd(int rows, int cols, int dim) { return g; } -#define BM_BCAST_ADD_ROW(DEVICE, R, C) \ - void BM_##DEVICE##_BcastAddRow_R##R##_C##C(int iters, int arg) { \ - const int rows = RowsFromArg(arg); \ - const int cols = ColsFromArg(arg); \ - const int64 tot = static_cast(iters) * rows * cols; \ - testing::UseRealTime(); \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(float)); \ - test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters); \ - } \ +#define BM_BCAST_ADD_ROW(DEVICE, R, C) \ + void BM_##DEVICE##_BcastAddRow_R##R##_C##C( \ + ::testing::benchmark::State& state) { \ + const int arg = state.range(0); \ + \ + const int rows = RowsFromArg(arg); \ + const int cols = ColsFromArg(arg); \ + test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + const int64 tot = static_cast(state.iterations()) * rows * cols; \ + state.SetItemsProcessed(tot); \ + state.SetBytesProcessed(tot * sizeof(float)); \ + } \ BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C)); #define BM_BCAST_ADD_ROW_ALL(DEVICE) \ @@ -350,17 +372,24 @@ BM_BCAST_ADD_ROW_ALL(gpu); #undef BM_BCAST_ADD_ROW_ALL #undef BM_BCAST_ADD_ROW -#define BM_BCAST_ADD_COL(DEVICE, R, C) \ - void BM_##DEVICE##_BcastAddCol_R##R##_C##C(int iters, int arg) { \ - const int rows = RowsFromArg(arg); \ - const int cols = ColsFromArg(arg); \ - const int64 tot = static_cast(iters) * rows * cols; \ - testing::UseRealTime(); \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(float)); \ - test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters); \ - } \ - BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)->Arg(RowsAndColsArg(R, C)); +#define BM_BCAST_ADD_COL(DEVICE, R, C) \ + void BM_##DEVICE##_BcastAddCol_R##R##_C##C( \ + ::testing::benchmark::State& state) { \ + const int arg = state.range(0); \ + \ + const int rows = RowsFromArg(arg); \ + const int cols = ColsFromArg(arg); \ + test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + const int64 tot = static_cast(state.iterations()) * rows * cols; \ + \ + state.SetItemsProcessed(tot); \ + state.SetBytesProcessed(tot * sizeof(float)); \ + } \ + BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C) \ + ->UseRealTime() \ + ->Arg(RowsAndColsArg(R, C)); #define BM_BCAST_ADD_COL_ALL(DEVICE) \ BM_BCAST_ADD_COL(DEVICE, 512, 2048); \ @@ -374,17 +403,23 @@ BM_BCAST_ADD_COL_ALL(gpu); #undef BM_BCAST_ADD_COL_ALL #undef BM_BCAST_ADD_COL -#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C) \ - void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C(int iters, int arg) { \ - const int rows = RowsFromArg(arg); \ - const int cols = ColsFromArg(arg); \ - const int64 tot = static_cast(iters) * rows * cols; \ - testing::UseRealTime(); \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(float)); \ - test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2)).Run(iters); \ - } \ - BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C) \ +#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C) \ + void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C( \ + ::testing::benchmark::State& state) { \ + const int arg = state.range(0); \ + \ + const int rows = RowsFromArg(arg); \ + const int cols = ColsFromArg(arg); \ + test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2), \ + /*old_benchmark_api=*/false) \ + .Run(state); \ + const int64 tot = static_cast(state.iterations()) * rows * cols; \ + \ + state.SetItemsProcessed(tot); \ + state.SetBytesProcessed(tot * sizeof(float)); \ + } \ + BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C) \ + ->UseRealTime() \ ->Arg(RowsAndColsArg(R, C)); #define BM_BCAST_ADD_CROSS_RC_ALL(DEVICE) \ @@ -399,17 +434,22 @@ BM_BCAST_ADD_CROSS_RC_ALL(gpu); #undef BM_BCAST_ADD_CROSS_RC_ALL #undef BM_BCAST_ADD_CROSS_RC -#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C) \ - void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C(int iters, int arg) { \ - const int rows = RowsFromArg(arg); \ - const int cols = ColsFromArg(arg); \ - const int64 tot = static_cast(iters) * rows * cols; \ - testing::UseRealTime(); \ - testing::ItemsProcessed(tot); \ - testing::BytesProcessed(tot * sizeof(float)); \ - test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3)).Run(iters); \ - } \ - BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C) \ +#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C) \ + void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C( \ + ::testing::benchmark::State& state) { \ + const int arg = state.range(0); \ + \ + const int rows = RowsFromArg(arg); \ + const int cols = ColsFromArg(arg); \ + test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3), \ + /*old_benchmark_api*/ false) \ + .Run(state); \ + const int64 tot = static_cast(state.iterations()) * rows * cols; \ + state.SetItemsProcessed(tot); \ + state.SetBytesProcessed(tot * sizeof(float)); \ + } \ + BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C) \ + ->UseRealTime() \ ->Arg(RowsAndColsArg(R, C)); #define BM_BCAST_ADD_CROSS_CR_ALL(DEVICE) \ diff --git a/tensorflow/core/kernels/data/single_threaded_executor_test.cc b/tensorflow/core/kernels/data/single_threaded_executor_test.cc index 16ad78e5f9b..c0cb1e2f1e7 100644 --- a/tensorflow/core/kernels/data/single_threaded_executor_test.cc +++ b/tensorflow/core/kernels/data/single_threaded_executor_test.cc @@ -273,10 +273,10 @@ TEST_F(ExecutorTest, ControlDependenciesFromSpecialNodes) { EXPECT_EQ(3.0, V(retvals[0])); // out = 1.0 + 2.0 = 3.0 } -static void BM_executor(int iters, int width, int depth) { -#ifdef PLATFORM_GOOGLE - BenchmarkUseRealTime(); -#endif // PLATFORM_GOOGLE +void BM_executor(::testing::benchmark::State& state) { + const int width = state.range(0); + const int depth = state.range(1); + Graph* g = new Graph(OpRegistry::Global()); random::PhiloxRandom philox(1729, 17); random::SimplePhilox rand(&philox); @@ -306,30 +306,28 @@ static void BM_executor(int iters, int width, int depth) { } } FixupSourceAndSinkEdges(g); -#ifdef PLATFORM_GOOGLE - SetBenchmarkLabel(strings::StrCat("Nodes = ", cur)); - SetBenchmarkItemsProcessed(cur * static_cast(iters)); -#endif // PLATFORM_GOOGLE test::Benchmark("cpu", g, nullptr, nullptr, nullptr, - "SINGLE_THREADED_EXECUTOR") - .Run(iters); + "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false) + .Run(state); + state.SetLabel(strings::StrCat("Nodes = ", cur)); + state.SetItemsProcessed(cur * static_cast(state.iterations())); } // Tall skinny graphs -BENCHMARK(BM_executor)->ArgPair(16, 1024); -BENCHMARK(BM_executor)->ArgPair(32, 8192); +BENCHMARK(BM_executor)->UseRealTime()->ArgPair(16, 1024); +BENCHMARK(BM_executor)->UseRealTime()->ArgPair(32, 8192); // Short fat graphs -BENCHMARK(BM_executor)->ArgPair(1024, 16); -BENCHMARK(BM_executor)->ArgPair(8192, 32); +BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 16); +BENCHMARK(BM_executor)->UseRealTime()->ArgPair(8192, 32); // Tall fat graph -BENCHMARK(BM_executor)->ArgPair(1024, 1024); +BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 1024); + +void BM_const_identity(::testing::benchmark::State& state) { + const int width = state.range(0); + const int outputs_per_const = state.range(1); -static void BM_const_identity(int iters, int width, int outputs_per_const) { -#ifdef PLATFORM_GOOGLE - BenchmarkUseRealTime(); -#endif // PLATFORM_GOOGLE Graph* g = new Graph(OpRegistry::Global()); for (int i = 0; i < width; ++i) { Tensor i_t(i); @@ -339,22 +337,20 @@ static void BM_const_identity(int iters, int width, int outputs_per_const) { } } FixupSourceAndSinkEdges(g); -#ifdef PLATFORM_GOOGLE - SetBenchmarkLabel( - strings::StrCat("Nodes = ", (1 + outputs_per_const) * width)); - SetBenchmarkItemsProcessed((1 + outputs_per_const) * width * - static_cast(iters)); -#endif // PLATFORM_GOOGLE test::Benchmark("cpu", g, nullptr, nullptr, nullptr, - "SINGLE_THREADED_EXECUTOR") - .Run(iters); + "SINGLE_THREADED_EXECUTOR", + /*old_benchmark_api=*/false) + .Run(state); + state.SetLabel(strings::StrCat("Nodes = ", (1 + outputs_per_const) * width)); + state.SetItemsProcessed((1 + outputs_per_const) * width * + static_cast(state.iterations())); } // Graph with actual op execution. -BENCHMARK(BM_const_identity)->ArgPair(1, 1); -BENCHMARK(BM_const_identity)->ArgPair(1, 100); -BENCHMARK(BM_const_identity)->ArgPair(100, 1); -BENCHMARK(BM_const_identity)->ArgPair(100, 100); +BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(1, 1); +BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(1, 100); +BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(100, 1); +BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(100, 100); // TODO(mrry): This benchmark currently crashes with a use-after free, because // test::Benchmark::RunWithArgs() assumes that the executor will take ownership @@ -368,7 +364,7 @@ BENCHMARK(BM_const_identity)->ArgPair(100, 100); #define ALICE "/job:j/replica:0/task:0/cpu:0" #define BOB "/job:j/replica:0/task:0/gpu:0" -static void BM_FeedInputFetchOutput(int iters) { +static void BM_FeedInputFetchOutput(::testing::benchmark::State& state) { Graph* g = new Graph(OpRegistry::Global()); // z = x + y: x and y are provided as benchmark inputs. z is the // output of the benchmark. Conceptually, the caller is ALICE, the @@ -380,10 +376,10 @@ static void BM_FeedInputFetchOutput(int iters) { FixupSourceAndSinkEdges(g); Tensor val(DT_FLOAT, TensorShape({})); val.scalar()() = 3.14; - SetBenchmarkItemsProcessed(static_cast(iters)); test::Benchmark("cpu", g, nullptr, nullptr, nullptr, - "SINGLE_THREADED_EXECUTOR") - .RunWithArgs({{x, val}, {y, val}}, {z}, iters); + "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false) + .RunWithArgs({{x, val}, {y, val}}, {z}, state); + state.SetItemsProcessed(state.iterations()); } BENCHMARK(BM_FeedInputFetchOutput); #endif diff --git a/tensorflow/core/kernels/dequantize_op_test.cc b/tensorflow/core/kernels/dequantize_op_test.cc index 3c9d1790787..4dcb70b18c5 100644 --- a/tensorflow/core/kernels/dequantize_op_test.cc +++ b/tensorflow/core/kernels/dequantize_op_test.cc @@ -247,7 +247,7 @@ TEST_F(DequantizeOpTest, DequantizeScaledQint8Axis3) { } template -static void BM_DequantizeMinCombinedCpu(int iters) { +static void BM_DequantizeMinCombinedCpu(::testing::benchmark::State& state) { auto root = Scope::NewRootScope().ExitOnError(); const int64 num_values = 1500 * 250; std::vector inputs; @@ -262,25 +262,26 @@ static void BM_DequantizeMinCombinedCpu(int iters) { Graph* g = new Graph(OpRegistry::Global()); TF_CHECK_OK(root.ToGraph(g)); - test::Benchmark("cpu", g).Run(iters); - testing::BytesProcessed(iters * num_values * (sizeof(float) + sizeof(T))); - testing::ItemsProcessed(iters); + test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); + state.SetBytesProcessed(state.iterations() * num_values * + (sizeof(float) + sizeof(T))); + state.SetItemsProcessed(state.iterations()); } -static void BM_DequantizeMinCombinedCpuQuint16(int iters) { - BM_DequantizeMinCombinedCpu(iters); +void BM_DequantizeMinCombinedCpuQuint16(::testing::benchmark::State& state) { + BM_DequantizeMinCombinedCpu(state); } -static void BM_DequantizeMinCombinedCpuQint16(int iters) { - BM_DequantizeMinCombinedCpu(iters); +void BM_DequantizeMinCombinedCpuQint16(::testing::benchmark::State& state) { + BM_DequantizeMinCombinedCpu(state); } -static void BM_DequantizeMinCombinedCpuQuint8(int iters) { - BM_DequantizeMinCombinedCpu(iters); +void BM_DequantizeMinCombinedCpuQuint8(::testing::benchmark::State& state) { + BM_DequantizeMinCombinedCpu(state); } -static void BM_DequantizeMinCombinedCpuQint8(int iters) { - BM_DequantizeMinCombinedCpu(iters); +void BM_DequantizeMinCombinedCpuQint8(::testing::benchmark::State& state) { + BM_DequantizeMinCombinedCpu(state); } BENCHMARK(BM_DequantizeMinCombinedCpuQuint16); @@ -289,7 +290,8 @@ BENCHMARK(BM_DequantizeMinCombinedCpuQuint8); BENCHMARK(BM_DequantizeMinCombinedCpuQint8); template -static void BM_DequantizeBfloat16MinCombinedCpu(int iters) { +static void BM_DequantizeBfloat16MinCombinedCpu( + ::testing::benchmark::State& state) { auto root = Scope::NewRootScope().ExitOnError(); const int64 num_values = 1500 * 250; std::vector inputs; @@ -304,25 +306,30 @@ static void BM_DequantizeBfloat16MinCombinedCpu(int iters) { Graph* g = new Graph(OpRegistry::Global()); TF_CHECK_OK(root.ToGraph(g)); - test::Benchmark("cpu", g).Run(iters); - testing::BytesProcessed(iters * num_values * (sizeof(bfloat16) + sizeof(T))); - testing::ItemsProcessed(iters); + test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state); + state.SetBytesProcessed(state.iterations() * num_values * + (sizeof(bfloat16) + sizeof(T))); + state.SetItemsProcessed(state.iterations()); } -static void BM_DequantizeBfloat16MinCombinedCpuQuint16(int iters) { - BM_DequantizeBfloat16MinCombinedCpu(iters); +void BM_DequantizeBfloat16MinCombinedCpuQuint16( + ::testing::benchmark::State& state) { + BM_DequantizeBfloat16MinCombinedCpu(state); } -static void BM_DequantizeBfloat16MinCombinedCpuQint16(int iters) { - BM_DequantizeBfloat16MinCombinedCpu(iters); +void BM_DequantizeBfloat16MinCombinedCpuQint16( + ::testing::benchmark::State& state) { + BM_DequantizeBfloat16MinCombinedCpu(state); } -static void BM_DequantizeBfloat16MinCombinedCpuQuint8(int iters) { - BM_DequantizeBfloat16MinCombinedCpu(iters); +void BM_DequantizeBfloat16MinCombinedCpuQuint8( + ::testing::benchmark::State& state) { + BM_DequantizeBfloat16MinCombinedCpu(state); } -static void BM_DequantizeBfloat16MinCombinedCpuQint8(int iters) { - BM_DequantizeBfloat16MinCombinedCpu(iters); +void BM_DequantizeBfloat16MinCombinedCpuQint8( + ::testing::benchmark::State& state) { + BM_DequantizeBfloat16MinCombinedCpu(state); } BENCHMARK(BM_DequantizeBfloat16MinCombinedCpuQuint16); diff --git a/tensorflow/core/kernels/diag_op_test.cc b/tensorflow/core/kernels/diag_op_test.cc index a708e53dd01..8fdf1018eba 100644 --- a/tensorflow/core/kernels/diag_op_test.cc +++ b/tensorflow/core/kernels/diag_op_test.cc @@ -30,12 +30,13 @@ static Graph* Diag(int n, DataType type) { return g; } -#define BM_DiagDev(N, T, TFTYPE, DEVICE) \ - static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \ - testing::UseRealTime(); \ - testing::ItemsProcessed(static_cast(iters) * N * N); \ - test::Benchmark(#DEVICE, Diag(N, TFTYPE)).Run(iters); \ - } \ +#define BM_DiagDev(N, T, TFTYPE, DEVICE) \ + static void BM_Diag##_##N##_##TFTYPE##_##DEVICE( \ + ::testing::benchmark::State& state) { \ + test::Benchmark(#DEVICE, Diag(N, TFTYPE), /*old_benchmark_api=*/false) \ + .Run(state); \ + state.SetItemsProcessed(static_cast(state.iterations()) * N * N); \ + } \ BENCHMARK(BM_Diag##_##N##_##TFTYPE##_##DEVICE); #define BM_Diag(N) \