diff --git a/tensorflow/core/kernels/random_op_test.cc b/tensorflow/core/kernels/random_op_test.cc
index 47d94ad9028..e32ec11c9b3 100644
--- a/tensorflow/core/kernels/random_op_test.cc
+++ b/tensorflow/core/kernels/random_op_test.cc
@@ -58,11 +58,14 @@ Graph* TruncatedNormal(int64 n) {
   return g;
 }

-#define BM_RNG(DEVICE, RNG)                                     \
-  void BM_##DEVICE##_##RNG(int iters, int arg) {                \
-    testing::ItemsProcessed(static_cast<int64>(iters) * arg);   \
-    test::Benchmark(#DEVICE, RNG(arg)).Run(iters);              \
-  }                                                             \
+#define BM_RNG(DEVICE, RNG)                                                  \
+  void BM_##DEVICE##_##RNG(::testing::benchmark::State& state) {             \
+    const int arg = state.range(0);                                          \
+                                                                             \
+    test::Benchmark(#DEVICE, RNG(arg), /*old_benchmark_api*/ false)          \
+        .Run(state);                                                         \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * arg);   \
+  }                                                                          \
   BENCHMARK(BM_##DEVICE##_##RNG)->Range(1 << 20, 8 << 20);

 BM_RNG(cpu, RandomUniform);
@@ -84,60 +87,48 @@ Tensor VecAlphas(int64 n) {
   return alphas;
 }

-void BM_cpu_RandomGamma(int iters, int nsamp, int nalpha) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * nsamp * nalpha);
+void BM_cpu_RandomGamma(::testing::benchmark::State& state) {
+  const int nsamp = state.range(0);
+  const int nalpha = state.range(1);
+
   Graph* g = new Graph(OpRegistry::Global());
   test::graph::RandomGamma(g, test::graph::Constant(g, VecShape(nsamp)),
                            test::graph::Constant(g, VecAlphas(nalpha)));
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * nsamp *
+                          nalpha);
 }
 BENCHMARK(BM_cpu_RandomGamma)->RangePair(1 << 14, 4 << 15, 2, 50);

-void BM_PhiloxRandom(int iters) {
+void BM_PhiloxRandom(::testing::benchmark::State& state) {
   // Fill 2M random numbers
   int count = 2 << 20;
-
-  testing::ItemsProcessed(static_cast<int64>(iters) * count);
-
   random::PhiloxRandom gen(0x12345);

-  int val = 1;
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     for (int j = 0; j < count; j += 4) {
       /// each invocation of gen() returns 128-bit samples
       auto samples = gen();
-
-      // use the result trivially so the compiler does not optimize it away
-      val ^= samples[0] ^ samples[1] ^ samples[2] ^ samples[3];
+      tensorflow::testing::DoNotOptimize(samples);
     }
   }
-
-  // A anchor point to make sure the compiler does not cut corners
-  CHECK(val) << val;
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * count);
 }
 BENCHMARK(BM_PhiloxRandom);

-void BM_StdMTRandom(int iters) {
+void BM_StdMTRandom(::testing::benchmark::State& state) {
   // Fill 2M random numbers
   int count = 2 << 20;
-
-  testing::ItemsProcessed(static_cast<int64>(iters) * count);
-
   std::mt19937 gen(0x12345);

-  uint_fast32_t val = 1;
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     for (int j = 0; j < count; ++j) {
       /// each invocation of gen() returns 32-bit sample
       uint_fast32_t sample = gen();
-
-      // use the result trivially so the compiler does not optimize it away
-      val ^= sample;
+      tensorflow::testing::DoNotOptimize(sample);
     }
   }
-
-  // A anchor point to make sure the compiler does not cut corners
-  CHECK(val) << val;
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * count);
 }
 BENCHMARK(BM_StdMTRandom);
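This first file establishes the pattern the whole series repeats: the benchmark body takes a ::testing::benchmark::State& instead of an iteration count, the timed region becomes a range-for over the state, and throughput accounting moves after the loop, where state.iterations() is finally known. TensorFlow's shim mirrors upstream Google Benchmark, so a standalone sketch of the same shape looks like the following; the include path and the BM_StdMT name are assumptions for illustration, not part of the patch.

#include <random>

#include "benchmark/benchmark.h"

// Minimal sketch, assuming upstream Google Benchmark; TensorFlow's
// ::testing::benchmark wrapper exposes the same State interface.
void BM_StdMT(benchmark::State& state) {
  std::mt19937 gen(0x12345);
  for (auto s : state) {  // each pass through this loop body is timed
    // DoNotOptimize replaces the old hand-rolled defense against dead-code
    // elimination (XOR-folding samples into a value and CHECK-ing it).
    benchmark::DoNotOptimize(gen());
  }
  // iterations() is only known after the loop, so accounting moves below it.
  state.SetItemsProcessed(state.iterations());
}
BENCHMARK(BM_StdMT);
BENCHMARK_MAIN();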
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
index 359d7dbeca5..90666a77de6 100644
--- a/tensorflow/core/kernels/reduction_ops_test.cc
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -84,108 +84,167 @@ static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) {
 // Creates a bench which reduces a 3D tensor with total "num" floats
 // into a scalar on a "device". Runs the bench for "iters" times.
 template <typename T>
-static void ReduceToScalar(int iters, const string& device,
-                           const string& reduce, int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(T));
-  test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y)).Run(iters);
+static void ReduceToScalar(::testing::benchmark::State& state,
+                           const string& device, const string& reduce,
+                           int num_x, int num_y) {
+  test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(T));
 }

-static void DoRowReduce(int iters, const string& device, const string& reduce,
-                        int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, RowReduce(reduce, num_x, num_y)).Run(iters);
+static void DoRowReduce(::testing::benchmark::State& state,
+                        const string& device, const string& reduce, int num_x,
+                        int num_y) {
+  test::Benchmark(device, RowReduce(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void DoColReduce(int iters, const string& device, const string& reduce,
-                        int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ColReduce(reduce, num_x, num_y)).Run(iters);
+static void DoColReduce(::testing::benchmark::State& state,
+                        const string& device, const string& reduce, int num_x,
+                        int num_y) {
+  test::Benchmark(device, ColReduce(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void Do3DYReduce(int iters, const string& device, const string& reduce,
-                        int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y)).Run(iters);
+static void Do3DYReduce(::testing::benchmark::State& state,
+                        const string& device, const string& reduce, int num_x,
+                        int num_y) {
+  test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void Do3DXZReduce(int iters, const string& device, const string& reduce,
-                         int num_x, int num_y) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y)).Run(iters);
+static void Do3DXZReduce(::testing::benchmark::State& state,
+                         const string& device, const string& reduce, int num_x,
+                         int num_y) {
+  test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void BM_Sum2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DToScalarGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DToScalarGPUComplex(int iters, int num_x, int num_y) {
-  ReduceToScalar<std::complex<float>>(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DToScalarGPUComplex(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<std::complex<float>>(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DToScalarGPUComplex)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DToScalarGPUHalf(int iters, int num_x, int num_y) {
-  ReduceToScalar<Eigen::half>(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DToScalarGPUHalf(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<Eigen::half>(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DToScalarGPUHalf)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DRowReduceGPU(int iters, int num_x, int num_y) {
-  DoRowReduce(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DRowReduceGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoRowReduce(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DRowReduceGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DColumnReduceGPU(int iters, int num_x, int num_y) {
-  DoColReduce(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum2DColumnReduceGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoColReduce(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DColumnReduceGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum3DYReduceGPU(int iters, int num_x, int num_y) {
-  Do3DYReduce(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum3DYReduceGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  Do3DYReduce(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum3DYReduceGPU)->RangePair(64, 4096, 64, 4096);

-static void BM_Sum3DXZReduceGPU(int iters, int num_x, int num_y) {
-  Do3DXZReduce(iters, "gpu", "Sum", num_x, num_y);
+static void BM_Sum3DXZReduceGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  Do3DXZReduce(state, "gpu", "Sum", num_x, num_y);
 }
 BENCHMARK(BM_Sum3DXZReduceGPU)->RangePair(64, 4096, 64, 4096);

-static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "Mean", num_x, num_y);
+static void BM_Mean2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "Mean", num_x, num_y);
 }
 BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

-static void BM_EuclideanNorm2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "EuclideanNorm", num_x, num_y);
+static void BM_EuclideanNorm2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "EuclideanNorm", num_x, num_y);
 }
 BENCHMARK(BM_EuclideanNorm2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

-static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "Max", num_x, num_y);
+static void BM_Max2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "Max", num_x, num_y);
 }
 BENCHMARK(BM_Max2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

-static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<float>(iters, "gpu", "Min", num_x, num_y);
+static void BM_Min2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<float>(state, "gpu", "Min", num_x, num_y);
 }
 BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

-static void BM_Min2DToScalarGPUHalf(int iters, int num_x, int num_y) {
-  ReduceToScalar<Eigen::half>(iters, "gpu", "Min", num_x, num_y);
+static void BM_Min2DToScalarGPUHalf(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<Eigen::half>(state, "gpu", "Min", num_x, num_y);
 }
 BENCHMARK(BM_Min2DToScalarGPUHalf)->RangePair(2048, 8192, 2048, 8192);

-static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) {
-  ReduceToScalar<bool>(iters, "gpu", "All", num_x, num_y);
+static void BM_Bool2DToScalarGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  ReduceToScalar<bool>(state, "gpu", "All", num_x, num_y);
 }
 BENCHMARK(BM_Bool2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
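Every multi-argument benchmark in the reduction file follows the same recipe: the old (int iters, int num_x, int num_y) signature collapses to State&, the value pairs registered through RangePair come back out via state.range(0) and state.range(1), and Set{Items,Bytes}Processed is computed from state.iterations() once Run(state) returns. A minimal standalone equivalent, assuming upstream Google Benchmark and the same harness as the earlier sketch; Ranges is the upstream spelling of the RangePair registration used in TensorFlow's shim.

#include <algorithm>
#include <cstdint>
#include <vector>

#include "benchmark/benchmark.h"

void BM_Fill2D(benchmark::State& state) {
  const int num_x = state.range(0);  // first value of each registered pair
  const int num_y = state.range(1);  // second value
  std::vector<float> buf(static_cast<size_t>(num_x) * num_y);
  for (auto s : state) {
    std::fill(buf.begin(), buf.end(), 1.0f);
    benchmark::DoNotOptimize(buf.data());
  }
  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * num_x *
                          num_y);
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * num_x *
                          num_y * sizeof(float));
}
// Cross product of exponentially spaced values in each range, as with
// ->RangePair(1, 8192, 1, 8192) above.
BENCHMARK(BM_Fill2D)->Ranges({{1, 8192}, {1, 8192}});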
diff --git a/tensorflow/core/kernels/regex_replace_op_test.cc b/tensorflow/core/kernels/regex_replace_op_test.cc
index b9e960efecc..7c537b6dbde 100644
--- a/tensorflow/core/kernels/regex_replace_op_test.cc
+++ b/tensorflow/core/kernels/regex_replace_op_test.cc
@@ -84,17 +84,17 @@ Graph* SetupRegexReplaceGraph(const Tensor& input, const string& input_pattern,
   return g;
 }

-void BM_RegexReplace(int iters, int batch_size) {
-  testing::StopTiming();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  testing::UseRealTime();
+static void BM_RegexReplace(::testing::benchmark::State& state) {
+  const int batch_size = state.range(0);
+
   Tensor input = GetTestTensor(batch_size);
   Graph* g = SetupRegexReplaceGraph(input, kRegExPattern, kRewrite);
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }

 BENCHMARK(BM_RegexReplace)
+    ->UseRealTime()
     ->Arg(1)
     ->Arg(8)
     ->Arg(16)
@@ -115,17 +115,17 @@ Graph* SetupStaticGraph(const Tensor& input, const string& input_pattern,
       .Finalize(g, nullptr /* node */));
   return g;
 }
-void BM_StaticRegexReplace(int iters, int batch_size) {
-  testing::StopTiming();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  testing::UseRealTime();
+static void BM_StaticRegexReplace(::testing::benchmark::State& state) {
+  const int batch_size = state.range(0);
+
   Tensor input = GetTestTensor(batch_size);
   Graph* g = SetupStaticGraph(input, kRegExPattern, kRewrite);
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }

 BENCHMARK(BM_StaticRegexReplace)
+    ->UseRealTime()
     ->Arg(1)
     ->Arg(8)
     ->Arg(16)
diff --git a/tensorflow/core/kernels/requantization_range_op_test.cc b/tensorflow/core/kernels/requantization_range_op_test.cc
index dd04da373d8..a9740dd31d7 100644
--- a/tensorflow/core/kernels/requantization_range_op_test.cc
+++ b/tensorflow/core/kernels/requantization_range_op_test.cc
@@ -67,56 +67,29 @@ TEST_F(RequantizationRangeTest, HandCrafted) {
   test::ExpectTensorEqual<float>(expected_max, *GetOutput(1));
 }

-static void BM_RequantizationRange(int iters, int size) {
-  testing::StopTiming();
-  testing::UseRealTime();
-  testing::ItemsProcessed(static_cast<int64>(iters) * size);
-  testing::ItemsProcessed(static_cast<int64>(iters) * size * 4);
+static void BM_RequantizationRange(::testing::benchmark::State& state) {
+  const int size = state.range(0);
+
   Tensor quantized_tensor(DT_QINT32, TensorShape({1, size}));
   test::FillFn<qint32>(&quantized_tensor, [](int n) { return qint32(n); });

   qint32 actual_min;
   qint32 actual_max;
-  testing::StartTiming();
-  for (int iter = 0; iter < iters; ++iter) {
+  for (auto s : state) {
     CalculateUsedRange(quantized_tensor, &actual_min, &actual_max);
   }
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * size);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * size * 4);
 }

-static void BM_RequantizationRange100(int iters) {
-  BM_RequantizationRange(100, iters);
-}
-BENCHMARK(BM_RequantizationRange100);
-
-static void BM_RequantizationRange1000(int iters) {
-  BM_RequantizationRange(1000, iters);
-}
-BENCHMARK(BM_RequantizationRange1000);
-
-static void BM_RequantizationRange10000(int iters) {
-  BM_RequantizationRange(10000, iters);
-}
-BENCHMARK(BM_RequantizationRange10000);
-
-static void BM_RequantizationRange100000(int iters) {
-  BM_RequantizationRange(100000, iters);
-}
-BENCHMARK(BM_RequantizationRange100000);
-
-static void BM_RequantizationRange1000000(int iters) {
-  BM_RequantizationRange(1000000, iters);
-}
-BENCHMARK(BM_RequantizationRange1000000);
-
-static void BM_RequantizationRange10000000(int iters) {
-  BM_RequantizationRange(10000000, iters);
-}
-BENCHMARK(BM_RequantizationRange10000000);
-
-static void BM_RequantizationRange100000000(int iters) {
-  BM_RequantizationRange(100000000, iters);
-}
-BENCHMARK(BM_RequantizationRange100000000);
+BENCHMARK(BM_RequantizationRange)
+    ->UseRealTime()
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000)
+    ->Arg(100000)
+    ->Arg(1000000)
+    ->Arg(10000000)
+    ->Arg(100000000);

 }  // end namespace tensorflow
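The requantization file shows the biggest simplification the new API buys: seven copy-pasted fixed-size wrappers, which incidentally passed (size, iters) to BM_RequantizationRange in swapped argument order, collapse into one function with an ->Arg() sweep. A sketch of the idiom with a stand-in workload; BM_SumN and its body are hypothetical.

#include <cstdint>
#include <numeric>
#include <vector>

#include "benchmark/benchmark.h"

void BM_SumN(benchmark::State& state) {
  const int size = state.range(0);
  std::vector<int> v(size, 1);
  for (auto s : state) {
    benchmark::DoNotOptimize(std::accumulate(v.begin(), v.end(), 0));
  }
  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * size);
}
// One registration per size replaces one hand-written function per size,
// and the size can no longer be confused with the iteration count.
BENCHMARK(BM_SumN)->Arg(100)->Arg(10000)->Arg(1000000);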
diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc
index 62d7d294597..d34e97ea2c2 100644
--- a/tensorflow/core/kernels/reverse_op_test.cc
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@@ -197,148 +197,187 @@ static Graph* Reverse(const TensorShape& shape, int reverse_axis) {
 }

 template <typename T>
-static void RunReverseRowsBenchmark(int iters, int outer_dim, int middle_dim,
+static void RunReverseRowsBenchmark(::testing::benchmark::State& state,
+                                    int outer_dim, int middle_dim,
                                     int intra_threads, int channels) {
   SessionOptions opts = GetOptions(intra_threads);
   TensorShape shape{outer_dim, middle_dim, channels};
-  const int64 num_items = static_cast<int64>(iters) * shape.num_elements();
-  testing::ItemsProcessed(num_items);
-  testing::BytesProcessed(num_items * sizeof(T));
-  testing::UseRealTime();
-  test::Benchmark("cpu", Reverse(shape, 1), &opts).Run(iters);
+  test::Benchmark("cpu", Reverse(shape, 1), &opts, nullptr, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  const int64 num_items =
+      static_cast<int64>(state.iterations()) * shape.num_elements();
+  state.SetItemsProcessed(num_items);
+  state.SetBytesProcessed(num_items * sizeof(T));
 }

-static void BM_ReverseRowsOf1Channel_1T_float(int iters, int outer_dim,
-                                              int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf1Channel_1T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                  1 /* intra_threads */, 1 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf1Channel_1T_float)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf1Channel_1T_uint8(int iters, int outer_dim,
-                                              int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf1Channel_1T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                  1 /* intra_threads */, 1 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf1Channel_1T_uint8)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf1Channel_4T_float(int iters, int outer_dim,
-                                              int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf1Channel_4T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                  4 /* intra_threads */, 1 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf1Channel_4T_float)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf1Channel_4T_uint8(int iters, int outer_dim,
-                                              int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf1Channel_4T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                  4 /* intra_threads */, 1 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf1Channel_4T_uint8)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf3Channels_1T_float(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf3Channels_1T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                  1 /* intra_threads */, 3 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf3Channels_1T_float)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(30, 30)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf3Channels_1T_uint8(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf3Channels_1T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                  1 /* intra_threads */, 3 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf3Channels_1T_uint8)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(30, 30)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf3Channels_4T_float(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf3Channels_4T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                  4 /* intra_threads */, 3 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf3Channels_4T_float)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(30, 30)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf3Channels_4T_uint8(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf3Channels_4T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                  4 /* intra_threads */, 3 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf3Channels_4T_uint8)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(30, 30)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf4Channels_1T_float(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf4Channels_1T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                  1 /* intra_threads */, 4 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf4Channels_1T_float)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf4Channels_1T_uint8(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf4Channels_1T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                  1 /* intra_threads */, 4 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf4Channels_1T_uint8)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf4Channels_4T_float(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf4Channels_4T_float(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
                                  4 /* intra_threads */, 4 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf4Channels_4T_float)
+    ->UseRealTime()
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);

-static void BM_ReverseRowsOf4Channels_4T_uint8(int iters, int outer_dim,
-                                               int middle_dim) {
-  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+void BM_ReverseRowsOf4Channels_4T_uint8(::testing::benchmark::State& state) {
+  const int outer_dim = state.range(0);
+  const int middle_dim = state.range(1);
+
+  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
                                  4 /* intra_threads */, 4 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf4Channels_4T_uint8)
+    ->UseRealTime()
    ->ArgPair(288, 288)
    ->ArgPair(1024, 1024)
    ->ArgPair(10 * 1024, 1024);
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
index 3ee66906139..6e0b638c79d 100644
--- a/tensorflow/core/kernels/roll_op_test.cc
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -450,34 +450,44 @@ static Graph* RollGraph(const TensorShape& shape, int isd) {
   return g;
 }

-#define BM_ROLL_OUTER(DEVICE)                                                  \
-  static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) {     \
-    TensorShape shape{rows, columns};                                          \
-    const int64 num_items = static_cast<int64>(iters) * shape.num_elements();  \
-    testing::ItemsProcessed(num_items);                                        \
-    testing::BytesProcessed(num_items * sizeof(float));                        \
-    testing::UseRealTime();                                                    \
-    test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters);                  \
-  }                                                                            \
-  BENCHMARK(BM_##DEVICE##_roll_outer)                                          \
-      ->ArgPair(256, 256)                                                      \
-      ->ArgPair(512, 512)                                                      \
-      ->ArgPair(1024, 1024)                                                    \
+#define BM_ROLL_OUTER(DEVICE)                                                  \
+  static void BM_##DEVICE##_roll_outer(::testing::benchmark::State& state) {   \
+    const int rows = state.range(0);                                           \
+    const int columns = state.range(1);                                        \
+                                                                               \
+    TensorShape shape{rows, columns};                                          \
+    test::Benchmark(#DEVICE, RollGraph(shape, 0), /*old_benchmark_api*/ false) \
+        .Run(state);                                                           \
+    const int64 num_items =                                                    \
+        static_cast<int64>(state.iterations()) * shape.num_elements();         \
+    state.SetItemsProcessed(num_items);                                        \
+    state.SetBytesProcessed(num_items * sizeof(float));                        \
+  }                                                                            \
+  BENCHMARK(BM_##DEVICE##_roll_outer)                                          \
+      ->UseRealTime()                                                          \
+      ->ArgPair(256, 256)                                                      \
+      ->ArgPair(512, 512)                                                      \
+      ->ArgPair(1024, 1024)                                                    \
       ->ArgPair(2048, 2048)

-#define BM_ROLL_ALL(DEVICE)                                                    \
-  static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) {       \
-    TensorShape shape{rows, columns};                                          \
-    const int64 num_items = static_cast<int64>(iters) * shape.num_elements();  \
-    testing::ItemsProcessed(num_items);                                        \
-    testing::BytesProcessed(num_items * sizeof(float));                        \
-    testing::UseRealTime();                                                    \
-    test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters);                  \
-  }                                                                            \
-  BENCHMARK(BM_##DEVICE##_roll_all)                                            \
-      ->ArgPair(256, 256)                                                      \
-      ->ArgPair(512, 512)                                                      \
-      ->ArgPair(1024, 1024)                                                    \
+#define BM_ROLL_ALL(DEVICE)                                                    \
+  static void BM_##DEVICE##_roll_all(::testing::benchmark::State& state) {     \
+    const int rows = state.range(0);                                           \
+    const int columns = state.range(1);                                        \
+                                                                               \
+    TensorShape shape{rows, columns};                                          \
+    test::Benchmark(#DEVICE, RollGraph(shape, 1), /*old_benchmark_api*/ false) \
+        .Run(state);                                                           \
+    const int64 num_items =                                                    \
+        static_cast<int64>(state.iterations()) * shape.num_elements();         \
+    state.SetItemsProcessed(num_items);                                        \
+    state.SetBytesProcessed(num_items * sizeof(float));                        \
+  }                                                                            \
+  BENCHMARK(BM_##DEVICE##_roll_all)                                            \
+      ->UseRealTime()                                                          \
+      ->ArgPair(256, 256)                                                      \
+      ->ArgPair(512, 512)                                                      \
+      ->ArgPair(1024, 1024)                                                    \
       ->ArgPair(2048, 2048)

 BM_ROLL_OUTER(cpu);
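As the reverse and roll files show, testing::UseRealTime() calls disappear from benchmark bodies and resurface as ->UseRealTime() on the registration chain, where they compose with the argument list; for macro-generated benchmarks only the signature token and the chain change. A compressed sketch of that macro shape against upstream Google Benchmark; BM_NOOP and its trivial body are hypothetical stand-ins.

#include "benchmark/benchmark.h"

// Hypothetical device-parameterized macro in the style of BM_ROLL_OUTER.
#define BM_NOOP(DEVICE)                                            \
  void BM_##DEVICE##_noop(benchmark::State& state) {               \
    const int rows = state.range(0);                               \
    for (auto s : state) benchmark::DoNotOptimize(rows);           \
  }                                                                \
  /* Real-time measurement is a property of the registration, */   \
  /* not a call made from inside the benchmark body. */            \
  BENCHMARK(BM_##DEVICE##_noop)->UseRealTime()->Arg(256)->Arg(2048)

BM_NOOP(cpu);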
diff --git a/tensorflow/core/kernels/save_op_test.cc b/tensorflow/core/kernels/save_op_test.cc
index 1f6d8257bdd..b46609ef193 100644
--- a/tensorflow/core/kernels/save_op_test.cc
+++ b/tensorflow/core/kernels/save_op_test.cc
@@ -663,8 +663,8 @@ TEST_F(SaveOpSlices2Test, TwoSlices) {

 // Benchmark-related code below.

-static void BM_LargeTensorWrite(int iters, int num_elements) {
-  testing::StopTiming();
+void BM_LargeTensorWrite(::testing::benchmark::State& state) {
+  const int num_elements = state.range(0);

   // 4 * num_elements bytes total , since sizeof(float) == 4.
   Tensor tensor(DT_FLOAT, TensorShape({num_elements}));
@@ -689,8 +689,9 @@ static void BM_LargeTensorWrite(int iters, int num_elements) {
   VLOG(1) << "Save op's output path: " << temp_filename;
   VLOG(1) << "# nodes in Graph: " << g->num_nodes();

-  testing::StartTiming();
-  test::Benchmark("cpu", g, &session_options).Run(iters);
+  test::Benchmark("cpu", g, &session_options, nullptr, nullptr, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
 }

 BENCHMARK(BM_LargeTensorWrite)->Arg((1 << 30) / 4 /* 1GB float tensor */);
diff --git a/tensorflow/core/kernels/scan_ops_test.cc b/tensorflow/core/kernels/scan_ops_test.cc
index 588b606a99b..88cb351eb53 100644
--- a/tensorflow/core/kernels/scan_ops_test.cc
+++ b/tensorflow/core/kernels/scan_ops_test.cc
@@ -67,79 +67,120 @@ static Graph* ThreeDYCumsum(int num_y, int num_z, bool reverse = false) {
 }

 template <typename T>
-static void LargeOneDimensional(int iters, const string& device, int num_x,
+static void LargeOneDimensional(::testing::benchmark::State& state,
+                                const string& device, int num_x,
                                 bool reverse = false) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * sizeof(T));
-  test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse)).Run(iters);
+  test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          sizeof(T));
 }

-static void DoRowCumsum(int iters, const string& device, int num_x, int num_y,
+static void DoRowCumsum(::testing::benchmark::State& state,
+                        const string& device, int num_x, int num_y,
                         bool reverse = false) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, RowCumsum(num_x, num_y, reverse)).Run(iters);
+  test::Benchmark(device, RowCumsum(num_x, num_y, reverse),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void DoColCumsum(int iters, const string& device, int num_x, int num_y,
+static void DoColCumsum(::testing::benchmark::State& state,
+                        const string& device, int num_x, int num_y,
                         bool reverse = false) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ColCumsum(num_x, num_y, reverse)).Run(iters);
+  test::Benchmark(device, ColCumsum(num_x, num_y, reverse),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void Do3DYCumsum(int iters, const string& device, int num_x, int num_y,
+static void Do3DYCumsum(::testing::benchmark::State& state,
+                        const string& device, int num_x, int num_y,
                         bool reverse = false) {
-  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
-  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
-                          sizeof(float));
-  test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse)).Run(iters);
+  test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse),
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
+                          num_y * sizeof(float));
 }

-static void BM_OneDCumsumGPU(int iters, int num_x) {
-  LargeOneDimensional<float>(iters, "gpu", num_x);
+static void BM_OneDCumsumGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+
+  LargeOneDimensional<float>(state, "gpu", num_x);
 }
 BENCHMARK(BM_OneDCumsumGPU)->Range(1, 1 << 21);

-static void BM_OneDCumsumGPUHalf(int iters, int num_x) {
-  LargeOneDimensional<Eigen::half>(iters, "gpu", num_x);
+static void BM_OneDCumsumGPUHalf(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+
+  LargeOneDimensional<Eigen::half>(state, "gpu", num_x);
 }
 BENCHMARK(BM_OneDCumsumGPUHalf)->Range(1, 1 << 21);

-static void BM_Sum2DRowCumsumGPU(int iters, int num_x, int num_y) {
-  DoRowCumsum(iters, "gpu", num_x, num_y);
+static void BM_Sum2DRowCumsumGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoRowCumsum(state, "gpu", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DRowCumsumGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DColumnCumsumGPU(int iters, int num_x, int num_y) {
-  DoColCumsum(iters, "gpu", num_x, num_y);
+static void BM_Sum2DColumnCumsumGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoColCumsum(state, "gpu", num_x, num_y);
 }
 BENCHMARK(BM_Sum2DColumnCumsumGPU)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum3DYCumsumGPU(int iters, int num_x, int num_y) {
-  Do3DYCumsum(iters, "gpu", num_x, num_y);
+static void BM_Sum3DYCumsumGPU(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  Do3DYCumsum(state, "gpu", num_x, num_y);
 }
 BENCHMARK(BM_Sum3DYCumsumGPU)->RangePair(64, 4096, 64, 4096);

-static void BM_OneDCumsumGPU_reverse(int iters, int num_x) {
-  LargeOneDimensional<float>(iters, "gpu", num_x, true);
+static void BM_OneDCumsumGPU_reverse(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+
+  LargeOneDimensional<float>(state, "gpu", num_x, true);
 }
 BENCHMARK(BM_OneDCumsumGPU_reverse)->Range(1, 1 << 21);

-static void BM_Sum2DRowCumsumGPU_reverse(int iters, int num_x, int num_y) {
-  DoRowCumsum(iters, "gpu", num_x, num_y, true);
+static void BM_Sum2DRowCumsumGPU_reverse(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoRowCumsum(state, "gpu", num_x, num_y, true);
 }
 BENCHMARK(BM_Sum2DRowCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum2DColumnCumsumGPU_reverse(int iters, int num_x, int num_y) {
-  DoColCumsum(iters, "gpu", num_x, num_y, true);
+static void BM_Sum2DColumnCumsumGPU_reverse(
+    ::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  DoColCumsum(state, "gpu", num_x, num_y, true);
 }
 BENCHMARK(BM_Sum2DColumnCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);

-static void BM_Sum3DYCumsumGPU_reverse(int iters, int num_x, int num_y) {
-  Do3DYCumsum(iters, "gpu", num_x, num_y, true);
+static void BM_Sum3DYCumsumGPU_reverse(::testing::benchmark::State& state) {
+  const int num_x = state.range(0);
+  const int num_y = state.range(1);
+
+  Do3DYCumsum(state, "gpu", num_x, num_y, true);
 }
 BENCHMARK(BM_Sum3DYCumsumGPU_reverse)->RangePair(32, 2048, 32, 2048);
diff --git a/tensorflow/core/kernels/scatter_nd_op_test.cc b/tensorflow/core/kernels/scatter_nd_op_test.cc
index 9c31bed784f..b7837e11e73 100644
--- a/tensorflow/core/kernels/scatter_nd_op_test.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_test.cc
@@ -254,8 +254,8 @@ class ScatterNdUpdateBM : public ScatterNdUpdateOpTest {
 };

 template <typename Index>
-static void BM_ScatterNdHelper(int iters, int embedding_size, const char* op) {
-  testing::StopTiming();
+void BM_ScatterNdHelper(::testing::benchmark::State& state, int embedding_size,
+                        const char* op) {
   const int kRows = 10000000 / embedding_size;
   std::vector<float> values;
   values.reserve(kRows);
@@ -280,27 +280,33 @@ static void BM_ScatterNdHelper(int iters, int embedding_size, const char* op) {
   bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
   bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
                               updates);
-  testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
-                          iters);
-  testing::StartTiming();
-  while (iters-- > 0) {
+  for (auto i : state) {
     Status s = bm.RunOpKernel();
   }
-  testing::StopTiming();
+  state.SetItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
+                          state.iterations());
 }

-static void BM_ScatterNdUpdateInt32(int iters, int embedding_size) {
-  BM_ScatterNdHelper<int32>(iters, embedding_size, "ScatterNdUpdate");
+void BM_ScatterNdUpdateInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterNdHelper<int32>(state, embedding_size, "ScatterNdUpdate");
 }
-static void BM_ScatterNdUpdateInt64(int iters, int embedding_size) {
-  BM_ScatterNdHelper<int64>(iters, embedding_size, "ScatterNdUpdate");
+void BM_ScatterNdUpdateInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterNdHelper<int64>(state, embedding_size, "ScatterNdUpdate");
 }

-static void BM_ScatterNdAddInt32(int iters, int embedding_size) {
-  BM_ScatterNdHelper<int32>(iters, embedding_size, "ScatterNdAdd");
+void BM_ScatterNdAddInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterNdHelper<int32>(state, embedding_size, "ScatterNdAdd");
 }
-static void BM_ScatterNdAddInt64(int iters, int embedding_size) {
-  BM_ScatterNdHelper<int64>(iters, embedding_size, "ScatterNdAdd");
+void BM_ScatterNdAddInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterNdHelper<int64>(state, embedding_size, "ScatterNdAdd");
 }

 BENCHMARK(BM_ScatterNdUpdateInt32)
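The scatter benchmarks also make clear why the explicit testing::StopTiming()/StartTiming() bracketing could simply be deleted: with the State API, anything before the first and after the last pass of the range-for loop is untimed by construction, so expensive input construction needs no special handling. A small standalone illustration against upstream Google Benchmark:

#include <algorithm>
#include <vector>

#include "benchmark/benchmark.h"

void BM_SortCopy(benchmark::State& state) {
  // Untimed setup: runs once, before the first timed iteration.
  std::vector<int> data(1 << 16);
  for (size_t i = 0; i < data.size(); ++i) {
    data[i] = static_cast<int>(data.size() - i);
  }

  for (auto s : state) {
    // Only the body of this loop is measured.
    std::vector<int> copy = data;
    std::sort(copy.begin(), copy.end());
    benchmark::DoNotOptimize(copy.data());
  }
}
BENCHMARK(BM_SortCopy);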
diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc
index e52f6e74dd5..7febb0e1cb7 100644
--- a/tensorflow/core/kernels/scatter_op_test.cc
+++ b/tensorflow/core/kernels/scatter_op_test.cc
@@ -280,9 +280,8 @@ class ScatterUpdateBM : public ScatterUpdateOpTest {
 };

 template <typename Index>
-static void BM_ScatterHelper(int iters, int embedding_size, const char* op,
-                             bool big_num_updates = false) {
-  testing::StopTiming();
+void BM_ScatterHelper(::testing::benchmark::State& state, int embedding_size,
+                      const char* op, bool big_num_updates = false) {
   const int kRows = 10000000 / embedding_size;
   std::vector<float> values;
   values.reserve(kRows);
@@ -307,59 +306,83 @@ static void BM_ScatterHelper(int iters, int embedding_size, const char* op,
   bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
   bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
                               updates);
-  testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
-                          iters);
-  testing::StartTiming();
-  while (iters-- > 0) {
+  for (auto i : state) {
     Status s = bm.RunOpKernel();
   }
-  testing::StopTiming();
+  state.SetItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
+                          state.iterations());
 }

-static void BM_ScatterUpdateInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterUpdate");
+void BM_ScatterUpdateInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterUpdate");
 }
-static void BM_ScatterUpdateInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterUpdate");
+void BM_ScatterUpdateInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterUpdate");
 }

-static void BM_ScatterAddInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd");
+void BM_ScatterAddInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterAdd");
 }
-static void BM_ScatterAddInt32Large(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd", true);
+void BM_ScatterAddInt32Large(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterAdd", true);
 }
-static void BM_ScatterAddInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterAdd");
+void BM_ScatterAddInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterAdd");
 }

-static void BM_ScatterMulInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMul");
+void BM_ScatterMulInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterMul");
 }
-static void BM_ScatterMulInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMul");
+void BM_ScatterMulInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterMul");
 }

-static void BM_ScatterDivInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterDiv");
+void BM_ScatterDivInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterDiv");
 }
-static void BM_ScatterDivInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterDiv");
+void BM_ScatterDivInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterDiv");
 }

-static void BM_ScatterMinInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMin");
+void BM_ScatterMinInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterMin");
 }
-static void BM_ScatterMinInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMin");
+void BM_ScatterMinInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterMin");
 }

-static void BM_ScatterMaxInt32(int iters, int embedding_size) {
-  BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMax");
+void BM_ScatterMaxInt32(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int32>(state, embedding_size, "ScatterMax");
 }
-static void BM_ScatterMaxInt64(int iters, int embedding_size) {
-  BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMax");
+void BM_ScatterMaxInt64(::testing::benchmark::State& state) {
+  const int embedding_size = state.range(0);
+
+  BM_ScatterHelper<int64>(state, embedding_size, "ScatterMax");
 }

 BENCHMARK(BM_ScatterUpdateInt32)
diff --git a/tensorflow/core/kernels/segment_reduction_ops_test.cc b/tensorflow/core/kernels/segment_reduction_ops_test.cc
index 8d7b70878b7..ca8c3db3d42 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_test.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops_test.cc
@@ -39,10 +39,9 @@ limitations under the License.
 namespace tensorflow {

 template <typename Index>
-static void BM_SegmentReduction(int iters, const string& reduction,
-                                Index num_rows, Index num_cols,
-                                Index segment_size) {
-  testing::StopTiming();
+static void BM_SegmentReduction(::testing::benchmark::State& state,
+                                const string& reduction, Index num_rows,
+                                Index num_cols, Index segment_size) {
   std::unique_ptr<Device> device(
       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));

@@ -81,24 +80,25 @@ static void BM_SegmentReduction(int iters, const string& reduction,
   reduction_op->Compute(reduction_context.get());
   TF_CHECK_OK(reduction_context->status());

-  testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     delete reduction_context->release_output(0).tensor;
     reduction_op->Compute(reduction_context.get());
   }
   int64 bytes_per_iter =
       static_cast<int64>(num_rows * num_cols * sizeof(float));
-  testing::BytesProcessed(bytes_per_iter * iters);
+  state.SetBytesProcessed(bytes_per_iter * state.iterations());
 }

-#define BM_Reduce(O, R, C, S)                                      \
-  static void BM_Reduce_##O##_##R##_##C##_##S##_int32(int iters) { \
-    BM_SegmentReduction<int32>(iters, #O, R, C, S);                \
-  }                                                                \
-  static void BM_Reduce_##O##_##R##_##C##_##S##_int64(int iters) { \
-    BM_SegmentReduction<int64>(iters, #O, R, C, S);                \
-  }                                                                \
-  BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32);              \
+#define BM_Reduce(O, R, C, S)                                      \
+  static void BM_Reduce_##O##_##R##_##C##_##S##_int32(             \
+      ::testing::benchmark::State & state) {                       \
+    BM_SegmentReduction<int32>(state, #O, R, C, S);                \
+  }                                                                \
+  static void BM_Reduce_##O##_##R##_##C##_##S##_int64(             \
+      ::testing::benchmark::State & state) {                       \
+    BM_SegmentReduction<int64>(state, #O, R, C, S);                \
+  }                                                                \
+  BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32);              \
   BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64);

 #define BM_Reduce_Arg(R, C, S) \
@@ -113,8 +113,8 @@ BM_Reduce_Arg(64, 32, 2);
 BM_Reduce_Arg(4096, 32, 2);
 BM_Reduce_Arg(4096, 128, 2);

-static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
-  testing::StopTiming();
+static void SparseSegmentMeanGradHelper(::testing::benchmark::State& state,
+                                        float uniqueness, int size) {
   Graph* g = new Graph(OpRegistry::Global());
   CHECK_LE(uniqueness, 1.0);
   CHECK_GT(uniqueness, 0.0);
@@ -148,22 +148,24 @@ static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
       .Attr("T", DT_FLOAT)
       .Finalize(g, &node));

-  testing::UseRealTime();
-  testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) *
-                          sizeof(float));
-  testing::StartTiming();
-  test::Benchmark("cpu", g).Run(iters);
+  test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
+                          (kDim1 * kDim2) * sizeof(float));
 }

-static void BM_SparseSegmentMeanGrad_Low(int iters, int size) {
-  return SparseSegmentMeanGradHelper(iters, 1.0, size);
+static void BM_SparseSegmentMeanGrad_Low(::testing::benchmark::State& state) {
+  const int size = state.range(0);
+
+  return SparseSegmentMeanGradHelper(state, 1.0, size);
 }
-static void BM_SparseSegmentMeanGrad_High(int iters, int size) {
-  return SparseSegmentMeanGradHelper(iters, 0.01, size);
+static void BM_SparseSegmentMeanGrad_High(::testing::benchmark::State& state) {
+  const int size = state.range(0);
+
+  return SparseSegmentMeanGradHelper(state, 0.01, size);
 }
-BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000);
-BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_SparseSegmentMeanGrad_Low)->UseRealTime()->Arg(1000)->Arg(100000);
+BENCHMARK(BM_SparseSegmentMeanGrad_High)->UseRealTime()->Arg(1000)->Arg(100000);

 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sendrecv_ops_test.cc b/tensorflow/core/kernels/sendrecv_ops_test.cc
index 092a29f2f3c..347f7d933d0 100644
--- a/tensorflow/core/kernels/sendrecv_ops_test.cc
+++ b/tensorflow/core/kernels/sendrecv_ops_test.cc
@@ -54,21 +54,21 @@ static Graph* Recv() {
   return g;
 }

-static void BM_Send(int iters) {
-  testing::UseRealTime();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous)
-      .Run(iters);
+void BM_Send(::testing::benchmark::State& state) {
+  test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }
-BENCHMARK(BM_Send);
+BENCHMARK(BM_Send)->UseRealTime();

-static void BM_Recv(int iters) {
-  testing::UseRealTime();
-  testing::ItemsProcessed(static_cast<int64>(iters));
-  test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous)
-      .Run(iters);
+void BM_Recv(::testing::benchmark::State& state) {
+  test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous, "",
+                  /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
 }
-BENCHMARK(BM_Recv);
+BENCHMARK(BM_Recv)->UseRealTime();

 }  // namespace
 }  // namespace tensorflow
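One wrinkle visible in the Send/Recv change above, and in the save and reverse diffs earlier: /*old_benchmark_api*/ false is a positional argument to test::Benchmark, so call sites that previously relied on default arguments now have to spell out every intervening parameter (null session options, a null init graph, the rendezvous, and an empty executor-type string) just to reach the flag. Call sites that already passed everything, like the SINGLE_THREADED_EXECUTOR slice benchmark below, only append it.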
diff --git a/tensorflow/core/kernels/slice_op_test.cc b/tensorflow/core/kernels/slice_op_test.cc
index f589a09c4fc..aeb96566da6 100644
--- a/tensorflow/core/kernels/slice_op_test.cc
+++ b/tensorflow/core/kernels/slice_op_test.cc
@@ -37,8 +37,8 @@ namespace {
 // For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
 // in size, and concat them together along "concat_dimension"
 template <typename T>
-static void SliceHelper(int iters, int size) {
-  testing::StopTiming();
+static void SliceHelper(::testing::benchmark::State& state) {
+  const int size = state.range(0);
   Graph* g = new Graph(OpRegistry::Global());
   DataType dt = DataTypeToEnum<T>::v();
   int kDim = 100;
@@ -65,26 +65,24 @@ static void SliceHelper(int iters, int size) {
           .Finalize(g, &node));
   FixupSourceAndSinkEdges(g);

-  testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
-  testing::StartTiming();
   test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
-                  "SINGLE_THREADED_EXECUTOR")
-      .Run(iters);
-
-  testing::UseRealTime();
+                  "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false)
+      .Run(state);
+  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim * size *
+                          sizeof(T));
 }

-static void BM_SliceFloat(int iters, int dim2) {
-  SliceHelper<float>(iters, dim2);
+void BM_SliceFloat(::testing::benchmark::State& state) {
+  SliceHelper<float>(state);
 }
-BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
+BENCHMARK(BM_SliceFloat)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);

-static void BM_SliceBFloat16(int iters, int dim2) {
-  SliceHelper<bfloat16>(iters, dim2);
+void BM_SliceBFloat16(::testing::benchmark::State& state) {
+  SliceHelper<bfloat16>(state);
 }
-BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
+BENCHMARK(BM_SliceBFloat16)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);

 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index e3e9a27f316..4f6c20921ed 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -276,15 +276,18 @@ static ST MakeSparseTensor(Graph* g, int B, int M, int N, int nnz_inner) {

 // [8, 4, N{nnz}] cmul [8, 4, N]
 #define BM_SparseMatCMulDenseMatArgs(N, NNZ_INNER)                             \
-  static void BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER(int iters) {          \
+  static void BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER(                      \
+      ::testing::benchmark::State& state) {                                    \
     Graph* g = new Graph(OpRegistry::Global());                                \
     Node* dense = MakeTensor(g, 8, 4, N);                                      \
     ST sp = MakeSparseTensor(g, 8, 4, N, NNZ_INNER);                           \
                                                                                \
-    testing::ItemsProcessed(static_cast<int64>(iters * 8 * 4 * N * 2));        \
     test::Benchmark(                                                           \
-        "cpu", SparseMatCMulDenseMat(g, sp.indices, sp.vals, sp.shape, dense)) \
-        .Run(iters);                                                           \
+        "cpu", SparseMatCMulDenseMat(g, sp.indices, sp.vals, sp.shape, dense), \
+        /*old_benchmark_api*/ false)                                           \
+        .Run(state);                                                           \
+    state.SetItemsProcessed(                                                   \
+        static_cast<int64>(state.iterations() * 8 * 4 * N * 2));               \
  }                                                                             \
  BENCHMARK(BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER)
diff --git a/tensorflow/core/kernels/sparse_to_dense_op_test.cc b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
index 84e1e09c219..a1f22e355ec 100644
--- a/tensorflow/core/kernels/sparse_to_dense_op_test.cc
+++ b/tensorflow/core/kernels/sparse_to_dense_op_test.cc
@@ -198,9 +198,11 @@ TEST_F(SparseToDenseTest, ThreeD_MultValues) {

 }  // namespace

-static void BM_SparseToDense(int iters, int NDIM, int N) {
+static void BM_SparseToDense(::testing::benchmark::State& state) {
+  const int NDIM = state.range(0);
+  const int N = state.range(1);
+
   // TODO(zhifengc): Switch to use kernel_benchmark_testlib.h
-  tensorflow::testing::StopTiming();
   const int IndexDim = (NDIM == 1) ? 0 : 1;
@@ -253,18 +255,15 @@ static void BM_SparseToDense(int iters, int NDIM, int N) {
   std::unique_ptr<OpKernelContext> sparse_context(new OpKernelContext(&params));
   op->Compute(sparse_context.get());
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
+  for (auto s : state) {
     delete sparse_context->release_output(0).tensor;
     op->Compute(sparse_context.get());
     TF_ASSERT_OK(sparse_context->status());
   }
-  tensorflow::testing::StopTiming();

   // processing input, mainly
   int64 bytes_per_iter = static_cast<int64>((N + N * NDIM) * sizeof(float));
-
-  tensorflow::testing::BytesProcessed(bytes_per_iter * iters);
+  state.SetBytesProcessed(bytes_per_iter * state.iterations());
 }

 BENCHMARK(BM_SparseToDense)
diff --git a/tensorflow/core/kernels/sparse_xent_op_test.cc b/tensorflow/core/kernels/sparse_xent_op_test.cc
index 3b252d77d0a..85a5cd3befc 100644
--- a/tensorflow/core/kernels/sparse_xent_op_test.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_test.cc
@@ -41,11 +41,15 @@ static Graph* SparseXent(int batch_size, int num_classes) {
   return g;
 }

-#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                                \
-  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) {       \
-    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS);       \
-    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters);            \
-  }                                                                           \
+#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                                \
+  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(                   \
+      ::testing::benchmark::State& state) {                                   \
+    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS),                        \
+                    /*old_benchmark_api*/ false)                              \
+        .Run(state);                                                          \
+    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * BATCH *  \
+                            CLASS);                                           \
+  }                                                                           \
   BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE);

 /// The representative tests for ptb_word on GPU