Internal tests cleanup.

PiperOrigin-RevId: 339390176
Change-Id: Ie0480a0d8d78bb1a50db434c7f456d407a72444c
This commit is contained in:
A. Unique TensorFlower 2020-10-27 21:24:10 -07:00 committed by TensorFlower Gardener
parent e7715df2de
commit 5008bbbca4
14 changed files with 607 additions and 462 deletions

View File

@ -64,13 +64,16 @@ static void MulChain(int chain_length, Graph** init_g, Graph** run_g) {
// Benchmark a chain of simple multiplications.
// This emphasizes per-op overhead.
// The chain length is the benchmark argument (registered below as 1 << 10).
// MulChain() fills in two graphs through its out-parameters: `init`, which is
// handed to test::Benchmark as the init graph, and `run`, which is the graph
// being timed on "cpu".
// NOTE(review): the int-iters form reports iters * chain_length items, while
// the State-based form reports only state.iterations() -- confirm that
// dropping the chain_length factor is intended.
static void BM_MulChain(int iters, int chain_length) {
const int64 tot = static_cast<int64>(iters) * chain_length;
testing::ItemsProcessed(tot);
static void BM_MulChain(::testing::benchmark::State& state) {
const int chain_length = state.range(0);
Graph* init;
Graph* run;
MulChain(chain_length, &init, &run);
test::Benchmark("cpu", run, GetOptions(), init).Run(iters);
test::Benchmark("cpu", run, GetOptions(), init, nullptr, "",
/*old_benchmark_api=*/false)
.Run(state);
state.SetItemsProcessed(state.iterations());
}
BENCHMARK(BM_MulChain)->Arg(1 << 10);

View File

@ -115,7 +115,7 @@ class ThroughputBenchmark {
ThroughputBenchmark& operator=(const ThroughputBenchmark&) = delete;
// Perform the benchmark run, based on the parameters supplied to the ctor.
void RunBenchmark(int iters);
void RunBenchmark(::testing::benchmark::State& state);
private:
// Resets all mutable state, including the scheduler.
@ -136,22 +136,18 @@ ThroughputBenchmark::ThroughputBenchmark(
const BasicBatchScheduler<BenchmarkBatchTask>::Options& scheduler_options)
: scheduler_options_(scheduler_options) {}
void ThroughputBenchmark::RunBenchmark(int iters) {
CHECK_GE(iters, 1);
void ThroughputBenchmark::RunBenchmark(::testing::benchmark::State& state) {
CHECK_GE(state.max_iterations, 1);
testing::StopTiming();
ResetState();
// Have each iteration issue a reasonably large number of tasks, to ensure our
// measurements reflect steady-state behavior.
const int kNumTasksPerIteration = 100 * 1000;
testing::ItemsProcessed(iters * kNumTasksPerIteration);
testing::UseRealTime();
testing::StartTiming();
// Schedule 'num_iterations_*kNumTasksPerIteration' tasks.
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
for (int j = 0; j < kNumTasksPerIteration; ++j) {
auto task = std::unique_ptr<BenchmarkBatchTask>(new BenchmarkBatchTask);
TF_CHECK_OK(scheduler_->Schedule(&task));
@ -160,7 +156,7 @@ void ThroughputBenchmark::RunBenchmark(int iters) {
// Wait for the scheduler to process all tasks.
scheduler_.reset();
testing::StopTiming();
state.SetItemsProcessed(state.iterations() * kNumTasksPerIteration);
}
void ThroughputBenchmark::ResetState() {
@ -338,7 +334,8 @@ void LatencyBenchmark::PerformBatchCpuWork() const {
CHECK_NE(dummy, 0);
}
static void RunThroughputBenchmark(int iters, int64 batch_timeout_micros,
static void RunThroughputBenchmark(::testing::benchmark::State& state,
int64 batch_timeout_micros,
int num_batch_threads) {
BasicBatchScheduler<BenchmarkBatchTask>::Options scheduler_options;
const int kMaxBatchSize = 100;
@ -347,13 +344,14 @@ static void RunThroughputBenchmark(int iters, int64 batch_timeout_micros,
scheduler_options.num_batch_threads = num_batch_threads;
scheduler_options.max_enqueued_batches = INT_MAX; // Unbounded queue.
ThroughputBenchmark benchmark(scheduler_options);
benchmark.RunBenchmark(iters);
benchmark.RunBenchmark(state);
}
// Throughput benchmark with a batch timeout of 0 microseconds. The benchmark
// argument is forwarded to RunThroughputBenchmark as the number of batch
// threads.
static void ThroughputBM_ZeroTimeout(int iters, int num_batch_threads) {
RunThroughputBenchmark(iters, 0 /* 0 ms timeout */, num_batch_threads);
static void ThroughputBM_ZeroTimeout(::testing::benchmark::State& state) {
RunThroughputBenchmark(state, 0 /* 0 ms timeout */, state.range(0));
}
BENCHMARK(ThroughputBM_ZeroTimeout)
->UseRealTime()
->Arg(1)
->Arg(2)
->Arg(4)
@ -362,10 +360,11 @@ BENCHMARK(ThroughputBM_ZeroTimeout)
->Arg(32)
->Arg(64);
// Throughput benchmark with a batch timeout of 1 * 1000 microseconds (1 ms).
// The benchmark argument is forwarded to RunThroughputBenchmark as the number
// of batch threads.
static void ThroughputBM_SmallTimeout(int iters, int num_batch_threads) {
RunThroughputBenchmark(iters, 1 * 1000 /* 1 ms timeout */, num_batch_threads);
static void ThroughputBM_SmallTimeout(::testing::benchmark::State& state) {
RunThroughputBenchmark(state, 1 * 1000 /* 1 ms timeout */, state.range(0));
}
BENCHMARK(ThroughputBM_SmallTimeout)
->UseRealTime()
->Arg(1)
->Arg(2)
->Arg(4)
@ -374,11 +373,11 @@ BENCHMARK(ThroughputBM_SmallTimeout)
->Arg(32)
->Arg(64);
// Throughput benchmark with a batch timeout of 50 * 1000 microseconds (50 ms).
// The benchmark argument is forwarded to RunThroughputBenchmark as the number
// of batch threads.
static void ThroughputBM_LargeTimeout(int iters, int num_batch_threads) {
RunThroughputBenchmark(iters, 50 * 1000 /* 50 ms timeout */,
num_batch_threads);
static void ThroughputBM_LargeTimeout(::testing::benchmark::State& state) {
RunThroughputBenchmark(state, 50 * 1000 /* 50 ms timeout */, state.range(0));
}
BENCHMARK(ThroughputBM_LargeTimeout)
->UseRealTime()
->Arg(1)
->Arg(2)
->Arg(4)

View File

@ -43,22 +43,27 @@ static Graph* BiasAddGrad(int d0, int d1, int d2, int d3) {
return g;
}
// Defines and registers one BiasAdd benchmark for a fixed shape on DEVICE
// (stringified with #DEVICE and passed to test::Benchmark as the device name).
// Each iteration is reported as N * H * W * C items, and the benchmark is
// measured in real time.
// NOTE: the macro parameter list is ordered (N, W, H, C), whereas the
// generated function name and the BiasAdd() call both use (N, H, W, C).
#define BM_BiasAddNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
test::Benchmark(#DEVICE, BiasAdd(N, H, W, C)).Run(iters); \
} \
BENCHMARK(BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE);
#define BM_BiasAddNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \
::testing::benchmark::State& state) { \
test::Benchmark(#DEVICE, BiasAdd(N, H, W, C), /*old_benchmark_api=*/false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * H * \
W * C); \
} \
BENCHMARK(BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE)->UseRealTime();
// Defines and registers one BiasAddGrad benchmark for a fixed shape on DEVICE
// (stringified with #DEVICE and passed to test::Benchmark as the device name).
// Each iteration is reported as N * H * W * C items, and the benchmark is
// measured in real time.
// NOTE: the macro parameter list is ordered (N, W, H, C), whereas the
// generated function name and the BiasAddGrad() call both use (N, H, W, C).
#define BM_BiasAddGradNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \
int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
test::Benchmark(#DEVICE, BiasAddGrad(N, H, W, C)).Run(iters); \
} \
BENCHMARK(BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE);
#define BM_BiasAddGradNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \
::testing::benchmark::State& state) { \
test::Benchmark(#DEVICE, BiasAddGrad(N, H, W, C), \
/*old_benchmark_api=*/false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * H * \
W * C); \
} \
BENCHMARK(BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE) \
->UseRealTime();
// CPU
BM_BiasAddNHWC(32, 32, 32, 128, cpu);

View File

@ -45,11 +45,15 @@ static Graph* Bincount(int arr_size, int nbins) {
return g;
}
// Defines and registers one Bincount benchmark on device `type` (stringified
// with #type). K is expressed in units of 1024: the graph is built with
// Bincount(K * 1024, NBINS), i.e. an array size of K * 1024 and NBINS bins,
// and each iteration is reported as K * 1024 items.
#define BM_BincountDev(K, NBINS, type) \
static void BM_Bincount##_##type##_##K##_##NBINS(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * K * 1024); \
test::Benchmark(#type, Bincount(K * 1024, NBINS)).Run(iters); \
} \
#define BM_BincountDev(K, NBINS, type) \
static void BM_Bincount##_##type##_##K##_##NBINS( \
::testing::benchmark::State& state) { \
test::Benchmark(#type, Bincount(K * 1024, NBINS), \
/*old_benchmark_api=*/false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * K * \
1024); \
} \
BENCHMARK(BM_Bincount##_##type##_##K##_##NBINS);
BM_BincountDev(32, 1000, cpu);

View File

@ -44,29 +44,35 @@ static Graph* BroadcastTo(int dim0, int dim1, InputShape input_shape) {
return g;
}
// Defines and registers a BroadcastTo benchmark on device `type` whose input
// shape callback returns TensorShape({dim0, 1}); DIM0 and DIM1 are passed to
// BroadcastTo() as dim0 and dim1. Each iteration is reported as DIM0 * DIM1
// items, measured in real time.
#define BM_BroadcastTo_InnerDim(DIM0, DIM1, type) \
static void BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * DIM0 * DIM1); \
test::Benchmark(#type, BroadcastTo(DIM0, DIM1, \
[](int dim0, int dim1) { \
return TensorShape({dim0, 1}); \
})) \
.Run(iters); \
} \
BENCHMARK(BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1);
#define BM_BroadcastTo_InnerDim(DIM0, DIM1, type) \
static void BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1( \
::testing::benchmark::State& state) { \
test::Benchmark(#type, \
BroadcastTo(DIM0, DIM1, \
[](int dim0, int dim1) { \
return TensorShape({dim0, 1}); \
}), \
/*old_benchmark_api=*/false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * DIM0 * \
DIM1); \
} \
BENCHMARK(BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1)->UseRealTime();
// Defines and registers a BroadcastTo benchmark on device `type` whose input
// shape callback returns TensorShape({1, dim1}); DIM0 and DIM1 are passed to
// BroadcastTo() as dim0 and dim1. Each iteration is reported as DIM0 * DIM1
// items, measured in real time.
#define BM_BroadcastTo_OuterDim(DIM0, DIM1, type) \
static void BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * DIM0 * DIM1); \
test::Benchmark(#type, BroadcastTo(DIM0, DIM1, \
[](int dim0, int dim1) { \
return TensorShape({1, dim1}); \
})) \
.Run(iters); \
} \
BENCHMARK(BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1);
#define BM_BroadcastTo_OuterDim(DIM0, DIM1, type) \
static void BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1( \
::testing::benchmark::State& state) { \
test::Benchmark(#type, \
BroadcastTo(DIM0, DIM1, \
[](int dim0, int dim1) { \
return TensorShape({1, dim1}); \
}), \
/*old_benchmark_api=*/false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * DIM0 * \
DIM1); \
} \
BENCHMARK(BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1)->UseRealTime();
BM_BroadcastTo_InnerDim(64, 64, cpu);
BM_BroadcastTo_InnerDim(128, 128, cpu);

View File

@ -121,102 +121,127 @@ TEST_ALL_CASTS_FROM(quint16)
// TODO(wicke): check conversions from/to bool, and bfloat16
// Benchmarks the graph returned by Cast<float, int64>(num) on "cpu", where
// `num` is the benchmark argument (64 << 10 and 32 << 20 are registered).
// Each iteration is reported as `num` items and
// num * (sizeof(float) + sizeof(int64)) bytes, measured in real time.
static void BM_cpu_float_int64(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
static void BM_cpu_float_int64(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", Cast<float, int64>(num), /*old_benchmark_api=*/false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(float) + sizeof(int64)));
testing::UseRealTime();
test::Benchmark("cpu", Cast<float, int64>(num)).Run(iters);
}
BENCHMARK(BM_cpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
BENCHMARK(BM_cpu_float_int64)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
// Benchmarks the graph returned by Cast<float, int64>(num) on "gpu", where
// `num` is the benchmark argument. The test::Benchmark call is compiled only
// when GOOGLE_CUDA or TENSORFLOW_USE_ROCM is set; the item and byte counters
// (num items, num * (sizeof(float) + sizeof(int64)) bytes per iteration) are
// set regardless of that guard.
static void BM_gpu_float_int64(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
static void BM_gpu_float_int64(::testing::benchmark::State& state) {
const int num = state.range(0);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
test::Benchmark("gpu", Cast<float, int64>(num), /*old_benchmark_api=*/false)
.Run(state);
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(float) + sizeof(int64)));
testing::UseRealTime();
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
BENCHMARK(BM_gpu_float_int64)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
// Benchmarks the graph returned by Cast<bool, float>(num) on "cpu", where
// `num` is the benchmark argument (64 << 10 and 32 << 20 are registered).
// Each iteration is reported as `num` items and
// num * (sizeof(bool) + sizeof(float)) bytes, measured in real time.
static void BM_cpu_bool_float(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
static void BM_cpu_bool_float(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", Cast<bool, float>(num), /*old_benchmark_api=*/false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(bool) + sizeof(float)));
testing::UseRealTime();
test::Benchmark("cpu", Cast<bool, float>(num)).Run(iters);
}
BENCHMARK(BM_cpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
BENCHMARK(BM_cpu_bool_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
// Benchmarks the graph returned by Cast<bool, float>(num) on "gpu", where
// `num` is the benchmark argument. The test::Benchmark call is compiled only
// when GOOGLE_CUDA or TENSORFLOW_USE_ROCM is set; the item and byte counters
// (num items, num * (sizeof(bool) + sizeof(float)) bytes per iteration) are
// set regardless of that guard.
static void BM_gpu_bool_float(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
static void BM_gpu_bool_float(::testing::benchmark::State& state) {
const int num = state.range(0);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
test::Benchmark("gpu", Cast<bool, float>(num), /*old_benchmark_api=*/false)
.Run(state);
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(bool) + sizeof(float)));
testing::UseRealTime();
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
BENCHMARK(BM_gpu_bool_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
// Benchmarks the graph returned by Cast<float, bfloat16>(num) on "cpu", where
// `num` is the benchmark argument (64 << 10 and 32 << 20 are registered).
// Each iteration is reported as `num` items and
// num * (sizeof(float) + sizeof(bfloat16)) bytes, measured in real time.
static void BM_cpu_float_bfloat16(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
static void BM_cpu_float_bfloat16(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", Cast<float, bfloat16>(num),
/*old_benchmark_api=*/false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(float) + sizeof(bfloat16)));
testing::UseRealTime();
test::Benchmark("cpu", Cast<float, bfloat16>(num)).Run(iters);
}
BENCHMARK(BM_cpu_float_bfloat16)->Arg(64 << 10)->Arg(32 << 20);
BENCHMARK(BM_cpu_float_bfloat16)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
// Benchmarks the graph returned by Cast<bfloat16, float>(num) on "cpu", where
// `num` is the benchmark argument (64 << 10 and 32 << 20 are registered).
// Each iteration is reported as `num` items and
// num * (sizeof(float) + sizeof(bfloat16)) bytes, measured in real time.
static void BM_cpu_bfloat16_float(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
static void BM_cpu_bfloat16_float(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", Cast<bfloat16, float>(num),
/*old_benchmark_api=*/false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(float) + sizeof(bfloat16)));
testing::UseRealTime();
test::Benchmark("cpu", Cast<bfloat16, float>(num)).Run(iters);
}
BENCHMARK(BM_cpu_bfloat16_float)->Arg(64 << 10)->Arg(32 << 20);
BENCHMARK(BM_cpu_bfloat16_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
// Benchmarks the graph returned by Cast<float, Eigen::half>(num) on "cpu",
// where `num` is the benchmark argument (64 << 10 and 32 << 20 are
// registered). Each iteration is reported as `num` items and
// num * (sizeof(float) + sizeof(Eigen::half)) bytes, measured in real time.
static void BM_cpu_float_half(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
static void BM_cpu_float_half(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", Cast<float, Eigen::half>(num),
/*old_benchmark_api=*/false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
test::Benchmark("cpu", Cast<float, Eigen::half>(num)).Run(iters);
}
BENCHMARK(BM_cpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
BENCHMARK(BM_cpu_float_half)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
// Benchmarks the graph returned by Cast<Eigen::half, float>(num) on "cpu",
// where `num` is the benchmark argument (64 << 10 and 32 << 20 are
// registered). Each iteration is reported as `num` items and
// num * (sizeof(float) + sizeof(Eigen::half)) bytes, measured in real time.
static void BM_cpu_half_float(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
static void BM_cpu_half_float(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", Cast<Eigen::half, float>(num),
/*old_benchmark_api=*/false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
test::Benchmark("cpu", Cast<Eigen::half, float>(num)).Run(iters);
}
BENCHMARK(BM_cpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
BENCHMARK(BM_cpu_half_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
// GPU counterparts of the float <-> Eigen::half cast benchmarks:
// BM_gpu_float_half runs Cast<float, Eigen::half>(num) and BM_gpu_half_float
// runs Cast<Eigen::half, float>(num), both on "gpu" with `num` taken from the
// benchmark argument. The test::Benchmark calls are compiled only when
// GOOGLE_CUDA or TENSORFLOW_USE_ROCM is set; the counters (num items and
// num * (sizeof(float) + sizeof(Eigen::half)) bytes per iteration) are set
// regardless of that guard.
// NOTE(review): in this span the lines of the two functions, in both their
// int-iters and State-based forms, are interleaved; read each statement
// against the function header it belongs to.
static void BM_gpu_float_half(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
static void BM_gpu_float_half(::testing::benchmark::State& state) {
const int num = state.range(0);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
test::Benchmark("gpu", Cast<float, Eigen::half>(num),
/*old_benchmark_api=*/false)
.Run(state);
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
static void BM_gpu_half_float(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
testing::BytesProcessed(static_cast<int64>(iters) * num *
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(float) + sizeof(Eigen::half)));
testing::UseRealTime();
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
}
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
BENCHMARK(BM_gpu_float_half)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
static void BM_gpu_half_float(::testing::benchmark::State& state) {
const int num = state.range(0);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
test::Benchmark("gpu", Cast<Eigen::half, float>(num),
/*old_benchmark_api=*/false)
.Run(state);
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
(sizeof(float) + sizeof(Eigen::half)));
}
BENCHMARK(BM_gpu_half_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
} // end namespace tensorflow

View File

@ -72,22 +72,21 @@ Graph* SetUpKmeansPlusPlusInitialization(int num_dims, int num_points,
// Benchmarks the graph built by SetUpKmeansPlusPlusInitialization(num_dims,
// num_points, num_to_sample, retries_per_sample) on "cpu". All sizes are
// compile-time template parameters. Each iteration is reported as
// num_points * num_dims * num_to_sample items.
template <int num_points, int num_to_sample, int num_dims,
int retries_per_sample>
void BM_KmeansPlusPlusInitialization(int iters) {
testing::StopTiming();
testing::ItemsProcessed(static_cast<int64>(iters) * num_points * num_dims *
num_to_sample);
testing::UseRealTime();
void BM_KmeansPlusPlusInitialization(::testing::benchmark::State& state) {
Graph* g = SetUpKmeansPlusPlusInitialization(
num_dims, num_points, num_to_sample, retries_per_sample);
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_points *
num_dims * num_to_sample);
}
// Instantiates BM_KmeansPlusPlusInitialization<p, c, d, r> behind a
// non-template wrapper named after the four arguments, and registers that
// wrapper with BENCHMARK. The State-based form requests real-time measurement
// at registration.
#define BENCHMARK_KMEANS_PLUS_PLUS(p, c, d, r) \
void BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r(int iters) { \
BM_KmeansPlusPlusInitialization<p, c, d, r>(iters); \
} \
BENCHMARK(BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r);
#define BENCHMARK_KMEANS_PLUS_PLUS(p, c, d, r) \
void BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r( \
::testing::benchmark::State& state) { \
BM_KmeansPlusPlusInitialization<p, c, d, r>(state); \
} \
BENCHMARK(BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r) \
->UseRealTime();
#define RUN_BM_KmeansPlusPlusInitialization(retries) \
BENCHMARK_KMEANS_PLUS_PLUS(k10Points, k2Centers, k100Dim, retries); \
@ -132,20 +131,18 @@ Graph* SetUpKMC2Initialization(int num_points) {
}
// Benchmarks the graph built by SetUpKMC2Initialization(num_points) on "cpu".
// Each iteration is reported as num_points * num_dims * num_to_sample items.
// NOTE: only num_points is passed to the graph setup here; num_to_sample and
// num_dims affect nothing but the reported item count in this function.
template <int num_points, int num_to_sample, int num_dims>
void BM_KMC2Initialization(int iters) {
testing::StopTiming();
testing::ItemsProcessed(static_cast<int64>(iters) * num_points * num_dims *
num_to_sample);
testing::UseRealTime();
void BM_KMC2Initialization(::testing::benchmark::State& state) {
Graph* g = SetUpKMC2Initialization(num_points);
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_points *
num_dims * num_to_sample);
}
// Instantiates BM_KMC2Initialization<p, c, d> behind a non-template wrapper
// named after the three arguments, and registers that wrapper with BENCHMARK.
// The State-based form requests real-time measurement at registration.
#define BENCHMARK_KMC2(p, c, d) \
void BM_KMC2Initialization_##p##_##c##_##d(int iters) { \
BM_KMC2Initialization<p, c, d>(iters); \
} \
BENCHMARK(BM_KMC2Initialization_##p##_##c##_##d);
#define BENCHMARK_KMC2(p, c, d) \
void BM_KMC2Initialization_##p##_##c##_##d( \
::testing::benchmark::State& state) { \
BM_KMC2Initialization<p, c, d>(state); \
} \
BENCHMARK(BM_KMC2Initialization_##p##_##c##_##d)->UseRealTime();
#define RUN_BM_KMC2Initialization \
BENCHMARK_KMC2(k10Points, k2Centers, k100Dim); \
@ -191,14 +188,11 @@ Graph* SetUpNearestNeighbors(int num_dims, int num_points, int num_centers,
}
// Benchmarks the graph built by SetUpNearestNeighbors(num_dims, num_points,
// num_centers, k) on "cpu". All sizes are compile-time template parameters.
// Each iteration is reported as num_points * num_dims * num_centers items;
// k is passed to the graph setup but is not part of the item count.
template <int num_dims, int num_points, int num_centers, int k>
void BM_NearestNeighbors(int iters) {
testing::StopTiming();
testing::ItemsProcessed(static_cast<int64>(iters) * num_points * num_dims *
num_centers);
testing::UseRealTime();
void BM_NearestNeighbors(::testing::benchmark::State& state) {
Graph* g = SetUpNearestNeighbors(num_dims, num_points, num_centers, k);
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_points *
num_dims * num_centers);
}
constexpr int kTop1 = 1;
@ -206,11 +200,12 @@ constexpr int kTop2 = 2;
constexpr int kTop5 = 5;
constexpr int kTop10 = 10;
// Instantiates BM_NearestNeighbors<d, p, c, k> behind a non-template wrapper
// and registers that wrapper with BENCHMARK. Note that the generated name
// pastes `d` directly after "BM_NearestNeighbors" with no separating
// underscore. The State-based form requests real-time measurement at
// registration.
#define BENCHMARK_NEAREST_NEIGHBORS(d, p, c, k) \
void BM_NearestNeighbors##d##_##p##_##c##_##k(int iters) { \
BM_NearestNeighbors<d, p, c, k>(iters); \
} \
BENCHMARK(BM_NearestNeighbors##d##_##p##_##c##_##k);
#define BENCHMARK_NEAREST_NEIGHBORS(d, p, c, k) \
void BM_NearestNeighbors##d##_##p##_##c##_##k( \
::testing::benchmark::State& state) { \
BM_NearestNeighbors<d, p, c, k>(state); \
} \
BENCHMARK(BM_NearestNeighbors##d##_##p##_##c##_##k)->UseRealTime();
#define RUN_BM_NearestNeighbors(k) \
BENCHMARK_NEAREST_NEIGHBORS(k100Dim, k1kPoints, k100Centers, k); \

View File

@ -57,9 +57,9 @@ void FillTensorWithRandomValues<tstring>(Tensor* t, int string_length,
// std::string, then the length of individual strings in the tensors will be
// of length "string_length".
template <typename T>
static void ConcatHelper(int iters, int concat_dimension, int dim2,
static void ConcatHelper(::testing::benchmark::State& state,
int concat_dimension, int dim2,
int string_length = 0) {
testing::StopTiming();
Graph* g = new Graph(OpRegistry::Global());
DataType dt = DataTypeToEnum<T>::v();
@ -81,49 +81,82 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2,
.Attr("T", dt)
.Finalize(g, &node));
testing::BytesProcessed(static_cast<int64>(iters) * (in0_bytes + in1_bytes));
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
testing::UseRealTime();
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
(in0_bytes + in1_bytes));
}
// Float concat benchmarks: BM_ConcatDim0Float calls ConcatHelper<float> with
// concat_dimension 0 and BM_ConcatDim1Float with concat_dimension 1. `dim2`
// is the benchmark argument; 1000, 100000 and 1000000 are registered.
// NOTE: the State-based definitions are declared without `static`, unlike the
// int-iters definitions.
static void BM_ConcatDim0Float(int iters, int dim2) {
ConcatHelper<float>(iters, 0, dim2);
void BM_ConcatDim0Float(::testing::benchmark::State& state) {
const int dim2 = state.range(0);
ConcatHelper<float>(state, 0, dim2);
}
static void BM_ConcatDim1Float(int iters, int dim2) {
ConcatHelper<float>(iters, 1, dim2);
void BM_ConcatDim1Float(::testing::benchmark::State& state) {
const int dim2 = state.range(0);
ConcatHelper<float>(state, 1, dim2);
}
BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim0Float)
->UseRealTime()
->Arg(1000)
->Arg(100000)
->Arg(1000000);
BENCHMARK(BM_ConcatDim1Float)
->UseRealTime()
->Arg(1000)
->Arg(100000)
->Arg(1000000);
// String concat benchmark along dimension 0 via ConcatHelper<tstring>. Takes
// two benchmark arguments: the first is `dim2`, the second is the
// string_length forwarded to ConcatHelper. Registered pairs are (1, 16),
// (1, 10000) and (100, 16).
static void BM_ConcatDim0String(int iters, int dim2, int string_length) {
ConcatHelper<tstring>(iters, 0, dim2, string_length);
void BM_ConcatDim0String(::testing::benchmark::State& state) {
const int dim2 = state.range(0);
const int string_length = state.range(1);
ConcatHelper<tstring>(state, 0, dim2, string_length);
}
BENCHMARK(BM_ConcatDim0String)
->UseRealTime()
->ArgPair(1, 16)
->ArgPair(1, 10000)
->ArgPair(100, 16);
// Concat benchmarks along dimension 1 for three element types: uint8, int16
// and bfloat16. Each forwards to ConcatHelper<T> with concat_dimension 1 and
// `dim2` taken from the benchmark argument; 1000, 100000 and 1000000 are
// registered for every type.
static void BM_ConcatDim1uint8(int iters, int dim2) {
ConcatHelper<uint8>(iters, 1, dim2);
void BM_ConcatDim1uint8(::testing::benchmark::State& state) {
const int dim2 = state.range(0);
ConcatHelper<uint8>(state, 1, dim2);
}
static void BM_ConcatDim1int16(int iters, int dim2) {
ConcatHelper<int16>(iters, 1, dim2);
void BM_ConcatDim1int16(::testing::benchmark::State& state) {
const int dim2 = state.range(0);
ConcatHelper<int16>(state, 1, dim2);
}
static void BM_ConcatDim1bfloat16(int iters, int dim2) {
ConcatHelper<bfloat16>(iters, 1, dim2);
void BM_ConcatDim1bfloat16(::testing::benchmark::State& state) {
const int dim2 = state.range(0);
ConcatHelper<bfloat16>(state, 1, dim2);
}
BENCHMARK(BM_ConcatDim1uint8)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim1uint8)
->UseRealTime()
->Arg(1000)
->Arg(100000)
->Arg(1000000);
BENCHMARK(BM_ConcatDim1int16)
->UseRealTime()
->Arg(1000)
->Arg(100000)
->Arg(1000000);
BENCHMARK(BM_ConcatDim1bfloat16)
->UseRealTime()
->Arg(1000)
->Arg(100000)
->Arg(1000000);
template <typename T>
static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
testing::StopTiming();
static void ConcatManyHelper(::testing::benchmark::State& state,
int concat_dimension, int dim2) {
Graph* g = new Graph(OpRegistry::Global());
DataType dt = DataTypeToEnum<T>::v();
@ -146,30 +179,25 @@ static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
.Attr("N", 64)
.Attr("T", dt)
.Finalize(g, &node));
testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
kNumInputs * sizeof(T));
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
testing::UseRealTime();
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
dim2 * kNumInputs * sizeof(T));
}
// Many-input bfloat16 concat benchmark: forwards to
// ConcatManyHelper<bfloat16> with concat_dimension 1 and `dim2` taken from
// the benchmark argument.
static void BM_ConcatManyDim1bfloat16(int iters, int dim2) {
ConcatManyHelper<bfloat16>(iters, 1, dim2);
void BM_ConcatManyDim1bfloat16(::testing::benchmark::State& state) {
const int dim2 = state.range(0);
ConcatManyHelper<bfloat16>(state, 1, dim2);
}
BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60);
static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
testing::StopTiming();
BENCHMARK(BM_ConcatManyDim1bfloat16)->UseRealTime()->Arg(18)->Arg(34)->Arg(60);
void MemcpyAlternativeHelper(::testing::benchmark::State& state, int dim2) {
const int kDim1 = 100;
std::vector<float> data1(kDim1 * dim2, 1.0f);
std::vector<float> data2(kDim1 * dim2, 2.0f);
testing::BytesProcessed(static_cast<int64>(iters) *
((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
testing::StartTiming();
while (--iters > 0) {
for (auto s : state) {
const size_t n0 = data1.size();
const size_t n1 = data2.size();
float* result = new float[n0 + n1];
@ -177,24 +205,37 @@ static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
memcpy(&result[n0], &data2[0], n1 * sizeof(float));
delete[] result;
}
state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
}
// memcpy-based baselines for the concat benchmarks; `dim2` is the benchmark
// argument (1000, 100000 and 1000000 are registered).
// NOTE(review): the int-iters forms pass a concat dimension (0 or 1) to
// MemcpyAlternativeHelper, but the State-based forms of Dim0 and Dim1 make
// the identical call MemcpyAlternativeHelper(state, dim2), so in that form
// the two benchmarks differ only by name.
static void BM_MemcpyAlternativeDim0(int iters, int dim2) {
MemcpyAlternativeHelper(iters, 0, dim2);
void BM_MemcpyAlternativeDim0(::testing::benchmark::State& state) {
const int dim2 = state.range(0);
MemcpyAlternativeHelper(state, dim2);
}
static void BM_MemcpyAlternativeDim1(int iters, int dim2) {
MemcpyAlternativeHelper(iters, 1, dim2);
void BM_MemcpyAlternativeDim1(::testing::benchmark::State& state) {
const int dim2 = state.range(0);
MemcpyAlternativeHelper(state, dim2);
}
BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim0)
->UseRealTime()
->Arg(1000)
->Arg(100000)
->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim1)
->UseRealTime()
->Arg(1000)
->Arg(100000)
->Arg(1000000);
// Rank-1, row-major Eigen tensor map over bfloat16 elements, declared with
// Eigen::Unaligned. Used by the MemcpyManyAlternative benchmarks to wrap a
// raw bfloat16 buffer.
typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
Eigen::Unaligned>
EigenMap;
static void MemcpyManyAlternative1(int iters, int dim2) {
testing::StopTiming();
void MemcpyManyAlternative1(::testing::benchmark::State& state) {
int dim2 = state.range(0);
const int kDim1 = 40000;
const int kNumCopies = 64;
const int size = kDim1 * dim2 * kNumCopies;
@ -202,10 +243,7 @@ static void MemcpyManyAlternative1(int iters, int dim2) {
EigenMap map(data, size);
map.setRandom();
testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
kNumCopies * sizeof(bfloat16));
testing::StartTiming();
while (iters-- > 0) {
for (auto s : state) {
std::vector<bfloat16*> inputs(kNumCopies);
for (int i = 0; i < kNumCopies; ++i) {
inputs[i] = &data[i * kDim1 * dim2];
@ -225,11 +263,12 @@ static void MemcpyManyAlternative1(int iters, int dim2) {
delete[] result;
}
delete[] data;
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
dim2 * kNumCopies * sizeof(bfloat16));
}
static void MemcpyManyAlternative2(int iters, int dim2) {
testing::StopTiming();
void MemcpyManyAlternative2(::testing::benchmark::State& state) {
int dim2 = state.range(0);
const int kDim1 = 40000;
const int kNumCopies = 64;
const int size = kDim1 * dim2 * kNumCopies;
@ -237,11 +276,8 @@ static void MemcpyManyAlternative2(int iters, int dim2) {
EigenMap map(data, size);
map.setRandom();
testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
kNumCopies * sizeof(bfloat16));
testing::StartTiming();
std::vector<bfloat16*> inputs(kNumCopies);
while (--iters > 0) {
for (auto s : state) {
bfloat16* result = new bfloat16[size];
for (int i = 0; i < kNumCopies; ++i) {
inputs[i] = &data[i * kDim1 * dim2];
@ -260,6 +296,9 @@ static void MemcpyManyAlternative2(int iters, int dim2) {
delete[] result;
}
delete[] data;
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
dim2 * kNumCopies * sizeof(bfloat16));
}
BENCHMARK(MemcpyManyAlternative1)

View File

@ -114,15 +114,23 @@ static Graph* ManyConsts(int num, bool sequential) {
return g;
}
// Benchmarks the graph returned by ManyConsts(num, /*sequential=*/false) on
// "cpu". `num` is the benchmark argument, swept with Range(1, 1 << 10), and
// each iteration is reported as `num` items.
static void BM_ManyConsts_Parallel(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
test::Benchmark("cpu", ManyConsts(num, false /* !sequential */)).Run(iters);
static void BM_ManyConsts_Parallel(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", ManyConsts(num, false /* !sequential */),
/*old_benchmark_api*/ false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
}
BENCHMARK(BM_ManyConsts_Parallel)->Range(1, 1 << 10);
// Benchmarks the graph returned by ManyConsts(num, /*sequential=*/true) on
// "cpu". `num` is the benchmark argument, swept with Range(1, 1 << 10), and
// each iteration is reported as `num` items.
static void BM_ManyConsts_Sequential(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
test::Benchmark("cpu", ManyConsts(num, true /* sequential */)).Run(iters);
static void BM_ManyConsts_Sequential(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", ManyConsts(num, true /* sequential */),
/*old_benchmark_api*/ false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
}
BENCHMARK(BM_ManyConsts_Sequential)->Range(1, 1 << 10);

View File

@ -309,104 +309,120 @@ static Graph* FusedConv2DWithBatchNorm(
// The following benchmarks are always using 'float' data type with NHWC layout.
// -------------------------------------------------------------------------- //
// Shared reporting helpers for the Conv2D benchmark macros. BM_SETUP (for
// bodies that have an `iters` variable) and BM_SET_INFO (for bodies that have
// a `state` variable) both report N * H * W * C items per iteration and
// attach LABEL as the benchmark label.
// NOTE: the `type` and `NAME` parameters are accepted but not used in either
// expansion.
#define BM_SETUP(N, H, W, C, type, LABEL, NAME) \
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * (C)); \
testing::SetLabel(LABEL);
#define BM_SET_INFO(N, H, W, C, type, LABEL, NAME) \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * (N) * (H) * \
(W) * (C)); \
state.SetLabel(LABEL);
// Builds a unique benchmark function identifier by token-pasting the base
// name, the device type and all shape/filter arguments, separated by
// underscores, in the order name, type, N, H, W, C, FW, FH, FC.
#define BM_NAME(name, type, N, H, W, C, FW, FH, FC) \
name##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
// Defines and registers a Conv2D benchmark on device `type` (stringified
// with #type). The graph is the `.graph` member of
// Conv2D<float>(N, H, W, C, FW, FH, FC). Item count and label are set through
// BM_SETUP in the int-iters form and through BM_SET_INFO, after Run(state),
// in the State-based form.
#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) { \
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
test::Benchmark(#type, Conv2D<float>(N, H, W, C, FW, FH, FC).graph) \
.Run(iters); \
} \
#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, \
FC)(::testing::benchmark::State & state) { \
test::Benchmark(#type, Conv2D<float>(N, H, W, C, FW, FH, FC).graph, \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
} \
BENCHMARK(BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC));
#define BM_Conv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, \
FC)(int iters) { \
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
FC)(::testing::benchmark::State & state) { \
test::Benchmark(#type, \
Conv2DWithBias<float>(N, H, W, C, FW, FH, FC).graph) \
.Run(iters); \
Conv2DWithBias<float>(N, H, W, C, FW, FH, FC).graph, \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
} \
BENCHMARK(BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, FC));
#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
FC)(int iters) { \
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
test::Benchmark(#type, Conv2DWithBiasAndActivation<float>(N, H, W, C, FW, \
FH, FC, "Relu") \
.graph) \
.Run(iters); \
} \
// Defines and registers a benchmark function (named via BM_NAME) that runs the
// `.graph` of Conv2DWithBiasAndActivation<float>(..., "Relu") on the device
// named by the stringized `type`, then records items processed and LABEL via
// BM_SET_INFO after the run.
#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
FC)(::testing::benchmark::State & state) { \
test::Benchmark( \
#type, \
Conv2DWithBiasAndActivation<float>(N, H, W, C, FW, FH, FC, "Relu") \
.graph, \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
} \
BENCHMARK(BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, \
FC)(int iters) { \
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
{"BiasAdd"})) \
.Run(iters); \
} \
// Defines and registers a benchmark function (named via BM_NAME) that runs the
// graph returned by FusedConv2DWithBias<float>(..., {"BiasAdd"}) on the device
// named by the stringized `type`, then records items processed and LABEL via
// BM_SET_INFO after the run.
#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, \
FC)(::testing::benchmark::State & state) { \
test::Benchmark( \
#type, \
FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, {"BiasAdd"}), \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
} \
BENCHMARK(BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, FC));
#define BM_FusedConv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
FC)(int iters) { \
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
{"BiasAdd", "Relu"})) \
.Run(iters); \
FC)(::testing::benchmark::State & state) { \
test::Benchmark(#type, \
FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
{"BiasAdd", "Relu"}), \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
} \
BENCHMARK( \
BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
#define BM_Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
FC)(int iters) { \
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
FC)(::testing::benchmark::State & state) { \
test::Benchmark(#type, \
Conv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC).graph) \
.Run(iters); \
Conv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC).graph, \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
} \
BENCHMARK(BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
#define BM_Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, \
FC)(int iters) { \
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
test::Benchmark(#type, Conv2DWithBatchNormAndActivation<float>( \
N, H, W, C, FW, FH, FC, "Relu") \
.graph) \
.Run(iters); \
FC)(::testing::benchmark::State & state) { \
test::Benchmark(#type, \
Conv2DWithBatchNormAndActivation<float>(N, H, W, C, FW, \
FH, FC, "Relu") \
.graph, \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
} \
BENCHMARK( \
BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, FC));
#define BM_FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL) \
static void BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
FC)(int iters) { \
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
test::Benchmark(#type, FusedConv2DWithBatchNorm<float>( \
N, H, W, C, FW, FH, FC, {"FusedBatchNorm"})) \
.Run(iters); \
FC)(::testing::benchmark::State & state) { \
test::Benchmark(#type, \
FusedConv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC, \
{"FusedBatchNorm"}), \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
} \
BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
#define BM_FusedConv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, \
LABEL) \
static void BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, \
FW, FH, FC)(int iters) { \
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
test::Benchmark( \
#type, FusedConv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC, \
{"FusedBatchNorm", "Relu"})) \
.Run(iters); \
FW, FH, FC)(::testing::benchmark::State & state) { \
test::Benchmark(#type, \
FusedConv2DWithBatchNorm<float>( \
N, H, W, C, FW, FH, FC, {"FusedBatchNorm", "Relu"}), \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
} \
BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, FW, \
FH, FC));
@ -561,11 +577,12 @@ BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32");
#define BM_Conv2DFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type) \
static void BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, \
FC)(int iters) { \
BM_SETUP(N, H, W, C, type, "", Conv2D); \
FC)(::testing::benchmark::State & state) { \
test::Benchmark(#type, \
Conv2D<T>(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph) \
.Run(iters); \
Conv2D<T>(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph, \
/*old_benchmark_api=*/false) \
.Run(state); \
BM_SET_INFO(N, H, W, C, type, "", Conv2D); \
} \
BENCHMARK(BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, FC));

View File

@ -42,15 +42,19 @@ int RowsAndColsArg(int r, int c) { return r * kRows + c; }
// Recovers the row component of an argument packed as r * kRows + c (see
// RowsAndColsArg): the quotient of `arg` by kRows.
int RowsFromArg(int arg) {
  return arg / kRows;
}
// Recovers the column component of an argument packed as r * kRows + c (see
// RowsAndColsArg): the remainder of `arg` modulo kRows.
int ColsFromArg(int arg) {
  return arg % kRows;
}
#define BM_UNARY(DEVICE, FUNC, T, TYPE) \
void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) { \
const int64 tot = static_cast<int64>(iters) * num; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(T)); \
test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
// Defines and registers BM_<DEVICE>_<FUNC>_<TYPE>, which benchmarks the graph
// returned by Unary<T>(#FUNC, num, TYPE) on DEVICE, with `num` taken from the
// benchmark's range argument. Items and bytes processed are reported after the
// run, scaled by the iteration count; timing uses real time.
#define BM_UNARY(DEVICE, FUNC, T, TYPE)                                    \
  void BM_##DEVICE##_##FUNC##_##TYPE(::testing::benchmark::State& state) { \
    const int num_elements = state.range(0);                               \
    test::Benchmark(#DEVICE, Unary<T>(#FUNC, num_elements, TYPE),          \
                    /*old_benchmark_api=*/false)                           \
        .Run(state);                                                       \
    const int64 total_items =                                              \
        static_cast<int64>(state.iterations()) * num_elements;             \
    state.SetItemsProcessed(total_items);                                  \
    state.SetBytesProcessed(total_items * sizeof(T));                      \
  }                                                                        \
  BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)                                 \
      ->UseRealTime()                                                      \
      ->Range(4 << 10, 1 << 20);
BM_UNARY(cpu, Floor, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@ -101,27 +105,30 @@ Graph* BinaryScalar(int num, const string& func) {
return g;
}
#define BM_BINARY_SCALAR(DEVICE, FUNC) \
void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) { \
const int64 tot = static_cast<int64>(iters) * num; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \
->Arg(1 << 12) /* must >= 4096 */ \
->Arg(1 << 13) \
->Arg(1 << 14) \
->Arg((1 << 15) - (1 << 13)) \
->Arg(1 << 15) \
->Arg((1 << 15) + (1 << 14)) \
->Arg(1 << 16) \
->Arg((1 << 17) - (1 << 15)) \
->Arg(1 << 17) \
->Arg((1 << 17) + (1 << 16)) \
->Arg(1 << 18) \
->Arg(1 << 19) \
// Defines and registers BM_<DEVICE>_<FUNC>_scalar, which benchmarks the graph
// returned by BinaryScalar(num, #FUNC) on DEVICE for a fixed list of sizes.
// Items and bytes processed are reported after the run, scaled by the
// iteration count. The pre-migration body called testing::UseRealTime(), so
// the registration requests real-time measurement to keep results comparable.
#define BM_BINARY_SCALAR(DEVICE, FUNC)                                     \
  void BM_##DEVICE##_##FUNC##_scalar(::testing::benchmark::State& state) { \
    const int num = state.range(0);                                        \
    test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC),                     \
                    /*old_benchmark_api=*/false)                           \
        .Run(state);                                                       \
    const int64 tot = static_cast<int64>(state.iterations()) * num;        \
    state.SetItemsProcessed(tot);                                          \
    state.SetBytesProcessed(tot * sizeof(float));                          \
  }                                                                        \
  BENCHMARK(BM_##DEVICE##_##FUNC##_scalar)                                 \
      ->UseRealTime()                                                      \
      ->Arg(1 << 12) /* must >= 4096 */                                    \
      ->Arg(1 << 13)                                                       \
      ->Arg(1 << 14)                                                       \
      ->Arg((1 << 15) - (1 << 13))                                         \
      ->Arg(1 << 15)                                                       \
      ->Arg((1 << 15) + (1 << 14))                                         \
      ->Arg(1 << 16)                                                       \
      ->Arg((1 << 17) - (1 << 15))                                         \
      ->Arg(1 << 17)                                                       \
      ->Arg((1 << 17) + (1 << 16))                                         \
      ->Arg(1 << 18)                                                       \
      ->Arg(1 << 19)                                                       \
      ->Arg(1 << 20);
BM_BINARY_SCALAR(cpu, Less);
@ -173,17 +180,19 @@ Graph* CubeWithMulSquare(int num) {
return g;
}
#define BM_CUBE(DEVICE, Impl) \
void BM_##DEVICE##_Cube_##Impl(int iters, int num) { \
const int64 tot = static_cast<int64>(iters) * num; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, Impl(num)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_Cube_##Impl) \
->Arg(1 << 12) /* must >= 4096 */ \
->Arg(1 << 16) \
// Defines and registers BM_<DEVICE>_Cube_<Impl>, which benchmarks the graph
// returned by Impl(num) on DEVICE, with `num` taken from the benchmark's range
// argument. The benchmark is driven through the State-based API
// (old_benchmark_api=false, Run(state)) like the other macros in this file;
// the iteration count is only read after the run, when reporting items and
// bytes processed.
#define BM_CUBE(DEVICE, Impl)                                          \
  void BM_##DEVICE##_Cube_##Impl(::testing::benchmark::State& state) { \
    const int num = state.range(0);                                    \
    test::Benchmark(#DEVICE, Impl(num), /*old_benchmark_api=*/false)   \
        .Run(state);                                                   \
    const int64 tot = static_cast<int64>(state.iterations()) * num;    \
    state.SetItemsProcessed(tot);                                      \
    state.SetBytesProcessed(tot * sizeof(float));                      \
  }                                                                    \
  BENCHMARK(BM_##DEVICE##_Cube_##Impl)                                 \
      ->UseRealTime()                                                  \
      ->Arg(1 << 12) /* must >= 4096 */                                \
      ->Arg(1 << 16)                                                   \
      ->Arg(1 << 20);
BM_CUBE(cpu, CubeWithPow3);
@ -211,17 +220,21 @@ Graph* BiasAdd(int rows, int cols, DataType type) {
return g;
}
#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C) \
void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C(int iters, int arg) { \
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(C_TYPE)); \
test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C) \
// Defines and registers BM_<DEVICE>_<C_TYPE>_BiasAdd_R<R>_C<C>, which
// benchmarks the graph returned by BiasAdd<C_TYPE>(rows, cols, TF_TYPE) on
// DEVICE. Rows and columns are unpacked from the single benchmark argument
// produced by RowsAndColsArg(R, C). The item total is computed only after
// Run(state) returns, so it reflects the iterations actually executed.
#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C)                         \
  void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C(                       \
      ::testing::benchmark::State& state) {                                \
    const int arg = state.range(0);                                        \
    const int rows = RowsFromArg(arg);                                     \
    const int cols = ColsFromArg(arg);                                     \
    test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE),         \
                    /*old_benchmark_api=*/false)                           \
        .Run(state);                                                       \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
    state.SetItemsProcessed(tot);                                          \
    state.SetBytesProcessed(tot * sizeof(C_TYPE));                         \
  }                                                                        \
  BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C)                  \
      ->UseRealTime()                                                      \
      ->Arg(RowsAndColsArg(R, C));
#define BM_BIAS_ADD_ALL(DEVICE, C_TYPE, TF_TYPE) \
@ -264,16 +277,21 @@ Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
#define BM_BIAS_ADD_GRAD(DEVICE, FMT, C_TYPE, TF_TYPE, R, C, CH) \
void BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH( \
int iters, int arg, int channels) { \
::testing::benchmark::State& state) { \
const int arg = state.range(0); \
const int channels = state.range(1); \
\
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols * channels; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(C_TYPE)); \
test::Benchmark(#DEVICE, BiasAddGrad<C_TYPE>(rows, cols, channels, \
TF_TYPE, FORMAT_##FMT)) \
.Run(iters); \
test::Benchmark( \
#DEVICE, \
BiasAddGrad<C_TYPE>(rows, cols, channels, TF_TYPE, FORMAT_##FMT), \
/*old_benchmark_api=*/false) \
.Run(state); \
const int64 tot = \
static_cast<int64>(state.iterations()) * rows * cols * channels; \
state.SetItemsProcessed(tot); \
state.SetBytesProcessed(tot * sizeof(C_TYPE)); \
} \
BENCHMARK(BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH) \
->ArgPair(RowsAndColsArg(R, C), CH);
@ -326,16 +344,20 @@ Graph* BcastAdd(int rows, int cols, int dim) {
return g;
}
#define BM_BCAST_ADD_ROW(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddRow_R##R##_C##C(int iters, int arg) { \
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters); \
} \
// Defines and registers BM_<DEVICE>_BcastAddRow_R<R>_C<C>, which benchmarks
// the graph returned by BcastAdd(rows, cols, 0) on DEVICE. Rows and columns
// are unpacked from the single benchmark argument produced by
// RowsAndColsArg(R, C). Items and bytes processed are reported after the run.
// Real-time measurement is requested, matching the pre-migration body
// (testing::UseRealTime()) and the sibling BcastAdd benchmark macros.
#define BM_BCAST_ADD_ROW(DEVICE, R, C)                                     \
  void BM_##DEVICE##_BcastAddRow_R##R##_C##C(                              \
      ::testing::benchmark::State& state) {                                \
    const int arg = state.range(0);                                        \
    const int rows = RowsFromArg(arg);                                     \
    const int cols = ColsFromArg(arg);                                     \
    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0),                      \
                    /*old_benchmark_api=*/false)                           \
        .Run(state);                                                       \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
    state.SetItemsProcessed(tot);                                          \
    state.SetBytesProcessed(tot * sizeof(float));                          \
  }                                                                        \
  BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)                         \
      ->UseRealTime()                                                      \
      ->Arg(RowsAndColsArg(R, C));
#define BM_BCAST_ADD_ROW_ALL(DEVICE) \
@ -350,17 +372,24 @@ BM_BCAST_ADD_ROW_ALL(gpu);
#undef BM_BCAST_ADD_ROW_ALL
#undef BM_BCAST_ADD_ROW
#define BM_BCAST_ADD_COL(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCol_R##R##_C##C(int iters, int arg) { \
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
// Defines and registers BM_<DEVICE>_BcastAddCol_R<R>_C<C>, which benchmarks
// the graph returned by BcastAdd(rows, cols, 1) on DEVICE. Rows and columns
// are unpacked from the single benchmark argument produced by
// RowsAndColsArg(R, C). Items and bytes processed are reported after the run;
// timing uses real time.
#define BM_BCAST_ADD_COL(DEVICE, R, C)                                    \
  void BM_##DEVICE##_BcastAddCol_R##R##_C##C(                             \
      ::testing::benchmark::State& state) {                               \
    const int packed_arg = state.range(0);                                \
    const int num_rows = RowsFromArg(packed_arg);                         \
    const int num_cols = ColsFromArg(packed_arg);                         \
    test::Benchmark(#DEVICE, BcastAdd(num_rows, num_cols, 1),             \
                    /*old_benchmark_api=*/false)                          \
        .Run(state);                                                      \
    const int64 total_items =                                             \
        static_cast<int64>(state.iterations()) * num_rows * num_cols;     \
    state.SetItemsProcessed(total_items);                                 \
    state.SetBytesProcessed(total_items * sizeof(float));                 \
  }                                                                       \
  BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)                        \
      ->UseRealTime()                                                     \
      ->Arg(RowsAndColsArg(R, C));
#define BM_BCAST_ADD_COL_ALL(DEVICE) \
BM_BCAST_ADD_COL(DEVICE, 512, 2048); \
@ -374,17 +403,23 @@ BM_BCAST_ADD_COL_ALL(gpu);
#undef BM_BCAST_ADD_COL_ALL
#undef BM_BCAST_ADD_COL
#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C(int iters, int arg) { \
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C) \
// Defines and registers BM_<DEVICE>_BcastAddCrossRC_R<R>_C<C>, which
// benchmarks the graph returned by BcastAdd(rows, cols, 2) on DEVICE. Rows and
// columns are unpacked from the single benchmark argument produced by
// RowsAndColsArg(R, C). Items and bytes processed are reported after the run;
// timing uses real time.
#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C)                               \
  void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C(                         \
      ::testing::benchmark::State& state) {                               \
    const int packed_arg = state.range(0);                                \
    const int num_rows = RowsFromArg(packed_arg);                         \
    const int num_cols = ColsFromArg(packed_arg);                         \
    test::Benchmark(#DEVICE, BcastAdd(num_rows, num_cols, 2),             \
                    /*old_benchmark_api=*/false)                          \
        .Run(state);                                                      \
    const int64 total_items =                                             \
        static_cast<int64>(state.iterations()) * num_rows * num_cols;     \
    state.SetItemsProcessed(total_items);                                 \
    state.SetBytesProcessed(total_items * sizeof(float));                 \
  }                                                                       \
  BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C)                    \
      ->UseRealTime()                                                     \
      ->Arg(RowsAndColsArg(R, C));
#define BM_BCAST_ADD_CROSS_RC_ALL(DEVICE) \
@ -399,17 +434,22 @@ BM_BCAST_ADD_CROSS_RC_ALL(gpu);
#undef BM_BCAST_ADD_CROSS_RC_ALL
#undef BM_BCAST_ADD_CROSS_RC
#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C(int iters, int arg) { \
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C) \
// Defines and registers BM_<DEVICE>_BcastAddCrossCR_R<R>_C<C>, which
// benchmarks the graph returned by BcastAdd(rows, cols, 3) on DEVICE. Rows and
// columns are unpacked from the single benchmark argument produced by
// RowsAndColsArg(R, C). Items and bytes processed are reported after the run;
// timing uses real time.
#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C)                               \
  void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C(                         \
      ::testing::benchmark::State& state) {                               \
    const int packed_arg = state.range(0);                                \
    const int num_rows = RowsFromArg(packed_arg);                         \
    const int num_cols = ColsFromArg(packed_arg);                         \
    test::Benchmark(#DEVICE, BcastAdd(num_rows, num_cols, 3),             \
                    /*old_benchmark_api=*/false)                          \
        .Run(state);                                                      \
    const int64 total_items =                                             \
        static_cast<int64>(state.iterations()) * num_rows * num_cols;     \
    state.SetItemsProcessed(total_items);                                 \
    state.SetBytesProcessed(total_items * sizeof(float));                 \
  }                                                                       \
  BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C)                    \
      ->UseRealTime()                                                     \
      ->Arg(RowsAndColsArg(R, C));
#define BM_BCAST_ADD_CROSS_CR_ALL(DEVICE) \

View File

@ -273,10 +273,10 @@ TEST_F(ExecutorTest, ControlDependenciesFromSpecialNodes) {
EXPECT_EQ(3.0, V(retvals[0])); // out = 1.0 + 2.0 = 3.0
}
static void BM_executor(int iters, int width, int depth) {
#ifdef PLATFORM_GOOGLE
BenchmarkUseRealTime();
#endif // PLATFORM_GOOGLE
void BM_executor(::testing::benchmark::State& state) {
const int width = state.range(0);
const int depth = state.range(1);
Graph* g = new Graph(OpRegistry::Global());
random::PhiloxRandom philox(1729, 17);
random::SimplePhilox rand(&philox);
@ -306,30 +306,28 @@ static void BM_executor(int iters, int width, int depth) {
}
}
FixupSourceAndSinkEdges(g);
#ifdef PLATFORM_GOOGLE
SetBenchmarkLabel(strings::StrCat("Nodes = ", cur));
SetBenchmarkItemsProcessed(cur * static_cast<int64>(iters));
#endif // PLATFORM_GOOGLE
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
"SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false)
.Run(state);
state.SetLabel(strings::StrCat("Nodes = ", cur));
state.SetItemsProcessed(cur * static_cast<int64>(state.iterations()));
}
// Tall skinny graphs
BENCHMARK(BM_executor)->ArgPair(16, 1024);
BENCHMARK(BM_executor)->ArgPair(32, 8192);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(16, 1024);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(32, 8192);
// Short fat graphs
BENCHMARK(BM_executor)->ArgPair(1024, 16);
BENCHMARK(BM_executor)->ArgPair(8192, 32);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 16);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(8192, 32);
// Tall fat graph
BENCHMARK(BM_executor)->ArgPair(1024, 1024);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 1024);
void BM_const_identity(::testing::benchmark::State& state) {
const int width = state.range(0);
const int outputs_per_const = state.range(1);
static void BM_const_identity(int iters, int width, int outputs_per_const) {
#ifdef PLATFORM_GOOGLE
BenchmarkUseRealTime();
#endif // PLATFORM_GOOGLE
Graph* g = new Graph(OpRegistry::Global());
for (int i = 0; i < width; ++i) {
Tensor i_t(i);
@ -339,22 +337,20 @@ static void BM_const_identity(int iters, int width, int outputs_per_const) {
}
}
FixupSourceAndSinkEdges(g);
#ifdef PLATFORM_GOOGLE
SetBenchmarkLabel(
strings::StrCat("Nodes = ", (1 + outputs_per_const) * width));
SetBenchmarkItemsProcessed((1 + outputs_per_const) * width *
static_cast<int64>(iters));
#endif // PLATFORM_GOOGLE
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
"SINGLE_THREADED_EXECUTOR",
/*old_benchmark_api=*/false)
.Run(state);
state.SetLabel(strings::StrCat("Nodes = ", (1 + outputs_per_const) * width));
state.SetItemsProcessed((1 + outputs_per_const) * width *
static_cast<int64>(state.iterations()));
}
// Graph with actual op execution.
BENCHMARK(BM_const_identity)->ArgPair(1, 1);
BENCHMARK(BM_const_identity)->ArgPair(1, 100);
BENCHMARK(BM_const_identity)->ArgPair(100, 1);
BENCHMARK(BM_const_identity)->ArgPair(100, 100);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(1, 1);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(1, 100);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(100, 1);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(100, 100);
// TODO(mrry): This benchmark currently crashes with a use-after free, because
// test::Benchmark::RunWithArgs() assumes that the executor will take ownership
@ -368,7 +364,7 @@ BENCHMARK(BM_const_identity)->ArgPair(100, 100);
#define ALICE "/job:j/replica:0/task:0/cpu:0"
#define BOB "/job:j/replica:0/task:0/gpu:0"
static void BM_FeedInputFetchOutput(int iters) {
static void BM_FeedInputFetchOutput(::testing::benchmark::State& state) {
Graph* g = new Graph(OpRegistry::Global());
// z = x + y: x and y are provided as benchmark inputs. z is the
// output of the benchmark. Conceptually, the caller is ALICE, the
@ -380,10 +376,10 @@ static void BM_FeedInputFetchOutput(int iters) {
FixupSourceAndSinkEdges(g);
Tensor val(DT_FLOAT, TensorShape({}));
val.scalar<float>()() = 3.14;
SetBenchmarkItemsProcessed(static_cast<int64>(iters));
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.RunWithArgs({{x, val}, {y, val}}, {z}, iters);
"SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false)
.RunWithArgs({{x, val}, {y, val}}, {z}, state);
state.SetItemsProcessed(state.iterations());
}
BENCHMARK(BM_FeedInputFetchOutput);
#endif

View File

@ -247,7 +247,7 @@ TEST_F(DequantizeOpTest, DequantizeScaledQint8Axis3) {
}
template <typename T>
static void BM_DequantizeMinCombinedCpu(int iters) {
static void BM_DequantizeMinCombinedCpu(::testing::benchmark::State& state) {
auto root = Scope::NewRootScope().ExitOnError();
const int64 num_values = 1500 * 250;
std::vector<T> inputs;
@ -262,25 +262,26 @@ static void BM_DequantizeMinCombinedCpu(int iters) {
Graph* g = new Graph(OpRegistry::Global());
TF_CHECK_OK(root.ToGraph(g));
test::Benchmark("cpu", g).Run(iters);
testing::BytesProcessed(iters * num_values * (sizeof(float) + sizeof(T)));
testing::ItemsProcessed(iters);
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
state.SetBytesProcessed(state.iterations() * num_values *
(sizeof(float) + sizeof(T)));
state.SetItemsProcessed(state.iterations());
}
static void BM_DequantizeMinCombinedCpuQuint16(int iters) {
BM_DequantizeMinCombinedCpu<quint16>(iters);
// Non-template entry point that forwards to the quint16 instantiation of
// BM_DequantizeMinCombinedCpu so it can be registered with BENCHMARK().
void BM_DequantizeMinCombinedCpuQuint16(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<quint16>(state);
}
static void BM_DequantizeMinCombinedCpuQint16(int iters) {
BM_DequantizeMinCombinedCpu<qint16>(iters);
// Non-template entry point that forwards to the qint16 instantiation of
// BM_DequantizeMinCombinedCpu.
void BM_DequantizeMinCombinedCpuQint16(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<qint16>(state);
}
static void BM_DequantizeMinCombinedCpuQuint8(int iters) {
BM_DequantizeMinCombinedCpu<quint8>(iters);
// Non-template entry point that forwards to the quint8 instantiation of
// BM_DequantizeMinCombinedCpu.
void BM_DequantizeMinCombinedCpuQuint8(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<quint8>(state);
}
static void BM_DequantizeMinCombinedCpuQint8(int iters) {
BM_DequantizeMinCombinedCpu<qint8>(iters);
// Non-template entry point that forwards to the qint8 instantiation of
// BM_DequantizeMinCombinedCpu so it can be registered with BENCHMARK().
void BM_DequantizeMinCombinedCpuQint8(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<qint8>(state);
}
BENCHMARK(BM_DequantizeMinCombinedCpuQuint16);
@ -289,7 +290,8 @@ BENCHMARK(BM_DequantizeMinCombinedCpuQuint8);
BENCHMARK(BM_DequantizeMinCombinedCpuQint8);
template <typename T>
static void BM_DequantizeBfloat16MinCombinedCpu(int iters) {
static void BM_DequantizeBfloat16MinCombinedCpu(
::testing::benchmark::State& state) {
auto root = Scope::NewRootScope().ExitOnError();
const int64 num_values = 1500 * 250;
std::vector<T> inputs;
@ -304,25 +306,30 @@ static void BM_DequantizeBfloat16MinCombinedCpu(int iters) {
Graph* g = new Graph(OpRegistry::Global());
TF_CHECK_OK(root.ToGraph(g));
test::Benchmark("cpu", g).Run(iters);
testing::BytesProcessed(iters * num_values * (sizeof(bfloat16) + sizeof(T)));
testing::ItemsProcessed(iters);
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
state.SetBytesProcessed(state.iterations() * num_values *
(sizeof(bfloat16) + sizeof(T)));
state.SetItemsProcessed(state.iterations());
}
static void BM_DequantizeBfloat16MinCombinedCpuQuint16(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<quint16>(iters);
// Non-template entry point that forwards to the quint16 instantiation of
// BM_DequantizeBfloat16MinCombinedCpu so it can be registered with BENCHMARK().
void BM_DequantizeBfloat16MinCombinedCpuQuint16(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<quint16>(state);
}
static void BM_DequantizeBfloat16MinCombinedCpuQint16(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<qint16>(iters);
// Non-template entry point that forwards to the qint16 instantiation of
// BM_DequantizeBfloat16MinCombinedCpu.
void BM_DequantizeBfloat16MinCombinedCpuQint16(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<qint16>(state);
}
static void BM_DequantizeBfloat16MinCombinedCpuQuint8(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<quint8>(iters);
// Non-template entry point that forwards to the quint8 instantiation of
// BM_DequantizeBfloat16MinCombinedCpu.
void BM_DequantizeBfloat16MinCombinedCpuQuint8(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<quint8>(state);
}
static void BM_DequantizeBfloat16MinCombinedCpuQint8(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<qint8>(iters);
// Non-template entry point that forwards to the qint8 instantiation of
// BM_DequantizeBfloat16MinCombinedCpu.
void BM_DequantizeBfloat16MinCombinedCpuQint8(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<qint8>(state);
}
BENCHMARK(BM_DequantizeBfloat16MinCombinedCpuQuint16);

View File

@ -30,12 +30,13 @@ static Graph* Diag(int n, DataType type) {
return g;
}
#define BM_DiagDev(N, T, TFTYPE, DEVICE) \
static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * N); \
test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE)).Run(iters); \
} \
// Defines and registers BM_Diag_<N>_<TFTYPE>_<DEVICE>, which benchmarks the
// graph returned by Diag<T>(N, TFTYPE) on DEVICE and reports
// iterations * N * N items processed after the run. The pre-migration body
// called testing::UseRealTime(), so the registration requests real-time
// measurement to keep results comparable.
#define BM_DiagDev(N, T, TFTYPE, DEVICE)                                      \
  static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(                            \
      ::testing::benchmark::State& state) {                                   \
    test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE), /*old_benchmark_api=*/false) \
        .Run(state);                                                          \
    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * N);  \
  }                                                                           \
  BENCHMARK(BM_Diag##_##N##_##TFTYPE##_##DEVICE)->UseRealTime();
#define BM_Diag(N) \