Internal tests cleanup.
PiperOrigin-RevId: 339390176
Change-Id: Ie0480a0d8d78bb1a50db434c7f456d407a72444c
parent e7715df2de, commit 5008bbbca4
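
All of the diffs below apply the same mechanical migration: benchmark functions that took an explicit "int iters" argument and reported counters through the old testing::ItemsProcessed / testing::BytesProcessed / testing::UseRealTime free functions are rewritten against ::testing::benchmark::State. Arguments move to state.range(...), graph benchmarks pass /*old_benchmark_api=*/false to test::Benchmark(...) and call .Run(state), counters are set on state after the run, and real-time mode is attached to the BENCHMARK(...) registration instead. A minimal sketch of the before/after pattern; BM_Example and MakeGraph are illustrative placeholders, not names from this change:

// Before: the framework passed the iteration count directly.
//
//   static void BM_Example(int iters, int num) {
//     testing::UseRealTime();
//     testing::ItemsProcessed(static_cast<int64>(iters) * num);
//     test::Benchmark("cpu", MakeGraph(num)).Run(iters);
//   }
//   BENCHMARK(BM_Example)->Arg(1 << 10);

// After: the State object drives the iteration loop and owns the counters.
// (Sketch only; assumes the test::Benchmark helper and int64 alias that these
// test files already include. MakeGraph is a hypothetical graph builder.)
static void BM_Example(::testing::benchmark::State& state) {
  const int num = state.range(0);
  test::Benchmark("cpu", MakeGraph(num), /*old_benchmark_api=*/false)
      .Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
}
BENCHMARK(BM_Example)->UseRealTime()->Arg(1 << 10);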
@@ -64,13 +64,16 @@ static void MulChain(int chain_length, Graph** init_g, Graph** run_g) {
// Benchmark a chain of simple multiplications.
// This emphasizes per-op overhead.
static void BM_MulChain(int iters, int chain_length) {
const int64 tot = static_cast<int64>(iters) * chain_length;
testing::ItemsProcessed(tot);
static void BM_MulChain(::testing::benchmark::State& state) {
const int chain_length = state.range(0);
Graph* init;
Graph* run;
MulChain(chain_length, &init, &run);
test::Benchmark("cpu", run, GetOptions(), init).Run(iters);
test::Benchmark("cpu", run, GetOptions(), init, nullptr, "",
                /*old_benchmark_api=*/false)
    .Run(state);
state.SetItemsProcessed(state.iterations());
}
BENCHMARK(BM_MulChain)->Arg(1 << 10);
@@ -115,7 +115,7 @@ class ThroughputBenchmark {
ThroughputBenchmark& operator=(const ThroughputBenchmark&) = delete;
// Perform the benchmark run, based on the parameters supplied to the ctor.
void RunBenchmark(int iters);
void RunBenchmark(::testing::benchmark::State& state);
private:
// Resets all mutable state, including the scheduler.
@@ -136,22 +136,18 @@ ThroughputBenchmark::ThroughputBenchmark(
const BasicBatchScheduler<BenchmarkBatchTask>::Options& scheduler_options)
: scheduler_options_(scheduler_options) {}
void ThroughputBenchmark::RunBenchmark(int iters) {
CHECK_GE(iters, 1);
void ThroughputBenchmark::RunBenchmark(::testing::benchmark::State& state) {
CHECK_GE(state.max_iterations, 1);
testing::StopTiming();
ResetState();
// Have each iteration issue a reasonably large number of tasks, to ensure our
// measurements reflect steady-state behavior.
const int kNumTasksPerIteration = 100 * 1000;
testing::ItemsProcessed(iters * kNumTasksPerIteration);
testing::UseRealTime();
testing::StartTiming();
// Schedule 'num_iterations_*kNumTasksPerIteration' tasks.
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
for (int j = 0; j < kNumTasksPerIteration; ++j) {
auto task = std::unique_ptr<BenchmarkBatchTask>(new BenchmarkBatchTask);
TF_CHECK_OK(scheduler_->Schedule(&task));
@@ -160,7 +156,7 @@ void ThroughputBenchmark::RunBenchmark(int iters) {
// Wait for the scheduler to process all tasks.
scheduler_.reset();
testing::StopTiming();
state.SetItemsProcessed(state.iterations() * kNumTasksPerIteration);
}
void ThroughputBenchmark::ResetState() {
@@ -338,7 +334,8 @@ void LatencyBenchmark::PerformBatchCpuWork() const {
CHECK_NE(dummy, 0);
}
static void RunThroughputBenchmark(int iters, int64 batch_timeout_micros,
static void RunThroughputBenchmark(::testing::benchmark::State& state,
                                   int64 batch_timeout_micros,
                                   int num_batch_threads) {
BasicBatchScheduler<BenchmarkBatchTask>::Options scheduler_options;
const int kMaxBatchSize = 100;
@@ -347,13 +344,14 @@ static void RunThroughputBenchmark(int iters, int64 batch_timeout_micros,
scheduler_options.num_batch_threads = num_batch_threads;
scheduler_options.max_enqueued_batches = INT_MAX;  // Unbounded queue.
ThroughputBenchmark benchmark(scheduler_options);
benchmark.RunBenchmark(iters);
benchmark.RunBenchmark(state);
}
static void ThroughputBM_ZeroTimeout(int iters, int num_batch_threads) {
RunThroughputBenchmark(iters, 0 /* 0 ms timeout */, num_batch_threads);
static void ThroughputBM_ZeroTimeout(::testing::benchmark::State& state) {
RunThroughputBenchmark(state, 0 /* 0 ms timeout */, state.range(0));
}
BENCHMARK(ThroughputBM_ZeroTimeout)
    ->UseRealTime()
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
@@ -362,10 +360,11 @@ BENCHMARK(ThroughputBM_ZeroTimeout)
    ->Arg(32)
    ->Arg(64);
static void ThroughputBM_SmallTimeout(int iters, int num_batch_threads) {
RunThroughputBenchmark(iters, 1 * 1000 /* 1 ms timeout */, num_batch_threads);
static void ThroughputBM_SmallTimeout(::testing::benchmark::State& state) {
RunThroughputBenchmark(state, 1 * 1000 /* 1 ms timeout */, state.range(0));
}
BENCHMARK(ThroughputBM_SmallTimeout)
    ->UseRealTime()
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
@@ -374,11 +373,11 @@ BENCHMARK(ThroughputBM_SmallTimeout)
    ->Arg(32)
    ->Arg(64);
static void ThroughputBM_LargeTimeout(int iters, int num_batch_threads) {
RunThroughputBenchmark(iters, 50 * 1000 /* 50 ms timeout */,
                       num_batch_threads);
static void ThroughputBM_LargeTimeout(::testing::benchmark::State& state) {
RunThroughputBenchmark(state, 50 * 1000 /* 50 ms timeout */, state.range(0));
}
BENCHMARK(ThroughputBM_LargeTimeout)
    ->UseRealTime()
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
@@ -43,22 +43,27 @@ static Graph* BiasAddGrad(int d0, int d1, int d2, int d3) {
return g;
}
#define BM_BiasAddNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
test::Benchmark(#DEVICE, BiasAdd(N, H, W, C)).Run(iters); \
} \
BENCHMARK(BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE);
#define BM_BiasAddNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \
    ::testing::benchmark::State& state) { \
test::Benchmark(#DEVICE, BiasAdd(N, H, W, C), /*old_benchmark_api=*/false) \
    .Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * H * \
                        W * C); \
} \
BENCHMARK(BM_BiasAddNHWC##_##N##_##H##_##W##_##C##_##DEVICE)->UseRealTime();
#define BM_BiasAddGradNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \
    int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
test::Benchmark(#DEVICE, BiasAddGrad(N, H, W, C)).Run(iters); \
} \
BENCHMARK(BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE);
#define BM_BiasAddGradNHWC(N, W, H, C, DEVICE) \
static void BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE( \
    ::testing::benchmark::State& state) { \
test::Benchmark(#DEVICE, BiasAddGrad(N, H, W, C), \
                /*old_benchmark_api=*/false) \
    .Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * H * \
                        W * C); \
} \
BENCHMARK(BM_BiasAddGradNHWC##_##N##_##H##_##W##_##C##_##DEVICE) \
    ->UseRealTime();
// CPU
BM_BiasAddNHWC(32, 32, 32, 128, cpu);
@@ -45,11 +45,15 @@ static Graph* Bincount(int arr_size, int nbins) {
return g;
}
#define BM_BincountDev(K, NBINS, type) \
static void BM_Bincount##_##type##_##K##_##NBINS(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * K * 1024); \
test::Benchmark(#type, Bincount(K * 1024, NBINS)).Run(iters); \
} \
#define BM_BincountDev(K, NBINS, type) \
static void BM_Bincount##_##type##_##K##_##NBINS( \
    ::testing::benchmark::State& state) { \
test::Benchmark(#type, Bincount(K * 1024, NBINS), \
                /*old_benchmark_api=*/false) \
    .Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * K * \
                        1024); \
} \
BENCHMARK(BM_Bincount##_##type##_##K##_##NBINS);
BM_BincountDev(32, 1000, cpu);
@ -44,29 +44,35 @@ static Graph* BroadcastTo(int dim0, int dim1, InputShape input_shape) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_BroadcastTo_InnerDim(DIM0, DIM1, type) \
|
||||
static void BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1(int iters) { \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * DIM0 * DIM1); \
|
||||
test::Benchmark(#type, BroadcastTo(DIM0, DIM1, \
|
||||
[](int dim0, int dim1) { \
|
||||
return TensorShape({dim0, 1}); \
|
||||
})) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1);
|
||||
#define BM_BroadcastTo_InnerDim(DIM0, DIM1, type) \
|
||||
static void BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1( \
|
||||
::testing::benchmark::State& state) { \
|
||||
test::Benchmark(#type, \
|
||||
BroadcastTo(DIM0, DIM1, \
|
||||
[](int dim0, int dim1) { \
|
||||
return TensorShape({dim0, 1}); \
|
||||
}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * DIM0 * \
|
||||
DIM1); \
|
||||
} \
|
||||
BENCHMARK(BM_BroadcastTo_Inner##_##type##_##DIM0##_##DIM1)->UseRealTime();
|
||||
|
||||
#define BM_BroadcastTo_OuterDim(DIM0, DIM1, type) \
|
||||
static void BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1(int iters) { \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * DIM0 * DIM1); \
|
||||
test::Benchmark(#type, BroadcastTo(DIM0, DIM1, \
|
||||
[](int dim0, int dim1) { \
|
||||
return TensorShape({1, dim1}); \
|
||||
})) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1);
|
||||
#define BM_BroadcastTo_OuterDim(DIM0, DIM1, type) \
|
||||
static void BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1( \
|
||||
::testing::benchmark::State& state) { \
|
||||
test::Benchmark(#type, \
|
||||
BroadcastTo(DIM0, DIM1, \
|
||||
[](int dim0, int dim1) { \
|
||||
return TensorShape({1, dim1}); \
|
||||
}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * DIM0 * \
|
||||
DIM1); \
|
||||
} \
|
||||
BENCHMARK(BM_BroadcastTo_Outer##_##type##_##DIM0##_##DIM1)->UseRealTime();
|
||||
|
||||
BM_BroadcastTo_InnerDim(64, 64, cpu);
|
||||
BM_BroadcastTo_InnerDim(128, 128, cpu);
|
||||
|
@ -121,102 +121,127 @@ TEST_ALL_CASTS_FROM(quint16)
|
||||
|
||||
// TODO(wicke): check conversions from/to bool, and bfloat16
|
||||
|
||||
static void BM_cpu_float_int64(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_float_int64(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
test::Benchmark("cpu", Cast<float, int64>(num), /*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(int64)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<float, int64>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_float_int64)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_float_int64(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_gpu_float_int64(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<float, int64>(num), /*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(int64)));
|
||||
testing::UseRealTime();
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<float, int64>(num)).Run(iters);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
}
|
||||
BENCHMARK(BM_gpu_float_int64)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_gpu_float_int64)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_bool_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_bool_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
test::Benchmark("cpu", Cast<bool, float>(num), /*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(bool) + sizeof(float)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<bool, float>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_bool_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_bool_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_gpu_bool_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<bool, float>(num), /*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(bool) + sizeof(float)));
|
||||
testing::UseRealTime();
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<bool, float>(num)).Run(iters);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
}
|
||||
BENCHMARK(BM_gpu_bool_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_gpu_bool_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_float_bfloat16(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_float_bfloat16(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
test::Benchmark("cpu", Cast<float, bfloat16>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(bfloat16)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<float, bfloat16>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_float_bfloat16)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_float_bfloat16)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_bfloat16_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_bfloat16_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
test::Benchmark("cpu", Cast<bfloat16, float>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(bfloat16)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<bfloat16, float>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_bfloat16_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_bfloat16_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_float_half(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_float_half(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
test::Benchmark("cpu", Cast<float, Eigen::half>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<float, Eigen::half>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_float_half)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_cpu_half_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
static void BM_cpu_half_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
test::Benchmark("cpu", Cast<Eigen::half, float>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Cast<Eigen::half, float>(num)).Run(iters);
|
||||
}
|
||||
BENCHMARK(BM_cpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_cpu_half_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_float_half(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
testing::UseRealTime();
|
||||
static void BM_gpu_float_half(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<float, Eigen::half>(num)).Run(iters);
|
||||
test::Benchmark("gpu", Cast<float, Eigen::half>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
}
|
||||
BENCHMARK(BM_gpu_float_half)->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_half_float(int iters, int num) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num *
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
testing::UseRealTime();
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<Eigen::half, float>(num)).Run(iters);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
}
|
||||
BENCHMARK(BM_gpu_half_float)->Arg(64 << 10)->Arg(32 << 20);
|
||||
BENCHMARK(BM_gpu_float_half)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
static void BM_gpu_half_float(::testing::benchmark::State& state) {
|
||||
const int num = state.range(0);
|
||||
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
test::Benchmark("gpu", Cast<Eigen::half, float>(num),
|
||||
/*old_benchmark_api=*/false)
|
||||
.Run(state);
|
||||
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num *
|
||||
(sizeof(float) + sizeof(Eigen::half)));
|
||||
}
|
||||
BENCHMARK(BM_gpu_half_float)->UseRealTime()->Arg(64 << 10)->Arg(32 << 20);
|
||||
|
||||
} // end namespace tensorflow
|
||||
|
@ -72,22 +72,21 @@ Graph* SetUpKmeansPlusPlusInitialization(int num_dims, int num_points,
|
||||
|
||||
template <int num_points, int num_to_sample, int num_dims,
|
||||
int retries_per_sample>
|
||||
void BM_KmeansPlusPlusInitialization(int iters) {
|
||||
testing::StopTiming();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_points * num_dims *
|
||||
num_to_sample);
|
||||
testing::UseRealTime();
|
||||
void BM_KmeansPlusPlusInitialization(::testing::benchmark::State& state) {
|
||||
Graph* g = SetUpKmeansPlusPlusInitialization(
|
||||
num_dims, num_points, num_to_sample, retries_per_sample);
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_points *
|
||||
num_dims * num_to_sample);
|
||||
}
|
||||
|
||||
#define BENCHMARK_KMEANS_PLUS_PLUS(p, c, d, r) \
|
||||
void BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r(int iters) { \
|
||||
BM_KmeansPlusPlusInitialization<p, c, d, r>(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r);
|
||||
#define BENCHMARK_KMEANS_PLUS_PLUS(p, c, d, r) \
|
||||
void BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r( \
|
||||
::testing::benchmark::State& state) { \
|
||||
BM_KmeansPlusPlusInitialization<p, c, d, r>(state); \
|
||||
} \
|
||||
BENCHMARK(BM_KmeansPlusPlusInitialization_##p##_##c##_##d##_##r) \
|
||||
->UseRealTime();
|
||||
|
||||
#define RUN_BM_KmeansPlusPlusInitialization(retries) \
|
||||
BENCHMARK_KMEANS_PLUS_PLUS(k10Points, k2Centers, k100Dim, retries); \
|
||||
@ -132,20 +131,18 @@ Graph* SetUpKMC2Initialization(int num_points) {
|
||||
}
|
||||
|
||||
template <int num_points, int num_to_sample, int num_dims>
|
||||
void BM_KMC2Initialization(int iters) {
|
||||
testing::StopTiming();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_points * num_dims *
|
||||
num_to_sample);
|
||||
testing::UseRealTime();
|
||||
void BM_KMC2Initialization(::testing::benchmark::State& state) {
|
||||
Graph* g = SetUpKMC2Initialization(num_points);
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_points *
|
||||
num_dims * num_to_sample);
|
||||
}
|
||||
#define BENCHMARK_KMC2(p, c, d) \
|
||||
void BM_KMC2Initialization_##p##_##c##_##d(int iters) { \
|
||||
BM_KMC2Initialization<p, c, d>(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_KMC2Initialization_##p##_##c##_##d);
|
||||
#define BENCHMARK_KMC2(p, c, d) \
|
||||
void BM_KMC2Initialization_##p##_##c##_##d( \
|
||||
::testing::benchmark::State& state) { \
|
||||
BM_KMC2Initialization<p, c, d>(state); \
|
||||
} \
|
||||
BENCHMARK(BM_KMC2Initialization_##p##_##c##_##d)->UseRealTime();
|
||||
|
||||
#define RUN_BM_KMC2Initialization \
|
||||
BENCHMARK_KMC2(k10Points, k2Centers, k100Dim); \
|
||||
@ -191,14 +188,11 @@ Graph* SetUpNearestNeighbors(int num_dims, int num_points, int num_centers,
|
||||
}
|
||||
|
||||
template <int num_dims, int num_points, int num_centers, int k>
|
||||
void BM_NearestNeighbors(int iters) {
|
||||
testing::StopTiming();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_points * num_dims *
|
||||
num_centers);
|
||||
testing::UseRealTime();
|
||||
void BM_NearestNeighbors(::testing::benchmark::State& state) {
|
||||
Graph* g = SetUpNearestNeighbors(num_dims, num_points, num_centers, k);
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_points *
|
||||
num_dims * num_centers);
|
||||
}
|
||||
|
||||
constexpr int kTop1 = 1;
|
||||
@ -206,11 +200,12 @@ constexpr int kTop2 = 2;
|
||||
constexpr int kTop5 = 5;
|
||||
constexpr int kTop10 = 10;
|
||||
|
||||
#define BENCHMARK_NEAREST_NEIGHBORS(d, p, c, k) \
|
||||
void BM_NearestNeighbors##d##_##p##_##c##_##k(int iters) { \
|
||||
BM_NearestNeighbors<d, p, c, k>(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_NearestNeighbors##d##_##p##_##c##_##k);
|
||||
#define BENCHMARK_NEAREST_NEIGHBORS(d, p, c, k) \
|
||||
void BM_NearestNeighbors##d##_##p##_##c##_##k( \
|
||||
::testing::benchmark::State& state) { \
|
||||
BM_NearestNeighbors<d, p, c, k>(state); \
|
||||
} \
|
||||
BENCHMARK(BM_NearestNeighbors##d##_##p##_##c##_##k)->UseRealTime();
|
||||
|
||||
#define RUN_BM_NearestNeighbors(k) \
|
||||
BENCHMARK_NEAREST_NEIGHBORS(k100Dim, k1kPoints, k100Centers, k); \
|
||||
|
@ -57,9 +57,9 @@ void FillTensorWithRandomValues<tstring>(Tensor* t, int string_length,
|
||||
// std::string, then the length of individual strings in the tensors will be
|
||||
// of length "string_length".
|
||||
template <typename T>
|
||||
static void ConcatHelper(int iters, int concat_dimension, int dim2,
|
||||
static void ConcatHelper(::testing::benchmark::State& state,
|
||||
int concat_dimension, int dim2,
|
||||
int string_length = 0) {
|
||||
testing::StopTiming();
|
||||
Graph* g = new Graph(OpRegistry::Global());
|
||||
|
||||
DataType dt = DataTypeToEnum<T>::v();
|
||||
@ -81,49 +81,82 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2,
|
||||
.Attr("T", dt)
|
||||
.Finalize(g, &node));
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * (in0_bytes + in1_bytes));
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
|
||||
(in0_bytes + in1_bytes));
|
||||
}
|
||||
|
||||
static void BM_ConcatDim0Float(int iters, int dim2) {
|
||||
ConcatHelper<float>(iters, 0, dim2);
|
||||
void BM_ConcatDim0Float(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<float>(state, 0, dim2);
|
||||
}
|
||||
|
||||
static void BM_ConcatDim1Float(int iters, int dim2) {
|
||||
ConcatHelper<float>(iters, 1, dim2);
|
||||
void BM_ConcatDim1Float(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<float>(state, 1, dim2);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim0Float)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1Float)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
|
||||
static void BM_ConcatDim0String(int iters, int dim2, int string_length) {
|
||||
ConcatHelper<tstring>(iters, 0, dim2, string_length);
|
||||
void BM_ConcatDim0String(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
const int string_length = state.range(1);
|
||||
|
||||
ConcatHelper<tstring>(state, 0, dim2, string_length);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ConcatDim0String)
|
||||
->UseRealTime()
|
||||
->ArgPair(1, 16)
|
||||
->ArgPair(1, 10000)
|
||||
->ArgPair(100, 16);
|
||||
|
||||
static void BM_ConcatDim1uint8(int iters, int dim2) {
|
||||
ConcatHelper<uint8>(iters, 1, dim2);
|
||||
void BM_ConcatDim1uint8(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<uint8>(state, 1, dim2);
|
||||
}
|
||||
static void BM_ConcatDim1int16(int iters, int dim2) {
|
||||
ConcatHelper<int16>(iters, 1, dim2);
|
||||
void BM_ConcatDim1int16(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<int16>(state, 1, dim2);
|
||||
}
|
||||
static void BM_ConcatDim1bfloat16(int iters, int dim2) {
|
||||
ConcatHelper<bfloat16>(iters, 1, dim2);
|
||||
void BM_ConcatDim1bfloat16(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatHelper<bfloat16>(state, 1, dim2);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ConcatDim1uint8)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1uint8)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1int16)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
BENCHMARK(BM_ConcatDim1bfloat16)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
|
||||
template <typename T>
|
||||
static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
|
||||
testing::StopTiming();
|
||||
static void ConcatManyHelper(::testing::benchmark::State& state,
|
||||
int concat_dimension, int dim2) {
|
||||
Graph* g = new Graph(OpRegistry::Global());
|
||||
|
||||
DataType dt = DataTypeToEnum<T>::v();
|
||||
@ -146,30 +179,25 @@ static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
|
||||
.Attr("N", 64)
|
||||
.Attr("T", dt)
|
||||
.Finalize(g, &node));
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
|
||||
kNumInputs * sizeof(T));
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
|
||||
dim2 * kNumInputs * sizeof(T));
|
||||
}
|
||||
|
||||
static void BM_ConcatManyDim1bfloat16(int iters, int dim2) {
|
||||
ConcatManyHelper<bfloat16>(iters, 1, dim2);
|
||||
void BM_ConcatManyDim1bfloat16(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
ConcatManyHelper<bfloat16>(state, 1, dim2);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60);
|
||||
|
||||
static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
|
||||
testing::StopTiming();
|
||||
BENCHMARK(BM_ConcatManyDim1bfloat16)->UseRealTime()->Arg(18)->Arg(34)->Arg(60);
|
||||
|
||||
void MemcpyAlternativeHelper(::testing::benchmark::State& state, int dim2) {
|
||||
const int kDim1 = 100;
|
||||
std::vector<float> data1(kDim1 * dim2, 1.0f);
|
||||
std::vector<float> data2(kDim1 * dim2, 2.0f);
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) *
|
||||
((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
|
||||
testing::StartTiming();
|
||||
while (--iters > 0) {
|
||||
for (auto s : state) {
|
||||
const size_t n0 = data1.size();
|
||||
const size_t n1 = data2.size();
|
||||
float* result = new float[n0 + n1];
|
||||
@ -177,24 +205,37 @@ static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
|
||||
memcpy(&result[n0], &data2[0], n1 * sizeof(float));
|
||||
delete[] result;
|
||||
}
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
|
||||
((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
|
||||
}
|
||||
|
||||
static void BM_MemcpyAlternativeDim0(int iters, int dim2) {
|
||||
MemcpyAlternativeHelper(iters, 0, dim2);
|
||||
void BM_MemcpyAlternativeDim0(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
MemcpyAlternativeHelper(state, dim2);
|
||||
}
|
||||
static void BM_MemcpyAlternativeDim1(int iters, int dim2) {
|
||||
MemcpyAlternativeHelper(iters, 1, dim2);
|
||||
void BM_MemcpyAlternativeDim1(::testing::benchmark::State& state) {
|
||||
const int dim2 = state.range(0);
|
||||
|
||||
MemcpyAlternativeHelper(state, dim2);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
BENCHMARK(BM_MemcpyAlternativeDim0)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
BENCHMARK(BM_MemcpyAlternativeDim1)
|
||||
->UseRealTime()
|
||||
->Arg(1000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000);
|
||||
|
||||
typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
|
||||
Eigen::Unaligned>
|
||||
EigenMap;
|
||||
static void MemcpyManyAlternative1(int iters, int dim2) {
|
||||
testing::StopTiming();
|
||||
|
||||
void MemcpyManyAlternative1(::testing::benchmark::State& state) {
|
||||
int dim2 = state.range(0);
|
||||
const int kDim1 = 40000;
|
||||
const int kNumCopies = 64;
|
||||
const int size = kDim1 * dim2 * kNumCopies;
|
||||
@ -202,10 +243,7 @@ static void MemcpyManyAlternative1(int iters, int dim2) {
|
||||
EigenMap map(data, size);
|
||||
map.setRandom();
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
|
||||
kNumCopies * sizeof(bfloat16));
|
||||
testing::StartTiming();
|
||||
while (iters-- > 0) {
|
||||
for (auto s : state) {
|
||||
std::vector<bfloat16*> inputs(kNumCopies);
|
||||
for (int i = 0; i < kNumCopies; ++i) {
|
||||
inputs[i] = &data[i * kDim1 * dim2];
|
||||
@ -225,11 +263,12 @@ static void MemcpyManyAlternative1(int iters, int dim2) {
|
||||
delete[] result;
|
||||
}
|
||||
delete[] data;
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
|
||||
dim2 * kNumCopies * sizeof(bfloat16));
|
||||
}
|
||||
|
||||
static void MemcpyManyAlternative2(int iters, int dim2) {
|
||||
testing::StopTiming();
|
||||
|
||||
void MemcpyManyAlternative2(::testing::benchmark::State& state) {
|
||||
int dim2 = state.range(0);
|
||||
const int kDim1 = 40000;
|
||||
const int kNumCopies = 64;
|
||||
const int size = kDim1 * dim2 * kNumCopies;
|
||||
@ -237,11 +276,8 @@ static void MemcpyManyAlternative2(int iters, int dim2) {
|
||||
EigenMap map(data, size);
|
||||
map.setRandom();
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
|
||||
kNumCopies * sizeof(bfloat16));
|
||||
testing::StartTiming();
|
||||
std::vector<bfloat16*> inputs(kNumCopies);
|
||||
while (--iters > 0) {
|
||||
for (auto s : state) {
|
||||
bfloat16* result = new bfloat16[size];
|
||||
for (int i = 0; i < kNumCopies; ++i) {
|
||||
inputs[i] = &data[i * kDim1 * dim2];
|
||||
@ -260,6 +296,9 @@ static void MemcpyManyAlternative2(int iters, int dim2) {
|
||||
delete[] result;
|
||||
}
|
||||
delete[] data;
|
||||
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
|
||||
dim2 * kNumCopies * sizeof(bfloat16));
|
||||
}
|
||||
|
||||
BENCHMARK(MemcpyManyAlternative1)
|
||||
|
@@ -114,15 +114,23 @@ static Graph* ManyConsts(int num, bool sequential) {
return g;
}
static void BM_ManyConsts_Parallel(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
test::Benchmark("cpu", ManyConsts(num, false /* !sequential */)).Run(iters);
static void BM_ManyConsts_Parallel(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", ManyConsts(num, false /* !sequential */),
                /*old_benchmark_api*/ false)
    .Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
}
BENCHMARK(BM_ManyConsts_Parallel)->Range(1, 1 << 10);
static void BM_ManyConsts_Sequential(int iters, int num) {
testing::ItemsProcessed(static_cast<int64>(iters) * num);
test::Benchmark("cpu", ManyConsts(num, true /* sequential */)).Run(iters);
static void BM_ManyConsts_Sequential(::testing::benchmark::State& state) {
const int num = state.range(0);
test::Benchmark("cpu", ManyConsts(num, true /* sequential */),
                /*old_benchmark_api*/ false)
    .Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num);
}
BENCHMARK(BM_ManyConsts_Sequential)->Range(1, 1 << 10);
@ -309,104 +309,120 @@ static Graph* FusedConv2DWithBatchNorm(
|
||||
// The following benchmarks are always using 'float' data type with NHWC layout.
|
||||
// -------------------------------------------------------------------------- //
|
||||
|
||||
#define BM_SETUP(N, H, W, C, type, LABEL, NAME) \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * (C)); \
|
||||
testing::SetLabel(LABEL);
|
||||
#define BM_SET_INFO(N, H, W, C, type, LABEL, NAME) \
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * (N) * (H) * \
|
||||
(W) * (C)); \
|
||||
state.SetLabel(LABEL);
|
||||
|
||||
#define BM_NAME(name, type, N, H, W, C, FW, FH, FC) \
|
||||
name##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
|
||||
|
||||
#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, Conv2D<float>(N, H, W, C, FW, FH, FC).graph) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, Conv2D<float>(N, H, W, C, FW, FH, FC).graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_Conv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2DWithBias<float>(N, H, W, C, FW, FH, FC).graph) \
|
||||
.Run(iters); \
|
||||
Conv2DWithBias<float>(N, H, W, C, FW, FH, FC).graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, Conv2DWithBiasAndActivation<float>(N, H, W, C, FW, \
|
||||
FH, FC, "Relu") \
|
||||
.graph) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark( \
|
||||
#type, \
|
||||
Conv2DWithBiasAndActivation<float>(N, H, W, C, FW, FH, FC, "Relu") \
|
||||
.graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"BiasAdd"})) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark( \
|
||||
#type, \
|
||||
FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, {"BiasAdd"}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_FusedConv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"BiasAdd", "Relu"})) \
|
||||
.Run(iters); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
FusedConv2DWithBias<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"BiasAdd", "Relu"}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC).graph) \
|
||||
.Run(iters); \
|
||||
Conv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC).graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, Conv2DWithBatchNormAndActivation<float>( \
|
||||
N, H, W, C, FW, FH, FC, "Relu") \
|
||||
.graph) \
|
||||
.Run(iters); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2DWithBatchNormAndActivation<float>(N, H, W, C, FW, \
|
||||
FH, FC, "Relu") \
|
||||
.graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark(#type, FusedConv2DWithBatchNorm<float>( \
|
||||
N, H, W, C, FW, FH, FC, {"FusedBatchNorm"})) \
|
||||
.Run(iters); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
FusedConv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"FusedBatchNorm"}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
|
||||
|
||||
#define BM_FusedConv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, \
|
||||
LABEL) \
|
||||
static void BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, \
|
||||
FW, FH, FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, LABEL, Conv2D); \
|
||||
test::Benchmark( \
|
||||
#type, FusedConv2DWithBatchNorm<float>(N, H, W, C, FW, FH, FC, \
|
||||
{"FusedBatchNorm", "Relu"})) \
|
||||
.Run(iters); \
|
||||
FW, FH, FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
FusedConv2DWithBatchNorm<float>( \
|
||||
N, H, W, C, FW, FH, FC, {"FusedBatchNorm", "Relu"}), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, LABEL, Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, FW, \
|
||||
FH, FC));
|
||||
@ -561,11 +577,12 @@ BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32");
|
||||
|
||||
#define BM_Conv2DFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type) \
|
||||
static void BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, \
|
||||
FC)(int iters) { \
|
||||
BM_SETUP(N, H, W, C, type, "", Conv2D); \
|
||||
FC)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2D<T>(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph) \
|
||||
.Run(iters); \
|
||||
Conv2D<T>(N, H, W, C, FW, FH, FC, FORMAT_##FORMAT).graph, \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
BM_SET_INFO(N, H, W, C, type, "", Conv2D); \
|
||||
} \
|
||||
BENCHMARK(BM_LONG_NAME(BM_Conv2D, type, T, FORMAT, N, H, W, C, FW, FH, FC));
|
||||
|
||||
|
@ -42,15 +42,19 @@ int RowsAndColsArg(int r, int c) { return r * kRows + c; }
|
||||
int RowsFromArg(int arg) { return (arg / kRows); }
|
||||
int ColsFromArg(int arg) { return (arg % kRows); }
|
||||
|
||||
#define BM_UNARY(DEVICE, FUNC, T, TYPE) \
|
||||
void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) { \
|
||||
const int64 tot = static_cast<int64>(iters) * num; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(T)); \
|
||||
test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
|
||||
#define BM_UNARY(DEVICE, FUNC, T, TYPE) \
|
||||
void BM_##DEVICE##_##FUNC##_##TYPE(::testing::benchmark::State& state) { \
|
||||
const int num = state.range(0); \
|
||||
test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * num; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(T)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE) \
|
||||
->UseRealTime() \
|
||||
->Range(4 << 10, 1 << 20);
|
||||
|
||||
BM_UNARY(cpu, Floor, float, DT_FLOAT);
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
@ -101,27 +105,30 @@ Graph* BinaryScalar(int num, const string& func) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_BINARY_SCALAR(DEVICE, FUNC) \
|
||||
void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) { \
|
||||
const int64 tot = static_cast<int64>(iters) * num; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(float)); \
|
||||
test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \
|
||||
->Arg(1 << 12) /* must >= 4096 */ \
|
||||
->Arg(1 << 13) \
|
||||
->Arg(1 << 14) \
|
||||
->Arg((1 << 15) - (1 << 13)) \
|
||||
->Arg(1 << 15) \
|
||||
->Arg((1 << 15) + (1 << 14)) \
|
||||
->Arg(1 << 16) \
|
||||
->Arg((1 << 17) - (1 << 15)) \
|
||||
->Arg(1 << 17) \
|
||||
->Arg((1 << 17) + (1 << 16)) \
|
||||
->Arg(1 << 18) \
|
||||
->Arg(1 << 19) \
|
||||
#define BM_BINARY_SCALAR(DEVICE, FUNC) \
|
||||
void BM_##DEVICE##_##FUNC##_scalar(::testing::benchmark::State& state) { \
|
||||
const int num = state.range(0); \
|
||||
\
|
||||
test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * num; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \
|
||||
->Arg(1 << 12) /* must >= 4096 */ \
|
||||
->Arg(1 << 13) \
|
||||
->Arg(1 << 14) \
|
||||
->Arg((1 << 15) - (1 << 13)) \
|
||||
->Arg(1 << 15) \
|
||||
->Arg((1 << 15) + (1 << 14)) \
|
||||
->Arg(1 << 16) \
|
||||
->Arg((1 << 17) - (1 << 15)) \
|
||||
->Arg(1 << 17) \
|
||||
->Arg((1 << 17) + (1 << 16)) \
|
||||
->Arg(1 << 18) \
|
||||
->Arg(1 << 19) \
|
||||
->Arg(1 << 20);
|
||||
|
||||
BM_BINARY_SCALAR(cpu, Less);
|
||||
@ -173,17 +180,19 @@ Graph* CubeWithMulSquare(int num) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_CUBE(DEVICE, Impl) \
|
||||
void BM_##DEVICE##_Cube_##Impl(int iters, int num) { \
|
||||
const int64 tot = static_cast<int64>(iters) * num; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(float)); \
|
||||
test::Benchmark(#DEVICE, Impl(num)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_Cube_##Impl) \
|
||||
->Arg(1 << 12) /* must >= 4096 */ \
|
||||
->Arg(1 << 16) \
|
||||
#define BM_CUBE(DEVICE, Impl) \
|
||||
void BM_##DEVICE##_Cube_##Impl(::testing::benchmark::State& state) { \
|
||||
const int num = state.range(0); \
|
||||
\
|
||||
test::Benchmark(#DEVICE, Impl(num)).Run(state.iterations()); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * num; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_Cube_##Impl) \
|
||||
->UseRealTime() \
|
||||
->Arg(1 << 12) /* must >= 4096 */ \
|
||||
->Arg(1 << 16) \
|
||||
->Arg(1 << 20);
|
||||
|
||||
BM_CUBE(cpu, CubeWithPow3);
|
||||
@ -211,17 +220,21 @@ Graph* BiasAdd(int rows, int cols, DataType type) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C) \
|
||||
void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C(int iters, int arg) { \
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(iters) * rows * cols; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(C_TYPE)); \
|
||||
test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C) \
|
||||
#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C) \
|
||||
void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C( \
|
||||
::testing::benchmark::State& state) { \
|
||||
const int arg = state.range(0); \
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
|
||||
test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(C_TYPE)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C) \
|
||||
->UseRealTime() \
|
||||
->Arg(RowsAndColsArg(R, C));
|
||||
|
||||
#define BM_BIAS_ADD_ALL(DEVICE, C_TYPE, TF_TYPE) \
|
||||
@ -264,16 +277,21 @@ Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
|
||||
|
||||
#define BM_BIAS_ADD_GRAD(DEVICE, FMT, C_TYPE, TF_TYPE, R, C, CH) \
|
||||
void BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH( \
|
||||
int iters, int arg, int channels) { \
|
||||
::testing::benchmark::State& state) { \
|
||||
const int arg = state.range(0); \
|
||||
const int channels = state.range(1); \
|
||||
\
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(iters) * rows * cols * channels; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(C_TYPE)); \
|
||||
test::Benchmark(#DEVICE, BiasAddGrad<C_TYPE>(rows, cols, channels, \
|
||||
TF_TYPE, FORMAT_##FMT)) \
|
||||
.Run(iters); \
|
||||
test::Benchmark( \
|
||||
#DEVICE, \
|
||||
BiasAddGrad<C_TYPE>(rows, cols, channels, TF_TYPE, FORMAT_##FMT), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
const int64 tot = \
|
||||
static_cast<int64>(state.iterations()) * rows * cols * channels; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(C_TYPE)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH) \
|
||||
->ArgPair(RowsAndColsArg(R, C), CH);
|
||||
@ -326,16 +344,20 @@ Graph* BcastAdd(int rows, int cols, int dim) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_BCAST_ADD_ROW(DEVICE, R, C) \
|
||||
void BM_##DEVICE##_BcastAddRow_R##R##_C##C(int iters, int arg) { \
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(iters) * rows * cols; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(float)); \
|
||||
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters); \
|
||||
} \
|
||||
#define BM_BCAST_ADD_ROW(DEVICE, R, C) \
|
||||
void BM_##DEVICE##_BcastAddRow_R##R##_C##C( \
|
||||
::testing::benchmark::State& state) { \
|
||||
const int arg = state.range(0); \
|
||||
\
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
|
||||
|
||||
#define BM_BCAST_ADD_ROW_ALL(DEVICE) \
|
||||
@ -350,17 +372,24 @@ BM_BCAST_ADD_ROW_ALL(gpu);
|
||||
#undef BM_BCAST_ADD_ROW_ALL
|
||||
#undef BM_BCAST_ADD_ROW
|
||||
|
||||
#define BM_BCAST_ADD_COL(DEVICE, R, C) \
|
||||
void BM_##DEVICE##_BcastAddCol_R##R##_C##C(int iters, int arg) { \
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
const int64 tot = static_cast<int64>(iters) * rows * cols; \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(tot); \
|
||||
testing::BytesProcessed(tot * sizeof(float)); \
|
||||
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
|
||||
#define BM_BCAST_ADD_COL(DEVICE, R, C) \
|
||||
void BM_##DEVICE##_BcastAddCol_R##R##_C##C( \
|
||||
::testing::benchmark::State& state) { \
|
||||
const int arg = state.range(0); \
|
||||
\
|
||||
const int rows = RowsFromArg(arg); \
|
||||
const int cols = ColsFromArg(arg); \
|
||||
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1), \
|
||||
/*old_benchmark_api=*/false) \
|
||||
.Run(state); \
|
||||
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
|
||||
\
|
||||
state.SetItemsProcessed(tot); \
|
||||
state.SetBytesProcessed(tot * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C) \
|
||||
->UseRealTime() \
|
||||
->Arg(RowsAndColsArg(R, C));
|
||||
|
||||
#define BM_BCAST_ADD_COL_ALL(DEVICE) \
|
||||
BM_BCAST_ADD_COL(DEVICE, 512, 2048); \
|
||||
@ -374,17 +403,23 @@ BM_BCAST_ADD_COL_ALL(gpu);
|
||||
#undef BM_BCAST_ADD_COL_ALL
|
||||
#undef BM_BCAST_ADD_COL
|
||||
|
||||
#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C(int iters, int arg) { \
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C) \
#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C( \
::testing::benchmark::State& state) { \
const int arg = state.range(0); \
\
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2), \
/*old_benchmark_api=*/false) \
.Run(state); \
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
\
state.SetItemsProcessed(tot); \
state.SetBytesProcessed(tot * sizeof(float)); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C) \
->UseRealTime() \
->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_CROSS_RC_ALL(DEVICE) \
@ -399,17 +434,22 @@ BM_BCAST_ADD_CROSS_RC_ALL(gpu);
#undef BM_BCAST_ADD_CROSS_RC_ALL
#undef BM_BCAST_ADD_CROSS_RC

#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C(int iters, int arg) { \
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
const int64 tot = static_cast<int64>(iters) * rows * cols; \
testing::UseRealTime(); \
testing::ItemsProcessed(tot); \
testing::BytesProcessed(tot * sizeof(float)); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3)).Run(iters); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C) \
#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C) \
void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C( \
::testing::benchmark::State& state) { \
const int arg = state.range(0); \
\
const int rows = RowsFromArg(arg); \
const int cols = ColsFromArg(arg); \
test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3), \
/*old_benchmark_api*/ false) \
.Run(state); \
const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
state.SetItemsProcessed(tot); \
state.SetBytesProcessed(tot * sizeof(float)); \
} \
BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C) \
->UseRealTime() \
->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_CROSS_CR_ALL(DEVICE) \

@ -273,10 +273,10 @@ TEST_F(ExecutorTest, ControlDependenciesFromSpecialNodes) {
EXPECT_EQ(3.0, V(retvals[0])); // out = 1.0 + 2.0 = 3.0
}

static void BM_executor(int iters, int width, int depth) {
#ifdef PLATFORM_GOOGLE
BenchmarkUseRealTime();
#endif // PLATFORM_GOOGLE
void BM_executor(::testing::benchmark::State& state) {
const int width = state.range(0);
const int depth = state.range(1);

Graph* g = new Graph(OpRegistry::Global());
random::PhiloxRandom philox(1729, 17);
random::SimplePhilox rand(&philox);
@ -306,30 +306,28 @@ static void BM_executor(int iters, int width, int depth) {
}
}
FixupSourceAndSinkEdges(g);
#ifdef PLATFORM_GOOGLE
SetBenchmarkLabel(strings::StrCat("Nodes = ", cur));
SetBenchmarkItemsProcessed(cur * static_cast<int64>(iters));
#endif // PLATFORM_GOOGLE
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
"SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false)
.Run(state);
state.SetLabel(strings::StrCat("Nodes = ", cur));
state.SetItemsProcessed(cur * static_cast<int64>(state.iterations()));
}

// Tall skinny graphs
BENCHMARK(BM_executor)->ArgPair(16, 1024);
BENCHMARK(BM_executor)->ArgPair(32, 8192);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(16, 1024);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(32, 8192);

// Short fat graphs
BENCHMARK(BM_executor)->ArgPair(1024, 16);
BENCHMARK(BM_executor)->ArgPair(8192, 32);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 16);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(8192, 32);

// Tall fat graph
BENCHMARK(BM_executor)->ArgPair(1024, 1024);
BENCHMARK(BM_executor)->UseRealTime()->ArgPair(1024, 1024);

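The pattern above recurs in every file touched by this change: the old API passed iters (plus extra integer arguments) into the benchmark function and called BenchmarkUseRealTime()/SetBenchmarkItemsProcessed() inside the body, whereas the new API reads arguments from state.range(), requests wall-clock timing at registration time via ->UseRealTime(), and records labels and counters on the State after Run() returns. Below is a minimal, self-contained sketch of that pattern; BuildExampleGraph is a hypothetical helper and not part of this change.

// Sketch of the migrated benchmark pattern (hypothetical graph builder).
void BM_ExampleGraph(::testing::benchmark::State& state) {
  const int width = state.range(0);   // first value from ArgPair()
  const int depth = state.range(1);   // second value from ArgPair()
  Graph* g = BuildExampleGraph(width, depth);  // hypothetical helper
  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
  state.SetLabel(strings::StrCat("Nodes = ", width * depth));
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * width *
                          depth);
}
BENCHMARK(BM_ExampleGraph)->UseRealTime()->ArgPair(16, 1024);
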
void BM_const_identity(::testing::benchmark::State& state) {
const int width = state.range(0);
const int outputs_per_const = state.range(1);

static void BM_const_identity(int iters, int width, int outputs_per_const) {
#ifdef PLATFORM_GOOGLE
BenchmarkUseRealTime();
#endif // PLATFORM_GOOGLE
Graph* g = new Graph(OpRegistry::Global());
for (int i = 0; i < width; ++i) {
Tensor i_t(i);
@ -339,22 +337,20 @@ static void BM_const_identity(int iters, int width, int outputs_per_const) {
}
}
FixupSourceAndSinkEdges(g);
#ifdef PLATFORM_GOOGLE
SetBenchmarkLabel(
strings::StrCat("Nodes = ", (1 + outputs_per_const) * width));
SetBenchmarkItemsProcessed((1 + outputs_per_const) * width *
static_cast<int64>(iters));
#endif // PLATFORM_GOOGLE
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
"SINGLE_THREADED_EXECUTOR",
/*old_benchmark_api=*/false)
.Run(state);
state.SetLabel(strings::StrCat("Nodes = ", (1 + outputs_per_const) * width));
state.SetItemsProcessed((1 + outputs_per_const) * width *
static_cast<int64>(state.iterations()));
}

// Graph with actual op execution.
BENCHMARK(BM_const_identity)->ArgPair(1, 1);
BENCHMARK(BM_const_identity)->ArgPair(1, 100);
BENCHMARK(BM_const_identity)->ArgPair(100, 1);
BENCHMARK(BM_const_identity)->ArgPair(100, 100);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(1, 1);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(1, 100);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(100, 1);
BENCHMARK(BM_const_identity)->UseRealTime()->ArgPair(100, 100);

// TODO(mrry): This benchmark currently crashes with a use-after free, because
// test::Benchmark::RunWithArgs() assumes that the executor will take ownership
@ -368,7 +364,7 @@ BENCHMARK(BM_const_identity)->ArgPair(100, 100);
#define ALICE "/job:j/replica:0/task:0/cpu:0"
#define BOB "/job:j/replica:0/task:0/gpu:0"

static void BM_FeedInputFetchOutput(int iters) {
static void BM_FeedInputFetchOutput(::testing::benchmark::State& state) {
Graph* g = new Graph(OpRegistry::Global());
// z = x + y: x and y are provided as benchmark inputs. z is the
// output of the benchmark. Conceptually, the caller is ALICE, the
@ -380,10 +376,10 @@ static void BM_FeedInputFetchOutput(int iters) {
FixupSourceAndSinkEdges(g);
Tensor val(DT_FLOAT, TensorShape({}));
val.scalar<float>()() = 3.14;
SetBenchmarkItemsProcessed(static_cast<int64>(iters));
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.RunWithArgs({{x, val}, {y, val}}, {z}, iters);
"SINGLE_THREADED_EXECUTOR", /*old_benchmark_api=*/false)
.RunWithArgs({{x, val}, {y, val}}, {z}, state);
state.SetItemsProcessed(state.iterations());
}
BENCHMARK(BM_FeedInputFetchOutput);
#endif

@ -247,7 +247,7 @@ TEST_F(DequantizeOpTest, DequantizeScaledQint8Axis3) {
}

template <typename T>
static void BM_DequantizeMinCombinedCpu(int iters) {
static void BM_DequantizeMinCombinedCpu(::testing::benchmark::State& state) {
auto root = Scope::NewRootScope().ExitOnError();
const int64 num_values = 1500 * 250;
std::vector<T> inputs;
@ -262,25 +262,26 @@ static void BM_DequantizeMinCombinedCpu(int iters) {
Graph* g = new Graph(OpRegistry::Global());
TF_CHECK_OK(root.ToGraph(g));

test::Benchmark("cpu", g).Run(iters);
testing::BytesProcessed(iters * num_values * (sizeof(float) + sizeof(T)));
testing::ItemsProcessed(iters);
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
state.SetBytesProcessed(state.iterations() * num_values *
(sizeof(float) + sizeof(T)));
state.SetItemsProcessed(state.iterations());
}

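As a rough sanity check on the counters just set: num_values is 1500 * 250 = 375,000 elements per run, so for T = quint8 (assuming sizeof(quint8) == 1 and sizeof(float) == 4) each iteration is credited with 375,000 * (4 + 1) = 1,875,000 bytes, i.e. roughly 1.9 MB of dequantize traffic per iteration.
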
static void BM_DequantizeMinCombinedCpuQuint16(int iters) {
BM_DequantizeMinCombinedCpu<quint16>(iters);
void BM_DequantizeMinCombinedCpuQuint16(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<quint16>(state);
}

static void BM_DequantizeMinCombinedCpuQint16(int iters) {
BM_DequantizeMinCombinedCpu<qint16>(iters);
void BM_DequantizeMinCombinedCpuQint16(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<qint16>(state);
}

static void BM_DequantizeMinCombinedCpuQuint8(int iters) {
BM_DequantizeMinCombinedCpu<quint8>(iters);
void BM_DequantizeMinCombinedCpuQuint8(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<quint8>(state);
}

static void BM_DequantizeMinCombinedCpuQint8(int iters) {
BM_DequantizeMinCombinedCpu<qint8>(iters);
void BM_DequantizeMinCombinedCpuQint8(::testing::benchmark::State& state) {
BM_DequantizeMinCombinedCpu<qint8>(state);
}

BENCHMARK(BM_DequantizeMinCombinedCpuQuint16);
@ -289,7 +290,8 @@ BENCHMARK(BM_DequantizeMinCombinedCpuQuint8);
BENCHMARK(BM_DequantizeMinCombinedCpuQint8);

template <typename T>
static void BM_DequantizeBfloat16MinCombinedCpu(int iters) {
static void BM_DequantizeBfloat16MinCombinedCpu(
::testing::benchmark::State& state) {
auto root = Scope::NewRootScope().ExitOnError();
const int64 num_values = 1500 * 250;
std::vector<T> inputs;
@ -304,25 +306,30 @@ static void BM_DequantizeBfloat16MinCombinedCpu(int iters) {
Graph* g = new Graph(OpRegistry::Global());
TF_CHECK_OK(root.ToGraph(g));

test::Benchmark("cpu", g).Run(iters);
testing::BytesProcessed(iters * num_values * (sizeof(bfloat16) + sizeof(T)));
testing::ItemsProcessed(iters);
test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
state.SetBytesProcessed(state.iterations() * num_values *
(sizeof(bfloat16) + sizeof(T)));
state.SetItemsProcessed(state.iterations());
}

static void BM_DequantizeBfloat16MinCombinedCpuQuint16(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<quint16>(iters);
void BM_DequantizeBfloat16MinCombinedCpuQuint16(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<quint16>(state);
}

static void BM_DequantizeBfloat16MinCombinedCpuQint16(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<qint16>(iters);
void BM_DequantizeBfloat16MinCombinedCpuQint16(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<qint16>(state);
}

static void BM_DequantizeBfloat16MinCombinedCpuQuint8(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<quint8>(iters);
void BM_DequantizeBfloat16MinCombinedCpuQuint8(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<quint8>(state);
}

static void BM_DequantizeBfloat16MinCombinedCpuQint8(int iters) {
BM_DequantizeBfloat16MinCombinedCpu<qint8>(iters);
void BM_DequantizeBfloat16MinCombinedCpuQint8(
::testing::benchmark::State& state) {
BM_DequantizeBfloat16MinCombinedCpu<qint8>(state);
}

BENCHMARK(BM_DequantizeBfloat16MinCombinedCpuQuint16);

@ -30,12 +30,13 @@ static Graph* Diag(int n, DataType type) {
return g;
}

#define BM_DiagDev(N, T, TFTYPE, DEVICE) \
static void BM_Diag##_##N##_##TFTYPE##_##DEVICE(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * N); \
test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE)).Run(iters); \
} \
#define BM_DiagDev(N, T, TFTYPE, DEVICE) \
static void BM_Diag##_##N##_##TFTYPE##_##DEVICE( \
::testing::benchmark::State& state) { \
test::Benchmark(#DEVICE, Diag<T>(N, TFTYPE), /*old_benchmark_api=*/false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * N * N); \
} \
BENCHMARK(BM_Diag##_##N##_##TFTYPE##_##DEVICE);
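For reference, a hand-expanded sketch of the new BM_DiagDev macro with illustrative parameters (N = 128, T = float, TFTYPE = DT_FLOAT, DEVICE = cpu; the actual instantiations come from the BM_Diag wrapper below):

// Sketch only: roughly what BM_DiagDev(128, float, DT_FLOAT, cpu) expands to.
static void BM_Diag_128_DT_FLOAT_cpu(::testing::benchmark::State& state) {
  test::Benchmark("cpu", Diag<float>(128, DT_FLOAT),
                  /*old_benchmark_api=*/false)
      .Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * 128 * 128);
}
BENCHMARK(BM_Diag_128_DT_FLOAT_cpu);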

#define BM_Diag(N) \