Internal tests cleanup.

PiperOrigin-RevId: 339741501
Change-Id: Iaa532c63d5c653de8e6a76e78822014fbef51b28
This commit is contained in:
A. Unique TensorFlower 2020-10-29 14:01:21 -07:00 committed by TensorFlower Gardener
parent fb49d63afa
commit df70d68014
16 changed files with 518 additions and 369 deletions

View File

@ -58,11 +58,14 @@ Graph* TruncatedNormal(int64 n) {
return g;
}
#define BM_RNG(DEVICE, RNG) \
void BM_##DEVICE##_##RNG(int iters, int arg) { \
testing::ItemsProcessed(static_cast<int64>(iters) * arg); \
test::Benchmark(#DEVICE, RNG(arg)).Run(iters); \
} \
#define BM_RNG(DEVICE, RNG) \
void BM_##DEVICE##_##RNG(::testing::benchmark::State& state) { \
const int arg = state.range(0); \
\
test::Benchmark(#DEVICE, RNG(arg), /*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * arg); \
} \
BENCHMARK(BM_##DEVICE##_##RNG)->Range(1 << 20, 8 << 20);
BM_RNG(cpu, RandomUniform);
@ -84,60 +87,48 @@ Tensor VecAlphas(int64 n) {
return alphas;
}
void BM_cpu_RandomGamma(int iters, int nsamp, int nalpha) {
testing::ItemsProcessed(static_cast<int64>(iters) * nsamp * nalpha);
void BM_cpu_RandomGamma(::testing::benchmark::State& state) {
const int nsamp = state.range(0);
const int nalpha = state.range(1);
Graph* g = new Graph(OpRegistry::Global());
test::graph::RandomGamma(g, test::graph::Constant(g, VecShape(nsamp)),
test::graph::Constant(g, VecAlphas(nalpha)));
test::Benchmark("cpu", g).Run(iters);
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * nsamp *
nalpha);
}
BENCHMARK(BM_cpu_RandomGamma)->RangePair(1 << 14, 4 << 15, 2, 50);
void BM_PhiloxRandom(int iters) {
void BM_PhiloxRandom(::testing::benchmark::State& state) {
// Fill 2M random numbers
int count = 2 << 20;
testing::ItemsProcessed(static_cast<int64>(iters) * count);
random::PhiloxRandom gen(0x12345);
int val = 1;
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
for (int j = 0; j < count; j += 4) {
/// each invocation of gen() returns 128-bit samples
auto samples = gen();
// use the result trivially so the compiler does not optimize it away
val ^= samples[0] ^ samples[1] ^ samples[2] ^ samples[3];
tensorflow::testing::DoNotOptimize(samples);
}
}
// A anchor point to make sure the compiler does not cut corners
CHECK(val) << val;
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * count);
}
BENCHMARK(BM_PhiloxRandom);
void BM_StdMTRandom(int iters) {
void BM_StdMTRandom(::testing::benchmark::State& state) {
// Fill 2M random numbers
int count = 2 << 20;
testing::ItemsProcessed(static_cast<int64>(iters) * count);
std::mt19937 gen(0x12345);
uint_fast32_t val = 1;
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
for (int j = 0; j < count; ++j) {
/// each invocation of gen() returns 32-bit sample
uint_fast32_t sample = gen();
// use the result trivially so the compiler does not optimize it away
val ^= sample;
tensorflow::testing::DoNotOptimize(sample);
}
}
// A anchor point to make sure the compiler does not cut corners
CHECK(val) << val;
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * count);
}
BENCHMARK(BM_StdMTRandom);

View File

@ -84,108 +84,167 @@ static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) {
// Creates a bench which reduces a 3D tensor with total "num" floats
// into a scalar on a "device". The iteration count is driven by the
// benchmark state.
// Runs a full-tensor reduction `reduce` (e.g. "Sum") of a num_x x num_y
// tensor of T down to a scalar on `device`, reporting items and bytes
// processed per iteration.
template <typename T>
static void ReduceToScalar(::testing::benchmark::State& state,
                           const string& device, const string& reduce,
                           int num_x, int num_y) {
  test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y),
                  /*old_benchmark_api=*/false)
      .Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y);
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y * sizeof(T));
}
// Runs a row-wise reduction `reduce` of a num_x x num_y float tensor on
// `device`, reporting items and bytes processed per iteration.
static void DoRowReduce(::testing::benchmark::State& state,
                        const string& device, const string& reduce, int num_x,
                        int num_y) {
  test::Benchmark(device, RowReduce(reduce, num_x, num_y),
                  /*old_benchmark_api=*/false)
      .Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y);
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y * sizeof(float));
}
// Runs a column-wise reduction `reduce` of a num_x x num_y float tensor on
// `device`, reporting items and bytes processed per iteration.
static void DoColReduce(::testing::benchmark::State& state,
                        const string& device, const string& reduce, int num_x,
                        int num_y) {
  test::Benchmark(device, ColReduce(reduce, num_x, num_y),
                  /*old_benchmark_api=*/false)
      .Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y);
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y * sizeof(float));
}
// Runs a middle-axis (Y) reduction of a 3D float tensor on `device`,
// reporting items and bytes processed per iteration.
static void Do3DYReduce(::testing::benchmark::State& state,
                        const string& device, const string& reduce, int num_x,
                        int num_y) {
  test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y),
                  /*old_benchmark_api=*/false)
      .Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y);
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y * sizeof(float));
}
// Runs an outer/inner-axis (X and Z) reduction of a 3D float tensor on
// `device`, reporting items and bytes processed per iteration.
static void Do3DXZReduce(::testing::benchmark::State& state,
                         const string& device, const string& reduce, int num_x,
                         int num_y) {
  test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y),
                  /*old_benchmark_api=*/false)
      .Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y);
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
                          num_y * sizeof(float));
}
static void BM_Sum2DToScalarGPU(int iters, int num_x, int num_y) {
ReduceToScalar<float>(iters, "gpu", "Sum", num_x, num_y);
static void BM_Sum2DToScalarGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
ReduceToScalar<float>(state, "gpu", "Sum", num_x, num_y);
}
BENCHMARK(BM_Sum2DToScalarGPU)->RangePair(1, 8192, 1, 8192);
static void BM_Sum2DToScalarGPUComplex(int iters, int num_x, int num_y) {
ReduceToScalar<std::complex<float>>(iters, "gpu", "Sum", num_x, num_y);
static void BM_Sum2DToScalarGPUComplex(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
ReduceToScalar<std::complex<float>>(state, "gpu", "Sum", num_x, num_y);
}
BENCHMARK(BM_Sum2DToScalarGPUComplex)->RangePair(1, 8192, 1, 8192);
static void BM_Sum2DToScalarGPUHalf(int iters, int num_x, int num_y) {
ReduceToScalar<Eigen::half>(iters, "gpu", "Sum", num_x, num_y);
static void BM_Sum2DToScalarGPUHalf(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
ReduceToScalar<Eigen::half>(state, "gpu", "Sum", num_x, num_y);
}
BENCHMARK(BM_Sum2DToScalarGPUHalf)->RangePair(1, 8192, 1, 8192);
static void BM_Sum2DRowReduceGPU(int iters, int num_x, int num_y) {
DoRowReduce(iters, "gpu", "Sum", num_x, num_y);
static void BM_Sum2DRowReduceGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
DoRowReduce(state, "gpu", "Sum", num_x, num_y);
}
BENCHMARK(BM_Sum2DRowReduceGPU)->RangePair(1, 8192, 1, 8192);
static void BM_Sum2DColumnReduceGPU(int iters, int num_x, int num_y) {
DoColReduce(iters, "gpu", "Sum", num_x, num_y);
static void BM_Sum2DColumnReduceGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
DoColReduce(state, "gpu", "Sum", num_x, num_y);
}
BENCHMARK(BM_Sum2DColumnReduceGPU)->RangePair(1, 8192, 1, 8192);
static void BM_Sum3DYReduceGPU(int iters, int num_x, int num_y) {
Do3DYReduce(iters, "gpu", "Sum", num_x, num_y);
static void BM_Sum3DYReduceGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
Do3DYReduce(state, "gpu", "Sum", num_x, num_y);
}
BENCHMARK(BM_Sum3DYReduceGPU)->RangePair(64, 4096, 64, 4096);
static void BM_Sum3DXZReduceGPU(int iters, int num_x, int num_y) {
Do3DXZReduce(iters, "gpu", "Sum", num_x, num_y);
static void BM_Sum3DXZReduceGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
Do3DXZReduce(state, "gpu", "Sum", num_x, num_y);
}
BENCHMARK(BM_Sum3DXZReduceGPU)->RangePair(64, 4096, 64, 4096);
static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) {
ReduceToScalar<float>(iters, "gpu", "Mean", num_x, num_y);
static void BM_Mean2DToScalarGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
ReduceToScalar<float>(state, "gpu", "Mean", num_x, num_y);
}
BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
static void BM_EuclideanNorm2DToScalarGPU(int iters, int num_x, int num_y) {
ReduceToScalar<float>(iters, "gpu", "EuclideanNorm", num_x, num_y);
static void BM_EuclideanNorm2DToScalarGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
ReduceToScalar<float>(state, "gpu", "EuclideanNorm", num_x, num_y);
}
BENCHMARK(BM_EuclideanNorm2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) {
ReduceToScalar<float>(iters, "gpu", "Max", num_x, num_y);
static void BM_Max2DToScalarGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
ReduceToScalar<float>(state, "gpu", "Max", num_x, num_y);
}
BENCHMARK(BM_Max2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) {
ReduceToScalar<float>(iters, "gpu", "Min", num_x, num_y);
static void BM_Min2DToScalarGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
ReduceToScalar<float>(state, "gpu", "Min", num_x, num_y);
}
BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
static void BM_Min2DToScalarGPUHalf(int iters, int num_x, int num_y) {
ReduceToScalar<Eigen::half>(iters, "gpu", "Min", num_x, num_y);
static void BM_Min2DToScalarGPUHalf(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
ReduceToScalar<Eigen::half>(state, "gpu", "Min", num_x, num_y);
}
BENCHMARK(BM_Min2DToScalarGPUHalf)->RangePair(2048, 8192, 2048, 8192);
static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) {
ReduceToScalar<bool>(iters, "gpu", "All", num_x, num_y);
static void BM_Bool2DToScalarGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
ReduceToScalar<bool>(state, "gpu", "All", num_x, num_y);
}
BENCHMARK(BM_Bool2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);

View File

@ -84,17 +84,17 @@ Graph* SetupRegexReplaceGraph(const Tensor& input, const string& input_pattern,
return g;
}
// Benchmarks the dynamic-pattern RegexReplace op over a batch of test
// strings; `batch_size` comes from the benchmark argument.
static void BM_RegexReplace(::testing::benchmark::State& state) {
  const int batch_size = state.range(0);

  Tensor input = GetTestTensor(batch_size);
  Graph* g = SetupRegexReplaceGraph(input, kRegExPattern, kRewrite);
  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
}
BENCHMARK(BM_RegexReplace)
->UseRealTime()
->Arg(1)
->Arg(8)
->Arg(16)
@ -115,17 +115,17 @@ Graph* SetupStaticGraph(const Tensor& input, const string& input_pattern,
.Finalize(g, nullptr /* node */));
return g;
}
// Benchmarks the compile-time-pattern StaticRegexReplace op over a batch of
// test strings; `batch_size` comes from the benchmark argument.
static void BM_StaticRegexReplace(::testing::benchmark::State& state) {
  const int batch_size = state.range(0);

  Tensor input = GetTestTensor(batch_size);
  Graph* g = SetupStaticGraph(input, kRegExPattern, kRewrite);
  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
  state.SetItemsProcessed(static_cast<int64>(state.iterations()));
}
BENCHMARK(BM_StaticRegexReplace)
->UseRealTime()
->Arg(1)
->Arg(8)
->Arg(16)

View File

@ -67,56 +67,29 @@ TEST_F(RequantizationRangeTest, HandCrafted) {
test::ExpectTensorEqual<float>(expected_max, *GetOutput(1));
}
// Benchmarks CalculateUsedRange over a 1 x `size` qint32 tensor; `size`
// comes from the benchmark argument. Reports both items (elements) and
// bytes (4 bytes per qint32 element) processed.
static void BM_RequantizationRange(::testing::benchmark::State& state) {
  const int size = state.range(0);

  Tensor quantized_tensor(DT_QINT32, TensorShape({1, size}));
  test::FillFn<qint32>(&quantized_tensor, [](int n) { return qint32(n); });

  qint32 actual_min;
  qint32 actual_max;
  for (auto s : state) {
    CalculateUsedRange(quantized_tensor, &actual_min, &actual_max);
  }
  state.SetItemsProcessed(static_cast<int64>(state.iterations()) * size);
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * size * 4);
}
// Single parameterized registration replaces the former per-size wrapper
// functions (which also passed their arguments in the wrong order).
BENCHMARK(BM_RequantizationRange)
    ->UseRealTime()
    ->Arg(100)
    ->Arg(1000)
    ->Arg(10000)
    ->Arg(100000)
    ->Arg(1000000)
    ->Arg(10000000)
    ->Arg(100000000);
} // end namespace tensorflow

View File

@ -197,148 +197,187 @@ static Graph* Reverse(const TensorShape& shape, int reverse_axis) {
}
template <typename T>
static void RunReverseRowsBenchmark(int iters, int outer_dim, int middle_dim,
static void RunReverseRowsBenchmark(::testing::benchmark::State& state,
int outer_dim, int middle_dim,
int intra_threads, int channels) {
SessionOptions opts = GetOptions(intra_threads);
TensorShape shape{outer_dim, middle_dim, channels};
const int64 num_items = static_cast<int64>(iters) * shape.num_elements();
testing::ItemsProcessed(num_items);
testing::BytesProcessed(num_items * sizeof(T));
testing::UseRealTime();
test::Benchmark("cpu", Reverse<T>(shape, 1), &opts).Run(iters);
test::Benchmark("cpu", Reverse<T>(shape, 1), &opts, nullptr, nullptr, "",
/*old_benchmark_api*/ false)
.Run(state);
const int64 num_items =
static_cast<int64>(state.iterations()) * shape.num_elements();
state.SetItemsProcessed(num_items);
state.SetBytesProcessed(num_items * sizeof(T));
}
static void BM_ReverseRowsOf1Channel_1T_float(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf1Channel_1T_float(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
1 /* intra_threads */, 1 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf1Channel_1T_float)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf1Channel_1T_uint8(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf1Channel_1T_uint8(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
1 /* intra_threads */, 1 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf1Channel_1T_uint8)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf1Channel_4T_float(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf1Channel_4T_float(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
4 /* intra_threads */, 1 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf1Channel_4T_float)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf1Channel_4T_uint8(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf1Channel_4T_uint8(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
4 /* intra_threads */, 1 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf1Channel_4T_uint8)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf3Channels_1T_float(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf3Channels_1T_float(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
1 /* intra_threads */, 3 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf3Channels_1T_float)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(30, 30)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf3Channels_1T_uint8(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf3Channels_1T_uint8(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
1 /* intra_threads */, 3 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf3Channels_1T_uint8)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(30, 30)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf3Channels_4T_float(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf3Channels_4T_float(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
4 /* intra_threads */, 3 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf3Channels_4T_float)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(30, 30)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf3Channels_4T_uint8(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf3Channels_4T_uint8(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
4 /* intra_threads */, 3 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf3Channels_4T_uint8)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(30, 30)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf4Channels_1T_float(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf4Channels_1T_float(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
1 /* intra_threads */, 4 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf4Channels_1T_float)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf4Channels_1T_uint8(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf4Channels_1T_uint8(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
1 /* intra_threads */, 4 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf4Channels_1T_uint8)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf4Channels_4T_float(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf4Channels_4T_float(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
4 /* intra_threads */, 4 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf4Channels_4T_float)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);
static void BM_ReverseRowsOf4Channels_4T_uint8(int iters, int outer_dim,
int middle_dim) {
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
void BM_ReverseRowsOf4Channels_4T_uint8(::testing::benchmark::State& state) {
const int outer_dim = state.range(0);
const int middle_dim = state.range(1);
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
4 /* intra_threads */, 4 /* channels */);
}
BENCHMARK(BM_ReverseRowsOf4Channels_4T_uint8)
->UseRealTime()
->ArgPair(288, 288)
->ArgPair(1024, 1024)
->ArgPair(10 * 1024, 1024);

View File

@ -450,34 +450,44 @@ static Graph* RollGraph(const TensorShape& shape, int isd) {
return g;
}
// Registers a benchmark that rolls a rows x columns float tensor along the
// outer dimension (axis 0) on DEVICE, reporting elements and bytes moved.
#define BM_ROLL_OUTER(DEVICE)                                                  \
  static void BM_##DEVICE##_roll_outer(::testing::benchmark::State& state) {   \
    const int rows = state.range(0);                                           \
    const int columns = state.range(1);                                        \
                                                                               \
    TensorShape shape{rows, columns};                                          \
    test::Benchmark(#DEVICE, RollGraph(shape, 0), /*old_benchmark_api=*/false) \
        .Run(state);                                                           \
    const int64 num_items =                                                    \
        static_cast<int64>(state.iterations()) * shape.num_elements();         \
    state.SetItemsProcessed(num_items);                                        \
    state.SetBytesProcessed(num_items * sizeof(float));                        \
  }                                                                            \
  BENCHMARK(BM_##DEVICE##_roll_outer)                                          \
      ->UseRealTime()                                                          \
      ->ArgPair(256, 256)                                                      \
      ->ArgPair(512, 512)                                                      \
      ->ArgPair(1024, 1024)                                                    \
      ->ArgPair(2048, 2048)
// Registers a benchmark that rolls a rows x columns float tensor along all
// dimensions (isd == 1) on DEVICE, reporting elements and bytes moved.
#define BM_ROLL_ALL(DEVICE)                                                    \
  static void BM_##DEVICE##_roll_all(::testing::benchmark::State& state) {     \
    const int rows = state.range(0);                                           \
    const int columns = state.range(1);                                        \
                                                                               \
    TensorShape shape{rows, columns};                                          \
    test::Benchmark(#DEVICE, RollGraph(shape, 1), /*old_benchmark_api=*/false) \
        .Run(state);                                                           \
    const int64 num_items =                                                    \
        static_cast<int64>(state.iterations()) * shape.num_elements();         \
    state.SetItemsProcessed(num_items);                                        \
    state.SetBytesProcessed(num_items * sizeof(float));                        \
  }                                                                            \
  BENCHMARK(BM_##DEVICE##_roll_all)                                            \
      ->UseRealTime()                                                          \
      ->ArgPair(256, 256)                                                      \
      ->ArgPair(512, 512)                                                      \
      ->ArgPair(1024, 1024)                                                    \
      ->ArgPair(2048, 2048)

BM_ROLL_OUTER(cpu);

View File

@ -663,8 +663,8 @@ TEST_F(SaveOpSlices2Test, TwoSlices) {
// Benchmark-related code below.
static void BM_LargeTensorWrite(int iters, int num_elements) {
testing::StopTiming();
void BM_LargeTensorWrite(::testing::benchmark::State& state) {
const int num_elements = state.range(0);
// 4 * num_elements bytes total , since sizeof(float) == 4.
Tensor tensor(DT_FLOAT, TensorShape({num_elements}));
@ -689,8 +689,9 @@ static void BM_LargeTensorWrite(int iters, int num_elements) {
VLOG(1) << "Save op's output path: " << temp_filename;
VLOG(1) << "# nodes in Graph: " << g->num_nodes();
testing::StartTiming();
test::Benchmark("cpu", g, &session_options).Run(iters);
test::Benchmark("cpu", g, &session_options, nullptr, nullptr, "",
/*old_benchmark_api*/ false)
.Run(state);
}
BENCHMARK(BM_LargeTensorWrite)->Arg((1 << 30) / 4 /* 1GB float tensor */);

View File

@ -67,79 +67,120 @@ static Graph* ThreeDYCumsum(int num_y, int num_z, bool reverse = false) {
}
template <typename T>
static void LargeOneDimensional(int iters, const string& device, int num_x,
static void LargeOneDimensional(::testing::benchmark::State& state,
const string& device, int num_x,
bool reverse = false) {
testing::ItemsProcessed(static_cast<int64>(iters) * num_x);
testing::BytesProcessed(static_cast<int64>(iters) * num_x * sizeof(T));
test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse)).Run(iters);
test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse),
/*old_benchmark_api*/ false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
sizeof(T));
}
static void DoRowCumsum(int iters, const string& device, int num_x, int num_y,
static void DoRowCumsum(::testing::benchmark::State& state,
const string& device, int num_x, int num_y,
bool reverse = false) {
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
sizeof(float));
test::Benchmark(device, RowCumsum(num_x, num_y, reverse)).Run(iters);
test::Benchmark(device, RowCumsum(num_x, num_y, reverse),
/*old_benchmark_api*/ false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
num_y);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
num_y * sizeof(float));
}
static void DoColCumsum(int iters, const string& device, int num_x, int num_y,
static void DoColCumsum(::testing::benchmark::State& state,
const string& device, int num_x, int num_y,
bool reverse = false) {
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
sizeof(float));
test::Benchmark(device, ColCumsum(num_x, num_y, reverse)).Run(iters);
test::Benchmark(device, ColCumsum(num_x, num_y, reverse),
/*old_benchmark_api*/ false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
num_y);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
num_y * sizeof(float));
}
static void Do3DYCumsum(int iters, const string& device, int num_x, int num_y,
static void Do3DYCumsum(::testing::benchmark::State& state,
const string& device, int num_x, int num_y,
bool reverse = false) {
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
sizeof(float));
test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse)).Run(iters);
test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse),
/*old_benchmark_api*/ false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
num_y);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
num_y * sizeof(float));
}
static void BM_OneDCumsumGPU(int iters, int num_x) {
LargeOneDimensional<float>(iters, "gpu", num_x);
static void BM_OneDCumsumGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
LargeOneDimensional<float>(state, "gpu", num_x);
}
BENCHMARK(BM_OneDCumsumGPU)->Range(1, 1 << 21);
static void BM_OneDCumsumGPUHalf(int iters, int num_x) {
LargeOneDimensional<Eigen::half>(iters, "gpu", num_x);
static void BM_OneDCumsumGPUHalf(::testing::benchmark::State& state) {
const int num_x = state.range(0);
LargeOneDimensional<Eigen::half>(state, "gpu", num_x);
}
BENCHMARK(BM_OneDCumsumGPUHalf)->Range(1, 1 << 21);
static void BM_Sum2DRowCumsumGPU(int iters, int num_x, int num_y) {
DoRowCumsum(iters, "gpu", num_x, num_y);
static void BM_Sum2DRowCumsumGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
DoRowCumsum(state, "gpu", num_x, num_y);
}
BENCHMARK(BM_Sum2DRowCumsumGPU)->RangePair(1, 8192, 1, 8192);
static void BM_Sum2DColumnCumsumGPU(int iters, int num_x, int num_y) {
DoColCumsum(iters, "gpu", num_x, num_y);
static void BM_Sum2DColumnCumsumGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
DoColCumsum(state, "gpu", num_x, num_y);
}
BENCHMARK(BM_Sum2DColumnCumsumGPU)->RangePair(1, 8192, 1, 8192);
static void BM_Sum3DYCumsumGPU(int iters, int num_x, int num_y) {
Do3DYCumsum(iters, "gpu", num_x, num_y);
static void BM_Sum3DYCumsumGPU(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
Do3DYCumsum(state, "gpu", num_x, num_y);
}
BENCHMARK(BM_Sum3DYCumsumGPU)->RangePair(64, 4096, 64, 4096);
static void BM_OneDCumsumGPU_reverse(int iters, int num_x) {
LargeOneDimensional<float>(iters, "gpu", num_x, true);
static void BM_OneDCumsumGPU_reverse(::testing::benchmark::State& state) {
const int num_x = state.range(0);
LargeOneDimensional<float>(state, "gpu", num_x, true);
}
BENCHMARK(BM_OneDCumsumGPU_reverse)->Range(1, 1 << 21);
static void BM_Sum2DRowCumsumGPU_reverse(int iters, int num_x, int num_y) {
DoRowCumsum(iters, "gpu", num_x, num_y, true);
static void BM_Sum2DRowCumsumGPU_reverse(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
DoRowCumsum(state, "gpu", num_x, num_y, true);
}
BENCHMARK(BM_Sum2DRowCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
static void BM_Sum2DColumnCumsumGPU_reverse(int iters, int num_x, int num_y) {
DoColCumsum(iters, "gpu", num_x, num_y, true);
static void BM_Sum2DColumnCumsumGPU_reverse(
::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
DoColCumsum(state, "gpu", num_x, num_y, true);
}
BENCHMARK(BM_Sum2DColumnCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
static void BM_Sum3DYCumsumGPU_reverse(int iters, int num_x, int num_y) {
Do3DYCumsum(iters, "gpu", num_x, num_y, true);
static void BM_Sum3DYCumsumGPU_reverse(::testing::benchmark::State& state) {
const int num_x = state.range(0);
const int num_y = state.range(1);
Do3DYCumsum(state, "gpu", num_x, num_y, true);
}
BENCHMARK(BM_Sum3DYCumsumGPU_reverse)->RangePair(32, 2048, 32, 2048);

View File

@ -254,8 +254,8 @@ class ScatterNdUpdateBM : public ScatterNdUpdateOpTest {
};
template <typename Index>
static void BM_ScatterNdHelper(int iters, int embedding_size, const char* op) {
testing::StopTiming();
void BM_ScatterNdHelper(::testing::benchmark::State& state, int embedding_size,
const char* op) {
const int kRows = 10000000 / embedding_size;
std::vector<float> values;
values.reserve(kRows);
@ -280,27 +280,33 @@ static void BM_ScatterNdHelper(int iters, int embedding_size, const char* op) {
bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
updates);
testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
iters);
testing::StartTiming();
while (iters-- > 0) {
for (auto i : state) {
Status s = bm.RunOpKernel();
}
testing::StopTiming();
state.SetItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
state.iterations());
}
static void BM_ScatterNdUpdateInt32(int iters, int embedding_size) {
BM_ScatterNdHelper<int32>(iters, embedding_size, "ScatterNdUpdate");
void BM_ScatterNdUpdateInt32(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterNdHelper<int32>(state, embedding_size, "ScatterNdUpdate");
}
static void BM_ScatterNdUpdateInt64(int iters, int embedding_size) {
BM_ScatterNdHelper<int64>(iters, embedding_size, "ScatterNdUpdate");
void BM_ScatterNdUpdateInt64(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterNdHelper<int64>(state, embedding_size, "ScatterNdUpdate");
}
static void BM_ScatterNdAddInt32(int iters, int embedding_size) {
BM_ScatterNdHelper<int32>(iters, embedding_size, "ScatterNdAdd");
void BM_ScatterNdAddInt32(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterNdHelper<int32>(state, embedding_size, "ScatterNdAdd");
}
static void BM_ScatterNdAddInt64(int iters, int embedding_size) {
BM_ScatterNdHelper<int64>(iters, embedding_size, "ScatterNdAdd");
void BM_ScatterNdAddInt64(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterNdHelper<int64>(state, embedding_size, "ScatterNdAdd");
}
BENCHMARK(BM_ScatterNdUpdateInt32)

View File

@ -280,9 +280,8 @@ class ScatterUpdateBM : public ScatterUpdateOpTest {
};
template <typename Index>
static void BM_ScatterHelper(int iters, int embedding_size, const char* op,
bool big_num_updates = false) {
testing::StopTiming();
void BM_ScatterHelper(::testing::benchmark::State& state, int embedding_size,
const char* op, bool big_num_updates = false) {
const int kRows = 10000000 / embedding_size;
std::vector<float> values;
values.reserve(kRows);
@ -307,59 +306,83 @@ static void BM_ScatterHelper(int iters, int embedding_size, const char* op,
bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
updates);
testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
iters);
testing::StartTiming();
while (iters-- > 0) {
for (auto i : state) {
Status s = bm.RunOpKernel();
}
testing::StopTiming();
state.SetItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
state.iterations());
}
static void BM_ScatterUpdateInt32(int iters, int embedding_size) {
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterUpdate");
void BM_ScatterUpdateInt32(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int32>(state, embedding_size, "ScatterUpdate");
}
static void BM_ScatterUpdateInt64(int iters, int embedding_size) {
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterUpdate");
void BM_ScatterUpdateInt64(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int64>(state, embedding_size, "ScatterUpdate");
}
static void BM_ScatterAddInt32(int iters, int embedding_size) {
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd");
void BM_ScatterAddInt32(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int32>(state, embedding_size, "ScatterAdd");
}
static void BM_ScatterAddInt32Large(int iters, int embedding_size) {
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd", true);
void BM_ScatterAddInt32Large(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int32>(state, embedding_size, "ScatterAdd", true);
}
static void BM_ScatterAddInt64(int iters, int embedding_size) {
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterAdd");
void BM_ScatterAddInt64(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int64>(state, embedding_size, "ScatterAdd");
}
static void BM_ScatterMulInt32(int iters, int embedding_size) {
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMul");
void BM_ScatterMulInt32(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int32>(state, embedding_size, "ScatterMul");
}
static void BM_ScatterMulInt64(int iters, int embedding_size) {
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMul");
void BM_ScatterMulInt64(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int64>(state, embedding_size, "ScatterMul");
}
static void BM_ScatterDivInt32(int iters, int embedding_size) {
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterDiv");
void BM_ScatterDivInt32(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int32>(state, embedding_size, "ScatterDiv");
}
static void BM_ScatterDivInt64(int iters, int embedding_size) {
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterDiv");
void BM_ScatterDivInt64(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int64>(state, embedding_size, "ScatterDiv");
}
static void BM_ScatterMinInt32(int iters, int embedding_size) {
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMin");
void BM_ScatterMinInt32(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int32>(state, embedding_size, "ScatterMin");
}
static void BM_ScatterMinInt64(int iters, int embedding_size) {
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMin");
void BM_ScatterMinInt64(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int64>(state, embedding_size, "ScatterMin");
}
static void BM_ScatterMaxInt32(int iters, int embedding_size) {
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMax");
void BM_ScatterMaxInt32(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int32>(state, embedding_size, "ScatterMax");
}
static void BM_ScatterMaxInt64(int iters, int embedding_size) {
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMax");
void BM_ScatterMaxInt64(::testing::benchmark::State& state) {
const int embedding_size = state.range(0);
BM_ScatterHelper<int64>(state, embedding_size, "ScatterMax");
}
BENCHMARK(BM_ScatterUpdateInt32)

View File

@ -39,10 +39,9 @@ limitations under the License.
namespace tensorflow {
template <typename Index>
static void BM_SegmentReduction(int iters, const string& reduction,
Index num_rows, Index num_cols,
Index segment_size) {
testing::StopTiming();
static void BM_SegmentReduction(::testing::benchmark::State& state,
const string& reduction, Index num_rows,
Index num_cols, Index segment_size) {
std::unique_ptr<Device> device(
DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
@ -81,24 +80,25 @@ static void BM_SegmentReduction(int iters, const string& reduction,
reduction_op->Compute(reduction_context.get());
TF_CHECK_OK(reduction_context->status());
testing::StartTiming();
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
delete reduction_context->release_output(0).tensor;
reduction_op->Compute(reduction_context.get());
}
int64 bytes_per_iter =
static_cast<int64>(num_rows * num_cols * sizeof(float));
testing::BytesProcessed(bytes_per_iter * iters);
state.SetBytesProcessed(bytes_per_iter * state.iterations());
}
// Registers int32- and int64-index benchmarks of segment reduction op O
// over R rows x C cols with segment size S.
#define BM_Reduce(O, R, C, S)                            \
  static void BM_Reduce_##O##_##R##_##C##_##S##_int32(   \
      ::testing::benchmark::State & state) {             \
    BM_SegmentReduction<int32>(state, #O, R, C, S);      \
  }                                                      \
  static void BM_Reduce_##O##_##R##_##C##_##S##_int64(   \
      ::testing::benchmark::State & state) {             \
    BM_SegmentReduction<int64>(state, #O, R, C, S);      \
  }                                                      \
  BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32);    \
  BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64);
#define BM_Reduce_Arg(R, C, S) \
@ -113,8 +113,8 @@ BM_Reduce_Arg(64, 32, 2);
BM_Reduce_Arg(4096, 32, 2);
BM_Reduce_Arg(4096, 128, 2);
static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
testing::StopTiming();
static void SparseSegmentMeanGradHelper(::testing::benchmark::State& state,
float uniqueness, int size) {
Graph* g = new Graph(OpRegistry::Global());
CHECK_LE(uniqueness, 1.0);
CHECK_GT(uniqueness, 0.0);
@ -148,22 +148,24 @@ static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
.Attr("T", DT_FLOAT)
.Finalize(g, &node));
testing::UseRealTime();
testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) *
sizeof(float));
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
(kDim1 * kDim2) * sizeof(float));
}
static void BM_SparseSegmentMeanGrad_Low(int iters, int size) {
return SparseSegmentMeanGradHelper(iters, 1.0, size);
static void BM_SparseSegmentMeanGrad_Low(::testing::benchmark::State& state) {
const int size = state.range(0);
return SparseSegmentMeanGradHelper(state, 1.0, size);
}
static void BM_SparseSegmentMeanGrad_High(int iters, int size) {
return SparseSegmentMeanGradHelper(iters, 0.01, size);
static void BM_SparseSegmentMeanGrad_High(::testing::benchmark::State& state) {
const int size = state.range(0);
return SparseSegmentMeanGradHelper(state, 0.01, size);
}
BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000);
BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000);
BENCHMARK(BM_SparseSegmentMeanGrad_Low)->UseRealTime()->Arg(1000)->Arg(100000);
BENCHMARK(BM_SparseSegmentMeanGrad_High)->UseRealTime()->Arg(1000)->Arg(100000);
} // namespace tensorflow

View File

@ -54,21 +54,21 @@ static Graph* Recv() {
return g;
}
static void BM_Send(int iters) {
testing::UseRealTime();
testing::ItemsProcessed(static_cast<int64>(iters));
test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous)
.Run(iters);
void BM_Send(::testing::benchmark::State& state) {
test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous, "",
/*old_benchmark_api*/ false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()));
}
BENCHMARK(BM_Send);
BENCHMARK(BM_Send)->UseRealTime();
static void BM_Recv(int iters) {
testing::UseRealTime();
testing::ItemsProcessed(static_cast<int64>(iters));
test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous)
.Run(iters);
void BM_Recv(::testing::benchmark::State& state) {
test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous, "",
/*old_benchmark_api*/ false)
.Run(state);
state.SetItemsProcessed(static_cast<int64>(state.iterations()));
}
BENCHMARK(BM_Recv);
BENCHMARK(BM_Recv)->UseRealTime();
} // namespace
} // namespace tensorflow

View File

@ -37,8 +37,8 @@ namespace {
// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
// in size, and concat them together along "concat_dimension"
template <typename T>
static void SliceHelper(int iters, int size) {
testing::StopTiming();
static void SliceHelper(::testing::benchmark::State& state) {
const int size = state.range(0);
Graph* g = new Graph(OpRegistry::Global());
DataType dt = DataTypeToEnum<T>::v();
int kDim = 100;
@ -65,26 +65,24 @@ static void SliceHelper(int iters, int size) {
.Finalize(g, &node));
FixupSourceAndSinkEdges(g);
testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
testing::StartTiming();
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
"SINGLE_THREADED_EXECUTOR")
.Run(iters);
testing::UseRealTime();
"SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false)
.Run(state);
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim * size *
sizeof(T));
}
static void BM_SliceFloat(int iters, int dim2) {
SliceHelper<float>(iters, dim2);
void BM_SliceFloat(::testing::benchmark::State& state) {
SliceHelper<float>(state);
}
BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
BENCHMARK(BM_SliceFloat)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);
static void BM_SliceBFloat16(int iters, int dim2) {
SliceHelper<bfloat16>(iters, dim2);
void BM_SliceBFloat16(::testing::benchmark::State& state) {
SliceHelper<bfloat16>(state);
}
BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
BENCHMARK(BM_SliceBFloat16)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);
} // namespace
} // namespace tensorflow

View File

@ -276,15 +276,18 @@ static ST MakeSparseTensor(Graph* g, int B, int M, int N, int nnz_inner) {
// [8, 4, N{nnz}] cmul [8, 4, N]
// Registers a benchmark of sparse-matrix cmul dense-matrix:
// [8, 4, N{NNZ_INNER nonzeros}] cmul [8, 4, N] on CPU.
#define BM_SparseMatCMulDenseMatArgs(N, NNZ_INNER)                             \
  static void BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER(                      \
      ::testing::benchmark::State& state) {                                    \
    Graph* g = new Graph(OpRegistry::Global());                                \
    Node* dense = MakeTensor(g, 8, 4, N);                                      \
    ST sp = MakeSparseTensor(g, 8, 4, N, NNZ_INNER);                           \
                                                                               \
    test::Benchmark(                                                           \
        "cpu", SparseMatCMulDenseMat(g, sp.indices, sp.vals, sp.shape, dense), \
        /*old_benchmark_api*/ false)                                           \
        .Run(state);                                                           \
    state.SetItemsProcessed(                                                   \
        static_cast<int64>(state.iterations() * 8 * 4 * N * 2));               \
  }                                                                            \
  BENCHMARK(BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER)

View File

@ -198,9 +198,11 @@ TEST_F(SparseToDenseTest, ThreeD_MultValues) {
} // namespace
static void BM_SparseToDense(int iters, int NDIM, int N) {
static void BM_SparseToDense(::testing::benchmark::State& state) {
const int NDIM = state.range(0);
const int N = state.range(1);
// TODO(zhifengc): Switch to use kernel_benchmark_testlib.h
tensorflow::testing::StopTiming();
const int IndexDim = (NDIM == 1) ? 0 : 1;
@ -253,18 +255,15 @@ static void BM_SparseToDense(int iters, int NDIM, int N) {
std::unique_ptr<OpKernelContext> sparse_context(new OpKernelContext(&params));
op->Compute(sparse_context.get());
tensorflow::testing::StartTiming();
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
delete sparse_context->release_output(0).tensor;
op->Compute(sparse_context.get());
TF_ASSERT_OK(sparse_context->status());
}
tensorflow::testing::StopTiming();
// processing input, mainly
int64 bytes_per_iter = static_cast<int64>((N + N * NDIM) * sizeof(float));
tensorflow::testing::BytesProcessed(bytes_per_iter * iters);
state.SetBytesProcessed(bytes_per_iter * state.iterations());
}
BENCHMARK(BM_SparseToDense)

View File

@ -41,11 +41,15 @@ static Graph* SparseXent(int batch_size, int num_classes) {
return g;
}
// Registers a sparse-xent benchmark for a BATCH x CLASS problem on DEVICE.
#define BM_SparseXentDev(BATCH, CLASS, DEVICE)                               \
  static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(                  \
      ::testing::benchmark::State& state) {                                  \
    test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS),                       \
                    /*old_benchmark_api*/ false)                             \
        .Run(state);                                                         \
    state.SetItemsProcessed(static_cast<int64>(state.iterations()) * BATCH * \
                            CLASS);                                          \
  }                                                                          \
  BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE);
/// The representative tests for ptb_word on GPU