Internal tests cleanup.
PiperOrigin-RevId: 339741501
Change-Id: Iaa532c63d5c653de8e6a76e78822014fbef51b28
This commit is contained in:
parent fb49d63afa
commit df70d68014
tensorflow/core/kernels
random_op_test.cc
reduction_ops_test.cc
regex_replace_op_test.cc
requantization_range_op_test.cc
reverse_op_test.cc
roll_op_test.cc
save_op_test.cc
scan_ops_test.cc
scatter_nd_op_test.cc
scatter_op_test.cc
segment_reduction_ops_test.cc
sendrecv_ops_test.cc
slice_op_test.cc
sparse_dense_binary_op_shared_test.cc
sparse_to_dense_op_test.cc
sparse_xent_op_test.cc
@ -58,11 +58,14 @@ Graph* TruncatedNormal(int64 n) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_RNG(DEVICE, RNG) \
|
||||
void BM_##DEVICE##_##RNG(int iters, int arg) { \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * arg); \
|
||||
test::Benchmark(#DEVICE, RNG(arg)).Run(iters); \
|
||||
} \
|
||||
#define BM_RNG(DEVICE, RNG) \
|
||||
void BM_##DEVICE##_##RNG(::testing::benchmark::State& state) { \
|
||||
const int arg = state.range(0); \
|
||||
\
|
||||
test::Benchmark(#DEVICE, RNG(arg), /*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * arg); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_##RNG)->Range(1 << 20, 8 << 20);
|
||||
|
||||
BM_RNG(cpu, RandomUniform);
|
||||
@ -84,60 +87,48 @@ Tensor VecAlphas(int64 n) {
|
||||
return alphas;
|
||||
}
|
||||
|
||||
void BM_cpu_RandomGamma(int iters, int nsamp, int nalpha) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * nsamp * nalpha);
|
||||
void BM_cpu_RandomGamma(::testing::benchmark::State& state) {
|
||||
const int nsamp = state.range(0);
|
||||
const int nalpha = state.range(1);
|
||||
|
||||
Graph* g = new Graph(OpRegistry::Global());
|
||||
test::graph::RandomGamma(g, test::graph::Constant(g, VecShape(nsamp)),
|
||||
test::graph::Constant(g, VecAlphas(nalpha)));
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * nsamp *
|
||||
nalpha);
|
||||
}
|
||||
BENCHMARK(BM_cpu_RandomGamma)->RangePair(1 << 14, 4 << 15, 2, 50);
|
||||
|
||||
void BM_PhiloxRandom(int iters) {
|
||||
void BM_PhiloxRandom(::testing::benchmark::State& state) {
|
||||
// Fill 2M random numbers
|
||||
int count = 2 << 20;
|
||||
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * count);
|
||||
|
||||
random::PhiloxRandom gen(0x12345);
|
||||
|
||||
int val = 1;
|
||||
for (int i = 0; i < iters; ++i) {
|
||||
for (auto s : state) {
|
||||
for (int j = 0; j < count; j += 4) {
|
||||
/// each invocation of gen() returns 128-bit samples
|
||||
auto samples = gen();
|
||||
|
||||
// use the result trivially so the compiler does not optimize it away
|
||||
val ^= samples[0] ^ samples[1] ^ samples[2] ^ samples[3];
|
||||
tensorflow::testing::DoNotOptimize(samples);
|
||||
}
|
||||
}
|
||||
|
||||
// A anchor point to make sure the compiler does not cut corners
|
||||
CHECK(val) << val;
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * count);
|
||||
}
|
||||
BENCHMARK(BM_PhiloxRandom);
|
||||
|
||||
void BM_StdMTRandom(int iters) {
|
||||
void BM_StdMTRandom(::testing::benchmark::State& state) {
|
||||
// Fill 2M random numbers
|
||||
int count = 2 << 20;
|
||||
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * count);
|
||||
|
||||
std::mt19937 gen(0x12345);
|
||||
|
||||
uint_fast32_t val = 1;
|
||||
for (int i = 0; i < iters; ++i) {
|
||||
for (auto s : state) {
|
||||
for (int j = 0; j < count; ++j) {
|
||||
/// each invocation of gen() returns 32-bit sample
|
||||
uint_fast32_t sample = gen();
|
||||
|
||||
// use the result trivially so the compiler does not optimize it away
|
||||
val ^= sample;
|
||||
tensorflow::testing::DoNotOptimize(sample);
|
||||
}
|
||||
}
|
||||
|
||||
// A anchor point to make sure the compiler does not cut corners
|
||||
CHECK(val) << val;
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * count);
|
||||
}
|
||||
BENCHMARK(BM_StdMTRandom);
|
||||
|
||||
|
@ -84,108 +84,167 @@ static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) {
|
||||
// Creates a bench which reduces a 3D tensor with total "num" floats
|
||||
// into a scalar on a "device". Runs the bench for "iters" times.
|
||||
template <typename T>
|
||||
static void ReduceToScalar(int iters, const string& device,
|
||||
const string& reduce, int num_x, int num_y) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
|
||||
sizeof(T));
|
||||
test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y)).Run(iters);
|
||||
static void ReduceToScalar(::testing::benchmark::State& state,
|
||||
const string& device, const string& reduce,
|
||||
int num_x, int num_y) {
|
||||
test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y),
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y * sizeof(T));
|
||||
}
|
||||
|
||||
static void DoRowReduce(int iters, const string& device, const string& reduce,
|
||||
int num_x, int num_y) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
|
||||
sizeof(float));
|
||||
test::Benchmark(device, RowReduce(reduce, num_x, num_y)).Run(iters);
|
||||
static void DoRowReduce(::testing::benchmark::State& state,
|
||||
const string& device, const string& reduce, int num_x,
|
||||
int num_y) {
|
||||
test::Benchmark(device, RowReduce(reduce, num_x, num_y),
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y * sizeof(float));
|
||||
}
|
||||
|
||||
static void DoColReduce(int iters, const string& device, const string& reduce,
|
||||
int num_x, int num_y) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
|
||||
sizeof(float));
|
||||
test::Benchmark(device, ColReduce(reduce, num_x, num_y)).Run(iters);
|
||||
static void DoColReduce(::testing::benchmark::State& state,
|
||||
const string& device, const string& reduce, int num_x,
|
||||
int num_y) {
|
||||
test::Benchmark(device, ColReduce(reduce, num_x, num_y),
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y * sizeof(float));
|
||||
}
|
||||
|
||||
static void Do3DYReduce(int iters, const string& device, const string& reduce,
|
||||
int num_x, int num_y) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
|
||||
sizeof(float));
|
||||
test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y)).Run(iters);
|
||||
static void Do3DYReduce(::testing::benchmark::State& state,
|
||||
const string& device, const string& reduce, int num_x,
|
||||
int num_y) {
|
||||
test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y),
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y * sizeof(float));
|
||||
}
|
||||
|
||||
static void Do3DXZReduce(int iters, const string& device, const string& reduce,
|
||||
int num_x, int num_y) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
|
||||
sizeof(float));
|
||||
test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y)).Run(iters);
|
||||
static void Do3DXZReduce(::testing::benchmark::State& state,
|
||||
const string& device, const string& reduce, int num_x,
|
||||
int num_y) {
|
||||
test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y),
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y * sizeof(float));
|
||||
}
|
||||
|
||||
static void BM_Sum2DToScalarGPU(int iters, int num_x, int num_y) {
|
||||
ReduceToScalar<float>(iters, "gpu", "Sum", num_x, num_y);
|
||||
static void BM_Sum2DToScalarGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
ReduceToScalar<float>(state, "gpu", "Sum", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum2DToScalarGPU)->RangePair(1, 8192, 1, 8192);
|
||||
|
||||
static void BM_Sum2DToScalarGPUComplex(int iters, int num_x, int num_y) {
|
||||
ReduceToScalar<std::complex<float>>(iters, "gpu", "Sum", num_x, num_y);
|
||||
static void BM_Sum2DToScalarGPUComplex(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
ReduceToScalar<std::complex<float>>(state, "gpu", "Sum", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum2DToScalarGPUComplex)->RangePair(1, 8192, 1, 8192);
|
||||
|
||||
static void BM_Sum2DToScalarGPUHalf(int iters, int num_x, int num_y) {
|
||||
ReduceToScalar<Eigen::half>(iters, "gpu", "Sum", num_x, num_y);
|
||||
static void BM_Sum2DToScalarGPUHalf(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
ReduceToScalar<Eigen::half>(state, "gpu", "Sum", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum2DToScalarGPUHalf)->RangePair(1, 8192, 1, 8192);
|
||||
|
||||
static void BM_Sum2DRowReduceGPU(int iters, int num_x, int num_y) {
|
||||
DoRowReduce(iters, "gpu", "Sum", num_x, num_y);
|
||||
static void BM_Sum2DRowReduceGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
DoRowReduce(state, "gpu", "Sum", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum2DRowReduceGPU)->RangePair(1, 8192, 1, 8192);
|
||||
|
||||
static void BM_Sum2DColumnReduceGPU(int iters, int num_x, int num_y) {
|
||||
DoColReduce(iters, "gpu", "Sum", num_x, num_y);
|
||||
static void BM_Sum2DColumnReduceGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
DoColReduce(state, "gpu", "Sum", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum2DColumnReduceGPU)->RangePair(1, 8192, 1, 8192);
|
||||
|
||||
static void BM_Sum3DYReduceGPU(int iters, int num_x, int num_y) {
|
||||
Do3DYReduce(iters, "gpu", "Sum", num_x, num_y);
|
||||
static void BM_Sum3DYReduceGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
Do3DYReduce(state, "gpu", "Sum", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum3DYReduceGPU)->RangePair(64, 4096, 64, 4096);
|
||||
|
||||
static void BM_Sum3DXZReduceGPU(int iters, int num_x, int num_y) {
|
||||
Do3DXZReduce(iters, "gpu", "Sum", num_x, num_y);
|
||||
static void BM_Sum3DXZReduceGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
Do3DXZReduce(state, "gpu", "Sum", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum3DXZReduceGPU)->RangePair(64, 4096, 64, 4096);
|
||||
|
||||
static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) {
|
||||
ReduceToScalar<float>(iters, "gpu", "Mean", num_x, num_y);
|
||||
static void BM_Mean2DToScalarGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
ReduceToScalar<float>(state, "gpu", "Mean", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
|
||||
|
||||
static void BM_EuclideanNorm2DToScalarGPU(int iters, int num_x, int num_y) {
|
||||
ReduceToScalar<float>(iters, "gpu", "EuclideanNorm", num_x, num_y);
|
||||
static void BM_EuclideanNorm2DToScalarGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
ReduceToScalar<float>(state, "gpu", "EuclideanNorm", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_EuclideanNorm2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
|
||||
|
||||
static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) {
|
||||
ReduceToScalar<float>(iters, "gpu", "Max", num_x, num_y);
|
||||
static void BM_Max2DToScalarGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
ReduceToScalar<float>(state, "gpu", "Max", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Max2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
|
||||
|
||||
static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) {
|
||||
ReduceToScalar<float>(iters, "gpu", "Min", num_x, num_y);
|
||||
static void BM_Min2DToScalarGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
ReduceToScalar<float>(state, "gpu", "Min", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
|
||||
|
||||
static void BM_Min2DToScalarGPUHalf(int iters, int num_x, int num_y) {
|
||||
ReduceToScalar<Eigen::half>(iters, "gpu", "Min", num_x, num_y);
|
||||
static void BM_Min2DToScalarGPUHalf(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
ReduceToScalar<Eigen::half>(state, "gpu", "Min", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Min2DToScalarGPUHalf)->RangePair(2048, 8192, 2048, 8192);
|
||||
|
||||
static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) {
|
||||
ReduceToScalar<bool>(iters, "gpu", "All", num_x, num_y);
|
||||
static void BM_Bool2DToScalarGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
ReduceToScalar<bool>(state, "gpu", "All", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Bool2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
|
||||
|
||||
|
@ -84,17 +84,17 @@ Graph* SetupRegexReplaceGraph(const Tensor& input, const string& input_pattern,
|
||||
return g;
|
||||
}
|
||||
|
||||
void BM_RegexReplace(int iters, int batch_size) {
|
||||
testing::StopTiming();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters));
|
||||
testing::UseRealTime();
|
||||
static void BM_RegexReplace(::testing::benchmark::State& state) {
|
||||
const int batch_size = state.range(0);
|
||||
|
||||
Tensor input = GetTestTensor(batch_size);
|
||||
Graph* g = SetupRegexReplaceGraph(input, kRegExPattern, kRewrite);
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()));
|
||||
}
|
||||
|
||||
BENCHMARK(BM_RegexReplace)
|
||||
->UseRealTime()
|
||||
->Arg(1)
|
||||
->Arg(8)
|
||||
->Arg(16)
|
||||
@ -115,17 +115,17 @@ Graph* SetupStaticGraph(const Tensor& input, const string& input_pattern,
|
||||
.Finalize(g, nullptr /* node */));
|
||||
return g;
|
||||
}
|
||||
void BM_StaticRegexReplace(int iters, int batch_size) {
|
||||
testing::StopTiming();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters));
|
||||
testing::UseRealTime();
|
||||
static void BM_StaticRegexReplace(::testing::benchmark::State& state) {
|
||||
const int batch_size = state.range(0);
|
||||
|
||||
Tensor input = GetTestTensor(batch_size);
|
||||
Graph* g = SetupStaticGraph(input, kRegExPattern, kRewrite);
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()));
|
||||
}
|
||||
|
||||
BENCHMARK(BM_StaticRegexReplace)
|
||||
->UseRealTime()
|
||||
->Arg(1)
|
||||
->Arg(8)
|
||||
->Arg(16)
|
||||
|
@ -67,56 +67,29 @@ TEST_F(RequantizationRangeTest, HandCrafted) {
|
||||
test::ExpectTensorEqual<float>(expected_max, *GetOutput(1));
|
||||
}
|
||||
|
||||
static void BM_RequantizationRange(int iters, int size) {
|
||||
testing::StopTiming();
|
||||
testing::UseRealTime();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * size);
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * size * 4);
|
||||
static void BM_RequantizationRange(::testing::benchmark::State& state) {
|
||||
const int size = state.range(0);
|
||||
|
||||
Tensor quantized_tensor(DT_QINT32, TensorShape({1, size}));
|
||||
test::FillFn<qint32>(&quantized_tensor, [](int n) { return qint32(n); });
|
||||
|
||||
qint32 actual_min;
|
||||
qint32 actual_max;
|
||||
testing::StartTiming();
|
||||
for (int iter = 0; iter < iters; ++iter) {
|
||||
for (auto s : state) {
|
||||
CalculateUsedRange(quantized_tensor, &actual_min, &actual_max);
|
||||
}
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * size);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * size * 4);
|
||||
}
|
||||
|
||||
static void BM_RequantizationRange100(int iters) {
|
||||
BM_RequantizationRange(100, iters);
|
||||
}
|
||||
BENCHMARK(BM_RequantizationRange100);
|
||||
|
||||
static void BM_RequantizationRange1000(int iters) {
|
||||
BM_RequantizationRange(1000, iters);
|
||||
}
|
||||
BENCHMARK(BM_RequantizationRange1000);
|
||||
|
||||
static void BM_RequantizationRange10000(int iters) {
|
||||
BM_RequantizationRange(10000, iters);
|
||||
}
|
||||
BENCHMARK(BM_RequantizationRange10000);
|
||||
|
||||
static void BM_RequantizationRange100000(int iters) {
|
||||
BM_RequantizationRange(100000, iters);
|
||||
}
|
||||
BENCHMARK(BM_RequantizationRange100000);
|
||||
|
||||
static void BM_RequantizationRange1000000(int iters) {
|
||||
BM_RequantizationRange(1000000, iters);
|
||||
}
|
||||
BENCHMARK(BM_RequantizationRange1000000);
|
||||
|
||||
static void BM_RequantizationRange10000000(int iters) {
|
||||
BM_RequantizationRange(10000000, iters);
|
||||
}
|
||||
BENCHMARK(BM_RequantizationRange10000000);
|
||||
|
||||
static void BM_RequantizationRange100000000(int iters) {
|
||||
BM_RequantizationRange(100000000, iters);
|
||||
}
|
||||
BENCHMARK(BM_RequantizationRange100000000);
|
||||
BENCHMARK(BM_RequantizationRange)
|
||||
->UseRealTime()
|
||||
->Arg(100)
|
||||
->Arg(1000)
|
||||
->Arg(10000)
|
||||
->Arg(100000)
|
||||
->Arg(1000000)
|
||||
->Arg(10000000)
|
||||
->Arg(100000000);
|
||||
|
||||
} // end namespace tensorflow
|
||||
|
@ -197,148 +197,187 @@ static Graph* Reverse(const TensorShape& shape, int reverse_axis) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void RunReverseRowsBenchmark(int iters, int outer_dim, int middle_dim,
|
||||
static void RunReverseRowsBenchmark(::testing::benchmark::State& state,
|
||||
int outer_dim, int middle_dim,
|
||||
int intra_threads, int channels) {
|
||||
SessionOptions opts = GetOptions(intra_threads);
|
||||
TensorShape shape{outer_dim, middle_dim, channels};
|
||||
const int64 num_items = static_cast<int64>(iters) * shape.num_elements();
|
||||
testing::ItemsProcessed(num_items);
|
||||
testing::BytesProcessed(num_items * sizeof(T));
|
||||
testing::UseRealTime();
|
||||
test::Benchmark("cpu", Reverse<T>(shape, 1), &opts).Run(iters);
|
||||
test::Benchmark("cpu", Reverse<T>(shape, 1), &opts, nullptr, nullptr, "",
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
const int64 num_items =
|
||||
static_cast<int64>(state.iterations()) * shape.num_elements();
|
||||
state.SetItemsProcessed(num_items);
|
||||
state.SetBytesProcessed(num_items * sizeof(T));
|
||||
}
|
||||
|
||||
static void BM_ReverseRowsOf1Channel_1T_float(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf1Channel_1T_float(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
|
||||
1 /* intra_threads */, 1 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf1Channel_1T_float)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf1Channel_1T_uint8(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf1Channel_1T_uint8(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
|
||||
1 /* intra_threads */, 1 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf1Channel_1T_uint8)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf1Channel_4T_float(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf1Channel_4T_float(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
|
||||
4 /* intra_threads */, 1 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf1Channel_4T_float)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf1Channel_4T_uint8(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf1Channel_4T_uint8(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
|
||||
4 /* intra_threads */, 1 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf1Channel_4T_uint8)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf3Channels_1T_float(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf3Channels_1T_float(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
|
||||
1 /* intra_threads */, 3 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf3Channels_1T_float)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(30, 30)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf3Channels_1T_uint8(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf3Channels_1T_uint8(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
|
||||
1 /* intra_threads */, 3 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf3Channels_1T_uint8)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(30, 30)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf3Channels_4T_float(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf3Channels_4T_float(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
|
||||
4 /* intra_threads */, 3 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf3Channels_4T_float)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(30, 30)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf3Channels_4T_uint8(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf3Channels_4T_uint8(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
|
||||
4 /* intra_threads */, 3 /* channels */);
|
||||
}
|
||||
BENCHMARK(BM_ReverseRowsOf3Channels_4T_uint8)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(30, 30)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf4Channels_1T_float(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf4Channels_1T_float(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
|
||||
1 /* intra_threads */, 4 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf4Channels_1T_float)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf4Channels_1T_uint8(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf4Channels_1T_uint8(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
|
||||
1 /* intra_threads */, 4 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf4Channels_1T_uint8)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf4Channels_4T_float(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf4Channels_4T_float(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<float>(state, outer_dim, middle_dim,
|
||||
4 /* intra_threads */, 4 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf4Channels_4T_float)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
||||
static void BM_ReverseRowsOf4Channels_4T_uint8(int iters, int outer_dim,
|
||||
int middle_dim) {
|
||||
RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
|
||||
void BM_ReverseRowsOf4Channels_4T_uint8(::testing::benchmark::State& state) {
|
||||
const int outer_dim = state.range(0);
|
||||
const int middle_dim = state.range(1);
|
||||
|
||||
RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
|
||||
4 /* intra_threads */, 4 /* channels */);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ReverseRowsOf4Channels_4T_uint8)
|
||||
->UseRealTime()
|
||||
->ArgPair(288, 288)
|
||||
->ArgPair(1024, 1024)
|
||||
->ArgPair(10 * 1024, 1024);
|
||||
|
@ -450,34 +450,44 @@ static Graph* RollGraph(const TensorShape& shape, int isd) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_ROLL_OUTER(DEVICE) \
|
||||
static void BM_##DEVICE##_roll_outer(int iters, int rows, int columns) { \
|
||||
TensorShape shape{rows, columns}; \
|
||||
const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
|
||||
testing::ItemsProcessed(num_items); \
|
||||
testing::BytesProcessed(num_items * sizeof(float)); \
|
||||
testing::UseRealTime(); \
|
||||
test::Benchmark(#DEVICE, RollGraph(shape, 0)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_roll_outer) \
|
||||
->ArgPair(256, 256) \
|
||||
->ArgPair(512, 512) \
|
||||
->ArgPair(1024, 1024) \
|
||||
#define BM_ROLL_OUTER(DEVICE) \
|
||||
static void BM_##DEVICE##_roll_outer(::testing::benchmark::State& state) { \
|
||||
const int rows = state.range(0); \
|
||||
const int columns = state.range(1); \
|
||||
\
|
||||
TensorShape shape{rows, columns}; \
|
||||
test::Benchmark(#DEVICE, RollGraph(shape, 0), /*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
const int64 num_items = \
|
||||
static_cast<int64>(state.iterations()) * shape.num_elements(); \
|
||||
state.SetItemsProcessed(num_items); \
|
||||
state.SetBytesProcessed(num_items * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_roll_outer) \
|
||||
->UseRealTime() \
|
||||
->ArgPair(256, 256) \
|
||||
->ArgPair(512, 512) \
|
||||
->ArgPair(1024, 1024) \
|
||||
->ArgPair(2048, 2048)
|
||||
|
||||
#define BM_ROLL_ALL(DEVICE) \
|
||||
static void BM_##DEVICE##_roll_all(int iters, int rows, int columns) { \
|
||||
TensorShape shape{rows, columns}; \
|
||||
const int64 num_items = static_cast<int64>(iters) * shape.num_elements(); \
|
||||
testing::ItemsProcessed(num_items); \
|
||||
testing::BytesProcessed(num_items * sizeof(float)); \
|
||||
testing::UseRealTime(); \
|
||||
test::Benchmark(#DEVICE, RollGraph(shape, 1)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_roll_all) \
|
||||
->ArgPair(256, 256) \
|
||||
->ArgPair(512, 512) \
|
||||
->ArgPair(1024, 1024) \
|
||||
#define BM_ROLL_ALL(DEVICE) \
|
||||
static void BM_##DEVICE##_roll_all(::testing::benchmark::State& state) { \
|
||||
const int rows = state.range(0); \
|
||||
const int columns = state.range(1); \
|
||||
\
|
||||
TensorShape shape{rows, columns}; \
|
||||
test::Benchmark(#DEVICE, RollGraph(shape, 1), /*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
const int64 num_items = \
|
||||
static_cast<int64>(state.iterations()) * shape.num_elements(); \
|
||||
state.SetItemsProcessed(num_items); \
|
||||
state.SetBytesProcessed(num_items * sizeof(float)); \
|
||||
} \
|
||||
BENCHMARK(BM_##DEVICE##_roll_all) \
|
||||
->UseRealTime() \
|
||||
->ArgPair(256, 256) \
|
||||
->ArgPair(512, 512) \
|
||||
->ArgPair(1024, 1024) \
|
||||
->ArgPair(2048, 2048)
|
||||
|
||||
BM_ROLL_OUTER(cpu);
|
||||
|
@ -663,8 +663,8 @@ TEST_F(SaveOpSlices2Test, TwoSlices) {
|
||||
|
||||
// Benchmark-related code below.
|
||||
|
||||
static void BM_LargeTensorWrite(int iters, int num_elements) {
|
||||
testing::StopTiming();
|
||||
void BM_LargeTensorWrite(::testing::benchmark::State& state) {
|
||||
const int num_elements = state.range(0);
|
||||
|
||||
// 4 * num_elements bytes total , since sizeof(float) == 4.
|
||||
Tensor tensor(DT_FLOAT, TensorShape({num_elements}));
|
||||
@ -689,8 +689,9 @@ static void BM_LargeTensorWrite(int iters, int num_elements) {
|
||||
VLOG(1) << "Save op's output path: " << temp_filename;
|
||||
VLOG(1) << "# nodes in Graph: " << g->num_nodes();
|
||||
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g, &session_options).Run(iters);
|
||||
test::Benchmark("cpu", g, &session_options, nullptr, nullptr, "",
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
}
|
||||
BENCHMARK(BM_LargeTensorWrite)->Arg((1 << 30) / 4 /* 1GB float tensor */);
|
||||
|
||||
|
@ -67,79 +67,120 @@ static Graph* ThreeDYCumsum(int num_y, int num_z, bool reverse = false) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void LargeOneDimensional(int iters, const string& device, int num_x,
|
||||
static void LargeOneDimensional(::testing::benchmark::State& state,
|
||||
const string& device, int num_x,
|
||||
bool reverse = false) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_x);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num_x * sizeof(T));
|
||||
test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse)).Run(iters);
|
||||
test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse),
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
sizeof(T));
|
||||
}
|
||||
|
||||
static void DoRowCumsum(int iters, const string& device, int num_x, int num_y,
|
||||
static void DoRowCumsum(::testing::benchmark::State& state,
|
||||
const string& device, int num_x, int num_y,
|
||||
bool reverse = false) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
|
||||
sizeof(float));
|
||||
test::Benchmark(device, RowCumsum(num_x, num_y, reverse)).Run(iters);
|
||||
test::Benchmark(device, RowCumsum(num_x, num_y, reverse),
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y * sizeof(float));
|
||||
}
|
||||
|
||||
static void DoColCumsum(int iters, const string& device, int num_x, int num_y,
|
||||
static void DoColCumsum(::testing::benchmark::State& state,
|
||||
const string& device, int num_x, int num_y,
|
||||
bool reverse = false) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
|
||||
sizeof(float));
|
||||
test::Benchmark(device, ColCumsum(num_x, num_y, reverse)).Run(iters);
|
||||
test::Benchmark(device, ColCumsum(num_x, num_y, reverse),
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y * sizeof(float));
|
||||
}
|
||||
|
||||
static void Do3DYCumsum(int iters, const string& device, int num_x, int num_y,
|
||||
static void Do3DYCumsum(::testing::benchmark::State& state,
|
||||
const string& device, int num_x, int num_y,
|
||||
bool reverse = false) {
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
|
||||
sizeof(float));
|
||||
test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse)).Run(iters);
|
||||
test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse),
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * num_x *
|
||||
num_y * sizeof(float));
|
||||
}
|
||||
|
||||
static void BM_OneDCumsumGPU(int iters, int num_x) {
|
||||
LargeOneDimensional<float>(iters, "gpu", num_x);
|
||||
static void BM_OneDCumsumGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
|
||||
LargeOneDimensional<float>(state, "gpu", num_x);
|
||||
}
|
||||
BENCHMARK(BM_OneDCumsumGPU)->Range(1, 1 << 21);
|
||||
|
||||
static void BM_OneDCumsumGPUHalf(int iters, int num_x) {
|
||||
LargeOneDimensional<Eigen::half>(iters, "gpu", num_x);
|
||||
static void BM_OneDCumsumGPUHalf(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
|
||||
LargeOneDimensional<Eigen::half>(state, "gpu", num_x);
|
||||
}
|
||||
BENCHMARK(BM_OneDCumsumGPUHalf)->Range(1, 1 << 21);
|
||||
|
||||
static void BM_Sum2DRowCumsumGPU(int iters, int num_x, int num_y) {
|
||||
DoRowCumsum(iters, "gpu", num_x, num_y);
|
||||
static void BM_Sum2DRowCumsumGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
DoRowCumsum(state, "gpu", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum2DRowCumsumGPU)->RangePair(1, 8192, 1, 8192);
|
||||
|
||||
static void BM_Sum2DColumnCumsumGPU(int iters, int num_x, int num_y) {
|
||||
DoColCumsum(iters, "gpu", num_x, num_y);
|
||||
static void BM_Sum2DColumnCumsumGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
DoColCumsum(state, "gpu", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum2DColumnCumsumGPU)->RangePair(1, 8192, 1, 8192);
|
||||
|
||||
static void BM_Sum3DYCumsumGPU(int iters, int num_x, int num_y) {
|
||||
Do3DYCumsum(iters, "gpu", num_x, num_y);
|
||||
static void BM_Sum3DYCumsumGPU(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
Do3DYCumsum(state, "gpu", num_x, num_y);
|
||||
}
|
||||
BENCHMARK(BM_Sum3DYCumsumGPU)->RangePair(64, 4096, 64, 4096);
|
||||
|
||||
static void BM_OneDCumsumGPU_reverse(int iters, int num_x) {
|
||||
LargeOneDimensional<float>(iters, "gpu", num_x, true);
|
||||
static void BM_OneDCumsumGPU_reverse(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
|
||||
LargeOneDimensional<float>(state, "gpu", num_x, true);
|
||||
}
|
||||
BENCHMARK(BM_OneDCumsumGPU_reverse)->Range(1, 1 << 21);
|
||||
|
||||
static void BM_Sum2DRowCumsumGPU_reverse(int iters, int num_x, int num_y) {
|
||||
DoRowCumsum(iters, "gpu", num_x, num_y, true);
|
||||
static void BM_Sum2DRowCumsumGPU_reverse(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
DoRowCumsum(state, "gpu", num_x, num_y, true);
|
||||
}
|
||||
BENCHMARK(BM_Sum2DRowCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
|
||||
|
||||
static void BM_Sum2DColumnCumsumGPU_reverse(int iters, int num_x, int num_y) {
|
||||
DoColCumsum(iters, "gpu", num_x, num_y, true);
|
||||
static void BM_Sum2DColumnCumsumGPU_reverse(
|
||||
::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
DoColCumsum(state, "gpu", num_x, num_y, true);
|
||||
}
|
||||
BENCHMARK(BM_Sum2DColumnCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
|
||||
|
||||
static void BM_Sum3DYCumsumGPU_reverse(int iters, int num_x, int num_y) {
|
||||
Do3DYCumsum(iters, "gpu", num_x, num_y, true);
|
||||
static void BM_Sum3DYCumsumGPU_reverse(::testing::benchmark::State& state) {
|
||||
const int num_x = state.range(0);
|
||||
const int num_y = state.range(1);
|
||||
|
||||
Do3DYCumsum(state, "gpu", num_x, num_y, true);
|
||||
}
|
||||
BENCHMARK(BM_Sum3DYCumsumGPU_reverse)->RangePair(32, 2048, 32, 2048);
|
||||
|
||||
|
@ -254,8 +254,8 @@ class ScatterNdUpdateBM : public ScatterNdUpdateOpTest {
|
||||
};
|
||||
|
||||
template <typename Index>
|
||||
static void BM_ScatterNdHelper(int iters, int embedding_size, const char* op) {
|
||||
testing::StopTiming();
|
||||
void BM_ScatterNdHelper(::testing::benchmark::State& state, int embedding_size,
|
||||
const char* op) {
|
||||
const int kRows = 10000000 / embedding_size;
|
||||
std::vector<float> values;
|
||||
values.reserve(kRows);
|
||||
@ -280,27 +280,33 @@ static void BM_ScatterNdHelper(int iters, int embedding_size, const char* op) {
|
||||
bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
|
||||
bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
|
||||
updates);
|
||||
testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
|
||||
iters);
|
||||
testing::StartTiming();
|
||||
while (iters-- > 0) {
|
||||
for (auto i : state) {
|
||||
Status s = bm.RunOpKernel();
|
||||
}
|
||||
testing::StopTiming();
|
||||
state.SetItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
|
||||
state.iterations());
|
||||
}
|
||||
|
||||
static void BM_ScatterNdUpdateInt32(int iters, int embedding_size) {
|
||||
BM_ScatterNdHelper<int32>(iters, embedding_size, "ScatterNdUpdate");
|
||||
void BM_ScatterNdUpdateInt32(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterNdHelper<int32>(state, embedding_size, "ScatterNdUpdate");
|
||||
}
|
||||
static void BM_ScatterNdUpdateInt64(int iters, int embedding_size) {
|
||||
BM_ScatterNdHelper<int64>(iters, embedding_size, "ScatterNdUpdate");
|
||||
void BM_ScatterNdUpdateInt64(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterNdHelper<int64>(state, embedding_size, "ScatterNdUpdate");
|
||||
}
|
||||
|
||||
static void BM_ScatterNdAddInt32(int iters, int embedding_size) {
|
||||
BM_ScatterNdHelper<int32>(iters, embedding_size, "ScatterNdAdd");
|
||||
void BM_ScatterNdAddInt32(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterNdHelper<int32>(state, embedding_size, "ScatterNdAdd");
|
||||
}
|
||||
static void BM_ScatterNdAddInt64(int iters, int embedding_size) {
|
||||
BM_ScatterNdHelper<int64>(iters, embedding_size, "ScatterNdAdd");
|
||||
void BM_ScatterNdAddInt64(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterNdHelper<int64>(state, embedding_size, "ScatterNdAdd");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ScatterNdUpdateInt32)
|
||||
|
@ -280,9 +280,8 @@ class ScatterUpdateBM : public ScatterUpdateOpTest {
|
||||
};
|
||||
|
||||
template <typename Index>
|
||||
static void BM_ScatterHelper(int iters, int embedding_size, const char* op,
|
||||
bool big_num_updates = false) {
|
||||
testing::StopTiming();
|
||||
void BM_ScatterHelper(::testing::benchmark::State& state, int embedding_size,
|
||||
const char* op, bool big_num_updates = false) {
|
||||
const int kRows = 10000000 / embedding_size;
|
||||
std::vector<float> values;
|
||||
values.reserve(kRows);
|
||||
@ -307,59 +306,83 @@ static void BM_ScatterHelper(int iters, int embedding_size, const char* op,
|
||||
bm.AddInputFromArray<Index>(TensorShape({kNumUpdates}), indices);
|
||||
bm.AddInputFromArray<float>(TensorShape({kNumUpdates, embedding_size}),
|
||||
updates);
|
||||
testing::ItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
|
||||
iters);
|
||||
testing::StartTiming();
|
||||
while (iters-- > 0) {
|
||||
for (auto i : state) {
|
||||
Status s = bm.RunOpKernel();
|
||||
}
|
||||
testing::StopTiming();
|
||||
state.SetItemsProcessed((static_cast<int64>(kNumUpdates) * embedding_size) *
|
||||
state.iterations());
|
||||
}
|
||||
|
||||
static void BM_ScatterUpdateInt32(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterUpdate");
|
||||
void BM_ScatterUpdateInt32(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int32>(state, embedding_size, "ScatterUpdate");
|
||||
}
|
||||
static void BM_ScatterUpdateInt64(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterUpdate");
|
||||
void BM_ScatterUpdateInt64(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int64>(state, embedding_size, "ScatterUpdate");
|
||||
}
|
||||
|
||||
static void BM_ScatterAddInt32(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd");
|
||||
void BM_ScatterAddInt32(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int32>(state, embedding_size, "ScatterAdd");
|
||||
}
|
||||
|
||||
static void BM_ScatterAddInt32Large(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterAdd", true);
|
||||
void BM_ScatterAddInt32Large(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int32>(state, embedding_size, "ScatterAdd", true);
|
||||
}
|
||||
static void BM_ScatterAddInt64(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterAdd");
|
||||
void BM_ScatterAddInt64(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int64>(state, embedding_size, "ScatterAdd");
|
||||
}
|
||||
|
||||
static void BM_ScatterMulInt32(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMul");
|
||||
void BM_ScatterMulInt32(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int32>(state, embedding_size, "ScatterMul");
|
||||
}
|
||||
static void BM_ScatterMulInt64(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMul");
|
||||
void BM_ScatterMulInt64(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int64>(state, embedding_size, "ScatterMul");
|
||||
}
|
||||
|
||||
static void BM_ScatterDivInt32(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterDiv");
|
||||
void BM_ScatterDivInt32(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int32>(state, embedding_size, "ScatterDiv");
|
||||
}
|
||||
static void BM_ScatterDivInt64(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterDiv");
|
||||
void BM_ScatterDivInt64(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int64>(state, embedding_size, "ScatterDiv");
|
||||
}
|
||||
|
||||
static void BM_ScatterMinInt32(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMin");
|
||||
void BM_ScatterMinInt32(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int32>(state, embedding_size, "ScatterMin");
|
||||
}
|
||||
static void BM_ScatterMinInt64(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMin");
|
||||
void BM_ScatterMinInt64(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int64>(state, embedding_size, "ScatterMin");
|
||||
}
|
||||
|
||||
static void BM_ScatterMaxInt32(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int32>(iters, embedding_size, "ScatterMax");
|
||||
void BM_ScatterMaxInt32(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int32>(state, embedding_size, "ScatterMax");
|
||||
}
|
||||
static void BM_ScatterMaxInt64(int iters, int embedding_size) {
|
||||
BM_ScatterHelper<int64>(iters, embedding_size, "ScatterMax");
|
||||
void BM_ScatterMaxInt64(::testing::benchmark::State& state) {
|
||||
const int embedding_size = state.range(0);
|
||||
|
||||
BM_ScatterHelper<int64>(state, embedding_size, "ScatterMax");
|
||||
}
|
||||
|
||||
BENCHMARK(BM_ScatterUpdateInt32)
|
||||
|
@ -39,10 +39,9 @@ limitations under the License.
|
||||
namespace tensorflow {
|
||||
|
||||
template <typename Index>
|
||||
static void BM_SegmentReduction(int iters, const string& reduction,
|
||||
Index num_rows, Index num_cols,
|
||||
Index segment_size) {
|
||||
testing::StopTiming();
|
||||
static void BM_SegmentReduction(::testing::benchmark::State& state,
|
||||
const string& reduction, Index num_rows,
|
||||
Index num_cols, Index segment_size) {
|
||||
std::unique_ptr<Device> device(
|
||||
DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
|
||||
|
||||
@ -81,24 +80,25 @@ static void BM_SegmentReduction(int iters, const string& reduction,
|
||||
|
||||
reduction_op->Compute(reduction_context.get());
|
||||
TF_CHECK_OK(reduction_context->status());
|
||||
testing::StartTiming();
|
||||
for (int i = 0; i < iters; ++i) {
|
||||
for (auto s : state) {
|
||||
delete reduction_context->release_output(0).tensor;
|
||||
reduction_op->Compute(reduction_context.get());
|
||||
}
|
||||
int64 bytes_per_iter =
|
||||
static_cast<int64>(num_rows * num_cols * sizeof(float));
|
||||
testing::BytesProcessed(bytes_per_iter * iters);
|
||||
state.SetBytesProcessed(bytes_per_iter * state.iterations());
|
||||
}
|
||||
|
||||
#define BM_Reduce(O, R, C, S) \
|
||||
static void BM_Reduce_##O##_##R##_##C##_##S##_int32(int iters) { \
|
||||
BM_SegmentReduction<int32>(iters, #O, R, C, S); \
|
||||
} \
|
||||
static void BM_Reduce_##O##_##R##_##C##_##S##_int64(int iters) { \
|
||||
BM_SegmentReduction<int64>(iters, #O, R, C, S); \
|
||||
} \
|
||||
BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32); \
|
||||
#define BM_Reduce(O, R, C, S) \
|
||||
static void BM_Reduce_##O##_##R##_##C##_##S##_int32( \
|
||||
::testing::benchmark::State & state) { \
|
||||
BM_SegmentReduction<int32>(state, #O, R, C, S); \
|
||||
} \
|
||||
static void BM_Reduce_##O##_##R##_##C##_##S##_int64( \
|
||||
::testing::benchmark::State & state) { \
|
||||
BM_SegmentReduction<int64>(state, #O, R, C, S); \
|
||||
} \
|
||||
BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32); \
|
||||
BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64);
|
||||
|
||||
#define BM_Reduce_Arg(R, C, S) \
|
||||
@ -113,8 +113,8 @@ BM_Reduce_Arg(64, 32, 2);
|
||||
BM_Reduce_Arg(4096, 32, 2);
|
||||
BM_Reduce_Arg(4096, 128, 2);
|
||||
|
||||
static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
|
||||
testing::StopTiming();
|
||||
static void SparseSegmentMeanGradHelper(::testing::benchmark::State& state,
|
||||
float uniqueness, int size) {
|
||||
Graph* g = new Graph(OpRegistry::Global());
|
||||
CHECK_LE(uniqueness, 1.0);
|
||||
CHECK_GT(uniqueness, 0.0);
|
||||
@ -148,22 +148,24 @@ static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
|
||||
.Attr("T", DT_FLOAT)
|
||||
.Finalize(g, &node));
|
||||
|
||||
testing::UseRealTime();
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) *
|
||||
sizeof(float));
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
|
||||
(kDim1 * kDim2) * sizeof(float));
|
||||
}
|
||||
|
||||
static void BM_SparseSegmentMeanGrad_Low(int iters, int size) {
|
||||
return SparseSegmentMeanGradHelper(iters, 1.0, size);
|
||||
static void BM_SparseSegmentMeanGrad_Low(::testing::benchmark::State& state) {
|
||||
const int size = state.range(0);
|
||||
|
||||
return SparseSegmentMeanGradHelper(state, 1.0, size);
|
||||
}
|
||||
|
||||
static void BM_SparseSegmentMeanGrad_High(int iters, int size) {
|
||||
return SparseSegmentMeanGradHelper(iters, 0.01, size);
|
||||
static void BM_SparseSegmentMeanGrad_High(::testing::benchmark::State& state) {
|
||||
const int size = state.range(0);
|
||||
|
||||
return SparseSegmentMeanGradHelper(state, 0.01, size);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000);
|
||||
BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000);
|
||||
BENCHMARK(BM_SparseSegmentMeanGrad_Low)->UseRealTime()->Arg(1000)->Arg(100000);
|
||||
BENCHMARK(BM_SparseSegmentMeanGrad_High)->UseRealTime()->Arg(1000)->Arg(100000);
|
||||
|
||||
} // namespace tensorflow
|
||||
|
@ -54,21 +54,21 @@ static Graph* Recv() {
|
||||
return g;
|
||||
}
|
||||
|
||||
static void BM_Send(int iters) {
|
||||
testing::UseRealTime();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters));
|
||||
test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous)
|
||||
.Run(iters);
|
||||
void BM_Send(::testing::benchmark::State& state) {
|
||||
test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous, "",
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()));
|
||||
}
|
||||
BENCHMARK(BM_Send);
|
||||
BENCHMARK(BM_Send)->UseRealTime();
|
||||
|
||||
static void BM_Recv(int iters) {
|
||||
testing::UseRealTime();
|
||||
testing::ItemsProcessed(static_cast<int64>(iters));
|
||||
test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous)
|
||||
.Run(iters);
|
||||
void BM_Recv(::testing::benchmark::State& state) {
|
||||
test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous, "",
|
||||
/*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()));
|
||||
}
|
||||
BENCHMARK(BM_Recv);
|
||||
BENCHMARK(BM_Recv)->UseRealTime();
|
||||
|
||||
} // namespace
|
||||
} // namespace tensorflow
|
||||
|
@ -37,8 +37,8 @@ namespace {
|
||||
// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
|
||||
// in size, and concat them together along "concat_dimension"
|
||||
template <typename T>
|
||||
static void SliceHelper(int iters, int size) {
|
||||
testing::StopTiming();
|
||||
static void SliceHelper(::testing::benchmark::State& state) {
|
||||
const int size = state.range(0);
|
||||
Graph* g = new Graph(OpRegistry::Global());
|
||||
DataType dt = DataTypeToEnum<T>::v();
|
||||
int kDim = 100;
|
||||
@ -65,26 +65,24 @@ static void SliceHelper(int iters, int size) {
|
||||
.Finalize(g, &node));
|
||||
FixupSourceAndSinkEdges(g);
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * kDim * size * sizeof(T));
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g, nullptr, nullptr, nullptr,
|
||||
"SINGLE_THREADED_EXECUTOR")
|
||||
.Run(iters);
|
||||
|
||||
testing::UseRealTime();
|
||||
"SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false)
|
||||
.Run(state);
|
||||
state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim * size *
|
||||
sizeof(T));
|
||||
}
|
||||
|
||||
static void BM_SliceFloat(int iters, int dim2) {
|
||||
SliceHelper<float>(iters, dim2);
|
||||
void BM_SliceFloat(::testing::benchmark::State& state) {
|
||||
SliceHelper<float>(state);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_SliceFloat)->Arg(100)->Arg(1000)->Arg(10000);
|
||||
BENCHMARK(BM_SliceFloat)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);
|
||||
|
||||
static void BM_SliceBFloat16(int iters, int dim2) {
|
||||
SliceHelper<bfloat16>(iters, dim2);
|
||||
void BM_SliceBFloat16(::testing::benchmark::State& state) {
|
||||
SliceHelper<bfloat16>(state);
|
||||
}
|
||||
|
||||
BENCHMARK(BM_SliceBFloat16)->Arg(100)->Arg(1000)->Arg(10000);
|
||||
BENCHMARK(BM_SliceBFloat16)->UseRealTime()->Arg(100)->Arg(1000)->Arg(10000);
|
||||
|
||||
} // namespace
|
||||
} // namespace tensorflow
|
||||
|
@ -276,15 +276,18 @@ static ST MakeSparseTensor(Graph* g, int B, int M, int N, int nnz_inner) {
|
||||
|
||||
// [8, 4, N{nnz}] cmul [8, 4, N]
|
||||
#define BM_SparseMatCMulDenseMatArgs(N, NNZ_INNER) \
|
||||
static void BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER(int iters) { \
|
||||
static void BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER( \
|
||||
::testing::benchmark::State& state) { \
|
||||
Graph* g = new Graph(OpRegistry::Global()); \
|
||||
Node* dense = MakeTensor(g, 8, 4, N); \
|
||||
ST sp = MakeSparseTensor(g, 8, 4, N, NNZ_INNER); \
|
||||
\
|
||||
testing::ItemsProcessed(static_cast<int64>(iters * 8 * 4 * N * 2)); \
|
||||
test::Benchmark( \
|
||||
"cpu", SparseMatCMulDenseMat(g, sp.indices, sp.vals, sp.shape, dense)) \
|
||||
.Run(iters); \
|
||||
"cpu", SparseMatCMulDenseMat(g, sp.indices, sp.vals, sp.shape, dense), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed( \
|
||||
static_cast<int64>(state.iterations() * 8 * 4 * N * 2)); \
|
||||
} \
|
||||
BENCHMARK(BM_SparseMatCMulDenseMat_##N##_##NNZ_INNER)
|
||||
|
||||
|
@ -198,9 +198,11 @@ TEST_F(SparseToDenseTest, ThreeD_MultValues) {
|
||||
|
||||
} // namespace
|
||||
|
||||
static void BM_SparseToDense(int iters, int NDIM, int N) {
|
||||
static void BM_SparseToDense(::testing::benchmark::State& state) {
|
||||
const int NDIM = state.range(0);
|
||||
const int N = state.range(1);
|
||||
|
||||
// TODO(zhifengc): Switch to use kernel_benchmark_testlib.h
|
||||
tensorflow::testing::StopTiming();
|
||||
|
||||
const int IndexDim = (NDIM == 1) ? 0 : 1;
|
||||
|
||||
@ -253,18 +255,15 @@ static void BM_SparseToDense(int iters, int NDIM, int N) {
|
||||
|
||||
std::unique_ptr<OpKernelContext> sparse_context(new OpKernelContext(&params));
|
||||
op->Compute(sparse_context.get());
|
||||
tensorflow::testing::StartTiming();
|
||||
for (int i = 0; i < iters; ++i) {
|
||||
for (auto s : state) {
|
||||
delete sparse_context->release_output(0).tensor;
|
||||
op->Compute(sparse_context.get());
|
||||
TF_ASSERT_OK(sparse_context->status());
|
||||
}
|
||||
tensorflow::testing::StopTiming();
|
||||
|
||||
// processing input, mainly
|
||||
int64 bytes_per_iter = static_cast<int64>((N + N * NDIM) * sizeof(float));
|
||||
|
||||
tensorflow::testing::BytesProcessed(bytes_per_iter * iters);
|
||||
state.SetBytesProcessed(bytes_per_iter * state.iterations());
|
||||
}
|
||||
|
||||
BENCHMARK(BM_SparseToDense)
|
||||
|
@ -41,11 +41,15 @@ static Graph* SparseXent(int batch_size, int num_classes) {
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_SparseXentDev(BATCH, CLASS, DEVICE) \
|
||||
static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE(int iters) { \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
|
||||
test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS)).Run(iters); \
|
||||
} \
|
||||
#define BM_SparseXentDev(BATCH, CLASS, DEVICE) \
|
||||
static void BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE( \
|
||||
::testing::benchmark::State& state) { \
|
||||
test::Benchmark(#DEVICE, SparseXent(BATCH, CLASS), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * BATCH * \
|
||||
CLASS); \
|
||||
} \
|
||||
BENCHMARK(BM_SparseXent##_##BATCH##_##CLASS##_##DEVICE);
|
||||
|
||||
/// The representative tests for ptb_word on GPU
|
||||
|
Loading…
Reference in New Issue
Block a user