Updated benchmarks to newer API
PiperOrigin-RevId: 358013972 Change-Id: I99f0f538a39845408fbc29dcd60652c42eaf652e
This commit is contained in:
parent
7d45aa8560
commit
1178262a2a
@ -116,12 +116,15 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
|
||||
#define BM_Conv2DBwdFilter(T, FMT, N, H, W, C, FH, FW, FC, SH, SW, PADDING, \
|
||||
type) \
|
||||
static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, \
|
||||
FW, FC, SH, SW, PADDING)(int iters) { \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * \
|
||||
(C)); \
|
||||
test::Benchmark(#type, Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, \
|
||||
SW, PADDING, FORMAT_##FMT)) \
|
||||
.Run(iters); \
|
||||
FW, FC, SH, SW, \
|
||||
PADDING)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, SW, \
|
||||
PADDING, FORMAT_##FMT), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * (N) * \
|
||||
(H) * (W) * (C)); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, FW, \
|
||||
FC, SH, SW, PADDING));
|
||||
|
@ -84,9 +84,9 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
|
||||
.Input(backprop)
|
||||
.Attr("T", DataTypeToEnum<T>::value)
|
||||
.Attr("strides", {1, stride_h, stride_w, 1})
|
||||
.Attr("padding", padding == Padding::SAME
|
||||
? "SAME"
|
||||
: padding == Padding::VALID ? "VALID" : "N/A")
|
||||
.Attr("padding", padding == Padding::SAME ? "SAME"
|
||||
: padding == Padding::VALID ? "VALID"
|
||||
: "N/A")
|
||||
.Attr("data_format", ToString(data_format))
|
||||
.Finalize(graph, &conv2d));
|
||||
|
||||
@ -115,12 +115,14 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
|
||||
#define BM_Conv2DBwdInput(T, FMT, N, H, W, C, FW, FH, FC, SH, SW, PADDING, \
|
||||
type) \
|
||||
static void BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, \
|
||||
FW, FC, SH, SW, PADDING)(int iters) { \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * \
|
||||
(C)); \
|
||||
test::Benchmark(#type, Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH, \
|
||||
SW, PADDING, FORMAT_##FMT)) \
|
||||
.Run(iters); \
|
||||
FW, FC, SH, SW, \
|
||||
PADDING)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark(#type, \
|
||||
Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH, SW, \
|
||||
PADDING, FORMAT_##FMT), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(state.iterations() * (N) * (H) * (W) * (C)); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, FW, \
|
||||
FC, SH, SW, PADDING));
|
||||
|
@ -91,10 +91,8 @@ TEST(SnapshotUtilTest, CombinationRoundTripTest) {
|
||||
SnapshotRoundTrip(io::compression::kSnappy, 2);
|
||||
}
|
||||
|
||||
void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
|
||||
int version) {
|
||||
tensorflow::testing::StopTiming();
|
||||
|
||||
void SnapshotReaderBenchmarkLoop(::testing::benchmark::State& state,
|
||||
std::string compression_type, int version) {
|
||||
tensorflow::DataTypeVector dtypes;
|
||||
std::vector<Tensor> tensors;
|
||||
GenerateTensorVector(dtypes, tensors);
|
||||
@ -106,7 +104,7 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
|
||||
TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
|
||||
compression_type, version, dtypes, &writer));
|
||||
|
||||
for (int i = 0; i < iters; ++i) {
|
||||
for (auto s : state) {
|
||||
writer->WriteTensors(tensors).IgnoreError();
|
||||
}
|
||||
TF_ASSERT_OK(writer->Close());
|
||||
@ -115,34 +113,32 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
|
||||
TF_ASSERT_OK(Reader::Create(Env::Default(), filename, compression_type,
|
||||
version, dtypes, &reader));
|
||||
|
||||
tensorflow::testing::StartTiming();
|
||||
for (int i = 0; i < iters; ++i) {
|
||||
for (auto s : state) {
|
||||
std::vector<Tensor> read_tensors;
|
||||
reader->ReadTensors(&read_tensors).IgnoreError();
|
||||
}
|
||||
tensorflow::testing::StopTiming();
|
||||
|
||||
TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
|
||||
}
|
||||
|
||||
void SnapshotCustomReaderNoneBenchmark(int iters) {
|
||||
SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 1);
|
||||
void SnapshotCustomReaderNoneBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 1);
|
||||
}
|
||||
|
||||
void SnapshotCustomReaderGzipBenchmark(int iters) {
|
||||
SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 1);
|
||||
void SnapshotCustomReaderGzipBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 1);
|
||||
}
|
||||
|
||||
void SnapshotCustomReaderSnappyBenchmark(int iters) {
|
||||
SnapshotReaderBenchmarkLoop(iters, io::compression::kSnappy, 1);
|
||||
void SnapshotCustomReaderSnappyBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotReaderBenchmarkLoop(state, io::compression::kSnappy, 1);
|
||||
}
|
||||
|
||||
void SnapshotTFRecordReaderNoneBenchmark(int iters) {
|
||||
SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 2);
|
||||
void SnapshotTFRecordReaderNoneBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 2);
|
||||
}
|
||||
|
||||
void SnapshotTFRecordReaderGzipBenchmark(int iters) {
|
||||
SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 2);
|
||||
void SnapshotTFRecordReaderGzipBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 2);
|
||||
}
|
||||
|
||||
BENCHMARK(SnapshotCustomReaderNoneBenchmark);
|
||||
@ -151,10 +147,8 @@ BENCHMARK(SnapshotCustomReaderSnappyBenchmark);
|
||||
BENCHMARK(SnapshotTFRecordReaderNoneBenchmark);
|
||||
BENCHMARK(SnapshotTFRecordReaderGzipBenchmark);
|
||||
|
||||
void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type,
|
||||
int version) {
|
||||
tensorflow::testing::StopTiming();
|
||||
|
||||
void SnapshotWriterBenchmarkLoop(::testing::benchmark::State& state,
|
||||
std::string compression_type, int version) {
|
||||
tensorflow::DataTypeVector dtypes;
|
||||
std::vector<Tensor> tensors;
|
||||
GenerateTensorVector(dtypes, tensors);
|
||||
@ -166,38 +160,36 @@ void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type,
|
||||
TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
|
||||
compression_type, version, dtypes, &writer));
|
||||
|
||||
tensorflow::testing::StartTiming();
|
||||
for (int i = 0; i < iters; ++i) {
|
||||
for (auto s : state) {
|
||||
writer->WriteTensors(tensors).IgnoreError();
|
||||
}
|
||||
writer->Close().IgnoreError();
|
||||
tensorflow::testing::StopTiming();
|
||||
|
||||
TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
|
||||
}
|
||||
|
||||
void SnapshotCustomWriterNoneBenchmark(int iters) {
|
||||
SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 1);
|
||||
void SnapshotCustomWriterNoneBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 1);
|
||||
}
|
||||
|
||||
void SnapshotCustomWriterGzipBenchmark(int iters) {
|
||||
SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 1);
|
||||
void SnapshotCustomWriterGzipBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 1);
|
||||
}
|
||||
|
||||
void SnapshotCustomWriterSnappyBenchmark(int iters) {
|
||||
SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 1);
|
||||
void SnapshotCustomWriterSnappyBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 1);
|
||||
}
|
||||
|
||||
void SnapshotTFRecordWriterNoneBenchmark(int iters) {
|
||||
SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 2);
|
||||
void SnapshotTFRecordWriterNoneBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 2);
|
||||
}
|
||||
|
||||
void SnapshotTFRecordWriterGzipBenchmark(int iters) {
|
||||
SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 2);
|
||||
void SnapshotTFRecordWriterGzipBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 2);
|
||||
}
|
||||
|
||||
void SnapshotTFRecordWriterSnappyBenchmark(int iters) {
|
||||
SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 2);
|
||||
void SnapshotTFRecordWriterSnappyBenchmark(::testing::benchmark::State& state) {
|
||||
SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 2);
|
||||
}
|
||||
|
||||
BENCHMARK(SnapshotCustomWriterNoneBenchmark);
|
||||
|
@ -35,8 +35,9 @@ class SpatialConvolutionBenchmarksSuite {
|
||||
|
||||
using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
|
||||
|
||||
SpatialConvolutionBenchmarksSuite(int iters, Device& device)
|
||||
: iters_(iters), device_(device) {}
|
||||
SpatialConvolutionBenchmarksSuite(::testing::benchmark::State& state,
|
||||
Device& device)
|
||||
: state_(state), device_(device) {}
|
||||
|
||||
Eigen::Index BufferSize(const Dimensions& dims) {
|
||||
return dims.TotalSize() * sizeof(Scalar);
|
||||
@ -62,12 +63,10 @@ class SpatialConvolutionBenchmarksSuite {
|
||||
Filter filter(filter_data, filter_dims);
|
||||
Output output(output_data, output_dims);
|
||||
|
||||
::tensorflow::testing::StartTiming();
|
||||
for (int i = 0; i < iters_; ++i) {
|
||||
for (auto s : state_) {
|
||||
output.device(device_) = Eigen::SpatialConvolution(input, filter);
|
||||
tensorflow::testing::DoNotOptimize(output);
|
||||
}
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
device_.deallocate(input_data);
|
||||
device_.deallocate(filter_data);
|
||||
@ -102,13 +101,11 @@ class SpatialConvolutionBenchmarksSuite {
|
||||
OutputBackward output_backward(output_backward_data, output_dims);
|
||||
InputBackward input_backward(input_backward_data, input_dims);
|
||||
|
||||
::tensorflow::testing::StartTiming();
|
||||
for (int i = 0; i < iters_; ++i) {
|
||||
for (auto s : state_) {
|
||||
input_backward.device(device_) = Eigen::SpatialConvolutionBackwardInput(
|
||||
filter, output_backward, input_rows, input_cols);
|
||||
tensorflow::testing::DoNotOptimize(input_backward);
|
||||
}
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
device_.deallocate(filter_data);
|
||||
device_.deallocate(output_backward_data);
|
||||
@ -143,13 +140,11 @@ class SpatialConvolutionBenchmarksSuite {
|
||||
OutputBackward output_backward(output_backward_data, input_dims);
|
||||
FilterBackward filter_backward(filter_backward_data, filter_dims);
|
||||
|
||||
::tensorflow::testing::StartTiming();
|
||||
for (int i = 0; i < iters_; ++i) {
|
||||
for (auto s : state_) {
|
||||
filter_backward.device(device_) = Eigen::SpatialConvolutionBackwardKernel(
|
||||
input, output_backward, filter_rows, filter_cols);
|
||||
tensorflow::testing::DoNotOptimize(filter_backward);
|
||||
}
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
device_.deallocate(input_data);
|
||||
device_.deallocate(output_backward_data);
|
||||
@ -157,7 +152,8 @@ class SpatialConvolutionBenchmarksSuite {
|
||||
}
|
||||
|
||||
private:
|
||||
int iters_;
|
||||
::testing::benchmark::State& state_;
|
||||
|
||||
Device& device_;
|
||||
};
|
||||
|
||||
@ -170,8 +166,9 @@ class CuboidConvolutionBenchmarksSuite {
|
||||
|
||||
using Dimensions = Eigen::DSizes<Eigen::Index, 5>;
|
||||
|
||||
CuboidConvolutionBenchmarksSuite(int iters, Device& device)
|
||||
: iters_(iters), device_(device) {}
|
||||
CuboidConvolutionBenchmarksSuite(::testing::benchmark::State& state,
|
||||
Device& device)
|
||||
: state_(state), device_(device) {}
|
||||
|
||||
Eigen::Index BufferSize(const Dimensions& dims) {
|
||||
return dims.TotalSize() * sizeof(Scalar);
|
||||
@ -198,12 +195,10 @@ class CuboidConvolutionBenchmarksSuite {
|
||||
Filter filter(filter_data, filter_dims);
|
||||
Output output(output_data, output_dims);
|
||||
|
||||
::tensorflow::testing::StartTiming();
|
||||
for (int i = 0; i < iters_; ++i) {
|
||||
for (auto s : state_) {
|
||||
output.device(device_) = Eigen::CuboidConvolution(input, filter);
|
||||
tensorflow::testing::DoNotOptimize(output);
|
||||
}
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
device_.deallocate(input_data);
|
||||
device_.deallocate(filter_data);
|
||||
@ -240,13 +235,11 @@ class CuboidConvolutionBenchmarksSuite {
|
||||
OutputBackward output_backward(output_backward_data, output_dims);
|
||||
InputBackward input_backward(input_backward_data, input_dims);
|
||||
|
||||
::tensorflow::testing::StartTiming();
|
||||
for (int i = 0; i < iters_; ++i) {
|
||||
for (auto s : state_) {
|
||||
input_backward.device(device_) = Eigen::CuboidConvolutionBackwardInput(
|
||||
filter, output_backward, input_planes, input_rows, input_cols);
|
||||
tensorflow::testing::DoNotOptimize(input_backward);
|
||||
}
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
device_.deallocate(filter_data);
|
||||
device_.deallocate(output_backward_data);
|
||||
@ -283,13 +276,11 @@ class CuboidConvolutionBenchmarksSuite {
|
||||
OutputBackward output_backward(output_backward_data, output_dims);
|
||||
FilterBackward filter_backward(filter_backward_data, filter_dims);
|
||||
|
||||
::tensorflow::testing::StartTiming();
|
||||
for (int i = 0; i < iters_; ++i) {
|
||||
for (auto s : state_) {
|
||||
filter_backward.device(device_) = Eigen::CuboidConvolutionBackwardKernel(
|
||||
input, output_backward, filter_planes, filter_rows, filter_cols);
|
||||
tensorflow::testing::DoNotOptimize(filter_backward);
|
||||
}
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
device_.deallocate(input_data);
|
||||
device_.deallocate(output_backward_data);
|
||||
@ -297,7 +288,7 @@ class CuboidConvolutionBenchmarksSuite {
|
||||
}
|
||||
|
||||
private:
|
||||
int iters_;
|
||||
::testing::benchmark::State& state_;
|
||||
Device& device_;
|
||||
};
|
||||
|
||||
|
@ -27,19 +27,17 @@ limitations under the License.
|
||||
// Spatial Convolutions //
|
||||
// -------------------------------------------------------------------------- //
|
||||
|
||||
void SpatialConvolution(int iters, int num_threads,
|
||||
void SpatialConvolution(::testing::benchmark::State& state, int num_threads,
|
||||
/* Input dimensions: */
|
||||
int input_batches, int input_height, int input_width,
|
||||
int input_depth,
|
||||
/* Filter (kernel) dimensions: */
|
||||
int filter_count, int filter_height, int filter_width) {
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
CREATE_THREAD_POOL(num_threads);
|
||||
|
||||
using Benchmark =
|
||||
SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
|
||||
auto benchmark = Benchmark(iters, device);
|
||||
auto benchmark = Benchmark(state, device);
|
||||
|
||||
typename Benchmark::Dimensions input_dims(input_batches, input_height,
|
||||
input_width, input_depth);
|
||||
@ -52,23 +50,22 @@ void SpatialConvolution(int iters, int num_threads,
|
||||
(input_dims.TotalSize() / input_depth) * filter_count;
|
||||
auto flops =
|
||||
num_computed_elements * (input_depth * filter_height * filter_width);
|
||||
::tensorflow::testing::ItemsProcessed(flops * iters);
|
||||
state.SetItemsProcessed(flops * state.iterations());
|
||||
}
|
||||
|
||||
void SpatialConvolutionBackwardInput(int iters, int num_threads,
|
||||
void SpatialConvolutionBackwardInput(::testing::benchmark::State& state,
|
||||
int num_threads,
|
||||
/* Input dimensions: */
|
||||
int input_batches, int input_height,
|
||||
int input_width, int input_depth,
|
||||
/* Filter (kernel) dimensions: */
|
||||
int filter_count, int filter_height,
|
||||
int filter_width) {
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
CREATE_THREAD_POOL(num_threads);
|
||||
|
||||
using Benchmark =
|
||||
SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
|
||||
auto benchmark = Benchmark(iters, device);
|
||||
auto benchmark = Benchmark(state, device);
|
||||
|
||||
typename Benchmark::Dimensions input_dims(input_batches, input_height,
|
||||
input_width, input_depth);
|
||||
@ -80,23 +77,22 @@ void SpatialConvolutionBackwardInput(int iters, int num_threads,
|
||||
auto num_computed_elements = input_dims.TotalSize();
|
||||
auto flops =
|
||||
num_computed_elements * (input_depth * filter_height * filter_width);
|
||||
::tensorflow::testing::ItemsProcessed(flops * iters);
|
||||
state.SetItemsProcessed(flops * state.iterations());
|
||||
}
|
||||
|
||||
void SpatialConvolutionBackwardKernel(int iters, int num_threads,
|
||||
void SpatialConvolutionBackwardKernel(::testing::benchmark::State& state,
|
||||
int num_threads,
|
||||
/* Input dimensions: */
|
||||
int input_batches, int input_height,
|
||||
int input_width, int input_depth,
|
||||
/* Filter (kernel) dimensions: */
|
||||
int filter_count, int filter_height,
|
||||
int filter_width) {
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
CREATE_THREAD_POOL(num_threads);
|
||||
|
||||
using Benchmark =
|
||||
SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
|
||||
auto benchmark = Benchmark(iters, device);
|
||||
auto benchmark = Benchmark(state, device);
|
||||
|
||||
typename Benchmark::Dimensions input_dims(input_batches, input_height,
|
||||
input_width, input_depth);
|
||||
@ -108,7 +104,7 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
|
||||
auto num_computed_elements = filter_dims.TotalSize();
|
||||
auto flops =
|
||||
num_computed_elements * (input_batches * input_height * input_width);
|
||||
::tensorflow::testing::ItemsProcessed(flops * iters);
|
||||
state.SetItemsProcessed(flops * state.iterations());
|
||||
}
|
||||
|
||||
// Macro arguments names: --------------------------------------------------- //
|
||||
@ -126,26 +122,26 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
|
||||
|
||||
#define BM_SpatialConvolution(NT, N, H, W, C, FC, FH, FW, LABEL) \
|
||||
static void BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, \
|
||||
FW)(int iters) { \
|
||||
::tensorflow::testing::SetLabel(LABEL); \
|
||||
SpatialConvolution(iters, NT, N, H, W, C, FC, FH, FW); \
|
||||
FW)(::testing::benchmark::State & state) { \
|
||||
state.SetLabel(LABEL); \
|
||||
SpatialConvolution(state, NT, N, H, W, C, FC, FH, FW); \
|
||||
} \
|
||||
BENCHMARK(BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, FW))
|
||||
|
||||
#define BM_SpatialConvolutionBwdInput(NT, N, H, W, C, FC, FH, FW, LABEL) \
|
||||
static void BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, \
|
||||
FH, FW)(int iters) { \
|
||||
::tensorflow::testing::SetLabel(LABEL); \
|
||||
SpatialConvolutionBackwardInput(iters, NT, N, H, W, C, FC, FH, FW); \
|
||||
FH, FW)(::testing::benchmark::State & state) { \
|
||||
state.SetLabel(LABEL); \
|
||||
SpatialConvolutionBackwardInput(state, NT, N, H, W, C, FC, FH, FW); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, FH, FW))
|
||||
|
||||
#define BM_SpatialConvolutionBwdKernel(NT, N, H, W, C, FC, FH, FW, LABEL) \
|
||||
static void BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \
|
||||
FH, FW)(int iters) { \
|
||||
::tensorflow::testing::SetLabel(LABEL); \
|
||||
SpatialConvolutionBackwardKernel(iters, NT, N, H, W, C, FC, FH, FW); \
|
||||
FH, FW)(::testing::benchmark::State & state) { \
|
||||
state.SetLabel(LABEL); \
|
||||
SpatialConvolutionBackwardKernel(state, NT, N, H, W, C, FC, FH, FW); \
|
||||
} \
|
||||
BENCHMARK(BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \
|
||||
FH, FW))
|
||||
@ -248,20 +244,18 @@ BM_SpatialConvolutionsBwdKernel(32, 7, 7, 192, 384, 3, 3, "conv5_00_3x3");
|
||||
// Cuboid Convolutions //
|
||||
// -------------------------------------------------------------------------- //
|
||||
|
||||
void CuboidConvolution(int iters, int num_threads,
|
||||
void CuboidConvolution(::testing::benchmark::State& state, int num_threads,
|
||||
/* Input dimensions: */
|
||||
int input_batches, int input_height, int input_width,
|
||||
int input_planes, int input_depth,
|
||||
/* Filter (kernel) dimensions: */
|
||||
int filter_count, int filter_height, int filter_width,
|
||||
int filter_planes) {
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
CREATE_THREAD_POOL(num_threads);
|
||||
|
||||
using Benchmark =
|
||||
CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
|
||||
auto benchmark = Benchmark(iters, device);
|
||||
auto benchmark = Benchmark(state, device);
|
||||
|
||||
typename Benchmark::Dimensions input_dims(
|
||||
input_batches, input_height, input_width, input_planes, input_depth);
|
||||
@ -274,10 +268,11 @@ void CuboidConvolution(int iters, int num_threads,
|
||||
(input_dims.TotalSize() / input_depth) * filter_count;
|
||||
auto flops = num_computed_elements *
|
||||
(input_depth * filter_height * filter_width * filter_planes);
|
||||
::tensorflow::testing::ItemsProcessed(flops * iters);
|
||||
state.SetItemsProcessed(flops * state.iterations());
|
||||
}
|
||||
|
||||
void CuboidConvolutionBackwardInput(int iters, int num_threads,
|
||||
void CuboidConvolutionBackwardInput(::testing::benchmark::State& state,
|
||||
int num_threads,
|
||||
/* Input dimensions: */
|
||||
int input_batches, int input_height,
|
||||
int input_width, int input_planes,
|
||||
@ -285,13 +280,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
|
||||
/* Filter (kernel) dimensions: */
|
||||
int filter_count, int filter_height,
|
||||
int filter_width, int filter_planes) {
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
CREATE_THREAD_POOL(num_threads);
|
||||
|
||||
using Benchmark =
|
||||
CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
|
||||
auto benchmark = Benchmark(iters, device);
|
||||
auto benchmark = Benchmark(state, device);
|
||||
|
||||
typename Benchmark::Dimensions input_dims(
|
||||
input_batches, input_height, input_width, input_planes, input_depth);
|
||||
@ -303,10 +296,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
|
||||
auto num_computed_elements = input_dims.TotalSize();
|
||||
auto flops = num_computed_elements *
|
||||
(input_depth * filter_height * filter_width * filter_planes);
|
||||
::tensorflow::testing::ItemsProcessed(flops * iters);
|
||||
state.SetItemsProcessed(flops * state.iterations());
|
||||
}
|
||||
|
||||
void CuboidConvolutionBackwardKernel(int iters, int num_threads,
|
||||
void CuboidConvolutionBackwardKernel(::testing::benchmark::State& state,
|
||||
int num_threads,
|
||||
/* Input dimensions: */
|
||||
int input_batches, int input_height,
|
||||
int input_width, int input_planes,
|
||||
@ -314,13 +308,11 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
|
||||
/* Filter (kernel) dimensions: */
|
||||
int filter_count, int filter_height,
|
||||
int filter_width, int filter_planes) {
|
||||
::tensorflow::testing::StopTiming();
|
||||
|
||||
CREATE_THREAD_POOL(num_threads);
|
||||
|
||||
using Benchmark =
|
||||
CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
|
||||
auto benchmark = Benchmark(iters, device);
|
||||
auto benchmark = Benchmark(state, device);
|
||||
|
||||
typename Benchmark::Dimensions input_dims(
|
||||
input_batches, input_height, input_width, input_planes, input_depth);
|
||||
@ -332,9 +324,16 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
|
||||
auto num_computed_elements = filter_dims.TotalSize();
|
||||
auto flops = num_computed_elements *
|
||||
(input_batches * input_height * input_width * input_planes);
|
||||
::tensorflow::testing::ItemsProcessed(flops * iters);
|
||||
state.SetItemsProcessed(flops * state.iterations());
|
||||
}
|
||||
|
||||
// The multiple #'s in the function names + the `::testing::benchmark::State&`
|
||||
// as parameters apparently confuse clang if they are not on the same line. So
|
||||
// we need to turn off LINT and clang-format for this block.
|
||||
//
|
||||
// clang-format off
|
||||
// NOLINTBEGIN
|
||||
|
||||
// Macro arguments names: --------------------------------------------------- //
|
||||
// NT: num threads
|
||||
// N: batch size
|
||||
@ -354,33 +353,33 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
|
||||
_f_##FC##_##FH##_##FW##_##FP)
|
||||
|
||||
#define BM_CuboidConvolution(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \
|
||||
static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, \
|
||||
FP)(int iters) { \
|
||||
::tensorflow::testing::SetLabel(LABEL); \
|
||||
CuboidConvolution(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
|
||||
static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
|
||||
state.SetLabel(LABEL); \
|
||||
CuboidConvolution(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP))
|
||||
|
||||
#define BM_CuboidConvolutionBwdInput(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \
|
||||
static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, \
|
||||
FH, FW, FP)(int iters) { \
|
||||
::tensorflow::testing::SetLabel(LABEL); \
|
||||
CuboidConvolutionBackwardInput(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
|
||||
static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
|
||||
state.SetLabel(LABEL); \
|
||||
CuboidConvolutionBackwardInput(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
|
||||
} \
|
||||
BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, \
|
||||
FH, FW, FP))
|
||||
|
||||
#define BM_CuboidConvolutionBwdKernel(NT, N, H, W, P, C, FC, FH, FW, FP, \
|
||||
LABEL) \
|
||||
static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, \
|
||||
FC, FH, FW, FP)(int iters) { \
|
||||
::tensorflow::testing::SetLabel(LABEL); \
|
||||
CuboidConvolutionBackwardKernel(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
|
||||
static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
|
||||
state.SetLabel(LABEL); \
|
||||
CuboidConvolutionBackwardKernel(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
|
||||
} \
|
||||
BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, \
|
||||
FH, FW, FP))
|
||||
|
||||
// NOLINTEND
|
||||
// clang-format on
|
||||
|
||||
#define BM_CuboidConvolutions(N, H, W, P, C, FC, FH, FW, FP, LABEL) \
|
||||
BM_CuboidConvolution(2, N, H, W, P, C, FC, FH, FW, FP, LABEL); \
|
||||
BM_CuboidConvolution(4, N, H, W, P, C, FC, FH, FW, FP, LABEL); \
|
||||
|
@ -283,18 +283,23 @@ static Graph* FusedBatchNormGrad(int n, int h, int w, int c, bool is_training,
|
||||
// -------------------------------------------------------------------------- //
|
||||
// FusedBatchNorm inference
|
||||
// -------------------------------------------------------------------------- //
|
||||
// clang-format off
|
||||
// NOLINTBEGIN
|
||||
#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
|
||||
static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark( \
|
||||
#DEVICE, \
|
||||
FusedBatchNormInference<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(state.iterations() * N * H * W * C); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
|
||||
->UseRealTime();
|
||||
|
||||
#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
|
||||
static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, \
|
||||
DEVICE)(int iters) { \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
|
||||
test::Benchmark(#DEVICE, FusedBatchNormInference<T>( \
|
||||
N, H, W, C, IS_TRAINING, FORMAT_##FORMAT)) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE));
|
||||
// NOLINTEND
|
||||
// clang-format on
|
||||
|
||||
BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
|
||||
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);
|
||||
@ -320,17 +325,19 @@ BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
|
||||
// FusedBatchNorm gradient
|
||||
// -------------------------------------------------------------------------- //
|
||||
|
||||
#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
|
||||
static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
|
||||
DEVICE)(int iters) { \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
|
||||
test::Benchmark(#DEVICE, FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, \
|
||||
FORMAT_##FORMAT)) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
|
||||
DEVICE));
|
||||
#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
|
||||
static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
|
||||
DEVICE)(::testing::benchmark::State & state) { \
|
||||
test::Benchmark( \
|
||||
#DEVICE, \
|
||||
FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(state.iterations() * N * H * W * C); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
|
||||
->UseRealTime();
|
||||
|
||||
#define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE) \
|
||||
BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE); \
|
||||
|
@ -98,14 +98,16 @@ static Graph* BandedTriangularSolve(int64 num_bands, int64 n, int64 m,
|
||||
// BS: boolean indicating whether to use the banded solver
|
||||
// T: C++ type of scalars (e.g. float, std::complex)
|
||||
// TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
|
||||
#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D) \
|
||||
static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT( \
|
||||
int iters) { \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * K * N + N * M); \
|
||||
test::Benchmark(#D, BandedTriangularSolve<T>(K, N, M, BS, TT)).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT);
|
||||
#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D) \
|
||||
static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT( \
|
||||
::testing::benchmark::State& state) { \
|
||||
test::Benchmark(#D, BandedTriangularSolve<T>(K, N, M, BS, TT), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(state.iterations() * K * N + N * M); \
|
||||
} \
|
||||
BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT) \
|
||||
->UseRealTime();
|
||||
|
||||
#define BM_BandedTriangularSolve(K, N, M, BS, D) \
|
||||
BM_BandedTriangularSolveDev(K, N, M, BS, float, DT_FLOAT, D); \
|
||||
|
@ -101,18 +101,18 @@ static Graph* MatrixTriangularSolveWithBroadcast(int64 b0, int64 b1, int64 m,
|
||||
// T: C++ type of scalars (e.g. float, std::complex)
|
||||
// TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
|
||||
// D: Device (e.g. cpu, gpu)
|
||||
#define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D) \
|
||||
static void \
|
||||
BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D( \
|
||||
int iters) { \
|
||||
testing::UseRealTime(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * std::max(B1, B2) * M * \
|
||||
M * N * 2); \
|
||||
test::Benchmark( \
|
||||
#D, MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT)) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
// Defines and registers one benchmark that runs the graph built by
// MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT) on device D,
// measured in real time. Items processed are reported as
// iterations * max(B1, B2) * M * M * N * 2.
#define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D)               \
  static void                                                                 \
      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D( \
          ::testing::benchmark::State& state) {                               \
    test::Benchmark(                                                          \
        #D, MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT),      \
        /*old_benchmark_api*/ false)                                          \
        .Run(state);                                                          \
    /* Reported after Run(): the iteration count is not populated until */    \
    /* the benchmark loop has executed, so reading it earlier yields 0. */    \
    state.SetItemsProcessed(state.iterations() * std::max(B1, B2) * M * M *   \
                            N * 2);                                           \
  }                                                                           \
  BENCHMARK(                                                                  \
      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D) \
      ->UseRealTime();
|
||||
|
||||
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
|
||||
|
@ -56,20 +56,25 @@ static Graph* ConstructSpaceToBatchGraph(
|
||||
|
||||
// The BM_Expand macro is needed for this to build with VC++.
// It expands to its argument unchanged.
|
||||
#define BM_Expand(x) x
|
||||
// Macro is already longer than 80 chars.
|
||||
// NOLINTBEGIN
|
||||
// Defines and registers one benchmark whose name encodes OP, DEVICE, DTYPE,
// the input shape (B, H, W, D), the block size BS and the four paddings.
// The State-based body runs the graph from ConstructSpaceToBatchGraph on
// DEVICE, then reports iterations * B * (H + P00 + P01) * (W + P10 + P11) * D
// as items processed.
// NOTE(review): this span also carries the superseded `int iters` lines of
// the same macro, interleaved without markers -- confirm which side is
// current before compiling.
#define BM_SpaceToBatchDev(OP, DEVICE, DTYPE, B, H, W, D, BS, P00, P01, P10, \
|
||||
P11) \
|
||||
static void \
|
||||
BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11( \
|
||||
int iters) { \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * B * (H + P00 + P01) * \
|
||||
::testing::benchmark::State& state) { \
|
||||
test::Benchmark( \
|
||||
#DEVICE, \
|
||||
ConstructSpaceToBatchGraph(#OP, TensorShape({B, H, W, D}), BS, DTYPE, \
|
||||
{{P00, P01}, {P10, P11}}), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(state.iterations() * B * (H + P00 + P01) * \
|
||||
(W + P10 + P11) * D); \
|
||||
test::Benchmark(#DEVICE, ConstructSpaceToBatchGraph( \
|
||||
#OP, TensorShape({B, H, W, D}), BS, DTYPE, \
|
||||
{{P00, P01}, {P10, P11}})) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
|
||||
// NOLINTEND
|
||||
#define BM_SpaceToBatch(OP, ...) \
|
||||
BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \
|
||||
BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \
|
||||
|
@ -107,36 +107,30 @@ static Graph* ReplicatedSparseMatMul(int m, int n, int d, float sparsity_1,
|
||||
// Defines and registers one CPU benchmark for SparseMatMul<TA, TB>(M, N, K,
// S1 / 100.0, S2 / 100.0, TRA, TRB), labelled with the transpose flags and
// sparsities and measured in real time. The State-based body reports
// iterations * M * K * N * 2 as items processed, after Run(state).
// NOTE(review): this span also carries the superseded `int iters` lines of
// the same macro, interleaved without markers -- confirm which side is
// current before compiling.
#define BM_SPARSE(M, K, N, S1, S2, TRA, TRB, TA, TB) \
|
||||
static void \
|
||||
BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB( \
|
||||
int iters) { \
|
||||
testing::StopTiming(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \
|
||||
::testing::benchmark::State& state) { \
|
||||
auto label = strings::Printf("tr_a: %d tr_b: %d sp_a: %0.2f sp_b: %0.2f", \
|
||||
TRA, TRB, S1 / 100.0, S2 / 100.0); \
|
||||
testing::SetLabel(label); \
|
||||
testing::UseRealTime(); \
|
||||
state.SetLabel(label); \
|
||||
auto g = SparseMatMul<TA, TB>(M, N, K, S1 / 100.0, S2 / 100.0, TRA, TRB); \
|
||||
testing::StartTiming(); \
|
||||
test::Benchmark("cpu", g).Run(iters); \
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
|
||||
state.SetItemsProcessed(state.iterations() * M * K * N * 2); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB);
|
||||
BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB) \
|
||||
->UseRealTime();
|
||||
|
||||
// Defines and registers one CPU benchmark for ReplicatedSparseMatMul(M, N, K,
// S1 / 100.0, S2 / 100.0, Copies), labelled with the copy count and
// sparsities and measured in real time. The State-based body reports
// iterations * M * K * N * Copies * 2 as items processed, after Run(state).
// NOTE(review): this span also carries the superseded `int iters` lines of
// the same macro, interleaved without markers -- confirm which side is
// current before compiling.
#define BM_SPARSE_REPLICATED(M, K, N, S1, S2, Copies) \
|
||||
static void BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies( \
|
||||
int iters) { \
|
||||
testing::StopTiming(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * Copies * \
|
||||
2); \
|
||||
::testing::benchmark::State& state) { \
|
||||
auto label = strings::Printf("copies: %d sp_a: %0.2f sp_b: %0.2f", \
|
||||
(Copies), S1 / 100.0, S2 / 100.0); \
|
||||
testing::SetLabel(label); \
|
||||
testing::UseRealTime(); \
|
||||
state.SetLabel(label); \
|
||||
auto g = \
|
||||
ReplicatedSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, (Copies)); \
|
||||
testing::StartTiming(); \
|
||||
test::Benchmark("cpu", g).Run(iters); \
|
||||
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
|
||||
state.SetItemsProcessed(state.iterations() * M * K * N * Copies * 2); \
|
||||
} \
|
||||
BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies);
|
||||
BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies) \
|
||||
->UseRealTime();
|
||||
|
||||
// Shorthand for BM_SPARSE with float passed as both TA and TB.
#define BM_SPARSE_FLOAT(M, K, N, S1, S2, TRA, TRB) \
|
||||
BM_SPARSE(M, K, N, S1, S2, TRA, TRB, float, float)
|
||||
@ -219,22 +213,21 @@ static Graph* MultiSparseMatMul(int m, int n, int d, float sparsity_1,
|
||||
return g;
|
||||
}
|
||||
|
||||
#define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies) \
|
||||
static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies( \
|
||||
int iters) { \
|
||||
testing::StopTiming(); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 2 * \
|
||||
Copies); \
|
||||
auto label = strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies, \
|
||||
S1 / 100.0, S2 / 100.0); \
|
||||
testing::SetLabel(label); \
|
||||
testing::UseRealTime(); \
|
||||
auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies); \
|
||||
testing::StartTiming(); \
|
||||
test::Benchmark("cpu", g).Run(iters); \
|
||||
} \
|
||||
BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies);
|
||||
|
||||
// clang-format off
|
||||
// NOLINTBEGIN
|
||||
// Defines and registers one CPU benchmark for MultiSparseMatMul(M, N, K,
// S1 / 100.0, S2 / 100.0, Copies), measured in real time. The label encodes
// the dimensions, copy count and sparsities; items processed are reported as
// iterations * M * K * N * 2 * 2 * Copies once the run has finished.
#define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies)                               \
  static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies(      \
      ::testing::benchmark::State& state) {                                    \
    state.SetLabel(strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies, \
                                   S1 / 100.0, S2 / 100.0));                   \
    auto graph = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies);   \
    test::Benchmark("cpu", graph, /*old_benchmark_api*/ false).Run(state);     \
    state.SetItemsProcessed(state.iterations() * M * K * N * 2 * 2 * Copies);  \
  }                                                                            \
  BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies)        \
      ->UseRealTime();
|
||||
// NOLINTEND
|
||||
// clang-format on
|
||||
BM_SPARSE_MULTI(1024, 2140, 4096, 0, 82, 1);
|
||||
BM_SPARSE_MULTI(1024, 4096, 2048, 83, 83, 1);
|
||||
BM_SPARSE_MULTI(400, 800, 2560, 85, 85, 1);
|
||||
|
@ -68,19 +68,22 @@ static Graph* SparseTensorDenseMatmul(int nnz, int m, int k, int n,
|
||||
return g;
|
||||
}
|
||||
|
||||
// NOLINTBEGIN
|
||||
// Defines and registers one benchmark that runs the graph built by
// SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB) on DEVICE. The per-iteration
// item count is NNZ * (TB ? K : N). The State-based body reports
// iterations * items_per_iter as items processed and that value times
// sizeof(float) as bytes processed, after Run(state).
// NOTE(review): this span also carries the superseded `int iters` lines of
// the same macro, interleaved without markers -- confirm which side is
// current before compiling.
#define BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, DEVICE) \
|
||||
static void \
|
||||
BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE( \
|
||||
int iters) { \
|
||||
::testing::benchmark::State& state) { \
|
||||
int64 items_per_iter = (static_cast<int64>(NNZ) * (TB ? K : N)); \
|
||||
testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter); \
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * items_per_iter * \
|
||||
test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB), \
|
||||
/*old_benchmark_api*/ false) \
|
||||
.Run(state); \
|
||||
state.SetItemsProcessed(state.iterations() * items_per_iter); \
|
||||
state.SetBytesProcessed(state.iterations() * items_per_iter * \
|
||||
sizeof(float)); \
|
||||
test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB)) \
|
||||
.Run(iters); \
|
||||
} \
|
||||
BENCHMARK( \
|
||||
BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE);
|
||||
// NOLINTEND
|
||||
|
||||
#define BM_SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB) \
|
||||
BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, cpu); \
|
||||
|
Loading…
x
Reference in New Issue
Block a user