Updated benchmarks to newer API

PiperOrigin-RevId: 358013972
Change-Id: I99f0f538a39845408fbc29dcd60652c42eaf652e
This commit is contained in:
A. Unique TensorFlower 2021-02-17 12:58:26 -08:00 committed by TensorFlower Gardener
parent 7d45aa8560
commit 1178262a2a
11 changed files with 210 additions and 213 deletions

View File

@ -116,12 +116,15 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
#define BM_Conv2DBwdFilter(T, FMT, N, H, W, C, FH, FW, FC, SH, SW, PADDING, \ #define BM_Conv2DBwdFilter(T, FMT, N, H, W, C, FH, FW, FC, SH, SW, PADDING, \
type) \ type) \
static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, \ static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, \
FW, FC, SH, SW, PADDING)(int iters) { \ FW, FC, SH, SW, \
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * \ PADDING)(::testing::benchmark::State & state) { \
(C)); \ test::Benchmark(#type, \
test::Benchmark(#type, Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, \ Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, SW, \
SW, PADDING, FORMAT_##FMT)) \ PADDING, FORMAT_##FMT), \
.Run(iters); \ /*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * (N) * \
(H) * (W) * (C)); \
} \ } \
BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, FW, \ BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, FW, \
FC, SH, SW, PADDING)); FC, SH, SW, PADDING));

View File

@ -84,9 +84,9 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
.Input(backprop) .Input(backprop)
.Attr("T", DataTypeToEnum<T>::value) .Attr("T", DataTypeToEnum<T>::value)
.Attr("strides", {1, stride_h, stride_w, 1}) .Attr("strides", {1, stride_h, stride_w, 1})
.Attr("padding", padding == Padding::SAME .Attr("padding", padding == Padding::SAME ? "SAME"
? "SAME" : padding == Padding::VALID ? "VALID"
: padding == Padding::VALID ? "VALID" : "N/A") : "N/A")
.Attr("data_format", ToString(data_format)) .Attr("data_format", ToString(data_format))
.Finalize(graph, &conv2d)); .Finalize(graph, &conv2d));
@ -115,12 +115,14 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
#define BM_Conv2DBwdInput(T, FMT, N, H, W, C, FW, FH, FC, SH, SW, PADDING, \ #define BM_Conv2DBwdInput(T, FMT, N, H, W, C, FW, FH, FC, SH, SW, PADDING, \
type) \ type) \
static void BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, \ static void BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, \
FW, FC, SH, SW, PADDING)(int iters) { \ FW, FC, SH, SW, \
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * \ PADDING)(::testing::benchmark::State & state) { \
(C)); \ test::Benchmark(#type, \
test::Benchmark(#type, Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH, \ Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH, SW, \
SW, PADDING, FORMAT_##FMT)) \ PADDING, FORMAT_##FMT), \
.Run(iters); \ /*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(state.iterations() * (N) * (H) * (W) * (C)); \
} \ } \
BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, FW, \ BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, FW, \
FC, SH, SW, PADDING)); FC, SH, SW, PADDING));

View File

@ -91,10 +91,8 @@ TEST(SnapshotUtilTest, CombinationRoundTripTest) {
SnapshotRoundTrip(io::compression::kSnappy, 2); SnapshotRoundTrip(io::compression::kSnappy, 2);
} }
void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type, void SnapshotReaderBenchmarkLoop(::testing::benchmark::State& state,
int version) { std::string compression_type, int version) {
tensorflow::testing::StopTiming();
tensorflow::DataTypeVector dtypes; tensorflow::DataTypeVector dtypes;
std::vector<Tensor> tensors; std::vector<Tensor> tensors;
GenerateTensorVector(dtypes, tensors); GenerateTensorVector(dtypes, tensors);
@ -106,7 +104,7 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename, TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
compression_type, version, dtypes, &writer)); compression_type, version, dtypes, &writer));
for (int i = 0; i < iters; ++i) { for (auto s : state) {
writer->WriteTensors(tensors).IgnoreError(); writer->WriteTensors(tensors).IgnoreError();
} }
TF_ASSERT_OK(writer->Close()); TF_ASSERT_OK(writer->Close());
@ -115,34 +113,32 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
TF_ASSERT_OK(Reader::Create(Env::Default(), filename, compression_type, TF_ASSERT_OK(Reader::Create(Env::Default(), filename, compression_type,
version, dtypes, &reader)); version, dtypes, &reader));
tensorflow::testing::StartTiming(); for (auto s : state) {
for (int i = 0; i < iters; ++i) {
std::vector<Tensor> read_tensors; std::vector<Tensor> read_tensors;
reader->ReadTensors(&read_tensors).IgnoreError(); reader->ReadTensors(&read_tensors).IgnoreError();
} }
tensorflow::testing::StopTiming();
TF_ASSERT_OK(Env::Default()->DeleteFile(filename)); TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
} }
void SnapshotCustomReaderNoneBenchmark(int iters) { void SnapshotCustomReaderNoneBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 1); SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 1);
} }
void SnapshotCustomReaderGzipBenchmark(int iters) { void SnapshotCustomReaderGzipBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 1); SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 1);
} }
void SnapshotCustomReaderSnappyBenchmark(int iters) { void SnapshotCustomReaderSnappyBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kSnappy, 1); SnapshotReaderBenchmarkLoop(state, io::compression::kSnappy, 1);
} }
void SnapshotTFRecordReaderNoneBenchmark(int iters) { void SnapshotTFRecordReaderNoneBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 2); SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 2);
} }
void SnapshotTFRecordReaderGzipBenchmark(int iters) { void SnapshotTFRecordReaderGzipBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 2); SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 2);
} }
BENCHMARK(SnapshotCustomReaderNoneBenchmark); BENCHMARK(SnapshotCustomReaderNoneBenchmark);
@ -151,10 +147,8 @@ BENCHMARK(SnapshotCustomReaderSnappyBenchmark);
BENCHMARK(SnapshotTFRecordReaderNoneBenchmark); BENCHMARK(SnapshotTFRecordReaderNoneBenchmark);
BENCHMARK(SnapshotTFRecordReaderGzipBenchmark); BENCHMARK(SnapshotTFRecordReaderGzipBenchmark);
void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type, void SnapshotWriterBenchmarkLoop(::testing::benchmark::State& state,
int version) { std::string compression_type, int version) {
tensorflow::testing::StopTiming();
tensorflow::DataTypeVector dtypes; tensorflow::DataTypeVector dtypes;
std::vector<Tensor> tensors; std::vector<Tensor> tensors;
GenerateTensorVector(dtypes, tensors); GenerateTensorVector(dtypes, tensors);
@ -166,38 +160,36 @@ void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type,
TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename, TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
compression_type, version, dtypes, &writer)); compression_type, version, dtypes, &writer));
tensorflow::testing::StartTiming(); for (auto s : state) {
for (int i = 0; i < iters; ++i) {
writer->WriteTensors(tensors).IgnoreError(); writer->WriteTensors(tensors).IgnoreError();
} }
writer->Close().IgnoreError(); writer->Close().IgnoreError();
tensorflow::testing::StopTiming();
TF_ASSERT_OK(Env::Default()->DeleteFile(filename)); TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
} }
void SnapshotCustomWriterNoneBenchmark(int iters) { void SnapshotCustomWriterNoneBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 1); SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 1);
} }
void SnapshotCustomWriterGzipBenchmark(int iters) { void SnapshotCustomWriterGzipBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 1); SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 1);
} }
void SnapshotCustomWriterSnappyBenchmark(int iters) { void SnapshotCustomWriterSnappyBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 1); SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 1);
} }
void SnapshotTFRecordWriterNoneBenchmark(int iters) { void SnapshotTFRecordWriterNoneBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 2); SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 2);
} }
void SnapshotTFRecordWriterGzipBenchmark(int iters) { void SnapshotTFRecordWriterGzipBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 2); SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 2);
} }
void SnapshotTFRecordWriterSnappyBenchmark(int iters) { void SnapshotTFRecordWriterSnappyBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 2); SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 2);
} }
BENCHMARK(SnapshotCustomWriterNoneBenchmark); BENCHMARK(SnapshotCustomWriterNoneBenchmark);

View File

@ -35,8 +35,9 @@ class SpatialConvolutionBenchmarksSuite {
using Dimensions = Eigen::DSizes<Eigen::Index, 4>; using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
SpatialConvolutionBenchmarksSuite(int iters, Device& device) SpatialConvolutionBenchmarksSuite(::testing::benchmark::State& state,
: iters_(iters), device_(device) {} Device& device)
: state_(state), device_(device) {}
Eigen::Index BufferSize(const Dimensions& dims) { Eigen::Index BufferSize(const Dimensions& dims) {
return dims.TotalSize() * sizeof(Scalar); return dims.TotalSize() * sizeof(Scalar);
@ -62,12 +63,10 @@ class SpatialConvolutionBenchmarksSuite {
Filter filter(filter_data, filter_dims); Filter filter(filter_data, filter_dims);
Output output(output_data, output_dims); Output output(output_data, output_dims);
::tensorflow::testing::StartTiming(); for (auto s : state_) {
for (int i = 0; i < iters_; ++i) {
output.device(device_) = Eigen::SpatialConvolution(input, filter); output.device(device_) = Eigen::SpatialConvolution(input, filter);
tensorflow::testing::DoNotOptimize(output); tensorflow::testing::DoNotOptimize(output);
} }
::tensorflow::testing::StopTiming();
device_.deallocate(input_data); device_.deallocate(input_data);
device_.deallocate(filter_data); device_.deallocate(filter_data);
@ -102,13 +101,11 @@ class SpatialConvolutionBenchmarksSuite {
OutputBackward output_backward(output_backward_data, output_dims); OutputBackward output_backward(output_backward_data, output_dims);
InputBackward input_backward(input_backward_data, input_dims); InputBackward input_backward(input_backward_data, input_dims);
::tensorflow::testing::StartTiming(); for (auto s : state_) {
for (int i = 0; i < iters_; ++i) {
input_backward.device(device_) = Eigen::SpatialConvolutionBackwardInput( input_backward.device(device_) = Eigen::SpatialConvolutionBackwardInput(
filter, output_backward, input_rows, input_cols); filter, output_backward, input_rows, input_cols);
tensorflow::testing::DoNotOptimize(input_backward); tensorflow::testing::DoNotOptimize(input_backward);
} }
::tensorflow::testing::StopTiming();
device_.deallocate(filter_data); device_.deallocate(filter_data);
device_.deallocate(output_backward_data); device_.deallocate(output_backward_data);
@ -143,13 +140,11 @@ class SpatialConvolutionBenchmarksSuite {
OutputBackward output_backward(output_backward_data, input_dims); OutputBackward output_backward(output_backward_data, input_dims);
FilterBackward filter_backward(filter_backward_data, filter_dims); FilterBackward filter_backward(filter_backward_data, filter_dims);
::tensorflow::testing::StartTiming(); for (auto s : state_) {
for (int i = 0; i < iters_; ++i) {
filter_backward.device(device_) = Eigen::SpatialConvolutionBackwardKernel( filter_backward.device(device_) = Eigen::SpatialConvolutionBackwardKernel(
input, output_backward, filter_rows, filter_cols); input, output_backward, filter_rows, filter_cols);
tensorflow::testing::DoNotOptimize(filter_backward); tensorflow::testing::DoNotOptimize(filter_backward);
} }
::tensorflow::testing::StopTiming();
device_.deallocate(input_data); device_.deallocate(input_data);
device_.deallocate(output_backward_data); device_.deallocate(output_backward_data);
@ -157,7 +152,8 @@ class SpatialConvolutionBenchmarksSuite {
} }
private: private:
int iters_; ::testing::benchmark::State& state_;
Device& device_; Device& device_;
}; };
@ -170,8 +166,9 @@ class CuboidConvolutionBenchmarksSuite {
using Dimensions = Eigen::DSizes<Eigen::Index, 5>; using Dimensions = Eigen::DSizes<Eigen::Index, 5>;
CuboidConvolutionBenchmarksSuite(int iters, Device& device) CuboidConvolutionBenchmarksSuite(::testing::benchmark::State& state,
: iters_(iters), device_(device) {} Device& device)
: state_(state), device_(device) {}
Eigen::Index BufferSize(const Dimensions& dims) { Eigen::Index BufferSize(const Dimensions& dims) {
return dims.TotalSize() * sizeof(Scalar); return dims.TotalSize() * sizeof(Scalar);
@ -198,12 +195,10 @@ class CuboidConvolutionBenchmarksSuite {
Filter filter(filter_data, filter_dims); Filter filter(filter_data, filter_dims);
Output output(output_data, output_dims); Output output(output_data, output_dims);
::tensorflow::testing::StartTiming(); for (auto s : state_) {
for (int i = 0; i < iters_; ++i) {
output.device(device_) = Eigen::CuboidConvolution(input, filter); output.device(device_) = Eigen::CuboidConvolution(input, filter);
tensorflow::testing::DoNotOptimize(output); tensorflow::testing::DoNotOptimize(output);
} }
::tensorflow::testing::StopTiming();
device_.deallocate(input_data); device_.deallocate(input_data);
device_.deallocate(filter_data); device_.deallocate(filter_data);
@ -240,13 +235,11 @@ class CuboidConvolutionBenchmarksSuite {
OutputBackward output_backward(output_backward_data, output_dims); OutputBackward output_backward(output_backward_data, output_dims);
InputBackward input_backward(input_backward_data, input_dims); InputBackward input_backward(input_backward_data, input_dims);
::tensorflow::testing::StartTiming(); for (auto s : state_) {
for (int i = 0; i < iters_; ++i) {
input_backward.device(device_) = Eigen::CuboidConvolutionBackwardInput( input_backward.device(device_) = Eigen::CuboidConvolutionBackwardInput(
filter, output_backward, input_planes, input_rows, input_cols); filter, output_backward, input_planes, input_rows, input_cols);
tensorflow::testing::DoNotOptimize(input_backward); tensorflow::testing::DoNotOptimize(input_backward);
} }
::tensorflow::testing::StopTiming();
device_.deallocate(filter_data); device_.deallocate(filter_data);
device_.deallocate(output_backward_data); device_.deallocate(output_backward_data);
@ -283,13 +276,11 @@ class CuboidConvolutionBenchmarksSuite {
OutputBackward output_backward(output_backward_data, output_dims); OutputBackward output_backward(output_backward_data, output_dims);
FilterBackward filter_backward(filter_backward_data, filter_dims); FilterBackward filter_backward(filter_backward_data, filter_dims);
::tensorflow::testing::StartTiming(); for (auto s : state_) {
for (int i = 0; i < iters_; ++i) {
filter_backward.device(device_) = Eigen::CuboidConvolutionBackwardKernel( filter_backward.device(device_) = Eigen::CuboidConvolutionBackwardKernel(
input, output_backward, filter_planes, filter_rows, filter_cols); input, output_backward, filter_planes, filter_rows, filter_cols);
tensorflow::testing::DoNotOptimize(filter_backward); tensorflow::testing::DoNotOptimize(filter_backward);
} }
::tensorflow::testing::StopTiming();
device_.deallocate(input_data); device_.deallocate(input_data);
device_.deallocate(output_backward_data); device_.deallocate(output_backward_data);
@ -297,7 +288,7 @@ class CuboidConvolutionBenchmarksSuite {
} }
private: private:
int iters_; ::testing::benchmark::State& state_;
Device& device_; Device& device_;
}; };

View File

@ -27,19 +27,17 @@ limitations under the License.
// Spatial Convolutions // // Spatial Convolutions //
// -------------------------------------------------------------------------- // // -------------------------------------------------------------------------- //
void SpatialConvolution(int iters, int num_threads, void SpatialConvolution(::testing::benchmark::State& state, int num_threads,
/* Input dimensions: */ /* Input dimensions: */
int input_batches, int input_height, int input_width, int input_batches, int input_height, int input_width,
int input_depth, int input_depth,
/* Filter (kernel) dimensions: */ /* Filter (kernel) dimensions: */
int filter_count, int filter_height, int filter_width) { int filter_count, int filter_height, int filter_width) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads); CREATE_THREAD_POOL(num_threads);
using Benchmark = using Benchmark =
SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>; SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device); auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims(input_batches, input_height, typename Benchmark::Dimensions input_dims(input_batches, input_height,
input_width, input_depth); input_width, input_depth);
@ -52,23 +50,22 @@ void SpatialConvolution(int iters, int num_threads,
(input_dims.TotalSize() / input_depth) * filter_count; (input_dims.TotalSize() / input_depth) * filter_count;
auto flops = auto flops =
num_computed_elements * (input_depth * filter_height * filter_width); num_computed_elements * (input_depth * filter_height * filter_width);
::tensorflow::testing::ItemsProcessed(flops * iters); state.SetItemsProcessed(flops * state.iterations());
} }
void SpatialConvolutionBackwardInput(int iters, int num_threads, void SpatialConvolutionBackwardInput(::testing::benchmark::State& state,
int num_threads,
/* Input dimensions: */ /* Input dimensions: */
int input_batches, int input_height, int input_batches, int input_height,
int input_width, int input_depth, int input_width, int input_depth,
/* Filter (kernel) dimensions: */ /* Filter (kernel) dimensions: */
int filter_count, int filter_height, int filter_count, int filter_height,
int filter_width) { int filter_width) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads); CREATE_THREAD_POOL(num_threads);
using Benchmark = using Benchmark =
SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>; SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device); auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims(input_batches, input_height, typename Benchmark::Dimensions input_dims(input_batches, input_height,
input_width, input_depth); input_width, input_depth);
@ -80,23 +77,22 @@ void SpatialConvolutionBackwardInput(int iters, int num_threads,
auto num_computed_elements = input_dims.TotalSize(); auto num_computed_elements = input_dims.TotalSize();
auto flops = auto flops =
num_computed_elements * (input_depth * filter_height * filter_width); num_computed_elements * (input_depth * filter_height * filter_width);
::tensorflow::testing::ItemsProcessed(flops * iters); state.SetItemsProcessed(flops * state.iterations());
} }
void SpatialConvolutionBackwardKernel(int iters, int num_threads, void SpatialConvolutionBackwardKernel(::testing::benchmark::State& state,
int num_threads,
/* Input dimensions: */ /* Input dimensions: */
int input_batches, int input_height, int input_batches, int input_height,
int input_width, int input_depth, int input_width, int input_depth,
/* Filter (kernel) dimensions: */ /* Filter (kernel) dimensions: */
int filter_count, int filter_height, int filter_count, int filter_height,
int filter_width) { int filter_width) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads); CREATE_THREAD_POOL(num_threads);
using Benchmark = using Benchmark =
SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>; SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device); auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims(input_batches, input_height, typename Benchmark::Dimensions input_dims(input_batches, input_height,
input_width, input_depth); input_width, input_depth);
@ -108,7 +104,7 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
auto num_computed_elements = filter_dims.TotalSize(); auto num_computed_elements = filter_dims.TotalSize();
auto flops = auto flops =
num_computed_elements * (input_batches * input_height * input_width); num_computed_elements * (input_batches * input_height * input_width);
::tensorflow::testing::ItemsProcessed(flops * iters); state.SetItemsProcessed(flops * state.iterations());
} }
// Macro arguments names: --------------------------------------------------- // // Macro arguments names: --------------------------------------------------- //
@ -126,26 +122,26 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
#define BM_SpatialConvolution(NT, N, H, W, C, FC, FH, FW, LABEL) \ #define BM_SpatialConvolution(NT, N, H, W, C, FC, FH, FW, LABEL) \
static void BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, \ static void BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, \
FW)(int iters) { \ FW)(::testing::benchmark::State & state) { \
::tensorflow::testing::SetLabel(LABEL); \ state.SetLabel(LABEL); \
SpatialConvolution(iters, NT, N, H, W, C, FC, FH, FW); \ SpatialConvolution(state, NT, N, H, W, C, FC, FH, FW); \
} \ } \
BENCHMARK(BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, FW)) BENCHMARK(BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, FW))
#define BM_SpatialConvolutionBwdInput(NT, N, H, W, C, FC, FH, FW, LABEL) \ #define BM_SpatialConvolutionBwdInput(NT, N, H, W, C, FC, FH, FW, LABEL) \
static void BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, \ static void BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, \
FH, FW)(int iters) { \ FH, FW)(::testing::benchmark::State & state) { \
::tensorflow::testing::SetLabel(LABEL); \ state.SetLabel(LABEL); \
SpatialConvolutionBackwardInput(iters, NT, N, H, W, C, FC, FH, FW); \ SpatialConvolutionBackwardInput(state, NT, N, H, W, C, FC, FH, FW); \
} \ } \
BENCHMARK( \ BENCHMARK( \
BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, FH, FW)) BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, FH, FW))
#define BM_SpatialConvolutionBwdKernel(NT, N, H, W, C, FC, FH, FW, LABEL) \ #define BM_SpatialConvolutionBwdKernel(NT, N, H, W, C, FC, FH, FW, LABEL) \
static void BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \ static void BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \
FH, FW)(int iters) { \ FH, FW)(::testing::benchmark::State & state) { \
::tensorflow::testing::SetLabel(LABEL); \ state.SetLabel(LABEL); \
SpatialConvolutionBackwardKernel(iters, NT, N, H, W, C, FC, FH, FW); \ SpatialConvolutionBackwardKernel(state, NT, N, H, W, C, FC, FH, FW); \
} \ } \
BENCHMARK(BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \ BENCHMARK(BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \
FH, FW)) FH, FW))
@ -248,20 +244,18 @@ BM_SpatialConvolutionsBwdKernel(32, 7, 7, 192, 384, 3, 3, "conv5_00_3x3");
// Cuboid Convolutions // // Cuboid Convolutions //
// -------------------------------------------------------------------------- // // -------------------------------------------------------------------------- //
void CuboidConvolution(int iters, int num_threads, void CuboidConvolution(::testing::benchmark::State& state, int num_threads,
/* Input dimensions: */ /* Input dimensions: */
int input_batches, int input_height, int input_width, int input_batches, int input_height, int input_width,
int input_planes, int input_depth, int input_planes, int input_depth,
/* Filter (kernel) dimensions: */ /* Filter (kernel) dimensions: */
int filter_count, int filter_height, int filter_width, int filter_count, int filter_height, int filter_width,
int filter_planes) { int filter_planes) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads); CREATE_THREAD_POOL(num_threads);
using Benchmark = using Benchmark =
CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>; CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device); auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims( typename Benchmark::Dimensions input_dims(
input_batches, input_height, input_width, input_planes, input_depth); input_batches, input_height, input_width, input_planes, input_depth);
@ -274,10 +268,11 @@ void CuboidConvolution(int iters, int num_threads,
(input_dims.TotalSize() / input_depth) * filter_count; (input_dims.TotalSize() / input_depth) * filter_count;
auto flops = num_computed_elements * auto flops = num_computed_elements *
(input_depth * filter_height * filter_width * filter_planes); (input_depth * filter_height * filter_width * filter_planes);
::tensorflow::testing::ItemsProcessed(flops * iters); state.SetItemsProcessed(flops * state.iterations());
} }
void CuboidConvolutionBackwardInput(int iters, int num_threads, void CuboidConvolutionBackwardInput(::testing::benchmark::State& state,
int num_threads,
/* Input dimensions: */ /* Input dimensions: */
int input_batches, int input_height, int input_batches, int input_height,
int input_width, int input_planes, int input_width, int input_planes,
@ -285,13 +280,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
/* Filter (kernel) dimensions: */ /* Filter (kernel) dimensions: */
int filter_count, int filter_height, int filter_count, int filter_height,
int filter_width, int filter_planes) { int filter_width, int filter_planes) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads); CREATE_THREAD_POOL(num_threads);
using Benchmark = using Benchmark =
CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>; CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device); auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims( typename Benchmark::Dimensions input_dims(
input_batches, input_height, input_width, input_planes, input_depth); input_batches, input_height, input_width, input_planes, input_depth);
@ -303,10 +296,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
auto num_computed_elements = input_dims.TotalSize(); auto num_computed_elements = input_dims.TotalSize();
auto flops = num_computed_elements * auto flops = num_computed_elements *
(input_depth * filter_height * filter_width * filter_planes); (input_depth * filter_height * filter_width * filter_planes);
::tensorflow::testing::ItemsProcessed(flops * iters); state.SetItemsProcessed(flops * state.iterations());
} }
void CuboidConvolutionBackwardKernel(int iters, int num_threads, void CuboidConvolutionBackwardKernel(::testing::benchmark::State& state,
int num_threads,
/* Input dimensions: */ /* Input dimensions: */
int input_batches, int input_height, int input_batches, int input_height,
int input_width, int input_planes, int input_width, int input_planes,
@ -314,13 +308,11 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
/* Filter (kernel) dimensions: */ /* Filter (kernel) dimensions: */
int filter_count, int filter_height, int filter_count, int filter_height,
int filter_width, int filter_planes) { int filter_width, int filter_planes) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads); CREATE_THREAD_POOL(num_threads);
using Benchmark = using Benchmark =
CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>; CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device); auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims( typename Benchmark::Dimensions input_dims(
input_batches, input_height, input_width, input_planes, input_depth); input_batches, input_height, input_width, input_planes, input_depth);
@ -332,9 +324,16 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
auto num_computed_elements = filter_dims.TotalSize(); auto num_computed_elements = filter_dims.TotalSize();
auto flops = num_computed_elements * auto flops = num_computed_elements *
(input_batches * input_height * input_width * input_planes); (input_batches * input_height * input_width * input_planes);
::tensorflow::testing::ItemsProcessed(flops * iters); state.SetItemsProcessed(flops * state.iterations());
} }
// The multiple #'s in the function names, together with the
// `::testing::benchmark::State&` parameter, apparently confuse clang if they
// are not all on the same line. So we need to turn off LINT and clang-format
// for this block.
//
// clang-format off
// NOLINTBEGIN
// Macro arguments names: --------------------------------------------------- // // Macro arguments names: --------------------------------------------------- //
// NT: num threads // NT: num threads
// N: batch size // N: batch size
@ -354,33 +353,33 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
_f_##FC##_##FH##_##FW##_##FP) _f_##FC##_##FH##_##FW##_##FP)
#define BM_CuboidConvolution(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \ #define BM_CuboidConvolution(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \
static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, \ static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
FP)(int iters) { \ state.SetLabel(LABEL); \
::tensorflow::testing::SetLabel(LABEL); \ CuboidConvolution(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
CuboidConvolution(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
} \ } \
BENCHMARK( \ BENCHMARK( \
BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP)) BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP))
#define BM_CuboidConvolutionBwdInput(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \ #define BM_CuboidConvolutionBwdInput(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \
static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, \ static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
FH, FW, FP)(int iters) { \ state.SetLabel(LABEL); \
::tensorflow::testing::SetLabel(LABEL); \ CuboidConvolutionBackwardInput(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
CuboidConvolutionBackwardInput(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
} \ } \
BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, \ BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, \
FH, FW, FP)) FH, FW, FP))
// BM_CuboidConvolutionBwdKernel(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL)
// Defines a static benchmark function named by
// BM_CUBOID_NAME(CuboidConvolutionBwdKernel, ...) and registers it with
// BENCHMARK(). The generated function attaches LABEL to the benchmark and
// forwards NT plus the remaining size arguments to
// CuboidConvolutionBackwardKernel().
// Rows in this listing may carry two renderings of the same macro line: the
// `int iters` form first, then the `::testing::benchmark::State&` form.
#define BM_CuboidConvolutionBwdKernel(NT, N, H, W, P, C, FC, FH, FW, FP, \ #define BM_CuboidConvolutionBwdKernel(NT, N, H, W, P, C, FC, FH, FW, FP, \
LABEL) \ LABEL) \
static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, \ static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
FC, FH, FW, FP)(int iters) { \ state.SetLabel(LABEL); \
::tensorflow::testing::SetLabel(LABEL); \ CuboidConvolutionBackwardKernel(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
CuboidConvolutionBackwardKernel(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
} \ } \
BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, \ BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, \
FH, FW, FP))
// NOLINTEND
// clang-format on
#define BM_CuboidConvolutions(N, H, W, P, C, FC, FH, FW, FP, LABEL) \ #define BM_CuboidConvolutions(N, H, W, P, C, FC, FH, FW, FP, LABEL) \
BM_CuboidConvolution(2, N, H, W, P, C, FC, FH, FW, FP, LABEL); \ BM_CuboidConvolution(2, N, H, W, P, C, FC, FH, FW, FP, LABEL); \
BM_CuboidConvolution(4, N, H, W, P, C, FC, FH, FW, FP, LABEL); \ BM_CuboidConvolution(4, N, H, W, P, C, FC, FH, FW, FP, LABEL); \

View File

@ -283,18 +283,23 @@ static Graph* FusedBatchNormGrad(int n, int h, int w, int c, bool is_training,
// -------------------------------------------------------------------------- // // -------------------------------------------------------------------------- //
// FusedBatchNorm inference // FusedBatchNorm inference
// -------------------------------------------------------------------------- // // -------------------------------------------------------------------------- //
// clang-format off
// NOLINTBEGIN
// BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)
// Defines and registers one FusedBatchNorm inference benchmark:
//   * builds the graph with FusedBatchNormInference<T>(N, H, W, C,
//     IS_TRAINING, FORMAT_<FORMAT>) and runs it through
//     test::Benchmark(...).Run(state), passing the stringized DEVICE token as
//     the first argument;
//   * after the run, reports iterations * N * H * W * C as items processed;
//   * registers the generated function with ->UseRealTime().
// The size arguments are parenthesized in the items-processed product so that
// an expression argument (e.g. `2 + 2`) is multiplied as a whole.
#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)(::testing::benchmark::State & state) { \
  test::Benchmark( \
      #DEVICE, \
      FusedBatchNormInference<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT), \
      /*old_benchmark_api*/ false) \
      .Run(state); \
  state.SetItemsProcessed(state.iterations() * (N) * (H) * (W) * (C)); \
} \
BENCHMARK( \
    BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
    ->UseRealTime();
// BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE), `int iters`
// form. The generated function calls testing::UseRealTime(), reports
// iters * N * H * W * C through testing::ItemsProcessed(), then runs
// test::Benchmark(#DEVICE, FusedBatchNormInference<T>(...)).Run(iters), and is
// registered with BENCHMARK().
// NOTE(review): the `// NOLINTEND` and `// clang-format on` markers trailing
// the first two rows appear to belong to the other column of this listing
// rather than to the macro text - confirm against the real file.
#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \ // NOLINTEND
static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, \ // clang-format on
DEVICE)(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
test::Benchmark(#DEVICE, FusedBatchNormInference<T>( \
N, H, W, C, IS_TRAINING, FORMAT_##FORMAT)) \
.Run(iters); \
} \
BENCHMARK( \
BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE));
BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu); BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu); BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);
@ -320,17 +325,19 @@ BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
// FusedBatchNorm gradient // FusedBatchNorm gradient
// -------------------------------------------------------------------------- // // -------------------------------------------------------------------------- //
// BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)
// Defines and registers one FusedBatchNorm gradient benchmark. The graph comes
// from FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, FORMAT_<FORMAT>) and is
// run through test::Benchmark with the stringized DEVICE token as its first
// argument; count * N * H * W * C is reported as items processed.
// Rows in this listing may carry two renderings of the same macro line:
//   * `int iters` form: testing::UseRealTime() and testing::ItemsProcessed()
//     are called inside the function, before .Run(iters);
//   * `::testing::benchmark::State&` form: state.SetItemsProcessed() is called
//     after .Run(state) and real time is requested via ->UseRealTime() on the
//     BENCHMARK(...) registration.
#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \ #define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \ static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
DEVICE)(int iters) { \ DEVICE)(::testing::benchmark::State & state) { \
testing::UseRealTime(); \ test::Benchmark( \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \ #DEVICE, \
test::Benchmark(#DEVICE, FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, \ FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT), \
FORMAT_##FORMAT)) \ /*old_benchmark_api*/ false) \
.Run(iters); \ .Run(state); \
} \ state.SetItemsProcessed(state.iterations() * N * H * W * C); \
BENCHMARK(BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \ } \
DEVICE)); BENCHMARK( \
BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
->UseRealTime();
#define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE) \ #define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE) \
BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE); \ BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE); \

View File

@ -98,14 +98,16 @@ static Graph* BandedTriangularSolve(int64 num_bands, int64 n, int64 m,
// BS: boolean indicating whether to use the banded solver // BS: boolean indicating whether to use the banded solver
// T: C++ type of scalars (e.g. float, std::complex) // T: C++ type of scalars (e.g. float, std::complex)
// TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128) // TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
// BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D)
// Defines and registers a benchmark function named
// BM_BandedTriangularSolve_<K>_<N>_<M>_<BS>_<TT>. The graph comes from
// BandedTriangularSolve<T>(K, N, M, BS, TT) and is run through test::Benchmark
// with the stringized D token as its first argument.
// Rows in this listing may carry two renderings of the same macro line: the
// `int iters` form first, then the `::testing::benchmark::State&` form.
// NOTE(review): in both forms the items-processed expression parses as
// (count * K * N) + (N * M), so the N * M term is not scaled by the iteration
// count - confirm whether count * (K * N + N * M) was intended.
#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D) \ #define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D) \
static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT( \ static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT( \
int iters) { \ ::testing::benchmark::State& state) { \
testing::UseRealTime(); \ test::Benchmark(#D, BandedTriangularSolve<T>(K, N, M, BS, TT), \
testing::ItemsProcessed(static_cast<int64>(iters) * K * N + N * M); \ /*old_benchmark_api*/ false) \
test::Benchmark(#D, BandedTriangularSolve<T>(K, N, M, BS, TT)).Run(iters); \ .Run(state); \
} \ state.SetItemsProcessed(state.iterations() * K * N + N * M); \
BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT); } \
BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT) \
->UseRealTime();
#define BM_BandedTriangularSolve(K, N, M, BS, D) \ #define BM_BandedTriangularSolve(K, N, M, BS, D) \
BM_BandedTriangularSolveDev(K, N, M, BS, float, DT_FLOAT, D); \ BM_BandedTriangularSolveDev(K, N, M, BS, float, DT_FLOAT, D); \

View File

@ -101,18 +101,18 @@ static Graph* MatrixTriangularSolveWithBroadcast(int64 b0, int64 b1, int64 m,
// T: C++ type of scalars (e.g. float, std::complex) // T: C++ type of scalars (e.g. float, std::complex)
// TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128) // TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
// D: Device (e.g. cpu, gpu) // D: Device (e.g. cpu, gpu)
// BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D)
// Defines and registers a benchmark function named
// BM_MatrixTriangularSolve_<B1>_<B2>_<M>_<N>_<MB>_<TT>_<D>. The graph comes
// from MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT) and is run
// through test::Benchmark with the stringized D token as its first argument;
// count * std::max(B1, B2) * M * M * N * 2 is reported as items processed.
// Rows in this listing may carry two renderings of the same macro line: the
// `int iters` form first, then the `::testing::benchmark::State&` form.
// NOTE(review): the State& form calls state.SetItemsProcessed(
// state.iterations() * ...) before .Run(state), and its BENCHMARK(...) line
// has no ->UseRealTime() although the `int iters` form calls
// testing::UseRealTime() - confirm both are intentional.
#define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D) \ #define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D) \
static void \ static void \
BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D( \ BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D( \
int iters) { \ ::testing::benchmark::State& state) { \
testing::UseRealTime(); \ state.SetItemsProcessed(state.iterations() * std::max(B1, B2) * M * M * \
testing::ItemsProcessed(static_cast<int64>(iters) * std::max(B1, B2) * M * \ N * 2); \
M * N * 2); \ test::Benchmark( \
test::Benchmark( \ #D, MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT), \
#D, MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT)) \ /*old_benchmark_api*/ false) \
.Run(iters); \ .Run(state); \
} \ } \
BENCHMARK( \ BENCHMARK( \
BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D); BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -56,20 +56,25 @@ static Graph* ConstructSpaceToBatchGraph(
// The BM_Expand macro is needed for this to build with VC++. // The BM_Expand macro is needed for this to build with VC++.
#define BM_Expand(x) x #define BM_Expand(x) x
// Macro is already longer than 80 chars.
// NOLINTBEGIN
// BM_SpaceToBatchDev(OP, DEVICE, DTYPE, B, H, W, D, BS, P00, P01, P10, P11)
// Defines and registers a benchmark function whose name is pasted together
// from every macro argument (BM_<OP>_<DEVICE>_<DTYPE>_..._pad<P00>_..._<P11>).
// The graph comes from ConstructSpaceToBatchGraph(#OP,
// TensorShape({B, H, W, D}), BS, DTYPE, {{P00, P01}, {P10, P11}}) and is run
// through test::Benchmark with the stringized DEVICE token as its first
// argument; count * B * (H + P00 + P01) * (W + P10 + P11) * D is reported as
// items processed.
// Rows in this listing may carry two renderings of the same macro line: the
// `int iters` form (items reported before .Run(iters)) and the
// `::testing::benchmark::State&` form (items reported after .Run(state)).
#define BM_SpaceToBatchDev(OP, DEVICE, DTYPE, B, H, W, D, BS, P00, P01, P10, \ #define BM_SpaceToBatchDev(OP, DEVICE, DTYPE, B, H, W, D, BS, P00, P01, P10, \
P11) \ P11) \
static void \ static void \
BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11( \ BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11( \
int iters) { \ ::testing::benchmark::State& state) { \
testing::ItemsProcessed(static_cast<int64>(iters) * B * (H + P00 + P01) * \ test::Benchmark( \
#DEVICE, \
ConstructSpaceToBatchGraph(#OP, TensorShape({B, H, W, D}), BS, DTYPE, \
{{P00, P01}, {P10, P11}}), \
/*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(state.iterations() * B * (H + P00 + P01) * \
(W + P10 + P11) * D); \ (W + P10 + P11) * D); \
test::Benchmark(#DEVICE, ConstructSpaceToBatchGraph( \
#OP, TensorShape({B, H, W, D}), BS, DTYPE, \
{{P00, P01}, {P10, P11}})) \
.Run(iters); \
} \ } \
BENCHMARK( \ BENCHMARK( \
BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11); BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
// NOLINTEND
#define BM_SpaceToBatch(OP, ...) \ #define BM_SpaceToBatch(OP, ...) \
BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \ BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \
BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \ BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \

View File

@ -107,36 +107,30 @@ static Graph* ReplicatedSparseMatMul(int m, int n, int d, float sparsity_1,
// BM_SPARSE(M, K, N, S1, S2, TRA, TRB, TA, TB)
// Defines and registers a benchmark function named
// BM_Sparse_<M>_<K>_<N>_<S1>_<S2>_<TRA>_<TRB>_<TA>_<TB>. The function formats
// a label from TRA, TRB, S1 / 100.0 and S2 / 100.0, builds the graph with
// SparseMatMul<TA, TB>(M, N, K, S1 / 100.0, S2 / 100.0, TRA, TRB) and runs it
// on "cpu" through test::Benchmark.
// Rows in this listing may carry two renderings of the same macro line: the
// `int iters` form (which brackets the setup with testing::StopTiming() /
// testing::StartTiming()) and the `::testing::benchmark::State&` form (which
// requests real time via ->UseRealTime() on the registration).
// NOTE(review): the `int iters` form reports items via
// testing::ItemsProcessed(iters * M * K * N * 2); no SetItemsProcessed call is
// visible in the State& form - confirm whether the counter was dropped on
// purpose.
#define BM_SPARSE(M, K, N, S1, S2, TRA, TRB, TA, TB) \ #define BM_SPARSE(M, K, N, S1, S2, TRA, TRB, TA, TB) \
static void \ static void \
BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB( \ BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB( \
int iters) { \ ::testing::benchmark::State& state) { \
testing::StopTiming(); \
testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \
auto label = strings::Printf("tr_a: %d tr_b: %d sp_a: %0.2f sp_b: %0.2f", \ auto label = strings::Printf("tr_a: %d tr_b: %d sp_a: %0.2f sp_b: %0.2f", \
TRA, TRB, S1 / 100.0, S2 / 100.0); \ TRA, TRB, S1 / 100.0, S2 / 100.0); \
testing::SetLabel(label); \ state.SetLabel(label); \
testing::UseRealTime(); \
auto g = SparseMatMul<TA, TB>(M, N, K, S1 / 100.0, S2 / 100.0, TRA, TRB); \ auto g = SparseMatMul<TA, TB>(M, N, K, S1 / 100.0, S2 / 100.0, TRA, TRB); \
testing::StartTiming(); \ test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
test::Benchmark("cpu", g).Run(iters); \
} \ } \
BENCHMARK( \ BENCHMARK( \
BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB); BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB) \
->UseRealTime();
// BM_SPARSE_REPLICATED(M, K, N, S1, S2, Copies)
// Defines and registers a benchmark function named
// BM_Sparse_replicated_<M>_<K>_<N>_<S1>_<S2>_<Copies>. The function formats a
// label from Copies, S1 / 100.0 and S2 / 100.0, builds the graph with
// ReplicatedSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, (Copies)), runs it
// on "cpu" through test::Benchmark, and reports
// count * M * K * N * Copies * 2 as items processed.
// Rows in this listing may carry two renderings of the same macro line: the
// `int iters` form (which brackets the setup with testing::StopTiming() /
// testing::StartTiming() and reports items before the run) and the
// `::testing::benchmark::State&` form (which reports items after .Run(state)
// and requests real time via ->UseRealTime() on the registration).
#define BM_SPARSE_REPLICATED(M, K, N, S1, S2, Copies) \ #define BM_SPARSE_REPLICATED(M, K, N, S1, S2, Copies) \
static void BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies( \ static void BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies( \
int iters) { \ ::testing::benchmark::State& state) { \
testing::StopTiming(); \
testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * Copies * \
2); \
auto label = strings::Printf("copies: %d sp_a: %0.2f sp_b: %0.2f", \ auto label = strings::Printf("copies: %d sp_a: %0.2f sp_b: %0.2f", \
(Copies), S1 / 100.0, S2 / 100.0); \ (Copies), S1 / 100.0, S2 / 100.0); \
testing::SetLabel(label); \ state.SetLabel(label); \
testing::UseRealTime(); \
auto g = \ auto g = \
ReplicatedSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, (Copies)); \ ReplicatedSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, (Copies)); \
testing::StartTiming(); \ test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
test::Benchmark("cpu", g).Run(iters); \ state.SetItemsProcessed(state.iterations() * M * K * N * Copies * 2); \
} \ } \
BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies); BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies) \
->UseRealTime();
#define BM_SPARSE_FLOAT(M, K, N, S1, S2, TRA, TRB) \ #define BM_SPARSE_FLOAT(M, K, N, S1, S2, TRA, TRB) \
BM_SPARSE(M, K, N, S1, S2, TRA, TRB, float, float) BM_SPARSE(M, K, N, S1, S2, TRA, TRB, float, float)
@ -219,22 +213,21 @@ static Graph* MultiSparseMatMul(int m, int n, int d, float sparsity_1,
return g; return g;
} }
// BM_SPARSE_MULTI(M, K, N, S1, S2, Copies)
// Defines and registers a benchmark function named
// BM_Sparse_Multi_<M>_<K>_<N>_<S1>_<S2>_<Copies>. The function formats a label
// from M, K, N, Copies, S1 / 100.0 and S2 / 100.0, builds the graph with
// MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies), runs it on "cpu"
// through test::Benchmark, and reports count * M * K * N * 2 * 2 * Copies as
// items processed.
// The two renderings in this listing are offset by two rows: the `int iters`
// form starts on the first row, while the `::testing::benchmark::State&` form
// starts after the `// clang-format off` and `// NOLINTBEGIN` markers and ends
// before `// NOLINTEND` / `// clang-format on`.
#define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies) \ // clang-format off
static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies( \ // NOLINTBEGIN
int iters) { \ #define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies) \
testing::StopTiming(); \ static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies(::testing::benchmark::State& state) { \
testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 2 * \ auto label = strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies, \
Copies); \ S1 / 100.0, S2 / 100.0); \
auto label = strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies, \ state.SetLabel(label); \
S1 / 100.0, S2 / 100.0); \ auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies); \
testing::SetLabel(label); \ test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
testing::UseRealTime(); \ state.SetItemsProcessed(state.iterations() * M * K * N * 2 * 2 * Copies); \
auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies); \ } \
testing::StartTiming(); \ BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies) \
test::Benchmark("cpu", g).Run(iters); \ ->UseRealTime();
} \ // NOLINTEND
BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies); // clang-format on
BM_SPARSE_MULTI(1024, 2140, 4096, 0, 82, 1); BM_SPARSE_MULTI(1024, 2140, 4096, 0, 82, 1);
BM_SPARSE_MULTI(1024, 4096, 2048, 83, 83, 1); BM_SPARSE_MULTI(1024, 4096, 2048, 83, 83, 1);
BM_SPARSE_MULTI(400, 800, 2560, 85, 85, 1); BM_SPARSE_MULTI(400, 800, 2560, 85, 85, 1);

View File

@ -68,19 +68,22 @@ static Graph* SparseTensorDenseMatmul(int nnz, int m, int k, int n,
return g; return g;
} }
// NOLINTBEGIN
// BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, DEVICE)
// Defines and registers a benchmark function named
// BM_SparseTensorDenseMatmul_<NNZ>_<M>_<K>_<N>_<TA>_<TB>_<DEVICE>. The graph
// comes from SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB) and is run through
// test::Benchmark with the stringized DEVICE token as its first argument.
// items_per_iter is NNZ * (TB ? K : N), widened to int64 before multiplying;
// count * items_per_iter is reported as items processed and
// count * items_per_iter * sizeof(float) as bytes processed.
// Rows in this listing may carry two renderings of the same macro line: the
// `int iters` form (counters reported before .Run(iters)) and the
// `::testing::benchmark::State&` form (counters reported after .Run(state)).
#define BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, DEVICE) \ #define BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, DEVICE) \
static void \ static void \
BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE( \ BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE( \
int iters) { \ ::testing::benchmark::State& state) { \
int64 items_per_iter = (static_cast<int64>(NNZ) * (TB ? K : N)); \ int64 items_per_iter = (static_cast<int64>(NNZ) * (TB ? K : N)); \
testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter); \ test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB), \
testing::BytesProcessed(static_cast<int64>(iters) * items_per_iter * \ /*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(state.iterations() * items_per_iter); \
state.SetBytesProcessed(state.iterations() * items_per_iter * \
sizeof(float)); \ sizeof(float)); \
test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB)) \
.Run(iters); \
} \ } \
BENCHMARK( \ BENCHMARK( \
BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE); BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE);
// NOLINTEND
#define BM_SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB) \ #define BM_SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB) \
BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, cpu); \ BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, cpu); \