Updated benchmarks to newer API

PiperOrigin-RevId: 358013972
Change-Id: I99f0f538a39845408fbc29dcd60652c42eaf652e
This commit is contained in:
A. Unique TensorFlower 2021-02-17 12:58:26 -08:00 committed by TensorFlower Gardener
parent 7d45aa8560
commit 1178262a2a
11 changed files with 210 additions and 213 deletions

View File

@ -116,12 +116,15 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
#define BM_Conv2DBwdFilter(T, FMT, N, H, W, C, FH, FW, FC, SH, SW, PADDING, \
type) \
static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, \
FW, FC, SH, SW, PADDING)(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * \
(C)); \
test::Benchmark(#type, Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, \
SW, PADDING, FORMAT_##FMT)) \
.Run(iters); \
FW, FC, SH, SW, \
PADDING)(::testing::benchmark::State & state) { \
test::Benchmark(#type, \
Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, SW, \
PADDING, FORMAT_##FMT), \
/*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(static_cast<int64>(state.iterations()) * (N) * \
(H) * (W) * (C)); \
} \
BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FMT, N, H, W, C, FH, FW, \
FC, SH, SW, PADDING));

View File

@ -84,9 +84,9 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
.Input(backprop)
.Attr("T", DataTypeToEnum<T>::value)
.Attr("strides", {1, stride_h, stride_w, 1})
.Attr("padding", padding == Padding::SAME
? "SAME"
: padding == Padding::VALID ? "VALID" : "N/A")
.Attr("padding", padding == Padding::SAME ? "SAME"
: padding == Padding::VALID ? "VALID"
: "N/A")
.Attr("data_format", ToString(data_format))
.Finalize(graph, &conv2d));
@ -115,12 +115,14 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
#define BM_Conv2DBwdInput(T, FMT, N, H, W, C, FW, FH, FC, SH, SW, PADDING, \
type) \
static void BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, \
FW, FC, SH, SW, PADDING)(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * \
(C)); \
test::Benchmark(#type, Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH, \
SW, PADDING, FORMAT_##FMT)) \
.Run(iters); \
FW, FC, SH, SW, \
PADDING)(::testing::benchmark::State & state) { \
test::Benchmark(#type, \
Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH, SW, \
PADDING, FORMAT_##FMT), \
/*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(state.iterations() * (N) * (H) * (W) * (C)); \
} \
BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FMT, N, H, W, C, FH, FW, \
FC, SH, SW, PADDING));

View File

@ -91,10 +91,8 @@ TEST(SnapshotUtilTest, CombinationRoundTripTest) {
SnapshotRoundTrip(io::compression::kSnappy, 2);
}
void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
int version) {
tensorflow::testing::StopTiming();
void SnapshotReaderBenchmarkLoop(::testing::benchmark::State& state,
std::string compression_type, int version) {
tensorflow::DataTypeVector dtypes;
std::vector<Tensor> tensors;
GenerateTensorVector(dtypes, tensors);
@ -106,7 +104,7 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
compression_type, version, dtypes, &writer));
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
writer->WriteTensors(tensors).IgnoreError();
}
TF_ASSERT_OK(writer->Close());
@ -115,34 +113,32 @@ void SnapshotReaderBenchmarkLoop(int iters, std::string compression_type,
TF_ASSERT_OK(Reader::Create(Env::Default(), filename, compression_type,
version, dtypes, &reader));
tensorflow::testing::StartTiming();
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
std::vector<Tensor> read_tensors;
reader->ReadTensors(&read_tensors).IgnoreError();
}
tensorflow::testing::StopTiming();
TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
}
void SnapshotCustomReaderNoneBenchmark(int iters) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 1);
void SnapshotCustomReaderNoneBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 1);
}
void SnapshotCustomReaderGzipBenchmark(int iters) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 1);
void SnapshotCustomReaderGzipBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 1);
}
void SnapshotCustomReaderSnappyBenchmark(int iters) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kSnappy, 1);
void SnapshotCustomReaderSnappyBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(state, io::compression::kSnappy, 1);
}
void SnapshotTFRecordReaderNoneBenchmark(int iters) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kNone, 2);
void SnapshotTFRecordReaderNoneBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(state, io::compression::kNone, 2);
}
void SnapshotTFRecordReaderGzipBenchmark(int iters) {
SnapshotReaderBenchmarkLoop(iters, io::compression::kGzip, 2);
void SnapshotTFRecordReaderGzipBenchmark(::testing::benchmark::State& state) {
SnapshotReaderBenchmarkLoop(state, io::compression::kGzip, 2);
}
BENCHMARK(SnapshotCustomReaderNoneBenchmark);
@ -151,10 +147,8 @@ BENCHMARK(SnapshotCustomReaderSnappyBenchmark);
BENCHMARK(SnapshotTFRecordReaderNoneBenchmark);
BENCHMARK(SnapshotTFRecordReaderGzipBenchmark);
void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type,
int version) {
tensorflow::testing::StopTiming();
void SnapshotWriterBenchmarkLoop(::testing::benchmark::State& state,
std::string compression_type, int version) {
tensorflow::DataTypeVector dtypes;
std::vector<Tensor> tensors;
GenerateTensorVector(dtypes, tensors);
@ -166,38 +160,36 @@ void SnapshotWriterBenchmarkLoop(int iters, std::string compression_type,
TF_ASSERT_OK(Writer::Create(tensorflow::Env::Default(), filename,
compression_type, version, dtypes, &writer));
tensorflow::testing::StartTiming();
for (int i = 0; i < iters; ++i) {
for (auto s : state) {
writer->WriteTensors(tensors).IgnoreError();
}
writer->Close().IgnoreError();
tensorflow::testing::StopTiming();
TF_ASSERT_OK(Env::Default()->DeleteFile(filename));
}
void SnapshotCustomWriterNoneBenchmark(int iters) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 1);
void SnapshotCustomWriterNoneBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 1);
}
void SnapshotCustomWriterGzipBenchmark(int iters) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 1);
void SnapshotCustomWriterGzipBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 1);
}
void SnapshotCustomWriterSnappyBenchmark(int iters) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 1);
void SnapshotCustomWriterSnappyBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 1);
}
void SnapshotTFRecordWriterNoneBenchmark(int iters) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kNone, 2);
void SnapshotTFRecordWriterNoneBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(state, io::compression::kNone, 2);
}
void SnapshotTFRecordWriterGzipBenchmark(int iters) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kGzip, 2);
void SnapshotTFRecordWriterGzipBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(state, io::compression::kGzip, 2);
}
void SnapshotTFRecordWriterSnappyBenchmark(int iters) {
SnapshotWriterBenchmarkLoop(iters, io::compression::kSnappy, 2);
void SnapshotTFRecordWriterSnappyBenchmark(::testing::benchmark::State& state) {
SnapshotWriterBenchmarkLoop(state, io::compression::kSnappy, 2);
}
BENCHMARK(SnapshotCustomWriterNoneBenchmark);

View File

@ -35,8 +35,9 @@ class SpatialConvolutionBenchmarksSuite {
using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
SpatialConvolutionBenchmarksSuite(int iters, Device& device)
: iters_(iters), device_(device) {}
SpatialConvolutionBenchmarksSuite(::testing::benchmark::State& state,
Device& device)
: state_(state), device_(device) {}
Eigen::Index BufferSize(const Dimensions& dims) {
return dims.TotalSize() * sizeof(Scalar);
@ -62,12 +63,10 @@ class SpatialConvolutionBenchmarksSuite {
Filter filter(filter_data, filter_dims);
Output output(output_data, output_dims);
::tensorflow::testing::StartTiming();
for (int i = 0; i < iters_; ++i) {
for (auto s : state_) {
output.device(device_) = Eigen::SpatialConvolution(input, filter);
tensorflow::testing::DoNotOptimize(output);
}
::tensorflow::testing::StopTiming();
device_.deallocate(input_data);
device_.deallocate(filter_data);
@ -102,13 +101,11 @@ class SpatialConvolutionBenchmarksSuite {
OutputBackward output_backward(output_backward_data, output_dims);
InputBackward input_backward(input_backward_data, input_dims);
::tensorflow::testing::StartTiming();
for (int i = 0; i < iters_; ++i) {
for (auto s : state_) {
input_backward.device(device_) = Eigen::SpatialConvolutionBackwardInput(
filter, output_backward, input_rows, input_cols);
tensorflow::testing::DoNotOptimize(input_backward);
}
::tensorflow::testing::StopTiming();
device_.deallocate(filter_data);
device_.deallocate(output_backward_data);
@ -143,13 +140,11 @@ class SpatialConvolutionBenchmarksSuite {
OutputBackward output_backward(output_backward_data, input_dims);
FilterBackward filter_backward(filter_backward_data, filter_dims);
::tensorflow::testing::StartTiming();
for (int i = 0; i < iters_; ++i) {
for (auto s : state_) {
filter_backward.device(device_) = Eigen::SpatialConvolutionBackwardKernel(
input, output_backward, filter_rows, filter_cols);
tensorflow::testing::DoNotOptimize(filter_backward);
}
::tensorflow::testing::StopTiming();
device_.deallocate(input_data);
device_.deallocate(output_backward_data);
@ -157,7 +152,8 @@ class SpatialConvolutionBenchmarksSuite {
}
private:
int iters_;
::testing::benchmark::State& state_;
Device& device_;
};
@ -170,8 +166,9 @@ class CuboidConvolutionBenchmarksSuite {
using Dimensions = Eigen::DSizes<Eigen::Index, 5>;
CuboidConvolutionBenchmarksSuite(int iters, Device& device)
: iters_(iters), device_(device) {}
CuboidConvolutionBenchmarksSuite(::testing::benchmark::State& state,
Device& device)
: state_(state), device_(device) {}
Eigen::Index BufferSize(const Dimensions& dims) {
return dims.TotalSize() * sizeof(Scalar);
@ -198,12 +195,10 @@ class CuboidConvolutionBenchmarksSuite {
Filter filter(filter_data, filter_dims);
Output output(output_data, output_dims);
::tensorflow::testing::StartTiming();
for (int i = 0; i < iters_; ++i) {
for (auto s : state_) {
output.device(device_) = Eigen::CuboidConvolution(input, filter);
tensorflow::testing::DoNotOptimize(output);
}
::tensorflow::testing::StopTiming();
device_.deallocate(input_data);
device_.deallocate(filter_data);
@ -240,13 +235,11 @@ class CuboidConvolutionBenchmarksSuite {
OutputBackward output_backward(output_backward_data, output_dims);
InputBackward input_backward(input_backward_data, input_dims);
::tensorflow::testing::StartTiming();
for (int i = 0; i < iters_; ++i) {
for (auto s : state_) {
input_backward.device(device_) = Eigen::CuboidConvolutionBackwardInput(
filter, output_backward, input_planes, input_rows, input_cols);
tensorflow::testing::DoNotOptimize(input_backward);
}
::tensorflow::testing::StopTiming();
device_.deallocate(filter_data);
device_.deallocate(output_backward_data);
@ -283,13 +276,11 @@ class CuboidConvolutionBenchmarksSuite {
OutputBackward output_backward(output_backward_data, output_dims);
FilterBackward filter_backward(filter_backward_data, filter_dims);
::tensorflow::testing::StartTiming();
for (int i = 0; i < iters_; ++i) {
for (auto s : state_) {
filter_backward.device(device_) = Eigen::CuboidConvolutionBackwardKernel(
input, output_backward, filter_planes, filter_rows, filter_cols);
tensorflow::testing::DoNotOptimize(filter_backward);
}
::tensorflow::testing::StopTiming();
device_.deallocate(input_data);
device_.deallocate(output_backward_data);
@ -297,7 +288,7 @@ class CuboidConvolutionBenchmarksSuite {
}
private:
int iters_;
::testing::benchmark::State& state_;
Device& device_;
};

View File

@ -27,19 +27,17 @@ limitations under the License.
// Spatial Convolutions //
// -------------------------------------------------------------------------- //
void SpatialConvolution(int iters, int num_threads,
void SpatialConvolution(::testing::benchmark::State& state, int num_threads,
/* Input dimensions: */
int input_batches, int input_height, int input_width,
int input_depth,
/* Filter (kernel) dimensions: */
int filter_count, int filter_height, int filter_width) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads);
using Benchmark =
SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device);
auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims(input_batches, input_height,
input_width, input_depth);
@ -52,23 +50,22 @@ void SpatialConvolution(int iters, int num_threads,
(input_dims.TotalSize() / input_depth) * filter_count;
auto flops =
num_computed_elements * (input_depth * filter_height * filter_width);
::tensorflow::testing::ItemsProcessed(flops * iters);
state.SetItemsProcessed(flops * state.iterations());
}
void SpatialConvolutionBackwardInput(int iters, int num_threads,
void SpatialConvolutionBackwardInput(::testing::benchmark::State& state,
int num_threads,
/* Input dimensions: */
int input_batches, int input_height,
int input_width, int input_depth,
/* Filter (kernel) dimensions: */
int filter_count, int filter_height,
int filter_width) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads);
using Benchmark =
SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device);
auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims(input_batches, input_height,
input_width, input_depth);
@ -80,23 +77,22 @@ void SpatialConvolutionBackwardInput(int iters, int num_threads,
auto num_computed_elements = input_dims.TotalSize();
auto flops =
num_computed_elements * (input_depth * filter_height * filter_width);
::tensorflow::testing::ItemsProcessed(flops * iters);
state.SetItemsProcessed(flops * state.iterations());
}
void SpatialConvolutionBackwardKernel(int iters, int num_threads,
void SpatialConvolutionBackwardKernel(::testing::benchmark::State& state,
int num_threads,
/* Input dimensions: */
int input_batches, int input_height,
int input_width, int input_depth,
/* Filter (kernel) dimensions: */
int filter_count, int filter_height,
int filter_width) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads);
using Benchmark =
SpatialConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device);
auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims(input_batches, input_height,
input_width, input_depth);
@ -108,7 +104,7 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
auto num_computed_elements = filter_dims.TotalSize();
auto flops =
num_computed_elements * (input_batches * input_height * input_width);
::tensorflow::testing::ItemsProcessed(flops * iters);
state.SetItemsProcessed(flops * state.iterations());
}
// Macro arguments names: --------------------------------------------------- //
@ -126,26 +122,26 @@ void SpatialConvolutionBackwardKernel(int iters, int num_threads,
#define BM_SpatialConvolution(NT, N, H, W, C, FC, FH, FW, LABEL) \
static void BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, \
FW)(int iters) { \
::tensorflow::testing::SetLabel(LABEL); \
SpatialConvolution(iters, NT, N, H, W, C, FC, FH, FW); \
FW)(::testing::benchmark::State & state) { \
state.SetLabel(LABEL); \
SpatialConvolution(state, NT, N, H, W, C, FC, FH, FW); \
} \
BENCHMARK(BM_SPATIAL_NAME(SpatialConvolution, NT, N, H, W, C, FC, FH, FW))
#define BM_SpatialConvolutionBwdInput(NT, N, H, W, C, FC, FH, FW, LABEL) \
static void BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, \
FH, FW)(int iters) { \
::tensorflow::testing::SetLabel(LABEL); \
SpatialConvolutionBackwardInput(iters, NT, N, H, W, C, FC, FH, FW); \
FH, FW)(::testing::benchmark::State & state) { \
state.SetLabel(LABEL); \
SpatialConvolutionBackwardInput(state, NT, N, H, W, C, FC, FH, FW); \
} \
BENCHMARK( \
BM_SPATIAL_NAME(SpatialConvolutionBwdInput, NT, N, H, W, C, FC, FH, FW))
#define BM_SpatialConvolutionBwdKernel(NT, N, H, W, C, FC, FH, FW, LABEL) \
static void BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \
FH, FW)(int iters) { \
::tensorflow::testing::SetLabel(LABEL); \
SpatialConvolutionBackwardKernel(iters, NT, N, H, W, C, FC, FH, FW); \
FH, FW)(::testing::benchmark::State & state) { \
state.SetLabel(LABEL); \
SpatialConvolutionBackwardKernel(state, NT, N, H, W, C, FC, FH, FW); \
} \
BENCHMARK(BM_SPATIAL_NAME(SpatialConvolutionBwdKernel, NT, N, H, W, C, FC, \
FH, FW))
@ -248,20 +244,18 @@ BM_SpatialConvolutionsBwdKernel(32, 7, 7, 192, 384, 3, 3, "conv5_00_3x3");
// Cuboid Convolutions //
// -------------------------------------------------------------------------- //
void CuboidConvolution(int iters, int num_threads,
void CuboidConvolution(::testing::benchmark::State& state, int num_threads,
/* Input dimensions: */
int input_batches, int input_height, int input_width,
int input_planes, int input_depth,
/* Filter (kernel) dimensions: */
int filter_count, int filter_height, int filter_width,
int filter_planes) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads);
using Benchmark =
CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device);
auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims(
input_batches, input_height, input_width, input_planes, input_depth);
@ -274,10 +268,11 @@ void CuboidConvolution(int iters, int num_threads,
(input_dims.TotalSize() / input_depth) * filter_count;
auto flops = num_computed_elements *
(input_depth * filter_height * filter_width * filter_planes);
::tensorflow::testing::ItemsProcessed(flops * iters);
state.SetItemsProcessed(flops * state.iterations());
}
void CuboidConvolutionBackwardInput(int iters, int num_threads,
void CuboidConvolutionBackwardInput(::testing::benchmark::State& state,
int num_threads,
/* Input dimensions: */
int input_batches, int input_height,
int input_width, int input_planes,
@ -285,13 +280,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
/* Filter (kernel) dimensions: */
int filter_count, int filter_height,
int filter_width, int filter_planes) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads);
using Benchmark =
CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device);
auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims(
input_batches, input_height, input_width, input_planes, input_depth);
@ -303,10 +296,11 @@ void CuboidConvolutionBackwardInput(int iters, int num_threads,
auto num_computed_elements = input_dims.TotalSize();
auto flops = num_computed_elements *
(input_depth * filter_height * filter_width * filter_planes);
::tensorflow::testing::ItemsProcessed(flops * iters);
state.SetItemsProcessed(flops * state.iterations());
}
void CuboidConvolutionBackwardKernel(int iters, int num_threads,
void CuboidConvolutionBackwardKernel(::testing::benchmark::State& state,
int num_threads,
/* Input dimensions: */
int input_batches, int input_height,
int input_width, int input_planes,
@ -314,13 +308,11 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
/* Filter (kernel) dimensions: */
int filter_count, int filter_height,
int filter_width, int filter_planes) {
::tensorflow::testing::StopTiming();
CREATE_THREAD_POOL(num_threads);
using Benchmark =
CuboidConvolutionBenchmarksSuite<float, Eigen::ThreadPoolDevice>;
auto benchmark = Benchmark(iters, device);
auto benchmark = Benchmark(state, device);
typename Benchmark::Dimensions input_dims(
input_batches, input_height, input_width, input_planes, input_depth);
@ -332,9 +324,16 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
auto num_computed_elements = filter_dims.TotalSize();
auto flops = num_computed_elements *
(input_batches * input_height * input_width * input_planes);
::tensorflow::testing::ItemsProcessed(flops * iters);
state.SetItemsProcessed(flops * state.iterations());
}
// The multiple #'s in the function names plus the
// `::testing::benchmark::State&` parameter apparently confuse clang if they
// are not on the same line, so we need to turn off LINT and clang-format for
// this block.
//
// clang-format off
// NOLINTBEGIN
// Macro arguments names: --------------------------------------------------- //
// NT: num threads
// N: batch size
@ -354,33 +353,33 @@ void CuboidConvolutionBackwardKernel(int iters, int num_threads,
_f_##FC##_##FH##_##FW##_##FP)
#define BM_CuboidConvolution(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \
static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, \
FP)(int iters) { \
::tensorflow::testing::SetLabel(LABEL); \
CuboidConvolution(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
static void BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
state.SetLabel(LABEL); \
CuboidConvolution(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
} \
BENCHMARK( \
BM_CUBOID_NAME(CuboidConvolution, NT, N, H, W, P, C, FC, FH, FW, FP))
#define BM_CuboidConvolutionBwdInput(NT, N, H, W, P, C, FC, FH, FW, FP, LABEL) \
static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, \
FH, FW, FP)(int iters) { \
::tensorflow::testing::SetLabel(LABEL); \
CuboidConvolutionBackwardInput(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
static void BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
state.SetLabel(LABEL); \
CuboidConvolutionBackwardInput(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
} \
BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdInput, NT, N, H, W, P, C, FC, \
FH, FW, FP))
#define BM_CuboidConvolutionBwdKernel(NT, N, H, W, P, C, FC, FH, FW, FP, \
LABEL) \
static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, \
FC, FH, FW, FP)(int iters) { \
::tensorflow::testing::SetLabel(LABEL); \
CuboidConvolutionBackwardKernel(iters, NT, N, H, W, P, C, FC, FH, FW, FP); \
static void BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, FH, FW, FP)(::testing::benchmark::State & state) { \
state.SetLabel(LABEL); \
CuboidConvolutionBackwardKernel(state, NT, N, H, W, P, C, FC, FH, FW, FP); \
} \
BENCHMARK(BM_CUBOID_NAME(CuboidConvolutionBwdKernel, NT, N, H, W, P, C, FC, \
FH, FW, FP))
// NOLINTEND
// clang-format on
#define BM_CuboidConvolutions(N, H, W, P, C, FC, FH, FW, FP, LABEL) \
BM_CuboidConvolution(2, N, H, W, P, C, FC, FH, FW, FP, LABEL); \
BM_CuboidConvolution(4, N, H, W, P, C, FC, FH, FW, FP, LABEL); \

View File

@ -283,18 +283,23 @@ static Graph* FusedBatchNormGrad(int n, int h, int w, int c, bool is_training,
// -------------------------------------------------------------------------- //
// FusedBatchNorm inference
// -------------------------------------------------------------------------- //
// clang-format off
// NOLINTBEGIN
// Defines and registers one FusedBatchNorm inference benchmark.
//
// Macro arguments:
//   N, H, W, C:  forwarded to FusedBatchNormInference<T>; N * H * W * C is the
//                number of items reported per iteration.
//   T:           template argument of FusedBatchNormInference.
//   IS_TRAINING: forwarded to FusedBatchNormInference.
//   FORMAT:      pasted onto `FORMAT_` to name the data-format constant.
//   DEVICE:      stringized and passed to test::Benchmark as the device name.
//
// The generated function (named by BM_NAME from all of the arguments) runs the
// graph through test::Benchmark with old_benchmark_api = false, then reports
// the processed item count. The benchmark is registered with UseRealTime().
// NOTE(review): the long signature line is presumably intentional given the
// surrounding clang-format off / NOLINT guards -- confirm before reflowing.
#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)(::testing::benchmark::State & state) { \
test::Benchmark( \
#DEVICE, \
FusedBatchNormInference<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT), \
/*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(state.iterations() * N * H * W * C); \
} \
BENCHMARK( \
BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
->UseRealTime();
#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, \
DEVICE)(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
test::Benchmark(#DEVICE, FusedBatchNormInference<T>( \
N, H, W, C, IS_TRAINING, FORMAT_##FORMAT)) \
.Run(iters); \
} \
BENCHMARK( \
BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE));
// NOLINTEND
// clang-format on
BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);
@ -320,17 +325,19 @@ BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
// FusedBatchNorm gradient
// -------------------------------------------------------------------------- //
#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
DEVICE)(int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C); \
test::Benchmark(#DEVICE, FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, \
FORMAT_##FORMAT)) \
.Run(iters); \
} \
BENCHMARK(BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
DEVICE));
// Defines and registers one FusedBatchNormGrad benchmark.
//
// N, H, W, C, IS_TRAINING are forwarded to FusedBatchNormGrad<T>; FORMAT is
// pasted onto `FORMAT_` to pick the data-format constant; DEVICE is stringized
// into the device name given to test::Benchmark. Each iteration is reported as
// N * H * W * C items, and the benchmark is registered with UseRealTime().
#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
  static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
                      DEVICE)(::testing::benchmark::State & state) { \
    /* Build the graph, then hand it to the State-based benchmark runner. */ \
    auto* grad_graph = \
        FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT); \
    test::Benchmark(#DEVICE, grad_graph, /*old_benchmark_api*/ false) \
        .Run(state); \
    state.SetItemsProcessed(state.iterations() * N * H * W * C); \
  } \
  BENCHMARK( \
      BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
      ->UseRealTime();
#define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE) \
BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE); \

View File

@ -98,14 +98,16 @@ static Graph* BandedTriangularSolve(int64 num_bands, int64 n, int64 m,
// BS: boolean indicating whether to use the banded solver
// T: C++ type of scalars (e.g. float, std::complex)
// TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D) \
static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT( \
int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * K * N + N * M); \
test::Benchmark(#D, BandedTriangularSolve<T>(K, N, M, BS, TT)).Run(iters); \
} \
BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT);
// Defines and registers one BandedTriangularSolve benchmark.
//
// K, N, M, BS and TT are forwarded to BandedTriangularSolve<T>; D is stringized
// into the device name given to test::Benchmark. The benchmark is registered
// with UseRealTime().
#define BM_BandedTriangularSolveDev(K, N, M, BS, T, TT, D) \
  static void BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT( \
      ::testing::benchmark::State& state) { \
    test::Benchmark(#D, BandedTriangularSolve<T>(K, N, M, BS, TT), \
                    /*old_benchmark_api*/ false) \
        .Run(state); \
    /* Each iteration accounts for K*N + N*M items. The whole sum must be */ \
    /* scaled by the iteration count; without the parentheses only K*N was */ \
    /* multiplied and N*M was added once. */ \
    state.SetItemsProcessed(state.iterations() * \
                            (static_cast<int64>(K) * (N) + \
                             static_cast<int64>(N) * (M))); \
  } \
  BENCHMARK(BM_BandedTriangularSolve##_##K##_##N##_##M##_##BS##_##TT) \
      ->UseRealTime();
#define BM_BandedTriangularSolve(K, N, M, BS, D) \
BM_BandedTriangularSolveDev(K, N, M, BS, float, DT_FLOAT, D); \

View File

@ -101,18 +101,18 @@ static Graph* MatrixTriangularSolveWithBroadcast(int64 b0, int64 b1, int64 m,
// T: C++ type of scalars (e.g. float, std::complex)
// TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
// D: Device (e.g. cpu, gpu)
#define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D) \
static void \
BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D( \
int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * std::max(B1, B2) * M * \
M * N * 2); \
test::Benchmark( \
#D, MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT)) \
.Run(iters); \
} \
BENCHMARK( \
// Defines and registers one MatrixTriangularSolve (with broadcast) benchmark.
//
// B1, B2, M, N, MB and TT are forwarded to
// MatrixTriangularSolveWithBroadcast<T>; D is stringized into the device name
// given to test::Benchmark. Each iteration is reported as
// max(B1, B2) * M * M * N * 2 items.
#define BM_MatrixTriangularSolveDev(B1, B2, M, N, MB, T, TT, D) \
  static void \
      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D( \
          ::testing::benchmark::State& state) { \
    test::Benchmark( \
        #D, MatrixTriangularSolveWithBroadcast<T>(B1, B2, M, N, MB, TT), \
        /*old_benchmark_api*/ false) \
        .Run(state); \
    /* Report throughput only after Run(): the iteration count is not yet */ \
    /* known before the benchmark has executed. */ \
    state.SetItemsProcessed(state.iterations() * std::max(B1, B2) * M * M * \
                            N * 2); \
  } \
  /* The pre-migration version called testing::UseRealTime(); keep */ \
  /* measuring wall-clock time under the State-based API. */ \
  BENCHMARK( \
      BM_MatrixTriangularSolve##_##B1##_##B2##_##M##_##N##_##MB##_##TT##_##D) \
      ->UseRealTime();
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -56,20 +56,25 @@ static Graph* ConstructSpaceToBatchGraph(
// The BM_Expand macro is needed for this to build with VC++.
#define BM_Expand(x) x
// Macro is already longer than 80 chars.
// NOLINTBEGIN
#define BM_SpaceToBatchDev(OP, DEVICE, DTYPE, B, H, W, D, BS, P00, P01, P10, \
P11) \
static void \
BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11( \
int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * B * (H + P00 + P01) * \
::testing::benchmark::State& state) { \
test::Benchmark( \
#DEVICE, \
ConstructSpaceToBatchGraph(#OP, TensorShape({B, H, W, D}), BS, DTYPE, \
{{P00, P01}, {P10, P11}}), \
/*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(state.iterations() * B * (H + P00 + P01) * \
(W + P10 + P11) * D); \
test::Benchmark(#DEVICE, ConstructSpaceToBatchGraph( \
#OP, TensorShape({B, H, W, D}), BS, DTYPE, \
{{P00, P01}, {P10, P11}})) \
.Run(iters); \
} \
BENCHMARK( \
BM_##OP##_##DEVICE##_##DTYPE##_##B##_##H##_##W##_##D##_bs##BS##_pad##P00##_##P01##_##P10##_##P11);
// NOLINTEND
#define BM_SpaceToBatch(OP, ...) \
BM_Expand(BM_SpaceToBatchDev(OP, cpu, DT_FLOAT, __VA_ARGS__)); \
BM_Expand(BM_SpaceToBatchDev(OP, gpu, DT_FLOAT, __VA_ARGS__)); \

View File

@ -107,36 +107,30 @@ static Graph* ReplicatedSparseMatMul(int m, int n, int d, float sparsity_1,
// Defines and registers one SparseMatMul benchmark on "cpu".
//
// M, K, N:  matrix dimensions; note they are passed to SparseMatMul in the
//           order (M, N, K).
// S1, S2:   divided by 100.0 before being passed on as the two sparsities and
//           printed in the label.
// TRA, TRB: forwarded to SparseMatMul and printed in the label as tr_a / tr_b.
// TA, TB:   template arguments of SparseMatMul.
//
// Each iteration is reported as M * K * N * 2 items, as it was before the
// migration to the State-based API. The benchmark uses real time.
#define BM_SPARSE(M, K, N, S1, S2, TRA, TRB, TA, TB) \
  static void \
      BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB( \
          ::testing::benchmark::State& state) { \
    auto label = strings::Printf("tr_a: %d tr_b: %d sp_a: %0.2f sp_b: %0.2f", \
                                 TRA, TRB, S1 / 100.0, S2 / 100.0); \
    state.SetLabel(label); \
    auto g = SparseMatMul<TA, TB>(M, N, K, S1 / 100.0, S2 / 100.0, TRA, TRB); \
    test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
    /* Restores the item counter that the old testing::ItemsProcessed call */ \
    /* provided; it is set after Run() so the iteration count is final. */ \
    state.SetItemsProcessed(state.iterations() * M * K * N * 2); \
  } \
  BENCHMARK( \
      BM_Sparse##_##M##_##K##_##N##_##S1##_##S2##_##TRA##_##TRB##_##TA##_##TB) \
      ->UseRealTime();
#define BM_SPARSE_REPLICATED(M, K, N, S1, S2, Copies) \
static void BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies( \
int iters) { \
testing::StopTiming(); \
testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * Copies * \
2); \
::testing::benchmark::State& state) { \
auto label = strings::Printf("copies: %d sp_a: %0.2f sp_b: %0.2f", \
(Copies), S1 / 100.0, S2 / 100.0); \
testing::SetLabel(label); \
testing::UseRealTime(); \
state.SetLabel(label); \
auto g = \
ReplicatedSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, (Copies)); \
testing::StartTiming(); \
test::Benchmark("cpu", g).Run(iters); \
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
state.SetItemsProcessed(state.iterations() * M * K * N * Copies * 2); \
} \
BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies);
BENCHMARK(BM_Sparse_replicated##_##M##_##K##_##N##_##S1##_##S2##_##Copies) \
->UseRealTime();
#define BM_SPARSE_FLOAT(M, K, N, S1, S2, TRA, TRB) \
BM_SPARSE(M, K, N, S1, S2, TRA, TRB, float, float)
@ -219,22 +213,21 @@ static Graph* MultiSparseMatMul(int m, int n, int d, float sparsity_1,
return g;
}
#define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies) \
static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies( \
int iters) { \
testing::StopTiming(); \
testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2 * 2 * \
Copies); \
auto label = strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies, \
S1 / 100.0, S2 / 100.0); \
testing::SetLabel(label); \
testing::UseRealTime(); \
auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies); \
testing::StartTiming(); \
test::Benchmark("cpu", g).Run(iters); \
} \
BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies);
// clang-format off
// NOLINTBEGIN
// Defines and registers one MultiSparseMatMul benchmark on "cpu".
//
// M, K, N: matrix dimensions; note they are passed to MultiSparseMatMul in the
//          order (M, N, K).
// S1, S2:  divided by 100.0 before being passed on as the two sparsities and
//          printed in the label.
// Copies:  forwarded to MultiSparseMatMul and used as a factor in the item
//          count.
//
// The label is "M_K_N_Copies_S1_S2" (sparsities printed with two decimals).
// Each iteration is reported as M * K * N * 2 * 2 * Copies items, and the
// benchmark is registered with UseRealTime().
#define BM_SPARSE_MULTI(M, K, N, S1, S2, Copies) \
static void BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies(::testing::benchmark::State& state) { \
auto label = strings::Printf("%d_%d_%d_%d_%0.2f_%0.2f", M, K, N, Copies, \
S1 / 100.0, S2 / 100.0); \
state.SetLabel(label); \
auto g = MultiSparseMatMul(M, N, K, S1 / 100.0, S2 / 100.0, Copies); \
test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state); \
state.SetItemsProcessed(state.iterations() * M * K * N * 2 * 2 * Copies); \
} \
BENCHMARK(BM_Sparse_Multi##_##M##_##K##_##N##_##S1##_##S2##_##Copies) \
->UseRealTime();
// NOLINTEND
// clang-format on
BM_SPARSE_MULTI(1024, 2140, 4096, 0, 82, 1);
BM_SPARSE_MULTI(1024, 4096, 2048, 83, 83, 1);
BM_SPARSE_MULTI(400, 800, 2560, 85, 85, 1);

View File

@ -68,19 +68,22 @@ static Graph* SparseTensorDenseMatmul(int nnz, int m, int k, int n,
return g;
}
// NOLINTBEGIN
#define BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, DEVICE) \
static void \
BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE( \
int iters) { \
::testing::benchmark::State& state) { \
int64 items_per_iter = (static_cast<int64>(NNZ) * (TB ? K : N)); \
testing::ItemsProcessed(static_cast<int64>(iters) * items_per_iter); \
testing::BytesProcessed(static_cast<int64>(iters) * items_per_iter * \
test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB), \
/*old_benchmark_api*/ false) \
.Run(state); \
state.SetItemsProcessed(state.iterations() * items_per_iter); \
state.SetBytesProcessed(state.iterations() * items_per_iter * \
sizeof(float)); \
test::Benchmark(#DEVICE, SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB)) \
.Run(iters); \
} \
BENCHMARK( \
BM_SparseTensorDenseMatmul##_##NNZ##_##M##_##K##_##N##_##TA##_##TB##_##DEVICE);
// NOLINTEND
#define BM_SparseTensorDenseMatmul(NNZ, M, K, N, TA, TB) \
BM_SparseTensorDenseMatmulDev(NNZ, M, K, N, TA, TB, cpu); \