From 5d55514c43db082ccef44354470473258692c259 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 27 Nov 2019 12:42:25 -0800
Subject: [PATCH] Add CPU benchmarks for Conv2D gradients

PiperOrigin-RevId: 282817699
Change-Id: Ib658159605dec1ed653b53db9422742971868101
---
 .../conv_grad_filter_ops_benchmark_test.cc | 110 ++++++++++--------
 .../conv_grad_input_ops_benchmark_test.cc  |  98 +++++++++-------
 2 files changed, 112 insertions(+), 96 deletions(-)

diff --git a/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc b/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc
index bb6eb846408..9b168045047 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops_benchmark_test.cc
@@ -38,20 +38,21 @@ static Tensor MakeRandomTensor(const TensorShape& shape) {
 
 template <typename T>
 static Graph* Conv2DBackpropFilter(int batch, int height, int width,
-                                   int in_depth, int filter_w, int filter_h,
-                                   int out_depth, TensorFormat data_format) {
+                                   int in_depth, int filter_h, int filter_w,
+                                   int out_depth, int stride_h, int stride_w,
+                                   TensorFormat data_format) {
   auto* graph = new Graph(OpRegistry::Global());
 
   Tensor input_t = data_format == FORMAT_NHWC
                        ? MakeRandomTensor<T>({batch, height, width, in_depth})
                        : MakeRandomTensor<T>({batch, in_depth, height, width});
   Tensor filter_t =
-      MakeRandomTensor<T>({filter_w, filter_h, in_depth, out_depth});
+      MakeRandomTensor<T>({filter_h, filter_w, in_depth, out_depth});
 
   // Compute dimensions for the `out_backprop` tensor.
   Conv2DParameters params;
   params.dilations = {1, 1, 1, 1};
-  params.strides = {1, 1, 1, 1};
+  params.strides = {1, stride_h, stride_w, 1};
   params.padding = Padding::SAME;
   params.data_format = data_format;
 
@@ -83,7 +84,7 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
           .Input(filter_dims)
           .Input(backprop)
           .Attr("T", DataTypeToEnum<T>::value)
-          .Attr("strides", {1, 1, 1, 1})
+          .Attr("strides", {1, stride_h, stride_w, 1})
           .Attr("padding", "SAME")
           .Attr("data_format", ToString(data_format))
           .Finalize(graph, &conv2d));
@@ -91,12 +92,6 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
   return graph;
 }
 
-// -------------------------------------------------------------------------- //
-// The following benchmarks are used to compare different data format
-// performance for different data types. They make sense only when CUDA enabled,
-// because on CPU we only support data in NHWC.
-// -------------------------------------------------------------------------- //
-
 // Macro arguments names: --------------------------------------------------- //
 // T: data type
 // FORMAT: data format (NHWC or NCHW)
@@ -107,57 +102,70 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
 // FC: filter count
 // FH: filter height
 // FW: filter width
+// SH: stride height
+// SW: stride width
 
-#define BM_NAME(name, type, T, FORMAT, N, H, W, C, FW, FH, FC) \
-  name##_##T##_##FORMAT##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
+#define BM_CONCAT(a, b) a##_##b
 
-#define BM_Conv2DBwdFilterFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type)        \
-  static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FORMAT, N, H, W, C,   \
-                      FW, FH, FC)(int iters) {                                \
-    testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) *     \
-                            (C));                                             \
-    test::Benchmark(#type, Conv2DBackpropFilter<T>(N, H, W, C, FW, FH, FC,    \
-                                                   FORMAT_##FORMAT))          \
-        .Run(iters);                                                          \
-  }                                                                           \
-  BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FORMAT, N, H, W, C, FW, \
-                    FH, FC));
+#define BM_NAME(name, type, T, FORMAT, N, H, W, C, FH, FW, FC, SH, SW) \
+  BM_CONCAT(name##_##T##_##FORMAT##_##type##_in##N##x##H##x##W##x##C,  \
+            f##FH##x##FW##x##FC##_##s##SH##x##SW)
+
+#define BM_Conv2DBwdFilterFmt(T, FORMAT, N, H, W, C, FH, FW, FC, SH, SW, type) \
+  static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FORMAT, N, H, W, C,    \
+                      FH, FW, FC, SH, SW)(int iters) {                         \
+    testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) *      \
+                            (C));                                              \
+    test::Benchmark(#type, Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, \
+                                                   SW, FORMAT_##FORMAT))       \
+        .Run(iters);                                                           \
+  }                                                                            \
+  BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FORMAT, N, H, W, C, FH,  \
+                    FW, FC, SH, SW));
+
+// ResNet50-ish convolutions.
+#define BENCHMARK_DTYPE(FORMAT, BATCH, T, D)                                 \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 64, 1, 1, D);    \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 256, 1, 1, D);   \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 256, 1, 1, 64, 1, 1, D);   \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 3, 3, 64, 1, 1, D);    \
+                                                                             \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 128, 1, 1, D);  \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 512, 1, 1, D);  \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 512, 1, 1, 128, 1, 1, D);  \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 512, 3, 3, 128, 1, 1, D);  \
+                                                                             \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 256, 1, 1, D);  \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 1024, 1, 1, D); \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 1024, 1, 1, 256, 1, 1, D); \
+  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 3, 3, 256, 1, 1, D);
 
-#if GOOGLE_CUDA
 using fp32 = float;
 using fp16 = Eigen::half;
 
-// ResNet50-ish convolutions.
-#define BENCHMARK_DTYPE(FORMAT, BATCH, T)                                \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 64, gpu);    \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 256, gpu);   \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 256, 1, 1, 64, gpu);   \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 3, 3, 64, gpu);    \
-                                                                         \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 128, gpu);  \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 512, gpu);  \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 512, 1, 1, 128, gpu);  \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 512, 3, 3, 128, gpu);  \
-                                                                         \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 256, gpu);  \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 1024, gpu); \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 1024, 1, 1, 256, gpu); \
-  BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 3, 3, 256, gpu);
+BENCHMARK_DTYPE(NHWC, 8, fp32, cpu);
+BENCHMARK_DTYPE(NHWC, 16, fp32, cpu);
+BENCHMARK_DTYPE(NHWC, 32, fp32, cpu);
 
-BENCHMARK_DTYPE(NHWC, 32, fp32);
-BENCHMARK_DTYPE(NCHW, 32, fp32);
+#if GOOGLE_CUDA
+// -------------------------------------------------------------------------- //
+// The following benchmarks are used to compare different data format
+// performance for different data types. They make sense only when CUDA is
+// enabled, because on CPU we only support data in NHWC.
+// -------------------------------------------------------------------------- //
 
-BENCHMARK_DTYPE(NHWC, 32, fp16);
-BENCHMARK_DTYPE(NCHW, 32, fp16);
+BENCHMARK_DTYPE(NHWC, 32, fp32, gpu);
+BENCHMARK_DTYPE(NCHW, 32, fp32, gpu);
 
-BENCHMARK_DTYPE(NHWC, 64, fp32);
-BENCHMARK_DTYPE(NCHW, 64, fp32);
+BENCHMARK_DTYPE(NHWC, 32, fp16, gpu);
+BENCHMARK_DTYPE(NCHW, 32, fp16, gpu);
 
-BENCHMARK_DTYPE(NHWC, 64, fp16);
-BENCHMARK_DTYPE(NCHW, 64, fp16);
+BENCHMARK_DTYPE(NHWC, 64, fp32, gpu);
+BENCHMARK_DTYPE(NCHW, 64, fp32, gpu);
+
+BENCHMARK_DTYPE(NHWC, 64, fp16, gpu);
+BENCHMARK_DTYPE(NCHW, 64, fp16, gpu);
 
 #endif  // GOOGLE_CUDA
 
-BM_Conv2DBwdFilterFmt(float, NHWC, 8, 32, 32, 128, 1, 1, 128, cpu);
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_grad_input_ops_benchmark_test.cc b/tensorflow/core/kernels/conv_grad_input_ops_benchmark_test.cc
index 938ef976ed8..70a08b2496c 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops_benchmark_test.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops_benchmark_test.cc
@@ -38,8 +38,9 @@ static Tensor MakeRandomTensor(const TensorShape& shape) {
 
 template <typename T>
 static Graph* Conv2DBackpropInput(int batch, int height, int width,
-                                  int in_depth, int filter_w, int filter_h,
-                                  int out_depth, TensorFormat data_format) {
+                                  int in_depth, int filter_h, int filter_w,
+                                  int out_depth, int stride_h, int stride_w,
+                                  TensorFormat data_format) {
   auto* graph = new Graph(OpRegistry::Global());
 
   Tensor input_t = data_format == FORMAT_NHWC
@@ -51,7 +52,7 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
   // Compute dimensions for the `out_backprop` tensor.
   Conv2DParameters params;
   params.dilations = {1, 1, 1, 1};
-  params.strides = {1, 1, 1, 1};
+  params.strides = {1, stride_h, stride_w, 1};
   params.padding = Padding::SAME;
   params.data_format = data_format;
 
@@ -83,7 +84,7 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
           .Input(filter)
           .Input(backprop)
           .Attr("T", DataTypeToEnum<T>::value)
-          .Attr("strides", {1, 1, 1, 1})
+          .Attr("strides", {1, stride_h, stride_w, 1})
           .Attr("padding", "SAME")
           .Attr("data_format", ToString(data_format))
           .Finalize(graph, &conv2d));
@@ -91,12 +92,6 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
   return graph;
 }
 
-// -------------------------------------------------------------------------- //
-// The following benchmarks are used to compare different data format
-// performance for different data types. They make sense only when CUDA enabled,
-// because on CPU we only support data in NHWC.
-// -------------------------------------------------------------------------- //
-
 // Macro arguments names: --------------------------------------------------- //
 // T: data type
 // FORMAT: data format (NHWC or NCHW)
@@ -107,57 +102,70 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
 // FC: filter count
 // FH: filter height
 // FW: filter width
+// SH: stride height
+// SW: stride width
 
-#define BM_NAME(name, type, T, FORMAT, N, H, W, C, FW, FH, FC) \
-  name##_##T##_##FORMAT##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
+#define BM_CONCAT(a, b) a##_##b
 
-#define BM_Conv2DBwdInputFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type)          \
-  static void BM_NAME(BM_Conv2DBackpropInput, type, T, FORMAT, N, H, W, C, FW, \
-                      FH, FC)(int iters) {                                     \
+#define BM_NAME(name, type, T, FORMAT, N, H, W, C, FH, FW, FC, SH, SW) \
+  BM_CONCAT(name##_##T##_##FORMAT##_##type##_in##N##x##H##x##W##x##C,  \
+            f##FH##x##FW##x##FC##_##s##SH##x##SW)
+
+#define BM_Conv2DBwdInputFmt(T, FORMAT, N, H, W, C, FH, FW, FC, SH, SW, type)  \
+  static void BM_NAME(BM_Conv2DBackpropInput, type, T, FORMAT, N, H, W, C, FH, \
+                      FW, FC, SH, SW)(int iters) {                             \
     testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) *      \
                             (C));                                              \
-    test::Benchmark(#type, Conv2DBackpropInput<T>(N, H, W, C, FW, FH, FC,      \
-                                                  FORMAT_##FORMAT))            \
+    test::Benchmark(#type, Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH,  \
+                                                  SW, FORMAT_##FORMAT))        \
        .Run(iters);                                                            \
   }                                                                            \
-  BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FORMAT, N, H, W, C, FW,   \
-                    FH, FC));
+  BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FORMAT, N, H, W, C, FH,   \
+                    FW, FC, SH, SW));
 
-#if GOOGLE_CUDA
 using fp32 = float;
 using fp16 = Eigen::half;
 
 // ResNet50-ish convolutions.
-#define BENCHMARK_DTYPE(FORMAT, BATCH, T)                               \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 64, gpu);    \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 256, gpu);   \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 256, 1, 1, 64, gpu);   \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 3, 3, 64, gpu);    \
-                                                                        \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 128, gpu);  \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 512, gpu);  \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 512, 1, 1, 128, gpu);  \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 512, 3, 3, 128, gpu);  \
-                                                                        \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 256, gpu);  \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 1024, gpu); \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 1024, 1, 1, 256, gpu); \
-  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 3, 3, 256, gpu);
+#define BENCHMARK_DTYPE(FORMAT, BATCH, T, D)                                \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 64, 1, 1, D);    \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 256, 1, 1, D);   \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 256, 1, 1, 64, 1, 1, D);   \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 3, 3, 64, 1, 1, D);    \
+                                                                            \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 128, 1, 1, D);  \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 512, 1, 1, D);  \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 512, 1, 1, 128, 1, 1, D);  \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 512, 3, 3, 128, 1, 1, D);  \
+                                                                            \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 256, 1, 1, D);  \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 1024, 1, 1, D); \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 1024, 1, 1, 256, 1, 1, D); \
+  BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 3, 3, 256, 1, 1, D);
 
-BENCHMARK_DTYPE(NHWC, 32, fp32);
-BENCHMARK_DTYPE(NCHW, 32, fp32);
+BENCHMARK_DTYPE(NHWC, 8, fp32, cpu);
+BENCHMARK_DTYPE(NHWC, 16, fp32, cpu);
+BENCHMARK_DTYPE(NHWC, 32, fp32, cpu);
 
-BENCHMARK_DTYPE(NHWC, 32, fp16);
-BENCHMARK_DTYPE(NCHW, 32, fp16);
+#if GOOGLE_CUDA
+// -------------------------------------------------------------------------- //
+// The following benchmarks are used to compare different data format
+// performance for different data types. They make sense only when CUDA is
+// enabled, because on CPU we only support data in NHWC.
+// -------------------------------------------------------------------------- //
 
-BENCHMARK_DTYPE(NHWC, 64, fp32);
-BENCHMARK_DTYPE(NCHW, 64, fp32);
+BENCHMARK_DTYPE(NHWC, 32, fp32, gpu);
+BENCHMARK_DTYPE(NCHW, 32, fp32, gpu);
 
-BENCHMARK_DTYPE(NHWC, 64, fp16);
-BENCHMARK_DTYPE(NCHW, 64, fp16);
+BENCHMARK_DTYPE(NHWC, 32, fp16, gpu);
+BENCHMARK_DTYPE(NCHW, 32, fp16, gpu);
+
+BENCHMARK_DTYPE(NHWC, 64, fp32, gpu);
+BENCHMARK_DTYPE(NCHW, 64, fp32, gpu);
+
+BENCHMARK_DTYPE(NHWC, 64, fp16, gpu);
+BENCHMARK_DTYPE(NCHW, 64, fp16, gpu);
 
 #endif  // GOOGLE_CUDA
 
-BM_Conv2DBwdInputFmt(float, NHWC, 8, 32, 32, 128, 1, 1, 128, cpu);
-
 }  // namespace tensorflow
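
Note for readers new to these benchmark macros: the snippet below is a
hand-expanded sketch of what a single CPU instantiation from this patch
generates, derived by hand from the new BM_CONCAT / BM_NAME definitions
above (illustrative, not actual preprocessor output). Taking
BM_Conv2DBwdFilterFmt(fp32, NHWC, 8, 56, 56, 64, 1, 1, 64, 1, 1, cpu) as the
example, the pasted tokens encode the input shape (in8x56x56x64), the filter
(f1x1x64), and the strides (s1x1) directly in the benchmark symbol:

  // Hand-expanded sketch; assumes the macro definitions added in this patch.
  static void BM_Conv2DBackpropFilter_fp32_NHWC_cpu_in8x56x56x64_f1x1x64_s1x1(
      int iters) {
    // Throughput is reported as iterations times input volume (N * H * W * C).
    testing::ItemsProcessed(static_cast<int64>(iters) * 8 * 56 * 56 * 64);
    test::Benchmark("cpu", Conv2DBackpropFilter<fp32>(8, 56, 56, 64, 1, 1, 64,
                                                      1, 1, FORMAT_NHWC))
        .Run(iters);
  }
  BENCHMARK(BM_Conv2DBackpropFilter_fp32_NHWC_cpu_in8x56x56x64_f1x1x64_s1x1);

Compared to the old flat N_H_W_C_FW_FH_FC suffix, the in/f/s prefixes make the
input shape, filter, and stride readable straight from the benchmark name in
the reported output. Assuming the usual TensorFlow benchmark harness of this
era, the new CPU benchmarks would be run with something like

  bazel run -c opt //tensorflow/core/kernels:conv_grad_filter_ops_benchmark_test -- --benchmarks=all

(--benchmarks is the harness's name filter; verify the exact target and flag
against your checkout).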