Add CPU benchmarks for Conv2D gradients

PiperOrigin-RevId: 282817699
Change-Id: Ib658159605dec1ed653b53db9422742971868101
This commit is contained in:
Eugene Zhulenev 2019-11-27 12:42:25 -08:00 committed by TensorFlower Gardener
parent fe77f79256
commit 5d55514c43
2 changed files with 112 additions and 96 deletions

View File

@@ -38,20 +38,21 @@ static Tensor MakeRandomTensor(const TensorShape& shape) {
template <typename T>
static Graph* Conv2DBackpropFilter(int batch, int height, int width,
int in_depth, int filter_w, int filter_h,
int out_depth, TensorFormat data_format) {
int in_depth, int filter_h, int filter_w,
int out_depth, int stride_h, int stride_w,
TensorFormat data_format) {
auto* graph = new Graph(OpRegistry::Global());
Tensor input_t = data_format == FORMAT_NHWC
? MakeRandomTensor<T>({batch, height, width, in_depth})
: MakeRandomTensor<T>({batch, in_depth, height, width});
Tensor filter_t =
MakeRandomTensor<T>({filter_w, filter_h, in_depth, out_depth});
MakeRandomTensor<T>({filter_h, filter_w, in_depth, out_depth});
// Compute dimensions for the `out_backprop` tensor.
Conv2DParameters params;
params.dilations = {1, 1, 1, 1};
params.strides = {1, 1, 1, 1};
params.strides = {1, stride_h, stride_w, 1};
params.padding = Padding::SAME;
params.data_format = data_format;
@@ -83,7 +84,7 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
.Input(filter_dims)
.Input(backprop)
.Attr("T", DataTypeToEnum<T>::value)
.Attr("strides", {1, 1, 1, 1})
.Attr("strides", {1, stride_h, stride_w, 1})
.Attr("padding", "SAME")
.Attr("data_format", ToString(data_format))
.Finalize(graph, &conv2d));
@@ -91,12 +92,6 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
return graph;
}
// -------------------------------------------------------------------------- //
// The following benchmarks are used to compare different data format
// performance for different data types. They make sense only when CUDA enabled,
// because on CPU we only support data in NHWC.
// -------------------------------------------------------------------------- //
// Macro arguments names: --------------------------------------------------- //
// T: data type
// FORMAT: data format (NHWC or NCHW)
@@ -107,57 +102,70 @@ static Graph* Conv2DBackpropFilter(int batch, int height, int width,
// FC: filter count
// FH: filter height
// FW: filter width
// SH: stride height
// SW: stride width
#define BM_NAME(name, type, T, FORMAT, N, H, W, C, FW, FH, FC) \
name##_##T##_##FORMAT##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
#define BM_CONCAT(a, b) a##_##b
#define BM_Conv2DBwdFilterFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type) \
static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FORMAT, N, H, W, C, \
FW, FH, FC)(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * \
(C)); \
test::Benchmark(#type, Conv2DBackpropFilter<T>(N, H, W, C, FW, FH, FC, \
FORMAT_##FORMAT)) \
.Run(iters); \
} \
BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FORMAT, N, H, W, C, FW, \
FH, FC));
#define BM_NAME(name, type, T, FORMAT, N, H, W, C, FH, FW, FC, SH, SW) \
BM_CONCAT(name##_##T##_##FORMAT##_##type##_in##N##x##H##x##W##x##C, \
f##FH##x##FW##x##FC##_##s##SH##x##SW)
#define BM_Conv2DBwdFilterFmt(T, FORMAT, N, H, W, C, FH, FW, FC, SH, SW, type) \
static void BM_NAME(BM_Conv2DBackpropFilter, type, T, FORMAT, N, H, W, C, \
FH, FW, FC, SH, SW)(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * \
(C)); \
test::Benchmark(#type, Conv2DBackpropFilter<T>(N, H, W, C, FH, FW, FC, SH, \
SW, FORMAT_##FORMAT)) \
.Run(iters); \
} \
BENCHMARK(BM_NAME(BM_Conv2DBackpropFilter, type, T, FORMAT, N, H, W, C, FH, \
FW, FC, SH, SW));
// ResNet50-ish convolutions.
#define BENCHMARK_DTYPE(FORMAT, BATCH, T, D) \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 64, 1, 1, D); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 256, 1, 1, D); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 256, 1, 1, 64, 1, 1, D); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 3, 3, 64, 1, 1, D); \
\
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 128, 1, 1, D); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 512, 1, 1, D); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 512, 1, 1, 128, 1, 1, D); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 512, 3, 3, 128, 1, 1, D); \
\
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 256, 1, 1, D); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 1024, 1, 1, D); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 1024, 1, 1, 256, 1, 1, D); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 3, 3, 256, 1, 1, D);
#if GOOGLE_CUDA
using fp32 = float;
using fp16 = Eigen::half;
// ResNet50-ish convolutions.
#define BENCHMARK_DTYPE(FORMAT, BATCH, T) \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 64, gpu); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 256, gpu); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 256, 1, 1, 64, gpu); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 56, 56, 64, 3, 3, 64, gpu); \
\
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 128, gpu); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 512, gpu); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 512, 1, 1, 128, gpu); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 28, 28, 512, 3, 3, 128, gpu); \
\
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 256, gpu); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 1024, gpu); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 1024, 1, 1, 256, gpu); \
BM_Conv2DBwdFilterFmt(T, FORMAT, BATCH, 14, 14, 256, 3, 3, 256, gpu);
BENCHMARK_DTYPE(NHWC, 8, fp32, cpu);
BENCHMARK_DTYPE(NHWC, 16, fp32, cpu);
BENCHMARK_DTYPE(NHWC, 32, fp32, cpu);
BENCHMARK_DTYPE(NHWC, 32, fp32);
BENCHMARK_DTYPE(NCHW, 32, fp32);
#if GOOGLE_CUDA
// -------------------------------------------------------------------------- //
// The following benchmarks are used to compare different data format
// performance for different data types. They make sense only when CUDA enabled,
// because on CPU we only support data in NHWC.
// -------------------------------------------------------------------------- //
BENCHMARK_DTYPE(NHWC, 32, fp16);
BENCHMARK_DTYPE(NCHW, 32, fp16);
BENCHMARK_DTYPE(NHWC, 32, fp32, gpu);
BENCHMARK_DTYPE(NCHW, 32, fp32, gpu);
BENCHMARK_DTYPE(NHWC, 64, fp32);
BENCHMARK_DTYPE(NCHW, 64, fp32);
BENCHMARK_DTYPE(NHWC, 32, fp16, gpu);
BENCHMARK_DTYPE(NCHW, 32, fp16, gpu);
BENCHMARK_DTYPE(NHWC, 64, fp16);
BENCHMARK_DTYPE(NCHW, 64, fp16);
BENCHMARK_DTYPE(NHWC, 64, fp32, gpu);
BENCHMARK_DTYPE(NCHW, 64, fp32, gpu);
BENCHMARK_DTYPE(NHWC, 64, fp16, gpu);
BENCHMARK_DTYPE(NCHW, 64, fp16, gpu);
#endif // GOOGLE_CUDA
BM_Conv2DBwdFilterFmt(float, NHWC, 8, 32, 32, 128, 1, 1, 128, cpu);
} // namespace tensorflow

View File

@@ -38,8 +38,9 @@ static Tensor MakeRandomTensor(const TensorShape& shape) {
template <typename T>
static Graph* Conv2DBackpropInput(int batch, int height, int width,
int in_depth, int filter_w, int filter_h,
int out_depth, TensorFormat data_format) {
int in_depth, int filter_h, int filter_w,
int out_depth, int stride_h, int stride_w,
TensorFormat data_format) {
auto* graph = new Graph(OpRegistry::Global());
Tensor input_t = data_format == FORMAT_NHWC
@@ -51,7 +52,7 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
// Compute dimensions for the `out_backprop` tensor.
Conv2DParameters params;
params.dilations = {1, 1, 1, 1};
params.strides = {1, 1, 1, 1};
params.strides = {1, stride_h, stride_w, 1};
params.padding = Padding::SAME;
params.data_format = data_format;
@@ -83,7 +84,7 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
.Input(filter)
.Input(backprop)
.Attr("T", DataTypeToEnum<T>::value)
.Attr("strides", {1, 1, 1, 1})
.Attr("strides", {1, stride_h, stride_w, 1})
.Attr("padding", "SAME")
.Attr("data_format", ToString(data_format))
.Finalize(graph, &conv2d));
@@ -91,12 +92,6 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
return graph;
}
// -------------------------------------------------------------------------- //
// The following benchmarks are used to compare different data format
// performance for different data types. They make sense only when CUDA enabled,
// because on CPU we only support data in NHWC.
// -------------------------------------------------------------------------- //
// Macro arguments names: --------------------------------------------------- //
// T: data type
// FORMAT: data format (NHWC or NCHW)
@@ -107,57 +102,70 @@ static Graph* Conv2DBackpropInput(int batch, int height, int width,
// FC: filter count
// FH: filter height
// FW: filter width
// SH: stride height
// SW: stride width
#define BM_NAME(name, type, T, FORMAT, N, H, W, C, FW, FH, FC) \
name##_##T##_##FORMAT##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
#define BM_CONCAT(a, b) a##_##b
#define BM_Conv2DBwdInputFmt(T, FORMAT, N, H, W, C, FW, FH, FC, type) \
static void BM_NAME(BM_Conv2DBackpropInput, type, T, FORMAT, N, H, W, C, FW, \
FH, FC)(int iters) { \
#define BM_NAME(name, type, T, FORMAT, N, H, W, C, FH, FW, FC, SH, SW) \
BM_CONCAT(name##_##T##_##FORMAT##_##type##_in##N##x##H##x##W##x##C, \
f##FH##x##FW##x##FC##_##s##SH##x##SW)
#define BM_Conv2DBwdInputFmt(T, FORMAT, N, H, W, C, FW, FH, FC, SH, SW, type) \
static void BM_NAME(BM_Conv2DBackpropInput, type, T, FORMAT, N, H, W, C, FH, \
FW, FC, SH, SW)(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * \
(C)); \
test::Benchmark(#type, Conv2DBackpropInput<T>(N, H, W, C, FW, FH, FC, \
FORMAT_##FORMAT)) \
test::Benchmark(#type, Conv2DBackpropInput<T>(N, H, W, C, FH, FW, FC, SH, \
SW, FORMAT_##FORMAT)) \
.Run(iters); \
} \
BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FORMAT, N, H, W, C, FW, \
FH, FC));
BENCHMARK(BM_NAME(BM_Conv2DBackpropInput, type, T, FORMAT, N, H, W, C, FH, \
FW, FC, SH, SW));
#if GOOGLE_CUDA
using fp32 = float;
using fp16 = Eigen::half;
// ResNet50-ish convolutions.
#define BENCHMARK_DTYPE(FORMAT, BATCH, T) \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 64, gpu); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 256, gpu); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 256, 1, 1, 64, gpu); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 3, 3, 64, gpu); \
\
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 128, gpu); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 512, gpu); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 512, 1, 1, 128, gpu); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 512, 3, 3, 128, gpu); \
\
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 256, gpu); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 1024, gpu); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 1024, 1, 1, 256, gpu); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 3, 3, 256, gpu);
#define BENCHMARK_DTYPE(FORMAT, BATCH, T, D) \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 64, 1, 1, D); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 1, 1, 256, 1, 1, D); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 256, 1, 1, 64, 1, 1, D); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 56, 56, 64, 3, 3, 64, 1, 1, D); \
\
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 128, 1, 1, D); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 128, 1, 1, 512, 1, 1, D); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 512, 1, 1, 128, 1, 1, D); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 28, 28, 512, 3, 3, 128, 1, 1, D); \
\
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 256, 1, 1, D); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 1, 1, 1024, 1, 1, D); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 1024, 1, 1, 256, 1, 1, D); \
BM_Conv2DBwdInputFmt(T, FORMAT, BATCH, 14, 14, 256, 3, 3, 256, 1, 1, D);
BENCHMARK_DTYPE(NHWC, 32, fp32);
BENCHMARK_DTYPE(NCHW, 32, fp32);
BENCHMARK_DTYPE(NHWC, 8, fp32, cpu);
BENCHMARK_DTYPE(NHWC, 16, fp32, cpu);
BENCHMARK_DTYPE(NHWC, 32, fp32, cpu);
BENCHMARK_DTYPE(NHWC, 32, fp16);
BENCHMARK_DTYPE(NCHW, 32, fp16);
#if GOOGLE_CUDA
// -------------------------------------------------------------------------- //
// The following benchmarks are used to compare different data format
// performance for different data types. They make sense only when CUDA enabled,
// because on CPU we only support data in NHWC.
// -------------------------------------------------------------------------- //
BENCHMARK_DTYPE(NHWC, 64, fp32);
BENCHMARK_DTYPE(NCHW, 64, fp32);
BENCHMARK_DTYPE(NHWC, 32, fp32, gpu);
BENCHMARK_DTYPE(NCHW, 32, fp32, gpu);
BENCHMARK_DTYPE(NHWC, 64, fp16);
BENCHMARK_DTYPE(NCHW, 64, fp16);
BENCHMARK_DTYPE(NHWC, 32, fp16, gpu);
BENCHMARK_DTYPE(NCHW, 32, fp16, gpu);
BENCHMARK_DTYPE(NHWC, 64, fp32, gpu);
BENCHMARK_DTYPE(NCHW, 64, fp32, gpu);
BENCHMARK_DTYPE(NHWC, 64, fp16, gpu);
BENCHMARK_DTYPE(NCHW, 64, fp16, gpu);
#endif // GOOGLE_CUDA
BM_Conv2DBwdInputFmt(float, NHWC, 8, 32, 32, 128, 1, 1, 128, cpu);
} // namespace tensorflow