Add benchmark for the SpatialConvolution specialized gemm_pack_rhs.

PiperOrigin-RevId: 219839163
2018-11-02 12:09:37 -07:00 · 2018-11-02 12:09:37 -07:00 · eaa673fd2b
commit eaa673fd2b
parent 3dce771c0e
7 changed files with 184 additions and 10 deletions
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@ -2414,12 +2414,8 @@ tf_cc_tests(
    ],
    deps = [
        ":eigen_helpers",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
    ],
 )

--- a/tensorflow/core/kernels/eigen_activations_test.cc
+++ b/tensorflow/core/kernels/eigen_activations_test.cc
@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/

 #include "tensorflow/core/kernels/eigen_activations.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"

 namespace Eigen {
--- a/tensorflow/core/kernels/eigen_attention_test.cc
+++ b/tensorflow/core/kernels/eigen_attention_test.cc
@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/

 #include "tensorflow/core/kernels/eigen_attention.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"

 namespace Eigen {
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/

 #include "tensorflow/core/kernels/eigen_backward_spatial_convolutions.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h"
 #include "tensorflow/core/platform/test.h"

--- a/tensorflow/core/kernels/eigen_pooling_test.cc
+++ b/tensorflow/core/kernels/eigen_pooling_test.cc
@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/

 #include "tensorflow/core/kernels/eigen_pooling.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"

 namespace Eigen {
--- a/tensorflow/core/kernels/eigen_softmax_test.cc
+++ b/tensorflow/core/kernels/eigen_softmax_test.cc
@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/

 #include "tensorflow/core/kernels/eigen_softmax.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"

 namespace Eigen {
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/

 #include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/eigen_cuboid_convolution.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"

 namespace Eigen {

@ -1373,4 +1373,187 @@ TEST(EigenSpatialConvolutionsTest, SpatialConvContractionMapper) {
  EigenApprox(8.0f, direct(0, 1, 3, 0));
 }

+// Benchmarks the SpatialConvolution specialized gemm_pack_rhs by repeatedly
+// packing randomly chosen blocks of image patches.
+//
+// Setup (not timed): builds enough random input tensors to total around
+// 512mb, wraps each one into an image-patch + reshape evaluator and an rhs
+// input mapper. Timed section: `iters` calls of pack_rhs on sub-mappers taken
+// at random (depth, col) offsets of a randomly selected input.
+//
+// Args:
+//   iters: number of timed pack_rhs invocations.
+//   input_batches, input_cols, input_rows, input_depth: input dimensions.
+//   filter_count: number of filters; not read by this helper.
+//   filter_cols, filter_rows: filter (kernel) dimensions, used as patch size.
+static void PackRhsHelper(int iters,
+                          /* Input dimensions: */
+                          int input_batches, int input_cols, int input_rows,
+                          int input_depth,
+                          /* Filter (kernel) dimensions: */
+                          int filter_count, int filter_cols, int filter_rows) {
+  tensorflow::testing::UseRealTime();
+  tensorflow::testing::StopTiming();
+
+  using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
+
+  // Default Eigen::Tensor layout is column major, so we configure dimensions
+  // starting from the inner most (channels aka depth in this case).
+  Dimensions input_dims(input_depth, input_rows, input_cols, input_batches);
+
+  using Traits = typename Eigen::internal::gebp_traits<float, float>;
+  static const int packet_size = Eigen::internal::packet_traits<float>::size;
+
+  // Reshape dimensions.
+  using NewDimension = Eigen::array<Eigen::Index, 2>;
+
+  // Contraction dimensions.
+  using nocontract_t = Eigen::array<Eigen::Index, 1>;
+  using contract_t = Eigen::array<Eigen::Index, 1>;
+
+  // Input to the TensorImagePatchOp.
+  using ArgType = Tensor<float, 4>;
+
+  using Evaluator = TensorEvaluator<
+      const TensorReshapingOp<
+          NewDimension, const TensorImagePatchOp<Dynamic, Dynamic, ArgType>>,
+      Eigen::DefaultDevice>;
+
+  using InputMapper = Eigen::internal::TensorContractionInputMapper<
+      float, Index, Eigen::internal::Rhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+  using SubMapper = Eigen::internal::TensorContractionSubMapper<
+      float, Index, Eigen::internal::Rhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+  using PackRhsImpl =
+      Eigen::internal::gemm_pack_rhs<float, Eigen::Index, SubMapper,  //
+                                     Traits::nr,                      //
+                                     ColMajor,                        //
+                                     /*Conjugate*/ false,             //
+                                     /*PanelMode*/ false>;
+
+  Eigen::DefaultDevice device;
+
+  // Actual contract dimensions are not important.
+  const Eigen::Index not_important = -1234;
+  nocontract_t nocontract_dim = {not_important};
+  contract_t contract_dim = {not_important};
+
+  // We use tensor of the same dimensions to store packed data.
+  Tensor<float, 4> packed(input_dims);
+
+  // We generate multiple input tensors, around 512mb in total size to measure
+  // realistic workload when input data is not in L1-L3 cache.
+  size_t input_bytes = input_dims.TotalSize() * sizeof(float);
+  size_t mem_size_bytes = 1024 * 1024 * 512;
+  size_t num_inputs =
+      std::max(static_cast<size_t>(1), mem_size_bytes / input_bytes);
+
+  std::vector<Tensor<float, 4>> inputs;
+  std::vector<Evaluator> evaluators;
+  std::vector<InputMapper> input_mappers;
+
+  // Reserve up front so none of the vectors reallocates while it is being
+  // filled: evaluators are built from ops wrapping inputs[i], and mappers are
+  // built from evaluators[i].
+  // NOTE(review): presumably those objects may keep pointers into earlier
+  // elements, which a reallocation would invalidate - confirm.
+  inputs.reserve(num_inputs);
+  evaluators.reserve(num_inputs);
+  input_mappers.reserve(num_inputs);
+
+  for (size_t i = 0; i < num_inputs; ++i) {
+    inputs.emplace_back(input_dims);
+    inputs[i].setRandom();
+
+    // 1. Extract image patches from input tensor. All strides are `1`.
+    const auto image_patch_op = TensorImagePatchOp<Dynamic, Dynamic, ArgType>(
+        inputs[i],                                             //
+        filter_rows, filter_cols,                              //
+        /*row_strides=*/1, /*col_strides=*/1,                  //
+        /*in_row_strides=*/1, /*in_col_strides=*/1,            //
+        /*row_inflate_strides=*/1, /*col_inflate_strides=*/1,  //
+        Eigen::PADDING_SAME, /*padding_value=*/0.0);
+
+    // 2. Reshape extracted patches into "virtual" 2d tensor.
+    NewDimension reshape_dims = {
+        input_depth * filter_rows * filter_cols,  // patch size
+        // PADDING_SAME: output {rows, cols} == input {rows, cols}
+        input_rows * input_cols * input_batches};  // num_patches
+    const auto reshape_op =
+        TensorReshapingOp<NewDimension, decltype(image_patch_op)>(
+            image_patch_op, reshape_dims);
+
+    evaluators.emplace_back(reshape_op, device);
+
+    input_mappers.emplace_back(evaluators[i], nocontract_dim, nocontract_dim,
+                               contract_dim, contract_dim);
+  }
+
+  // We read properties of extracted image patches directly from evaluator.
+  const Index patch_depth = evaluators[0].impl().dimensions()[0];
+  const Index patch_rows = evaluators[0].impl().dimensions()[1];
+  const Index patch_cols = evaluators[0].impl().dimensions()[2];
+
+  // Number of patches is the same as the maximum column available through the
+  // InputMapper (SubMapper).
+  const Index num_patches = evaluators[0].impl().dimensions()[3];
+
+  // The size of a single patch, it's the same as the maximum depth available
+  // through the InputMapper (SubMapper).
+  const Index patch_size = patch_depth * patch_rows * patch_cols;
+
+  PackRhsImpl pack_rhs;
+
+  // This is the typical size of the rhs block used in Tensor contractions.
+  const Index default_depth = 320;  // must be multiple of 8
+  const Index default_cols = 280;
+
+  const Index packed_total_size = input_dims.TotalSize();
+
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    // With a single input the only valid index is 0; otherwise pick one of the
+    // inputs at random.
+    const int input_idx =
+        num_inputs == 1 ? 0 : internal::random<int>(0, num_inputs - 1);
+
+    // Depth offset must be a multiple of 8 (float packet size with AVX2).
+    Index depth_offset = (internal::random<Index>(0, patch_size - 10) / 8) * 8;
+    Index col_offset = internal::random<Index>(0, num_patches - 10);
+
+    // Clamp the block to what is left past the chosen offsets.
+    Index depth = std::min(default_depth, patch_size - depth_offset);
+    Index cols = std::min(default_cols, num_patches - col_offset);
+
+    // Write packed data to random memory location to emulate cold caches.
+    // The upper bound assumes packed_total_size exceeds packed_size.
+    Index packed_size = depth * cols;
+    Index packed_offset =
+        internal::random<Index>(0, packed_total_size - packed_size - 1);
+
+    pack_rhs(packed.data() + packed_offset,
+             input_mappers[input_idx].getSubMapper(depth_offset, col_offset),
+             depth, cols);
+  }
+  tensorflow::testing::StopTiming();
+
+  // Report the patch geometry next to the benchmark result.
+  std::ostringstream stringStream;
+  stringStream << "patch: depth=" << patch_depth << " rows=" << patch_rows
+               << " cols=" << patch_cols << " num_patches=" << num_patches
+               << " patch_size=" << patch_size << " num_inputs=" << num_inputs;
+  tensorflow::testing::SetLabel(stringStream.str());
+}
+
+// Builds the benchmark function identifier from the benchmark kind and the
+// input/filter dimensions, e.g. BM_PackRhs_32_64x64_IC32_FC64_5x5.
+#define BM_NAME(KIND, BATCH, IN_H, IN_W, IN_C, FILTERS, F_H, F_W) \
+  BM_##KIND##_##BATCH##_##IN_H##x##IN_W##_IC##IN_C##_FC##FILTERS##_##F_H##x##F_W
+
+// Defines a PackRhs benchmark function for the given input and filter
+// dimensions and registers it with BENCHMARK. Arguments are forwarded
+// positionally to PackRhsHelper after `iters`.
+#define BM_PackRhs(BATCH, IN_H, IN_W, IN_C, FILTERS, F_H, F_W)             \
+  static void BM_NAME(PackRhs, BATCH, IN_H, IN_W, IN_C, FILTERS, F_H,      \
+                      F_W)(int iters) {                                    \
+    PackRhsHelper(iters, BATCH, IN_H, IN_W, IN_C, FILTERS, F_H, F_W);      \
+  }                                                                        \
+  BENCHMARK(BM_NAME(PackRhs, BATCH, IN_H, IN_W, IN_C, FILTERS, F_H, F_W))
+
+// Number of input channels (input depth) is equal to the number of patch
+// channels (patch depth).
+
+// NOTE: This is the most common case in Tensorflow models.
+// Fast path: input channel dimension is a multiple of the packet size.
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 32,     //
+           /*num_filters*/ 64,  //
+           /*filter*/ 5, 5);
+
+// Slow path: input channel dimension is not a multiple of the packet size.
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 30,     //
+           /*num_filters*/ 64,  //
+           /*filter*/ 5, 5);
+
 }  // namespace Eigen