From 5a8c1c2a9e34eaaefc49645c8024f6433464a695 Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Fri, 21 Jun 2019 14:03:20 -0700
Subject: [PATCH] Make the kStandardCpp kernel layout and the cache-friendly
 traversal threshold part of the Spec, allowing tests
 (test_special_specs_*) to exercise non-default values.

PiperOrigin-RevId: 254464005
---
 tensorflow/lite/experimental/ruy/BUILD        | 11 ++++----
 tensorflow/lite/experimental/ruy/block_map.cc |  4 +--
 tensorflow/lite/experimental/ruy/block_map.h  | 16 +----------
 tensorflow/lite/experimental/ruy/dispatch.h   |  3 ++
 .../lite/experimental/ruy/internal_matrix.h   | 12 ++------
 tensorflow/lite/experimental/ruy/kernel.h     |  4 +--
 tensorflow/lite/experimental/ruy/matrix.h     | 13 +++++++++
 tensorflow/lite/experimental/ruy/spec.h       | 19 +++++++++++++
 .../experimental/ruy/test_special_specs.cc    | 28 +++++++++++++++++++
 tensorflow/lite/experimental/ruy/trmul.cc     | 12 ++++----
 tensorflow/lite/experimental/ruy/trmul.h      |  1 +
 11 files changed, 84 insertions(+), 39 deletions(-)

diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD
index aa621e3f53e..8577e959efa 100644
--- a/tensorflow/lite/experimental/ruy/BUILD
+++ b/tensorflow/lite/experimental/ruy/BUILD
@@ -44,11 +44,6 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "spec",
-    hdrs = ["spec.h"],
-)
-
 cc_library(
     name = "size_util",
     hdrs = ["size_util.h"],
@@ -213,6 +208,12 @@ cc_library(
     deps = [":check_macros"],
 )
 
+cc_library(
+    name = "spec",
+    hdrs = ["spec.h"],
+    deps = [":matrix"],
+)
+
 cc_library(
     name = "internal_matrix",
     hdrs = ["internal_matrix.h"],
diff --git a/tensorflow/lite/experimental/ruy/block_map.cc b/tensorflow/lite/experimental/ruy/block_map.cc
index 5e3ef859e7a..08b3c6064f9 100644
--- a/tensorflow/lite/experimental/ruy/block_map.cc
+++ b/tensorflow/lite/experimental/ruy/block_map.cc
@@ -81,7 +81,7 @@ int floor_log2_quotient(int num, int denom) {
 
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map) {
+                  int cache_friendly_traversal_threshold, BlockMap* block_map) {
   gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
   RUY_DCHECK_GE(rows, kernel_rows);
   RUY_DCHECK_GE(cols, kernel_cols);
@@ -89,7 +89,7 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
   block_map->traversal_order = BlockMapTraversalOrder::kLinear;
   if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
       (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
-          kCacheFriendlyLoopThreshold) {
+          cache_friendly_traversal_threshold) {
     block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
                                      ? BlockMapTraversalOrder::kFractalU
                                      : BlockMapTraversalOrder::kFractalZ;
diff --git a/tensorflow/lite/experimental/ruy/block_map.h b/tensorflow/lite/experimental/ruy/block_map.h
index 7a0f74cb359..b0567ea481f 100644
--- a/tensorflow/lite/experimental/ruy/block_map.h
+++ b/tensorflow/lite/experimental/ruy/block_map.h
@@ -20,20 +20,6 @@ limitations under the License.
 
 namespace ruy {
 
-// The value and even the meaning of this constant are empirically
-// determined. Coarsely speaking, it's compared with the size of source
-// LHS and RHS operands to determine whether they are big enough to be worth
-// traversing in a more complicated "cache friendly" order. The current
-// value is roughly the minimum size of a L1 cache on any CPU that we currently
-// care about, e.g. ARM Cortex-A53. But we honestly don't even know the precise
-// extent to which this should be related to L1 cache size.
-//
-// A lower value is not necessarily 'safer' from a cache-friendliness
-// perspective: it means switching sooner (at smaller sizes) to more complicated
-// traversal orders, which might be adversarial to the CPU's auto-prefetching
-// or to the TLB.
-static constexpr int kCacheFriendlyLoopThreshold = 32 * 1024;
-
 enum class BlockMapTraversalOrder {
   // Plain old row-by-row or column-by-column traversal.
   kLinear,
@@ -126,7 +112,7 @@ struct BlockMap {
 // matrix multiplication with the given parameters.
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map);
+                  int cache_friendly_traversal_threshold, BlockMap* block_map);
 
 // Maps an integer index to a (block_r, block_c) block position in the grid.
 void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index,
diff --git a/tensorflow/lite/experimental/ruy/dispatch.h b/tensorflow/lite/experimental/ruy/dispatch.h
index be0a4e5a641..9044be70bb7 100644
--- a/tensorflow/lite/experimental/ruy/dispatch.h
+++ b/tensorflow/lite/experimental/ruy/dispatch.h
@@ -189,6 +189,9 @@ void PopulateTrMulParams(TrMulParams* params) {
       &RunPack<ThePath, RhsKernelLayout, RhsScalar, PackedRhsScalar>;
   params->run_kernel =
       &RunKernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
+
+  params->cache_friendly_traversal_threshold =
+      Spec::cache_friendly_traversal_threshold();
   return;
 }
 
diff --git a/tensorflow/lite/experimental/ruy/internal_matrix.h b/tensorflow/lite/experimental/ruy/internal_matrix.h
index a8e8f1cae68..f44ce444dc4 100644
--- a/tensorflow/lite/experimental/ruy/internal_matrix.h
+++ b/tensorflow/lite/experimental/ruy/internal_matrix.h
@@ -100,6 +100,8 @@ limitations under the License.
 namespace ruy {
 
 // KernelLayout describes small-scale block structure in a packed matrix layout.
+// It's a runtime (as opposed to compile-time-constant) version of the
+// FixedKernelLayout struct used to declare kernel layouts.
 //
 // This is is sometimes known as "tiling" in other contexts.
 //
@@ -117,16 +119,6 @@ struct KernelLayout {
   std::uint8_t cols = 1;
 };
 
-// Compile time version of KernelLayout, suitable for template metaprogramming.
-// In particular, partial template specializations of Kernel use this type to
-// statically declare their kernel format.
-template <Order tOrder, int tRows, int tCols>
-struct FixedKernelLayout {
-  static constexpr Order kOrder = tOrder;
-  static constexpr int kRows = tRows;
-  static constexpr int kCols = tCols;
-};
-
 // A packed matrix has a small-scale block structure that is not present in in
 // the input matrices. This block structure is necessary for the kernels to
 // process data efficiently.
diff --git a/tensorflow/lite/experimental/ruy/kernel.h b/tensorflow/lite/experimental/ruy/kernel.h
index 8b03bc727e0..6b4c98f4322 100644
--- a/tensorflow/lite/experimental/ruy/kernel.h
+++ b/tensorflow/lite/experimental/ruy/kernel.h
@@ -148,8 +148,8 @@ template <typename LhsScalar, typename RhsScalar, typename DstScalar,
           typename Spec>
 struct Kernel<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar, Spec> {
   using AccumScalar = typename Spec::AccumScalar;
-  using LhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
-  using RhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using LhsLayout = typename Spec::StandardCppKernelLhsLayout;
+  using RhsLayout = typename Spec::StandardCppKernelRhsLayout;
   explicit Kernel(Tuning) {}
   void Run(const PackedMatrix<LhsScalar>& lhs,
            const PackedMatrix<RhsScalar>& rhs, const Spec& spec, int start_row,
diff --git a/tensorflow/lite/experimental/ruy/matrix.h b/tensorflow/lite/experimental/ruy/matrix.h
index 7a3351ba1ac..3f26f091a79 100644
--- a/tensorflow/lite/experimental/ruy/matrix.h
+++ b/tensorflow/lite/experimental/ruy/matrix.h
@@ -148,6 +148,19 @@ StreamType& operator<<(StreamType& stream, const Matrix<Scalar>& mat) {
   return stream;
 }
 
+// Compile-time version of KernelLayout, used to declare kernel layouts in a
+// way that can be consumed by compile-time logic.
+// See how partial specializations of Kernel use it to declare their layouts.
+// The only reason why this is currently part of the public API is to
+// allow testing various layouts for the Path::kStandardCpp kernel, as a
+// testing-only feature. See Spec::StandardCppKernelLhsLayout.
+template <Order tOrder, int tRows, int tCols>
+struct FixedKernelLayout {
+  static constexpr Order kOrder = tOrder;
+  static constexpr int kRows = tRows;
+  static constexpr int kCols = tCols;
+};
+
 }  // namespace ruy
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_
diff --git a/tensorflow/lite/experimental/ruy/spec.h b/tensorflow/lite/experimental/ruy/spec.h
index b4d5901320f..091344503ed 100644
--- a/tensorflow/lite/experimental/ruy/spec.h
+++ b/tensorflow/lite/experimental/ruy/spec.h
@@ -20,6 +20,8 @@ limitations under the License.
 #include <limits>
 #include <type_traits>
 
+#include "tensorflow/lite/experimental/ruy/matrix.h"
+
 namespace ruy {
 
 // Our 'general' loop structure (the default) involves multi-threading and
@@ -96,6 +98,23 @@ struct BasicSpec {
   // See above enum ZeroPointSupport
   static constexpr ZeroPointSupport kZeroPointSupport =
       ZeroPointSupport::kGeneral;
+  // Testing-only, not meant to be used by actual users: layouts of the
+  // Path::kStandardCpp kernel, overridable to test various kernel layouts.
+  using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  // The value and even the meaning of this threshold are empirically
+  // determined. Coarsely speaking, it's compared with the size of source
+  // LHS and RHS operands to determine whether they are big enough to be worth
+  // traversing in a more complicated "cache friendly" order. The current
+  // value is roughly the minimum size of an L1 cache on any CPU that we
+  // currently care about, e.g. ARM Cortex-A53. But we honestly don't even know
+  // the precise extent to which this should be related to L1 cache size.
+  //
+  // A lower value is not necessarily 'safer' from a cache-friendliness
+  // perspective: it means switching sooner (at smaller sizes) to more
+  // complicated traversal orders, which might be adversarial to the CPU's
+  // auto-prefetching or to the TLB.
+  static int cache_friendly_traversal_threshold() { return 32 * 1024; }
 };
 
 }  // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/test_special_specs.cc b/tensorflow/lite/experimental/ruy/test_special_specs.cc
index 5e1d8d980f5..daaadf1757e 100644
--- a/tensorflow/lite/experimental/ruy/test_special_specs.cc
+++ b/tensorflow/lite/experimental/ruy/test_special_specs.cc
@@ -36,6 +36,14 @@ struct RCCSpec : BasicSpec<AccumScalar, DstScalar> {
   static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC;
 };
 
+template <typename AccumScalar, typename DstScalar, typename LhsKernelLayout,
+          typename RhsKernelLayout>
+struct StandardCppKernelLayoutSpec : BasicSpec<AccumScalar, DstScalar> {
+  using StandardCppKernelLhsLayout = LhsKernelLayout;
+  using StandardCppKernelRhsLayout = RhsKernelLayout;
+  static int cache_friendly_traversal_threshold() { return 0; }
+};
+
 using LhsScalar = RUY_TEST_LHSSCALAR;
 using RhsScalar = RUY_TEST_RHSSCALAR;
 using AccumScalar = RUY_TEST_ACCUMSCALAR;
@@ -123,4 +131,24 @@ TEST(TestSpecialSpecs, RCC) {
   TestNonRCC<RCCTestSet>(81, 93, 72, ExpectedOutcome::kDeath);
 }
 
+template <typename LhsKernelLayout, typename RhsKernelLayout>
+void TestStandardCppKernelLayout() {
+  using SpecType =
+      StandardCppKernelLayoutSpec<AccumScalar, DstScalar, LhsKernelLayout,
+                                  RhsKernelLayout>;
+  using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>;
+  for (int size = 1; size < 10; size++) {
+    TestLinearAllOrders<TestSetType>(size, size, size);
+  }
+  TestLinearAllOrders<TestSetType>(87, 34, 56);
+  TestLinearAllOrders<TestSetType>(123, 234, 78);
+}
+
+TEST(TestSpecialSpecs, StandardCppKernelLayout) {
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kColMajor, 1, 1>,
+                              FixedKernelLayout<Order::kColMajor, 1, 1>>();
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kRowMajor, 4, 4>,
+                              FixedKernelLayout<Order::kRowMajor, 4, 4>>();
+}
+
 }  // namespace ruy
diff --git a/tensorflow/lite/experimental/ruy/trmul.cc b/tensorflow/lite/experimental/ruy/trmul.cc
index c2dcfcf1810..c7beb6ea042 100644
--- a/tensorflow/lite/experimental/ruy/trmul.cc
+++ b/tensorflow/lite/experimental/ruy/trmul.cc
@@ -169,10 +169,10 @@ int GetThreadCount(Context* context, int rows, int cols, int depth) {
   return clamp(guess, 1, context->max_num_threads);
 }
 
-LoopStructure GetLoopStructure(int thread_count, int rows, int cols,
-                               int depth) {
+LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth,
+                               int cache_friendly_traversal_threshold) {
   if (thread_count == 1 &&
-      (rows + cols) * depth < kCacheFriendlyLoopThreshold) {
+      (rows + cols) * depth < cache_friendly_traversal_threshold) {
     return LoopStructure::kSimple;
   }
   return LoopStructure::kGeneral;
@@ -195,7 +195,9 @@ void TrMul(TrMulParams* params, Context* context) {
   const int cols_rounded_up = packed_rhs.layout.cols;
 
   int thread_count = GetThreadCount(context, rows, cols, depth);
-  const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth);
+  const auto loop_structure =
+      GetLoopStructure(thread_count, rows, cols, depth,
+                       params->cache_friendly_traversal_threshold);
   Allocator* allocator = context->GetMainAllocator();
 
   if (!params->lhs_is_prepacked) {
@@ -231,7 +233,7 @@ void TrMul(TrMulParams* params, Context* context) {
   MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
                packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
                packed_lhs.data_type.size, packed_rhs.data_type.size,
-               &block_map);
+               params->cache_friendly_traversal_threshold, &block_map);
   std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
   std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
   std::uint32_t num_blocks = NumBlocks(block_map);
diff --git a/tensorflow/lite/experimental/ruy/trmul.h b/tensorflow/lite/experimental/ruy/trmul.h
index 64b43aca37f..1a3872bc2ba 100644
--- a/tensorflow/lite/experimental/ruy/trmul.h
+++ b/tensorflow/lite/experimental/ruy/trmul.h
@@ -62,6 +62,7 @@ struct TrMulParams {
   PMatrix packed_rhs;
   bool lhs_is_prepacked = false;
   bool rhs_is_prepacked = false;
+  int cache_friendly_traversal_threshold = 0;
 
   // Type-erased Spec.
   void* spec = nullptr;