Make the kStandardCpp kernel layout and the cache-friendly traversal
threshold part of the Spec, so that tests (test_special_specs_*) can exercise
non-default values.

PiperOrigin-RevId: 254464005
commit 5a8c1c2a9e (parent ecb03b1538)
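At a glance, the change adds two testing-only knobs to the Spec. Restated here for orientation (this is simply an excerpt of what the spec.h hunk further down adds to BasicSpec, not new API):

  // Layouts used by the Path::kStandardCpp kernel when packing operands.
  using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
  using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
  // Threshold compared against the combined size of the LHS and RHS operands
  // to decide when the cache-friendly traversal / general loop are worthwhile.
  static int cache_friendly_traversal_threshold() { return 32 * 1024; }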
@@ -44,11 +44,6 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "spec",
-    hdrs = ["spec.h"],
-)
-
 cc_library(
     name = "size_util",
     hdrs = ["size_util.h"],
@@ -213,6 +208,12 @@ cc_library(
     deps = [":check_macros"],
 )
 
+cc_library(
+    name = "spec",
+    hdrs = ["spec.h"],
+    deps = [":matrix"],
+)
+
 cc_library(
     name = "internal_matrix",
     hdrs = ["internal_matrix.h"],
@@ -81,7 +81,7 @@ int floor_log2_quotient(int num, int denom) {
 
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map) {
+                  int cache_friendly_traversal_threshold, BlockMap* block_map) {
   gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
   RUY_DCHECK_GE(rows, kernel_rows);
   RUY_DCHECK_GE(cols, kernel_cols);
@@ -89,7 +89,7 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
   block_map->traversal_order = BlockMapTraversalOrder::kLinear;
   if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
       (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
-          kCacheFriendlyLoopThreshold) {
+          cache_friendly_traversal_threshold) {
     block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
                                      ? BlockMapTraversalOrder::kFractalU
                                      : BlockMapTraversalOrder::kFractalZ;
@@ -20,20 +20,6 @@ limitations under the License.
 
 namespace ruy {
 
-// The value and even the meaning of this constant are empirically
-// determined. Coarsely speaking, it's compared with the size of source
-// LHS and RHS operands to determine whether they are big enough to be worth
-// traversing in a more complicated "cache friendly" order. The current
-// value is roughly the minimum size of a L1 cache on any CPU that we currently
-// care about, e.g. ARM Cortex-A53. But we honestly don't even know the precise
-// extent to which this should be related to L1 cache size.
-//
-// A lower value is not necessarily 'safer' from a cache-friendliness
-// perspective: it means switching sooner (at smaller sizes) to more complicated
-// traversal orders, which might be adversarial to the CPU's auto-prefetching
-// or to the TLB.
-static constexpr int kCacheFriendlyLoopThreshold = 32 * 1024;
-
 enum class BlockMapTraversalOrder {
   // Plain old row-by-row or column-by-column traversal.
   kLinear,
@@ -126,7 +112,7 @@ struct BlockMap {
 // matrix multiplication with the given parameters.
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map);
+                  int cache_friendly_traversal_threshold, BlockMap* block_map);
 
 // Maps an integer index to a (block_r, block_c) block position in the grid.
 void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index,
@@ -189,6 +189,9 @@ void PopulateTrMulParams(TrMulParams* params) {
       &RunPack<ThePath, RhsKernelLayout, RhsScalar, PackedRhsScalar>;
   params->run_kernel =
       &RunKernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
+
+  params->cache_friendly_traversal_threshold =
+      Spec::cache_friendly_traversal_threshold();
   return;
 }
 
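This hunk is where the compile-time Spec value crosses into the type-erased TrMulParams. A minimal sketch of that hand-off pattern, with hypothetical stand-in names:

// Sketch only: a static member of the compile-time Spec is copied into a plain
// int on a type-erased params struct, so downstream runtime code never needs
// to know the Spec type. ParamsSketch and PopulateSketch are hypothetical.
struct ParamsSketch {
  int cache_friendly_traversal_threshold = 0;
};

template <typename SpecT>
void PopulateSketch(ParamsSketch* params) {
  params->cache_friendly_traversal_threshold =
      SpecT::cache_friendly_traversal_threshold();
}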
@@ -100,6 +100,8 @@ limitations under the License.
 namespace ruy {
 
 // KernelLayout describes small-scale block structure in a packed matrix layout.
+// It's a runtime (as opposed to compile-time-constant) version of the
+// FixedKernelLayout struct used to declare kernel layouts.
 //
 // This is is sometimes known as "tiling" in other contexts.
 //
@@ -117,16 +119,6 @@ struct KernelLayout {
   std::uint8_t cols = 1;
 };
 
-// Compile time version of KernelLayout, suitable for template metaprogramming.
-// In particular, partial template specializations of Kernel use this type to
-// statically declare their kernel format.
-template <Order tOrder, int tRows, int tCols>
-struct FixedKernelLayout {
-  static constexpr Order kOrder = tOrder;
-  static constexpr int kRows = tRows;
-  static constexpr int kCols = tCols;
-};
-
 // A packed matrix has a small-scale block structure that is not present in in
 // the input matrices. This block structure is necessary for the kernels to
 // process data efficiently.
@@ -148,8 +148,8 @@ template <typename LhsScalar, typename RhsScalar, typename DstScalar,
           typename Spec>
 struct Kernel<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar, Spec> {
   using AccumScalar = typename Spec::AccumScalar;
-  using LhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
-  using RhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using LhsLayout = typename Spec::StandardCppKernelLhsLayout;
+  using RhsLayout = typename Spec::StandardCppKernelRhsLayout;
   explicit Kernel(Tuning) {}
   void Run(const PackedMatrix<LhsScalar>& lhs,
            const PackedMatrix<RhsScalar>& rhs, const Spec& spec, int start_row,
@@ -148,6 +148,19 @@ StreamType& operator<<(StreamType& stream, const Matrix<Scalar>& mat) {
   return stream;
 }
 
+// Compile-time version of KernelLayout, used to declare kernel layouts in a
+// way that can be consumed by compile-time logic.
+// See how partial specializations of Kernel use it to declare their layouts.
+// The only reason why this is currently part of the public API is to
+// allow testing various layouts for the Path::kStandardCpp kernel, as a
+// testing-only feature. See Spec::StandardCppKernelLhsLayout.
+template <Order tOrder, int tRows, int tCols>
+struct FixedKernelLayout {
+  static constexpr Order kOrder = tOrder;
+  static constexpr int kRows = tRows;
+  static constexpr int kCols = tCols;
+};
+
 }  // namespace ruy
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_
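FixedKernelLayout is a plain compile-time tag; a short sketch of consuming it from template code, assuming ruy/matrix.h is included and using only the members shown in the hunk above:

#include "tensorflow/lite/experimental/ruy/matrix.h"

// Sketch: any code templated on a layout type can read its static members.
template <typename Layout>
constexpr int KernelTileElems() {
  return Layout::kRows * Layout::kCols;
}

static_assert(
    KernelTileElems<ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>>() == 16,
    "a 4x4 kernel tile covers 16 elements");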
@@ -20,6 +20,8 @@ limitations under the License.
 #include <limits>
 #include <type_traits>
 
+#include "tensorflow/lite/experimental/ruy/matrix.h"
+
 namespace ruy {
 
 // Our 'general' loop structure (the default) involves multi-threading and
@@ -96,6 +98,23 @@ struct BasicSpec {
   // See above enum ZeroPointSupport
   static constexpr ZeroPointSupport kZeroPointSupport =
       ZeroPointSupport::kGeneral;
+  // Testing-only, not meant to be used by actual users:
+  // Used for testing of various kernel layouts.
+  using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  // The value and even the meaning of this value are empirically
+  // determined. Coarsely speaking, it's compared with the size of source
+  // LHS and RHS operands to determine whether they are big enough to be worth
+  // traversing in a more complicated "cache friendly" order. The current
+  // value is roughly the minimum size of a L1 cache on any CPU that we
+  // currently care about, e.g. ARM Cortex-A53. But we honestly don't even know
+  // the precise extent to which this should be related to L1 cache size.
+  //
+  // A lower value is not necessarily 'safer' from a cache-friendliness
+  // perspective: it means switching sooner (at smaller sizes) to more
+  // complicated traversal orders, which might be adversarial to the CPU's
+  // auto-prefetching or to the TLB.
+  static int cache_friendly_traversal_threshold() { return 32 * 1024; }
 };
 
 }  // namespace ruy
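Because the threshold is now an ordinary static member and the layouts ordinary member typedefs, a Spec can override them simply by redeclaring them in a derived struct; the test below does exactly that with StandardCppKernelLayoutSpec. A minimal sketch (MySpec is a hypothetical name):

template <typename AccumScalar, typename DstScalar>
struct MySpec : ruy::BasicSpec<AccumScalar, DstScalar> {
  // Threshold 0 means every multiplication, however small, takes the general
  // loop structure and, when RUY_OPT_FRACTAL is enabled, the cache-friendly
  // traversal order.
  static int cache_friendly_traversal_threshold() { return 0; }
  // Pack operands for the standard C++ kernel into 4x4 row-major tiles.
  using StandardCppKernelLhsLayout =
      ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>;
  using StandardCppKernelRhsLayout =
      ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>;
};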
@@ -36,6 +36,14 @@ struct RCCSpec : BasicSpec<AccumScalar, DstScalar> {
   static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC;
 };
 
+template <typename AccumScalar, typename DstScalar, typename LhsKernelLayout,
+          typename RhsKernelLayout>
+struct StandardCppKernelLayoutSpec : BasicSpec<AccumScalar, DstScalar> {
+  using StandardCppKernelLhsLayout = LhsKernelLayout;
+  using StandardCppKernelRhsLayout = RhsKernelLayout;
+  static int cache_friendly_traversal_threshold() { return 0; }
+};
+
 using LhsScalar = RUY_TEST_LHSSCALAR;
 using RhsScalar = RUY_TEST_RHSSCALAR;
 using AccumScalar = RUY_TEST_ACCUMSCALAR;
@@ -123,4 +131,24 @@ TEST(TestSpecialSpecs, RCC) {
   TestNonRCC<RCCTestSet>(81, 93, 72, ExpectedOutcome::kDeath);
 }
 
+template <typename LhsKernelLayout, typename RhsKernelLayout>
+void TestStandardCppKernelLayout() {
+  using SpecType =
+      StandardCppKernelLayoutSpec<AccumScalar, DstScalar, LhsKernelLayout,
+                                  RhsKernelLayout>;
+  using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>;
+  for (int size = 1; size < 10; size++) {
+    TestLinearAllOrders<TestSetType>(size, size, size);
+  }
+  TestLinearAllOrders<TestSetType>(87, 34, 56);
+  TestLinearAllOrders<TestSetType>(123, 234, 78);
+}
+
+TEST(TestSpecialSpecs, StandardCppKernelLayout) {
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kColMajor, 1, 1>,
+                              FixedKernelLayout<Order::kColMajor, 1, 1>>();
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kRowMajor, 4, 4>,
+                              FixedKernelLayout<Order::kRowMajor, 4, 4>>();
+}
+
 }  // namespace ruy
@@ -169,10 +169,10 @@ int GetThreadCount(Context* context, int rows, int cols, int depth) {
   return clamp(guess, 1, context->max_num_threads);
 }
 
-LoopStructure GetLoopStructure(int thread_count, int rows, int cols,
-                               int depth) {
+LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth,
+                               int cache_friendly_traversal_threshold) {
   if (thread_count == 1 &&
-      (rows + cols) * depth < kCacheFriendlyLoopThreshold) {
+      (rows + cols) * depth < cache_friendly_traversal_threshold) {
     return LoopStructure::kSimple;
   }
   return LoopStructure::kGeneral;
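For a sense of scale with the default threshold of 32 * 1024, a worked example of the new check (thread_count == 1 assumed):

// rows = cols = depth = 100: (100 + 100) * 100 = 20000  < 32768  -> kSimple
// rows = cols = depth = 200: (200 + 200) * 200 = 80000 >= 32768  -> kGeneral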
@@ -195,7 +195,9 @@ void TrMul(TrMulParams* params, Context* context) {
   const int cols_rounded_up = packed_rhs.layout.cols;
 
   int thread_count = GetThreadCount(context, rows, cols, depth);
-  const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth);
+  const auto loop_structure =
+      GetLoopStructure(thread_count, rows, cols, depth,
+                       params->cache_friendly_traversal_threshold);
   Allocator* allocator = context->GetMainAllocator();
 
   if (!params->lhs_is_prepacked) {
@@ -231,7 +233,7 @@ void TrMul(TrMulParams* params, Context* context) {
   MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
                packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
                packed_lhs.data_type.size, packed_rhs.data_type.size,
-               &block_map);
+               params->cache_friendly_traversal_threshold, &block_map);
   std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
   std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
   std::uint32_t num_blocks = NumBlocks(block_map);
@@ -62,6 +62,7 @@ struct TrMulParams {
   PMatrix packed_rhs;
   bool lhs_is_prepacked = false;
   bool rhs_is_prepacked = false;
+  int cache_friendly_traversal_threshold = 0;
 
   // Type-erased Spec.
   void* spec = nullptr;