Make the kStandardCpp kernel layout and the cache-friendly traversal
threshold part of the Spec, so that tests (test_special_specs_*) can
exercise non-default values.

PiperOrigin-RevId: 254464005
Benoit Jacob 2019-06-21 14:03:20 -07:00 committed by TensorFlower Gardener
parent ecb03b1538
commit 5a8c1c2a9e
11 changed files with 84 additions and 39 deletions
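
In effect, a Spec can now supply non-default values for the Path::kStandardCpp kernel block layouts and for the cache-friendly traversal threshold. As a rough sketch of caller-side usage (the name MyTuningSpec is hypothetical, invented for illustration; the real tests below use StandardCppKernelLayoutSpec), such a Spec could look like:

#include "tensorflow/lite/experimental/ruy/spec.h"

// Hypothetical user-defined Spec (not part of this commit): derives from
// ruy::BasicSpec and overrides the two customization points introduced here.
template <typename AccumScalar, typename DstScalar>
struct MyTuningSpec : ruy::BasicSpec<AccumScalar, DstScalar> {
  // Non-default block layout for the standard C++ kernel path.
  using StandardCppKernelLhsLayout =
      ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>;
  using StandardCppKernelRhsLayout =
      ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>;
  // A threshold of 0 opts into the cache-friendly traversal at any size.
  static int cache_friendly_traversal_threshold() { return 0; }
};

The two layout aliases are consumed by the Kernel<Path::kStandardCpp, ...> specialization (kernel.h hunk below), and the threshold is copied into TrMulParams and consulted by GetLoopStructure and MakeBlockMap (trmul.cc hunk below).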


@@ -44,11 +44,6 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "spec",
-    hdrs = ["spec.h"],
-)
-
 cc_library(
     name = "size_util",
     hdrs = ["size_util.h"],
@@ -213,6 +208,12 @@ cc_library(
     deps = [":check_macros"],
 )
 
+cc_library(
+    name = "spec",
+    hdrs = ["spec.h"],
+    deps = [":matrix"],
+)
+
 cc_library(
     name = "internal_matrix",
     hdrs = ["internal_matrix.h"],


@@ -81,7 +81,7 @@ int floor_log2_quotient(int num, int denom) {
 
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map) {
+                  int cache_friendly_traversal_threshold, BlockMap* block_map) {
   gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
   RUY_DCHECK_GE(rows, kernel_rows);
   RUY_DCHECK_GE(cols, kernel_cols);
@@ -89,7 +89,7 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
   block_map->traversal_order = BlockMapTraversalOrder::kLinear;
   if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
       (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
-          kCacheFriendlyLoopThreshold) {
+          cache_friendly_traversal_threshold) {
     block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
                                      ? BlockMapTraversalOrder::kFractalU
                                      : BlockMapTraversalOrder::kFractalZ;


@@ -20,20 +20,6 @@ limitations under the License.
 
 namespace ruy {
 
-// The value and even the meaning of this constant are empirically
-// determined. Coarsely speaking, it's compared with the size of source
-// LHS and RHS operands to determine whether they are big enough to be worth
-// traversing in a more complicated "cache friendly" order. The current
-// value is roughly the minimum size of a L1 cache on any CPU that we currently
-// care about, e.g. ARM Cortex-A53. But we honestly don't even know the precise
-// extent to which this should be related to L1 cache size.
-//
-// A lower value is not necessarily 'safer' from a cache-friendliness
-// perspective: it means switching sooner (at smaller sizes) to more complicated
-// traversal orders, which might be adversarial to the CPU's auto-prefetching
-// or to the TLB.
-static constexpr int kCacheFriendlyLoopThreshold = 32 * 1024;
-
 enum class BlockMapTraversalOrder {
   // Plain old row-by-row or column-by-column traversal.
   kLinear,
@@ -126,7 +112,7 @@ struct BlockMap {
 // matrix multiplication with the given parameters.
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map);
+                  int cache_friendly_traversal_threshold, BlockMap* block_map);
 
 // Maps an integer index to a (block_r, block_c) block position in the grid.
 void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index,


@@ -189,6 +189,9 @@ void PopulateTrMulParams(TrMulParams* params) {
         &RunPack<ThePath, RhsKernelLayout, RhsScalar, PackedRhsScalar>;
     params->run_kernel =
         &RunKernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
+    params->cache_friendly_traversal_threshold =
+        Spec::cache_friendly_traversal_threshold();
     return;
   }


@@ -100,6 +100,8 @@ limitations under the License.
 namespace ruy {
 
 // KernelLayout describes small-scale block structure in a packed matrix layout.
+// It's a runtime (as opposed to compile-time-constant) version of the
+// FixedKernelLayout struct used to declare kernel layouts.
 //
 // This is is sometimes known as "tiling" in other contexts.
 //
@@ -117,16 +119,6 @@ struct KernelLayout {
   std::uint8_t cols = 1;
 };
 
-// Compile time version of KernelLayout, suitable for template metaprogramming.
-// In particular, partial template specializations of Kernel use this type to
-// statically declare their kernel format.
-template <Order tOrder, int tRows, int tCols>
-struct FixedKernelLayout {
-  static constexpr Order kOrder = tOrder;
-  static constexpr int kRows = tRows;
-  static constexpr int kCols = tCols;
-};
-
 // A packed matrix has a small-scale block structure that is not present in in
 // the input matrices. This block structure is necessary for the kernels to
 // process data efficiently.


@@ -148,8 +148,8 @@ template <typename LhsScalar, typename RhsScalar, typename DstScalar,
           typename Spec>
 struct Kernel<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar, Spec> {
   using AccumScalar = typename Spec::AccumScalar;
-  using LhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
-  using RhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using LhsLayout = typename Spec::StandardCppKernelLhsLayout;
+  using RhsLayout = typename Spec::StandardCppKernelRhsLayout;
   explicit Kernel(Tuning) {}
   void Run(const PackedMatrix<LhsScalar>& lhs,
            const PackedMatrix<RhsScalar>& rhs, const Spec& spec, int start_row,


@@ -148,6 +148,19 @@ StreamType& operator<<(StreamType& stream, const Matrix<Scalar>& mat) {
   return stream;
 }
 
+// Compile-time version of KernelLayout, used to declare kernel layouts in a
+// way that can be consumed by compile-time logic.
+// See how partial specializations of Kernel use it to declare their layouts.
+// The only reason why this is currently part of the public API is to
+// allow testing various layouts for the Path::kStandardCpp kernel, as a
+// testing-only feature. See Spec::StandardCppKernelLhsLayout.
+template <Order tOrder, int tRows, int tCols>
+struct FixedKernelLayout {
+  static constexpr Order kOrder = tOrder;
+  static constexpr int kRows = tRows;
+  static constexpr int kCols = tCols;
+};
+
 }  // namespace ruy
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_


@@ -20,6 +20,8 @@ limitations under the License.
 #include <limits>
 #include <type_traits>
 
+#include "tensorflow/lite/experimental/ruy/matrix.h"
+
 namespace ruy {
 
 // Our 'general' loop structure (the default) involves multi-threading and
@@ -96,6 +98,23 @@ struct BasicSpec {
   // See above enum ZeroPointSupport
   static constexpr ZeroPointSupport kZeroPointSupport =
       ZeroPointSupport::kGeneral;
+  // Testing-only, not meant to be used by actual users:
+  // Used for testing of various kernel layouts.
+  using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  // The value and even the meaning of this value are empirically
+  // determined. Coarsely speaking, it's compared with the size of source
+  // LHS and RHS operands to determine whether they are big enough to be worth
+  // traversing in a more complicated "cache friendly" order. The current
+  // value is roughly the minimum size of a L1 cache on any CPU that we
+  // currently care about, e.g. ARM Cortex-A53. But we honestly don't even know
+  // the precise extent to which this should be related to L1 cache size.
+  //
+  // A lower value is not necessarily 'safer' from a cache-friendliness
+  // perspective: it means switching sooner (at smaller sizes) to more
+  // complicated traversal orders, which might be adversarial to the CPU's
+  // auto-prefetching or to the TLB.
+  static int cache_friendly_traversal_threshold() { return 32 * 1024; }
 };
 
 }  // namespace ruy


@@ -36,6 +36,14 @@ struct RCCSpec : BasicSpec<AccumScalar, DstScalar> {
   static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC;
 };
 
+template <typename AccumScalar, typename DstScalar, typename LhsKernelLayout,
+          typename RhsKernelLayout>
+struct StandardCppKernelLayoutSpec : BasicSpec<AccumScalar, DstScalar> {
+  using StandardCppKernelLhsLayout = LhsKernelLayout;
+  using StandardCppKernelRhsLayout = RhsKernelLayout;
+  static int cache_friendly_traversal_threshold() { return 0; }
+};
+
 using LhsScalar = RUY_TEST_LHSSCALAR;
 using RhsScalar = RUY_TEST_RHSSCALAR;
 using AccumScalar = RUY_TEST_ACCUMSCALAR;
@@ -123,4 +131,24 @@ TEST(TestSpecialSpecs, RCC) {
   TestNonRCC<RCCTestSet>(81, 93, 72, ExpectedOutcome::kDeath);
 }
 
+template <typename LhsKernelLayout, typename RhsKernelLayout>
+void TestStandardCppKernelLayout() {
+  using SpecType =
+      StandardCppKernelLayoutSpec<AccumScalar, DstScalar, LhsKernelLayout,
+                                  RhsKernelLayout>;
+  using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>;
+  for (int size = 1; size < 10; size++) {
+    TestLinearAllOrders<TestSetType>(size, size, size);
+  }
+  TestLinearAllOrders<TestSetType>(87, 34, 56);
+  TestLinearAllOrders<TestSetType>(123, 234, 78);
+}
+
+TEST(TestSpecialSpecs, StandardCppKernelLayout) {
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kColMajor, 1, 1>,
+                              FixedKernelLayout<Order::kColMajor, 1, 1>>();
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kRowMajor, 4, 4>,
+                              FixedKernelLayout<Order::kRowMajor, 4, 4>>();
+}
+
 }  // namespace ruy


@@ -169,10 +169,10 @@ int GetThreadCount(Context* context, int rows, int cols, int depth) {
   return clamp(guess, 1, context->max_num_threads);
 }
 
-LoopStructure GetLoopStructure(int thread_count, int rows, int cols,
-                               int depth) {
+LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth,
+                               int cache_friendly_traversal_threshold) {
   if (thread_count == 1 &&
-      (rows + cols) * depth < kCacheFriendlyLoopThreshold) {
+      (rows + cols) * depth < cache_friendly_traversal_threshold) {
     return LoopStructure::kSimple;
   }
   return LoopStructure::kGeneral;
@@ -195,7 +195,9 @@ void TrMul(TrMulParams* params, Context* context) {
   const int cols_rounded_up = packed_rhs.layout.cols;
 
   int thread_count = GetThreadCount(context, rows, cols, depth);
-  const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth);
+  const auto loop_structure =
+      GetLoopStructure(thread_count, rows, cols, depth,
+                       params->cache_friendly_traversal_threshold);
   Allocator* allocator = context->GetMainAllocator();
 
   if (!params->lhs_is_prepacked) {
@@ -231,7 +233,7 @@ void TrMul(TrMulParams* params, Context* context) {
   MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
               packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
               packed_lhs.data_type.size, packed_rhs.data_type.size,
-              &block_map);
+              params->cache_friendly_traversal_threshold, &block_map);
   std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
   std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
   std::uint32_t num_blocks = NumBlocks(block_map);


@@ -62,6 +62,7 @@ struct TrMulParams {
   PMatrix packed_rhs;
   bool lhs_is_prepacked = false;
   bool rhs_is_prepacked = false;
+  int cache_friendly_traversal_threshold = 0;
 
   // Type-erased Spec.
   void* spec = nullptr;