From 5a8c1c2a9e34eaaefc49645c8024f6433464a695 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 21 Jun 2019 14:03:20 -0700 Subject: [PATCH] Make the kStandardCpp kernel layout and the cache-friendly traversal threshold part of the Spec, allowing to write tests (test_special_specs_*) that test non-default values. PiperOrigin-RevId: 254464005 --- tensorflow/lite/experimental/ruy/BUILD | 11 ++++---- tensorflow/lite/experimental/ruy/block_map.cc | 4 +-- tensorflow/lite/experimental/ruy/block_map.h | 16 +---------- tensorflow/lite/experimental/ruy/dispatch.h | 3 ++ .../lite/experimental/ruy/internal_matrix.h | 12 ++------ tensorflow/lite/experimental/ruy/kernel.h | 4 +-- tensorflow/lite/experimental/ruy/matrix.h | 13 +++++++++ tensorflow/lite/experimental/ruy/spec.h | 19 +++++++++++++ .../experimental/ruy/test_special_specs.cc | 28 +++++++++++++++++++ tensorflow/lite/experimental/ruy/trmul.cc | 12 ++++---- tensorflow/lite/experimental/ruy/trmul.h | 1 + 11 files changed, 84 insertions(+), 39 deletions(-) diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD index aa621e3f53e..8577e959efa 100644 --- a/tensorflow/lite/experimental/ruy/BUILD +++ b/tensorflow/lite/experimental/ruy/BUILD @@ -44,11 +44,6 @@ cc_test( ], ) -cc_library( - name = "spec", - hdrs = ["spec.h"], -) - cc_library( name = "size_util", hdrs = ["size_util.h"], @@ -213,6 +208,12 @@ cc_library( deps = [":check_macros"], ) +cc_library( + name = "spec", + hdrs = ["spec.h"], + deps = [":matrix"], +) + cc_library( name = "internal_matrix", hdrs = ["internal_matrix.h"], diff --git a/tensorflow/lite/experimental/ruy/block_map.cc b/tensorflow/lite/experimental/ruy/block_map.cc index 5e3ef859e7a..08b3c6064f9 100644 --- a/tensorflow/lite/experimental/ruy/block_map.cc +++ b/tensorflow/lite/experimental/ruy/block_map.cc @@ -81,7 +81,7 @@ int floor_log2_quotient(int num, int denom) { void MakeBlockMap(int rows, int cols, int depth, int kernel_rows, int kernel_cols, int lhs_scalar_size, int rhs_scalar_size, - BlockMap* block_map) { + int cache_friendly_traversal_threshold, BlockMap* block_map) { gemmlowp::ScopedProfilingLabel label("MakeBlockMap"); RUY_DCHECK_GE(rows, kernel_rows); RUY_DCHECK_GE(cols, kernel_cols); @@ -89,7 +89,7 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows, block_map->traversal_order = BlockMapTraversalOrder::kLinear; if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) && (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >= - kCacheFriendlyLoopThreshold) { + cache_friendly_traversal_threshold) { block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U) ? BlockMapTraversalOrder::kFractalU : BlockMapTraversalOrder::kFractalZ; diff --git a/tensorflow/lite/experimental/ruy/block_map.h b/tensorflow/lite/experimental/ruy/block_map.h index 7a0f74cb359..b0567ea481f 100644 --- a/tensorflow/lite/experimental/ruy/block_map.h +++ b/tensorflow/lite/experimental/ruy/block_map.h @@ -20,20 +20,6 @@ limitations under the License. namespace ruy { -// The value and even the meaning of this constant are empirically -// determined. Coarsely speaking, it's compared with the size of source -// LHS and RHS operands to determine whether they are big enough to be worth -// traversing in a more complicated "cache friendly" order. The current -// value is roughly the minimum size of a L1 cache on any CPU that we currently -// care about, e.g. ARM Cortex-A53. But we honestly don't even know the precise -// extent to which this should be related to L1 cache size. -// -// A lower value is not necessarily 'safer' from a cache-friendliness -// perspective: it means switching sooner (at smaller sizes) to more complicated -// traversal orders, which might be adversarial to the CPU's auto-prefetching -// or to the TLB. -static constexpr int kCacheFriendlyLoopThreshold = 32 * 1024; - enum class BlockMapTraversalOrder { // Plain old row-by-row or column-by-column traversal. kLinear, @@ -126,7 +112,7 @@ struct BlockMap { // matrix multiplication with the given parameters. void MakeBlockMap(int rows, int cols, int depth, int kernel_rows, int kernel_cols, int lhs_scalar_size, int rhs_scalar_size, - BlockMap* block_map); + int cache_friendly_traversal_threshold, BlockMap* block_map); // Maps an integer index to a (block_r, block_c) block position in the grid. void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index, diff --git a/tensorflow/lite/experimental/ruy/dispatch.h b/tensorflow/lite/experimental/ruy/dispatch.h index be0a4e5a641..9044be70bb7 100644 --- a/tensorflow/lite/experimental/ruy/dispatch.h +++ b/tensorflow/lite/experimental/ruy/dispatch.h @@ -189,6 +189,9 @@ void PopulateTrMulParams(TrMulParams* params) { &RunPack; params->run_kernel = &RunKernel; + + params->cache_friendly_traversal_threshold = + Spec::cache_friendly_traversal_threshold(); return; } diff --git a/tensorflow/lite/experimental/ruy/internal_matrix.h b/tensorflow/lite/experimental/ruy/internal_matrix.h index a8e8f1cae68..f44ce444dc4 100644 --- a/tensorflow/lite/experimental/ruy/internal_matrix.h +++ b/tensorflow/lite/experimental/ruy/internal_matrix.h @@ -100,6 +100,8 @@ limitations under the License. namespace ruy { // KernelLayout describes small-scale block structure in a packed matrix layout. +// It's a runtime (as opposed to compile-time-constant) version of the +// FixedKernelLayout struct used to declare kernel layouts. // // This is is sometimes known as "tiling" in other contexts. // @@ -117,16 +119,6 @@ struct KernelLayout { std::uint8_t cols = 1; }; -// Compile time version of KernelLayout, suitable for template metaprogramming. -// In particular, partial template specializations of Kernel use this type to -// statically declare their kernel format. -template -struct FixedKernelLayout { - static constexpr Order kOrder = tOrder; - static constexpr int kRows = tRows; - static constexpr int kCols = tCols; -}; - // A packed matrix has a small-scale block structure that is not present in in // the input matrices. This block structure is necessary for the kernels to // process data efficiently. diff --git a/tensorflow/lite/experimental/ruy/kernel.h b/tensorflow/lite/experimental/ruy/kernel.h index 8b03bc727e0..6b4c98f4322 100644 --- a/tensorflow/lite/experimental/ruy/kernel.h +++ b/tensorflow/lite/experimental/ruy/kernel.h @@ -148,8 +148,8 @@ template struct Kernel { using AccumScalar = typename Spec::AccumScalar; - using LhsLayout = FixedKernelLayout; - using RhsLayout = FixedKernelLayout; + using LhsLayout = typename Spec::StandardCppKernelLhsLayout; + using RhsLayout = typename Spec::StandardCppKernelRhsLayout; explicit Kernel(Tuning) {} void Run(const PackedMatrix& lhs, const PackedMatrix& rhs, const Spec& spec, int start_row, diff --git a/tensorflow/lite/experimental/ruy/matrix.h b/tensorflow/lite/experimental/ruy/matrix.h index 7a3351ba1ac..3f26f091a79 100644 --- a/tensorflow/lite/experimental/ruy/matrix.h +++ b/tensorflow/lite/experimental/ruy/matrix.h @@ -148,6 +148,19 @@ StreamType& operator<<(StreamType& stream, const Matrix& mat) { return stream; } +// Compile-time version of KernelLayout, used to declare kernel layouts in a +// way that can be consumed by compile-time logic. +// See how partial specializations of Kernel use it to declare their layouts. +// The only reason why this is currently part of the public API is to +// allow testing various layouts for the Path::kStandardCpp kernel, as a +// testing-only feature. See Spec::StandardCppKernelLhsLayout. +template +struct FixedKernelLayout { + static constexpr Order kOrder = tOrder; + static constexpr int kRows = tRows; + static constexpr int kCols = tCols; +}; + } // namespace ruy #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_ diff --git a/tensorflow/lite/experimental/ruy/spec.h b/tensorflow/lite/experimental/ruy/spec.h index b4d5901320f..091344503ed 100644 --- a/tensorflow/lite/experimental/ruy/spec.h +++ b/tensorflow/lite/experimental/ruy/spec.h @@ -20,6 +20,8 @@ limitations under the License. #include #include +#include "tensorflow/lite/experimental/ruy/matrix.h" + namespace ruy { // Our 'general' loop structure (the default) involves multi-threading and @@ -96,6 +98,23 @@ struct BasicSpec { // See above enum ZeroPointSupport static constexpr ZeroPointSupport kZeroPointSupport = ZeroPointSupport::kGeneral; + // Testing-only, not meant to be used by actual users: + // Used for testing of various kernel layouts. + using StandardCppKernelLhsLayout = FixedKernelLayout; + using StandardCppKernelRhsLayout = FixedKernelLayout; + // The value and even the meaning of this value are empirically + // determined. Coarsely speaking, it's compared with the size of source + // LHS and RHS operands to determine whether they are big enough to be worth + // traversing in a more complicated "cache friendly" order. The current + // value is roughly the minimum size of a L1 cache on any CPU that we + // currently care about, e.g. ARM Cortex-A53. But we honestly don't even know + // the precise extent to which this should be related to L1 cache size. + // + // A lower value is not necessarily 'safer' from a cache-friendliness + // perspective: it means switching sooner (at smaller sizes) to more + // complicated traversal orders, which might be adversarial to the CPU's + // auto-prefetching or to the TLB. + static int cache_friendly_traversal_threshold() { return 32 * 1024; } }; } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/test_special_specs.cc b/tensorflow/lite/experimental/ruy/test_special_specs.cc index 5e1d8d980f5..daaadf1757e 100644 --- a/tensorflow/lite/experimental/ruy/test_special_specs.cc +++ b/tensorflow/lite/experimental/ruy/test_special_specs.cc @@ -36,6 +36,14 @@ struct RCCSpec : BasicSpec { static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC; }; +template +struct StandardCppKernelLayoutSpec : BasicSpec { + using StandardCppKernelLhsLayout = LhsKernelLayout; + using StandardCppKernelRhsLayout = RhsKernelLayout; + static int cache_friendly_traversal_threshold() { return 0; } +}; + using LhsScalar = RUY_TEST_LHSSCALAR; using RhsScalar = RUY_TEST_RHSSCALAR; using AccumScalar = RUY_TEST_ACCUMSCALAR; @@ -123,4 +131,24 @@ TEST(TestSpecialSpecs, RCC) { TestNonRCC(81, 93, 72, ExpectedOutcome::kDeath); } +template +void TestStandardCppKernelLayout() { + using SpecType = + StandardCppKernelLayoutSpec; + using TestSetType = TestSet; + for (int size = 1; size < 10; size++) { + TestLinearAllOrders(size, size, size); + } + TestLinearAllOrders(87, 34, 56); + TestLinearAllOrders(123, 234, 78); +} + +TEST(TestSpecialSpecs, StandardCppKernelLayout) { + TestStandardCppKernelLayout, + FixedKernelLayout>(); + TestStandardCppKernelLayout, + FixedKernelLayout>(); +} + } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/trmul.cc b/tensorflow/lite/experimental/ruy/trmul.cc index c2dcfcf1810..c7beb6ea042 100644 --- a/tensorflow/lite/experimental/ruy/trmul.cc +++ b/tensorflow/lite/experimental/ruy/trmul.cc @@ -169,10 +169,10 @@ int GetThreadCount(Context* context, int rows, int cols, int depth) { return clamp(guess, 1, context->max_num_threads); } -LoopStructure GetLoopStructure(int thread_count, int rows, int cols, - int depth) { +LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth, + int cache_friendly_traversal_threshold) { if (thread_count == 1 && - (rows + cols) * depth < kCacheFriendlyLoopThreshold) { + (rows + cols) * depth < cache_friendly_traversal_threshold) { return LoopStructure::kSimple; } return LoopStructure::kGeneral; @@ -195,7 +195,9 @@ void TrMul(TrMulParams* params, Context* context) { const int cols_rounded_up = packed_rhs.layout.cols; int thread_count = GetThreadCount(context, rows, cols, depth); - const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth); + const auto loop_structure = + GetLoopStructure(thread_count, rows, cols, depth, + params->cache_friendly_traversal_threshold); Allocator* allocator = context->GetMainAllocator(); if (!params->lhs_is_prepacked) { @@ -231,7 +233,7 @@ void TrMul(TrMulParams* params, Context* context) { MakeBlockMap(rows_rounded_up, cols_rounded_up, depth, packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols, packed_lhs.data_type.size, packed_rhs.data_type.size, - &block_map); + params->cache_friendly_traversal_threshold, &block_map); std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map); std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map); std::uint32_t num_blocks = NumBlocks(block_map); diff --git a/tensorflow/lite/experimental/ruy/trmul.h b/tensorflow/lite/experimental/ruy/trmul.h index 64b43aca37f..1a3872bc2ba 100644 --- a/tensorflow/lite/experimental/ruy/trmul.h +++ b/tensorflow/lite/experimental/ruy/trmul.h @@ -62,6 +62,7 @@ struct TrMulParams { PMatrix packed_rhs; bool lhs_is_prepacked = false; bool rhs_is_prepacked = false; + int cache_friendly_traversal_threshold = 0; // Type-erased Spec. void* spec = nullptr;