Make the kStandardCpp kernel layout and the cache-friendly traversal
threshold part of the Spec, so that tests (test_special_specs_*) can
exercise non-default values.

PiperOrigin-RevId: 254464005
Benoit Jacob 2019-06-21 14:03:20 -07:00 committed by TensorFlower Gardener
parent ecb03b1538
commit 5a8c1c2a9e
11 changed files with 84 additions and 39 deletions
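
In effect, a Spec can now supply non-default values for the Path::kStandardCpp kernel block layouts and for the cache-friendly traversal threshold. As a rough sketch of caller-side usage (the name MyTuningSpec is hypothetical, invented for illustration; the real tests below use StandardCppKernelLayoutSpec), such a Spec could look like:

#include "tensorflow/lite/experimental/ruy/spec.h"

// Hypothetical user-defined Spec (not part of this commit): derives from
// ruy::BasicSpec and overrides the two customization points introduced here.
template <typename AccumScalar, typename DstScalar>
struct MyTuningSpec : ruy::BasicSpec<AccumScalar, DstScalar> {
  // Non-default block layout for the standard C++ kernel path.
  using StandardCppKernelLhsLayout =
      ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>;
  using StandardCppKernelRhsLayout =
      ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>;
  // A threshold of 0 opts into the cache-friendly traversal at any size.
  static int cache_friendly_traversal_threshold() { return 0; }
};

The two layout aliases are consumed by the Kernel<Path::kStandardCpp, ...> specialization (kernel.h hunk below), and the threshold is copied into TrMulParams and consulted by GetLoopStructure and MakeBlockMap (trmul.cc hunk below).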


@@ -44,11 +44,6 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "spec",
-    hdrs = ["spec.h"],
-)
-
 cc_library(
     name = "size_util",
     hdrs = ["size_util.h"],
@@ -213,6 +208,12 @@ cc_library(
     deps = [":check_macros"],
 )
 
+cc_library(
+    name = "spec",
+    hdrs = ["spec.h"],
+    deps = [":matrix"],
+)
+
 cc_library(
     name = "internal_matrix",
     hdrs = ["internal_matrix.h"],


@@ -81,7 +81,7 @@ int floor_log2_quotient(int num, int denom) {
 
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map) {
+                  int cache_friendly_traversal_threshold, BlockMap* block_map) {
   gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
   RUY_DCHECK_GE(rows, kernel_rows);
   RUY_DCHECK_GE(cols, kernel_cols);
@@ -89,7 +89,7 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
   block_map->traversal_order = BlockMapTraversalOrder::kLinear;
   if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
       (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
-          kCacheFriendlyLoopThreshold) {
+          cache_friendly_traversal_threshold) {
     block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
                                      ? BlockMapTraversalOrder::kFractalU
                                      : BlockMapTraversalOrder::kFractalZ;


@@ -20,20 +20,6 @@ limitations under the License.
 
 namespace ruy {
 
-// The value and even the meaning of this constant are empirically
-// determined. Coarsely speaking, it's compared with the size of source
-// LHS and RHS operands to determine whether they are big enough to be worth
-// traversing in a more complicated "cache friendly" order. The current
-// value is roughly the minimum size of a L1 cache on any CPU that we currently
-// care about, e.g. ARM Cortex-A53. But we honestly don't even know the precise
-// extent to which this should be related to L1 cache size.
-//
-// A lower value is not necessarily 'safer' from a cache-friendliness
-// perspective: it means switching sooner (at smaller sizes) to more complicated
-// traversal orders, which might be adversarial to the CPU's auto-prefetching
-// or to the TLB.
-static constexpr int kCacheFriendlyLoopThreshold = 32 * 1024;
-
 enum class BlockMapTraversalOrder {
   // Plain old row-by-row or column-by-column traversal.
   kLinear,
@@ -126,7 +112,7 @@ struct BlockMap {
 // matrix multiplication with the given parameters.
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map);
+                  int cache_friendly_traversal_threshold, BlockMap* block_map);
 
 // Maps an integer index to a (block_r, block_c) block position in the grid.
 void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index,


@@ -189,6 +189,9 @@ void PopulateTrMulParams(TrMulParams* params) {
         &RunPack<ThePath, RhsKernelLayout, RhsScalar, PackedRhsScalar>;
     params->run_kernel =
         &RunKernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
+    params->cache_friendly_traversal_threshold =
+        Spec::cache_friendly_traversal_threshold();
     return;
   }


@@ -100,6 +100,8 @@ limitations under the License.
 namespace ruy {
 
 // KernelLayout describes small-scale block structure in a packed matrix layout.
+// It's a runtime (as opposed to compile-time-constant) version of the
+// FixedKernelLayout struct used to declare kernel layouts.
 //
 // This is is sometimes known as "tiling" in other contexts.
 //
@@ -117,16 +119,6 @@ struct KernelLayout {
   std::uint8_t cols = 1;
 };
 
-// Compile time version of KernelLayout, suitable for template metaprogramming.
-// In particular, partial template specializations of Kernel use this type to
-// statically declare their kernel format.
-template <Order tOrder, int tRows, int tCols>
-struct FixedKernelLayout {
-  static constexpr Order kOrder = tOrder;
-  static constexpr int kRows = tRows;
-  static constexpr int kCols = tCols;
-};
-
 // A packed matrix has a small-scale block structure that is not present in in
 // the input matrices. This block structure is necessary for the kernels to
 // process data efficiently.


@@ -148,8 +148,8 @@ template <typename LhsScalar, typename RhsScalar, typename DstScalar,
           typename Spec>
 struct Kernel<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar, Spec> {
   using AccumScalar = typename Spec::AccumScalar;
-  using LhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
-  using RhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using LhsLayout = typename Spec::StandardCppKernelLhsLayout;
+  using RhsLayout = typename Spec::StandardCppKernelRhsLayout;
   explicit Kernel(Tuning) {}
   void Run(const PackedMatrix<LhsScalar>& lhs,
            const PackedMatrix<RhsScalar>& rhs, const Spec& spec, int start_row,


@@ -148,6 +148,19 @@ StreamType& operator<<(StreamType& stream, const Matrix<Scalar>& mat) {
   return stream;
 }
 
+// Compile-time version of KernelLayout, used to declare kernel layouts in a
+// way that can be consumed by compile-time logic.
+// See how partial specializations of Kernel use it to declare their layouts.
+// The only reason why this is currently part of the public API is to
+// allow testing various layouts for the Path::kStandardCpp kernel, as a
+// testing-only feature. See Spec::StandardCppKernelLhsLayout.
+template <Order tOrder, int tRows, int tCols>
+struct FixedKernelLayout {
+  static constexpr Order kOrder = tOrder;
+  static constexpr int kRows = tRows;
+  static constexpr int kCols = tCols;
+};
+
 }  // namespace ruy
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_


@@ -20,6 +20,8 @@ limitations under the License.
 #include <limits>
 #include <type_traits>
 
+#include "tensorflow/lite/experimental/ruy/matrix.h"
+
 namespace ruy {
 
 // Our 'general' loop structure (the default) involves multi-threading and
@@ -96,6 +98,23 @@ struct BasicSpec {
   // See above enum ZeroPointSupport
   static constexpr ZeroPointSupport kZeroPointSupport =
       ZeroPointSupport::kGeneral;
+  // Testing-only, not meant to be used by actual users:
+  // Used for testing of various kernel layouts.
+  using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  // The value and even the meaning of this value are empirically
+  // determined. Coarsely speaking, it's compared with the size of source
+  // LHS and RHS operands to determine whether they are big enough to be worth
+  // traversing in a more complicated "cache friendly" order. The current
+  // value is roughly the minimum size of a L1 cache on any CPU that we
+  // currently care about, e.g. ARM Cortex-A53. But we honestly don't even know
+  // the precise extent to which this should be related to L1 cache size.
+  //
+  // A lower value is not necessarily 'safer' from a cache-friendliness
+  // perspective: it means switching sooner (at smaller sizes) to more
+  // complicated traversal orders, which might be adversarial to the CPU's
+  // auto-prefetching or to the TLB.
+  static int cache_friendly_traversal_threshold() { return 32 * 1024; }
 };
 
 }  // namespace ruy


@@ -36,6 +36,14 @@ struct RCCSpec : BasicSpec<AccumScalar, DstScalar> {
   static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC;
 };
 
+template <typename AccumScalar, typename DstScalar, typename LhsKernelLayout,
+          typename RhsKernelLayout>
+struct StandardCppKernelLayoutSpec : BasicSpec<AccumScalar, DstScalar> {
+  using StandardCppKernelLhsLayout = LhsKernelLayout;
+  using StandardCppKernelRhsLayout = RhsKernelLayout;
+  static int cache_friendly_traversal_threshold() { return 0; }
+};
+
 using LhsScalar = RUY_TEST_LHSSCALAR;
 using RhsScalar = RUY_TEST_RHSSCALAR;
 using AccumScalar = RUY_TEST_ACCUMSCALAR;
@@ -123,4 +131,24 @@ TEST(TestSpecialSpecs, RCC) {
   TestNonRCC<RCCTestSet>(81, 93, 72, ExpectedOutcome::kDeath);
 }
 
+template <typename LhsKernelLayout, typename RhsKernelLayout>
+void TestStandardCppKernelLayout() {
+  using SpecType =
+      StandardCppKernelLayoutSpec<AccumScalar, DstScalar, LhsKernelLayout,
+                                  RhsKernelLayout>;
+  using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>;
+  for (int size = 1; size < 10; size++) {
+    TestLinearAllOrders<TestSetType>(size, size, size);
+  }
+  TestLinearAllOrders<TestSetType>(87, 34, 56);
+  TestLinearAllOrders<TestSetType>(123, 234, 78);
+}
+
+TEST(TestSpecialSpecs, StandardCppKernelLayout) {
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kColMajor, 1, 1>,
+                              FixedKernelLayout<Order::kColMajor, 1, 1>>();
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kRowMajor, 4, 4>,
+                              FixedKernelLayout<Order::kRowMajor, 4, 4>>();
+}
+
 }  // namespace ruy


@@ -169,10 +169,10 @@ int GetThreadCount(Context* context, int rows, int cols, int depth) {
   return clamp(guess, 1, context->max_num_threads);
 }
 
-LoopStructure GetLoopStructure(int thread_count, int rows, int cols,
-                               int depth) {
+LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth,
+                               int cache_friendly_traversal_threshold) {
   if (thread_count == 1 &&
-      (rows + cols) * depth < kCacheFriendlyLoopThreshold) {
+      (rows + cols) * depth < cache_friendly_traversal_threshold) {
     return LoopStructure::kSimple;
   }
   return LoopStructure::kGeneral;
@@ -195,7 +195,9 @@ void TrMul(TrMulParams* params, Context* context) {
   const int cols_rounded_up = packed_rhs.layout.cols;
 
   int thread_count = GetThreadCount(context, rows, cols, depth);
-  const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth);
+  const auto loop_structure =
+      GetLoopStructure(thread_count, rows, cols, depth,
+                       params->cache_friendly_traversal_threshold);
   Allocator* allocator = context->GetMainAllocator();
 
   if (!params->lhs_is_prepacked) {
@@ -231,7 +233,7 @@ void TrMul(TrMulParams* params, Context* context) {
   MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
               packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
               packed_lhs.data_type.size, packed_rhs.data_type.size,
-              &block_map);
+              params->cache_friendly_traversal_threshold, &block_map);
   std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
   std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
   std::uint32_t num_blocks = NumBlocks(block_map);


@@ -62,6 +62,7 @@ struct TrMulParams {
   PMatrix packed_rhs;
   bool lhs_is_prepacked = false;
   bool rhs_is_prepacked = false;
+  int cache_friendly_traversal_threshold = 0;
 
   // Type-erased Spec.
   void* spec = nullptr;