Make the kStandardCpp kernel layout and the cache-friendly traversal
threshold part of the Spec, so that tests (test_special_specs_*) can exercise
non-default values.

PiperOrigin-RevId: 254464005
commit 5a8c1c2a9e (parent ecb03b1538)
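At a glance, the change adds two testing-only knobs to the Spec. Restated here for orientation (this is simply an excerpt of what the spec.h hunk further down adds to BasicSpec, not new API):

  // Layouts used by the Path::kStandardCpp kernel when packing operands.
  using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
  using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
  // Threshold compared against the combined size of the LHS and RHS operands
  // to decide when the cache-friendly traversal / general loop are worthwhile.
  static int cache_friendly_traversal_threshold() { return 32 * 1024; }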
@@ -44,11 +44,6 @@ cc_test(
     ],
 )
 
-cc_library(
-    name = "spec",
-    hdrs = ["spec.h"],
-)
-
 cc_library(
     name = "size_util",
     hdrs = ["size_util.h"],
@@ -213,6 +208,12 @@ cc_library(
     deps = [":check_macros"],
 )
 
+cc_library(
+    name = "spec",
+    hdrs = ["spec.h"],
+    deps = [":matrix"],
+)
+
 cc_library(
     name = "internal_matrix",
     hdrs = ["internal_matrix.h"],
@@ -81,7 +81,7 @@ int floor_log2_quotient(int num, int denom) {
 
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map) {
+                  int cache_friendly_traversal_threshold, BlockMap* block_map) {
   gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
   RUY_DCHECK_GE(rows, kernel_rows);
   RUY_DCHECK_GE(cols, kernel_cols);
@@ -89,7 +89,7 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
   block_map->traversal_order = BlockMapTraversalOrder::kLinear;
   if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
       (rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
-          kCacheFriendlyLoopThreshold) {
+          cache_friendly_traversal_threshold) {
     block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
                                      ? BlockMapTraversalOrder::kFractalU
                                      : BlockMapTraversalOrder::kFractalZ;
@@ -20,20 +20,6 @@ limitations under the License.
 
 namespace ruy {
 
-// The value and even the meaning of this constant are empirically
-// determined. Coarsely speaking, it's compared with the size of source
-// LHS and RHS operands to determine whether they are big enough to be worth
-// traversing in a more complicated "cache friendly" order. The current
-// value is roughly the minimum size of a L1 cache on any CPU that we currently
-// care about, e.g. ARM Cortex-A53. But we honestly don't even know the precise
-// extent to which this should be related to L1 cache size.
-//
-// A lower value is not necessarily 'safer' from a cache-friendliness
-// perspective: it means switching sooner (at smaller sizes) to more complicated
-// traversal orders, which might be adversarial to the CPU's auto-prefetching
-// or to the TLB.
-static constexpr int kCacheFriendlyLoopThreshold = 32 * 1024;
-
 enum class BlockMapTraversalOrder {
   // Plain old row-by-row or column-by-column traversal.
   kLinear,
@@ -126,7 +112,7 @@ struct BlockMap {
 // matrix multiplication with the given parameters.
 void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
                   int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
-                  BlockMap* block_map);
+                  int cache_friendly_traversal_threshold, BlockMap* block_map);
 
 // Maps an integer index to a (block_r, block_c) block position in the grid.
 void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index,
@@ -189,6 +189,9 @@ void PopulateTrMulParams(TrMulParams* params) {
       &RunPack<ThePath, RhsKernelLayout, RhsScalar, PackedRhsScalar>;
   params->run_kernel =
       &RunKernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
+
+  params->cache_friendly_traversal_threshold =
+      Spec::cache_friendly_traversal_threshold();
   return;
 }
 
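This hunk is where the compile-time Spec value crosses into the type-erased TrMulParams. A minimal sketch of that hand-off pattern, with hypothetical stand-in names:

// Sketch only: a static member of the compile-time Spec is copied into a plain
// int on a type-erased params struct, so downstream runtime code never needs
// to know the Spec type. ParamsSketch and PopulateSketch are hypothetical.
struct ParamsSketch {
  int cache_friendly_traversal_threshold = 0;
};

template <typename SpecT>
void PopulateSketch(ParamsSketch* params) {
  params->cache_friendly_traversal_threshold =
      SpecT::cache_friendly_traversal_threshold();
}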
@@ -100,6 +100,8 @@ limitations under the License.
 namespace ruy {
 
 // KernelLayout describes small-scale block structure in a packed matrix layout.
+// It's a runtime (as opposed to compile-time-constant) version of the
+// FixedKernelLayout struct used to declare kernel layouts.
 //
 // This is is sometimes known as "tiling" in other contexts.
 //
@@ -117,16 +119,6 @@ struct KernelLayout {
   std::uint8_t cols = 1;
 };
 
-// Compile time version of KernelLayout, suitable for template metaprogramming.
-// In particular, partial template specializations of Kernel use this type to
-// statically declare their kernel format.
-template <Order tOrder, int tRows, int tCols>
-struct FixedKernelLayout {
-  static constexpr Order kOrder = tOrder;
-  static constexpr int kRows = tRows;
-  static constexpr int kCols = tCols;
-};
-
 // A packed matrix has a small-scale block structure that is not present in in
 // the input matrices. This block structure is necessary for the kernels to
 // process data efficiently.
@@ -148,8 +148,8 @@ template <typename LhsScalar, typename RhsScalar, typename DstScalar,
           typename Spec>
 struct Kernel<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar, Spec> {
   using AccumScalar = typename Spec::AccumScalar;
-  using LhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
-  using RhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using LhsLayout = typename Spec::StandardCppKernelLhsLayout;
+  using RhsLayout = typename Spec::StandardCppKernelRhsLayout;
   explicit Kernel(Tuning) {}
   void Run(const PackedMatrix<LhsScalar>& lhs,
            const PackedMatrix<RhsScalar>& rhs, const Spec& spec, int start_row,
@@ -148,6 +148,19 @@ StreamType& operator<<(StreamType& stream, const Matrix<Scalar>& mat) {
   return stream;
 }
 
+// Compile-time version of KernelLayout, used to declare kernel layouts in a
+// way that can be consumed by compile-time logic.
+// See how partial specializations of Kernel use it to declare their layouts.
+// The only reason why this is currently part of the public API is to
+// allow testing various layouts for the Path::kStandardCpp kernel, as a
+// testing-only feature. See Spec::StandardCppKernelLhsLayout.
+template <Order tOrder, int tRows, int tCols>
+struct FixedKernelLayout {
+  static constexpr Order kOrder = tOrder;
+  static constexpr int kRows = tRows;
+  static constexpr int kCols = tCols;
+};
+
 }  // namespace ruy
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_
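FixedKernelLayout is a plain compile-time tag; a short sketch of consuming it from template code, assuming ruy/matrix.h is included and using only the members shown in the hunk above:

#include "tensorflow/lite/experimental/ruy/matrix.h"

// Sketch: any code templated on a layout type can read its static members.
template <typename Layout>
constexpr int KernelTileElems() {
  return Layout::kRows * Layout::kCols;
}

static_assert(
    KernelTileElems<ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>>() == 16,
    "a 4x4 kernel tile covers 16 elements");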
@@ -20,6 +20,8 @@ limitations under the License.
 #include <limits>
 #include <type_traits>
 
+#include "tensorflow/lite/experimental/ruy/matrix.h"
+
 namespace ruy {
 
 // Our 'general' loop structure (the default) involves multi-threading and
@@ -96,6 +98,23 @@ struct BasicSpec {
   // See above enum ZeroPointSupport
   static constexpr ZeroPointSupport kZeroPointSupport =
       ZeroPointSupport::kGeneral;
+  // Testing-only, not meant to be used by actual users:
+  // Used for testing of various kernel layouts.
+  using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
+  // The value and even the meaning of this value are empirically
+  // determined. Coarsely speaking, it's compared with the size of source
+  // LHS and RHS operands to determine whether they are big enough to be worth
+  // traversing in a more complicated "cache friendly" order. The current
+  // value is roughly the minimum size of a L1 cache on any CPU that we
+  // currently care about, e.g. ARM Cortex-A53. But we honestly don't even know
+  // the precise extent to which this should be related to L1 cache size.
+  //
+  // A lower value is not necessarily 'safer' from a cache-friendliness
+  // perspective: it means switching sooner (at smaller sizes) to more
+  // complicated traversal orders, which might be adversarial to the CPU's
+  // auto-prefetching or to the TLB.
+  static int cache_friendly_traversal_threshold() { return 32 * 1024; }
 };
 
 }  // namespace ruy
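Because the threshold is now an ordinary static member and the layouts ordinary member typedefs, a Spec can override them simply by redeclaring them in a derived struct; the test below does exactly that with StandardCppKernelLayoutSpec. A minimal sketch (MySpec is a hypothetical name):

template <typename AccumScalar, typename DstScalar>
struct MySpec : ruy::BasicSpec<AccumScalar, DstScalar> {
  // Threshold 0 means every multiplication, however small, takes the general
  // loop structure and, when RUY_OPT_FRACTAL is enabled, the cache-friendly
  // traversal order.
  static int cache_friendly_traversal_threshold() { return 0; }
  // Pack operands for the standard C++ kernel into 4x4 row-major tiles.
  using StandardCppKernelLhsLayout =
      ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>;
  using StandardCppKernelRhsLayout =
      ruy::FixedKernelLayout<ruy::Order::kRowMajor, 4, 4>;
};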
@@ -36,6 +36,14 @@ struct RCCSpec : BasicSpec<AccumScalar, DstScalar> {
   static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC;
 };
 
+template <typename AccumScalar, typename DstScalar, typename LhsKernelLayout,
+          typename RhsKernelLayout>
+struct StandardCppKernelLayoutSpec : BasicSpec<AccumScalar, DstScalar> {
+  using StandardCppKernelLhsLayout = LhsKernelLayout;
+  using StandardCppKernelRhsLayout = RhsKernelLayout;
+  static int cache_friendly_traversal_threshold() { return 0; }
+};
+
 using LhsScalar = RUY_TEST_LHSSCALAR;
 using RhsScalar = RUY_TEST_RHSSCALAR;
 using AccumScalar = RUY_TEST_ACCUMSCALAR;
@@ -123,4 +131,24 @@ TEST(TestSpecialSpecs, RCC) {
   TestNonRCC<RCCTestSet>(81, 93, 72, ExpectedOutcome::kDeath);
 }
 
+template <typename LhsKernelLayout, typename RhsKernelLayout>
+void TestStandardCppKernelLayout() {
+  using SpecType =
+      StandardCppKernelLayoutSpec<AccumScalar, DstScalar, LhsKernelLayout,
+                                  RhsKernelLayout>;
+  using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>;
+  for (int size = 1; size < 10; size++) {
+    TestLinearAllOrders<TestSetType>(size, size, size);
+  }
+  TestLinearAllOrders<TestSetType>(87, 34, 56);
+  TestLinearAllOrders<TestSetType>(123, 234, 78);
+}
+
+TEST(TestSpecialSpecs, StandardCppKernelLayout) {
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kColMajor, 1, 1>,
+                              FixedKernelLayout<Order::kColMajor, 1, 1>>();
+  TestStandardCppKernelLayout<FixedKernelLayout<Order::kRowMajor, 4, 4>,
+                              FixedKernelLayout<Order::kRowMajor, 4, 4>>();
+}
+
 }  // namespace ruy
@@ -169,10 +169,10 @@ int GetThreadCount(Context* context, int rows, int cols, int depth) {
   return clamp(guess, 1, context->max_num_threads);
 }
 
-LoopStructure GetLoopStructure(int thread_count, int rows, int cols,
-                               int depth) {
+LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth,
+                               int cache_friendly_traversal_threshold) {
   if (thread_count == 1 &&
-      (rows + cols) * depth < kCacheFriendlyLoopThreshold) {
+      (rows + cols) * depth < cache_friendly_traversal_threshold) {
     return LoopStructure::kSimple;
   }
   return LoopStructure::kGeneral;
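For a sense of scale with the default threshold of 32 * 1024, a worked example of the new check (thread_count == 1 assumed):

// rows = cols = depth = 100: (100 + 100) * 100 = 20000  < 32768  -> kSimple
// rows = cols = depth = 200: (200 + 200) * 200 = 80000 >= 32768  -> kGeneral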
@@ -195,7 +195,9 @@ void TrMul(TrMulParams* params, Context* context) {
   const int cols_rounded_up = packed_rhs.layout.cols;
 
   int thread_count = GetThreadCount(context, rows, cols, depth);
-  const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth);
+  const auto loop_structure =
+      GetLoopStructure(thread_count, rows, cols, depth,
+                       params->cache_friendly_traversal_threshold);
   Allocator* allocator = context->GetMainAllocator();
 
   if (!params->lhs_is_prepacked) {
@@ -231,7 +233,7 @@ void TrMul(TrMulParams* params, Context* context) {
   MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
                packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
                packed_lhs.data_type.size, packed_rhs.data_type.size,
-               &block_map);
+               params->cache_friendly_traversal_threshold, &block_map);
   std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
   std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
   std::uint32_t num_blocks = NumBlocks(block_map);
@@ -62,6 +62,7 @@ struct TrMulParams {
   PMatrix packed_rhs;
   bool lhs_is_prepacked = false;
   bool rhs_is_prepacked = false;
+  int cache_friendly_traversal_threshold = 0;
 
   // Type-erased Spec.
   void* spec = nullptr;