Make the kStandardCpp kernel layout and the cache-friendly traversal
threshold part of the Spec, making it possible to write tests (test_special_specs_*) that exercise non-default values. PiperOrigin-RevId: 254464005
This commit is contained in:
parent
ecb03b1538
commit
5a8c1c2a9e
@ -44,11 +44,6 @@ cc_test(
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
cc_library(
|
|
||||||
name = "spec",
|
|
||||||
hdrs = ["spec.h"],
|
|
||||||
)
|
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "size_util",
|
name = "size_util",
|
||||||
hdrs = ["size_util.h"],
|
hdrs = ["size_util.h"],
|
||||||
@ -213,6 +208,12 @@ cc_library(
|
|||||||
deps = [":check_macros"],
|
deps = [":check_macros"],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
cc_library(
|
||||||
|
name = "spec",
|
||||||
|
hdrs = ["spec.h"],
|
||||||
|
deps = [":matrix"],
|
||||||
|
)
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "internal_matrix",
|
name = "internal_matrix",
|
||||||
hdrs = ["internal_matrix.h"],
|
hdrs = ["internal_matrix.h"],
|
||||||
|
@ -81,7 +81,7 @@ int floor_log2_quotient(int num, int denom) {
|
|||||||
|
|
||||||
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
|
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
|
||||||
int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
|
int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
|
||||||
BlockMap* block_map) {
|
int cache_friendly_traversal_threshold, BlockMap* block_map) {
|
||||||
gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
|
gemmlowp::ScopedProfilingLabel label("MakeBlockMap");
|
||||||
RUY_DCHECK_GE(rows, kernel_rows);
|
RUY_DCHECK_GE(rows, kernel_rows);
|
||||||
RUY_DCHECK_GE(cols, kernel_cols);
|
RUY_DCHECK_GE(cols, kernel_cols);
|
||||||
@ -89,7 +89,7 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
|
|||||||
block_map->traversal_order = BlockMapTraversalOrder::kLinear;
|
block_map->traversal_order = BlockMapTraversalOrder::kLinear;
|
||||||
if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
|
if (RUY_OPT_ENABLED(RUY_OPT_FRACTAL) &&
|
||||||
(rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
|
(rows * lhs_scalar_size + cols * rhs_scalar_size) * depth >=
|
||||||
kCacheFriendlyLoopThreshold) {
|
cache_friendly_traversal_threshold) {
|
||||||
block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
|
block_map->traversal_order = RUY_OPT_ENABLED(RUY_OPT_FRACTAL_U)
|
||||||
? BlockMapTraversalOrder::kFractalU
|
? BlockMapTraversalOrder::kFractalU
|
||||||
: BlockMapTraversalOrder::kFractalZ;
|
: BlockMapTraversalOrder::kFractalZ;
|
||||||
|
@ -20,20 +20,6 @@ limitations under the License.
|
|||||||
|
|
||||||
namespace ruy {
|
namespace ruy {
|
||||||
|
|
||||||
// The value and even the meaning of this constant are empirically
|
|
||||||
// determined. Coarsely speaking, it's compared with the size of source
|
|
||||||
// LHS and RHS operands to determine whether they are big enough to be worth
|
|
||||||
// traversing in a more complicated "cache friendly" order. The current
|
|
||||||
// value is roughly the minimum size of a L1 cache on any CPU that we currently
|
|
||||||
// care about, e.g. ARM Cortex-A53. But we honestly don't even know the precise
|
|
||||||
// extent to which this should be related to L1 cache size.
|
|
||||||
//
|
|
||||||
// A lower value is not necessarily 'safer' from a cache-friendliness
|
|
||||||
// perspective: it means switching sooner (at smaller sizes) to more complicated
|
|
||||||
// traversal orders, which might be adversarial to the CPU's auto-prefetching
|
|
||||||
// or to the TLB.
|
|
||||||
static constexpr int kCacheFriendlyLoopThreshold = 32 * 1024;
|
|
||||||
|
|
||||||
enum class BlockMapTraversalOrder {
|
enum class BlockMapTraversalOrder {
|
||||||
// Plain old row-by-row or column-by-column traversal.
|
// Plain old row-by-row or column-by-column traversal.
|
||||||
kLinear,
|
kLinear,
|
||||||
@ -126,7 +112,7 @@ struct BlockMap {
|
|||||||
// matrix multiplication with the given parameters.
|
// matrix multiplication with the given parameters.
|
||||||
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
|
void MakeBlockMap(int rows, int cols, int depth, int kernel_rows,
|
||||||
int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
|
int kernel_cols, int lhs_scalar_size, int rhs_scalar_size,
|
||||||
BlockMap* block_map);
|
int cache_friendly_traversal_threshold, BlockMap* block_map);
|
||||||
|
|
||||||
// Maps an integer index to a (block_r, block_c) block position in the grid.
|
// Maps an integer index to a (block_r, block_c) block position in the grid.
|
||||||
void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index,
|
void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index,
|
||||||
|
@ -189,6 +189,9 @@ void PopulateTrMulParams(TrMulParams* params) {
|
|||||||
&RunPack<ThePath, RhsKernelLayout, RhsScalar, PackedRhsScalar>;
|
&RunPack<ThePath, RhsKernelLayout, RhsScalar, PackedRhsScalar>;
|
||||||
params->run_kernel =
|
params->run_kernel =
|
||||||
&RunKernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
|
&RunKernel<ThePath, PackedLhsScalar, PackedRhsScalar, DstScalar, Spec>;
|
||||||
|
|
||||||
|
params->cache_friendly_traversal_threshold =
|
||||||
|
Spec::cache_friendly_traversal_threshold();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -100,6 +100,8 @@ limitations under the License.
|
|||||||
namespace ruy {
|
namespace ruy {
|
||||||
|
|
||||||
// KernelLayout describes small-scale block structure in a packed matrix layout.
|
// KernelLayout describes small-scale block structure in a packed matrix layout.
|
||||||
|
// It's a runtime (as opposed to compile-time-constant) version of the
|
||||||
|
// FixedKernelLayout struct used to declare kernel layouts.
|
||||||
//
|
//
|
||||||
// This is sometimes known as "tiling" in other contexts.
|
// This is sometimes known as "tiling" in other contexts.
|
||||||
//
|
//
|
||||||
@ -117,16 +119,6 @@ struct KernelLayout {
|
|||||||
std::uint8_t cols = 1;
|
std::uint8_t cols = 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Compile time version of KernelLayout, suitable for template metaprogramming.
|
|
||||||
// In particular, partial template specializations of Kernel use this type to
|
|
||||||
// statically declare their kernel format.
|
|
||||||
template <Order tOrder, int tRows, int tCols>
|
|
||||||
struct FixedKernelLayout {
|
|
||||||
static constexpr Order kOrder = tOrder;
|
|
||||||
static constexpr int kRows = tRows;
|
|
||||||
static constexpr int kCols = tCols;
|
|
||||||
};
|
|
||||||
|
|
||||||
// A packed matrix has a small-scale block structure that is not present in
|
// A packed matrix has a small-scale block structure that is not present in
|
||||||
// the input matrices. This block structure is necessary for the kernels to
|
// the input matrices. This block structure is necessary for the kernels to
|
||||||
// process data efficiently.
|
// process data efficiently.
|
||||||
|
@ -148,8 +148,8 @@ template <typename LhsScalar, typename RhsScalar, typename DstScalar,
|
|||||||
typename Spec>
|
typename Spec>
|
||||||
struct Kernel<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar, Spec> {
|
struct Kernel<Path::kStandardCpp, LhsScalar, RhsScalar, DstScalar, Spec> {
|
||||||
using AccumScalar = typename Spec::AccumScalar;
|
using AccumScalar = typename Spec::AccumScalar;
|
||||||
using LhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
|
using LhsLayout = typename Spec::StandardCppKernelLhsLayout;
|
||||||
using RhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
|
using RhsLayout = typename Spec::StandardCppKernelRhsLayout;
|
||||||
explicit Kernel(Tuning) {}
|
explicit Kernel(Tuning) {}
|
||||||
void Run(const PackedMatrix<LhsScalar>& lhs,
|
void Run(const PackedMatrix<LhsScalar>& lhs,
|
||||||
const PackedMatrix<RhsScalar>& rhs, const Spec& spec, int start_row,
|
const PackedMatrix<RhsScalar>& rhs, const Spec& spec, int start_row,
|
||||||
|
@ -148,6 +148,19 @@ StreamType& operator<<(StreamType& stream, const Matrix<Scalar>& mat) {
|
|||||||
return stream;
|
return stream;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compile-time version of KernelLayout, used to declare kernel layouts in a
|
||||||
|
// way that can be consumed by compile-time logic.
|
||||||
|
// See how partial specializations of Kernel use it to declare their layouts.
|
||||||
|
// The only reason why this is currently part of the public API is to
|
||||||
|
// allow testing various layouts for the Path::kStandardCpp kernel, as a
|
||||||
|
// testing-only feature. See Spec::StandardCppKernelLhsLayout.
|
||||||
|
template <Order tOrder, int tRows, int tCols>
|
||||||
|
struct FixedKernelLayout {
|
||||||
|
static constexpr Order kOrder = tOrder;
|
||||||
|
static constexpr int kRows = tRows;
|
||||||
|
static constexpr int kCols = tCols;
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace ruy
|
} // namespace ruy
|
||||||
|
|
||||||
#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_
|
#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_MATRIX_H_
|
||||||
|
@ -20,6 +20,8 @@ limitations under the License.
|
|||||||
#include <limits>
|
#include <limits>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
|
||||||
|
#include "tensorflow/lite/experimental/ruy/matrix.h"
|
||||||
|
|
||||||
namespace ruy {
|
namespace ruy {
|
||||||
|
|
||||||
// Our 'general' loop structure (the default) involves multi-threading and
|
// Our 'general' loop structure (the default) involves multi-threading and
|
||||||
@ -96,6 +98,23 @@ struct BasicSpec {
|
|||||||
// See above enum ZeroPointSupport
|
// See above enum ZeroPointSupport
|
||||||
static constexpr ZeroPointSupport kZeroPointSupport =
|
static constexpr ZeroPointSupport kZeroPointSupport =
|
||||||
ZeroPointSupport::kGeneral;
|
ZeroPointSupport::kGeneral;
|
||||||
|
// Testing-only, not meant to be used by actual users:
|
||||||
|
// Used for testing of various kernel layouts.
|
||||||
|
using StandardCppKernelLhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
|
||||||
|
using StandardCppKernelRhsLayout = FixedKernelLayout<Order::kColMajor, 1, 1>;
|
||||||
|
// The value and even the meaning of this value are empirically
|
||||||
|
// determined. Coarsely speaking, it's compared with the size of source
|
||||||
|
// LHS and RHS operands to determine whether they are big enough to be worth
|
||||||
|
// traversing in a more complicated "cache friendly" order. The current
|
||||||
|
// value is roughly the minimum size of a L1 cache on any CPU that we
|
||||||
|
// currently care about, e.g. ARM Cortex-A53. But we honestly don't even know
|
||||||
|
// the precise extent to which this should be related to L1 cache size.
|
||||||
|
//
|
||||||
|
// A lower value is not necessarily 'safer' from a cache-friendliness
|
||||||
|
// perspective: it means switching sooner (at smaller sizes) to more
|
||||||
|
// complicated traversal orders, which might be adversarial to the CPU's
|
||||||
|
// auto-prefetching or to the TLB.
|
||||||
|
static int cache_friendly_traversal_threshold() { return 32 * 1024; }
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace ruy
|
} // namespace ruy
|
||||||
|
@ -36,6 +36,14 @@ struct RCCSpec : BasicSpec<AccumScalar, DstScalar> {
|
|||||||
static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC;
|
static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template <typename AccumScalar, typename DstScalar, typename LhsKernelLayout,
|
||||||
|
typename RhsKernelLayout>
|
||||||
|
struct StandardCppKernelLayoutSpec : BasicSpec<AccumScalar, DstScalar> {
|
||||||
|
using StandardCppKernelLhsLayout = LhsKernelLayout;
|
||||||
|
using StandardCppKernelRhsLayout = RhsKernelLayout;
|
||||||
|
static int cache_friendly_traversal_threshold() { return 0; }
|
||||||
|
};
|
||||||
|
|
||||||
using LhsScalar = RUY_TEST_LHSSCALAR;
|
using LhsScalar = RUY_TEST_LHSSCALAR;
|
||||||
using RhsScalar = RUY_TEST_RHSSCALAR;
|
using RhsScalar = RUY_TEST_RHSSCALAR;
|
||||||
using AccumScalar = RUY_TEST_ACCUMSCALAR;
|
using AccumScalar = RUY_TEST_ACCUMSCALAR;
|
||||||
@ -123,4 +131,24 @@ TEST(TestSpecialSpecs, RCC) {
|
|||||||
TestNonRCC<RCCTestSet>(81, 93, 72, ExpectedOutcome::kDeath);
|
TestNonRCC<RCCTestSet>(81, 93, 72, ExpectedOutcome::kDeath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename LhsKernelLayout, typename RhsKernelLayout>
|
||||||
|
void TestStandardCppKernelLayout() {
|
||||||
|
using SpecType =
|
||||||
|
StandardCppKernelLayoutSpec<AccumScalar, DstScalar, LhsKernelLayout,
|
||||||
|
RhsKernelLayout>;
|
||||||
|
using TestSetType = TestSet<LhsScalar, RhsScalar, SpecType>;
|
||||||
|
for (int size = 1; size < 10; size++) {
|
||||||
|
TestLinearAllOrders<TestSetType>(size, size, size);
|
||||||
|
}
|
||||||
|
TestLinearAllOrders<TestSetType>(87, 34, 56);
|
||||||
|
TestLinearAllOrders<TestSetType>(123, 234, 78);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(TestSpecialSpecs, StandardCppKernelLayout) {
|
||||||
|
TestStandardCppKernelLayout<FixedKernelLayout<Order::kColMajor, 1, 1>,
|
||||||
|
FixedKernelLayout<Order::kColMajor, 1, 1>>();
|
||||||
|
TestStandardCppKernelLayout<FixedKernelLayout<Order::kRowMajor, 4, 4>,
|
||||||
|
FixedKernelLayout<Order::kRowMajor, 4, 4>>();
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace ruy
|
} // namespace ruy
|
||||||
|
@ -169,10 +169,10 @@ int GetThreadCount(Context* context, int rows, int cols, int depth) {
|
|||||||
return clamp(guess, 1, context->max_num_threads);
|
return clamp(guess, 1, context->max_num_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
LoopStructure GetLoopStructure(int thread_count, int rows, int cols,
|
LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth,
|
||||||
int depth) {
|
int cache_friendly_traversal_threshold) {
|
||||||
if (thread_count == 1 &&
|
if (thread_count == 1 &&
|
||||||
(rows + cols) * depth < kCacheFriendlyLoopThreshold) {
|
(rows + cols) * depth < cache_friendly_traversal_threshold) {
|
||||||
return LoopStructure::kSimple;
|
return LoopStructure::kSimple;
|
||||||
}
|
}
|
||||||
return LoopStructure::kGeneral;
|
return LoopStructure::kGeneral;
|
||||||
@ -195,7 +195,9 @@ void TrMul(TrMulParams* params, Context* context) {
|
|||||||
const int cols_rounded_up = packed_rhs.layout.cols;
|
const int cols_rounded_up = packed_rhs.layout.cols;
|
||||||
|
|
||||||
int thread_count = GetThreadCount(context, rows, cols, depth);
|
int thread_count = GetThreadCount(context, rows, cols, depth);
|
||||||
const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth);
|
const auto loop_structure =
|
||||||
|
GetLoopStructure(thread_count, rows, cols, depth,
|
||||||
|
params->cache_friendly_traversal_threshold);
|
||||||
Allocator* allocator = context->GetMainAllocator();
|
Allocator* allocator = context->GetMainAllocator();
|
||||||
|
|
||||||
if (!params->lhs_is_prepacked) {
|
if (!params->lhs_is_prepacked) {
|
||||||
@ -231,7 +233,7 @@ void TrMul(TrMulParams* params, Context* context) {
|
|||||||
MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
|
MakeBlockMap(rows_rounded_up, cols_rounded_up, depth,
|
||||||
packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
|
packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols,
|
||||||
packed_lhs.data_type.size, packed_rhs.data_type.size,
|
packed_lhs.data_type.size, packed_rhs.data_type.size,
|
||||||
&block_map);
|
params->cache_friendly_traversal_threshold, &block_map);
|
||||||
std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
|
std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map);
|
||||||
std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
|
std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map);
|
||||||
std::uint32_t num_blocks = NumBlocks(block_map);
|
std::uint32_t num_blocks = NumBlocks(block_map);
|
||||||
|
@ -62,6 +62,7 @@ struct TrMulParams {
|
|||||||
PMatrix packed_rhs;
|
PMatrix packed_rhs;
|
||||||
bool lhs_is_prepacked = false;
|
bool lhs_is_prepacked = false;
|
||||||
bool rhs_is_prepacked = false;
|
bool rhs_is_prepacked = false;
|
||||||
|
int cache_friendly_traversal_threshold = 0;
|
||||||
|
|
||||||
// Type-erased Spec.
|
// Type-erased Spec.
|
||||||
void* spec = nullptr;
|
void* spec = nullptr;
|
||||||
|
Loading…
Reference in New Issue
Block a user