From 4bbf04134f7f8d2de03b8494dd0d2d24b811d31e Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Tue, 30 Apr 2019 11:41:40 -0700 Subject: [PATCH] Detemplatize TrMul and introduce type-erased TrMulParams. The TrMulParams abstraction will allow us to cleanly implement pre-packing and packed matrix caching. See comment at the top of internal_matrix.h for more info about this change. One way to look at this CL is to walk through starting from ruy::Mul and see the new code structure. The change to more purpose-defined Matrix types also allowed separating Layout from the new PackedLayout. Layout (which is part of the user-facing Matrix) is now inherently linear (no kernel layout block structure), which allowed simplifying LoopStructure::kPackedLinearRCC to just "PackedRCC", but "packed" (which in this context means stride is equal to the inner dimension) is not something Ruy cares deeply about, which allowed simplifying to just kRCC. PiperOrigin-RevId: 245990070 --- tensorflow/lite/experimental/ruy/BUILD | 28 +- tensorflow/lite/experimental/ruy/allocator.h | 11 +- tensorflow/lite/experimental/ruy/benchmark.cc | 5 +- tensorflow/lite/experimental/ruy/common.h | 145 +------ tensorflow/lite/experimental/ruy/context.h | 7 + tensorflow/lite/experimental/ruy/dispatch.h | 293 +++++++++++--- tensorflow/lite/experimental/ruy/impl.h | 369 ++++++----------- .../lite/experimental/ruy/internal_matrix.h | 382 ++++++++++++++++++ tensorflow/lite/experimental/ruy/kernel.h | 77 ++-- tensorflow/lite/experimental/ruy/matrix.h | 40 +- tensorflow/lite/experimental/ruy/pack.h | 45 ++- tensorflow/lite/experimental/ruy/path.h | 31 +- tensorflow/lite/experimental/ruy/ruy.h | 13 +- tensorflow/lite/experimental/ruy/spec.h | 17 +- tensorflow/lite/experimental/ruy/test.h | 114 ++---- tensorflow/lite/experimental/ruy/test_fast.cc | 10 +- tensorflow/lite/experimental/ruy/test_slow.cc | 8 +- .../experimental/ruy/test_special_specs.cc | 17 +- 18 files changed, 935 insertions(+), 677 deletions(-) create mode 100644 tensorflow/lite/experimental/ruy/internal_matrix.h diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD index b2b35c26300..97ac38e998d 100644 --- a/tensorflow/lite/experimental/ruy/BUILD +++ b/tensorflow/lite/experimental/ruy/BUILD @@ -195,6 +195,17 @@ cc_library( deps = [":check_macros"], ) +cc_library( + name = "internal_matrix", + hdrs = ["internal_matrix.h"], + deps = [ + ":check_macros", + ":common", + ":matrix", + ":size_util", + ], +) + cc_library( name = "common", hdrs = [ @@ -205,7 +216,6 @@ cc_library( ":matrix", ":opt_set", ":path", - ":size_util", ], ) @@ -219,7 +229,7 @@ cc_library( ], deps = [ ":common", - ":matrix", + ":internal_matrix", ":opt_set", ":path", ":size_util", @@ -240,7 +250,7 @@ cc_library( ], deps = [ ":common", - ":matrix", + ":internal_matrix", ":opt_set", ":path", ":spec", @@ -256,11 +266,7 @@ cc_library( "dispatch.h", "impl.h", ], - hdrs = [ - "matrix.h", - "path.h", - "ruy.h", - ], + hdrs = ["ruy.h"], visibility = ruy_visibility(), deps = [ ":allocator", @@ -269,8 +275,10 @@ cc_library( ":common", ":context", ":kernel", + ":matrix", ":opt_set", ":pack", + ":path", ":size_util", ":spec", ":thread_pool", @@ -388,7 +396,3 @@ ruy_benchmark_opt_sets( "7ff", ], ) - -load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite") - -tflite_portable_test_suite() diff --git a/tensorflow/lite/experimental/ruy/allocator.h b/tensorflow/lite/experimental/ruy/allocator.h index 789731a505d..5edf6930866 100644 --- 
a/tensorflow/lite/experimental/ruy/allocator.h +++ b/tensorflow/lite/experimental/ruy/allocator.h @@ -146,12 +146,17 @@ class AlignedAllocator { // typed buffer. class Allocator { public: + void* AllocateBytes(std::size_t num_bytes) { + if (num_bytes == 0) { + return nullptr; + } + return aligned.AllocateAlignedBytes( + round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment)); + } template void Allocate(std::size_t count, Pointer* out) { using T = typename std::pointer_traits::element_type; - std::size_t num_bytes = - round_up_pot(count * sizeof(T), detail::AlignedAllocator::kAlignment); - *out = static_cast(aligned.AllocateAlignedBytes(num_bytes)); + *out = static_cast(AllocateBytes(count * sizeof(T))); } void FreeAll() { aligned.FreeAll(); } diff --git a/tensorflow/lite/experimental/ruy/benchmark.cc b/tensorflow/lite/experimental/ruy/benchmark.cc index ccf7f5dbb54..55b02d24df9 100644 --- a/tensorflow/lite/experimental/ruy/benchmark.cc +++ b/tensorflow/lite/experimental/ruy/benchmark.cc @@ -36,8 +36,7 @@ struct BenchmarkShape { }; template -std::vector> BenchmarkPackedLinearRCC( - const BenchmarkShape& shape) { +std::vector> BenchmarkRCC(const BenchmarkShape& shape) { TestSetType test_set; test_set.rows = shape.rows; test_set.depth = shape.depth; @@ -104,7 +103,7 @@ void Benchmark() { for (int i = 0; i < shapes.size(); i++) { const auto& shape = shapes[i]; - const auto& results = BenchmarkPackedLinearRCC(shape); + const auto& results = BenchmarkRCC(shape); if (i == 0) { if (benchmark_cubic) { printf("size"); diff --git a/tensorflow/lite/experimental/ruy/common.h b/tensorflow/lite/experimental/ruy/common.h index 53ebbe955ec..3f6e8ac25f5 100644 --- a/tensorflow/lite/experimental/ruy/common.h +++ b/tensorflow/lite/experimental/ruy/common.h @@ -21,13 +21,11 @@ limitations under the License. #include #include #include -#include #include "tensorflow/lite/experimental/ruy/check_macros.h" #include "tensorflow/lite/experimental/ruy/matrix.h" #include "tensorflow/lite/experimental/ruy/opt_set.h" #include "tensorflow/lite/experimental/ruy/path.h" -#include "tensorflow/lite/experimental/ruy/size_util.h" #ifdef __aarch64__ #include @@ -44,111 +42,17 @@ limitations under the License. namespace ruy { -inline void MakeSimpleLayout(int rows, int cols, Order order, Layout* layout) { - layout->rows = rows; - layout->cols = cols; - layout->order = order; - layout->stride = order == Order::kColMajor ? 
rows : cols; - layout->kernel.order = order; - layout->kernel.rows = 1; - layout->kernel.cols = 1; -} - -inline bool IsLinear(const Layout& layout) { - return layout.kernel.rows == 1 && layout.kernel.cols == 1; -} - -inline bool IsPacked(const Layout& layout) { - if (layout.order == Order::kColMajor) { - return layout.stride == layout.rows; - } else { - return layout.stride == layout.cols; - } -} - -inline bool IsPackedLinear(const Layout& layout) { - return IsPacked(layout) && IsLinear(layout); -} - -inline bool IsRowMajor(const Layout& layout) { - return layout.order == Order::kRowMajor; -} - -inline bool IsColMajor(const Layout& layout) { - return layout.order == Order::kColMajor; -} - -inline bool IsLinearColMajor(const Layout& layout) { - return IsLinear(layout) && IsColMajor(layout); -} - -inline bool IsPackedLinearColMajor(const Layout& layout) { - return IsLinearColMajor(layout) && IsPacked(layout); -} - -inline bool IsLinearRowMajor(const Layout& layout) { - return IsLinear(layout) && IsRowMajor(layout); -} - -inline bool IsPackedLinearRowMajor(const Layout& layout) { - return IsLinearRowMajor(layout) && IsPacked(layout); -} - -inline int FlatSize(const Layout& layout) { - const int outerdim = - layout.order == Order::kColMajor ? layout.cols : layout.rows; - return layout.stride * outerdim; -} - -// TODO(b/130417400) add a unit test -inline int Offset(const Layout& layout, int row, int col) { - // TODO(benoitjacob) - should check this but this make the _slow tests take - // 5x longer. Find a mitigation like in Eigen with an 'internal' variant - // bypassing the check? - // RUY_DCHECK_GE(row, 0); - // RUY_DCHECK_GE(col, 0); - // RUY_DCHECK_LT(row, layout.rows); - // RUY_DCHECK_LT(col, layout.cols); - if (IsLinear(layout)) { - int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride; - int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride; - return row * row_stride + col * col_stride; - } else { - RUY_DCHECK(is_pot(layout.kernel.rows)); - RUY_DCHECK(is_pot(layout.kernel.cols)); - int row_outer = row & ~(layout.kernel.rows - 1); - int col_outer = col & ~(layout.kernel.cols - 1); - int row_stride_outer = - layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride; - int col_stride_outer = - layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride; - int offset_outer = - row_outer * row_stride_outer + col_outer * col_stride_outer; - int row_inner = row - row_outer; - int col_inner = col - col_outer; - int row_stride_inner = - layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols; - int col_stride_inner = - layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows; - int offset_inner = - row_inner * row_stride_inner + col_inner * col_stride_inner; - return offset_outer + offset_inner; - } -} - -template -const Scalar* ElementPtr(const Matrix& mat, int row, int col) { - return mat.data.get() + Offset(mat.layout, row, col); -} - -template -Scalar* ElementPtr(Matrix* mat, int row, int col) { - return mat->data.get() + Offset(mat->layout, row, col); -} - -template -Scalar Element(const Matrix& mat, int row, int col) { - return *ElementPtr(mat, row, col); +// Helper for type-erasing a pointer. +// +// Often inside Ruy, a template parameter holds type information statically, but +// we would like to have a function signature that doesn't depend on the +// template parameters, so that we can dispatch indirectly across multiple +// implementations. This helper is at the core of such type-erasure. 
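+//
+// For example, CreateTrMulParams in dispatch.h stores the user-provided Spec
+// as `params->spec = ToVoidPtr(&spec)`, and the type-erased RunKernel entry
+// point in kernel.h recovers the typed pointer before calling the statically
+// typed kernel implementation.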
+// +// The opposite of this operation is just `static_cast(void_ptr)`. +template +void* ToVoidPtr(T* p) { + return const_cast(static_cast(p)); } // We need this where we have multiple threads potentially writing concurrently @@ -176,33 +80,6 @@ Scalar SymmetricZeroPoint() { return std::numeric_limits::max() / 2 + 1; } -template -struct TrMulImpl; - -template -struct FixedKernelLayout { - static constexpr Order kOrder = tOrder; - static constexpr int kRows = tRows; - static constexpr int kCols = tCols; -}; - -inline void Transpose(Order* order) { - *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; -} - -inline void Transpose(Layout* layout) { - Transpose(&layout->order); - Transpose(&layout->kernel.order); - std::swap(layout->rows, layout->cols); - std::swap(layout->kernel.rows, layout->kernel.cols); -} - -template -inline void Transpose(Matrix* matrix) { - Transpose(&matrix->layout); -} - } // namespace ruy #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_COMMON_H_ diff --git a/tensorflow/lite/experimental/ruy/context.h b/tensorflow/lite/experimental/ruy/context.h index b768dad09a5..48b02f88de7 100644 --- a/tensorflow/lite/experimental/ruy/context.h +++ b/tensorflow/lite/experimental/ruy/context.h @@ -65,6 +65,13 @@ struct Context final { } } + template + Path GetPathToTake() { + last_taken_path = + GetMostSignificantPath(CompiledPaths & GetRuntimeEnabledPaths()); + return last_taken_path; + } + void SetRuntimeEnabledPaths(Path paths); Path GetRuntimeEnabledPaths(); diff --git a/tensorflow/lite/experimental/ruy/dispatch.h b/tensorflow/lite/experimental/ruy/dispatch.h index 3386e14226f..50bece5f41a 100644 --- a/tensorflow/lite/experimental/ruy/dispatch.h +++ b/tensorflow/lite/experimental/ruy/dispatch.h @@ -52,10 +52,10 @@ namespace ruy { template void EnforceLayoutSupport(const Layout& lhs_layout, const Layout& rhs_layout, const Layout& dst_layout) { - if (Spec::kLayoutSupport == LayoutSupport::kPackedLinearRCC) { - RUY_DCHECK(IsPackedLinearRowMajor(lhs_layout)); - RUY_DCHECK(IsPackedLinearColMajor(rhs_layout)); - RUY_DCHECK(IsPackedLinearColMajor(dst_layout)); + if (Spec::kLayoutSupport == LayoutSupport::kRCC) { + RUY_DCHECK(IsRowMajor(lhs_layout)); + RUY_DCHECK(IsColMajor(rhs_layout)); + RUY_DCHECK(IsColMajor(dst_layout)); } } @@ -84,21 +84,108 @@ void EnforceZeroPointSupport(LhsScalar lhs_zero_point, RhsScalar rhs_zero_point, CheckZeroPoint(dst_zero_point); } -// GetTrMulImplRunFn is implemented with template metaprogramming by mutual -// recursion between PathSearchCountdown and PathSearchCompiledPaths. +inline bool IsColMajorTrMul(const DMatrix& lhs, const DMatrix& rhs, + const DMatrix& dst) { + return IsColMajor(lhs.layout) && IsColMajor(rhs.layout) && + IsColMajor(dst.layout); +} + +inline void CreatePackedLayout(const Layout& src, const Type& scalar, + const KernelLayout& kernel_layout, + PackedLayout* packed) { + packed->order = Order::kColMajor; + packed->rows = round_up_pot(src.rows, kernel_layout.rows); + packed->cols = round_up_pot(src.cols, kernel_layout.cols); + packed->kernel = kernel_layout; + int inner_size = packed->rows; + if (RUY_OPT_SET & RUY_OPT_AVOID_ALIASING) { + packed->stride = + (inner_size * scalar.size) % 1024 ? 
inner_size : inner_size + 64; + } else { + packed->stride = inner_size; + } +} + +template +void CreatePackedMatrix(const DMatrix& src, const KernelLayout& kernel_layout, + PMatrix* packed) { + // Ruy always uses 32-bit signed accumulators for quantized + // matrix multiplication, so we would like to always use std::int32_t + // unconditionally for SumsType. + // However, for floating point types, we still need a reasonable type here to + // avoid tripping assertions elsewhere in the code. + using SumsType = + typename std::conditional::value, Scalar, + std::int32_t>::type; + + packed->data_type = Type::Create(); + packed->sums_type = Type::Create(); + CreatePackedLayout(src.layout, packed->data_type, kernel_layout, + &packed->layout); + packed->zero_point = Pack(src.zero_point); +} + +template +void PopulateTrMulParams(TrMulParams* params) { + static_assert((ThePath & Path::kReference) == Path::kNone, + "Path::kReference should not do TrMul"); + // The optimized code paths only handle a very specific set of layouts. + // Fall back to Path::kStandardCpp if needed. + if (ThePath != Path::kStandardCpp) { + if (!IsColMajorTrMul(params->lhs, params->rhs, params->dst)) { + PopulateTrMulParams(params); + return; + } + } + + using PackedLhsScalar = PackedType; + using PackedRhsScalar = PackedType; + using Kernel = + Kernel; + using LhsKernelLayout = typename Kernel::LhsLayout; + using RhsKernelLayout = typename Kernel::RhsLayout; + + CreatePackedMatrix( + params->lhs, ToKernelLayout(), ¶ms->packed_lhs); + CreatePackedMatrix( + params->rhs, ToKernelLayout(), ¶ms->packed_rhs); + + params->lhs_run_pack = + &RunPack; + params->rhs_run_pack = + &RunPack; + params->run_kernel = + &RunKernel; + return; +} + +// PopulateTrMulParamsAllCompiledPaths calls into one of multiple +// instantiations of PopulateTrMulParams. For each bit that is set in +// CompiledPaths, it statically instantiates PopulateTrMulParams with a Path +// corresponding to that single bit. The call to PopulateTrMulParams is +// guarded by a runtime check that it is in fact the dynamically selected path. // -// GetTrMulImplRunFn is logically implementing the following computation: +// PopulateTrMulParamsAllCompiledPaths is implemented with template +// metaprogramming by mutual recursion between PathSearchCountdown and +// PathSearchCompiledPaths. // -// decltype(&TrMulImpl<...>::Run) GetTrMulImplRunFn(Path single_path) { +// PopulateTrMulParamsAllCompiledPaths is logically implementing the following +// computation: +// +// template +// void PopulateTrMulParamsAllCompiledPaths(Path the_path, +// TrMulParams* params) { // for (int bit = 8 * sizeof(Path) - 1; bit != -1; bit--) { // [1] // Path current_path = static_cast(1 << bit); // if ((CompiledPaths & current_path) != Path::kNone) { // [2] -// if (current_path == single_path) { // [3] -// return &TrMulImpl::Run; +// if (current_path == the_path) { // [3] +// PopulateTrMulParams(the_path, params); +// return; // } // } // } -// return nullptr; // [4] // } // // @@ -110,15 +197,13 @@ void EnforceZeroPointSupport(LhsScalar lhs_zero_point, RhsScalar rhs_zero_point, // doing the whole computation at C++ compile time. // [3] - Done by the `if` in the main definition of // PathSearchOnlyCompiledPaths. -// [4] - Done by the partial specialization of PathSearchCountdown. // // The template metaprogramming is necessary because: -// - In `TrMulImpl::Run`, current_path must be a C++ +// - In `PopulateTrMulParams`, current_path must be a C++ // compile-time constant. 
-// - GetTrMulImplRunFn must not instantiate -// `TrMulImpl::Run` for paths that are not in -// CompiledPaths, since that can result in bogus instantiations which cause -// a compile time failure. +// - PopulateTrMulParamsAllCompiledPaths must not instantiate +// inner loops for paths that are not in CompiledPaths, since that can result in +// bogus instantiations which cause a compile time failure. template struct PathSearchCountdown; @@ -128,29 +213,25 @@ template struct PathSearchOnlyCompiledPaths { static constexpr Path kCurrentPath = static_cast(1 << BitNumber); - static decltype( - &TrMulImpl::Run) - Search(Path single_path) { - if (kCurrentPath == single_path) { - return &TrMulImpl::Run; + static void Search(Path the_path, TrMulParams* params) { + if (kCurrentPath == the_path) { + PopulateTrMulParams( + params); + return; } - return PathSearchCountdown::Search(single_path); + PathSearchCountdown::Search(the_path, params); } }; -// Skip instantiating TrMulImpl if CompiledPaths doesn't contain the -// specified path. +// Skip this iteration if CompiledPaths doesn't contain the specified path. template struct PathSearchOnlyCompiledPaths { - static decltype( - &TrMulImpl::Run) - Search(Path single_path) { - return PathSearchCountdown::Search(single_path); + static void Search(Path the_path, TrMulParams* params) { + PathSearchCountdown::Search(the_path, params); } }; @@ -158,12 +239,10 @@ template struct PathSearchCountdown { static constexpr Path kCurrentPath = static_cast(1 << BitNumber); - static decltype( - &TrMulImpl::Run) - Search(Path single_path) { - return PathSearchOnlyCompiledPaths< + static void Search(Path the_path, TrMulParams* params) { + PathSearchOnlyCompiledPaths< CompiledPaths, (CompiledPaths & kCurrentPath) != Path::kNone, BitNumber, - LhsScalar, RhsScalar, DstScalar, Spec>::Search(single_path); + LhsScalar, RhsScalar, DstScalar, Spec>::Search(the_path, params); } }; @@ -173,48 +252,132 @@ template struct PathSearchCountdown { - static decltype( - &TrMulImpl::Run) - Search(Path single_path) { - return nullptr; - } + static void Search(Path the_path, TrMulParams* params) { RUY_DCHECK(false); } }; template -decltype(&TrMulImpl::Run) -GetTrMulImplRunFn(Path single_path) { +void PopulateTrMulParamsAllCompiledPaths(Path the_path, TrMulParams* params) { return PathSearchCountdown::Search(single_path); + RhsScalar, DstScalar, Spec>::Search(the_path, + params); +} + +template +void CreateTrMulParams(const Matrix& lhs, + const Matrix& rhs, const Spec& spec, + Context* context, Matrix* dst, Path the_path, + TrMulParams* params) { + // Fill in the fields we already know. + params->lhs = ToDMatrix(lhs); + params->rhs = ToDMatrix(rhs); + params->dst = ToDMatrix(*dst); + params->spec = ToVoidPtr(&spec); + + // Create inner loops and packed matrices based on the Path. 
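+  // This fills in params->packed_lhs / params->packed_rhs and the
+  // lhs_run_pack / rhs_run_pack / run_kernel function pointers. For example,
+  // if CompiledPaths were just Path::kStandardCpp, only the
+  // Path::kStandardCpp instantiations of RunPack and RunKernel would be
+  // emitted and selected here.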
+ PopulateTrMulParamsAllCompiledPaths(the_path, params); +} + +template +void ReferenceMul(const Matrix& lhs, const Matrix& rhs, + const Spec& spec, Matrix* dst) { + gemmlowp::ScopedProfilingLabel label("ReferenceMul"); + for (int i = 0; i < lhs.layout.rows; i++) { + for (int j = 0; j < rhs.layout.cols; j++) { + using AccumScalar = typename Spec::AccumScalar; + AccumScalar accum = 0; + for (int k = 0; k < lhs.layout.cols; k++) { + AccumScalar lhs_val = Element(lhs, i, k); + AccumScalar rhs_val = Element(rhs, k, j); + accum += (lhs_val - lhs.zero_point) * (rhs_val - rhs.zero_point); + } + if (spec.bias) { + accum += spec.bias[i]; + } + ApplyMultiplier(spec, i, &accum); + accum += dst->zero_point; + accum = std::min(accum, spec.clamp_max); + accum = std::max(accum, spec.clamp_min); + *ElementPtr(dst, i, j) = static_cast(accum); + } + } +} + +// Compile-time dispatch to ReferenceMul. This allows us to statically ensure +// that there is no call to ReferenceMul in the user's binary. +template +struct CompileTimeEnabledReferenceMul { + template + static void Run(const Matrix& lhs, const Matrix& rhs, + const Spec& spec, Matrix* dst) { + ReferenceMul(lhs, rhs, spec, dst); + } +}; + +// When this partial specialization is chosen, it ensures that ReferenceMul +// is never compiled. +template <> +struct CompileTimeEnabledReferenceMul { + template + static void Run(const Matrix& lhs, const Matrix& rhs, + const Spec& spec, Matrix* dst) { + RUY_DCHECK(false); + } }; template -struct MulDispatch { - void Mul(const Matrix& lhs, const Matrix& rhs, - const Spec& spec, Context* context, Matrix* dst) { - gemmlowp::ScopedProfilingLabel label("Mul"); +void DispatchMul(const Matrix& lhs, const Matrix& rhs, + const Spec& spec, Context* context, Matrix* dst) { + static_assert(CompiledPaths != Path::kNone, "Must compile at least one Path"); + static_assert((CompiledPaths & ~kAllPaths) == Path::kNone, + "CompiledPaths must be a subset of ruy::kAllPaths"); - const Path runtime_enabled_paths = context->GetRuntimeEnabledPaths(); - // The above query should resolve to specific paths, never return kNone. - RUY_DCHECK(runtime_enabled_paths != Path::kNone); + gemmlowp::ScopedProfilingLabel label("Mul"); - Path single_path = - GetMostSignificantPath(CompiledPaths & runtime_enabled_paths); - auto tr_mul_impl_run_fn = - GetTrMulImplRunFn( - single_path); - context->last_taken_path = single_path; + EnforceLayoutSupport(lhs.layout, rhs.layout, dst->layout); + EnforceZeroPointSupport(lhs.zero_point, rhs.zero_point, + dst->zero_point); - EnforceLayoutSupport(lhs.layout, rhs.layout, dst->layout); - EnforceZeroPointSupport(lhs.zero_point, rhs.zero_point, - dst->zero_point); + // This should be a constant, for a given machine and CompiledPaths. + // There is a back door to override it for testing, but in production it will + // always be the "best" Path. I.e. the one with the newest SIMD instructions + // available on the present machine, and avoiding Path::kReference unless + // no other path is compiled. + // + // Unfortunately, it is not a *static* constant, since it depends on runtime + // detection of the available SIMD instructions. + Path the_path = context->GetPathToTake(); - Matrix lhs_copy(lhs); - Transpose(&lhs_copy); - tr_mul_impl_run_fn(lhs_copy, rhs, spec, context, dst); + // Production code should probably never execute Path::kReference. + // Path::kReference implements a Mul, not a TrMul like the rest of Ruy, so if + // that's what we need to do, then get it out of the way before going down the + // TrMul path. 
+ if (the_path == Path::kReference) { + constexpr bool ReferenceMulIsEnabled = + (CompiledPaths & Path::kReference) != Path::kNone; + CompileTimeEnabledReferenceMul::Run(lhs, rhs, spec, + dst); + return; } -}; + + // As described in the comment at the top of this file, Ruy internally + // converts Mul into TrMul. We handle that here. + // + // This is Ruy's main code path. + constexpr Path TrMulCompiledPaths = CompiledPaths & ~Path::kReference; + Matrix transposed_lhs(lhs); + Transpose(&transposed_lhs); + TrMulParams params; + CreateTrMulParams(transposed_lhs, rhs, spec, context, dst, + the_path, ¶ms); + TrMul(¶ms, context); +} } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/impl.h b/tensorflow/lite/experimental/ruy/impl.h index ee26b9687a9..edab51dec7a 100644 --- a/tensorflow/lite/experimental/ruy/impl.h +++ b/tensorflow/lite/experimental/ruy/impl.h @@ -17,7 +17,6 @@ limitations under the License. #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_IMPL_H_ #include -#include #include "profiling/instrumentation.h" #include "tensorflow/lite/experimental/ruy/allocator.h" @@ -25,6 +24,7 @@ limitations under the License. #include "tensorflow/lite/experimental/ruy/common.h" #include "tensorflow/lite/experimental/ruy/context.h" #include "tensorflow/lite/experimental/ruy/kernel.h" +#include "tensorflow/lite/experimental/ruy/matrix.h" #include "tensorflow/lite/experimental/ruy/opt_set.h" #include "tensorflow/lite/experimental/ruy/pack.h" #include "tensorflow/lite/experimental/ruy/thread_pool.h" @@ -33,31 +33,49 @@ limitations under the License. namespace ruy { -template -struct TrMulTask final : Task { - using AccumScalar = typename Spec::AccumScalar; - TrMulTask(const Matrix& lhs_, const Matrix& rhs_, - Matrix* packed_lhs_, - Matrix* packed_rhs_, Matrix* result_, - const BlockMap& block_map_, +// Type-erased data needed for implementing TrMul. +struct TrMulParams { + // Helper functions for invoking the function pointers. + void LhsRunPack(Tuning tuning, int start_c, int end_c) { + lhs_run_pack(tuning, lhs, &packed_lhs, start_c, end_c); + } + void RhsRunPack(Tuning tuning, int start_c, int end_c) { + rhs_run_pack(tuning, rhs, &packed_rhs, start_c, end_c); + } + void RunKernel(Tuning tuning, int start_r, int start_c, int end_r, + int end_c) { + run_kernel(tuning, packed_lhs, packed_rhs, spec, start_r, start_c, end_r, + end_c, &dst); + } + // Function pointers to type-erased entry points for kernels and packers. + RunPackFn* lhs_run_pack = nullptr; + RunPackFn* rhs_run_pack = nullptr; + RunKernelFn* run_kernel = nullptr; + + // Matrices and packed matrices. + DMatrix lhs; + DMatrix rhs; + DMatrix dst; + PMatrix packed_lhs; + PMatrix packed_rhs; + + // Type-erased Spec. 
+ void* spec = nullptr; +}; + +struct TrMulTask final : Task { + TrMulTask(TrMulParams* params_, const BlockMap& block_map_, std::atomic* atomic_n_, std::uint32_t thread_id_, std::atomic* lhs_packed_, std::atomic* rhs_packed_, - const Spec& spec_, TuningResolver* tuning_resolver_, - Allocator* local_allocator_, Trace* trace_) - : lhs(lhs_), - rhs(rhs_), - packed_lhs(packed_lhs_), - packed_rhs(packed_rhs_), - result(result_), + TuningResolver* tuning_resolver_, Allocator* local_allocator_, + Trace* trace_) + : params(params_), block_map(block_map_), atomic_n(atomic_n_), thread_id(thread_id_), lhs_packed(lhs_packed_), rhs_packed(rhs_packed_), - spec(spec_), tuning_resolver(tuning_resolver_), local_allocator(local_allocator_), trace(trace_) {} @@ -81,13 +99,7 @@ struct TrMulTask final : Task { memset(local_rhs_packed, 0, num_blocks_of_cols * sizeof(bool)); } - using Kernel = - Kernel; - using LhsKernelLayout = typename Kernel::RhsLayout; - using RhsKernelLayout = typename Kernel::RhsLayout; - const Tuning tuning = tuning_resolver->Resolve(); - Kernel kernel(tuning); TraceRecordThreadLoopStart(thread_id, trace); @@ -104,6 +116,7 @@ struct TrMulTask final : Task { GetBlockMatrixCoords(block_map, block_r, block_c, &start_r, &start_c, &end_r, &end_c); TraceRecordBlockCoordsComputed(n, trace); + while (n < num_blocks) { // Get index of next block to handle next_n = atomic_n->fetch_add(1, std::memory_order_relaxed); @@ -134,8 +147,7 @@ struct TrMulTask final : Task { // different contention with other processes. if (local_lhs_packed && !local_lhs_packed[block_r]) { if (!lhs_packed[block_r].load(std::memory_order_acquire)) { - Pack(tuning, lhs, packed_lhs, start_r, - end_r); + params->LhsRunPack(tuning, start_r, end_r); TraceRecordBlockPackedLhs(n, trace); local_lhs_packed[block_r] = true; lhs_packed[block_r].store(true, std::memory_order_release); @@ -144,16 +156,14 @@ struct TrMulTask final : Task { // Maybe pack the current RHS block. Same comments as above for LHS. 
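        // (The per-thread local_rhs_packed bitmask lets a thread avoid
        // re-checking the shared atomic flag for a block it has already seen
        // packed; the acquire load / release store pair on rhs_packed ensures
        // that a thread which skips packing still observes the packed data
        // written by whichever thread did pack it.)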
if (local_rhs_packed && !local_rhs_packed[block_c]) { if (!rhs_packed[block_c].load(std::memory_order_acquire)) { - Pack(tuning, rhs, packed_rhs, start_c, - end_c); + params->RhsRunPack(tuning, start_c, end_c); TraceRecordBlockPackedRhs(n, trace); local_rhs_packed[block_c] = true; rhs_packed[block_c].store(true, std::memory_order_release); } } // Actually do matrix multiplication work - RunKernel(kernel, *packed_lhs, *packed_rhs, spec, start_r, start_c, end_r, - end_c, result); + params->RunKernel(tuning, start_r, start_c, end_r, end_c); TraceRecordBlockFinished(n, trace); n = next_n; block_r = next_block_r; @@ -170,54 +180,20 @@ struct TrMulTask final : Task { } private: - const Matrix& lhs; - const Matrix& rhs; - Matrix* packed_lhs; - Matrix* packed_rhs; - - Matrix* result; + TrMulParams* params; const BlockMap& block_map; std::atomic* atomic_n; std::uint32_t thread_id; std::atomic* lhs_packed; std::atomic* rhs_packed; - const Spec& spec; TuningResolver* tuning_resolver; Allocator* local_allocator; Trace* trace; }; -template -void CreatePackedMatrix(Tuning tuning, const Matrix& src, - Allocator* allocator, - Matrix* packed) { - packed->zero_point = src.zero_point - SymmetricZeroPoint() + - SymmetricZeroPoint(); - packed->layout = src.layout; - packed->layout.order = Order::kColMajor; - packed->layout.rows = round_up_pot(src.layout.rows, FixedKernelLayout::kRows); - packed->layout.cols = round_up_pot(src.layout.cols, FixedKernelLayout::kCols); - packed->layout.kernel.order = FixedKernelLayout::kOrder; - packed->layout.kernel.rows = FixedKernelLayout::kRows; - packed->layout.kernel.cols = FixedKernelLayout::kCols; - int innersize = (packed->layout.order == Order::kColMajor) - ? packed->layout.rows - : packed->layout.cols; - int outersize = (packed->layout.order == Order::kColMajor) - ? packed->layout.cols - : packed->layout.rows; - if (RUY_OPT_SET & RUY_OPT_AVOID_ALIASING) { - if (tuning == Tuning::kInOrder) { - packed->layout.stride = - (innersize * sizeof(Scalar)) % 1024 ? innersize : innersize + 64; - } else { - packed->layout.stride = - (innersize * sizeof(Scalar)) % 4096 ? innersize : innersize + 64; - } - } else { - packed->layout.stride = innersize; - } - allocator->Allocate(outersize * packed->layout.stride, &packed->data); +inline void AllocatePMatrix(Allocator* allocator, PMatrix* packed) { + packed->data = allocator->AllocateBytes(DataSize(*packed)); + packed->sums = allocator->AllocateBytes(SumsSize(*packed)); } inline int GetThreadCount(Context* context, int rows, int cols, int depth) { @@ -228,12 +204,8 @@ inline int GetThreadCount(Context* context, int rows, int cols, int depth) { return clamp(guess, 1, context->max_num_threads); } -template LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth) { - if (Spec::kLoopStructure != LoopStructure::kAuto) { - return Spec::kLoopStructure; - } if (thread_count == 1 && (rows + cols) * depth < kCacheFriendlyLoopThreshold) { return LoopStructure::kSimple; @@ -249,180 +221,105 @@ inline Tuning GetTuning(Context* context) { return tuning_resolver->Resolve(); } -// General TrMulImpl definition. See the reference-code implementation given -// in the partial specialization below for ThePath==kReference. -template -struct TrMulImpl { - using AccumScalar = typename Spec::AccumScalar; - static void Run(const Matrix& lhs, const Matrix& rhs, - const Spec& spec, Context* context, Matrix* dst) { - // Fall back, if needed, to Path::kStandardCpp. 
- if (ThePath != Path::kStandardCpp) { - if (!IsLinear(lhs.layout) || !IsLinear(rhs.layout) || - !IsLinear(dst->layout) || lhs.layout.order != Order::kColMajor || - rhs.layout.order != Order::kColMajor || - dst->layout.order != Order::kColMajor) { - TrMulImpl::Run(lhs, rhs, spec, context, dst); - return; - } - } +void TrMul(TrMulParams* params, Context* context) { + gemmlowp::ScopedProfilingLabel label("TrMul"); - gemmlowp::ScopedProfilingLabel label("TrMulImpl"); - using PackedLhsScalar = PackedType; - using PackedRhsScalar = PackedType; - using Kernel = - Kernel; - using LhsKernelLayout = typename Kernel::LhsLayout; - using RhsKernelLayout = typename Kernel::RhsLayout; + PMatrix& packed_lhs = params->packed_lhs; + PMatrix& packed_rhs = params->packed_rhs; + DMatrix& lhs = params->lhs; + DMatrix& rhs = params->rhs; - const int rows = lhs.layout.cols; - const int cols = rhs.layout.cols; - const int depth = lhs.layout.rows; - const int rows_rounded_up = round_up_pot(rows, LhsKernelLayout::kCols); - const int cols_rounded_up = round_up_pot(cols, RhsKernelLayout::kCols); + const int rows = lhs.layout.cols; + const int cols = rhs.layout.cols; + const int depth = lhs.layout.rows; + const int rows_rounded_up = packed_lhs.layout.cols; + const int cols_rounded_up = packed_rhs.layout.cols; - int thread_count = GetThreadCount(context, rows, cols, depth); - const auto loop_structure = - GetLoopStructure(thread_count, rows, cols, depth); - const Tuning tuning = GetTuning(context); - Allocator* allocator = context->GetMainAllocator(); + int thread_count = GetThreadCount(context, rows, cols, depth); + const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth); + const Tuning tuning = GetTuning(context); + Allocator* allocator = context->GetMainAllocator(); + AllocatePMatrix(allocator, &packed_lhs); + AllocatePMatrix(allocator, &packed_rhs); - // The packed matrices. - Matrix packed_lhs; - Matrix packed_rhs; - const bool lhs_use_packing_sums = - Pack(rhs.zero_point) != 0; - const bool rhs_use_packing_sums = - Pack(lhs.zero_point) != 0; + if (loop_structure == LoopStructure::kSimple) { + gemmlowp::ScopedProfilingLabel label_simple("TrMulImpl, simple loop"); - // Allocate the packed matrices. - CreatePackedMatrix(tuning, lhs, allocator, &packed_lhs); - CreatePackedMatrix(tuning, rhs, allocator, &packed_rhs); - if (lhs_use_packing_sums) { - allocator->Allocate(rows_rounded_up, &packed_lhs.sums); - } - if (rhs_use_packing_sums) { - allocator->Allocate(cols_rounded_up, &packed_rhs.sums); - } - - if (loop_structure == LoopStructure::kSimple) { - gemmlowp::ScopedProfilingLabel label_simple("TrMulImpl, simple loop"); - - Pack(tuning, lhs, &packed_lhs, 0, - rows_rounded_up); - Pack(tuning, rhs, &packed_rhs, 0, - cols_rounded_up); - - Kernel kernel(tuning); - RunKernel(kernel, packed_lhs, packed_rhs, spec, 0, 0, rows_rounded_up, - cols_rounded_up, dst); - - allocator->FreeAll(); - return; - } - - gemmlowp::ScopedProfilingLabel label_general("TrMulImpl, general case"); - - auto* trace = NewTraceOrNull(&context->tracing, rows, depth, cols); - TraceRecordStart(trace); - - // Initialize block map. 
- BlockMap block_map; - MakeBlockMap(rows_rounded_up, cols_rounded_up, depth, - LhsKernelLayout::kCols, RhsKernelLayout::kCols, - sizeof(LhsScalar), sizeof(RhsScalar), &block_map); - std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map); - std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map); - std::uint32_t num_blocks = NumBlocks(block_map); - RUY_DCHECK_EQ(num_blocks, num_blocks_of_rows * num_blocks_of_cols); - - // Initialize per-thread state. - thread_count = clamp(thread_count, 1, num_blocks); - context->EnsureNPerThreadStates(thread_count); - for (auto& per_thread_state : context->per_thread_states) { - per_thread_state->tuning_resolver.SetTuning(context->explicit_tuning); - } - - // Allocate memory. - std::atomic* lhs_packed; - allocator->Allocate(num_blocks_of_rows, &lhs_packed); - std::atomic* rhs_packed; - allocator->Allocate(num_blocks_of_cols, &rhs_packed); - std::atomic* atomic_n; - allocator->Allocate(1, &atomic_n); - using TaskType = TrMulTask; - TaskType* tasks; - allocator->Allocate(thread_count, &tasks); - Task** tasks_ptrs; - allocator->Allocate(thread_count, &tasks_ptrs); - - // Initialize allocated data. - for (int i = 0; i < num_blocks_of_rows; i++) { - lhs_packed[i].store(false, std::memory_order_release); - } - for (int i = 0; i < num_blocks_of_cols; i++) { - rhs_packed[i].store(false, std::memory_order_release); - } - atomic_n->store(thread_count); - - for (int i = 0; i < thread_count; i++) { - tasks_ptrs[i] = static_cast(tasks + i); - new (tasks_ptrs[i]) - TaskType(lhs, rhs, &packed_lhs, &packed_rhs, dst, block_map, atomic_n, - i, lhs_packed, rhs_packed, spec, - &context->per_thread_states[i]->tuning_resolver, - &context->per_thread_states[i]->allocator, trace); - } - - // Do the computation. - TraceRecordExecute(trace); - TraceStartRecordingBlockAndThreadFields(block_map, thread_count, trace); - - context->workers_pool.Execute(thread_count, tasks_ptrs); - - // Finish up. - for (int i = 0; i < thread_count; i++) { - tasks[i].~TaskType(); - } - - TraceRecordEnd(trace); + params->LhsRunPack(tuning, 0, rows_rounded_up); + params->RhsRunPack(tuning, 0, cols_rounded_up); + params->RunKernel(tuning, 0, 0, rows_rounded_up, cols_rounded_up); allocator->FreeAll(); + return; } -}; -// Reference code for TrMul, doing a transpose-multiply: compute -// Destination = Transpose(LHS) * RHS -template -struct TrMulImpl { - static void Run(const Matrix& lhs, const Matrix& rhs, - const Spec& spec, Context*, Matrix* dst) { - gemmlowp::ScopedProfilingLabel label("TrMulImpl Reference"); - for (int i = 0; i < lhs.layout.cols; i++) { - for (int j = 0; j < rhs.layout.cols; j++) { - using AccumScalar = typename Spec::AccumScalar; - AccumScalar accum = 0; - for (int k = 0; k < lhs.layout.rows; k++) { - AccumScalar lhs_val = Element(lhs, k, i); - AccumScalar rhs_val = Element(rhs, k, j); - accum += (lhs_val - lhs.zero_point) * (rhs_val - rhs.zero_point); - } - if (spec.bias) { - accum += spec.bias[i]; - } - ApplyMultiplier(spec, i, &accum); - accum += dst->zero_point; - accum = std::min(accum, spec.clamp_max); - accum = std::max(accum, spec.clamp_min); - *ElementPtr(dst, i, j) = static_cast(accum); - } - } + gemmlowp::ScopedProfilingLabel label_general("TrMulImpl, general case"); + + auto* trace = NewTraceOrNull(&context->tracing, rows, depth, cols); + TraceRecordStart(trace); + + // Initialize block map. 
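+  // The block map tiles the rows_rounded_up x cols_rounded_up destination
+  // into blocks; worker threads grab block indices from the shared atomic
+  // counter below and pack/multiply the corresponding blocks (see
+  // TrMulTask::Run above).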
+ BlockMap block_map; + MakeBlockMap(rows_rounded_up, cols_rounded_up, depth, + packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols, + packed_lhs.data_type.size, packed_rhs.data_type.size, + &block_map); + std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map); + std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map); + std::uint32_t num_blocks = NumBlocks(block_map); + RUY_DCHECK_EQ(num_blocks, num_blocks_of_rows * num_blocks_of_cols); + + // Initialize per-thread state. + thread_count = clamp(thread_count, 1, num_blocks); + context->EnsureNPerThreadStates(thread_count); + for (auto& per_thread_state : context->per_thread_states) { + per_thread_state->tuning_resolver.SetTuning(context->explicit_tuning); } -}; + + // Allocate memory. + std::atomic* lhs_packed; + allocator->Allocate(num_blocks_of_rows, &lhs_packed); + std::atomic* rhs_packed; + allocator->Allocate(num_blocks_of_cols, &rhs_packed); + std::atomic* atomic_n; + allocator->Allocate(1, &atomic_n); + TrMulTask* tasks; + allocator->Allocate(thread_count, &tasks); + Task** tasks_ptrs; + allocator->Allocate(thread_count, &tasks_ptrs); + + // Initialize allocated data. + for (int i = 0; i < num_blocks_of_rows; i++) { + lhs_packed[i].store(false, std::memory_order_release); + } + for (int i = 0; i < num_blocks_of_cols; i++) { + rhs_packed[i].store(false, std::memory_order_release); + } + atomic_n->store(thread_count); + + for (int i = 0; i < thread_count; i++) { + tasks_ptrs[i] = static_cast(tasks + i); + new (tasks_ptrs[i]) + TrMulTask(params, block_map, atomic_n, i, lhs_packed, rhs_packed, + &context->per_thread_states[i]->tuning_resolver, + &context->per_thread_states[i]->allocator, trace); + } + + // Do the computation. + TraceRecordExecute(trace); + TraceStartRecordingBlockAndThreadFields(block_map, thread_count, trace); + + context->workers_pool.Execute(thread_count, tasks_ptrs); + + // Finish up. + for (int i = 0; i < thread_count; i++) { + tasks[i].~TrMulTask(); + } + + TraceRecordEnd(trace); + + allocator->FreeAll(); +} } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/internal_matrix.h b/tensorflow/lite/experimental/ruy/internal_matrix.h new file mode 100644 index 00000000000..9a7d6ee6938 --- /dev/null +++ b/tensorflow/lite/experimental/ruy/internal_matrix.h @@ -0,0 +1,382 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Internal types and helpers for matrices. +// +// Ruy has a couple slightly different notions of matrices, besides the +// Matrix class that we expose to the user-facing API. +// +// TODO(silvasean): Put parts of this architecture description somewhere more +// prominent. +// +// The 4 different matrix types are: +// - Matrix: This is a user-facing type on Ruy's external API boundary. +// - DMatrix: This is a type-erased version of Matrix. "D" = "dynamic". 
+// - PMatrix: This represents a packed matrix, which requires tracking kernel +// layout and row/column sums for quantization. It is type-erased. +// - PackedMatrix: This is a statically typed variant of PMatrix for +// convenience inside typed routines. +// +// Note that Matrix is *not* implemented in terms of the internal types. It +// is an independent, simple, and user-facing type. +// +// The use of type-erasure might seem surprising for a library like Ruy with a +// heavily-templated entry point, but it is motivated by the desire for most of +// Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3 +// main parts: +// - "front-end" (dispatch.h) - this is the highly templated ruy::Mul entry +// point, along with routines that select RunKernel and RunPack implementations +// statically based on those template parameters. +// - "back-end" (kernel.h, pack.h)- this consists of the implementations of +// RunKernel and RunPack, often in assembly code, which are the building blocks +// that Ruy calls to perform matrix multiplication. These are templated so that +// only the requested types/Path's are actually emitted by the compiler. +// - "middle-end" (impl.h) - this is the part of Ruy that orchestrates the +// calls to the "back-end" optimized building blocks. This layer has to deal +// with issues like cache locality and low-overhead multi-threading. +// +// There is a desire for the "middle-end" to be non-templated in order to +// simplify the implementation and reduce code-size. We type-erase when going +// from the "front-end" to the "middle-end", and un-type-erase going from the +// "middle-end" to the "back-end". The un-type-erasure is possible because the +// "front-end" is responsible for instantiating the needed "back-end" templates, +// and thus the static type information is still present. +// +// Each layer of Ruy uses matrix types: +// - "front-end": Matrix +// - "middle-end": DMatrix, PMatrix +// - "back-end": Matrix, PackedMatrix +// +// The use of separate types for packed matrices is not essential, but makes it +// obvious at a glance whether a matrix is a packed matrix or not. We would +// reconsider this decision if there was significant duplication between packed +// and unpacked matrices, but that doesn't seem to be the case at the moment. +// +// Another goal is to keep the user-facing Matrix as simple and +// understandable as possible. Ideally, a user should be able to read the struct +// definition for Matrix and see a very simple definition with no internal +// details like sums and kernel block layout. +// +// To present another structured view of our various matrix types, here's a +// table: +// User matrices Packed matrices +// +---------------------------------- +// Templated | Matrix PackedMatrix +// Type-erased | DMatrix PMatrix + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_ + +#include +#include + +#include "tensorflow/lite/experimental/ruy/common.h" +#include "tensorflow/lite/experimental/ruy/matrix.h" +#include "tensorflow/lite/experimental/ruy/size_util.h" + +namespace ruy { + +// KernelLayout describes small-scale block structure in a packed matrix layout. +// +// This is is sometimes known as "tiling" in other contexts. +// +// For example, consider a packed matrix in column-major format with a +// column-major KernelLayout. The matrix logically has a shape of +// `[cols, rows]`. 
However, the matrix is laid out as though it were a 4D array +// of shape `[cols / kcols, rows / krows, kcols, krows]`. +// +// Note that in the case of kcols=1, krows=1, this degenerates to +// `[cols, rows, 1, 1]` which is equivalent to having no small-scale block +// structure. +struct KernelLayout { + Order order = Order::kColMajor; + std::uint8_t rows = 1; + std::uint8_t cols = 1; +}; + +// Compile time version of KernelLayout, suitable for template metaprogramming. +// In particular, partial template specializations of Kernel use this type to +// statically declare their kernel format. +template +struct FixedKernelLayout { + static constexpr Order kOrder = tOrder; + static constexpr int kRows = tRows; + static constexpr int kCols = tCols; +}; + +// A packed matrix has a small-scale block structure that is not present in in +// the input matrices. This block structure is necessary for the kernels to +// process data efficiently. +// +// This struct is very similar to Layout, but has the extra KernelLayout field. +struct PackedLayout { + std::int32_t rows = 0; + std::int32_t cols = 0; + // Stride is the offset between two adjacent matrix elements + // in the non-contiguous direction. + std::int32_t stride = 0; + Order order = Order::kColMajor; + // Small scale layout shuffling, potentially departing from + // linear row-major or column-major storage. See KernelLayout. + KernelLayout kernel; +}; + +// Dynamic representation for a type. +// +// The most important field in this struct is the size, which Ruy uses to know +// how much memory to allocate without having to be templated on a type. +// Signed-ness and floating-point-ness are mainly present as debugging checks. +// +// Note: Ruy does not use this struct to to dynamically dispatch between +// different typed implementations. As described in the comment at the top of +// this file, Ruy's "front-end", which is templated, instantiates all the +// necessary "back-end" routines with complete static knowledge of all the +// types. +struct Type { + template + static Type Create() { + Type ret; + ret.is_signed = std::is_signed::value; + ret.is_floating_point = std::is_floating_point::value; + ret.size = sizeof(T); + return ret; + } + + template + void AssertIs() const { + RUY_DCHECK(is_signed == Create().is_signed); + RUY_DCHECK(is_floating_point == Create().is_floating_point); + RUY_DCHECK(size == Create().size); + } + + bool is_signed = false; + bool is_floating_point = false; + std::uint8_t size = 0; +}; + +// Type-erased matrix. +struct DMatrix { + Type data_type; + void* data = nullptr; + Layout layout; + std::int32_t zero_point = 0; +}; + +// Type-erased packed matrix. +struct PMatrix { + Type data_type; + void* data = nullptr; + Type sums_type; + void* sums = nullptr; + PackedLayout layout; + std::int32_t zero_point = 0; +}; + +// Convenient typed helper for packed matrices. +template +struct PackedMatrix { + // The row/column sums needed for quantized matrix multiplication when + // the opposite operand of the multiplication uses a non-symmetric zero + // point. + // This member is only relevant for packed matrices. + // Additionally, Ruy always uses 32-bit signed accumulators for quantized + // matrix multiplication. + // For floating point types, there is no quantization, so this pointer + // will always be null. We still need code referencing it to compile + // though, even if it is always branched around. Hence we use Scalar* + // itself as the type in that case. 
+ using SumsType = + typename std::conditional::value, Scalar, + std::int32_t>::type; + + Scalar* data = nullptr; + SumsType* sums = nullptr; + PackedLayout layout; + std::int32_t zero_point = 0; +}; + +template +DMatrix ToDMatrix(const Matrix& matrix) { + DMatrix ret; + ret.data_type = Type::Create(); + ret.data = ToVoidPtr(matrix.data.get()); + ret.layout = matrix.layout; + ret.zero_point = matrix.zero_point; + return ret; +} + +template +Matrix ToMatrix(const DMatrix& dmatrix) { + dmatrix.data_type.AssertIs(); + Matrix ret; + ret.data = static_cast(dmatrix.data); + ret.layout = dmatrix.layout; + ret.zero_point = dmatrix.zero_point; + return ret; +} + +template +PackedMatrix ToPackedMatrix(const PMatrix& pmatrix) { + using SumsType = typename PackedMatrix::SumsType; + pmatrix.data_type.AssertIs(); + pmatrix.sums_type.AssertIs(); + PackedMatrix ret; + ret.data = static_cast(pmatrix.data); + ret.sums = static_cast(pmatrix.sums); + ret.layout = pmatrix.layout; + ret.zero_point = pmatrix.zero_point; + return ret; +} + +// Helpers for Layout / PackedLayout. + +inline bool IsPacked(const Layout& layout) { + if (layout.order == Order::kColMajor) { + return layout.stride == layout.rows; + } else { + return layout.stride == layout.cols; + } +} + +inline bool IsRowMajor(const Layout& layout) { + return layout.order == Order::kRowMajor; +} + +template +inline bool IsColMajor(const LayoutOrPackedLayout& layout) { + return layout.order == Order::kColMajor; +} + +template +inline int FlatSize(const LayoutOrPackedLayout& layout) { + const int outerdim = + layout.order == Order::kColMajor ? layout.cols : layout.rows; + return layout.stride * outerdim; +} + +// TODO(b/130417400) add a unit test +inline int Offset(const Layout& layout, int row, int col) { + // TODO(benoitjacob) - should check this but this make the _slow tests take + // 5x longer. Find a mitigation like in Eigen with an 'internal' variant + // bypassing the check? + // RUY_DCHECK_GE(row, 0); + // RUY_DCHECK_GE(col, 0); + // RUY_DCHECK_LT(row, layout.rows); + // RUY_DCHECK_LT(col, layout.cols); + int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride; + int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride; + return row * row_stride + col * col_stride; +} + +// TODO(b/130417400) add a unit test +inline int Offset(const PackedLayout& layout, int row, int col) { + RUY_DCHECK(is_pot(layout.kernel.rows)); + RUY_DCHECK(is_pot(layout.kernel.cols)); + int row_outer = row & ~(layout.kernel.rows - 1); + int col_outer = col & ~(layout.kernel.cols - 1); + int row_stride_outer = + layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride; + int col_stride_outer = + layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride; + int offset_outer = + row_outer * row_stride_outer + col_outer * col_stride_outer; + int row_inner = row - row_outer; + int col_inner = col - col_outer; + int row_stride_inner = + layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols; + int col_stride_inner = + layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows; + int offset_inner = + row_inner * row_stride_inner + col_inner * col_stride_inner; + return offset_outer + offset_inner; +} + +// Helpers for Matrix. 
+ +template +const Scalar* ElementPtr(const Matrix& mat, int row, int col) { + return mat.data.get() + Offset(mat.layout, row, col); +} + +template +Scalar* ElementPtr(Matrix* mat, int row, int col) { + return mat->data.get() + Offset(mat->layout, row, col); +} + +template +Scalar Element(const Matrix& mat, int row, int col) { + return *ElementPtr(mat, row, col); +} + +// Helpers for PackedMatrix. +// Duplicated from Matrix, but the duplication seems acceptable. + +template +const Scalar* ElementPtr(const PackedMatrix& mat, int row, int col) { + return mat.data + Offset(mat.layout, row, col); +} + +template +Scalar* ElementPtr(PackedMatrix* mat, int row, int col) { + return mat->data + Offset(mat->layout, row, col); +} + +template +Scalar Element(const PackedMatrix& mat, int row, int col) { + return *ElementPtr(mat, row, col); +} + +// Helpers for PMatrix. + +inline std::size_t DataSize(const PMatrix& packed) { + return FlatSize(packed.layout) * packed.data_type.size; +} + +inline std::size_t SumsSize(const PMatrix& packed) { + // Packed matrices are only relevant for Ruy's TrMul implementations. For + // TrMul, the number of sums is always equal to the number of columns. + return packed.layout.cols * packed.sums_type.size; +} + +// Transpose helpers. + +inline void Transpose(Order* order) { + *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; +} + +inline void Transpose(Layout* layout) { + Transpose(&layout->order); + std::swap(layout->rows, layout->cols); +} + +template +inline void Transpose(Matrix* matrix) { + Transpose(&matrix->layout); +} + +// Helpers for KernelLayout. + +template +KernelLayout ToKernelLayout() { + KernelLayout ret; + ret.order = FixedKernelLayout::kOrder; + ret.rows = FixedKernelLayout::kRows; + ret.cols = FixedKernelLayout::kCols; + return ret; +} + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_ diff --git a/tensorflow/lite/experimental/ruy/kernel.h b/tensorflow/lite/experimental/ruy/kernel.h index b5804016e8d..01128348db7 100644 --- a/tensorflow/lite/experimental/ruy/kernel.h +++ b/tensorflow/lite/experimental/ruy/kernel.h @@ -21,7 +21,7 @@ limitations under the License. #include "fixedpoint/fixedpoint.h" #include "profiling/instrumentation.h" #include "tensorflow/lite/experimental/ruy/common.h" -#include "tensorflow/lite/experimental/ruy/matrix.h" +#include "tensorflow/lite/experimental/ruy/internal_matrix.h" #include "tensorflow/lite/experimental/ruy/opt_set.h" #include "tensorflow/lite/experimental/ruy/path.h" #include "tensorflow/lite/experimental/ruy/size_util.h" @@ -36,12 +36,12 @@ struct Kernel {}; template -void RunKernel( - const Kernel& kernel, - const Matrix& lhs, const Matrix& rhs, - const Spec& spec, int start_row, int start_col, int end_row, int end_col, - Matrix* dst) { +void RunKernelTyped(Tuning tuning, const PackedMatrix& lhs, + const PackedMatrix& rhs, const Spec& spec, + int start_row, int start_col, int end_row, int end_col, + Matrix* dst) { using Kernel = Kernel; + Kernel kernel(tuning); #if RUY_OPT_SET & RUY_OPT_FAT_KERNEL kernel.Run(lhs, rhs, spec, start_row, start_col, end_row, end_col, dst); #else @@ -57,6 +57,24 @@ void RunKernel( #endif } +// Main entry point for kernels. 
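+//
+// RunKernel un-type-erases its arguments (PMatrix -> PackedMatrix<T>,
+// DMatrix -> Matrix<T>, void* -> const Spec*) and forwards them to the
+// statically typed RunKernelTyped above. Since this signature does not depend
+// on the template parameters, a pointer to it can be stored in
+// TrMulParams::run_kernel (see RunKernelFn below).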
+template +void RunKernel(Tuning tuning, const PMatrix& lhs, const PMatrix& rhs, + void* spec, int start_row, int start_col, int end_row, + int end_col, DMatrix* dst) { + Matrix mdst = ToMatrix(*dst); + RunKernelTyped( + tuning, ToPackedMatrix(lhs), ToPackedMatrix(rhs), + *static_cast(spec), start_row, start_col, end_row, end_col, + &mdst); +} + +// The signature of RunKernel is the same, regardless of template parameters. +using RunKernelFn = + decltype(RunKernel>); + // Copied from TF Lite code. inline std::int32_t MultiplyByQuantizedMultiplier( std::int32_t x, std::int32_t quantized_multiplier, int shift) { @@ -118,16 +136,17 @@ struct Kernel { using LhsLayout = FixedKernelLayout; using RhsLayout = FixedKernelLayout; explicit Kernel(Tuning) {} - void Run(const Matrix& lhs, const Matrix& rhs, - const Spec& spec, int start_row, int start_col, int end_row, - int end_col, Matrix* dst) const { + void Run(const PackedMatrix& lhs, + const PackedMatrix& rhs, const Spec& spec, int start_row, + int start_col, int end_row, int end_col, + Matrix* dst) const { gemmlowp::ScopedProfilingLabel label("Kernel (Standard Cpp)"); const int depth = lhs.layout.rows; for (int i = start_row; i < end_row; i++) { for (int j = start_col; j < end_col; j++) { using AccumScalar = typename Spec::AccumScalar; AccumScalar accum = 0; - for (int k = 0; k < lhs.layout.rows; k++) { + for (int k = 0; k < depth; k++) { AccumScalar lhs_val = Element(lhs, k, i); AccumScalar rhs_val = Element(rhs, k, j); accum += lhs_val * rhs_val; @@ -136,10 +155,10 @@ struct Kernel { accum += spec.bias[i]; } if (lhs.zero_point) { - accum -= lhs.zero_point * rhs.sums.get()[j]; + accum -= lhs.zero_point * rhs.sums[j]; } if (rhs.zero_point) { - accum -= rhs.zero_point * lhs.sums.get()[i]; + accum -= rhs.zero_point * lhs.sums[i]; } if (lhs.zero_point && rhs.zero_point) { accum += lhs.zero_point * rhs.zero_point * depth; @@ -233,8 +252,8 @@ struct KernelParams8bit { }; template -void MakeKernelParams8bit(const Matrix& lhs, - const Matrix& rhs, +void MakeKernelParams8bit(const PackedMatrix& lhs, + const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst, @@ -249,20 +268,20 @@ void MakeKernelParams8bit(const Matrix& lhs, RUY_DCHECK_EQ(end_row % LhsCols, 0); RUY_DCHECK_EQ(end_col % RhsCols, 0); - params->lhs_base_ptr = lhs.data.get() + start_row * lhs.layout.stride; - params->rhs_base_ptr = rhs.data.get() + start_col * rhs.layout.stride; + params->lhs_base_ptr = lhs.data + start_row * lhs.layout.stride; + params->rhs_base_ptr = rhs.data + start_col * rhs.layout.stride; params->flags = 0; params->bias = params->zero_data; if (spec.bias) { params->bias = spec.bias; params->flags |= RUY_ASM_FLAG_HAS_BIAS; } - if (lhs.sums.get()) { - params->lhs_sums = lhs.sums.get(); + if (lhs.sums) { + params->lhs_sums = lhs.sums; params->flags |= RUY_ASM_FLAG_HAS_LHS_SUMS; } - if (rhs.sums.get()) { - params->rhs_sums = rhs.sums.get(); + if (rhs.sums) { + params->rhs_sums = rhs.sums; params->flags |= RUY_ASM_FLAG_HAS_RHS_SUMS; } params->start_row = start_row; @@ -314,7 +333,8 @@ struct Kernel; Tuning tuning = Tuning::kAuto; explicit Kernel(Tuning tuning_) : tuning(tuning_) {} - void Run(const Matrix& lhs, const Matrix& rhs, + void Run(const PackedMatrix& lhs, + const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst) const { @@ -336,7 +356,8 @@ struct Kernel; using RhsLayout = FixedKernelLayout; explicit Kernel(Tuning tuning_) : tuning(tuning_) {} - 
void Run(const Matrix& lhs, const Matrix& rhs, + void Run(const PackedMatrix& lhs, + const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst) const { @@ -375,8 +396,8 @@ struct KernelParamsFloat { }; template -inline void MakeKernelParamsFloat(const Matrix& lhs, - const Matrix& rhs, +inline void MakeKernelParamsFloat(const PackedMatrix& lhs, + const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst, @@ -389,8 +410,8 @@ inline void MakeKernelParamsFloat(const Matrix& lhs, RUY_DCHECK_EQ(end_row % LhsCols, 0); RUY_DCHECK_EQ(end_col % RhsCols, 0); - params->lhs_base_ptr = lhs.data.get() + start_row * lhs.layout.stride; - params->rhs_base_ptr = rhs.data.get() + start_col * rhs.layout.stride; + params->lhs_base_ptr = lhs.data + start_row * lhs.layout.stride; + params->rhs_base_ptr = rhs.data + start_col * rhs.layout.stride; params->dst_base_ptr = dst->data.get() + start_col * dst->layout.stride + start_row; @@ -428,7 +449,7 @@ struct Kernel> { using LhsLayout = FixedKernelLayout; using RhsLayout = FixedKernelLayout; explicit Kernel(Tuning tuning_) : tuning(tuning_) {} - void Run(const Matrix& lhs, const Matrix& rhs, + void Run(const PackedMatrix& lhs, const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst) const { KernelParamsFloat params; @@ -451,7 +472,7 @@ struct Kernel> using Base = Kernel>; explicit Kernel(Tuning tuning_) : Base(tuning_) {} - void Run(const Matrix& lhs, const Matrix& rhs, + void Run(const PackedMatrix& lhs, const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst) const { KernelParamsFloat params; diff --git a/tensorflow/lite/experimental/ruy/matrix.h b/tensorflow/lite/experimental/ruy/matrix.h index e7cf4a6bb9e..49b7c1df14a 100644 --- a/tensorflow/lite/experimental/ruy/matrix.h +++ b/tensorflow/lite/experimental/ruy/matrix.h @@ -27,17 +27,6 @@ namespace ruy { // 'column-major' means that each column is contiguous in memory. enum class Order : std::uint8_t { kColMajor, kRowMajor }; -// KernelLayout describes small-scale block structure in a matrix layout. -// The default (rows = 1, cols = 1) means no such small-scale block structure, -// since 1x1 blocks is the same as no blocks. In that case, the overall -// matrix layout is just the usual linear row-major or column-major layout -// described by the other members of struct Layout. -struct KernelLayout final { - Order order = Order::kColMajor; - std::uint8_t rows = 1; - std::uint8_t cols = 1; -}; - // Describes the shape and storage layout of a matrix. struct Layout final { std::int32_t rows = 0; @@ -46,10 +35,6 @@ struct Layout final { // in the non-contiguous direction. std::int32_t stride = 0; Order order = Order::kColMajor; - - // Small scale layout shuffling, potentially departing from - // linear row-major or column-major storage. See KernelLayout. - KernelLayout kernel; }; namespace detail { @@ -110,16 +95,12 @@ class ConstCheckingPtr final { // signed or unsigned. template struct Matrix final { - void operator=(const Matrix& other) { data = other.data; layout = other.layout; zero_point = other.zero_point; } - private: - - public: // The underlying buffer wrapped by this matrix. detail::ConstCheckingPtr data; // The shape and data layout of this matrix. @@ -127,22 +108,15 @@ struct Matrix final { // The zero_point, i.e. which Scalar value is to be interpreted as zero. 
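In the quantized paths, zero_point is the quantization zero point: the stored Scalar value that stands for real 0. A minimal operand-setup sketch, assuming the usual pointer assignment on Matrix::data and using the MakeSimpleLayout helper this patch moves into matrix.h (shown just below); the sizes and the offset of 128 are purely illustrative:

#include <cstdint>
#include <vector>

#include "tensorflow/lite/experimental/ruy/matrix.h"

// Illustrative only: an asymmetric uint8 LHS whose stored value 128 means 0.
void SetUpQuantizedLhs(int rows, int depth, std::vector<std::uint8_t>* storage,
                       ruy::Matrix<std::uint8_t>* lhs) {
  storage->resize(rows * depth);
  ruy::MakeSimpleLayout(rows, depth, ruy::Order::kRowMajor, &lhs->layout);
  lhs->data = storage->data();
  lhs->zero_point = 128;
}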
// When Scalar is floating-point, this must be 0. Scalar zero_point = 0; - // The row/column sums needed for quantized matrix multiplication when - // the opposite operand of the multiplication uses a non-symmetric zero - // point. - // This member is only relevant for packed matrices. - // Additionally, Ruy always uses 32-bit signed accumulators for quantized - // matrix multiplication. - // For floating point types, there is no quantization, so this pointer - // will always be null. We still need code referencing it to compile - // though, even if it is always branched around. Hence we use Scalar* - // itself as the type in that case. - using SumsType = - typename std::conditional::value, Scalar, - std::int32_t>::type; - detail::ConstCheckingPtr sums; }; +inline void MakeSimpleLayout(int rows, int cols, Order order, Layout* layout) { + layout->rows = rows; + layout->cols = cols; + layout->order = order; + layout->stride = order == Order::kColMajor ? rows : cols; +} + template StreamType& operator<<(StreamType& stream, const Matrix& mat) { for (int row = 0; row < mat.layout.rows; row++) { diff --git a/tensorflow/lite/experimental/ruy/pack.h b/tensorflow/lite/experimental/ruy/pack.h index 65b1a1f594c..753a202e69e 100644 --- a/tensorflow/lite/experimental/ruy/pack.h +++ b/tensorflow/lite/experimental/ruy/pack.h @@ -20,6 +20,7 @@ limitations under the License. #include "profiling/instrumentation.h" #include "tensorflow/lite/experimental/ruy/common.h" +#include "tensorflow/lite/experimental/ruy/internal_matrix.h" #include "tensorflow/lite/experimental/ruy/opt_set.h" #include "tensorflow/lite/experimental/ruy/tune.h" @@ -63,11 +64,11 @@ template { static void Run(Tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, + PackedMatrix* packed_matrix, int start_col, int end_col) { gemmlowp::ScopedProfilingLabel label("Pack (generic)"); RUY_DCHECK_EQ((end_col - start_col) % FixedKernelLayout::kCols, 0); - SumsType* sums = packed_matrix->sums.get(); + SumsType* sums = packed_matrix->sums; for (int col = start_col; col < end_col; col++) { SumsType accum = 0; for (int row = 0; row < packed_matrix->layout.rows; row++) { @@ -129,12 +130,12 @@ struct PackImpl, Scalar, std::is_same::value ? 0 : 0x80; static void Run(Tuning tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, + PackedMatrix* packed_matrix, int start_col, int end_col) { - RUY_DCHECK(IsLinearColMajor(src_matrix.layout)); + RUY_DCHECK(IsColMajor(src_matrix.layout)); RUY_DCHECK(IsColMajor(packed_matrix->layout)); RUY_DCHECK_EQ(start_col % 4, 0); - std::int32_t* sums = packed_matrix->sums.get(); + std::int32_t* sums = packed_matrix->sums; Scalar zerobuf[16]; memset(zerobuf, src_matrix.zero_point, sizeof(zerobuf)); for (int block_col = start_col; block_col < end_col; block_col += 4) { @@ -166,7 +167,7 @@ struct PackImpl, Scalar, } } std::int8_t* packed_ptr = - packed_matrix->data.get() + packed_matrix->layout.stride * block_col; + packed_matrix->data + packed_matrix->layout.stride * block_col; std::int32_t* sums_ptr = sums ? sums + block_col : nullptr; if (__builtin_expect(tuning == Tuning::kInOrder, true)) { Pack8bitNeonInOrder( @@ -193,12 +194,12 @@ struct PackImpl, std::is_same::value ? 
0 : 0x80; static void Run(Tuning tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, + PackedMatrix* packed_matrix, int start_col, int end_col) { - RUY_DCHECK(IsLinearColMajor(src_matrix.layout)); + RUY_DCHECK(IsColMajor(src_matrix.layout)); RUY_DCHECK(IsColMajor(packed_matrix->layout)); RUY_DCHECK_EQ(start_col % 8, 0); - std::int32_t* sums = packed_matrix->sums.get(); + std::int32_t* sums = packed_matrix->sums; Scalar zerobuf[16]; memset(zerobuf, src_matrix.zero_point, sizeof(zerobuf)); for (int block_col = start_col; block_col < end_col; block_col += 4) { @@ -230,7 +231,7 @@ struct PackImpl, } } std::int8_t* packed_ptr = - packed_matrix->data.get() + + packed_matrix->data + packed_matrix->layout.stride * (block_col & ~7) + ((block_col & 4) * 4); std::int32_t* sums_ptr = sums ? sums + block_col : nullptr; @@ -264,8 +265,9 @@ template <> struct PackImpl, float, float, float> { static void Run(Tuning tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, int end_col) { - RUY_DCHECK(IsLinearColMajor(src_matrix.layout)); + PackedMatrix* packed_matrix, int start_col, + int end_col) { + RUY_DCHECK(IsColMajor(src_matrix.layout)); RUY_DCHECK(IsColMajor(packed_matrix->layout)); RUY_DCHECK_EQ(start_col % 8, 0); const float zerobuf[4] = {0}; @@ -297,7 +299,7 @@ struct PackImpl, float, src_inc3 = 0; } } - float* packed_ptr = packed_matrix->data.get() + + float* packed_ptr = packed_matrix->data + packed_matrix->layout.stride * (block_col & ~7) + ((block_col & 4)); if (__builtin_expect(tuning == Tuning::kInOrder, true)) { @@ -317,15 +319,24 @@ struct PackImpl, float, #endif // (defined __aarch64__) && (RUY_OPT_SET & RUY_OPT_ASM) +// Main entry point for packing. template -void Pack(Tuning tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, int end_col) { - using SumsType = typename Matrix::SumsType; +void RunPack(Tuning tuning, const DMatrix& src_matrix, PMatrix* packed_matrix, + int start_col, int end_col) { + using SumsType = typename PackedMatrix::SumsType; + Matrix src = ToMatrix(src_matrix); + PackedMatrix packed = + ToPackedMatrix(*packed_matrix); PackImpl::Run( - tuning, src_matrix, packed_matrix, start_col, end_col); + tuning, src, &packed, start_col, end_col); } +// The signature of RunPack is the same, regardless of its template parameters. +using RunPackFn = decltype( + RunPack, + std::int8_t, std::int8_t>); + } // namespace ruy #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_H_ diff --git a/tensorflow/lite/experimental/ruy/path.h b/tensorflow/lite/experimental/ruy/path.h index 2589c19f95d..15f4755cfad 100644 --- a/tensorflow/lite/experimental/ruy/path.h +++ b/tensorflow/lite/experimental/ruy/path.h @@ -44,31 +44,12 @@ namespace ruy { // at runtime; then, typically in dispatch.h, we internally pick one // specific path and from there on, internal Ruy code deals with only one // path. -// -// When a user selects a set of compiled paths, Ruy internally dispatches to the -// "best" one, which typically means the newest optimized instructions for a -// given base architecture (such as ARM). Higher values of this enum correspond -// to "better" code paths within a given base architecture for which Ruy has -// optimized code paths. enum class Path : std::uint8_t { - // This is a special null value, representing the absence of any path. + // Higher values have higher precedence. kNone = 0, - // Reference multiplication code. - // The main purpose of this path is to have a very simple standalone Mul - // implementation to check against. 
- // This path bypasses almost all of Ruy's internal implementation details. - // - // This is intended for testing/development. - kReference = 0x1, - // Standard C++ implementation of Ruy's architecture-specific parts. - // Unlike Path::kReference, this path exercises most of Ruy's internal logic. - // - // This is intended for testing/development. - kStandardCpp = 0x2, - // Optimized path using a widely available subset of ARM NEON instructions. + kReference = 0x1, // reference code. + kStandardCpp = 0x2, // Standard C++ only. No SIMD or other arch features. kNeon = 0x4, - // Optimized path making use of ARM NEON dot product instructions that are - // available on newer ARM cores. kNeonDotprod = 0x8, }; @@ -87,12 +68,14 @@ inline constexpr Path operator^(Path p, Path q) { static_cast(q)); } +inline constexpr Path operator~(Path p) { + return static_cast(~static_cast(p)); +} + inline Path GetMostSignificantPath(Path path_mask) { return static_cast(round_down_pot(static_cast(path_mask))); } -// ruy::kAllPaths represents all Path's that make sense to on a given -// base architecture. #ifdef __aarch64__ constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp | Path::kNeon | Path::kNeonDotprod; diff --git a/tensorflow/lite/experimental/ruy/ruy.h b/tensorflow/lite/experimental/ruy/ruy.h index 371576ebf5e..d9f88f6575a 100644 --- a/tensorflow/lite/experimental/ruy/ruy.h +++ b/tensorflow/lite/experimental/ruy/ruy.h @@ -25,16 +25,17 @@ limitations under the License. namespace ruy { -// Performs a multiplication of matrices. This is Ruy's only API entry point. -// Should be self-explanatory given the documentation for each of Matrix, -// Spec and Context. See the code for ReferenceMul in dispatch.h for a reference -// implementation. +// Performs a multiplication of matrices. This is Ruy's only API entry point. +// Should be self-explanatory given the above documentation for each of Matrix, +// Spec and Context. See reference code in reference.h, with the caveat that +// that is reference code for transpose-multiply (TrMul) not just multiply; +// see the translation between the two in transpose_dispatch.h. template void Mul(const Matrix& lhs, const Matrix& rhs, const Spec& spec, Context* context, Matrix* dst) { - MulDispatch dispatch; - dispatch.Mul(lhs, rhs, spec, context, dst); + DispatchMul( + lhs, rhs, spec, context, dst); } } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/spec.h b/tensorflow/lite/experimental/ruy/spec.h index ed48416cc06..2afe4604e5c 100644 --- a/tensorflow/lite/experimental/ruy/spec.h +++ b/tensorflow/lite/experimental/ruy/spec.h @@ -18,7 +18,6 @@ limitations under the License. #include #include -#include namespace ruy { @@ -38,16 +37,14 @@ enum class LoopStructure { kGeneral, kSimple, kAuto }; enum class ZeroPointSupport { kGeneral, kSymmetric }; // In general we allow all Layout's, even if we may use slow paths for some -// kinds of layouts. By choosing kPackedLinearRCC, one may opt out of this and +// kinds of layouts. By choosing kRCC, one may opt out of this and // only keep support for the simplest and most efficient combination of // Layout's, in exchange for smaller code size. 
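A Spec opts into this restricted support by overriding kLayoutSupport. A minimal sketch, mirroring the RCCSpec that the special-specs test later in this patch defines; the template parameters are written here as BasicSpec is commonly instantiated, and the exact storage-order combination that kRCC requires is spelled out in the rest of this comment:

#include "tensorflow/lite/experimental/ruy/spec.h"

// Sketch only: a spec that keeps just the RCC layout combination.
template <typename AccumScalar, typename DstScalar>
struct RCCOnlySpec : ruy::BasicSpec<AccumScalar, DstScalar> {
  static constexpr ruy::LayoutSupport kLayoutSupport = ruy::LayoutSupport::kRCC;
};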
The case covered by -// kPackedLinearRCC is that where all matrix layouts are linear (no sub-block -// structure), packed (no striding), and where the storage orders are exactly -// the following: +// kRCC is where the storage orders are exactly the following: // - LHS is RowMajor // - RHS is ColMajor // - Destination is ColMajor -enum class LayoutSupport { kGeneral, kPackedLinearRCC }; +enum class LayoutSupport { kGeneral, kRCC }; // A Spec describes all about a matrix multiplication operation that isn't // encoded in the LHS, RHS and destination matrices. Some of that information @@ -84,13 +81,9 @@ struct BasicSpec { // multiplier_fixedpoint_perchannel must be nullptr. const int* multiplier_exponent_perchannel = nullptr; // min clamp bound of destination values. - DstScalar clamp_min = std::is_floating_point::value - ? -std::numeric_limits::infinity() - : std::numeric_limits::lowest(); + DstScalar clamp_min = std::numeric_limits::lowest(); // max clamp bound of destination values. - DstScalar clamp_max = std::is_floating_point::value - ? std::numeric_limits::infinity() - : std::numeric_limits::max(); + DstScalar clamp_max = std::numeric_limits::max(); // See above enum LoopStructure static constexpr LoopStructure kLoopStructure = LoopStructure::kAuto; // See above enum LayoutSupport diff --git a/tensorflow/lite/experimental/ruy/test.h b/tensorflow/lite/experimental/ruy/test.h index dc1e52e579a..9d32d3dec47 100644 --- a/tensorflow/lite/experimental/ruy/test.h +++ b/tensorflow/lite/experimental/ruy/test.h @@ -297,17 +297,13 @@ void MakeRandomVector(RandomRange range, int size, std::vector* dst) { } } -enum class LayoutStyle { kPackedLinear, kLinear, kBlocked }; +enum class LayoutStyle { kPackedLinear, kLinear }; -void MakeLayout(int rows, int cols, int kernel_rows, int kernel_cols, - Order order, Order kernel_order, LayoutStyle layout_style, +void MakeLayout(int rows, int cols, Order order, LayoutStyle layout_style, Layout* layout) { layout->rows = rows; layout->cols = cols; layout->order = order; - layout->kernel.order = kernel_order; - layout->kernel.rows = kernel_rows; - layout->kernel.cols = kernel_cols; const int packed_stride = order == Order::kColMajor ? 
rows : cols; @@ -340,12 +336,10 @@ void VerifyConsistentFields(const StorageMatrix& storage_matrix) { } template -void MakeRandom(int rows, int cols, int kernel_rows, int kernel_cols, - Order order, Order kernel_order, Scalar zero_point, +void MakeRandom(int rows, int cols, Order order, Scalar zero_point, LayoutStyle layout_style, RandomRange range, StorageMatrix* storage_matrix) { - MakeLayout(rows, cols, kernel_rows, kernel_cols, order, kernel_order, - layout_style, &storage_matrix->matrix.layout); + MakeLayout(rows, cols, order, layout_style, &storage_matrix->matrix.layout); storage_matrix->matrix.zero_point = zero_point; UniformRandomDistribution data_dist(range); MakeRandomVector(&data_dist, FlatSize(storage_matrix->matrix.layout), @@ -443,13 +437,8 @@ struct TestSet final { int rows = 0; int cols = 0; int depth = 0; - int kernel_rows = 1; - int kernel_cols = 1; - int kernel_depth = 1; Order lhs_order = Order::kRowMajor; Order rhs_order = Order::kColMajor; - Order lhs_kernel_order = Order::kRowMajor; - Order rhs_kernel_order = Order::kColMajor; Order dst_order = Order::kColMajor; LayoutStyle layout_style = LayoutStyle::kPackedLinear; ExpectedOutcome expected_outcome = ExpectedOutcome::kSuccess; @@ -526,7 +515,6 @@ void EvalRuy(Path path, Tuning tuning, const Matrix& lhs, template void WrapGemmlowp(const Matrix& src, gemmlowp::MatrixMap* dst) { - RUY_CHECK(IsLinear(src.layout)); RUY_CHECK(src.layout.order == (tOrder == gemmlowp::MapOrder::ColMajor ? Order::kColMajor : Order::kRowMajor)); @@ -537,7 +525,6 @@ void WrapGemmlowp(const Matrix& src, template void WrapGemmlowpMutable(Matrix* src, gemmlowp::MatrixMap* dst) { - RUY_CHECK(IsLinear(src->layout)); RUY_CHECK(src->layout.order == (tOrder == gemmlowp::MapOrder::ColMajor ? Order::kColMajor : Order::kRowMajor)); @@ -706,9 +693,6 @@ template void EvalEigen(const Matrix& lhs, const Matrix& rhs, const Spec& spec, int max_num_threads, Matrix* dst) { - RUY_CHECK(IsLinear(lhs.layout)); - RUY_CHECK(IsLinear(rhs.layout)); - RUY_CHECK(IsLinear(dst->layout)); RUY_CHECK_EQ(lhs.zero_point, 0); RUY_CHECK_EQ(rhs.zero_point, 0); RUY_CHECK_EQ(dst->zero_point, 0); @@ -802,9 +786,9 @@ void EvalEigenTensor(const Matrix& lhs, const Matrix& rhs, RUY_CHECK_EQ(spec.multiplier_exponent, 0); // Eigen::TensorMap only supports packed layouts - RUY_CHECK(IsPackedLinear(lhs.layout)); - RUY_CHECK(IsPackedLinear(rhs.layout)); - RUY_CHECK(IsPackedLinear(dst->layout)); + RUY_CHECK(IsPacked(lhs.layout)); + RUY_CHECK(IsPacked(rhs.layout)); + RUY_CHECK(IsPacked(dst->layout)); using TensorLhsType = Eigen::TensorMap>; @@ -1433,11 +1417,9 @@ void TestSet::MakeZeroPoints() { template void TestSet::MakeLhsRhs() { RUY_CHECK(life_stage == LifeStage::kHasZeroPoints); - MakeRandom(rows, depth, kernel_rows, kernel_depth, lhs_order, - lhs_kernel_order, lhs_zero_point, layout_style, + MakeRandom(rows, depth, lhs_order, lhs_zero_point, layout_style, RandomRange::kAvoidMinValue, &lhs); - MakeRandom(depth, cols, kernel_depth, kernel_cols, rhs_order, - rhs_kernel_order, rhs_zero_point, layout_style, + MakeRandom(depth, cols, rhs_order, rhs_zero_point, layout_style, RandomRange::kGeneral, &rhs); life_stage = LifeStage::kHasLhsRhs; } @@ -1531,8 +1513,7 @@ void TestSet::MakeResultPaths() { using TestSetType = TestSet; - if (!getenv("NOEXT") && IsLinear(lhs.matrix.layout) && - IsLinear(rhs.matrix.layout)) { + if (!getenv("NOEXT")) { if (SupportsGemmlowp::kValue) { #ifdef GEMMLOWP_SSE4 const bool gemmlowp_supported = !spec.multiplier_fixedpoint_perchannel; @@ -1569,8 +1550,8 @@ void 
TestSet::MakeResultPaths() { TestResult& result = results.back(); result.path = path; result.tuning = tuning; - MakeRandom(rows, cols, 1, 1, dst_order, dst_order, dst_zero_point, - layout_style, RandomRange::kGeneral, &result.storage_matrix); + MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, + RandomRange::kGeneral, &result.storage_matrix); } } @@ -1578,8 +1559,8 @@ void TestSet::MakeResultPaths() { results.emplace_back(); TestResult& result = results.back(); result.external_path = external_path; - MakeRandom(rows, cols, 1, 1, dst_order, dst_order, dst_zero_point, - layout_style, RandomRange::kGeneral, &result.storage_matrix); + MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, + RandomRange::kGeneral, &result.storage_matrix); } life_stage = LifeStage::kHasResultPaths; @@ -1918,8 +1899,7 @@ void TestSet::Verify() { } template -void TestPackedLinearRCC(int rows, int depth, int cols, - ExpectedOutcome expected_outcome) { +void TestRCC(int rows, int depth, int cols, ExpectedOutcome expected_outcome) { TestSetType test_set; test_set.rows = rows; test_set.depth = depth; @@ -1933,9 +1913,23 @@ void TestPackedLinearRCC(int rows, int depth, int cols, } template -void TestPackedLinearRCC(int rows, int depth, int cols) { - TestPackedLinearRCC(rows, depth, cols, - ExpectedOutcome::kSuccess); +void TestRCC(int rows, int depth, int cols) { + TestRCC(rows, depth, cols, ExpectedOutcome::kSuccess); +} + +template +void TestNonRCC(int rows, int depth, int cols, + ExpectedOutcome expected_outcome) { + TestSetType test_set; + test_set.rows = rows; + test_set.depth = depth; + test_set.cols = cols; + test_set.lhs_order = Order::kColMajor; + test_set.rhs_order = Order::kColMajor; + test_set.dst_order = Order::kColMajor; + test_set.layout_style = LayoutStyle::kPackedLinear; + test_set.expected_outcome = expected_outcome; + test_set.Run(); } template @@ -1967,50 +1961,6 @@ void TestLinearAllOrders(int rows, int depth, int cols) { ExpectedOutcome::kSuccess); } -template -void TestNonLinearAllOrders(int rows, int depth, int cols, int kernel_rows, - int kernel_depth, int kernel_cols, - ExpectedOutcome expected_outcome) { - const std::vector orders{Order::kColMajor, Order::kRowMajor}; - - for (Order lhs_order : orders) { - for (Order rhs_order : orders) { - for (Order dst_order : orders) { - for (Order lhs_kernel_order : orders) { - for (Order rhs_kernel_order : orders) { - TestSetType test_set; - test_set.rows = rows; - test_set.depth = depth; - test_set.cols = cols; - test_set.kernel_rows = kernel_rows; - test_set.kernel_depth = kernel_depth; - test_set.kernel_cols = kernel_cols; - test_set.lhs_order = lhs_order; - test_set.rhs_order = rhs_order; - test_set.lhs_kernel_order = lhs_kernel_order; - test_set.rhs_kernel_order = rhs_kernel_order; - test_set.dst_order = dst_order; - test_set.layout_style = LayoutStyle::kLinear; - test_set.expected_outcome = expected_outcome; - test_set.Run(); - } - } - } - } - } -} - -template -void TestNonLinearAllOrders(int rows, int depth, int cols, int kernel_rows, - int kernel_depth, int kernel_cols) { - RUY_CHECK_EQ(rows % kernel_rows, 0); - RUY_CHECK_EQ(depth % kernel_depth, 0); - RUY_CHECK_EQ(cols % kernel_cols, 0); - TestNonLinearAllOrders(rows, depth, cols, kernel_rows, - kernel_depth, kernel_cols, - ExpectedOutcome::kSuccess); -} - } // namespace ruy #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_TEST_H_ diff --git a/tensorflow/lite/experimental/ruy/test_fast.cc b/tensorflow/lite/experimental/ruy/test_fast.cc index 06533a082e3..7026bca4616 100644 --- 
a/tensorflow/lite/experimental/ruy/test_fast.cc +++ b/tensorflow/lite/experimental/ruy/test_fast.cc @@ -56,7 +56,7 @@ TEST(RuyTest, TestSquareMuls) { }; for (int size : sizes) { - TestPackedLinearRCC(size, size, size); + TestRCC(size, size, size); TestLinearAllOrders(size, size, size); } } @@ -73,7 +73,7 @@ TEST(RuyTest, TestMiscMuls) { } TEST(RuyTest, TestDeepMuls) { - TestPackedLinearRCC(1, 50001, 1); + TestRCC(1, 50001, 1); TestLinearAllOrders(5, 5001, 4); TestLinearAllOrders(9, 1025, 10); } @@ -94,10 +94,4 @@ TEST(RuyTest, TestNarrowMuls) { } } -TEST(RuyTest, TestNonLinear) { - TestNonLinearAllOrders(10, 11, 12, 2, 1, 4); - TestNonLinearAllOrders(10, 12, 11, 2, 4, 1); - TestNonLinearAllOrders(8, 2, 4, 8, 2, 4); - TestNonLinearAllOrders(24, 32, 16, 8, 16, 4); -} } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/test_slow.cc b/tensorflow/lite/experimental/ruy/test_slow.cc index e19fb720cb4..0a10a163075 100644 --- a/tensorflow/lite/experimental/ruy/test_slow.cc +++ b/tensorflow/lite/experimental/ruy/test_slow.cc @@ -29,10 +29,10 @@ using TestSetType = TEST(RuyTest, TestBigNarrowMuls) { for (int width : {1, 2, 3, 4, 5, 8}) { - TestPackedLinearRCC(width, 401, 601); - TestPackedLinearRCC(587, 443, width); + TestRCC(width, 401, 601); + TestRCC(587, 443, width); } - TestPackedLinearRCC(512, 256, 16); + TestRCC(512, 256, 16); } TEST(RuyTest, TestBigShallowMuls) { @@ -42,7 +42,7 @@ TEST(RuyTest, TestBigShallowMuls) { } TEST(RuyTest, TestBigMuls) { - TestPackedLinearRCC(225, 303, 199); + TestRCC(225, 303, 199); TestLinearAllOrders(256, 192, 128); } diff --git a/tensorflow/lite/experimental/ruy/test_special_specs.cc b/tensorflow/lite/experimental/ruy/test_special_specs.cc index 9498e2bf192..5e1d8d980f5 100644 --- a/tensorflow/lite/experimental/ruy/test_special_specs.cc +++ b/tensorflow/lite/experimental/ruy/test_special_specs.cc @@ -32,9 +32,8 @@ struct ZeroPointSupportSpec : BasicSpec { }; template -struct PackedLinearRCCSpec : BasicSpec { - static constexpr LayoutSupport kLayoutSupport = - LayoutSupport::kPackedLinearRCC; +struct RCCSpec : BasicSpec { + static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC; }; using LhsScalar = RUY_TEST_LHSSCALAR; @@ -117,13 +116,11 @@ TEST(TestSpecialSpecs, ZeroPointSupport) { SymmetricZeroPoint() - 1, ExpectedOutcome::kDeath); } -TEST(TestSpecialSpecs, PackedLinearRCC) { - using PackedLinearRCCSpec = PackedLinearRCCSpec; - using PackedLinearRCCTestSet = - TestSet; - TestPackedLinearRCC(81, 93, 72); - TestLinearAllOrders(81, 93, 72, - ExpectedOutcome::kDeath); +TEST(TestSpecialSpecs, RCC) { + using RCCSpec = RCCSpec; + using RCCTestSet = TestSet; + TestRCC(81, 93, 72); + TestNonRCC(81, 93, 72, ExpectedOutcome::kDeath); } } // namespace ruy
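Taken together, the user-facing effect is that callers only ever deal with the simplified, purely linear Layout: set up each Matrix with MakeSimpleLayout, attach a data pointer, and call Mul. A minimal float sketch; the sizes and values are illustrative and the include set is assumed:

#include <vector>

#include "tensorflow/lite/experimental/ruy/context.h"
#include "tensorflow/lite/experimental/ruy/matrix.h"
#include "tensorflow/lite/experimental/ruy/path.h"
#include "tensorflow/lite/experimental/ruy/ruy.h"
#include "tensorflow/lite/experimental/ruy/spec.h"

// Multiplies a 4x3 row-major LHS by a 3x5 col-major RHS into a col-major dst.
void ExampleFloatMul(ruy::Context* context) {
  const int rows = 4, depth = 3, cols = 5;
  std::vector<float> lhs_data(rows * depth, 1.0f);
  std::vector<float> rhs_data(depth * cols, 1.0f);
  std::vector<float> dst_data(rows * cols, 0.0f);

  ruy::Matrix<float> lhs, rhs, dst;
  ruy::MakeSimpleLayout(rows, depth, ruy::Order::kRowMajor, &lhs.layout);
  ruy::MakeSimpleLayout(depth, cols, ruy::Order::kColMajor, &rhs.layout);
  ruy::MakeSimpleLayout(rows, cols, ruy::Order::kColMajor, &dst.layout);
  lhs.data = lhs_data.data();
  rhs.data = rhs_data.data();
  dst.data = dst_data.data();

  // Default spec: no bias, clamp bounds left at the destination type's limits.
  ruy::BasicSpec<float, float> spec;
  ruy::Mul<ruy::kAllPaths>(lhs, rhs, spec, context, &dst);
}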