From 4bbf04134f7f8d2de03b8494dd0d2d24b811d31e Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Tue, 30 Apr 2019 11:41:40 -0700 Subject: [PATCH] Detemplatize TrMul and introduce type-erased TrMulParams. The TrMulParams abstraction will allow us to cleanly implement pre-packing and packed matrix caching. See comment at the top of internal_matrix.h for more info about this change. One way to look at this CL is to walk through starting from ruy::Mul and see the new code structure. The change to more purpose-defined Matrix types also allowed separating Layout from the new PackedLayout. Layout (which is part of the user-facing Matrix) is now inherently linear (no kernel layout block structure), which allowed simplifying LoopStructure::kPackedLinearRCC to just "PackedRCC", but "packed" (which in this context means stride is equal to the inner dimension) is not something Ruy cares deeply about, which allowed simplifying to just kRCC. PiperOrigin-RevId: 245990070 --- tensorflow/lite/experimental/ruy/BUILD | 28 +- tensorflow/lite/experimental/ruy/allocator.h | 11 +- tensorflow/lite/experimental/ruy/benchmark.cc | 5 +- tensorflow/lite/experimental/ruy/common.h | 145 +------ tensorflow/lite/experimental/ruy/context.h | 7 + tensorflow/lite/experimental/ruy/dispatch.h | 293 +++++++++++--- tensorflow/lite/experimental/ruy/impl.h | 369 ++++++----------- .../lite/experimental/ruy/internal_matrix.h | 382 ++++++++++++++++++ tensorflow/lite/experimental/ruy/kernel.h | 77 ++-- tensorflow/lite/experimental/ruy/matrix.h | 40 +- tensorflow/lite/experimental/ruy/pack.h | 45 ++- tensorflow/lite/experimental/ruy/path.h | 31 +- tensorflow/lite/experimental/ruy/ruy.h | 13 +- tensorflow/lite/experimental/ruy/spec.h | 17 +- tensorflow/lite/experimental/ruy/test.h | 114 ++---- tensorflow/lite/experimental/ruy/test_fast.cc | 10 +- tensorflow/lite/experimental/ruy/test_slow.cc | 8 +- .../experimental/ruy/test_special_specs.cc | 17 +- 18 files changed, 935 insertions(+), 677 deletions(-) create mode 100644 tensorflow/lite/experimental/ruy/internal_matrix.h diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD index b2b35c26300..97ac38e998d 100644 --- a/tensorflow/lite/experimental/ruy/BUILD +++ b/tensorflow/lite/experimental/ruy/BUILD @@ -195,6 +195,17 @@ cc_library( deps = [":check_macros"], ) +cc_library( + name = "internal_matrix", + hdrs = ["internal_matrix.h"], + deps = [ + ":check_macros", + ":common", + ":matrix", + ":size_util", + ], +) + cc_library( name = "common", hdrs = [ @@ -205,7 +216,6 @@ cc_library( ":matrix", ":opt_set", ":path", - ":size_util", ], ) @@ -219,7 +229,7 @@ cc_library( ], deps = [ ":common", - ":matrix", + ":internal_matrix", ":opt_set", ":path", ":size_util", @@ -240,7 +250,7 @@ cc_library( ], deps = [ ":common", - ":matrix", + ":internal_matrix", ":opt_set", ":path", ":spec", @@ -256,11 +266,7 @@ cc_library( "dispatch.h", "impl.h", ], - hdrs = [ - "matrix.h", - "path.h", - "ruy.h", - ], + hdrs = ["ruy.h"], visibility = ruy_visibility(), deps = [ ":allocator", @@ -269,8 +275,10 @@ cc_library( ":common", ":context", ":kernel", + ":matrix", ":opt_set", ":pack", + ":path", ":size_util", ":spec", ":thread_pool", @@ -388,7 +396,3 @@ ruy_benchmark_opt_sets( "7ff", ], ) - -load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite") - -tflite_portable_test_suite() diff --git a/tensorflow/lite/experimental/ruy/allocator.h b/tensorflow/lite/experimental/ruy/allocator.h index 789731a505d..5edf6930866 100644 --- 
a/tensorflow/lite/experimental/ruy/allocator.h +++ b/tensorflow/lite/experimental/ruy/allocator.h @@ -146,12 +146,17 @@ class AlignedAllocator { // typed buffer. class Allocator { public: + void* AllocateBytes(std::size_t num_bytes) { + if (num_bytes == 0) { + return nullptr; + } + return aligned.AllocateAlignedBytes( + round_up_pot(num_bytes, detail::AlignedAllocator::kAlignment)); + } template void Allocate(std::size_t count, Pointer* out) { using T = typename std::pointer_traits::element_type; - std::size_t num_bytes = - round_up_pot(count * sizeof(T), detail::AlignedAllocator::kAlignment); - *out = static_cast(aligned.AllocateAlignedBytes(num_bytes)); + *out = static_cast(AllocateBytes(count * sizeof(T))); } void FreeAll() { aligned.FreeAll(); } diff --git a/tensorflow/lite/experimental/ruy/benchmark.cc b/tensorflow/lite/experimental/ruy/benchmark.cc index ccf7f5dbb54..55b02d24df9 100644 --- a/tensorflow/lite/experimental/ruy/benchmark.cc +++ b/tensorflow/lite/experimental/ruy/benchmark.cc @@ -36,8 +36,7 @@ struct BenchmarkShape { }; template -std::vector> BenchmarkPackedLinearRCC( - const BenchmarkShape& shape) { +std::vector> BenchmarkRCC(const BenchmarkShape& shape) { TestSetType test_set; test_set.rows = shape.rows; test_set.depth = shape.depth; @@ -104,7 +103,7 @@ void Benchmark() { for (int i = 0; i < shapes.size(); i++) { const auto& shape = shapes[i]; - const auto& results = BenchmarkPackedLinearRCC(shape); + const auto& results = BenchmarkRCC(shape); if (i == 0) { if (benchmark_cubic) { printf("size"); diff --git a/tensorflow/lite/experimental/ruy/common.h b/tensorflow/lite/experimental/ruy/common.h index 53ebbe955ec..3f6e8ac25f5 100644 --- a/tensorflow/lite/experimental/ruy/common.h +++ b/tensorflow/lite/experimental/ruy/common.h @@ -21,13 +21,11 @@ limitations under the License. #include #include #include -#include #include "tensorflow/lite/experimental/ruy/check_macros.h" #include "tensorflow/lite/experimental/ruy/matrix.h" #include "tensorflow/lite/experimental/ruy/opt_set.h" #include "tensorflow/lite/experimental/ruy/path.h" -#include "tensorflow/lite/experimental/ruy/size_util.h" #ifdef __aarch64__ #include @@ -44,111 +42,17 @@ limitations under the License. namespace ruy { -inline void MakeSimpleLayout(int rows, int cols, Order order, Layout* layout) { - layout->rows = rows; - layout->cols = cols; - layout->order = order; - layout->stride = order == Order::kColMajor ? 
rows : cols; - layout->kernel.order = order; - layout->kernel.rows = 1; - layout->kernel.cols = 1; -} - -inline bool IsLinear(const Layout& layout) { - return layout.kernel.rows == 1 && layout.kernel.cols == 1; -} - -inline bool IsPacked(const Layout& layout) { - if (layout.order == Order::kColMajor) { - return layout.stride == layout.rows; - } else { - return layout.stride == layout.cols; - } -} - -inline bool IsPackedLinear(const Layout& layout) { - return IsPacked(layout) && IsLinear(layout); -} - -inline bool IsRowMajor(const Layout& layout) { - return layout.order == Order::kRowMajor; -} - -inline bool IsColMajor(const Layout& layout) { - return layout.order == Order::kColMajor; -} - -inline bool IsLinearColMajor(const Layout& layout) { - return IsLinear(layout) && IsColMajor(layout); -} - -inline bool IsPackedLinearColMajor(const Layout& layout) { - return IsLinearColMajor(layout) && IsPacked(layout); -} - -inline bool IsLinearRowMajor(const Layout& layout) { - return IsLinear(layout) && IsRowMajor(layout); -} - -inline bool IsPackedLinearRowMajor(const Layout& layout) { - return IsLinearRowMajor(layout) && IsPacked(layout); -} - -inline int FlatSize(const Layout& layout) { - const int outerdim = - layout.order == Order::kColMajor ? layout.cols : layout.rows; - return layout.stride * outerdim; -} - -// TODO(b/130417400) add a unit test -inline int Offset(const Layout& layout, int row, int col) { - // TODO(benoitjacob) - should check this but this make the _slow tests take - // 5x longer. Find a mitigation like in Eigen with an 'internal' variant - // bypassing the check? - // RUY_DCHECK_GE(row, 0); - // RUY_DCHECK_GE(col, 0); - // RUY_DCHECK_LT(row, layout.rows); - // RUY_DCHECK_LT(col, layout.cols); - if (IsLinear(layout)) { - int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride; - int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride; - return row * row_stride + col * col_stride; - } else { - RUY_DCHECK(is_pot(layout.kernel.rows)); - RUY_DCHECK(is_pot(layout.kernel.cols)); - int row_outer = row & ~(layout.kernel.rows - 1); - int col_outer = col & ~(layout.kernel.cols - 1); - int row_stride_outer = - layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride; - int col_stride_outer = - layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride; - int offset_outer = - row_outer * row_stride_outer + col_outer * col_stride_outer; - int row_inner = row - row_outer; - int col_inner = col - col_outer; - int row_stride_inner = - layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols; - int col_stride_inner = - layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows; - int offset_inner = - row_inner * row_stride_inner + col_inner * col_stride_inner; - return offset_outer + offset_inner; - } -} - -template -const Scalar* ElementPtr(const Matrix& mat, int row, int col) { - return mat.data.get() + Offset(mat.layout, row, col); -} - -template -Scalar* ElementPtr(Matrix* mat, int row, int col) { - return mat->data.get() + Offset(mat->layout, row, col); -} - -template -Scalar Element(const Matrix& mat, int row, int col) { - return *ElementPtr(mat, row, col); +// Helper for type-erasing a pointer. +// +// Often inside Ruy, a template parameter holds type information statically, but +// we would like to have a function signature that doesn't depend on the +// template parameters, so that we can dispatch indirectly across multiple +// implementations. This helper is at the core of such type-erasure. 
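+//
+// For example, CreateTrMulParams in dispatch.h stores the user-provided Spec
+// as `params->spec = ToVoidPtr(&spec)`, and the type-erased RunKernel entry
+// point in kernel.h recovers the typed pointer before calling the statically
+// typed kernel implementation.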
+// +// The opposite of this operation is just `static_cast(void_ptr)`. +template +void* ToVoidPtr(T* p) { + return const_cast(static_cast(p)); } // We need this where we have multiple threads potentially writing concurrently @@ -176,33 +80,6 @@ Scalar SymmetricZeroPoint() { return std::numeric_limits::max() / 2 + 1; } -template -struct TrMulImpl; - -template -struct FixedKernelLayout { - static constexpr Order kOrder = tOrder; - static constexpr int kRows = tRows; - static constexpr int kCols = tCols; -}; - -inline void Transpose(Order* order) { - *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; -} - -inline void Transpose(Layout* layout) { - Transpose(&layout->order); - Transpose(&layout->kernel.order); - std::swap(layout->rows, layout->cols); - std::swap(layout->kernel.rows, layout->kernel.cols); -} - -template -inline void Transpose(Matrix* matrix) { - Transpose(&matrix->layout); -} - } // namespace ruy #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_COMMON_H_ diff --git a/tensorflow/lite/experimental/ruy/context.h b/tensorflow/lite/experimental/ruy/context.h index b768dad09a5..48b02f88de7 100644 --- a/tensorflow/lite/experimental/ruy/context.h +++ b/tensorflow/lite/experimental/ruy/context.h @@ -65,6 +65,13 @@ struct Context final { } } + template + Path GetPathToTake() { + last_taken_path = + GetMostSignificantPath(CompiledPaths & GetRuntimeEnabledPaths()); + return last_taken_path; + } + void SetRuntimeEnabledPaths(Path paths); Path GetRuntimeEnabledPaths(); diff --git a/tensorflow/lite/experimental/ruy/dispatch.h b/tensorflow/lite/experimental/ruy/dispatch.h index 3386e14226f..50bece5f41a 100644 --- a/tensorflow/lite/experimental/ruy/dispatch.h +++ b/tensorflow/lite/experimental/ruy/dispatch.h @@ -52,10 +52,10 @@ namespace ruy { template void EnforceLayoutSupport(const Layout& lhs_layout, const Layout& rhs_layout, const Layout& dst_layout) { - if (Spec::kLayoutSupport == LayoutSupport::kPackedLinearRCC) { - RUY_DCHECK(IsPackedLinearRowMajor(lhs_layout)); - RUY_DCHECK(IsPackedLinearColMajor(rhs_layout)); - RUY_DCHECK(IsPackedLinearColMajor(dst_layout)); + if (Spec::kLayoutSupport == LayoutSupport::kRCC) { + RUY_DCHECK(IsRowMajor(lhs_layout)); + RUY_DCHECK(IsColMajor(rhs_layout)); + RUY_DCHECK(IsColMajor(dst_layout)); } } @@ -84,21 +84,108 @@ void EnforceZeroPointSupport(LhsScalar lhs_zero_point, RhsScalar rhs_zero_point, CheckZeroPoint(dst_zero_point); } -// GetTrMulImplRunFn is implemented with template metaprogramming by mutual -// recursion between PathSearchCountdown and PathSearchCompiledPaths. +inline bool IsColMajorTrMul(const DMatrix& lhs, const DMatrix& rhs, + const DMatrix& dst) { + return IsColMajor(lhs.layout) && IsColMajor(rhs.layout) && + IsColMajor(dst.layout); +} + +inline void CreatePackedLayout(const Layout& src, const Type& scalar, + const KernelLayout& kernel_layout, + PackedLayout* packed) { + packed->order = Order::kColMajor; + packed->rows = round_up_pot(src.rows, kernel_layout.rows); + packed->cols = round_up_pot(src.cols, kernel_layout.cols); + packed->kernel = kernel_layout; + int inner_size = packed->rows; + if (RUY_OPT_SET & RUY_OPT_AVOID_ALIASING) { + packed->stride = + (inner_size * scalar.size) % 1024 ? 
inner_size : inner_size + 64; + } else { + packed->stride = inner_size; + } +} + +template +void CreatePackedMatrix(const DMatrix& src, const KernelLayout& kernel_layout, + PMatrix* packed) { + // Ruy always uses 32-bit signed accumulators for quantized + // matrix multiplication, so we would like to always use std::int32_t + // unconditionally for SumsType. + // However, for floating point types, we still need a reasonable type here to + // avoid tripping assertions elsewhere in the code. + using SumsType = + typename std::conditional::value, Scalar, + std::int32_t>::type; + + packed->data_type = Type::Create(); + packed->sums_type = Type::Create(); + CreatePackedLayout(src.layout, packed->data_type, kernel_layout, + &packed->layout); + packed->zero_point = Pack(src.zero_point); +} + +template +void PopulateTrMulParams(TrMulParams* params) { + static_assert((ThePath & Path::kReference) == Path::kNone, + "Path::kReference should not do TrMul"); + // The optimized code paths only handle a very specific set of layouts. + // Fall back to Path::kStandardCpp if needed. + if (ThePath != Path::kStandardCpp) { + if (!IsColMajorTrMul(params->lhs, params->rhs, params->dst)) { + PopulateTrMulParams(params); + return; + } + } + + using PackedLhsScalar = PackedType; + using PackedRhsScalar = PackedType; + using Kernel = + Kernel; + using LhsKernelLayout = typename Kernel::LhsLayout; + using RhsKernelLayout = typename Kernel::RhsLayout; + + CreatePackedMatrix( + params->lhs, ToKernelLayout(), ¶ms->packed_lhs); + CreatePackedMatrix( + params->rhs, ToKernelLayout(), ¶ms->packed_rhs); + + params->lhs_run_pack = + &RunPack; + params->rhs_run_pack = + &RunPack; + params->run_kernel = + &RunKernel; + return; +} + +// PopulateTrMulParamsAllCompiledPaths calls into one of multiple +// instantiations of PopulateTrMulParams. For each bit that is set in +// CompiledPaths, it statically instantiates PopulateTrMulParams with a Path +// corresponding to that single bit. The call to PopulateTrMulParams is +// guarded by a runtime check that it is in fact the dynamically selected path. // -// GetTrMulImplRunFn is logically implementing the following computation: +// PopulateTrMulParamsAllCompiledPaths is implemented with template +// metaprogramming by mutual recursion between PathSearchCountdown and +// PathSearchCompiledPaths. // -// decltype(&TrMulImpl<...>::Run) GetTrMulImplRunFn(Path single_path) { +// PopulateTrMulParamsAllCompiledPaths is logically implementing the following +// computation: +// +// template +// void PopulateTrMulParamsAllCompiledPaths(Path the_path, +// TrMulParams* params) { // for (int bit = 8 * sizeof(Path) - 1; bit != -1; bit--) { // [1] // Path current_path = static_cast(1 << bit); // if ((CompiledPaths & current_path) != Path::kNone) { // [2] -// if (current_path == single_path) { // [3] -// return &TrMulImpl::Run; +// if (current_path == the_path) { // [3] +// PopulateTrMulParams(the_path, params); +// return; // } // } // } -// return nullptr; // [4] // } // // @@ -110,15 +197,13 @@ void EnforceZeroPointSupport(LhsScalar lhs_zero_point, RhsScalar rhs_zero_point, // doing the whole computation at C++ compile time. // [3] - Done by the `if` in the main definition of // PathSearchOnlyCompiledPaths. -// [4] - Done by the partial specialization of PathSearchCountdown. // // The template metaprogramming is necessary because: -// - In `TrMulImpl::Run`, current_path must be a C++ +// - In `PopulateTrMulParams`, current_path must be a C++ // compile-time constant. 
-// - GetTrMulImplRunFn must not instantiate -// `TrMulImpl::Run` for paths that are not in -// CompiledPaths, since that can result in bogus instantiations which cause -// a compile time failure. +// - PopulateTrMulParamsAllCompiledPaths must not instantiate +// inner loops for paths that are not in CompiledPaths, since that can result in +// bogus instantiations which cause a compile time failure. template struct PathSearchCountdown; @@ -128,29 +213,25 @@ template struct PathSearchOnlyCompiledPaths { static constexpr Path kCurrentPath = static_cast(1 << BitNumber); - static decltype( - &TrMulImpl::Run) - Search(Path single_path) { - if (kCurrentPath == single_path) { - return &TrMulImpl::Run; + static void Search(Path the_path, TrMulParams* params) { + if (kCurrentPath == the_path) { + PopulateTrMulParams( + params); + return; } - return PathSearchCountdown::Search(single_path); + PathSearchCountdown::Search(the_path, params); } }; -// Skip instantiating TrMulImpl if CompiledPaths doesn't contain the -// specified path. +// Skip this iteration if CompiledPaths doesn't contain the specified path. template struct PathSearchOnlyCompiledPaths { - static decltype( - &TrMulImpl::Run) - Search(Path single_path) { - return PathSearchCountdown::Search(single_path); + static void Search(Path the_path, TrMulParams* params) { + PathSearchCountdown::Search(the_path, params); } }; @@ -158,12 +239,10 @@ template struct PathSearchCountdown { static constexpr Path kCurrentPath = static_cast(1 << BitNumber); - static decltype( - &TrMulImpl::Run) - Search(Path single_path) { - return PathSearchOnlyCompiledPaths< + static void Search(Path the_path, TrMulParams* params) { + PathSearchOnlyCompiledPaths< CompiledPaths, (CompiledPaths & kCurrentPath) != Path::kNone, BitNumber, - LhsScalar, RhsScalar, DstScalar, Spec>::Search(single_path); + LhsScalar, RhsScalar, DstScalar, Spec>::Search(the_path, params); } }; @@ -173,48 +252,132 @@ template struct PathSearchCountdown { - static decltype( - &TrMulImpl::Run) - Search(Path single_path) { - return nullptr; - } + static void Search(Path the_path, TrMulParams* params) { RUY_DCHECK(false); } }; template -decltype(&TrMulImpl::Run) -GetTrMulImplRunFn(Path single_path) { +void PopulateTrMulParamsAllCompiledPaths(Path the_path, TrMulParams* params) { return PathSearchCountdown::Search(single_path); + RhsScalar, DstScalar, Spec>::Search(the_path, + params); +} + +template +void CreateTrMulParams(const Matrix& lhs, + const Matrix& rhs, const Spec& spec, + Context* context, Matrix* dst, Path the_path, + TrMulParams* params) { + // Fill in the fields we already know. + params->lhs = ToDMatrix(lhs); + params->rhs = ToDMatrix(rhs); + params->dst = ToDMatrix(*dst); + params->spec = ToVoidPtr(&spec); + + // Create inner loops and packed matrices based on the Path. 
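+  // This fills in params->packed_lhs / params->packed_rhs and the
+  // lhs_run_pack / rhs_run_pack / run_kernel function pointers. For example,
+  // if CompiledPaths were just Path::kStandardCpp, only the
+  // Path::kStandardCpp instantiations of RunPack and RunKernel would be
+  // emitted and selected here.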
+ PopulateTrMulParamsAllCompiledPaths(the_path, params); +} + +template +void ReferenceMul(const Matrix& lhs, const Matrix& rhs, + const Spec& spec, Matrix* dst) { + gemmlowp::ScopedProfilingLabel label("ReferenceMul"); + for (int i = 0; i < lhs.layout.rows; i++) { + for (int j = 0; j < rhs.layout.cols; j++) { + using AccumScalar = typename Spec::AccumScalar; + AccumScalar accum = 0; + for (int k = 0; k < lhs.layout.cols; k++) { + AccumScalar lhs_val = Element(lhs, i, k); + AccumScalar rhs_val = Element(rhs, k, j); + accum += (lhs_val - lhs.zero_point) * (rhs_val - rhs.zero_point); + } + if (spec.bias) { + accum += spec.bias[i]; + } + ApplyMultiplier(spec, i, &accum); + accum += dst->zero_point; + accum = std::min(accum, spec.clamp_max); + accum = std::max(accum, spec.clamp_min); + *ElementPtr(dst, i, j) = static_cast(accum); + } + } +} + +// Compile-time dispatch to ReferenceMul. This allows us to statically ensure +// that there is no call to ReferenceMul in the user's binary. +template +struct CompileTimeEnabledReferenceMul { + template + static void Run(const Matrix& lhs, const Matrix& rhs, + const Spec& spec, Matrix* dst) { + ReferenceMul(lhs, rhs, spec, dst); + } +}; + +// When this partial specialization is chosen, it ensures that ReferenceMul +// is never compiled. +template <> +struct CompileTimeEnabledReferenceMul { + template + static void Run(const Matrix& lhs, const Matrix& rhs, + const Spec& spec, Matrix* dst) { + RUY_DCHECK(false); + } }; template -struct MulDispatch { - void Mul(const Matrix& lhs, const Matrix& rhs, - const Spec& spec, Context* context, Matrix* dst) { - gemmlowp::ScopedProfilingLabel label("Mul"); +void DispatchMul(const Matrix& lhs, const Matrix& rhs, + const Spec& spec, Context* context, Matrix* dst) { + static_assert(CompiledPaths != Path::kNone, "Must compile at least one Path"); + static_assert((CompiledPaths & ~kAllPaths) == Path::kNone, + "CompiledPaths must be a subset of ruy::kAllPaths"); - const Path runtime_enabled_paths = context->GetRuntimeEnabledPaths(); - // The above query should resolve to specific paths, never return kNone. - RUY_DCHECK(runtime_enabled_paths != Path::kNone); + gemmlowp::ScopedProfilingLabel label("Mul"); - Path single_path = - GetMostSignificantPath(CompiledPaths & runtime_enabled_paths); - auto tr_mul_impl_run_fn = - GetTrMulImplRunFn( - single_path); - context->last_taken_path = single_path; + EnforceLayoutSupport(lhs.layout, rhs.layout, dst->layout); + EnforceZeroPointSupport(lhs.zero_point, rhs.zero_point, + dst->zero_point); - EnforceLayoutSupport(lhs.layout, rhs.layout, dst->layout); - EnforceZeroPointSupport(lhs.zero_point, rhs.zero_point, - dst->zero_point); + // This should be a constant, for a given machine and CompiledPaths. + // There is a back door to override it for testing, but in production it will + // always be the "best" Path. I.e. the one with the newest SIMD instructions + // available on the present machine, and avoiding Path::kReference unless + // no other path is compiled. + // + // Unfortunately, it is not a *static* constant, since it depends on runtime + // detection of the available SIMD instructions. + Path the_path = context->GetPathToTake(); - Matrix lhs_copy(lhs); - Transpose(&lhs_copy); - tr_mul_impl_run_fn(lhs_copy, rhs, spec, context, dst); + // Production code should probably never execute Path::kReference. + // Path::kReference implements a Mul, not a TrMul like the rest of Ruy, so if + // that's what we need to do, then get it out of the way before going down the + // TrMul path. 
+ if (the_path == Path::kReference) { + constexpr bool ReferenceMulIsEnabled = + (CompiledPaths & Path::kReference) != Path::kNone; + CompileTimeEnabledReferenceMul::Run(lhs, rhs, spec, + dst); + return; } -}; + + // As described in the comment at the top of this file, Ruy internally + // converts Mul into TrMul. We handle that here. + // + // This is Ruy's main code path. + constexpr Path TrMulCompiledPaths = CompiledPaths & ~Path::kReference; + Matrix transposed_lhs(lhs); + Transpose(&transposed_lhs); + TrMulParams params; + CreateTrMulParams(transposed_lhs, rhs, spec, context, dst, + the_path, ¶ms); + TrMul(¶ms, context); +} } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/impl.h b/tensorflow/lite/experimental/ruy/impl.h index ee26b9687a9..edab51dec7a 100644 --- a/tensorflow/lite/experimental/ruy/impl.h +++ b/tensorflow/lite/experimental/ruy/impl.h @@ -17,7 +17,6 @@ limitations under the License. #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_IMPL_H_ #include -#include #include "profiling/instrumentation.h" #include "tensorflow/lite/experimental/ruy/allocator.h" @@ -25,6 +24,7 @@ limitations under the License. #include "tensorflow/lite/experimental/ruy/common.h" #include "tensorflow/lite/experimental/ruy/context.h" #include "tensorflow/lite/experimental/ruy/kernel.h" +#include "tensorflow/lite/experimental/ruy/matrix.h" #include "tensorflow/lite/experimental/ruy/opt_set.h" #include "tensorflow/lite/experimental/ruy/pack.h" #include "tensorflow/lite/experimental/ruy/thread_pool.h" @@ -33,31 +33,49 @@ limitations under the License. namespace ruy { -template -struct TrMulTask final : Task { - using AccumScalar = typename Spec::AccumScalar; - TrMulTask(const Matrix& lhs_, const Matrix& rhs_, - Matrix* packed_lhs_, - Matrix* packed_rhs_, Matrix* result_, - const BlockMap& block_map_, +// Type-erased data needed for implementing TrMul. +struct TrMulParams { + // Helper functions for invoking the function pointers. + void LhsRunPack(Tuning tuning, int start_c, int end_c) { + lhs_run_pack(tuning, lhs, &packed_lhs, start_c, end_c); + } + void RhsRunPack(Tuning tuning, int start_c, int end_c) { + rhs_run_pack(tuning, rhs, &packed_rhs, start_c, end_c); + } + void RunKernel(Tuning tuning, int start_r, int start_c, int end_r, + int end_c) { + run_kernel(tuning, packed_lhs, packed_rhs, spec, start_r, start_c, end_r, + end_c, &dst); + } + // Function pointers to type-erased entry points for kernels and packers. + RunPackFn* lhs_run_pack = nullptr; + RunPackFn* rhs_run_pack = nullptr; + RunKernelFn* run_kernel = nullptr; + + // Matrices and packed matrices. + DMatrix lhs; + DMatrix rhs; + DMatrix dst; + PMatrix packed_lhs; + PMatrix packed_rhs; + + // Type-erased Spec. 
+ void* spec = nullptr; +}; + +struct TrMulTask final : Task { + TrMulTask(TrMulParams* params_, const BlockMap& block_map_, std::atomic* atomic_n_, std::uint32_t thread_id_, std::atomic* lhs_packed_, std::atomic* rhs_packed_, - const Spec& spec_, TuningResolver* tuning_resolver_, - Allocator* local_allocator_, Trace* trace_) - : lhs(lhs_), - rhs(rhs_), - packed_lhs(packed_lhs_), - packed_rhs(packed_rhs_), - result(result_), + TuningResolver* tuning_resolver_, Allocator* local_allocator_, + Trace* trace_) + : params(params_), block_map(block_map_), atomic_n(atomic_n_), thread_id(thread_id_), lhs_packed(lhs_packed_), rhs_packed(rhs_packed_), - spec(spec_), tuning_resolver(tuning_resolver_), local_allocator(local_allocator_), trace(trace_) {} @@ -81,13 +99,7 @@ struct TrMulTask final : Task { memset(local_rhs_packed, 0, num_blocks_of_cols * sizeof(bool)); } - using Kernel = - Kernel; - using LhsKernelLayout = typename Kernel::RhsLayout; - using RhsKernelLayout = typename Kernel::RhsLayout; - const Tuning tuning = tuning_resolver->Resolve(); - Kernel kernel(tuning); TraceRecordThreadLoopStart(thread_id, trace); @@ -104,6 +116,7 @@ struct TrMulTask final : Task { GetBlockMatrixCoords(block_map, block_r, block_c, &start_r, &start_c, &end_r, &end_c); TraceRecordBlockCoordsComputed(n, trace); + while (n < num_blocks) { // Get index of next block to handle next_n = atomic_n->fetch_add(1, std::memory_order_relaxed); @@ -134,8 +147,7 @@ struct TrMulTask final : Task { // different contention with other processes. if (local_lhs_packed && !local_lhs_packed[block_r]) { if (!lhs_packed[block_r].load(std::memory_order_acquire)) { - Pack(tuning, lhs, packed_lhs, start_r, - end_r); + params->LhsRunPack(tuning, start_r, end_r); TraceRecordBlockPackedLhs(n, trace); local_lhs_packed[block_r] = true; lhs_packed[block_r].store(true, std::memory_order_release); @@ -144,16 +156,14 @@ struct TrMulTask final : Task { // Maybe pack the current RHS block. Same comments as above for LHS. 
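        // (The per-thread local_rhs_packed bitmask lets a thread avoid
        // re-checking the shared atomic flag for a block it has already seen
        // packed; the acquire load / release store pair on rhs_packed ensures
        // that a thread which skips packing still observes the packed data
        // written by whichever thread did pack it.)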
if (local_rhs_packed && !local_rhs_packed[block_c]) { if (!rhs_packed[block_c].load(std::memory_order_acquire)) { - Pack(tuning, rhs, packed_rhs, start_c, - end_c); + params->RhsRunPack(tuning, start_c, end_c); TraceRecordBlockPackedRhs(n, trace); local_rhs_packed[block_c] = true; rhs_packed[block_c].store(true, std::memory_order_release); } } // Actually do matrix multiplication work - RunKernel(kernel, *packed_lhs, *packed_rhs, spec, start_r, start_c, end_r, - end_c, result); + params->RunKernel(tuning, start_r, start_c, end_r, end_c); TraceRecordBlockFinished(n, trace); n = next_n; block_r = next_block_r; @@ -170,54 +180,20 @@ struct TrMulTask final : Task { } private: - const Matrix& lhs; - const Matrix& rhs; - Matrix* packed_lhs; - Matrix* packed_rhs; - - Matrix* result; + TrMulParams* params; const BlockMap& block_map; std::atomic* atomic_n; std::uint32_t thread_id; std::atomic* lhs_packed; std::atomic* rhs_packed; - const Spec& spec; TuningResolver* tuning_resolver; Allocator* local_allocator; Trace* trace; }; -template -void CreatePackedMatrix(Tuning tuning, const Matrix& src, - Allocator* allocator, - Matrix* packed) { - packed->zero_point = src.zero_point - SymmetricZeroPoint() + - SymmetricZeroPoint(); - packed->layout = src.layout; - packed->layout.order = Order::kColMajor; - packed->layout.rows = round_up_pot(src.layout.rows, FixedKernelLayout::kRows); - packed->layout.cols = round_up_pot(src.layout.cols, FixedKernelLayout::kCols); - packed->layout.kernel.order = FixedKernelLayout::kOrder; - packed->layout.kernel.rows = FixedKernelLayout::kRows; - packed->layout.kernel.cols = FixedKernelLayout::kCols; - int innersize = (packed->layout.order == Order::kColMajor) - ? packed->layout.rows - : packed->layout.cols; - int outersize = (packed->layout.order == Order::kColMajor) - ? packed->layout.cols - : packed->layout.rows; - if (RUY_OPT_SET & RUY_OPT_AVOID_ALIASING) { - if (tuning == Tuning::kInOrder) { - packed->layout.stride = - (innersize * sizeof(Scalar)) % 1024 ? innersize : innersize + 64; - } else { - packed->layout.stride = - (innersize * sizeof(Scalar)) % 4096 ? innersize : innersize + 64; - } - } else { - packed->layout.stride = innersize; - } - allocator->Allocate(outersize * packed->layout.stride, &packed->data); +inline void AllocatePMatrix(Allocator* allocator, PMatrix* packed) { + packed->data = allocator->AllocateBytes(DataSize(*packed)); + packed->sums = allocator->AllocateBytes(SumsSize(*packed)); } inline int GetThreadCount(Context* context, int rows, int cols, int depth) { @@ -228,12 +204,8 @@ inline int GetThreadCount(Context* context, int rows, int cols, int depth) { return clamp(guess, 1, context->max_num_threads); } -template LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth) { - if (Spec::kLoopStructure != LoopStructure::kAuto) { - return Spec::kLoopStructure; - } if (thread_count == 1 && (rows + cols) * depth < kCacheFriendlyLoopThreshold) { return LoopStructure::kSimple; @@ -249,180 +221,105 @@ inline Tuning GetTuning(Context* context) { return tuning_resolver->Resolve(); } -// General TrMulImpl definition. See the reference-code implementation given -// in the partial specialization below for ThePath==kReference. -template -struct TrMulImpl { - using AccumScalar = typename Spec::AccumScalar; - static void Run(const Matrix& lhs, const Matrix& rhs, - const Spec& spec, Context* context, Matrix* dst) { - // Fall back, if needed, to Path::kStandardCpp. 
- if (ThePath != Path::kStandardCpp) { - if (!IsLinear(lhs.layout) || !IsLinear(rhs.layout) || - !IsLinear(dst->layout) || lhs.layout.order != Order::kColMajor || - rhs.layout.order != Order::kColMajor || - dst->layout.order != Order::kColMajor) { - TrMulImpl::Run(lhs, rhs, spec, context, dst); - return; - } - } +void TrMul(TrMulParams* params, Context* context) { + gemmlowp::ScopedProfilingLabel label("TrMul"); - gemmlowp::ScopedProfilingLabel label("TrMulImpl"); - using PackedLhsScalar = PackedType; - using PackedRhsScalar = PackedType; - using Kernel = - Kernel; - using LhsKernelLayout = typename Kernel::LhsLayout; - using RhsKernelLayout = typename Kernel::RhsLayout; + PMatrix& packed_lhs = params->packed_lhs; + PMatrix& packed_rhs = params->packed_rhs; + DMatrix& lhs = params->lhs; + DMatrix& rhs = params->rhs; - const int rows = lhs.layout.cols; - const int cols = rhs.layout.cols; - const int depth = lhs.layout.rows; - const int rows_rounded_up = round_up_pot(rows, LhsKernelLayout::kCols); - const int cols_rounded_up = round_up_pot(cols, RhsKernelLayout::kCols); + const int rows = lhs.layout.cols; + const int cols = rhs.layout.cols; + const int depth = lhs.layout.rows; + const int rows_rounded_up = packed_lhs.layout.cols; + const int cols_rounded_up = packed_rhs.layout.cols; - int thread_count = GetThreadCount(context, rows, cols, depth); - const auto loop_structure = - GetLoopStructure(thread_count, rows, cols, depth); - const Tuning tuning = GetTuning(context); - Allocator* allocator = context->GetMainAllocator(); + int thread_count = GetThreadCount(context, rows, cols, depth); + const auto loop_structure = GetLoopStructure(thread_count, rows, cols, depth); + const Tuning tuning = GetTuning(context); + Allocator* allocator = context->GetMainAllocator(); + AllocatePMatrix(allocator, &packed_lhs); + AllocatePMatrix(allocator, &packed_rhs); - // The packed matrices. - Matrix packed_lhs; - Matrix packed_rhs; - const bool lhs_use_packing_sums = - Pack(rhs.zero_point) != 0; - const bool rhs_use_packing_sums = - Pack(lhs.zero_point) != 0; + if (loop_structure == LoopStructure::kSimple) { + gemmlowp::ScopedProfilingLabel label_simple("TrMulImpl, simple loop"); - // Allocate the packed matrices. - CreatePackedMatrix(tuning, lhs, allocator, &packed_lhs); - CreatePackedMatrix(tuning, rhs, allocator, &packed_rhs); - if (lhs_use_packing_sums) { - allocator->Allocate(rows_rounded_up, &packed_lhs.sums); - } - if (rhs_use_packing_sums) { - allocator->Allocate(cols_rounded_up, &packed_rhs.sums); - } - - if (loop_structure == LoopStructure::kSimple) { - gemmlowp::ScopedProfilingLabel label_simple("TrMulImpl, simple loop"); - - Pack(tuning, lhs, &packed_lhs, 0, - rows_rounded_up); - Pack(tuning, rhs, &packed_rhs, 0, - cols_rounded_up); - - Kernel kernel(tuning); - RunKernel(kernel, packed_lhs, packed_rhs, spec, 0, 0, rows_rounded_up, - cols_rounded_up, dst); - - allocator->FreeAll(); - return; - } - - gemmlowp::ScopedProfilingLabel label_general("TrMulImpl, general case"); - - auto* trace = NewTraceOrNull(&context->tracing, rows, depth, cols); - TraceRecordStart(trace); - - // Initialize block map. 
- BlockMap block_map; - MakeBlockMap(rows_rounded_up, cols_rounded_up, depth, - LhsKernelLayout::kCols, RhsKernelLayout::kCols, - sizeof(LhsScalar), sizeof(RhsScalar), &block_map); - std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map); - std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map); - std::uint32_t num_blocks = NumBlocks(block_map); - RUY_DCHECK_EQ(num_blocks, num_blocks_of_rows * num_blocks_of_cols); - - // Initialize per-thread state. - thread_count = clamp(thread_count, 1, num_blocks); - context->EnsureNPerThreadStates(thread_count); - for (auto& per_thread_state : context->per_thread_states) { - per_thread_state->tuning_resolver.SetTuning(context->explicit_tuning); - } - - // Allocate memory. - std::atomic* lhs_packed; - allocator->Allocate(num_blocks_of_rows, &lhs_packed); - std::atomic* rhs_packed; - allocator->Allocate(num_blocks_of_cols, &rhs_packed); - std::atomic* atomic_n; - allocator->Allocate(1, &atomic_n); - using TaskType = TrMulTask; - TaskType* tasks; - allocator->Allocate(thread_count, &tasks); - Task** tasks_ptrs; - allocator->Allocate(thread_count, &tasks_ptrs); - - // Initialize allocated data. - for (int i = 0; i < num_blocks_of_rows; i++) { - lhs_packed[i].store(false, std::memory_order_release); - } - for (int i = 0; i < num_blocks_of_cols; i++) { - rhs_packed[i].store(false, std::memory_order_release); - } - atomic_n->store(thread_count); - - for (int i = 0; i < thread_count; i++) { - tasks_ptrs[i] = static_cast(tasks + i); - new (tasks_ptrs[i]) - TaskType(lhs, rhs, &packed_lhs, &packed_rhs, dst, block_map, atomic_n, - i, lhs_packed, rhs_packed, spec, - &context->per_thread_states[i]->tuning_resolver, - &context->per_thread_states[i]->allocator, trace); - } - - // Do the computation. - TraceRecordExecute(trace); - TraceStartRecordingBlockAndThreadFields(block_map, thread_count, trace); - - context->workers_pool.Execute(thread_count, tasks_ptrs); - - // Finish up. - for (int i = 0; i < thread_count; i++) { - tasks[i].~TaskType(); - } - - TraceRecordEnd(trace); + params->LhsRunPack(tuning, 0, rows_rounded_up); + params->RhsRunPack(tuning, 0, cols_rounded_up); + params->RunKernel(tuning, 0, 0, rows_rounded_up, cols_rounded_up); allocator->FreeAll(); + return; } -}; -// Reference code for TrMul, doing a transpose-multiply: compute -// Destination = Transpose(LHS) * RHS -template -struct TrMulImpl { - static void Run(const Matrix& lhs, const Matrix& rhs, - const Spec& spec, Context*, Matrix* dst) { - gemmlowp::ScopedProfilingLabel label("TrMulImpl Reference"); - for (int i = 0; i < lhs.layout.cols; i++) { - for (int j = 0; j < rhs.layout.cols; j++) { - using AccumScalar = typename Spec::AccumScalar; - AccumScalar accum = 0; - for (int k = 0; k < lhs.layout.rows; k++) { - AccumScalar lhs_val = Element(lhs, k, i); - AccumScalar rhs_val = Element(rhs, k, j); - accum += (lhs_val - lhs.zero_point) * (rhs_val - rhs.zero_point); - } - if (spec.bias) { - accum += spec.bias[i]; - } - ApplyMultiplier(spec, i, &accum); - accum += dst->zero_point; - accum = std::min(accum, spec.clamp_max); - accum = std::max(accum, spec.clamp_min); - *ElementPtr(dst, i, j) = static_cast(accum); - } - } + gemmlowp::ScopedProfilingLabel label_general("TrMulImpl, general case"); + + auto* trace = NewTraceOrNull(&context->tracing, rows, depth, cols); + TraceRecordStart(trace); + + // Initialize block map. 
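+  // The block map tiles the rows_rounded_up x cols_rounded_up destination
+  // into blocks; worker threads grab block indices from the shared atomic
+  // counter below and pack/multiply the corresponding blocks (see
+  // TrMulTask::Run above).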
+ BlockMap block_map; + MakeBlockMap(rows_rounded_up, cols_rounded_up, depth, + packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols, + packed_lhs.data_type.size, packed_rhs.data_type.size, + &block_map); + std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map); + std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map); + std::uint32_t num_blocks = NumBlocks(block_map); + RUY_DCHECK_EQ(num_blocks, num_blocks_of_rows * num_blocks_of_cols); + + // Initialize per-thread state. + thread_count = clamp(thread_count, 1, num_blocks); + context->EnsureNPerThreadStates(thread_count); + for (auto& per_thread_state : context->per_thread_states) { + per_thread_state->tuning_resolver.SetTuning(context->explicit_tuning); } -}; + + // Allocate memory. + std::atomic* lhs_packed; + allocator->Allocate(num_blocks_of_rows, &lhs_packed); + std::atomic* rhs_packed; + allocator->Allocate(num_blocks_of_cols, &rhs_packed); + std::atomic* atomic_n; + allocator->Allocate(1, &atomic_n); + TrMulTask* tasks; + allocator->Allocate(thread_count, &tasks); + Task** tasks_ptrs; + allocator->Allocate(thread_count, &tasks_ptrs); + + // Initialize allocated data. + for (int i = 0; i < num_blocks_of_rows; i++) { + lhs_packed[i].store(false, std::memory_order_release); + } + for (int i = 0; i < num_blocks_of_cols; i++) { + rhs_packed[i].store(false, std::memory_order_release); + } + atomic_n->store(thread_count); + + for (int i = 0; i < thread_count; i++) { + tasks_ptrs[i] = static_cast(tasks + i); + new (tasks_ptrs[i]) + TrMulTask(params, block_map, atomic_n, i, lhs_packed, rhs_packed, + &context->per_thread_states[i]->tuning_resolver, + &context->per_thread_states[i]->allocator, trace); + } + + // Do the computation. + TraceRecordExecute(trace); + TraceStartRecordingBlockAndThreadFields(block_map, thread_count, trace); + + context->workers_pool.Execute(thread_count, tasks_ptrs); + + // Finish up. + for (int i = 0; i < thread_count; i++) { + tasks[i].~TrMulTask(); + } + + TraceRecordEnd(trace); + + allocator->FreeAll(); +} } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/internal_matrix.h b/tensorflow/lite/experimental/ruy/internal_matrix.h new file mode 100644 index 00000000000..9a7d6ee6938 --- /dev/null +++ b/tensorflow/lite/experimental/ruy/internal_matrix.h @@ -0,0 +1,382 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Internal types and helpers for matrices. +// +// Ruy has a couple slightly different notions of matrices, besides the +// Matrix class that we expose to the user-facing API. +// +// TODO(silvasean): Put parts of this architecture description somewhere more +// prominent. +// +// The 4 different matrix types are: +// - Matrix: This is a user-facing type on Ruy's external API boundary. +// - DMatrix: This is a type-erased version of Matrix. "D" = "dynamic". 
+// - PMatrix: This represents a packed matrix, which requires tracking kernel +// layout and row/column sums for quantization. It is type-erased. +// - PackedMatrix: This is a statically typed variant of PMatrix for +// convenience inside typed routines. +// +// Note that Matrix is *not* implemented in terms of the internal types. It +// is an independent, simple, and user-facing type. +// +// The use of type-erasure might seem surprising for a library like Ruy with a +// heavily-templated entry point, but it is motivated by the desire for most of +// Ruy's "middle-end" to be non-templated. Ruy can be thought of as having 3 +// main parts: +// - "front-end" (dispatch.h) - this is the highly templated ruy::Mul entry +// point, along with routines that select RunKernel and RunPack implementations +// statically based on those template parameters. +// - "back-end" (kernel.h, pack.h)- this consists of the implementations of +// RunKernel and RunPack, often in assembly code, which are the building blocks +// that Ruy calls to perform matrix multiplication. These are templated so that +// only the requested types/Path's are actually emitted by the compiler. +// - "middle-end" (impl.h) - this is the part of Ruy that orchestrates the +// calls to the "back-end" optimized building blocks. This layer has to deal +// with issues like cache locality and low-overhead multi-threading. +// +// There is a desire for the "middle-end" to be non-templated in order to +// simplify the implementation and reduce code-size. We type-erase when going +// from the "front-end" to the "middle-end", and un-type-erase going from the +// "middle-end" to the "back-end". The un-type-erasure is possible because the +// "front-end" is responsible for instantiating the needed "back-end" templates, +// and thus the static type information is still present. +// +// Each layer of Ruy uses matrix types: +// - "front-end": Matrix +// - "middle-end": DMatrix, PMatrix +// - "back-end": Matrix, PackedMatrix +// +// The use of separate types for packed matrices is not essential, but makes it +// obvious at a glance whether a matrix is a packed matrix or not. We would +// reconsider this decision if there was significant duplication between packed +// and unpacked matrices, but that doesn't seem to be the case at the moment. +// +// Another goal is to keep the user-facing Matrix as simple and +// understandable as possible. Ideally, a user should be able to read the struct +// definition for Matrix and see a very simple definition with no internal +// details like sums and kernel block layout. +// +// To present another structured view of our various matrix types, here's a +// table: +// User matrices Packed matrices +// +---------------------------------- +// Templated | Matrix PackedMatrix +// Type-erased | DMatrix PMatrix + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_ + +#include +#include + +#include "tensorflow/lite/experimental/ruy/common.h" +#include "tensorflow/lite/experimental/ruy/matrix.h" +#include "tensorflow/lite/experimental/ruy/size_util.h" + +namespace ruy { + +// KernelLayout describes small-scale block structure in a packed matrix layout. +// +// This is is sometimes known as "tiling" in other contexts. +// +// For example, consider a packed matrix in column-major format with a +// column-major KernelLayout. The matrix logically has a shape of +// `[cols, rows]`. 
However, the matrix is laid out as though it were a 4D array +// of shape `[cols / kcols, rows / krows, kcols, krows]`. +// +// Note that in the case of kcols=1, krows=1, this degenerates to +// `[cols, rows, 1, 1]` which is equivalent to having no small-scale block +// structure. +struct KernelLayout { + Order order = Order::kColMajor; + std::uint8_t rows = 1; + std::uint8_t cols = 1; +}; + +// Compile time version of KernelLayout, suitable for template metaprogramming. +// In particular, partial template specializations of Kernel use this type to +// statically declare their kernel format. +template +struct FixedKernelLayout { + static constexpr Order kOrder = tOrder; + static constexpr int kRows = tRows; + static constexpr int kCols = tCols; +}; + +// A packed matrix has a small-scale block structure that is not present in in +// the input matrices. This block structure is necessary for the kernels to +// process data efficiently. +// +// This struct is very similar to Layout, but has the extra KernelLayout field. +struct PackedLayout { + std::int32_t rows = 0; + std::int32_t cols = 0; + // Stride is the offset between two adjacent matrix elements + // in the non-contiguous direction. + std::int32_t stride = 0; + Order order = Order::kColMajor; + // Small scale layout shuffling, potentially departing from + // linear row-major or column-major storage. See KernelLayout. + KernelLayout kernel; +}; + +// Dynamic representation for a type. +// +// The most important field in this struct is the size, which Ruy uses to know +// how much memory to allocate without having to be templated on a type. +// Signed-ness and floating-point-ness are mainly present as debugging checks. +// +// Note: Ruy does not use this struct to to dynamically dispatch between +// different typed implementations. As described in the comment at the top of +// this file, Ruy's "front-end", which is templated, instantiates all the +// necessary "back-end" routines with complete static knowledge of all the +// types. +struct Type { + template + static Type Create() { + Type ret; + ret.is_signed = std::is_signed::value; + ret.is_floating_point = std::is_floating_point::value; + ret.size = sizeof(T); + return ret; + } + + template + void AssertIs() const { + RUY_DCHECK(is_signed == Create().is_signed); + RUY_DCHECK(is_floating_point == Create().is_floating_point); + RUY_DCHECK(size == Create().size); + } + + bool is_signed = false; + bool is_floating_point = false; + std::uint8_t size = 0; +}; + +// Type-erased matrix. +struct DMatrix { + Type data_type; + void* data = nullptr; + Layout layout; + std::int32_t zero_point = 0; +}; + +// Type-erased packed matrix. +struct PMatrix { + Type data_type; + void* data = nullptr; + Type sums_type; + void* sums = nullptr; + PackedLayout layout; + std::int32_t zero_point = 0; +}; + +// Convenient typed helper for packed matrices. +template +struct PackedMatrix { + // The row/column sums needed for quantized matrix multiplication when + // the opposite operand of the multiplication uses a non-symmetric zero + // point. + // This member is only relevant for packed matrices. + // Additionally, Ruy always uses 32-bit signed accumulators for quantized + // matrix multiplication. + // For floating point types, there is no quantization, so this pointer + // will always be null. We still need code referencing it to compile + // though, even if it is always branched around. Hence we use Scalar* + // itself as the type in that case. 
+ using SumsType = + typename std::conditional::value, Scalar, + std::int32_t>::type; + + Scalar* data = nullptr; + SumsType* sums = nullptr; + PackedLayout layout; + std::int32_t zero_point = 0; +}; + +template +DMatrix ToDMatrix(const Matrix& matrix) { + DMatrix ret; + ret.data_type = Type::Create(); + ret.data = ToVoidPtr(matrix.data.get()); + ret.layout = matrix.layout; + ret.zero_point = matrix.zero_point; + return ret; +} + +template +Matrix ToMatrix(const DMatrix& dmatrix) { + dmatrix.data_type.AssertIs(); + Matrix ret; + ret.data = static_cast(dmatrix.data); + ret.layout = dmatrix.layout; + ret.zero_point = dmatrix.zero_point; + return ret; +} + +template +PackedMatrix ToPackedMatrix(const PMatrix& pmatrix) { + using SumsType = typename PackedMatrix::SumsType; + pmatrix.data_type.AssertIs(); + pmatrix.sums_type.AssertIs(); + PackedMatrix ret; + ret.data = static_cast(pmatrix.data); + ret.sums = static_cast(pmatrix.sums); + ret.layout = pmatrix.layout; + ret.zero_point = pmatrix.zero_point; + return ret; +} + +// Helpers for Layout / PackedLayout. + +inline bool IsPacked(const Layout& layout) { + if (layout.order == Order::kColMajor) { + return layout.stride == layout.rows; + } else { + return layout.stride == layout.cols; + } +} + +inline bool IsRowMajor(const Layout& layout) { + return layout.order == Order::kRowMajor; +} + +template +inline bool IsColMajor(const LayoutOrPackedLayout& layout) { + return layout.order == Order::kColMajor; +} + +template +inline int FlatSize(const LayoutOrPackedLayout& layout) { + const int outerdim = + layout.order == Order::kColMajor ? layout.cols : layout.rows; + return layout.stride * outerdim; +} + +// TODO(b/130417400) add a unit test +inline int Offset(const Layout& layout, int row, int col) { + // TODO(benoitjacob) - should check this but this make the _slow tests take + // 5x longer. Find a mitigation like in Eigen with an 'internal' variant + // bypassing the check? + // RUY_DCHECK_GE(row, 0); + // RUY_DCHECK_GE(col, 0); + // RUY_DCHECK_LT(row, layout.rows); + // RUY_DCHECK_LT(col, layout.cols); + int row_stride = layout.order == Order::kColMajor ? 1 : layout.stride; + int col_stride = layout.order == Order::kRowMajor ? 1 : layout.stride; + return row * row_stride + col * col_stride; +} + +// TODO(b/130417400) add a unit test +inline int Offset(const PackedLayout& layout, int row, int col) { + RUY_DCHECK(is_pot(layout.kernel.rows)); + RUY_DCHECK(is_pot(layout.kernel.cols)); + int row_outer = row & ~(layout.kernel.rows - 1); + int col_outer = col & ~(layout.kernel.cols - 1); + int row_stride_outer = + layout.order == Order::kColMajor ? layout.kernel.cols : layout.stride; + int col_stride_outer = + layout.order == Order::kRowMajor ? layout.kernel.rows : layout.stride; + int offset_outer = + row_outer * row_stride_outer + col_outer * col_stride_outer; + int row_inner = row - row_outer; + int col_inner = col - col_outer; + int row_stride_inner = + layout.kernel.order == Order::kColMajor ? 1 : layout.kernel.cols; + int col_stride_inner = + layout.kernel.order == Order::kRowMajor ? 1 : layout.kernel.rows; + int offset_inner = + row_inner * row_stride_inner + col_inner * col_stride_inner; + return offset_outer + offset_inner; +} + +// Helpers for Matrix. 
+ +template +const Scalar* ElementPtr(const Matrix& mat, int row, int col) { + return mat.data.get() + Offset(mat.layout, row, col); +} + +template +Scalar* ElementPtr(Matrix* mat, int row, int col) { + return mat->data.get() + Offset(mat->layout, row, col); +} + +template +Scalar Element(const Matrix& mat, int row, int col) { + return *ElementPtr(mat, row, col); +} + +// Helpers for PackedMatrix. +// Duplicated from Matrix, but the duplication seems acceptable. + +template +const Scalar* ElementPtr(const PackedMatrix& mat, int row, int col) { + return mat.data + Offset(mat.layout, row, col); +} + +template +Scalar* ElementPtr(PackedMatrix* mat, int row, int col) { + return mat->data + Offset(mat->layout, row, col); +} + +template +Scalar Element(const PackedMatrix& mat, int row, int col) { + return *ElementPtr(mat, row, col); +} + +// Helpers for PMatrix. + +inline std::size_t DataSize(const PMatrix& packed) { + return FlatSize(packed.layout) * packed.data_type.size; +} + +inline std::size_t SumsSize(const PMatrix& packed) { + // Packed matrices are only relevant for Ruy's TrMul implementations. For + // TrMul, the number of sums is always equal to the number of columns. + return packed.layout.cols * packed.sums_type.size; +} + +// Transpose helpers. + +inline void Transpose(Order* order) { + *order = *order == Order::kColMajor ? Order::kRowMajor : Order::kColMajor; +} + +inline void Transpose(Layout* layout) { + Transpose(&layout->order); + std::swap(layout->rows, layout->cols); +} + +template +inline void Transpose(Matrix* matrix) { + Transpose(&matrix->layout); +} + +// Helpers for KernelLayout. + +template +KernelLayout ToKernelLayout() { + KernelLayout ret; + ret.order = FixedKernelLayout::kOrder; + ret.rows = FixedKernelLayout::kRows; + ret.cols = FixedKernelLayout::kCols; + return ret; +} + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_INTERNAL_MATRIX_H_ diff --git a/tensorflow/lite/experimental/ruy/kernel.h b/tensorflow/lite/experimental/ruy/kernel.h index b5804016e8d..01128348db7 100644 --- a/tensorflow/lite/experimental/ruy/kernel.h +++ b/tensorflow/lite/experimental/ruy/kernel.h @@ -21,7 +21,7 @@ limitations under the License. #include "fixedpoint/fixedpoint.h" #include "profiling/instrumentation.h" #include "tensorflow/lite/experimental/ruy/common.h" -#include "tensorflow/lite/experimental/ruy/matrix.h" +#include "tensorflow/lite/experimental/ruy/internal_matrix.h" #include "tensorflow/lite/experimental/ruy/opt_set.h" #include "tensorflow/lite/experimental/ruy/path.h" #include "tensorflow/lite/experimental/ruy/size_util.h" @@ -36,12 +36,12 @@ struct Kernel {}; template -void RunKernel( - const Kernel& kernel, - const Matrix& lhs, const Matrix& rhs, - const Spec& spec, int start_row, int start_col, int end_row, int end_col, - Matrix* dst) { +void RunKernelTyped(Tuning tuning, const PackedMatrix& lhs, + const PackedMatrix& rhs, const Spec& spec, + int start_row, int start_col, int end_row, int end_col, + Matrix* dst) { using Kernel = Kernel; + Kernel kernel(tuning); #if RUY_OPT_SET & RUY_OPT_FAT_KERNEL kernel.Run(lhs, rhs, spec, start_row, start_col, end_row, end_col, dst); #else @@ -57,6 +57,24 @@ void RunKernel( #endif } +// Main entry point for kernels. 
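+//
+// RunKernel un-type-erases its arguments (PMatrix -> PackedMatrix<T>,
+// DMatrix -> Matrix<T>, void* -> const Spec*) and forwards them to the
+// statically typed RunKernelTyped above. Since this signature does not depend
+// on the template parameters, a pointer to it can be stored in
+// TrMulParams::run_kernel (see RunKernelFn below).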
+template +void RunKernel(Tuning tuning, const PMatrix& lhs, const PMatrix& rhs, + void* spec, int start_row, int start_col, int end_row, + int end_col, DMatrix* dst) { + Matrix mdst = ToMatrix(*dst); + RunKernelTyped( + tuning, ToPackedMatrix(lhs), ToPackedMatrix(rhs), + *static_cast(spec), start_row, start_col, end_row, end_col, + &mdst); +} + +// The signature of RunKernel is the same, regardless of template parameters. +using RunKernelFn = + decltype(RunKernel>); + // Copied from TF Lite code. inline std::int32_t MultiplyByQuantizedMultiplier( std::int32_t x, std::int32_t quantized_multiplier, int shift) { @@ -118,16 +136,17 @@ struct Kernel { using LhsLayout = FixedKernelLayout; using RhsLayout = FixedKernelLayout; explicit Kernel(Tuning) {} - void Run(const Matrix& lhs, const Matrix& rhs, - const Spec& spec, int start_row, int start_col, int end_row, - int end_col, Matrix* dst) const { + void Run(const PackedMatrix& lhs, + const PackedMatrix& rhs, const Spec& spec, int start_row, + int start_col, int end_row, int end_col, + Matrix* dst) const { gemmlowp::ScopedProfilingLabel label("Kernel (Standard Cpp)"); const int depth = lhs.layout.rows; for (int i = start_row; i < end_row; i++) { for (int j = start_col; j < end_col; j++) { using AccumScalar = typename Spec::AccumScalar; AccumScalar accum = 0; - for (int k = 0; k < lhs.layout.rows; k++) { + for (int k = 0; k < depth; k++) { AccumScalar lhs_val = Element(lhs, k, i); AccumScalar rhs_val = Element(rhs, k, j); accum += lhs_val * rhs_val; @@ -136,10 +155,10 @@ struct Kernel { accum += spec.bias[i]; } if (lhs.zero_point) { - accum -= lhs.zero_point * rhs.sums.get()[j]; + accum -= lhs.zero_point * rhs.sums[j]; } if (rhs.zero_point) { - accum -= rhs.zero_point * lhs.sums.get()[i]; + accum -= rhs.zero_point * lhs.sums[i]; } if (lhs.zero_point && rhs.zero_point) { accum += lhs.zero_point * rhs.zero_point * depth; @@ -233,8 +252,8 @@ struct KernelParams8bit { }; template -void MakeKernelParams8bit(const Matrix& lhs, - const Matrix& rhs, +void MakeKernelParams8bit(const PackedMatrix& lhs, + const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst, @@ -249,20 +268,20 @@ void MakeKernelParams8bit(const Matrix& lhs, RUY_DCHECK_EQ(end_row % LhsCols, 0); RUY_DCHECK_EQ(end_col % RhsCols, 0); - params->lhs_base_ptr = lhs.data.get() + start_row * lhs.layout.stride; - params->rhs_base_ptr = rhs.data.get() + start_col * rhs.layout.stride; + params->lhs_base_ptr = lhs.data + start_row * lhs.layout.stride; + params->rhs_base_ptr = rhs.data + start_col * rhs.layout.stride; params->flags = 0; params->bias = params->zero_data; if (spec.bias) { params->bias = spec.bias; params->flags |= RUY_ASM_FLAG_HAS_BIAS; } - if (lhs.sums.get()) { - params->lhs_sums = lhs.sums.get(); + if (lhs.sums) { + params->lhs_sums = lhs.sums; params->flags |= RUY_ASM_FLAG_HAS_LHS_SUMS; } - if (rhs.sums.get()) { - params->rhs_sums = rhs.sums.get(); + if (rhs.sums) { + params->rhs_sums = rhs.sums; params->flags |= RUY_ASM_FLAG_HAS_RHS_SUMS; } params->start_row = start_row; @@ -314,7 +333,8 @@ struct Kernel; Tuning tuning = Tuning::kAuto; explicit Kernel(Tuning tuning_) : tuning(tuning_) {} - void Run(const Matrix& lhs, const Matrix& rhs, + void Run(const PackedMatrix& lhs, + const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst) const { @@ -336,7 +356,8 @@ struct Kernel; using RhsLayout = FixedKernelLayout; explicit Kernel(Tuning tuning_) : tuning(tuning_) {} - 
void Run(const Matrix& lhs, const Matrix& rhs, + void Run(const PackedMatrix& lhs, + const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst) const { @@ -375,8 +396,8 @@ struct KernelParamsFloat { }; template -inline void MakeKernelParamsFloat(const Matrix& lhs, - const Matrix& rhs, +inline void MakeKernelParamsFloat(const PackedMatrix& lhs, + const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst, @@ -389,8 +410,8 @@ inline void MakeKernelParamsFloat(const Matrix& lhs, RUY_DCHECK_EQ(end_row % LhsCols, 0); RUY_DCHECK_EQ(end_col % RhsCols, 0); - params->lhs_base_ptr = lhs.data.get() + start_row * lhs.layout.stride; - params->rhs_base_ptr = rhs.data.get() + start_col * rhs.layout.stride; + params->lhs_base_ptr = lhs.data + start_row * lhs.layout.stride; + params->rhs_base_ptr = rhs.data + start_col * rhs.layout.stride; params->dst_base_ptr = dst->data.get() + start_col * dst->layout.stride + start_row; @@ -428,7 +449,7 @@ struct Kernel> { using LhsLayout = FixedKernelLayout; using RhsLayout = FixedKernelLayout; explicit Kernel(Tuning tuning_) : tuning(tuning_) {} - void Run(const Matrix& lhs, const Matrix& rhs, + void Run(const PackedMatrix& lhs, const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst) const { KernelParamsFloat params; @@ -451,7 +472,7 @@ struct Kernel> using Base = Kernel>; explicit Kernel(Tuning tuning_) : Base(tuning_) {} - void Run(const Matrix& lhs, const Matrix& rhs, + void Run(const PackedMatrix& lhs, const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, int end_row, int end_col, Matrix* dst) const { KernelParamsFloat params; diff --git a/tensorflow/lite/experimental/ruy/matrix.h b/tensorflow/lite/experimental/ruy/matrix.h index e7cf4a6bb9e..49b7c1df14a 100644 --- a/tensorflow/lite/experimental/ruy/matrix.h +++ b/tensorflow/lite/experimental/ruy/matrix.h @@ -27,17 +27,6 @@ namespace ruy { // 'column-major' means that each column is contiguous in memory. enum class Order : std::uint8_t { kColMajor, kRowMajor }; -// KernelLayout describes small-scale block structure in a matrix layout. -// The default (rows = 1, cols = 1) means no such small-scale block structure, -// since 1x1 blocks is the same as no blocks. In that case, the overall -// matrix layout is just the usual linear row-major or column-major layout -// described by the other members of struct Layout. -struct KernelLayout final { - Order order = Order::kColMajor; - std::uint8_t rows = 1; - std::uint8_t cols = 1; -}; - // Describes the shape and storage layout of a matrix. struct Layout final { std::int32_t rows = 0; @@ -46,10 +35,6 @@ struct Layout final { // in the non-contiguous direction. std::int32_t stride = 0; Order order = Order::kColMajor; - - // Small scale layout shuffling, potentially departing from - // linear row-major or column-major storage. See KernelLayout. - KernelLayout kernel; }; namespace detail { @@ -110,16 +95,12 @@ class ConstCheckingPtr final { // signed or unsigned. template struct Matrix final { - void operator=(const Matrix& other) { data = other.data; layout = other.layout; zero_point = other.zero_point; } - private: - - public: // The underlying buffer wrapped by this matrix. detail::ConstCheckingPtr data; // The shape and data layout of this matrix. @@ -127,22 +108,15 @@ struct Matrix final { // The zero_point, i.e. which Scalar value is to be interpreted as zero. 
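In the quantized paths, zero_point is the quantization zero point: the stored Scalar value that stands for real 0. A minimal operand-setup sketch, assuming the usual pointer assignment on Matrix::data and using the MakeSimpleLayout helper this patch moves into matrix.h (shown just below); the sizes and the offset of 128 are purely illustrative:

#include <cstdint>
#include <vector>

#include "tensorflow/lite/experimental/ruy/matrix.h"

// Illustrative only: an asymmetric uint8 LHS whose stored value 128 means 0.
void SetUpQuantizedLhs(int rows, int depth, std::vector<std::uint8_t>* storage,
                       ruy::Matrix<std::uint8_t>* lhs) {
  storage->resize(rows * depth);
  ruy::MakeSimpleLayout(rows, depth, ruy::Order::kRowMajor, &lhs->layout);
  lhs->data = storage->data();
  lhs->zero_point = 128;
}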
// When Scalar is floating-point, this must be 0. Scalar zero_point = 0; - // The row/column sums needed for quantized matrix multiplication when - // the opposite operand of the multiplication uses a non-symmetric zero - // point. - // This member is only relevant for packed matrices. - // Additionally, Ruy always uses 32-bit signed accumulators for quantized - // matrix multiplication. - // For floating point types, there is no quantization, so this pointer - // will always be null. We still need code referencing it to compile - // though, even if it is always branched around. Hence we use Scalar* - // itself as the type in that case. - using SumsType = - typename std::conditional::value, Scalar, - std::int32_t>::type; - detail::ConstCheckingPtr sums; }; +inline void MakeSimpleLayout(int rows, int cols, Order order, Layout* layout) { + layout->rows = rows; + layout->cols = cols; + layout->order = order; + layout->stride = order == Order::kColMajor ? rows : cols; +} + template StreamType& operator<<(StreamType& stream, const Matrix& mat) { for (int row = 0; row < mat.layout.rows; row++) { diff --git a/tensorflow/lite/experimental/ruy/pack.h b/tensorflow/lite/experimental/ruy/pack.h index 65b1a1f594c..753a202e69e 100644 --- a/tensorflow/lite/experimental/ruy/pack.h +++ b/tensorflow/lite/experimental/ruy/pack.h @@ -20,6 +20,7 @@ limitations under the License. #include "profiling/instrumentation.h" #include "tensorflow/lite/experimental/ruy/common.h" +#include "tensorflow/lite/experimental/ruy/internal_matrix.h" #include "tensorflow/lite/experimental/ruy/opt_set.h" #include "tensorflow/lite/experimental/ruy/tune.h" @@ -63,11 +64,11 @@ template { static void Run(Tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, + PackedMatrix* packed_matrix, int start_col, int end_col) { gemmlowp::ScopedProfilingLabel label("Pack (generic)"); RUY_DCHECK_EQ((end_col - start_col) % FixedKernelLayout::kCols, 0); - SumsType* sums = packed_matrix->sums.get(); + SumsType* sums = packed_matrix->sums; for (int col = start_col; col < end_col; col++) { SumsType accum = 0; for (int row = 0; row < packed_matrix->layout.rows; row++) { @@ -129,12 +130,12 @@ struct PackImpl, Scalar, std::is_same::value ? 0 : 0x80; static void Run(Tuning tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, + PackedMatrix* packed_matrix, int start_col, int end_col) { - RUY_DCHECK(IsLinearColMajor(src_matrix.layout)); + RUY_DCHECK(IsColMajor(src_matrix.layout)); RUY_DCHECK(IsColMajor(packed_matrix->layout)); RUY_DCHECK_EQ(start_col % 4, 0); - std::int32_t* sums = packed_matrix->sums.get(); + std::int32_t* sums = packed_matrix->sums; Scalar zerobuf[16]; memset(zerobuf, src_matrix.zero_point, sizeof(zerobuf)); for (int block_col = start_col; block_col < end_col; block_col += 4) { @@ -166,7 +167,7 @@ struct PackImpl, Scalar, } } std::int8_t* packed_ptr = - packed_matrix->data.get() + packed_matrix->layout.stride * block_col; + packed_matrix->data + packed_matrix->layout.stride * block_col; std::int32_t* sums_ptr = sums ? sums + block_col : nullptr; if (__builtin_expect(tuning == Tuning::kInOrder, true)) { Pack8bitNeonInOrder( @@ -193,12 +194,12 @@ struct PackImpl, std::is_same::value ? 
0 : 0x80; static void Run(Tuning tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, + PackedMatrix* packed_matrix, int start_col, int end_col) { - RUY_DCHECK(IsLinearColMajor(src_matrix.layout)); + RUY_DCHECK(IsColMajor(src_matrix.layout)); RUY_DCHECK(IsColMajor(packed_matrix->layout)); RUY_DCHECK_EQ(start_col % 8, 0); - std::int32_t* sums = packed_matrix->sums.get(); + std::int32_t* sums = packed_matrix->sums; Scalar zerobuf[16]; memset(zerobuf, src_matrix.zero_point, sizeof(zerobuf)); for (int block_col = start_col; block_col < end_col; block_col += 4) { @@ -230,7 +231,7 @@ struct PackImpl, } } std::int8_t* packed_ptr = - packed_matrix->data.get() + + packed_matrix->data + packed_matrix->layout.stride * (block_col & ~7) + ((block_col & 4) * 4); std::int32_t* sums_ptr = sums ? sums + block_col : nullptr; @@ -264,8 +265,9 @@ template <> struct PackImpl, float, float, float> { static void Run(Tuning tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, int end_col) { - RUY_DCHECK(IsLinearColMajor(src_matrix.layout)); + PackedMatrix* packed_matrix, int start_col, + int end_col) { + RUY_DCHECK(IsColMajor(src_matrix.layout)); RUY_DCHECK(IsColMajor(packed_matrix->layout)); RUY_DCHECK_EQ(start_col % 8, 0); const float zerobuf[4] = {0}; @@ -297,7 +299,7 @@ struct PackImpl, float, src_inc3 = 0; } } - float* packed_ptr = packed_matrix->data.get() + + float* packed_ptr = packed_matrix->data + packed_matrix->layout.stride * (block_col & ~7) + ((block_col & 4)); if (__builtin_expect(tuning == Tuning::kInOrder, true)) { @@ -317,15 +319,24 @@ struct PackImpl, float, #endif // (defined __aarch64__) && (RUY_OPT_SET & RUY_OPT_ASM) +// Main entry point for packing. template -void Pack(Tuning tuning, const Matrix& src_matrix, - Matrix* packed_matrix, int start_col, int end_col) { - using SumsType = typename Matrix::SumsType; +void RunPack(Tuning tuning, const DMatrix& src_matrix, PMatrix* packed_matrix, + int start_col, int end_col) { + using SumsType = typename PackedMatrix::SumsType; + Matrix src = ToMatrix(src_matrix); + PackedMatrix packed = + ToPackedMatrix(*packed_matrix); PackImpl::Run( - tuning, src_matrix, packed_matrix, start_col, end_col); + tuning, src, &packed, start_col, end_col); } +// The signature of RunPack is the same, regardless of its template parameters. +using RunPackFn = decltype( + RunPack, + std::int8_t, std::int8_t>); + } // namespace ruy #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_H_ diff --git a/tensorflow/lite/experimental/ruy/path.h b/tensorflow/lite/experimental/ruy/path.h index 2589c19f95d..15f4755cfad 100644 --- a/tensorflow/lite/experimental/ruy/path.h +++ b/tensorflow/lite/experimental/ruy/path.h @@ -44,31 +44,12 @@ namespace ruy { // at runtime; then, typically in dispatch.h, we internally pick one // specific path and from there on, internal Ruy code deals with only one // path. -// -// When a user selects a set of compiled paths, Ruy internally dispatches to the -// "best" one, which typically means the newest optimized instructions for a -// given base architecture (such as ARM). Higher values of this enum correspond -// to "better" code paths within a given base architecture for which Ruy has -// optimized code paths. enum class Path : std::uint8_t { - // This is a special null value, representing the absence of any path. + // Higher values have higher precedence. kNone = 0, - // Reference multiplication code. - // The main purpose of this path is to have a very simple standalone Mul - // implementation to check against. 
- // This path bypasses almost all of Ruy's internal implementation details. - // - // This is intended for testing/development. - kReference = 0x1, - // Standard C++ implementation of Ruy's architecture-specific parts. - // Unlike Path::kReference, this path exercises most of Ruy's internal logic. - // - // This is intended for testing/development. - kStandardCpp = 0x2, - // Optimized path using a widely available subset of ARM NEON instructions. + kReference = 0x1, // reference code. + kStandardCpp = 0x2, // Standard C++ only. No SIMD or other arch features. kNeon = 0x4, - // Optimized path making use of ARM NEON dot product instructions that are - // available on newer ARM cores. kNeonDotprod = 0x8, }; @@ -87,12 +68,14 @@ inline constexpr Path operator^(Path p, Path q) { static_cast(q)); } +inline constexpr Path operator~(Path p) { + return static_cast(~static_cast(p)); +} + inline Path GetMostSignificantPath(Path path_mask) { return static_cast(round_down_pot(static_cast(path_mask))); } -// ruy::kAllPaths represents all Path's that make sense to on a given -// base architecture. #ifdef __aarch64__ constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp | Path::kNeon | Path::kNeonDotprod; diff --git a/tensorflow/lite/experimental/ruy/ruy.h b/tensorflow/lite/experimental/ruy/ruy.h index 371576ebf5e..d9f88f6575a 100644 --- a/tensorflow/lite/experimental/ruy/ruy.h +++ b/tensorflow/lite/experimental/ruy/ruy.h @@ -25,16 +25,17 @@ limitations under the License. namespace ruy { -// Performs a multiplication of matrices. This is Ruy's only API entry point. -// Should be self-explanatory given the documentation for each of Matrix, -// Spec and Context. See the code for ReferenceMul in dispatch.h for a reference -// implementation. +// Performs a multiplication of matrices. This is Ruy's only API entry point. +// Should be self-explanatory given the above documentation for each of Matrix, +// Spec and Context. See reference code in reference.h, with the caveat that +// that is reference code for transpose-multiply (TrMul) not just multiply; +// see the translation between the two in transpose_dispatch.h. template void Mul(const Matrix& lhs, const Matrix& rhs, const Spec& spec, Context* context, Matrix* dst) { - MulDispatch dispatch; - dispatch.Mul(lhs, rhs, spec, context, dst); + DispatchMul( + lhs, rhs, spec, context, dst); } } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/spec.h b/tensorflow/lite/experimental/ruy/spec.h index ed48416cc06..2afe4604e5c 100644 --- a/tensorflow/lite/experimental/ruy/spec.h +++ b/tensorflow/lite/experimental/ruy/spec.h @@ -18,7 +18,6 @@ limitations under the License. #include #include -#include namespace ruy { @@ -38,16 +37,14 @@ enum class LoopStructure { kGeneral, kSimple, kAuto }; enum class ZeroPointSupport { kGeneral, kSymmetric }; // In general we allow all Layout's, even if we may use slow paths for some -// kinds of layouts. By choosing kPackedLinearRCC, one may opt out of this and +// kinds of layouts. By choosing kRCC, one may opt out of this and // only keep support for the simplest and most efficient combination of // Layout's, in exchange for smaller code size. 
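A Spec opts into this restricted support by overriding kLayoutSupport. A minimal sketch, mirroring the RCCSpec that the special-specs test later in this patch defines; the template parameters are written here as BasicSpec is commonly instantiated, and the exact storage-order combination that kRCC requires is spelled out in the rest of this comment:

#include "tensorflow/lite/experimental/ruy/spec.h"

// Sketch only: a spec that keeps just the RCC layout combination.
template <typename AccumScalar, typename DstScalar>
struct RCCOnlySpec : ruy::BasicSpec<AccumScalar, DstScalar> {
  static constexpr ruy::LayoutSupport kLayoutSupport = ruy::LayoutSupport::kRCC;
};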
The case covered by -// kPackedLinearRCC is that where all matrix layouts are linear (no sub-block -// structure), packed (no striding), and where the storage orders are exactly -// the following: +// kRCC is where the storage orders are exactly the following: // - LHS is RowMajor // - RHS is ColMajor // - Destination is ColMajor -enum class LayoutSupport { kGeneral, kPackedLinearRCC }; +enum class LayoutSupport { kGeneral, kRCC }; // A Spec describes all about a matrix multiplication operation that isn't // encoded in the LHS, RHS and destination matrices. Some of that information @@ -84,13 +81,9 @@ struct BasicSpec { // multiplier_fixedpoint_perchannel must be nullptr. const int* multiplier_exponent_perchannel = nullptr; // min clamp bound of destination values. - DstScalar clamp_min = std::is_floating_point::value - ? -std::numeric_limits::infinity() - : std::numeric_limits::lowest(); + DstScalar clamp_min = std::numeric_limits::lowest(); // max clamp bound of destination values. - DstScalar clamp_max = std::is_floating_point::value - ? std::numeric_limits::infinity() - : std::numeric_limits::max(); + DstScalar clamp_max = std::numeric_limits::max(); // See above enum LoopStructure static constexpr LoopStructure kLoopStructure = LoopStructure::kAuto; // See above enum LayoutSupport diff --git a/tensorflow/lite/experimental/ruy/test.h b/tensorflow/lite/experimental/ruy/test.h index dc1e52e579a..9d32d3dec47 100644 --- a/tensorflow/lite/experimental/ruy/test.h +++ b/tensorflow/lite/experimental/ruy/test.h @@ -297,17 +297,13 @@ void MakeRandomVector(RandomRange range, int size, std::vector* dst) { } } -enum class LayoutStyle { kPackedLinear, kLinear, kBlocked }; +enum class LayoutStyle { kPackedLinear, kLinear }; -void MakeLayout(int rows, int cols, int kernel_rows, int kernel_cols, - Order order, Order kernel_order, LayoutStyle layout_style, +void MakeLayout(int rows, int cols, Order order, LayoutStyle layout_style, Layout* layout) { layout->rows = rows; layout->cols = cols; layout->order = order; - layout->kernel.order = kernel_order; - layout->kernel.rows = kernel_rows; - layout->kernel.cols = kernel_cols; const int packed_stride = order == Order::kColMajor ? 
rows : cols; @@ -340,12 +336,10 @@ void VerifyConsistentFields(const StorageMatrix& storage_matrix) { } template -void MakeRandom(int rows, int cols, int kernel_rows, int kernel_cols, - Order order, Order kernel_order, Scalar zero_point, +void MakeRandom(int rows, int cols, Order order, Scalar zero_point, LayoutStyle layout_style, RandomRange range, StorageMatrix* storage_matrix) { - MakeLayout(rows, cols, kernel_rows, kernel_cols, order, kernel_order, - layout_style, &storage_matrix->matrix.layout); + MakeLayout(rows, cols, order, layout_style, &storage_matrix->matrix.layout); storage_matrix->matrix.zero_point = zero_point; UniformRandomDistribution data_dist(range); MakeRandomVector(&data_dist, FlatSize(storage_matrix->matrix.layout), @@ -443,13 +437,8 @@ struct TestSet final { int rows = 0; int cols = 0; int depth = 0; - int kernel_rows = 1; - int kernel_cols = 1; - int kernel_depth = 1; Order lhs_order = Order::kRowMajor; Order rhs_order = Order::kColMajor; - Order lhs_kernel_order = Order::kRowMajor; - Order rhs_kernel_order = Order::kColMajor; Order dst_order = Order::kColMajor; LayoutStyle layout_style = LayoutStyle::kPackedLinear; ExpectedOutcome expected_outcome = ExpectedOutcome::kSuccess; @@ -526,7 +515,6 @@ void EvalRuy(Path path, Tuning tuning, const Matrix& lhs, template void WrapGemmlowp(const Matrix& src, gemmlowp::MatrixMap* dst) { - RUY_CHECK(IsLinear(src.layout)); RUY_CHECK(src.layout.order == (tOrder == gemmlowp::MapOrder::ColMajor ? Order::kColMajor : Order::kRowMajor)); @@ -537,7 +525,6 @@ void WrapGemmlowp(const Matrix& src, template void WrapGemmlowpMutable(Matrix* src, gemmlowp::MatrixMap* dst) { - RUY_CHECK(IsLinear(src->layout)); RUY_CHECK(src->layout.order == (tOrder == gemmlowp::MapOrder::ColMajor ? Order::kColMajor : Order::kRowMajor)); @@ -706,9 +693,6 @@ template void EvalEigen(const Matrix& lhs, const Matrix& rhs, const Spec& spec, int max_num_threads, Matrix* dst) { - RUY_CHECK(IsLinear(lhs.layout)); - RUY_CHECK(IsLinear(rhs.layout)); - RUY_CHECK(IsLinear(dst->layout)); RUY_CHECK_EQ(lhs.zero_point, 0); RUY_CHECK_EQ(rhs.zero_point, 0); RUY_CHECK_EQ(dst->zero_point, 0); @@ -802,9 +786,9 @@ void EvalEigenTensor(const Matrix& lhs, const Matrix& rhs, RUY_CHECK_EQ(spec.multiplier_exponent, 0); // Eigen::TensorMap only supports packed layouts - RUY_CHECK(IsPackedLinear(lhs.layout)); - RUY_CHECK(IsPackedLinear(rhs.layout)); - RUY_CHECK(IsPackedLinear(dst->layout)); + RUY_CHECK(IsPacked(lhs.layout)); + RUY_CHECK(IsPacked(rhs.layout)); + RUY_CHECK(IsPacked(dst->layout)); using TensorLhsType = Eigen::TensorMap>; @@ -1433,11 +1417,9 @@ void TestSet::MakeZeroPoints() { template void TestSet::MakeLhsRhs() { RUY_CHECK(life_stage == LifeStage::kHasZeroPoints); - MakeRandom(rows, depth, kernel_rows, kernel_depth, lhs_order, - lhs_kernel_order, lhs_zero_point, layout_style, + MakeRandom(rows, depth, lhs_order, lhs_zero_point, layout_style, RandomRange::kAvoidMinValue, &lhs); - MakeRandom(depth, cols, kernel_depth, kernel_cols, rhs_order, - rhs_kernel_order, rhs_zero_point, layout_style, + MakeRandom(depth, cols, rhs_order, rhs_zero_point, layout_style, RandomRange::kGeneral, &rhs); life_stage = LifeStage::kHasLhsRhs; } @@ -1531,8 +1513,7 @@ void TestSet::MakeResultPaths() { using TestSetType = TestSet; - if (!getenv("NOEXT") && IsLinear(lhs.matrix.layout) && - IsLinear(rhs.matrix.layout)) { + if (!getenv("NOEXT")) { if (SupportsGemmlowp::kValue) { #ifdef GEMMLOWP_SSE4 const bool gemmlowp_supported = !spec.multiplier_fixedpoint_perchannel; @@ -1569,8 +1550,8 @@ void 
TestSet::MakeResultPaths() { TestResult& result = results.back(); result.path = path; result.tuning = tuning; - MakeRandom(rows, cols, 1, 1, dst_order, dst_order, dst_zero_point, - layout_style, RandomRange::kGeneral, &result.storage_matrix); + MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, + RandomRange::kGeneral, &result.storage_matrix); } } @@ -1578,8 +1559,8 @@ void TestSet::MakeResultPaths() { results.emplace_back(); TestResult& result = results.back(); result.external_path = external_path; - MakeRandom(rows, cols, 1, 1, dst_order, dst_order, dst_zero_point, - layout_style, RandomRange::kGeneral, &result.storage_matrix); + MakeRandom(rows, cols, dst_order, dst_zero_point, layout_style, + RandomRange::kGeneral, &result.storage_matrix); } life_stage = LifeStage::kHasResultPaths; @@ -1918,8 +1899,7 @@ void TestSet::Verify() { } template -void TestPackedLinearRCC(int rows, int depth, int cols, - ExpectedOutcome expected_outcome) { +void TestRCC(int rows, int depth, int cols, ExpectedOutcome expected_outcome) { TestSetType test_set; test_set.rows = rows; test_set.depth = depth; @@ -1933,9 +1913,23 @@ void TestPackedLinearRCC(int rows, int depth, int cols, } template -void TestPackedLinearRCC(int rows, int depth, int cols) { - TestPackedLinearRCC(rows, depth, cols, - ExpectedOutcome::kSuccess); +void TestRCC(int rows, int depth, int cols) { + TestRCC(rows, depth, cols, ExpectedOutcome::kSuccess); +} + +template +void TestNonRCC(int rows, int depth, int cols, + ExpectedOutcome expected_outcome) { + TestSetType test_set; + test_set.rows = rows; + test_set.depth = depth; + test_set.cols = cols; + test_set.lhs_order = Order::kColMajor; + test_set.rhs_order = Order::kColMajor; + test_set.dst_order = Order::kColMajor; + test_set.layout_style = LayoutStyle::kPackedLinear; + test_set.expected_outcome = expected_outcome; + test_set.Run(); } template @@ -1967,50 +1961,6 @@ void TestLinearAllOrders(int rows, int depth, int cols) { ExpectedOutcome::kSuccess); } -template -void TestNonLinearAllOrders(int rows, int depth, int cols, int kernel_rows, - int kernel_depth, int kernel_cols, - ExpectedOutcome expected_outcome) { - const std::vector orders{Order::kColMajor, Order::kRowMajor}; - - for (Order lhs_order : orders) { - for (Order rhs_order : orders) { - for (Order dst_order : orders) { - for (Order lhs_kernel_order : orders) { - for (Order rhs_kernel_order : orders) { - TestSetType test_set; - test_set.rows = rows; - test_set.depth = depth; - test_set.cols = cols; - test_set.kernel_rows = kernel_rows; - test_set.kernel_depth = kernel_depth; - test_set.kernel_cols = kernel_cols; - test_set.lhs_order = lhs_order; - test_set.rhs_order = rhs_order; - test_set.lhs_kernel_order = lhs_kernel_order; - test_set.rhs_kernel_order = rhs_kernel_order; - test_set.dst_order = dst_order; - test_set.layout_style = LayoutStyle::kLinear; - test_set.expected_outcome = expected_outcome; - test_set.Run(); - } - } - } - } - } -} - -template -void TestNonLinearAllOrders(int rows, int depth, int cols, int kernel_rows, - int kernel_depth, int kernel_cols) { - RUY_CHECK_EQ(rows % kernel_rows, 0); - RUY_CHECK_EQ(depth % kernel_depth, 0); - RUY_CHECK_EQ(cols % kernel_cols, 0); - TestNonLinearAllOrders(rows, depth, cols, kernel_rows, - kernel_depth, kernel_cols, - ExpectedOutcome::kSuccess); -} - } // namespace ruy #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_TEST_H_ diff --git a/tensorflow/lite/experimental/ruy/test_fast.cc b/tensorflow/lite/experimental/ruy/test_fast.cc index 06533a082e3..7026bca4616 100644 --- 
a/tensorflow/lite/experimental/ruy/test_fast.cc +++ b/tensorflow/lite/experimental/ruy/test_fast.cc @@ -56,7 +56,7 @@ TEST(RuyTest, TestSquareMuls) { }; for (int size : sizes) { - TestPackedLinearRCC(size, size, size); + TestRCC(size, size, size); TestLinearAllOrders(size, size, size); } } @@ -73,7 +73,7 @@ TEST(RuyTest, TestMiscMuls) { } TEST(RuyTest, TestDeepMuls) { - TestPackedLinearRCC(1, 50001, 1); + TestRCC(1, 50001, 1); TestLinearAllOrders(5, 5001, 4); TestLinearAllOrders(9, 1025, 10); } @@ -94,10 +94,4 @@ TEST(RuyTest, TestNarrowMuls) { } } -TEST(RuyTest, TestNonLinear) { - TestNonLinearAllOrders(10, 11, 12, 2, 1, 4); - TestNonLinearAllOrders(10, 12, 11, 2, 4, 1); - TestNonLinearAllOrders(8, 2, 4, 8, 2, 4); - TestNonLinearAllOrders(24, 32, 16, 8, 16, 4); -} } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/test_slow.cc b/tensorflow/lite/experimental/ruy/test_slow.cc index e19fb720cb4..0a10a163075 100644 --- a/tensorflow/lite/experimental/ruy/test_slow.cc +++ b/tensorflow/lite/experimental/ruy/test_slow.cc @@ -29,10 +29,10 @@ using TestSetType = TEST(RuyTest, TestBigNarrowMuls) { for (int width : {1, 2, 3, 4, 5, 8}) { - TestPackedLinearRCC(width, 401, 601); - TestPackedLinearRCC(587, 443, width); + TestRCC(width, 401, 601); + TestRCC(587, 443, width); } - TestPackedLinearRCC(512, 256, 16); + TestRCC(512, 256, 16); } TEST(RuyTest, TestBigShallowMuls) { @@ -42,7 +42,7 @@ TEST(RuyTest, TestBigShallowMuls) { } TEST(RuyTest, TestBigMuls) { - TestPackedLinearRCC(225, 303, 199); + TestRCC(225, 303, 199); TestLinearAllOrders(256, 192, 128); } diff --git a/tensorflow/lite/experimental/ruy/test_special_specs.cc b/tensorflow/lite/experimental/ruy/test_special_specs.cc index 9498e2bf192..5e1d8d980f5 100644 --- a/tensorflow/lite/experimental/ruy/test_special_specs.cc +++ b/tensorflow/lite/experimental/ruy/test_special_specs.cc @@ -32,9 +32,8 @@ struct ZeroPointSupportSpec : BasicSpec { }; template -struct PackedLinearRCCSpec : BasicSpec { - static constexpr LayoutSupport kLayoutSupport = - LayoutSupport::kPackedLinearRCC; +struct RCCSpec : BasicSpec { + static constexpr LayoutSupport kLayoutSupport = LayoutSupport::kRCC; }; using LhsScalar = RUY_TEST_LHSSCALAR; @@ -117,13 +116,11 @@ TEST(TestSpecialSpecs, ZeroPointSupport) { SymmetricZeroPoint() - 1, ExpectedOutcome::kDeath); } -TEST(TestSpecialSpecs, PackedLinearRCC) { - using PackedLinearRCCSpec = PackedLinearRCCSpec; - using PackedLinearRCCTestSet = - TestSet; - TestPackedLinearRCC(81, 93, 72); - TestLinearAllOrders(81, 93, 72, - ExpectedOutcome::kDeath); +TEST(TestSpecialSpecs, RCC) { + using RCCSpec = RCCSpec; + using RCCTestSet = TestSet; + TestRCC(81, 93, 72); + TestNonRCC(81, 93, 72, ExpectedOutcome::kDeath); } } // namespace ruy
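Taken together, the user-facing effect is that callers only ever deal with the simplified, purely linear Layout: set up each Matrix with MakeSimpleLayout, attach a data pointer, and call Mul. A minimal float sketch; the sizes and values are illustrative and the include set is assumed:

#include <vector>

#include "tensorflow/lite/experimental/ruy/context.h"
#include "tensorflow/lite/experimental/ruy/matrix.h"
#include "tensorflow/lite/experimental/ruy/path.h"
#include "tensorflow/lite/experimental/ruy/ruy.h"
#include "tensorflow/lite/experimental/ruy/spec.h"

// Multiplies a 4x3 row-major LHS by a 3x5 col-major RHS into a col-major dst.
void ExampleFloatMul(ruy::Context* context) {
  const int rows = 4, depth = 3, cols = 5;
  std::vector<float> lhs_data(rows * depth, 1.0f);
  std::vector<float> rhs_data(depth * cols, 1.0f);
  std::vector<float> dst_data(rows * cols, 0.0f);

  ruy::Matrix<float> lhs, rhs, dst;
  ruy::MakeSimpleLayout(rows, depth, ruy::Order::kRowMajor, &lhs.layout);
  ruy::MakeSimpleLayout(depth, cols, ruy::Order::kColMajor, &rhs.layout);
  ruy::MakeSimpleLayout(rows, cols, ruy::Order::kColMajor, &dst.layout);
  lhs.data = lhs_data.data();
  rhs.data = rhs_data.data();
  dst.data = dst_data.data();

  // Default spec: no bias, clamp bounds left at the destination type's limits.
  ruy::BasicSpec<float, float> spec;
  ruy::Mul<ruy::kAllPaths>(lhs, rhs, spec, context, &dst);
}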