From 8a8a109e56751730caf2a4da830fb3e63e069cda Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 9 May 2019 21:40:54 -0700 Subject: [PATCH] Fix the kernel layout description for the float NEON kernels. The layout with 4 block rows is the correct layout. It is essentially the same layout as the one we were declaring, since they're both row-major and differ only in the number of block rows, but it still makes a difference on padding: the kernel expects input packed matrix data to be padded to the next multiple of 4 rows, and by incorrectly describing the kernel layout with a different number of block rows, we did not honor that. As we reflected the same incorrect layout description in pack.h, we were also using packing code that wrote padded data, but we were not allocating a suitably enlarged buffer, so my guess is that we were overrunning buffers, explaining the non-deterministic failures. Asan didn't see anything because this is assembly code; we would need valgrind for that. PiperOrigin-RevId: 247550630 --- tensorflow/lite/experimental/ruy/kernel.h | 4 ++-- tensorflow/lite/experimental/ruy/pack.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/experimental/ruy/kernel.h b/tensorflow/lite/experimental/ruy/kernel.h index fae8515790e..80402b046a2 100644 --- a/tensorflow/lite/experimental/ruy/kernel.h +++ b/tensorflow/lite/experimental/ruy/kernel.h @@ -453,8 +453,8 @@ void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params); template <> struct Kernel> { Tuning tuning = Tuning::kAuto; - using LhsLayout = FixedKernelLayout; - using RhsLayout = FixedKernelLayout; + using LhsLayout = FixedKernelLayout; + using RhsLayout = FixedKernelLayout; explicit Kernel(Tuning tuning_) : tuning(tuning_) {} void Run(const PackedMatrix& lhs, const PackedMatrix& rhs, const BasicSpec& spec, int start_row, int start_col, diff --git a/tensorflow/lite/experimental/ruy/pack.h b/tensorflow/lite/experimental/ruy/pack.h index 520099cabfc..556821087d7 100644 --- a/tensorflow/lite/experimental/ruy/pack.h +++ 
b/tensorflow/lite/experimental/ruy/pack.h @@ -262,7 +262,7 @@ void PackFloatNeonInOrder(const float* src_ptr0, const float* src_ptr1, float* packed_ptr, int start_col, int end_col); template <> -struct PackImpl, float, +struct PackImpl, float, float, float> { static void Run(Tuning tuning, const Matrix& src_matrix, PackedMatrix* packed_matrix, int start_col,