Fix the kernel layout description for the float NEON kernels.

<Rowmajor,4,8> is the correct layout. It is essentially the same
layout as the <Rowmajor,1,8> that we were declaring, since they're both
row-major and differ only in the number of block rows, but it still
makes a difference on padding: the kernel expects input packed matrix
data to be padded to the next multiple of 4 rows, and by incorrectly
describing the kernel layout as <Rowmajor,1,8>, we did not honor that.
As we reflected the same incorrect layout description in pack.h, we were
also using packing code that wrote padded data, but we were not allocating
a suitably enlarged buffer, so my guess is that we were overrunning buffers,
explaining the non-deterministic failures. ASan didn't see anything because
this is assembly code; we would need Valgrind for that.

PiperOrigin-RevId: 247550630
This commit is contained in:
Benoit Jacob 2019-05-09 21:40:54 -07:00 committed by TensorFlower Gardener
parent 4224db98ef
commit 8a8a109e56
2 changed files with 3 additions and 3 deletions

View File

@ -453,8 +453,8 @@ void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params);
template <>
struct Kernel<Path::kNeon, float, float, float, BasicSpec<float, float>> {
Tuning tuning = Tuning::kAuto;
using LhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
using RhsLayout = FixedKernelLayout<Order::kRowMajor, 1, 8>;
using LhsLayout = FixedKernelLayout<Order::kRowMajor, 4, 8>;
using RhsLayout = FixedKernelLayout<Order::kRowMajor, 4, 8>;
explicit Kernel(Tuning tuning_) : tuning(tuning_) {}
void Run(const PackedMatrix<float>& lhs, const PackedMatrix<float>& rhs,
const BasicSpec<float, float>& spec, int start_row, int start_col,

View File

@ -262,7 +262,7 @@ void PackFloatNeonInOrder(const float* src_ptr0, const float* src_ptr1,
float* packed_ptr, int start_col, int end_col);
template <>
struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 4, 8>, float,
float, float> {
static void Run(Tuning tuning, const Matrix<float>& src_matrix,
PackedMatrix<float>* packed_matrix, int start_col,