/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "profiling/instrumentation.h"
|
|
#include "tensorflow/lite/experimental/ruy/kernel.h"
|
|
#include "tensorflow/lite/experimental/ruy/platform.h"
|
|
|
|
namespace ruy {
|
|
|
|
#if RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
|
|
|
#define RUY_ASM_LABEL_STORE_UINT8 91
|
|
#define RUY_ASM_LABEL_STORE_INT8 92
|
|
#define RUY_ASM_LABEL_STORE_INT16 93
|
|
#define RUY_ASM_LABEL_STORE_INT32 94
|
|
#define RUY_ASM_LABEL_AFTER_STORE 99
|
|
|
|
#define RUY_OFFSET_LHS_BASE_PTR 0
|
|
#define RUY_OFFSET_RHS_BASE_PTR 4
|
|
#define RUY_OFFSET_DST_BASE_PTR 8
|
|
#define RUY_OFFSET_BIAS 12
|
|
#define RUY_OFFSET_START_ROW 16
|
|
#define RUY_OFFSET_START_COL 20
|
|
#define RUY_OFFSET_LAST_ROW 24
|
|
#define RUY_OFFSET_LAST_COL 28
|
|
#define RUY_OFFSET_DST_ROWS 32
|
|
#define RUY_OFFSET_DST_COLS 36
|
|
#define RUY_OFFSET_LHS_STRIDE 40
|
|
#define RUY_OFFSET_RHS_STRIDE 44
|
|
#define RUY_OFFSET_DST_STRIDE 48
|
|
#define RUY_OFFSET_DEPTH 52
|
|
#define RUY_OFFSET_CLAMP_MIN 56
|
|
#define RUY_OFFSET_CLAMP_MAX 60
|
|
#define RUY_OFFSET_FLAGS 64
|
|
|
|
#define RUY_STACK_OFFSET_SIZE 96
|
|
#define RUY_STACK_OFFSET_DST_COL_PTR 0
|
|
#define RUY_STACK_OFFSET_DST_PTR 16
|
|
#define RUY_STACK_OFFSET_ROW 32
|
|
#define RUY_STACK_OFFSET_COL 48
|
|
#define RUY_STACK_OFFSET_LHS_COL_PTR 64
|
|
#define RUY_STACK_OFFSET_RHS_COL_PTR 80
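
// Note: each of the six values spilled to the stack below is a single
// 32-bit word, but the slots are spaced 16 bytes apart, so the
// 6 x 16 = 96 bytes of RUY_STACK_OFFSET_SIZE (reserved by
// "sub sp, sp, #96" in the asm below) cover all of them.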

template <typename Params>
void CheckOffsetsInKernelParamsFloat32(const Params&) {
  static_assert(offsetof(Params, lhs_base_ptr) == RUY_OFFSET_LHS_BASE_PTR, "");
  static_assert(offsetof(Params, rhs_base_ptr) == RUY_OFFSET_RHS_BASE_PTR, "");
  static_assert(offsetof(Params, dst_base_ptr) == RUY_OFFSET_DST_BASE_PTR, "");
  static_assert(offsetof(Params, bias) == RUY_OFFSET_BIAS, "");
  static_assert(offsetof(Params, start_row) == RUY_OFFSET_START_ROW, "");
  static_assert(offsetof(Params, start_col) == RUY_OFFSET_START_COL, "");
  static_assert(offsetof(Params, last_row) == RUY_OFFSET_LAST_ROW, "");
  static_assert(offsetof(Params, last_col) == RUY_OFFSET_LAST_COL, "");
  static_assert(offsetof(Params, dst_rows) == RUY_OFFSET_DST_ROWS, "");
  static_assert(offsetof(Params, lhs_stride) == RUY_OFFSET_LHS_STRIDE, "");
  static_assert(offsetof(Params, rhs_stride) == RUY_OFFSET_RHS_STRIDE, "");
  static_assert(offsetof(Params, dst_stride) == RUY_OFFSET_DST_STRIDE, "");
  static_assert(offsetof(Params, depth) == RUY_OFFSET_DEPTH, "");
  static_assert(offsetof(Params, clamp_min) == RUY_OFFSET_CLAMP_MIN, "");
  static_assert(offsetof(Params, clamp_max) == RUY_OFFSET_CLAMP_MAX, "");
  static_assert(offsetof(Params, flags) == RUY_OFFSET_FLAGS, "");
}
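
// For reference only: the offsets and static_asserts above imply that on
// 32-bit ARM (4-byte pointers and ints) the params struct begins roughly as
// sketched in the comment below. This is an illustrative sketch, not the
// authoritative definition; the real KernelParamsFloat<8, 4> (including
// field types and trailing members such as dst_tmp_buf, which the asm
// constraints below reference) lives in kernel.h.
//
//   struct /* KernelParamsFloat<8, 4>, sketch */ {
//     const float* lhs_base_ptr;  // offset 0
//     const float* rhs_base_ptr;  // offset 4
//     float* dst_base_ptr;        // offset 8
//     const float* bias;          // offset 12
//     std::int32_t start_row;     // offset 16
//     std::int32_t start_col;     // offset 20
//     std::int32_t last_row;      // offset 24
//     std::int32_t last_col;      // offset 28
//     std::int32_t dst_rows;      // offset 32
//     std::int32_t dst_cols;      // offset 36
//     std::int32_t lhs_stride;    // offset 40
//     std::int32_t rhs_stride;    // offset 44
//     std::int32_t dst_stride;    // offset 48
//     std::int32_t depth;         // offset 52
//     float clamp_min;            // offset 56
//     float clamp_max;            // offset 60
//     std::uint8_t flags;         // offset 64 (read with ldrb below)
//     // ... (e.g. dst_tmp_buf)
//   };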

// Float kernel for ARM32 out-of-order cores.
// Just like the 64-bit float kernel, except that it accumulates into an 8x4
// block so that only 16 128-bit NEON registers are needed. This is a "first
// pass" kernel and is not tuned. It is meant to run on out-of-order CPUs
// like the Krait 400 or the Cortex-A9.
void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
  CheckOffsetsInKernelParamsFloat32(params);
  gemmlowp::ScopedProfilingLabel label(
      "Kernel (kNeon, optimized for out-of-order cores)");

  const float* lhs_ptr = params.lhs_base_ptr;
  const float* rhs_ptr = params.rhs_base_ptr;
  // In ARM32 NEON, there are 16 128-bit "q" registers. These registers are
  // each composed of two 64-bit "d" registers. The asm kernel below has the
  // following NEON register allocation:
  // Registers q3 -- q10 are accumulators. During accumulation,
  // q0 -- q2 (d0 -- d5) are used to load data from LHS and RHS. q0 and q1
  // are used to load an 8x1 block of LHS, and q2 is used to load a 1x4 block
  // of RHS, like this:

  // Register layout in "q" registers:
  //                                  RHS 1x4 block
  //                           /--------------------------\
  //                           | q2.s[0]  ...     q2.s[3] |
  //                           \--------------------------/
  //    LHS 8x1 block
  //  /---------------------\  /--------------------------\
  //  |       q0.s[0]       |  | q3.s[0]   ...    q9.s[0] |
  //  |         ...         |  |   ...              ...   |
  //  |       q0.s[3]       |  | q3.s[3]          q9.s[3] |
  //  |       q1.s[0]       |  | q4.s[0]         q10.s[0] |
  //  |         ...         |  |   ...     ...      ...   |
  //  |       q1.s[3]       |  | q4.s[3]   ...   q10.s[3] |
  //  \---------------------/  \--------------------------/
  //                              accumulators 8x4 block
  // q11, q14, q15 are currently unused. q12 and q13 are used to load
  // parameters used for the post-accumulation part of the kernel.
  // For completeness, here is the register layout in "d" registers:
  //                                  RHS 1x4 block
  //                           /--------------------------\
  //                           | d4[0]    ...       d5[1] |
  //                           \--------------------------/
  //    LHS 8x1 block
  //  /---------------------\  /--------------------------\
  //  |        d0[0]        |  | d6[0]     ...     d18[0] |
  //  |         ...         |  |   ...              ...   |
  //  |        d1[1]        |  | d7[1]             d19[1] |
  //  |        d2[0]        |  | d8[0]             d20[0] |
  //  |         ...         |  |   ...     ...      ...   |
  //  |        d3[1]        |  | d9[1]     ...     d21[1] |
  //  \---------------------/  \--------------------------/
  //                              accumulators 8x4 block
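  //
  // In scalar terms, each accumulator register holds a 4x1 slice of the 8x4
  // destination block: q3/q4 hold rows 0..3 / 4..7 of column 0, q5/q6 of
  // column 1, q7/q8 of column 2, and q9/q10 of column 3. Each depth step k
  // of the loop below therefore performs
  //   acc[r][c] += lhs[r][k] * rhs[k][c]   for r in 0..7, c in 0..3,
  // one vmla.f32 per accumulator register.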
  asm volatile(
#define RUY_MAKE_ZERO(reg) "mov r0, #0\n vdup.32 " #reg ", r0\n"

      // clang-format off

      // Load the first 32 bytes of LHS and RHS data.
      // Load q0, q1
      "vld1.32 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
      "pld [%[lhs_ptr]]\n"
      // Load q2
      "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
      "pld [%[rhs_ptr]]\n"

      "sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"

      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"

      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
      "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"

      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"

      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_START_COL) "]\n"
      "str r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"

      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"

      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_RHS_BASE_PTR) "]\n"
      "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      // Clear accumulators.
      RUY_MAKE_ZERO(q3)
      RUY_MAKE_ZERO(q4)
      RUY_MAKE_ZERO(q5)
      RUY_MAKE_ZERO(q6)
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)

      // r1 is the number of levels of depth that we have already loaded
      // LHS and RHS data for. Corresponding to the initial vld1 instructions
      // above, this is currently 1.
      "mov r1, #1\n"

      // Main loop of the whole GEMM, over rows and columns of the
      // destination matrix.
      "1:\n"

      // Accumulation loop
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
      "cmp r1, r2\n"
      "beq 79f\n"

      "2:\n"

      "vmla.f32 q3, q0, d4[0]\n"
      "vmla.f32 q5, q0, d4[1]\n"
      "vmla.f32 q7, q0, d5[0]\n"
      "vmla.f32 q9, q0, d5[1]\n"
      "vld1.32 {d0, d1}, [%[lhs_ptr]]!\n" // Reload LHS

      "vmla.f32 q4, q1, d4[0]\n"
      "vmla.f32 q6, q1, d4[1]\n"
      "vmla.f32 q8, q1, d5[0]\n"
      "vmla.f32 q10, q1, d5[1]\n"
      "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" // Reload LHS
      "pld [%[lhs_ptr]]\n"
      "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n" // Reload RHS
      "pld [%[rhs_ptr]]\n"

      "add r1, r1, #1\n"
      "cmp r1, r2\n"

      "blt 2b\n"

      "79:\n"

      // End of the inner loop on depth. Now perform the remaining
      // multiply-adds of the last level of depth, for which the LHS
      // and RHS data is already loaded.

      "vmla.f32 q3, q0, d4[0]\n"
      "vmla.f32 q5, q0, d4[1]\n"
      "vmla.f32 q7, q0, d5[0]\n"
      "vmla.f32 q9, q0, d5[1]\n"

      "vmla.f32 q4, q1, d4[0]\n"
      "vmla.f32 q6, q1, d4[1]\n"
      "vmla.f32 q8, q1, d5[0]\n"
      "vmla.f32 q10, q1, d5[1]\n"

      // End of accumulation. The registers q3 -- q10 contain the final
      // float32 accumulator values of the current 8x4 destination block.
      // We now have to compute the final values from these accumulators
      // and advance to the next 8x4 block. We intertwine
      // these two aspects whenever possible for optimal pipelining, both
      // at the data flow level (prefetch data for the next block as early as
      // possible) and at the instruction pipelining level (some of the
      // next-block work can dual-issue with some of the final work on the
      // current block).

      // Logic to advance to the next block in preparation for the next
      // iteration of the main loop. For now, we only want to compute
      // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
      // not yet ready to update the values of row and col, as we still need
      // the current values for the rest of the work on the current block.

      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
      "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "cmp r1, r3\n"  // Have we finished the last row?

      "bge 4f\n"      // If finished last row, go to 4
      // Not finished last row: then advance to next row.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "add r4, r4, r1, lsl #3\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "b 5f\n"
      "4:\n"  // Finished last row...
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      // Go back to first row
      "str r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      // Now we need to advance to the next column. If we already
      // finished the last column, then in principle we are done, however
      // we can't just return here, as we need to allow the end work of the
      // current block to complete. The good news is that at this point it
      // doesn't matter what data we load for the next column, since
      // we will exit from the main loop below before actually storing
      // anything computed from that data.
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "cmp r8, r4\n"  // Have we finished the last column?
      "bge 5f\n"      // If yes, just carry on without updating the column pointer.
      // Not finished last column: then advance to next column.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
      "ldr r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "add r10, r10, r1, lsl #2\n"
      "str r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "5:\n"

      // Set the LHS and RHS data pointers to the start of the columns just
      // computed.
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
      "mov %[lhs_ptr], r4\n"
      "ldr r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
      "mov %[rhs_ptr], r5\n"

      // Load some parameters needed for the end work on the current block.
      "ldrb r4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"

      // Offset these base pointers as needed given the current row, col.
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "add r5, r1, r8, lsl #2\n"

      "tst r4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
      "it ne\n"
      "movne r1, r5\n"

      // Load 8 bias values.
      "vld1.32 {d24, d25, d26, d27}, [r1]\n"

      // Now that we know what LHS and RHS data the next iteration of the
      // main loop will need to load, we start loading the first 32 bytes of
      // each of LHS and RHS, into q0 -- q2, as we don't need q0 -- q2 anymore
      // in the rest of the work on the current block.
      // Load q0, q1
      "vld1.32 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
      "pld [%[lhs_ptr]]\n"
      // Load q2
      "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
      "pld [%[rhs_ptr]]\n"

      // Perform the bias-addition.
      "vadd.f32 q3, q3, q12\n"
      "vadd.f32 q4, q4, q13\n"
      "vadd.f32 q5, q5, q12\n"
      "vadd.f32 q6, q6, q13\n"
      "vadd.f32 q7, q7, q12\n"
      "vadd.f32 q8, q8, q13\n"
      "vadd.f32 q9, q9, q12\n"
      "vadd.f32 q10, q10, q13\n"

      // Load the clamp_min, clamp_max bounds
      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
      "vdup.32 q12, r2\n"  // clamp_min
      "vdup.32 q13, r3\n"  // clamp_max

      // Apply the clamp_min bound
      "vmax.f32 q3, q3, q12\n"
      "vmax.f32 q4, q4, q12\n"
      "vmax.f32 q5, q5, q12\n"
      "vmax.f32 q6, q6, q12\n"
      "vmax.f32 q7, q7, q12\n"
      "vmax.f32 q8, q8, q12\n"
      "vmax.f32 q9, q9, q12\n"
      "vmax.f32 q10, q10, q12\n"

      // Apply the clamp_max bound
      "vmin.f32 q3, q3, q13\n"
      "vmin.f32 q4, q4, q13\n"
      "vmin.f32 q5, q5, q13\n"
      "vmin.f32 q6, q6, q13\n"
      "vmin.f32 q7, q7, q13\n"
      "vmin.f32 q8, q8, q13\n"
      "vmin.f32 q9, q9, q13\n"
      "vmin.f32 q10, q10, q13\n"

      // Compute how much of the 8x4 block of destination values that we
      // have computed fits in the destination matrix. Typically, all of
      // it fits, but when the destination matrix shape is not a multiple
      // of 8x4, there are some 8x4 blocks along the boundaries that do
      // not fit entirely.
      "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "sub r1, r1, r8\n"

      "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "sub r2, r2, r4\n"
      "mov r3, #8\n"
      "mov r5, #4\n"
      "cmp r1, #8\n"
      // Compute r1 = how many rows of the 8x4 block fit
      "it gt\n"
      "movgt r1, r3\n"
      "cmp r2, #4\n"
      // Compute r2 = how many cols of the 8x4 block fit
      "it gt\n"
      "movgt r2, r5\n"

      // Test if r1==8 && r2 == 4, i.e. if all of the 8x4 block fits.
      "cmp r1, r3\n"
      "it eq\n"
      "cmpeq r2, r5\n"
      // Yes, all of the 8x4 block fits, go to fast path.
      "beq 30f\n"
      // Not all of the 8x4 block fits.
      // Set (r3 address, r4 stride) to write to dst_tmp_buf
      "mov r3, %[dst_tmp_buf]\n"
      "mov r4, #32\n"
      "b 31f\n"
      "30:\n"
      // Yes, all of the 8x4 block fits.
      // Set (r3 address, r4 stride) to write directly to destination matrix.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r4, r5\n"
      "31:\n"

      // Write our float values to the destination described by
      // (r3 address, r4 stride)
      "vst1.32 {d6, d7, d8, d9}, [r3]\n"
      "add r3, r3, r4\n"
      RUY_MAKE_ZERO(q3)
      RUY_MAKE_ZERO(q4)
      "vst1.32 {d10, d11, d12, d13}, [r3]\n"
      "add r3, r3, r4\n"
      RUY_MAKE_ZERO(q5)
      RUY_MAKE_ZERO(q6)
      "vst1.32 {d14, d15, d16, d17}, [r3]\n"
      "add r3, r3, r4\n"
      RUY_MAKE_ZERO(q7)
      RUY_MAKE_ZERO(q8)
      "vst1.32 {d18, d19, d20, d21}, [r3]\n"
      "add r3, r3, r4\n"
      RUY_MAKE_ZERO(q9)
      RUY_MAKE_ZERO(q10)

      // If all of the 8x4 block fits, we just finished writing it to the
      // destination, so we skip the next part.
      "beq 41f\n"
      // Not all of the 8x4 block fits in the destination matrix. We just
      // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
      // it to copy into the destination matrix the part that fits.
      "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "mov r3, %[dst_tmp_buf]\n"
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "mov r6, #0\n"
      "50:\n"
      "mov r5, #0\n"
      "51:\n"
      "ldr r10, [r3, r5, lsl #2]\n"
      "str r10, [r4, r5, lsl #2]\n"
      "add r5, r5, #1\n"
      "cmp r5, r1\n"
      "blt 51b\n"
      "add r6, r6, #1\n"
      "add r3, r3, #32\n"
      "add r4, r4, r8\n"
      // r2 = how many cols of the 8x4 block fit
      "cmp r6, r2\n"
      "blt 50b\n"
      "41:\n"
      // Load dst_ptr, increment, and write back.
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "add r4, r4, #32\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      // At this point we have completely finished writing values to the
      // destination matrix for the current block.

      // Reload some params --- we had used r3, r5, r10 for a few other things
      // since the last time we had loaded them.
      "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
      "ldr r6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
      "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"

      // Move to the next block of the destination matrix, for the next iter
      // of the main loop. Notice that lhs_col_ptr, rhs_col_ptr have already
      // been updated earlier.
      // Have we reached the end row?
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      "cmp r8, r3\n"

      "beq 20f\n"  // yes, end row.
      // Not end row. Move to the next row.
      "add r8, r8, #8\n"
      // Store new value of row
      "str r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"

      "b 21f\n"
      "20:\n"
      // Was already at end row.
      // Move back to first row.
      "str r6, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
      // Move to the next column.
      "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "add r4, r4, #4\n"
      "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"

      "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
      "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      // Increment dst_col_ptr by 4 * dst_stride (i.e. 4 columns)
      "add r1, r1, r8, lsl #2\n"
      // Store dst_col_ptr
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
      // Store dst_ptr
      "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
      "21:\n"

      // Main loop exit condition: have we hit the end column?
      "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
      "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
      "cmp r8, r4\n"

      // r1 is the number of levels of depth that we have already loaded
      // LHS and RHS data for. Corresponding to the initial vld1 instructions
      // above, this is currently 1.
      "mov r1, #1\n"

      "ble 1b\n"

      // Restore stack pointer.
      "add sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"

      // clang-format on
      : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
      : [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
      // Clobber list must specify q registers (and not their constituent
      // d registers). There is a (currently unexplained) slowdown if
      // d registers are listed in the clobbers list.
      : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
        "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q12", "q13");
}

#undef RUY_OFFSET_BIAS
#undef RUY_OFFSET_FLAGS
#undef RUY_OFFSET_LHS_BASE_PTR
#undef RUY_OFFSET_CLAMP_MIN
#undef RUY_OFFSET_CLAMP_MAX
#undef RUY_OFFSET_START_ROW
#undef RUY_OFFSET_LAST_ROW
#undef RUY_OFFSET_LAST_COL
#undef RUY_OFFSET_LHS_STRIDE
#undef RUY_OFFSET_RHS_STRIDE
#undef RUY_OFFSET_DST_STRIDE
#undef RUY_OFFSET_DEPTH
#undef RUY_OFFSET_START_COL
#undef RUY_OFFSET_RHS_BASE_PTR
#undef RUY_OFFSET_DST_BASE_PTR

#endif  // RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
}  // namespace ruy