Ruy - ARM32 asm optimizations

PiperOrigin-RevId: 260743891
This commit is contained in:
A. Unique TensorFlower 2019-07-30 10:26:00 -07:00 committed by TensorFlower Gardener
parent 81c0ae1ab3
commit 203f166d98
4 changed files with 42 additions and 81 deletions

View File

@ -73,9 +73,9 @@ void Benchmark() {
setenv("QUICK_BENCHMARK", "1", 0); setenv("QUICK_BENCHMARK", "1", 0);
#endif #endif
std::vector<int> sizes; std::vector<int> sizes;
for (int i = 16; i <= 4096; i *= 2) { for (int i = 16; i <= 2048; i *= 2) {
sizes.push_back(i); sizes.push_back(i);
if (i < 4096) { if (i < 2048) {
sizes.push_back(i * 3 / 2); sizes.push_back(i * 3 / 2);
} }
} }

View File

@ -130,15 +130,12 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
// clang-format off // clang-format off
// Load the first 32 bytes of LHS and RHS data. // Load the first 32 bytes of LHS and RHS data.
// Load q0 // Load q0, q1
"vld1.32 {d0}, [%[lhs_ptr]]!\n" "vld1.32 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
"vld1.32 {d1}, [%[lhs_ptr]]!\n" "pld [%[lhs_ptr]]\n"
// Load q1
"vld1.32 {d2}, [%[lhs_ptr]]!\n"
"vld1.32 {d3}, [%[lhs_ptr]]!\n"
// Load q2 // Load q2
"vld1.32 {d4}, [%[rhs_ptr]]!\n" "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
"vld1.32 {d5}, [%[rhs_ptr]]!\n" "pld [%[rhs_ptr]]\n"
"sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n" "sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
@ -189,17 +186,16 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
"vmla.f32 q5, q0, d4[1]\n" "vmla.f32 q5, q0, d4[1]\n"
"vmla.f32 q7, q0, d5[0]\n" "vmla.f32 q7, q0, d5[0]\n"
"vmla.f32 q9, q0, d5[1]\n" "vmla.f32 q9, q0, d5[1]\n"
"vld1.32 {d0}, [%[lhs_ptr]]!\n" // Reload LHS 1 into r0 "vld1.32 {d0, d1}, [%[lhs_ptr]]!\n" // Reload LHS
"vld1.32 {d1}, [%[lhs_ptr]]!\n" // Reload LHS 1 into r0
"vmla.f32 q4, q1, d4[0]\n" "vmla.f32 q4, q1, d4[0]\n"
"vmla.f32 q6, q1, d4[1]\n" "vmla.f32 q6, q1, d4[1]\n"
"vmla.f32 q8, q1, d5[0]\n" "vmla.f32 q8, q1, d5[0]\n"
"vmla.f32 q10, q1, d5[1]\n" "vmla.f32 q10, q1, d5[1]\n"
"vld1.32 {d2}, [%[lhs_ptr]]!\n" // Reload LHS 2 into r1 "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" // Reload LHS
"vld1.32 {d3}, [%[lhs_ptr]]!\n" // Reload LHS 2 into r1 "pld [%[lhs_ptr]]\n"
"vld1.32 {d4}, [%[rhs_ptr]]!\n" // Reload RHS into r2 "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n" // Reload RHS
"vld1.32 {d5}, [%[rhs_ptr]]!\n" // Reload RHS into r2 "pld [%[rhs_ptr]]\n"
"add r1, r1, #1\n" "add r1, r1, #1\n"
"cmp r1, r2\n" "cmp r1, r2\n"
@ -291,25 +287,18 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
"movne r1, r5\n" "movne r1, r5\n"
// Load 8 bias values. // Load 8 bias values.
"vld1.32 {d24}, [r1]!\n" "vld1.32 {d24, d25, d26, d27}, [r1]\n"
"vld1.32 {d25}, [r1]!\n"
"vld1.32 {d26}, [r1]!\n"
"vld1.32 {d27}, [r1]\n"
// Now that we know what LHS and RHS data the next iteration of the // Now that we know what LHS and RHS data the next iteration of the
// main loop will need to load, we start loading the first 32 bytes of // main loop will need to load, we start loading the first 32 bytes of
// each of LHS and RHS, into q0 -- q2, as we don't need q0 -- q2 anymore // each of LHS and RHS, into q0 -- q2, as we don't need q0 -- q2 anymore
// in the rest of the work on the current block. // in the rest of the work on the current block.
// Load q0 // Load q0, q1
"vld1.32 {d0}, [%[lhs_ptr]]!\n" "vld1.32 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
"vld1.32 {d1}, [%[lhs_ptr]]!\n" "pld [%[lhs_ptr]]\n"
// Load q1
"vld1.32 {d2}, [%[lhs_ptr]]!\n"
"vld1.32 {d3}, [%[lhs_ptr]]!\n"
// Load q2 // Load q2
"vld1.32 {d4}, [%[rhs_ptr]]!\n" "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
"vld1.32 {d5}, [%[rhs_ptr]]!\n" "pld [%[rhs_ptr]]\n"
// Perform the bias-addition (per the above, we have just folded into // Perform the bias-addition (per the above, we have just folded into
// the bias the (depth * lhs_zero_point * rhs_zero_point) term.) // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
@ -391,40 +380,20 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
"31:\n" "31:\n"
// Write our float values to the destination described by // Write our float values to the destination described by
// (r3 address, r4 stride). // (r3 address, r4 stride)
// q3 = d6, d7 "vst1.32 {d6, d7, d8, d9}, [r3]\n"
"vstr d6, [r3, #0]\n"
"vstr d7, [r3, #8]\n"
// q4 = d8, d9
"vstr d8, [r3, #16]\n"
"vstr d9, [r3, #24]\n"
"add r3, r3, r4\n" "add r3, r3, r4\n"
RUY_MAKE_ZERO(q3) RUY_MAKE_ZERO(q3)
RUY_MAKE_ZERO(q4) RUY_MAKE_ZERO(q4)
// q5 = d10, d11 "vst1.32 {d10, d11, d12, d13}, [r3]\n"
"vstr d10, [r3, #0]\n"
"vstr d11, [r3, #8]\n"
// q6 = d12, d13
"vstr d12, [r3, #16]\n"
"vstr d13, [r3, #24]\n"
"add r3, r3, r4\n" "add r3, r3, r4\n"
RUY_MAKE_ZERO(q5) RUY_MAKE_ZERO(q5)
RUY_MAKE_ZERO(q6) RUY_MAKE_ZERO(q6)
// q7 = d14, d15 "vst1.32 {d14, d15, d16, d17}, [r3]\n"
"vstr d14, [r3, #0]\n"
"vstr d15, [r3, #8]\n"
// q8 = d16, d17
"vstr d16, [r3, #16]\n"
"vstr d17, [r3, #24]\n"
"add r3, r3, r4\n" "add r3, r3, r4\n"
RUY_MAKE_ZERO(q7) RUY_MAKE_ZERO(q7)
RUY_MAKE_ZERO(q8) RUY_MAKE_ZERO(q8)
// q9 = d18, d19 "vst1.32 {d18, d19, d20, d21}, [r3]\n"
"vstr d18, [r3, #0]\n"
"vstr d19, [r3, #8]\n"
// q10 = d20, d21
"vstr d20, [r3, #16]\n"
"vstr d21, [r3, #24]\n"
"add r3, r3, r4\n" "add r3, r3, r4\n"
RUY_MAKE_ZERO(q9) RUY_MAKE_ZERO(q9)
RUY_MAKE_ZERO(q10) RUY_MAKE_ZERO(q10)
@ -518,10 +487,12 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
// clang-format on // clang-format on
: [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr) : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
: [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf) : [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
// Clobber list must specify q registers (and not their constituent
// d registers). There is a (currently unexplained) slowdown if
// d registers are listed in the clobbers list.
: "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc", : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
"memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"d9", "d10", "d12", "d13", "d14", "d15", "d16", "d17", "d18","d19", "q9", "q10", "q12", "q13");
"d20", "d21", "d22", "d23", "d24", "d25", "d26");
} }
#undef RUY_OFFSET_BIAS #undef RUY_OFFSET_BIAS

View File

@ -84,7 +84,6 @@ limitations under the License.
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_H_ #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_H_
#include <cstdint> #include <cstdint>
#include "profiling/instrumentation.h" #include "profiling/instrumentation.h"
#include "tensorflow/lite/experimental/ruy/common.h" #include "tensorflow/lite/experimental/ruy/common.h"
#include "tensorflow/lite/experimental/ruy/internal_matrix.h" #include "tensorflow/lite/experimental/ruy/internal_matrix.h"

View File

@ -1199,26 +1199,22 @@ void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
"beq 3f\n" "beq 3f\n"
#define RUY_LOAD_FOUR_BY_FOUR() \ #define RUY_LOAD_FOUR_BY_FOUR() \
/* Load q0 */ \ /* Load q0 */ \
"vldr d0, [%[src_ptr0], #0]\n" \ "vld1.32 {d0, d1}, [%[src_ptr0]]\n" \
"vldr d1, [%[src_ptr0], #8]\n" \
/* if src_inc0 != 0, add 16 to src_ptr0 */ \ /* if src_inc0 != 0, add 16 to src_ptr0 */ \
"and r3, %[src_inc], #1\n" \ "and r3, %[src_inc], #1\n" \
"add %[src_ptr0], %[src_ptr0], r3, lsl #4\n"\ "add %[src_ptr0], %[src_ptr0], r3, lsl #4\n"\
/* Load q1 */ \ /* Load q1 */ \
"vldr d2, [%[src_ptr1], #0]\n" \ "vld1.32 {d2, d3}, [%[src_ptr1]]\n" \
"vldr d3, [%[src_ptr1], #8]\n" \
/* if src_inc1 != 0, add 16 to src_ptr1 */ \ /* if src_inc1 != 0, add 16 to src_ptr1 */ \
"and r3, %[src_inc], #2\n" \ "and r3, %[src_inc], #2\n" \
"add %[src_ptr1], %[src_ptr1], r3, lsl #3\n"\ "add %[src_ptr1], %[src_ptr1], r3, lsl #3\n"\
/* Load q2 */ \ /* Load q2 */ \
"vldr d4, [%[src_ptr2], #0]\n" \ "vld1.32 {d4, d5}, [%[src_ptr2]]\n" \
"vldr d5, [%[src_ptr2], #8]\n" \
/* if src_inc2 != 0, add 16 to src_ptr2 */ \ /* if src_inc2 != 0, add 16 to src_ptr2 */ \
"and r3, %[src_inc], #4\n" \ "and r3, %[src_inc], #4\n" \
"add %[src_ptr2], %[src_ptr2], r3, lsl #2\n"\ "add %[src_ptr2], %[src_ptr2], r3, lsl #2\n"\
/* Load q3 */ \ /* Load q3 */ \
"vldr d6, [%[src_ptr3], #0]\n" \ "vld1.32 {d6, d7}, [%[src_ptr3]]\n" \
"vldr d7, [%[src_ptr3], #8]\n" \
/* if src_inc3 != 0, add 16 to src_ptr3 */ \ /* if src_inc3 != 0, add 16 to src_ptr3 */ \
"and r3, %[src_inc], #8\n" \ "and r3, %[src_inc], #8\n" \
"add %[src_ptr3], %[src_ptr3], r3, lsl #1\n"\ "add %[src_ptr3], %[src_ptr3], r3, lsl #1\n"\
@ -1253,20 +1249,16 @@ void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
#define RUY_STORE_FOUR_BY_FOUR() \ #define RUY_STORE_FOUR_BY_FOUR() \
/* Store q8, q10, q9, q11 */ \ /* Store q8, q10, q9, q11 */ \
/* q8 = d16, d17 */ \ /* q8 = d16, d17 */ \
"vstr d16, [%[packed_ptr], #0]\n" \ "vst1.32 {d16, d17}, [%[packed_ptr]]\n" \
"vstr d17, [%[packed_ptr], #8]\n" \
/* q10 = d20, d21 */ \ /* q10 = d20, d21 */ \
"add %[packed_ptr], %[packed_ptr], %[stride]\n" \ "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
"vstr d20, [%[packed_ptr], #0]\n" \ "vst1.32 {d20, d21}, [%[packed_ptr]]\n" \
"vstr d21, [%[packed_ptr], #8]\n" \
/* q9 = d18, d19 */ \ /* q9 = d18, d19 */ \
"add %[packed_ptr], %[packed_ptr], %[stride]\n" \ "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
"vstr d18, [%[packed_ptr], #0]\n" \ "vst1.32 {d18, d19}, [%[packed_ptr]]\n" \
"vstr d19, [%[packed_ptr], #8]\n" \
/* q11 = d22, d23 */ \ /* q11 = d22, d23 */ \
"add %[packed_ptr], %[packed_ptr], %[stride]\n" \ "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
"vstr d22, [%[packed_ptr], #0]\n" \ "vst1.32 {d22, d23}, [%[packed_ptr]]\n" \
"vstr d23, [%[packed_ptr], #8]\n" \
"add %[packed_ptr], %[packed_ptr], %[stride]\n" \ "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
RUY_STORE_FOUR_BY_FOUR() RUY_STORE_FOUR_BY_FOUR()
@ -1342,21 +1334,20 @@ void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
"mov r1, #32\n" "mov r1, #32\n"
#define RUY_STORE_ONE_ROW(ROW, REGISTER1, REGISTER2) \ #define RUY_STORE_ONE_ROW(ROW, REGISTER) \
"cmp r2, #" #ROW "\n" \ "cmp r2, #" #ROW "\n" \
"beq 4f\n" \ "beq 4f\n" \
"vstr " #REGISTER1 ", [%[packed_ptr]]\n" \ "vst1.32 {" #REGISTER "}, [%[packed_ptr]]\n" \
"vstr " #REGISTER2 ", [%[packed_ptr], #8]\n" \
"add %[packed_ptr], %[packed_ptr], %[stride]\n" "add %[packed_ptr], %[packed_ptr], %[stride]\n"
// Store q8 // Store q8
RUY_STORE_ONE_ROW(0, d16, d17) RUY_STORE_ONE_ROW(0, q8)
// Store q10 // Store q10
RUY_STORE_ONE_ROW(1, d20, d21) RUY_STORE_ONE_ROW(1, q10)
// Store q9 // Store q9
RUY_STORE_ONE_ROW(2, d18, d19) RUY_STORE_ONE_ROW(2, q9)
// Store q11 // Store q11
RUY_STORE_ONE_ROW(3, d22, d23) RUY_STORE_ONE_ROW(3, q11)
#undef RUY_STORE_ONE_ROW #undef RUY_STORE_ONE_ROW