From 203f166d98cb3236ed4c9cd33710e726672ba111 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Tue, 30 Jul 2019 10:26:00 -0700
Subject: [PATCH] Ruy - ARM32 asm optimizations

PiperOrigin-RevId: 260743891
---
 tensorflow/lite/experimental/ruy/benchmark.cc |  4 +-
 .../lite/experimental/ruy/kernel_arm32.cc     | 81 ++++++-------------
 tensorflow/lite/experimental/ruy/pack.h       |  1 -
 tensorflow/lite/experimental/ruy/pack_arm.cc  | 37 ++++-----
 4 files changed, 42 insertions(+), 81 deletions(-)

diff --git a/tensorflow/lite/experimental/ruy/benchmark.cc b/tensorflow/lite/experimental/ruy/benchmark.cc
index 7d055791a1e..9aabacc37b9 100644
--- a/tensorflow/lite/experimental/ruy/benchmark.cc
+++ b/tensorflow/lite/experimental/ruy/benchmark.cc
@@ -73,9 +73,9 @@ void Benchmark() {
   setenv("QUICK_BENCHMARK", "1", 0);
 #endif
   std::vector<int> sizes;
-  for (int i = 16; i <= 4096; i *= 2) {
+  for (int i = 16; i <= 2048; i *= 2) {
     sizes.push_back(i);
-    if (i < 4096) {
+    if (i < 2048) {
       sizes.push_back(i * 3 / 2);
     }
   }
diff --git a/tensorflow/lite/experimental/ruy/kernel_arm32.cc b/tensorflow/lite/experimental/ruy/kernel_arm32.cc
index 8607f256c9a..61823a8402c 100644
--- a/tensorflow/lite/experimental/ruy/kernel_arm32.cc
+++ b/tensorflow/lite/experimental/ruy/kernel_arm32.cc
@@ -130,15 +130,12 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
       // clang-format off

       // Load the first 32 bytes of LHS and RHS data.
-      // Load q0
-      "vld1.32 {d0}, [%[lhs_ptr]]!\n"
-      "vld1.32 {d1}, [%[lhs_ptr]]!\n"
-      // Load q1
-      "vld1.32 {d2}, [%[lhs_ptr]]!\n"
-      "vld1.32 {d3}, [%[lhs_ptr]]!\n"
+      // Load q0, q1
+      "vld1.32 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
+      "pld [%[lhs_ptr]]\n"
       // Load q2
-      "vld1.32 {d4}, [%[rhs_ptr]]!\n"
-      "vld1.32 {d5}, [%[rhs_ptr]]!\n"
+      "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
+      "pld [%[rhs_ptr]]\n"

       "sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"

@@ -189,17 +186,16 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
       "vmla.f32 q5, q0, d4[1]\n"
      "vmla.f32 q7, q0, d5[0]\n"
      "vmla.f32 q9, q0, d5[1]\n"
-      "vld1.32 {d0}, [%[lhs_ptr]]!\n" // Reload LHS 1 into r0
-      "vld1.32 {d1}, [%[lhs_ptr]]!\n" // Reload LHS 1 into r0
+      "vld1.32 {d0, d1}, [%[lhs_ptr]]!\n" // Reload LHS
       "vmla.f32 q4, q1, d4[0]\n"
       "vmla.f32 q6, q1, d4[1]\n"
       "vmla.f32 q8, q1, d5[0]\n"
       "vmla.f32 q10, q1, d5[1]\n"

-      "vld1.32 {d2}, [%[lhs_ptr]]!\n" // Reload LHS 2 into r1
-      "vld1.32 {d3}, [%[lhs_ptr]]!\n" // Reload LHS 2 into r1
-      "vld1.32 {d4}, [%[rhs_ptr]]!\n" // Reload RHS into r2
-      "vld1.32 {d5}, [%[rhs_ptr]]!\n" // Reload RHS into r2
+      "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" // Reload LHS
+      "pld [%[lhs_ptr]]\n"
+      "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n" // Reload RHS
+      "pld [%[rhs_ptr]]\n"

       "add r1, r1, #1\n"
       "cmp r1, r2\n"
@@ -291,25 +287,18 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
       "movne r1, r5\n"

      // Load 8 bias values.
-      "vld1.32 {d24}, [r1]!\n"
-      "vld1.32 {d25}, [r1]!\n"
-      "vld1.32 {d26}, [r1]!\n"
-      "vld1.32 {d27}, [r1]\n"
+      "vld1.32 {d24, d25, d26, d27}, [r1]\n"

       // Now that we know what LHS and RHS data the next iteration of the
       // main loop will need to load, we start loading the first 32 bytes of
       // each of LHS and RHS, into q0 -- q2, as we don't need q0 -- q2 anymore
       // in the rest of the work on the current block.
-      // Load q0
-      "vld1.32 {d0}, [%[lhs_ptr]]!\n"
-      "vld1.32 {d1}, [%[lhs_ptr]]!\n"
-      // Load q1
-      "vld1.32 {d2}, [%[lhs_ptr]]!\n"
-      "vld1.32 {d3}, [%[lhs_ptr]]!\n"
+      // Load q0, q1
+      "vld1.32 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
+      "pld [%[lhs_ptr]]\n"
       // Load q2
-      "vld1.32 {d4}, [%[rhs_ptr]]!\n"
-      "vld1.32 {d5}, [%[rhs_ptr]]!\n"
-
+      "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
+      "pld [%[rhs_ptr]]\n"

       // Perform the bias-addition (per the above, we have just folded into
       // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
@@ -391,40 +380,20 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
       "31:\n"

       // Write our float values to the destination described by
-      // (r3 address, r4 stride).
-      // q3 = d6, d7
-      "vstr d6, [r3, #0]\n"
-      "vstr d7, [r3, #8]\n"
-      // q4 = d8, d9
-      "vstr d8, [r3, #16]\n"
-      "vstr d9, [r3, #24]\n"
+      // (r3 address, r4 stride)
+      "vst1.32 {d6, d7, d8, d9}, [r3]\n"
       "add r3, r3, r4\n"
       RUY_MAKE_ZERO(q3)
       RUY_MAKE_ZERO(q4)
-      // q5 = d10, d11
-      "vstr d10, [r3, #0]\n"
-      "vstr d11, [r3, #8]\n"
-      // q6 = d12, d13
-      "vstr d12, [r3, #16]\n"
-      "vstr d13, [r3, #24]\n"
+      "vst1.32 {d10, d11, d12, d13}, [r3]\n"
       "add r3, r3, r4\n"
       RUY_MAKE_ZERO(q5)
       RUY_MAKE_ZERO(q6)
-      // q7 = d14, d15
-      "vstr d14, [r3, #0]\n"
-      "vstr d15, [r3, #8]\n"
-      // q8 = d16, d17
-      "vstr d16, [r3, #16]\n"
-      "vstr d17, [r3, #24]\n"
+      "vst1.32 {d14, d15, d16, d17}, [r3]\n"
       "add r3, r3, r4\n"
       RUY_MAKE_ZERO(q7)
       RUY_MAKE_ZERO(q8)
-      // q9 = d18, d19
-      "vstr d18, [r3, #0]\n"
-      "vstr d19, [r3, #8]\n"
-      // q10 = d20, d21
-      "vstr d20, [r3, #16]\n"
-      "vstr d21, [r3, #24]\n"
+      "vst1.32 {d18, d19, d20, d21}, [r3]\n"
       "add r3, r3, r4\n"
       RUY_MAKE_ZERO(q9)
       RUY_MAKE_ZERO(q10)
@@ -518,10 +487,12 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
       // clang-format on
       : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
       : [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
+      // Clobber list must specify q registers (and not their constituent
+      // d registers). There is a (currently unexplained) slowdown if
+      // d registers are listed in the clobbers list.
       : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
-        "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
-        "d9", "d10", "d12", "d13", "d14", "d15", "d16", "d17", "d18","d19",
-        "d20", "d21", "d22", "d23", "d24", "d25", "d26");
+        "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q12", "q13");
 }

 #undef RUY_OFFSET_BIAS
diff --git a/tensorflow/lite/experimental/ruy/pack.h b/tensorflow/lite/experimental/ruy/pack.h
index dd2a631faa5..0665e1ea7c3 100644
--- a/tensorflow/lite/experimental/ruy/pack.h
+++ b/tensorflow/lite/experimental/ruy/pack.h
@@ -84,7 +84,6 @@ limitations under the License.
 #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_H_

 #include <cstdint>
-
 #include "profiling/instrumentation.h"
 #include "tensorflow/lite/experimental/ruy/common.h"
 #include "tensorflow/lite/experimental/ruy/internal_matrix.h"
diff --git a/tensorflow/lite/experimental/ruy/pack_arm.cc b/tensorflow/lite/experimental/ruy/pack_arm.cc
index 2ac955cadca..7e0814546e1 100644
--- a/tensorflow/lite/experimental/ruy/pack_arm.cc
+++ b/tensorflow/lite/experimental/ruy/pack_arm.cc
@@ -1199,26 +1199,22 @@ void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
       "beq 3f\n"
 #define RUY_LOAD_FOUR_BY_FOUR() \
   /* Load q0 */ \
-  "vldr d0, [%[src_ptr0], #0]\n" \
-  "vldr d1, [%[src_ptr0], #8]\n" \
+  "vld1.32 {d0, d1}, [%[src_ptr0]]\n" \
   /* if src_inc0 != 0, add 16 to src_ptr0 */ \
   "and r3, %[src_inc], #1\n" \
   "add %[src_ptr0], %[src_ptr0], r3, lsl #4\n" \
   /* Load q1 */ \
-  "vldr d2, [%[src_ptr1], #0]\n" \
-  "vldr d3, [%[src_ptr1], #8]\n" \
+  "vld1.32 {d2, d3}, [%[src_ptr1]]\n" \
   /* if src_inc1 != 0, add 16 to src_ptr0 */ \
   "and r3, %[src_inc], #2\n" \
   "add %[src_ptr1], %[src_ptr1], r3, lsl #3\n" \
   /* Load q2 */ \
-  "vldr d4, [%[src_ptr2], #0]\n" \
-  "vldr d5, [%[src_ptr2], #8]\n" \
+  "vld1.32 {d4, d5}, [%[src_ptr2]]\n" \
   /* if src_inc2 != 0, add 16 to src_ptr0 */ \
   "and r3, %[src_inc], #4\n" \
   "add %[src_ptr2], %[src_ptr2], r3, lsl #2\n" \
   /* Load q3 */ \
-  "vldr d6, [%[src_ptr3], #0]\n" \
-  "vldr d7, [%[src_ptr3], #8]\n" \
+  "vld1.32 {d6, d7}, [%[src_ptr3]]\n" \
   /* if src_inc3 != 0, add 16 to src_ptr0 */ \
   "and r3, %[src_inc], #8\n" \
   "add %[src_ptr3], %[src_ptr3], r3, lsl #1\n" \
@@ -1253,20 +1249,16 @@ void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
 #define RUY_STORE_FOUR_BY_FOUR() \
   /* Store q8, q10, q9, q11 */ \
   /* q8 = d16, d17 */ \
-  "vstr d16, [%[packed_ptr], #0]\n" \
-  "vstr d17, [%[packed_ptr], #8]\n" \
+  "vst1.32 {d16, d17}, [%[packed_ptr]]\n" \
   /* q10 = d20, d21 */ \
   "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
-  "vstr d20, [%[packed_ptr], #0]\n" \
-  "vstr d21, [%[packed_ptr], #8]\n" \
+  "vst1.32 {d20, d21}, [%[packed_ptr]]\n" \
   /* q9 = d18, d19 */ \
   "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
-  "vstr d18, [%[packed_ptr], #0]\n" \
-  "vstr d19, [%[packed_ptr], #8]\n" \
+  "vst1.32 {d18, d19}, [%[packed_ptr]]\n" \
   /* q11 = d22, d23 */ \
   "add %[packed_ptr], %[packed_ptr], %[stride]\n" \
-  "vstr d22, [%[packed_ptr], #0]\n" \
-  "vstr d23, [%[packed_ptr], #8]\n" \
+  "vst1.32 {d22, d23}, [%[packed_ptr]]\n" \
   "add %[packed_ptr], %[packed_ptr], %[stride]\n" \

       RUY_STORE_FOUR_BY_FOUR()
@@ -1342,21 +1334,20 @@ void PackFloatNeonOutOfOrder(const float* src_ptr0, const float* src_ptr1,
       "mov r1, #32\n"

-#define RUY_STORE_ONE_ROW(ROW, REGISTER1, REGISTER2) \
+#define RUY_STORE_ONE_ROW(ROW, REGISTER) \
   "cmp r2, #" #ROW "\n" \
   "beq 4f\n" \
-  "vstr " #REGISTER1 ", [%[packed_ptr]]\n" \
-  "vstr " #REGISTER2 ", [%[packed_ptr], #8]\n" \
+  "vst1.32 {" #REGISTER "}, [%[packed_ptr]]\n" \
   "add %[packed_ptr], %[packed_ptr], %[stride]\n"

       // Store q8
-      RUY_STORE_ONE_ROW(0, d16, d17)
+      RUY_STORE_ONE_ROW(0, q8)
       // Store q10
-      RUY_STORE_ONE_ROW(1, d20, d21)
+      RUY_STORE_ONE_ROW(1, q10)
       // Store q9
-      RUY_STORE_ONE_ROW(2, d18, d19)
+      RUY_STORE_ONE_ROW(2, q9)
       // Store q11
-      RUY_STORE_ONE_ROW(3, d22, d23)
+      RUY_STORE_ONE_ROW(3, q11)

 #undef RUY_STORE_ONE_ROW
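The same transformation recurs throughout the patch: several single d-register loads and stores ("vldr", "vld1.32 {dN}", "vstr") are fused into one multi-register "vld1.32"/"vst1.32" (a register list of up to four consecutive d registers issues a single load or store of up to 32 contiguous bytes), and each reload of packed data is followed by a "pld" hint so the cache line the next iteration needs is already in flight. The snippet below is a minimal standalone sketch of that pattern, not code from ruy; the function name CopyBlock32AndPrefetch and its shape are illustrative assumptions, and the NEON path assumes a 32-bit ARM build.

// Hypothetical helper, for illustration only: copy 8 floats (32 bytes) with
// one 4-register NEON load/store pair, and prefetch the block that follows,
// mirroring the "vld1.32 {d0, d1, d2, d3}" + "pld" pairing in the kernel.
inline void CopyBlock32AndPrefetch(const float* src, float* dst) {
#if defined(__arm__) && defined(__ARM_NEON)
  asm volatile(
      // One 4-register load replaces four single-d loads.
      "vld1.32 {d0, d1, d2, d3}, [%[src]]!\n"
      // Hint the prefetcher at the data a following call would touch.
      "pld [%[src]]\n"
      // One 4-register store replaces four vstr instructions.
      "vst1.32 {d0, d1, d2, d3}, [%[dst]]!\n"
      : [src] "+r"(src), [dst] "+r"(dst)
      :
      // Clobber q registers rather than their constituent d registers,
      // matching the convention the patch adopts for the kernel's clobbers.
      : "memory", "q0", "q1");
#else
  // Portable fallback for non-NEON builds.
  for (int i = 0; i < 8; ++i) {
    dst[i] = src[i];
  }
#endif
}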
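The unchanged "and"/"add ..., lsl #N" pairs inside RUY_LOAD_FOUR_BY_FOUR deserve a note: they advance each source pointer by 16 bytes only when the matching bit of src_inc is set, without branching, by isolating bit i and shifting it left by (4 - i) so the result is either 0 or 16. The sketch below merely restates that arithmetic; the helper name AdvanceIfBitSet is an illustrative assumption, not a ruy function.

// Hypothetical restatement of the branchless pointer advance used by
// RUY_LOAD_FOUR_BY_FOUR: bit `bit` of src_inc selects a 0- or 16-byte step.
inline const float* AdvanceIfBitSet(const float* ptr, int src_inc, int bit) {
  // (src_inc & (1 << bit)) is 0 or (1 << bit); shifting left by (4 - bit)
  // rescales it to 0 or 16 -- the same effect as
  //   "and r3, %[src_inc], #(1 << bit)"
  //   "add %[src_ptrN], %[src_ptrN], r3, lsl #(4 - bit)"
  const int step_bytes = (src_inc & (1 << bit)) << (4 - bit);
  return reinterpret_cast<const float*>(
      reinterpret_cast<const char*>(ptr) + step_bytes);
}

For example, with src_inc == 0b0101, pointers 0 and 2 advance by 16 bytes each while pointers 1 and 3 stay put.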