Use movi NEON instruction to zero out registers
Currently `dup` is used to zero out NEON registers in the packing and AArch64 kernel code. According to the [Cortex A72 optimization guide](https://developer.arm.com/documentation/uan0016/a/) (the Cortex-A72 is the core used in the Raspberry Pi 4), `dup` has an execution latency of 8 cycles and a throughput of 1 when copying from a general-purpose register to a NEON register. This PR changes the code to use `movi`, which has a latency of 3 cycles and a throughput of 2. `movi` is also what [LLVM uses for zeroing out registers](https://github.com/llvm/llvm-project/blob/master/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll), but please let me know if I am missing something here.
This commit is contained in:
parent
716e817347
commit
4c1a454e58
@ -288,13 +288,13 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"ld1 {v20.8b}, [x15], %[input_depth]\n"
|
||||
"saddw v14.8h, v26.8h, v14.8b\n"
|
||||
|
||||
"dup v21.4s, wzr\n"
|
||||
"movi v21.4s, #0\n"
|
||||
"saddw v15.8h, v26.8h, v15.8b\n"
|
||||
"dup v22.4s, wzr\n"
|
||||
"movi v22.4s, #0\n"
|
||||
"saddw v16.8h, v26.8h, v16.8b\n"
|
||||
"dup v23.4s, wzr\n"
|
||||
"movi v23.4s, #0\n"
|
||||
"saddw v17.8h, v26.8h, v17.8b\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v24.4s, #0\n"
|
||||
|
||||
"saddw v18.8h, v26.8h, v18.8b\n"
|
||||
"saddw v19.8h, v26.8h, v19.8b\n"
|
||||
@ -383,14 +383,14 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"fcvtms v23.4s, v23.4s\n"
|
||||
"fcvtms v24.4s, v24.4s\n"
|
||||
|
||||
"dup v22.4s, wzr\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v22.4s, #0\n"
|
||||
"movi v24.4s, #0\n"
|
||||
"saddw v9.8h, v26.8h, v9.8b\n"
|
||||
"saddw v12.8h, v26.8h, v12.8b\n"
|
||||
"saddw v15.8h, v26.8h, v15.8b\n"
|
||||
"dup v21.4s, wzr\n"
|
||||
"movi v21.4s, #0\n"
|
||||
"saddw v18.8h, v26.8h, v18.8b\n"
|
||||
"dup v23.4s, wzr\n"
|
||||
"movi v23.4s, #0\n"
|
||||
|
||||
// Mul-add right outputs.
|
||||
"smlal v21.4s, v0.4h, v10.4h\n"
|
||||
@ -480,8 +480,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"fcvtms v23.4s, v23.4s\n"
|
||||
"fcvtms v24.4s, v24.4s\n"
|
||||
|
||||
"dup v22.4s, wzr\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v22.4s, #0\n"
|
||||
"movi v24.4s, #0\n"
|
||||
"saddw v9.8h, v26.8h, v9.8b\n"
|
||||
"saddw v10.8h, v26.8h, v10.8b\n"
|
||||
"saddw v11.8h, v26.8h, v11.8b\n"
|
||||
@ -489,9 +489,9 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"saddw v13.8h, v26.8h, v13.8b\n"
|
||||
"saddw v14.8h, v26.8h, v14.8b\n"
|
||||
"saddw v15.8h, v26.8h, v15.8b\n"
|
||||
"dup v21.4s, wzr\n"
|
||||
"movi v21.4s, #0\n"
|
||||
"saddw v16.8h, v26.8h, v16.8b\n"
|
||||
"dup v23.4s, wzr\n"
|
||||
"movi v23.4s, #0\n"
|
||||
"saddw v17.8h, v26.8h, v17.8b\n"
|
||||
"saddw v18.8h, v26.8h, v18.8b\n"
|
||||
"saddw v19.8h, v26.8h, v19.8b\n"
|
||||
@ -581,14 +581,14 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"fcvtms v23.4s, v23.4s\n"
|
||||
"fcvtms v24.4s, v24.4s\n"
|
||||
|
||||
"dup v22.4s, wzr\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v22.4s, #0\n"
|
||||
"movi v24.4s, #0\n"
|
||||
"saddw v9.8h, v26.8h, v9.8b\n"
|
||||
"saddw v12.8h, v26.8h, v12.8b\n"
|
||||
"saddw v15.8h, v26.8h, v15.8b\n"
|
||||
"dup v21.4s, wzr\n"
|
||||
"movi v21.4s, #0\n"
|
||||
"saddw v18.8h, v26.8h, v18.8b\n"
|
||||
"dup v23.4s, wzr\n"
|
||||
"movi v23.4s, #0\n"
|
||||
|
||||
// Mul-add right outputs.
|
||||
"smlal v21.4s, v0.4h, v10.4h\n"
|
||||
@ -765,10 +765,10 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"ld1 {v17.8b}, [x14], %[input_depth]\n"
|
||||
"ld1 {v18.8b}, [x14], %[input_depth]\n"
|
||||
"ld1 {v19.8b}, [x14], %[input_depth]\n"
|
||||
"dup v21.4s, wzr\n"
|
||||
"dup v22.4s, wzr\n"
|
||||
"dup v23.4s, wzr\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v21.4s, #0\n"
|
||||
"movi v22.4s, #0\n"
|
||||
"movi v23.4s, #0\n"
|
||||
"movi v24.4s, #0\n"
|
||||
|
||||
"saddw v9.8h, v26.8h, v9.8b\n"
|
||||
"saddw v10.8h, v26.8h, v10.8b\n"
|
||||
@ -880,8 +880,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"fcvtms v23.4s, v23.4s\n"
|
||||
"fcvtms v24.4s, v24.4s\n"
|
||||
|
||||
"dup v22.4s, wzr\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v22.4s, #0\n"
|
||||
"movi v24.4s, #0\n"
|
||||
"saddw v9.8h, v26.8h, v9.8b\n"
|
||||
"saddw v10.8h, v26.8h, v10.8b\n"
|
||||
"saddw v11.8h, v26.8h, v11.8b\n"
|
||||
@ -889,9 +889,9 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"saddw v13.8h, v26.8h, v13.8b\n"
|
||||
"saddw v14.8h, v26.8h, v14.8b\n"
|
||||
"saddw v15.8h, v26.8h, v15.8b\n"
|
||||
"dup v21.4s, wzr\n"
|
||||
"movi v21.4s, #0\n"
|
||||
"saddw v16.8h, v26.8h, v16.8b\n"
|
||||
"dup v23.4s, wzr\n"
|
||||
"movi v23.4s, #0\n"
|
||||
"saddw v17.8h, v26.8h, v17.8b\n"
|
||||
"saddw v18.8h, v26.8h, v18.8b\n"
|
||||
"saddw v19.8h, v26.8h, v19.8b\n"
|
||||
@ -1202,19 +1202,19 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"ld1 {v15.8b}, [x13], %[input_depth]\n"
|
||||
"add x7, %[output_ptr], x19\n"
|
||||
"ld1 {v16.8b}, [x13], %[input_depth]\n"
|
||||
"dup v21.4s, wzr\n"
|
||||
"dup v22.4s, wzr\n"
|
||||
"dup v23.4s, wzr\n"
|
||||
"movi v21.4s, #0\n"
|
||||
"movi v22.4s, #0\n"
|
||||
"movi v23.4s, #0\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v24.4s, #0\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
"dup v19.4s, wzr\n"
|
||||
"movi v19.4s, #0\n"
|
||||
"saddw v11.8h, v28.8h, v11.8b\n"
|
||||
"dup v20.4s, wzr\n"
|
||||
"movi v20.4s, #0\n"
|
||||
"saddw v14.8h, v28.8h, v14.8b\n"
|
||||
"dup v25.4s, wzr\n"
|
||||
"movi v25.4s, #0\n"
|
||||
"saddw v15.8h, v28.8h, v15.8b\n"
|
||||
"dup v26.4s, wzr\n"
|
||||
"movi v26.4s, #0\n"
|
||||
"saddw v16.8h, v28.8h, v16.8b\n"
|
||||
|
||||
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
|
||||
@ -1343,8 +1343,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"fcvtms v23.4s, v23.4s\n"
|
||||
"fcvtms v24.4s, v24.4s\n"
|
||||
|
||||
"dup v22.4s, wzr\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v22.4s, #0\n"
|
||||
"movi v24.4s, #0\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
"saddw v11.8h, v28.8h, v11.8b\n"
|
||||
@ -1373,9 +1373,9 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"smlal2 v20.4s, v3.8h, v14.8h\n"
|
||||
"ld1 {v14.8b}, [x13], %[input_depth]\n"
|
||||
"smlal v25.4s, v3.4h, v16.4h\n"
|
||||
"dup v21.4s, wzr\n"
|
||||
"movi v21.4s, #0\n"
|
||||
"smlal2 v26.4s, v3.8h, v16.8h\n"
|
||||
"dup v23.4s, wzr\n"
|
||||
"movi v23.4s, #0\n"
|
||||
"smlal v19.4s, v4.4h, v15.4h\n"
|
||||
"saddw v17.8h, v28.8h, v17.8b\n"
|
||||
"smlal2 v20.4s, v4.8h, v15.8h\n"
|
||||
@ -1423,14 +1423,14 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"fcvtms v25.4s, v25.4s\n"
|
||||
"fcvtms v26.4s, v26.4s\n"
|
||||
|
||||
"dup v20.4s, wzr\n"
|
||||
"dup v26.4s, wzr\n"
|
||||
"movi v20.4s, #0\n"
|
||||
"movi v26.4s, #0\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
"saddw v11.8h, v28.8h, v11.8b\n"
|
||||
"dup v19.4s, wzr\n"
|
||||
"movi v19.4s, #0\n"
|
||||
"saddw v14.8h, v28.8h, v14.8b\n"
|
||||
"dup v25.4s, wzr\n"
|
||||
"movi v25.4s, #0\n"
|
||||
"saddw v15.8h, v28.8h, v15.8b\n"
|
||||
"saddw v16.8h, v28.8h, v16.8b\n"
|
||||
|
||||
@ -1557,8 +1557,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"fcvtms v23.4s, v23.4s\n"
|
||||
"fcvtms v24.4s, v24.4s\n"
|
||||
|
||||
"dup v22.4s, wzr\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v22.4s, #0\n"
|
||||
"movi v24.4s, #0\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
"saddw v11.8h, v28.8h, v11.8b\n"
|
||||
@ -1764,12 +1764,12 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"ld1 {v17.8b}, [x15], %[input_depth]\n"
|
||||
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v24.4s, #0\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
"dup v25.4s, wzr\n"
|
||||
"movi v25.4s, #0\n"
|
||||
"saddw v11.8h, v28.8h, v11.8b\n"
|
||||
"dup v26.4s, wzr\n"
|
||||
"dup v27.4s, wzr\n"
|
||||
"movi v26.4s, #0\n"
|
||||
"movi v27.4s, #0\n"
|
||||
"saddw v12.8h, v28.8h, v12.8b\n"
|
||||
"saddw v13.8h, v28.8h, v13.8b\n"
|
||||
"saddw v14.8h, v28.8h, v14.8b\n"
|
||||
@ -1879,17 +1879,17 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"fcvtms v26.4s, v26.4s\n"
|
||||
"fcvtms v27.4s, v27.4s\n"
|
||||
|
||||
"dup v25.4s, wzr\n"
|
||||
"movi v25.4s, #0\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"dup v27.4s, wzr\n"
|
||||
"movi v27.4s, #0\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
"saddw v11.8h, v28.8h, v11.8b\n"
|
||||
"saddw v12.8h, v28.8h, v12.8b\n"
|
||||
"saddw v13.8h, v28.8h, v13.8b\n"
|
||||
"saddw v14.8h, v28.8h, v14.8b\n"
|
||||
"dup v24.4s, wzr\n"
|
||||
"movi v24.4s, #0\n"
|
||||
"saddw v15.8h, v28.8h, v15.8b\n"
|
||||
"dup v26.4s, wzr\n"
|
||||
"movi v26.4s, #0\n"
|
||||
"saddw v16.8h, v28.8h, v16.8b\n"
|
||||
"saddw v17.8h, v28.8h, v17.8b\n"
|
||||
|
||||
@ -2094,9 +2094,9 @@ struct DepthwiseConvHybridPartialPerChannel<
|
||||
"ldr w10, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n"
|
||||
"dup v30.4s, w9\n"
|
||||
"dup v31.4s, w10\n"
|
||||
"dup v16.4s, wzr\n"
|
||||
"movi v16.4s, #0\n"
|
||||
"saddw v8.8h, v26.8h, v8.8b\n"
|
||||
"dup v17.4s, wzr\n"
|
||||
"movi v17.4s, #0\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
|
||||
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
|
||||
@ -2133,9 +2133,9 @@ struct DepthwiseConvHybridPartialPerChannel<
|
||||
"fcvtms v17.4s, v17.4s\n"
|
||||
|
||||
"saddw v8.8h, v26.8h, v8.8b\n"
|
||||
"dup v16.4s, wzr\n"
|
||||
"movi v16.4s, #0\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"dup v17.4s, wzr\n"
|
||||
"movi v17.4s, #0\n"
|
||||
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
|
||||
"ld1 {v10.4s}, [%[bias_ptr]], #16\n"
|
||||
"ld1 {v7.4s}, [%[per_channel_scales]], #16\n"
|
||||
@ -2241,9 +2241,9 @@ struct DepthwiseConvHybridPartialPerChannel<
|
||||
|
||||
// Add input and filter offsets.
|
||||
"saddw v8.8h, v26.8h, v8.8b\n"
|
||||
"dup v16.4s, wzr\n"
|
||||
"movi v16.4s, #0\n"
|
||||
"saddw v9.8h, v26.8h, v9.8b\n"
|
||||
"dup v17.4s, wzr\n"
|
||||
"movi v17.4s, #0\n"
|
||||
"saddw v10.8h, v26.8h, v10.8b\n"
|
||||
"saddw v11.8h, v26.8h, v11.8b\n"
|
||||
|
||||
@ -2290,9 +2290,9 @@ struct DepthwiseConvHybridPartialPerChannel<
|
||||
"fcvtms v17.4s, v17.4s\n"
|
||||
|
||||
"saddw v8.8h, v26.8h, v8.8b\n"
|
||||
"dup v16.4s, wzr\n"
|
||||
"movi v16.4s, #0\n"
|
||||
"saddw v9.8h, v26.8h, v9.8b\n"
|
||||
"dup v17.4s, wzr\n"
|
||||
"movi v17.4s, #0\n"
|
||||
"saddw v10.8h, v26.8h, v10.8b\n"
|
||||
"saddw v11.8h, v26.8h, v11.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
@ -2417,9 +2417,9 @@ struct DepthwiseConvHybridPartialPerChannel<
|
||||
|
||||
// Add input and filter offsets.
|
||||
"saddw v8.8h, v26.8h, v8.8b\n"
|
||||
"dup v16.4s, wzr\n"
|
||||
"movi v16.4s, #0\n"
|
||||
"saddw v9.8h, v26.8h, v9.8b\n"
|
||||
"dup v17.4s, wzr\n"
|
||||
"movi v17.4s, #0\n"
|
||||
"saddw v10.8h, v26.8h, v10.8b\n"
|
||||
"saddw v11.8h, v26.8h, v11.8b\n"
|
||||
"saddw v12.8h, v26.8h, v12.8b\n"
|
||||
@ -2494,9 +2494,9 @@ struct DepthwiseConvHybridPartialPerChannel<
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"dup v16.4s, wzr\n"
|
||||
"movi v16.4s, #0\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
"dup v17.4s, wzr\n"
|
||||
"movi v17.4s, #0\n"
|
||||
"sshll v4.8h, v4.8b, #0\n"
|
||||
"sshll v5.8h, v5.8b, #0\n"
|
||||
"ld1 {v6.4s}, [%[bias_ptr]], #16\n"
|
||||
@ -2622,9 +2622,9 @@ struct DepthwiseConvHybridPartialPerChannel<
|
||||
|
||||
// Add input and filter offsets.
|
||||
"saddw v8.8h, v26.8h, v8.8b\n"
|
||||
"dup v16.4s, wzr\n"
|
||||
"movi v16.4s, #0\n"
|
||||
"saddw v9.8h, v26.8h, v9.8b\n"
|
||||
"dup v17.4s, wzr\n"
|
||||
"movi v17.4s, #0\n"
|
||||
"saddw v10.8h, v26.8h, v10.8b\n"
|
||||
"saddw v11.8h, v26.8h, v11.8b\n"
|
||||
"saddw v12.8h, v26.8h, v12.8b\n"
|
||||
@ -2701,9 +2701,9 @@ struct DepthwiseConvHybridPartialPerChannel<
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"dup v16.4s, wzr\n"
|
||||
"movi v16.4s, #0\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
"dup v17.4s, wzr\n"
|
||||
"movi v17.4s, #0\n"
|
||||
"sshll v4.8h, v4.8b, #0\n"
|
||||
"sshll v5.8h, v5.8b, #0\n"
|
||||
|
||||
|
@ -372,10 +372,10 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
|
||||
|
||||
asm volatile(
|
||||
// Zero out the accumulator registers.
|
||||
"dup v0.4s, wzr\n"
|
||||
"dup v1.4s, wzr\n"
|
||||
"dup v2.4s, wzr\n"
|
||||
"dup v3.4s, wzr\n"
|
||||
"movi v0.4s, #0\n"
|
||||
"movi v1.4s, #0\n"
|
||||
"movi v2.4s, #0\n"
|
||||
"movi v3.4s, #0\n"
|
||||
|
||||
"1:\n" // batch_cols_loop
|
||||
|
||||
@ -501,16 +501,16 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
|
||||
const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr;
|
||||
const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr;
|
||||
asm volatile(
|
||||
"dup v0.4s, wzr\n"
|
||||
"dup v1.4s, wzr\n"
|
||||
"dup v2.4s, wzr\n"
|
||||
"dup v3.4s, wzr\n"
|
||||
"movi v0.4s, #0\n"
|
||||
"movi v1.4s, #0\n"
|
||||
"movi v2.4s, #0\n"
|
||||
"movi v3.4s, #0\n"
|
||||
// Load zero points.
|
||||
"ld1 {v7.4s}, [%[batch_offsets_ptr]]\n"
|
||||
"ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
|
||||
// Zero out zero point accumulators.
|
||||
"dup v14.4s, wzr\n"
|
||||
"dup v15.4s, wzr\n"
|
||||
"movi v14.4s, #0\n"
|
||||
"movi v15.4s, #0\n"
|
||||
|
||||
// Load per channel scales if not null.
|
||||
"cmp %w[is_channel_scale_nullptr], #0\n"
|
||||
@ -746,9 +746,9 @@ static void DotprodSparseMatrixBatchVectorMultiplyAccumulate(
|
||||
|
||||
if (ledger_ptr != ledger_end) {
|
||||
asm volatile(
|
||||
"dup v0.4s, wzr\n"
|
||||
"dup v1.4s, wzr\n"
|
||||
"dup v8.4s, wzr\n"
|
||||
"movi v0.4s, #0\n"
|
||||
"movi v1.4s, #0\n"
|
||||
"movi v8.4s, #0\n"
|
||||
"mov x7, 0\n"
|
||||
|
||||
"1:\n" // chunks_loop
|
||||
|
Loading…
Reference in New Issue
Block a user