Merge pull request #44094 from lgeiger:movi-instr

PiperOrigin-RevId: 337959731
Change-Id: Ia22fe95306b1dc5a31c34e29245739dba1dab778
This commit is contained in:
TensorFlower Gardener 2020-10-19 16:38:11 -07:00
commit c6694e7a96
2 changed files with 98 additions and 98 deletions

View File

@@ -288,13 +288,13 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"ld1 {v20.8b}, [x15], %[input_depth]\n" "ld1 {v20.8b}, [x15], %[input_depth]\n"
"saddw v14.8h, v26.8h, v14.8b\n" "saddw v14.8h, v26.8h, v14.8b\n"
"dup v21.4s, wzr\n" "movi v21.4s, #0\n"
"saddw v15.8h, v26.8h, v15.8b\n" "saddw v15.8h, v26.8h, v15.8b\n"
"dup v22.4s, wzr\n" "movi v22.4s, #0\n"
"saddw v16.8h, v26.8h, v16.8b\n" "saddw v16.8h, v26.8h, v16.8b\n"
"dup v23.4s, wzr\n" "movi v23.4s, #0\n"
"saddw v17.8h, v26.8h, v17.8b\n" "saddw v17.8h, v26.8h, v17.8b\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v18.8h, v26.8h, v18.8b\n" "saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n" "saddw v19.8h, v26.8h, v19.8b\n"
@@ -383,14 +383,14 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n" "fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n" "fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n" "movi v22.4s, #0\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n" "saddw v9.8h, v26.8h, v9.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n" "saddw v12.8h, v26.8h, v12.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n" "saddw v15.8h, v26.8h, v15.8b\n"
"dup v21.4s, wzr\n" "movi v21.4s, #0\n"
"saddw v18.8h, v26.8h, v18.8b\n" "saddw v18.8h, v26.8h, v18.8b\n"
"dup v23.4s, wzr\n" "movi v23.4s, #0\n"
// Mul-add right outputs. // Mul-add right outputs.
"smlal v21.4s, v0.4h, v10.4h\n" "smlal v21.4s, v0.4h, v10.4h\n"
@@ -480,8 +480,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n" "fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n" "fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n" "movi v22.4s, #0\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n" "saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n" "saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n" "saddw v11.8h, v26.8h, v11.8b\n"
@@ -489,9 +489,9 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"saddw v13.8h, v26.8h, v13.8b\n" "saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n" "saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n" "saddw v15.8h, v26.8h, v15.8b\n"
"dup v21.4s, wzr\n" "movi v21.4s, #0\n"
"saddw v16.8h, v26.8h, v16.8b\n" "saddw v16.8h, v26.8h, v16.8b\n"
"dup v23.4s, wzr\n" "movi v23.4s, #0\n"
"saddw v17.8h, v26.8h, v17.8b\n" "saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n" "saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n" "saddw v19.8h, v26.8h, v19.8b\n"
@@ -581,14 +581,14 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n" "fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n" "fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n" "movi v22.4s, #0\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n" "saddw v9.8h, v26.8h, v9.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n" "saddw v12.8h, v26.8h, v12.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n" "saddw v15.8h, v26.8h, v15.8b\n"
"dup v21.4s, wzr\n" "movi v21.4s, #0\n"
"saddw v18.8h, v26.8h, v18.8b\n" "saddw v18.8h, v26.8h, v18.8b\n"
"dup v23.4s, wzr\n" "movi v23.4s, #0\n"
// Mul-add right outputs. // Mul-add right outputs.
"smlal v21.4s, v0.4h, v10.4h\n" "smlal v21.4s, v0.4h, v10.4h\n"
@@ -765,10 +765,10 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"ld1 {v17.8b}, [x14], %[input_depth]\n" "ld1 {v17.8b}, [x14], %[input_depth]\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n" "ld1 {v18.8b}, [x14], %[input_depth]\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n" "ld1 {v19.8b}, [x14], %[input_depth]\n"
"dup v21.4s, wzr\n" "movi v21.4s, #0\n"
"dup v22.4s, wzr\n" "movi v22.4s, #0\n"
"dup v23.4s, wzr\n" "movi v23.4s, #0\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n" "saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n" "saddw v10.8h, v26.8h, v10.8b\n"
@@ -880,8 +880,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n" "fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n" "fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n" "movi v22.4s, #0\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n" "saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n" "saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n" "saddw v11.8h, v26.8h, v11.8b\n"
@@ -889,9 +889,9 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"saddw v13.8h, v26.8h, v13.8b\n" "saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n" "saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n" "saddw v15.8h, v26.8h, v15.8b\n"
"dup v21.4s, wzr\n" "movi v21.4s, #0\n"
"saddw v16.8h, v26.8h, v16.8b\n" "saddw v16.8h, v26.8h, v16.8b\n"
"dup v23.4s, wzr\n" "movi v23.4s, #0\n"
"saddw v17.8h, v26.8h, v17.8b\n" "saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n" "saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n" "saddw v19.8h, v26.8h, v19.8b\n"
@@ -1202,19 +1202,19 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"ld1 {v15.8b}, [x13], %[input_depth]\n" "ld1 {v15.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x19\n" "add x7, %[output_ptr], x19\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n" "ld1 {v16.8b}, [x13], %[input_depth]\n"
"dup v21.4s, wzr\n" "movi v21.4s, #0\n"
"dup v22.4s, wzr\n" "movi v22.4s, #0\n"
"dup v23.4s, wzr\n" "movi v23.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n" "saddw v9.8h, v28.8h, v9.8b\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v10.8h, v28.8h, v10.8b\n" "saddw v10.8h, v28.8h, v10.8b\n"
"dup v19.4s, wzr\n" "movi v19.4s, #0\n"
"saddw v11.8h, v28.8h, v11.8b\n" "saddw v11.8h, v28.8h, v11.8b\n"
"dup v20.4s, wzr\n" "movi v20.4s, #0\n"
"saddw v14.8h, v28.8h, v14.8b\n" "saddw v14.8h, v28.8h, v14.8b\n"
"dup v25.4s, wzr\n" "movi v25.4s, #0\n"
"saddw v15.8h, v28.8h, v15.8b\n" "saddw v15.8h, v28.8h, v15.8b\n"
"dup v26.4s, wzr\n" "movi v26.4s, #0\n"
"saddw v16.8h, v28.8h, v16.8b\n" "saddw v16.8h, v28.8h, v16.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n" "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
@@ -1343,8 +1343,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n" "fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n" "fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n" "movi v22.4s, #0\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n" "saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n" "saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n" "saddw v11.8h, v28.8h, v11.8b\n"
@@ -1373,9 +1373,9 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"smlal2 v20.4s, v3.8h, v14.8h\n" "smlal2 v20.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n" "ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v3.4h, v16.4h\n" "smlal v25.4s, v3.4h, v16.4h\n"
"dup v21.4s, wzr\n" "movi v21.4s, #0\n"
"smlal2 v26.4s, v3.8h, v16.8h\n" "smlal2 v26.4s, v3.8h, v16.8h\n"
"dup v23.4s, wzr\n" "movi v23.4s, #0\n"
"smlal v19.4s, v4.4h, v15.4h\n" "smlal v19.4s, v4.4h, v15.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n" "saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n" "smlal2 v20.4s, v4.8h, v15.8h\n"
@@ -1423,14 +1423,14 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v25.4s, v25.4s\n" "fcvtms v25.4s, v25.4s\n"
"fcvtms v26.4s, v26.4s\n" "fcvtms v26.4s, v26.4s\n"
"dup v20.4s, wzr\n" "movi v20.4s, #0\n"
"dup v26.4s, wzr\n" "movi v26.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n" "saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n" "saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n" "saddw v11.8h, v28.8h, v11.8b\n"
"dup v19.4s, wzr\n" "movi v19.4s, #0\n"
"saddw v14.8h, v28.8h, v14.8b\n" "saddw v14.8h, v28.8h, v14.8b\n"
"dup v25.4s, wzr\n" "movi v25.4s, #0\n"
"saddw v15.8h, v28.8h, v15.8b\n" "saddw v15.8h, v28.8h, v15.8b\n"
"saddw v16.8h, v28.8h, v16.8b\n" "saddw v16.8h, v28.8h, v16.8b\n"
@@ -1557,8 +1557,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n" "fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n" "fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n" "movi v22.4s, #0\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n" "saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n" "saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n" "saddw v11.8h, v28.8h, v11.8b\n"
@@ -1764,12 +1764,12 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"ld1 {v17.8b}, [x15], %[input_depth]\n" "ld1 {v17.8b}, [x15], %[input_depth]\n"
"saddw v9.8h, v28.8h, v9.8b\n" "saddw v9.8h, v28.8h, v9.8b\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v10.8h, v28.8h, v10.8b\n" "saddw v10.8h, v28.8h, v10.8b\n"
"dup v25.4s, wzr\n" "movi v25.4s, #0\n"
"saddw v11.8h, v28.8h, v11.8b\n" "saddw v11.8h, v28.8h, v11.8b\n"
"dup v26.4s, wzr\n" "movi v26.4s, #0\n"
"dup v27.4s, wzr\n" "movi v27.4s, #0\n"
"saddw v12.8h, v28.8h, v12.8b\n" "saddw v12.8h, v28.8h, v12.8b\n"
"saddw v13.8h, v28.8h, v13.8b\n" "saddw v13.8h, v28.8h, v13.8b\n"
"saddw v14.8h, v28.8h, v14.8b\n" "saddw v14.8h, v28.8h, v14.8b\n"
@@ -1879,17 +1879,17 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v26.4s, v26.4s\n" "fcvtms v26.4s, v26.4s\n"
"fcvtms v27.4s, v27.4s\n" "fcvtms v27.4s, v27.4s\n"
"dup v25.4s, wzr\n" "movi v25.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n" "saddw v9.8h, v28.8h, v9.8b\n"
"dup v27.4s, wzr\n" "movi v27.4s, #0\n"
"saddw v10.8h, v28.8h, v10.8b\n" "saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n" "saddw v11.8h, v28.8h, v11.8b\n"
"saddw v12.8h, v28.8h, v12.8b\n" "saddw v12.8h, v28.8h, v12.8b\n"
"saddw v13.8h, v28.8h, v13.8b\n" "saddw v13.8h, v28.8h, v13.8b\n"
"saddw v14.8h, v28.8h, v14.8b\n" "saddw v14.8h, v28.8h, v14.8b\n"
"dup v24.4s, wzr\n" "movi v24.4s, #0\n"
"saddw v15.8h, v28.8h, v15.8b\n" "saddw v15.8h, v28.8h, v15.8b\n"
"dup v26.4s, wzr\n" "movi v26.4s, #0\n"
"saddw v16.8h, v28.8h, v16.8b\n" "saddw v16.8h, v28.8h, v16.8b\n"
"saddw v17.8h, v28.8h, v17.8b\n" "saddw v17.8h, v28.8h, v17.8b\n"
@@ -2094,9 +2094,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"ldr w10, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n" "ldr w10, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.4s, w9\n" "dup v30.4s, w9\n"
"dup v31.4s, w10\n" "dup v31.4s, w10\n"
"dup v16.4s, wzr\n" "movi v16.4s, #0\n"
"saddw v8.8h, v26.8h, v8.8b\n" "saddw v8.8h, v26.8h, v8.8b\n"
"dup v17.4s, wzr\n" "movi v17.4s, #0\n"
"sshll v0.8h, v0.8b, #0\n" "sshll v0.8h, v0.8b, #0\n"
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n" "ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
@@ -2133,9 +2133,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"fcvtms v17.4s, v17.4s\n" "fcvtms v17.4s, v17.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n" "saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n" "movi v16.4s, #0\n"
"sshll v0.8h, v0.8b, #0\n" "sshll v0.8h, v0.8b, #0\n"
"dup v17.4s, wzr\n" "movi v17.4s, #0\n"
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n" "ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
"ld1 {v10.4s}, [%[bias_ptr]], #16\n" "ld1 {v10.4s}, [%[bias_ptr]], #16\n"
"ld1 {v7.4s}, [%[per_channel_scales]], #16\n" "ld1 {v7.4s}, [%[per_channel_scales]], #16\n"
@@ -2241,9 +2241,9 @@ struct DepthwiseConvHybridPartialPerChannel<
// Add input and filter offsets. // Add input and filter offsets.
"saddw v8.8h, v26.8h, v8.8b\n" "saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n" "movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n" "saddw v9.8h, v26.8h, v9.8b\n"
"dup v17.4s, wzr\n" "movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n" "saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n" "saddw v11.8h, v26.8h, v11.8b\n"
@@ -2290,9 +2290,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"fcvtms v17.4s, v17.4s\n" "fcvtms v17.4s, v17.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n" "saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n" "movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n" "saddw v9.8h, v26.8h, v9.8b\n"
"dup v17.4s, wzr\n" "movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n" "saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n" "saddw v11.8h, v26.8h, v11.8b\n"
"sshll v0.8h, v0.8b, #0\n" "sshll v0.8h, v0.8b, #0\n"
@@ -2417,9 +2417,9 @@ struct DepthwiseConvHybridPartialPerChannel<
// Add input and filter offsets. // Add input and filter offsets.
"saddw v8.8h, v26.8h, v8.8b\n" "saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n" "movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n" "saddw v9.8h, v26.8h, v9.8b\n"
"dup v17.4s, wzr\n" "movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n" "saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n" "saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n" "saddw v12.8h, v26.8h, v12.8b\n"
@@ -2494,9 +2494,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"sshll v0.8h, v0.8b, #0\n" "sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n" "sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n" "sshll v2.8h, v2.8b, #0\n"
"dup v16.4s, wzr\n" "movi v16.4s, #0\n"
"sshll v3.8h, v3.8b, #0\n" "sshll v3.8h, v3.8b, #0\n"
"dup v17.4s, wzr\n" "movi v17.4s, #0\n"
"sshll v4.8h, v4.8b, #0\n" "sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n" "sshll v5.8h, v5.8b, #0\n"
"ld1 {v6.4s}, [%[bias_ptr]], #16\n" "ld1 {v6.4s}, [%[bias_ptr]], #16\n"
@@ -2622,9 +2622,9 @@ struct DepthwiseConvHybridPartialPerChannel<
// Add input and filter offsets. // Add input and filter offsets.
"saddw v8.8h, v26.8h, v8.8b\n" "saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n" "movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n" "saddw v9.8h, v26.8h, v9.8b\n"
"dup v17.4s, wzr\n" "movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n" "saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n" "saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n" "saddw v12.8h, v26.8h, v12.8b\n"
@@ -2701,9 +2701,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"sshll v0.8h, v0.8b, #0\n" "sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n" "sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n" "sshll v2.8h, v2.8b, #0\n"
"dup v16.4s, wzr\n" "movi v16.4s, #0\n"
"sshll v3.8h, v3.8b, #0\n" "sshll v3.8h, v3.8b, #0\n"
"dup v17.4s, wzr\n" "movi v17.4s, #0\n"
"sshll v4.8h, v4.8b, #0\n" "sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n" "sshll v5.8h, v5.8b, #0\n"

View File

@@ -372,10 +372,10 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
asm volatile( asm volatile(
// Zero out the accumulator registers. // Zero out the accumulator registers.
"dup v0.4s, wzr\n" "movi v0.4s, #0\n"
"dup v1.4s, wzr\n" "movi v1.4s, #0\n"
"dup v2.4s, wzr\n" "movi v2.4s, #0\n"
"dup v3.4s, wzr\n" "movi v3.4s, #0\n"
"1:\n" // batch_cols_loop "1:\n" // batch_cols_loop
@@ -463,12 +463,12 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
"st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
: [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1),
[ vec_ptr ] "+r"(vec_ptr), [ result_ptr ] "+r"(result_ptr), [vec_ptr] "+r"(vec_ptr), [result_ptr] "+r"(result_ptr),
[ mat_ptr2 ] "+r"(mat_ptr2), [ mat_ptr3 ] "+r"(mat_ptr3) [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3)
: [ mat_ptr0_end ] "r"(mat_ptr0_end), : [mat_ptr0_end] "r"(mat_ptr0_end),
[ scaling_factors_ptr ] "r"(scaling_factors_ptr), [scaling_factors_ptr] "r"(scaling_factors_ptr),
[ wide_rows ] "r"(wide_rows) [wide_rows] "r"(wide_rows)
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "cc", "memory"); "v10", "v11", "v12", "v13", "cc", "memory");
} }
@@ -501,16 +501,16 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr; const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr;
const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr; const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr;
asm volatile( asm volatile(
"dup v0.4s, wzr\n" "movi v0.4s, #0\n"
"dup v1.4s, wzr\n" "movi v1.4s, #0\n"
"dup v2.4s, wzr\n" "movi v2.4s, #0\n"
"dup v3.4s, wzr\n" "movi v3.4s, #0\n"
// Load zero points. // Load zero points.
"ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n"
"ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
// Zero out zero point accumulators. // Zero out zero point accumulators.
"dup v14.4s, wzr\n" "movi v14.4s, #0\n"
"dup v15.4s, wzr\n" "movi v15.4s, #0\n"
// Load per channel scales if not null. // Load per channel scales if not null.
"cmp %w[is_channel_scale_nullptr], #0\n" "cmp %w[is_channel_scale_nullptr], #0\n"
@@ -587,16 +587,16 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
"st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
: [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1),
[ vec_ptr ] "+r"(vec_ptr), [ result_ptr ] "+r"(result_ptr), [vec_ptr] "+r"(vec_ptr), [result_ptr] "+r"(result_ptr),
[ row_sums_ptr ] "+r"(row_sums_ptr) [row_sums_ptr] "+r"(row_sums_ptr)
: [ mat_ptr0_end ] "r"(mat_ptr0_end), : [mat_ptr0_end] "r"(mat_ptr0_end),
[ scaling_factors_ptr ] "r"(scaling_factors_ptr), [scaling_factors_ptr] "r"(scaling_factors_ptr),
[ wide_rows ] "r"(wide_rows), [wide_rows] "r"(wide_rows),
[ channel_scales_ptr ] "r"(channel_scales_ptr), [channel_scales_ptr] "r"(channel_scales_ptr),
[ batch_offsets_ptr ] "r"(batch_offsets_ptr), [batch_offsets_ptr] "r"(batch_offsets_ptr),
[ is_channel_scale_nullptr ] "r"(is_channel_scale_nullptr), [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr),
[ is_row_sums_nullptr ] "r"(is_row_sums_nullptr) [is_row_sums_nullptr] "r"(is_row_sums_nullptr)
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1",
"cc", "memory"); "cc", "memory");
@@ -746,9 +746,9 @@ static void DotprodSparseMatrixBatchVectorMultiplyAccumulate(
if (ledger_ptr != ledger_end) { if (ledger_ptr != ledger_end) {
asm volatile( asm volatile(
"dup v0.4s, wzr\n" "movi v0.4s, #0\n"
"dup v1.4s, wzr\n" "movi v1.4s, #0\n"
"dup v8.4s, wzr\n" "movi v8.4s, #0\n"
"mov x7, 0\n" "mov x7, 0\n"
"1:\n" // chunks_loop "1:\n" // chunks_loop
@@ -775,9 +775,9 @@ static void DotprodSparseMatrixBatchVectorMultiplyAccumulate(
// We have to be careful to cast this value to 32 bits in order // We have to be careful to cast this value to 32 bits in order
// to interpret the sign bit properly. // to interpret the sign bit properly.
"mov %[row_sum], v1.d[0]\n" "mov %[row_sum], v1.d[0]\n"
: [ row_sum ] "=r"(row_sum), [ ledger_ptr ] "+r"(ledger_ptr), : [row_sum] "=r"(row_sum), [ledger_ptr] "+r"(ledger_ptr),
[ mat_ptr ] "+r"(mat_ptr), [ vec_ptr ] "+r"(vec_ptr) [mat_ptr] "+r"(mat_ptr), [vec_ptr] "+r"(vec_ptr)
: [ ledger_end ] "r"(ledger_end) : [ledger_end] "r"(ledger_end)
: "x0", "x1", "x7", "x8", "v0", "v1", "v8", "v9", "cc", "memory"); : "x0", "x1", "x7", "x8", "v0", "v1", "v8", "v9", "cc", "memory");
} }
result[batch * m_rows + row] += result[batch * m_rows + row] +=