Merge pull request #44094 from lgeiger:movi-instr

PiperOrigin-RevId: 337959731
Change-Id: Ia22fe95306b1dc5a31c34e29245739dba1dab778
This commit is contained in:
TensorFlower Gardener 2020-10-19 16:38:11 -07:00
commit c6694e7a96
2 changed files with 98 additions and 98 deletions

View File

@ -288,13 +288,13 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"dup v21.4s, wzr\n"
"movi v21.4s, #0\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"dup v22.4s, wzr\n"
"movi v22.4s, #0\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"dup v23.4s, wzr\n"
"movi v23.4s, #0\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"dup v24.4s, wzr\n"
"movi v24.4s, #0\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
@ -383,14 +383,14 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n"
"dup v24.4s, wzr\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"dup v21.4s, wzr\n"
"movi v21.4s, #0\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"dup v23.4s, wzr\n"
"movi v23.4s, #0\n"
// Mul-add right outputs.
"smlal v21.4s, v0.4h, v10.4h\n"
@ -480,8 +480,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n"
"dup v24.4s, wzr\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
@ -489,9 +489,9 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"dup v21.4s, wzr\n"
"movi v21.4s, #0\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"dup v23.4s, wzr\n"
"movi v23.4s, #0\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
@ -581,14 +581,14 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n"
"dup v24.4s, wzr\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"dup v21.4s, wzr\n"
"movi v21.4s, #0\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"dup v23.4s, wzr\n"
"movi v23.4s, #0\n"
// Mul-add right outputs.
"smlal v21.4s, v0.4h, v10.4h\n"
@ -765,10 +765,10 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
"dup v21.4s, wzr\n"
"dup v22.4s, wzr\n"
"dup v23.4s, wzr\n"
"dup v24.4s, wzr\n"
"movi v21.4s, #0\n"
"movi v22.4s, #0\n"
"movi v23.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
@ -880,8 +880,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n"
"dup v24.4s, wzr\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
@ -889,9 +889,9 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"dup v21.4s, wzr\n"
"movi v21.4s, #0\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"dup v23.4s, wzr\n"
"movi v23.4s, #0\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
@ -1202,19 +1202,19 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x19\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"dup v21.4s, wzr\n"
"dup v22.4s, wzr\n"
"dup v23.4s, wzr\n"
"movi v21.4s, #0\n"
"movi v22.4s, #0\n"
"movi v23.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"dup v24.4s, wzr\n"
"movi v24.4s, #0\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"dup v19.4s, wzr\n"
"movi v19.4s, #0\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"dup v20.4s, wzr\n"
"movi v20.4s, #0\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"dup v25.4s, wzr\n"
"movi v25.4s, #0\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"dup v26.4s, wzr\n"
"movi v26.4s, #0\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
@ -1343,8 +1343,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n"
"dup v24.4s, wzr\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n"
@ -1373,9 +1373,9 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"smlal2 v20.4s, v3.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v25.4s, v3.4h, v16.4h\n"
"dup v21.4s, wzr\n"
"movi v21.4s, #0\n"
"smlal2 v26.4s, v3.8h, v16.8h\n"
"dup v23.4s, wzr\n"
"movi v23.4s, #0\n"
"smlal v19.4s, v4.4h, v15.4h\n"
"saddw v17.8h, v28.8h, v17.8b\n"
"smlal2 v20.4s, v4.8h, v15.8h\n"
@ -1423,14 +1423,14 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v25.4s, v25.4s\n"
"fcvtms v26.4s, v26.4s\n"
"dup v20.4s, wzr\n"
"dup v26.4s, wzr\n"
"movi v20.4s, #0\n"
"movi v26.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"dup v19.4s, wzr\n"
"movi v19.4s, #0\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"dup v25.4s, wzr\n"
"movi v25.4s, #0\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"saddw v16.8h, v28.8h, v16.8b\n"
@ -1557,8 +1557,8 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v23.4s, v23.4s\n"
"fcvtms v24.4s, v24.4s\n"
"dup v22.4s, wzr\n"
"dup v24.4s, wzr\n"
"movi v22.4s, #0\n"
"movi v24.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n"
@ -1764,12 +1764,12 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"ld1 {v17.8b}, [x15], %[input_depth]\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"dup v24.4s, wzr\n"
"movi v24.4s, #0\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"dup v25.4s, wzr\n"
"movi v25.4s, #0\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"dup v26.4s, wzr\n"
"dup v27.4s, wzr\n"
"movi v26.4s, #0\n"
"movi v27.4s, #0\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"saddw v14.8h, v28.8h, v14.8b\n"
@ -1879,17 +1879,17 @@ struct DepthwiseConvHybridWindowPerChannel<DepthwiseConvOutputRounding::kUpward,
"fcvtms v26.4s, v26.4s\n"
"fcvtms v27.4s, v27.4s\n"
"dup v25.4s, wzr\n"
"movi v25.4s, #0\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"dup v27.4s, wzr\n"
"movi v27.4s, #0\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"saddw v11.8h, v28.8h, v11.8b\n"
"saddw v12.8h, v28.8h, v12.8b\n"
"saddw v13.8h, v28.8h, v13.8b\n"
"saddw v14.8h, v28.8h, v14.8b\n"
"dup v24.4s, wzr\n"
"movi v24.4s, #0\n"
"saddw v15.8h, v28.8h, v15.8b\n"
"dup v26.4s, wzr\n"
"movi v26.4s, #0\n"
"saddw v16.8h, v28.8h, v16.8b\n"
"saddw v17.8h, v28.8h, v17.8b\n"
@ -2094,9 +2094,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"ldr w10, [%[params_ptr], #" STR(OFFSET_FLOAT_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.4s, w9\n"
"dup v31.4s, w10\n"
"dup v16.4s, wzr\n"
"movi v16.4s, #0\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"dup v17.4s, wzr\n"
"movi v17.4s, #0\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
@ -2133,9 +2133,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"fcvtms v17.4s, v17.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n"
"movi v16.4s, #0\n"
"sshll v0.8h, v0.8b, #0\n"
"dup v17.4s, wzr\n"
"movi v17.4s, #0\n"
"ld1 {v6.4s}, [%[per_channel_scales]], #16\n"
"ld1 {v10.4s}, [%[bias_ptr]], #16\n"
"ld1 {v7.4s}, [%[per_channel_scales]], #16\n"
@ -2241,9 +2241,9 @@ struct DepthwiseConvHybridPartialPerChannel<
// Add input and filter offsets.
"saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n"
"movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"dup v17.4s, wzr\n"
"movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
@ -2290,9 +2290,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"fcvtms v17.4s, v17.4s\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n"
"movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"dup v17.4s, wzr\n"
"movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"sshll v0.8h, v0.8b, #0\n"
@ -2417,9 +2417,9 @@ struct DepthwiseConvHybridPartialPerChannel<
// Add input and filter offsets.
"saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n"
"movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"dup v17.4s, wzr\n"
"movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
@ -2494,9 +2494,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"dup v16.4s, wzr\n"
"movi v16.4s, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"dup v17.4s, wzr\n"
"movi v17.4s, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v6.4s}, [%[bias_ptr]], #16\n"
@ -2622,9 +2622,9 @@ struct DepthwiseConvHybridPartialPerChannel<
// Add input and filter offsets.
"saddw v8.8h, v26.8h, v8.8b\n"
"dup v16.4s, wzr\n"
"movi v16.4s, #0\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"dup v17.4s, wzr\n"
"movi v17.4s, #0\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
@ -2701,9 +2701,9 @@ struct DepthwiseConvHybridPartialPerChannel<
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"dup v16.4s, wzr\n"
"movi v16.4s, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"dup v17.4s, wzr\n"
"movi v17.4s, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"

View File

@ -372,10 +372,10 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
asm volatile(
// Zero out the accumulator registers.
"dup v0.4s, wzr\n"
"dup v1.4s, wzr\n"
"dup v2.4s, wzr\n"
"dup v3.4s, wzr\n"
"movi v0.4s, #0\n"
"movi v1.4s, #0\n"
"movi v2.4s, #0\n"
"movi v3.4s, #0\n"
"1:\n" // batch_cols_loop
@ -463,12 +463,12 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
"st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
: [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1),
[ vec_ptr ] "+r"(vec_ptr), [ result_ptr ] "+r"(result_ptr),
[ mat_ptr2 ] "+r"(mat_ptr2), [ mat_ptr3 ] "+r"(mat_ptr3)
: [ mat_ptr0_end ] "r"(mat_ptr0_end),
[ scaling_factors_ptr ] "r"(scaling_factors_ptr),
[ wide_rows ] "r"(wide_rows)
: [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1),
[vec_ptr] "+r"(vec_ptr), [result_ptr] "+r"(result_ptr),
[mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3)
: [mat_ptr0_end] "r"(mat_ptr0_end),
[scaling_factors_ptr] "r"(scaling_factors_ptr),
[wide_rows] "r"(wide_rows)
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "cc", "memory");
}
@ -501,16 +501,16 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr;
const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr;
asm volatile(
"dup v0.4s, wzr\n"
"dup v1.4s, wzr\n"
"dup v2.4s, wzr\n"
"dup v3.4s, wzr\n"
"movi v0.4s, #0\n"
"movi v1.4s, #0\n"
"movi v2.4s, #0\n"
"movi v3.4s, #0\n"
// Load zero points.
"ld1 {v7.4s}, [%[batch_offsets_ptr]]\n"
"ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
// Zero out zero point accumulators.
"dup v14.4s, wzr\n"
"dup v15.4s, wzr\n"
"movi v14.4s, #0\n"
"movi v15.4s, #0\n"
// Load per channel scales if not null.
"cmp %w[is_channel_scale_nullptr], #0\n"
@ -587,16 +587,16 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
"st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
"st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
: [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1),
[ vec_ptr ] "+r"(vec_ptr), [ result_ptr ] "+r"(result_ptr),
[ row_sums_ptr ] "+r"(row_sums_ptr)
: [ mat_ptr0_end ] "r"(mat_ptr0_end),
[ scaling_factors_ptr ] "r"(scaling_factors_ptr),
[ wide_rows ] "r"(wide_rows),
[ channel_scales_ptr ] "r"(channel_scales_ptr),
[ batch_offsets_ptr ] "r"(batch_offsets_ptr),
[ is_channel_scale_nullptr ] "r"(is_channel_scale_nullptr),
[ is_row_sums_nullptr ] "r"(is_row_sums_nullptr)
: [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1),
[vec_ptr] "+r"(vec_ptr), [result_ptr] "+r"(result_ptr),
[row_sums_ptr] "+r"(row_sums_ptr)
: [mat_ptr0_end] "r"(mat_ptr0_end),
[scaling_factors_ptr] "r"(scaling_factors_ptr),
[wide_rows] "r"(wide_rows),
[channel_scales_ptr] "r"(channel_scales_ptr),
[batch_offsets_ptr] "r"(batch_offsets_ptr),
[is_channel_scale_nullptr] "r"(is_channel_scale_nullptr),
[is_row_sums_nullptr] "r"(is_row_sums_nullptr)
: "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1",
"cc", "memory");
@ -746,9 +746,9 @@ static void DotprodSparseMatrixBatchVectorMultiplyAccumulate(
if (ledger_ptr != ledger_end) {
asm volatile(
"dup v0.4s, wzr\n"
"dup v1.4s, wzr\n"
"dup v8.4s, wzr\n"
"movi v0.4s, #0\n"
"movi v1.4s, #0\n"
"movi v8.4s, #0\n"
"mov x7, 0\n"
"1:\n" // chunks_loop
@ -775,9 +775,9 @@ static void DotprodSparseMatrixBatchVectorMultiplyAccumulate(
// We have to be careful to cast this value to 32 bits in order
// to interpret the sign bit properly.
"mov %[row_sum], v1.d[0]\n"
: [ row_sum ] "=r"(row_sum), [ ledger_ptr ] "+r"(ledger_ptr),
[ mat_ptr ] "+r"(mat_ptr), [ vec_ptr ] "+r"(vec_ptr)
: [ ledger_end ] "r"(ledger_end)
: [row_sum] "=r"(row_sum), [ledger_ptr] "+r"(ledger_ptr),
[mat_ptr] "+r"(mat_ptr), [vec_ptr] "+r"(vec_ptr)
: [ledger_end] "r"(ledger_end)
: "x0", "x1", "x7", "x8", "v0", "v1", "v8", "v9", "cc", "memory");
}
result[batch * m_rows + row] +=