Handle per-channel for stride == 2 && non-padding case.
PiperOrigin-RevId: 249852734
This commit is contained in:
parent
5735726ed6
commit
36147a36e5
@ -126,28 +126,16 @@ void PickReasonableMultiplier(
|
||||
bias_shape_inference, bias_data, output_shape_inference,
|
||||
&output_multiplier);
|
||||
|
||||
bool should_use_per_channel = true;
|
||||
|
||||
// TODO(b/132879305): Support stride == 2 per-channel case.
|
||||
if (params.stride_width == 2 || params.stride_height == 2) {
|
||||
should_use_per_channel = false;
|
||||
}
|
||||
|
||||
int base_multiplier;
|
||||
int base_shift;
|
||||
QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
|
||||
for (int i = 0; i < output_depth; ++i) {
|
||||
if (should_use_per_channel) {
|
||||
// multipliers typically range in [2^30 ; 2^31 - 1].
|
||||
// Values in [0, 2^30 - 1] are normally unused, but harmless.
|
||||
// Thus a good way to randomize multipliers is to subtract from them
|
||||
// a random value smaller than 2^30 but still significant compared to it.
|
||||
output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
|
||||
output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
|
||||
} else {
|
||||
output_multiplier_ptr[i] = base_multiplier;
|
||||
output_shift_ptr[i] = base_shift;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -411,15 +411,6 @@ inline bool Fast3x3FilterKernelSupported(
|
||||
}
|
||||
|
||||
if (quantization_type == QuantizationType::kPerChannelInt8) {
|
||||
// TODO(b/132879305): Support stride == 2 per-channel case.
|
||||
if (stride_height == 2 || stride_width == 2) {
|
||||
for (int i = 0; i < output_depth; ++i) {
|
||||
if (output_shift_ptr[i] != output_shift_ptr[0]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < output_depth; ++i) {
|
||||
if (output_shift_ptr[i] > 0) {
|
||||
return false;
|
||||
|
@ -1031,28 +1031,44 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
// interleaved with arithmetic operations to take advantage of
|
||||
// dual-issue pipelines. We also add input offsets as far from the loads
|
||||
// as possible to give loads enough cycles to fetch data from memory.
|
||||
//
|
||||
// This logic is copied and modified from the non-per-channel quantized
|
||||
// part.
|
||||
// The register planning here is really tricky:
|
||||
// v0-v29 are all used at least once for either filter/input/output,
|
||||
// some of them are used for output shift and output mulitplier, or
|
||||
// input/output offset.
|
||||
// Only v30 & v31 are only used for output activation min/max.
|
||||
// For per-channel case, we need 4 registers to hold output shift &
|
||||
// output multiplier. However, given the reality, we simply cannot do
|
||||
// that without reloading.
|
||||
//
|
||||
// So here's the plan:
|
||||
// We hold output_multiplier in v30 & v31, and we will load output_shift
|
||||
// into two consecutive registers each time before use.
|
||||
// We will duplicate output min & max before needed.
|
||||
// Sometimes we may borrow registers from input offset or bias, we will
|
||||
// dup them back after use.
|
||||
//
|
||||
|
||||
// Set "constant" registers. These registers may be replaced with temp
|
||||
// values from time to time when there are not enough NEON registers.
|
||||
// We use x9--x15 general purpose registers as they are caller-saved
|
||||
// temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT
|
||||
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
|
||||
"ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
|
||||
"cmp %w[output_window_height], #2\n"
|
||||
"dup v28.8h, w0\n"
|
||||
"ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
|
||||
"dup v26.4s, w9\n"
|
||||
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
|
||||
"dup v27.4s, w1\n"
|
||||
"ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
|
||||
"dup v29.8h, w2\n"
|
||||
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
|
||||
"dup v30.16b, w3\n"
|
||||
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
|
||||
"dup v31.16b, w4\n"
|
||||
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
|
||||
"ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
|
||||
|
||||
// Deal with output multiplier.
|
||||
"ld1 {v30.4s, v31.4s}, [%[output_multiplier_ptr]]\n"
|
||||
|
||||
// Load filters and add offsets.
|
||||
"add x10, %[bias_ptr], #16\n"
|
||||
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
|
||||
@ -1207,14 +1223,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"add x13, x12, %[input_row_size]\n"
|
||||
"add x15, x13, %[input_row_size]\n"
|
||||
|
||||
"dup v28.4s, w9\n"
|
||||
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
|
||||
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
|
||||
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
|
||||
"sqrshl v21.4s, v21.4s, v28.4s\n"
|
||||
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
|
||||
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
|
||||
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
|
||||
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
|
||||
"sqrshl v21.4s, v21.4s, v27.4s\n"
|
||||
"sqrshl v22.4s, v22.4s, v28.4s\n"
|
||||
"sqrshl v23.4s, v23.4s, v28.4s\n"
|
||||
"sqrshl v23.4s, v23.4s, v27.4s\n"
|
||||
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
||||
"dup v28.8h, w0\n"
|
||||
"sqxtn v21.4h, v21.4s\n"
|
||||
@ -1225,10 +1241,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"sqadd v23.8h, v23.8h, v29.8h\n"
|
||||
"sqxtn v21.8b, v21.8h\n"
|
||||
"sqxtn2 v21.16b, v23.8h\n"
|
||||
"dup v27.16b, w3\n"
|
||||
"dup v29.16b, w4\n"
|
||||
"ld1 {v22.4s}, [x10]\n"
|
||||
"smax v21.16b, v21.16b, v30.16b\n"
|
||||
"smin v21.16b, v21.16b, v31.16b\n"
|
||||
"smax v21.16b, v21.16b, v27.16b\n"
|
||||
"smin v21.16b, v21.16b, v29.16b\n"
|
||||
"ld1 {v24.4s}, [x10]\n"
|
||||
"dup v29.8h, w2\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"st1 {v21.8b}, [x6], x5\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
@ -1276,14 +1295,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"smlal v25.4s, v5.4h, v18.4h\n"
|
||||
"smlal2 v26.4s, v5.8h, v18.8h\n"
|
||||
|
||||
"dup v28.4s, w9\n"
|
||||
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
|
||||
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
|
||||
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
|
||||
"sqrshl v19.4s, v19.4s, v28.4s\n"
|
||||
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
|
||||
"sqrdmulh v19.4s, v19.4s, v30.4s\n"
|
||||
"sqrdmulh v20.4s, v20.4s, v31.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v30.4s\n"
|
||||
"sqrdmulh v26.4s, v26.4s, v31.4s\n"
|
||||
"sqrshl v19.4s, v19.4s, v27.4s\n"
|
||||
"sqrshl v20.4s, v20.4s, v28.4s\n"
|
||||
"sqrshl v25.4s, v25.4s, v28.4s\n"
|
||||
"sqrshl v25.4s, v25.4s, v27.4s\n"
|
||||
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
||||
"dup v28.8h, w0\n"
|
||||
"sqxtn v19.4h, v19.4s\n"
|
||||
@ -1294,10 +1313,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"sqadd v25.8h, v25.8h, v29.8h\n"
|
||||
"sqxtn v19.8b, v19.8h\n"
|
||||
"sqxtn2 v19.16b, v25.8h\n"
|
||||
"dup v27.16b, w3\n"
|
||||
"dup v29.16b, w4\n"
|
||||
"ld1 {v20.4s}, [x10]\n"
|
||||
"smax v19.16b, v19.16b, v30.16b\n"
|
||||
"smin v19.16b, v19.16b, v31.16b\n"
|
||||
"smax v19.16b, v19.16b, v27.16b\n"
|
||||
"smin v19.16b, v19.16b, v29.16b\n"
|
||||
"ld1 {v26.4s}, [x10]\n"
|
||||
"dup v29.8h, w2\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"st1 {v19.8b}, [x7], x5\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
@ -1399,14 +1421,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"smlal2 v26.4s, v2.8h, v13.8h\n"
|
||||
"ld1 {v13.8b}, [x13]\n"
|
||||
|
||||
"dup v28.4s, w9\n"
|
||||
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
|
||||
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
|
||||
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
|
||||
"sqrshl v21.4s, v21.4s, v28.4s\n"
|
||||
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
|
||||
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
|
||||
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
|
||||
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
|
||||
"sqrshl v21.4s, v21.4s, v27.4s\n"
|
||||
"sqrshl v22.4s, v22.4s, v28.4s\n"
|
||||
"sqrshl v23.4s, v23.4s, v28.4s\n"
|
||||
"sqrshl v23.4s, v23.4s, v27.4s\n"
|
||||
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
||||
"dup v28.8h, w0\n"
|
||||
"sqxtn v21.4h, v21.4s\n"
|
||||
@ -1417,10 +1439,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"sqadd v23.8h, v23.8h, v29.8h\n"
|
||||
"sqxtn v21.8b, v21.8h\n"
|
||||
"sqxtn2 v21.16b, v23.8h\n"
|
||||
"dup v27.16b, w3\n"
|
||||
"dup v29.16b, w4\n"
|
||||
"ld1 {v22.4s}, [x10]\n"
|
||||
"smax v21.16b, v21.16b, v30.16b\n"
|
||||
"smin v21.16b, v21.16b, v31.16b\n"
|
||||
"smax v21.16b, v21.16b, v27.16b\n"
|
||||
"smin v21.16b, v21.16b, v29.16b\n"
|
||||
"ld1 {v24.4s}, [x10]\n"
|
||||
"dup v29.8h, w2\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"st1 {v21.8b}, [x6], x5\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
@ -1460,14 +1485,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"smlal v25.4s, v5.4h, v18.4h\n"
|
||||
"smlal2 v26.4s, v5.8h, v18.8h\n"
|
||||
|
||||
"dup v28.4s, w9\n"
|
||||
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
|
||||
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
|
||||
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
|
||||
"sqrshl v19.4s, v19.4s, v28.4s\n"
|
||||
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
|
||||
"sqrdmulh v19.4s, v19.4s, v30.4s\n"
|
||||
"sqrdmulh v20.4s, v20.4s, v31.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v30.4s\n"
|
||||
"sqrdmulh v26.4s, v26.4s, v31.4s\n"
|
||||
"sqrshl v19.4s, v19.4s, v27.4s\n"
|
||||
"sqrshl v20.4s, v20.4s, v28.4s\n"
|
||||
"sqrshl v25.4s, v25.4s, v28.4s\n"
|
||||
"sqrshl v25.4s, v25.4s, v27.4s\n"
|
||||
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
||||
"dup v28.8h, w0\n"
|
||||
"sqxtn v19.4h, v19.4s\n"
|
||||
@ -1476,11 +1501,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"sqxtn2 v25.8h, v26.4s\n"
|
||||
"sqadd v19.8h, v19.8h, v29.8h\n"
|
||||
"sqadd v25.8h, v25.8h, v29.8h\n"
|
||||
"dup v27.16b, w3\n"
|
||||
"dup v29.16b, w4\n"
|
||||
"sqxtn v19.8b, v19.8h\n"
|
||||
"sqxtn2 v19.16b, v25.8h\n"
|
||||
"smax v19.16b, v19.16b, v30.16b\n"
|
||||
"smin v19.16b, v19.16b, v31.16b\n"
|
||||
"smax v19.16b, v19.16b, v27.16b\n"
|
||||
"smin v19.16b, v19.16b, v29.16b\n"
|
||||
"st1 {v19.8b}, [x7], x5\n"
|
||||
"dup v29.8h, w2\n"
|
||||
"mov v25.d[0], v19.d[1]\n"
|
||||
"st1 {v25.8b}, [x7]\n"
|
||||
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
|
||||
@ -1533,17 +1561,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"smlal v23.4s, v2.4h, v17.4h\n"
|
||||
"smlal2 v24.4s, v2.8h, v17.8h\n"
|
||||
|
||||
"dup v26.4s, w9\n"
|
||||
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
|
||||
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
|
||||
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
|
||||
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
|
||||
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
|
||||
"sqrshl v21.4s, v21.4s, v26.4s\n"
|
||||
"sqrshl v22.4s, v22.4s, v26.4s\n"
|
||||
"sqrshl v22.4s, v22.4s, v27.4s\n"
|
||||
"sqxtn v21.4h, v21.4s\n"
|
||||
"sqxtn2 v21.8h, v22.4s\n"
|
||||
"dup v26.16b, w3\n"
|
||||
"dup v27.16b, w4\n"
|
||||
"sqadd v21.8h, v21.8h, v29.8h\n"
|
||||
"sqxtn v21.8b, v21.8h\n"
|
||||
"smax v21.8b, v21.8b, v30.8b\n"
|
||||
"smin v21.8b, v21.8b, v31.8b\n"
|
||||
"smax v21.8b, v21.8b, v26.8b\n"
|
||||
"smin v21.8b, v21.8b, v27.8b\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"st1 {v21.8b}, [x6]\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
@ -1566,16 +1596,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"smlal v23.4s, v8.4h, v16.4h\n"
|
||||
"smlal2 v24.4s, v8.8h, v16.8h\n"
|
||||
|
||||
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
|
||||
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
|
||||
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
|
||||
"sqrshl v23.4s, v23.4s, v26.4s\n"
|
||||
"sqrshl v24.4s, v24.4s, v26.4s\n"
|
||||
"sqrshl v24.4s, v24.4s, v27.4s\n"
|
||||
"sqxtn v23.4h, v23.4s\n"
|
||||
"sqxtn2 v23.8h, v24.4s\n"
|
||||
"dup v26.16b, w3\n"
|
||||
"dup v27.16b, w4\n"
|
||||
"sqadd v23.8h, v23.8h, v29.8h\n"
|
||||
"sqxtn v23.8b, v23.8h\n"
|
||||
"smax v23.8b, v23.8b, v30.8b\n"
|
||||
"smin v23.8b, v23.8b, v31.8b\n"
|
||||
"smax v23.8b, v23.8b, v26.8b\n"
|
||||
"smin v23.8b, v23.8b, v27.8b\n"
|
||||
"st1 {v23.8b}, [x7]\n"
|
||||
|
||||
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
|
||||
@ -1696,17 +1729,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"smlal v26.4s, v8.4h, v23.4h\n"
|
||||
"smlal2 v27.4s, v8.8h, v23.8h\n"
|
||||
|
||||
"dup v28.4s, w1\n"
|
||||
"dup v29.4s, w9\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
|
||||
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
|
||||
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
|
||||
"dup v28.8h, w2\n"
|
||||
"sqrshl v24.4s, v24.4s, v29.4s\n"
|
||||
"ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
|
||||
"sqrdmulh v26.4s, v26.4s, v30.4s\n"
|
||||
"sqrdmulh v27.4s, v27.4s, v31.4s\n"
|
||||
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
||||
"sqrshl v25.4s, v25.4s, v29.4s\n"
|
||||
"sqrshl v26.4s, v26.4s, v29.4s\n"
|
||||
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
||||
"sqrshl v27.4s, v27.4s, v29.4s\n"
|
||||
"dup v28.8h, w2\n"
|
||||
"sqxtn v24.4h, v24.4s\n"
|
||||
"sqxtn2 v24.8h, v25.4s\n"
|
||||
"sqxtn v26.4h, v26.4s\n"
|
||||
@ -1716,12 +1748,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"sqxtn v24.8b, v24.8h\n"
|
||||
"sqxtn2 v24.16b, v26.8h\n"
|
||||
"dup v28.8h, w0\n"
|
||||
"dup v27.16b, w3\n"
|
||||
"dup v29.16b, w4\n"
|
||||
"ld1 {v25.4s}, [x10]\n"
|
||||
"smax v24.16b, v24.16b, v30.16b\n"
|
||||
"smin v24.16b, v24.16b, v31.16b\n"
|
||||
"ld1 {v27.4s}, [x10]\n"
|
||||
"smax v24.16b, v24.16b, v27.16b\n"
|
||||
"smin v24.16b, v24.16b, v29.16b\n"
|
||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||
"st1 {v24.8b}, [x6], x5\n"
|
||||
"ld1 {v27.4s}, [x10]\n"
|
||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||
"mov v26.d[0], v24.d[1]\n"
|
||||
"st1 {v26.8b}, [x6], x5\n"
|
||||
@ -1794,17 +1828,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"smlal v26.4s, v8.4h, v23.4h\n"
|
||||
"smlal2 v27.4s, v8.8h, v23.8h\n"
|
||||
|
||||
"dup v28.4s, w1\n"
|
||||
"dup v29.4s, w9\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
|
||||
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
|
||||
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
|
||||
"dup v28.8h, w2\n"
|
||||
"sqrshl v24.4s, v24.4s, v29.4s\n"
|
||||
"ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
|
||||
"sqrdmulh v26.4s, v26.4s, v30.4s\n"
|
||||
"sqrdmulh v27.4s, v27.4s, v31.4s\n"
|
||||
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
||||
"sqrshl v25.4s, v25.4s, v29.4s\n"
|
||||
"sqrshl v26.4s, v26.4s, v29.4s\n"
|
||||
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
||||
"sqrshl v27.4s, v27.4s, v29.4s\n"
|
||||
"dup v28.8h, w2\n"
|
||||
"sqxtn v24.4h, v24.4s\n"
|
||||
"sqxtn2 v24.8h, v25.4s\n"
|
||||
"sqxtn v26.4h, v26.4s\n"
|
||||
@ -1812,19 +1845,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"sqadd v24.8h, v24.8h, v28.8h\n"
|
||||
"sqadd v26.8h, v26.8h, v28.8h\n"
|
||||
"sqxtn v24.8b, v24.8h\n"
|
||||
"dup v28.16b, w3\n"
|
||||
"dup v29.16b, w4\n"
|
||||
"sqxtn2 v24.16b, v26.8h\n"
|
||||
"dup v28.8h, w0\n"
|
||||
"smax v24.16b, v24.16b, v30.16b\n"
|
||||
"smin v24.16b, v24.16b, v31.16b\n"
|
||||
"smax v24.16b, v24.16b, v28.16b\n"
|
||||
"smin v24.16b, v24.16b, v29.16b\n"
|
||||
"st1 {v24.8b}, [x6], x5\n"
|
||||
"mov v26.d[0], v24.d[1]\n"
|
||||
"st1 {v26.8b}, [x6]\n"
|
||||
"dup v28.8h, w0\n"
|
||||
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
|
||||
|
||||
// Handle bottom right output if exists.
|
||||
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
|
||||
"dup v26.4s, w9\n"
|
||||
"dup v27.4s, w1\n"
|
||||
"dup v29.8h, w2\n"
|
||||
|
||||
"smlal v24.4s, v0.4h, v9.4h\n"
|
||||
@ -1846,16 +1879,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"smlal v24.4s, v8.4h, v17.4h\n"
|
||||
"smlal2 v25.4s, v8.8h, v17.8h\n"
|
||||
|
||||
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
|
||||
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
|
||||
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
|
||||
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
|
||||
"sqrshl v24.4s, v24.4s, v26.4s\n"
|
||||
"sqrshl v25.4s, v25.4s, v26.4s\n"
|
||||
"sqrshl v25.4s, v25.4s, v27.4s\n"
|
||||
"sqxtn v24.4h, v24.4s\n"
|
||||
"sqxtn2 v24.8h, v25.4s\n"
|
||||
"dup v26.16b, w3\n"
|
||||
"dup v27.16b, w4\n"
|
||||
"sqadd v24.8h, v24.8h, v29.8h\n"
|
||||
"sqxtn v24.8b, v24.8h\n"
|
||||
"smax v24.8b, v24.8b, v30.8b\n"
|
||||
"smin v24.8b, v24.8b, v31.8b\n"
|
||||
"smax v24.8b, v24.8b, v26.8b\n"
|
||||
"smin v24.8b, v24.8b, v27.8b\n"
|
||||
"st1 {v24.8b}, [x6]\n"
|
||||
|
||||
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
|
||||
@ -1866,6 +1902,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
[output_window_height] "+r"(output_window_height)
|
||||
:
|
||||
// Inputs.
|
||||
[output_multiplier_ptr] "r"(output_multiplier_ptr),
|
||||
[output_shift_ptr] "r"(output_shift_ptr),
|
||||
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
|
||||
[input_depth] "r"(input_depth),
|
||||
[output_window_width] "r"(output_window_width),
|
||||
@ -1882,8 +1920,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
|
||||
"v30", "v31",
|
||||
// We use these general-purpose registers.
|
||||
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x9", "x10", "x11", "x12", "x13", "x14", "x15",
|
||||
"x0", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||
"x10", "x11", "x12", "x13", "x14", "x15",
|
||||
"x19", "x20");
|
||||
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
|
||||
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
|
||||
|
Loading…
Reference in New Issue
Block a user