Handle per-channel for stride == 2 && non-padding case.

PiperOrigin-RevId: 249852734
This commit is contained in:
Renjie Liu 2019-05-24 10:10:13 -07:00 committed by TensorFlower Gardener
parent 5735726ed6
commit 36147a36e5
3 changed files with 130 additions and 113 deletions

View File

@ -126,28 +126,16 @@ void PickReasonableMultiplier(
bias_shape_inference, bias_data, output_shape_inference,
&output_multiplier);
bool should_use_per_channel = true;
// TODO(b/132879305): Support stride == 2 per-channel case.
if (params.stride_width == 2 || params.stride_height == 2) {
should_use_per_channel = false;
}
int base_multiplier;
int base_shift;
QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
for (int i = 0; i < output_depth; ++i) {
if (should_use_per_channel) {
// multipliers typically range in [2^30 ; 2^31 - 1].
// Values in [0, 2^30 - 1] are normally unused, but harmless.
// Thus a good way to randomize multipliers is to subtract from them
// a random value smaller than 2^30 but still significant compared to it.
output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
} else {
output_multiplier_ptr[i] = base_multiplier;
output_shift_ptr[i] = base_shift;
}
// multipliers typically range in [2^30 ; 2^31 - 1].
// Values in [0, 2^30 - 1] are normally unused, but harmless.
// Thus a good way to randomize multipliers is to subtract from them
// a random value smaller than 2^30 but still significant compared to it.
output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
}
}

View File

@ -411,15 +411,6 @@ inline bool Fast3x3FilterKernelSupported(
}
if (quantization_type == QuantizationType::kPerChannelInt8) {
// TODO(b/132879305): Support stride == 2 per-channel case.
if (stride_height == 2 || stride_width == 2) {
for (int i = 0; i < output_depth; ++i) {
if (output_shift_ptr[i] != output_shift_ptr[0]) {
return false;
}
}
}
for (int i = 0; i < output_depth; ++i) {
if (output_shift_ptr[i] > 0) {
return false;

View File

@ -1031,28 +1031,44 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
// interleaved with arithmetic operations to take advantage of
// dual-issue pipelines. We also add input offsets as far from the loads
// as possible to give loads enough cycles to fetch data from memory.
//
// This logic is copied and modified from the non-per-channel quantized
// part.
// The register planning here is really tricky:
// v0-v29 are all used at least once for either filter/input/output,
// some of them are used for output shift and output mulitplier, or
// input/output offset.
// Only v30 & v31 are only used for output activation min/max.
// For per-channel case, we need 4 registers to hold output shift &
// output multiplier. However, given the reality, we simply cannot do
// that without reloading.
//
// So here's the plan:
// We hold output_multiplier in v30 & v31, and we will load output_shift
// into two consecutive registers each time before use.
// We will duplicate output min & max before needed.
// Sometimes we may borrow registers from input offset or bias, we will
// dup them back after use.
//
// Set "constant" registers. These registers may be replaced with temp
// values from time to time when there are not enough NEON registers.
// We use x9--x15 general purpose registers as they are caller-saved
// temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
"ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"cmp %w[output_window_height], #2\n"
"dup v28.8h, w0\n"
"ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
"dup v26.4s, w9\n"
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v27.4s, w1\n"
"ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"dup v29.8h, w2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w3\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"dup v31.16b, w4\n"
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
// Deal with output multiplier.
"ld1 {v30.4s, v31.4s}, [%[output_multiplier_ptr]]\n"
// Load filters and add offsets.
"add x10, %[bias_ptr], #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
@ -1207,14 +1223,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"add x13, x12, %[input_row_size]\n"
"add x15, x13, %[input_row_size]\n"
"dup v28.4s, w9\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
"sqrshl v21.4s, v21.4s, v27.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v27.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v21.4h, v21.4s\n"
@ -1225,10 +1241,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v30.16b\n"
"smin v21.16b, v21.16b, v31.16b\n"
"smax v21.16b, v21.16b, v27.16b\n"
"smin v21.16b, v21.16b, v29.16b\n"
"ld1 {v24.4s}, [x10]\n"
"dup v29.8h, w2\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x5\n"
"saddw v10.8h, v28.8h, v10.8b\n"
@ -1276,14 +1295,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
"dup v28.4s, w9\n"
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
"sqrshl v19.4s, v19.4s, v28.4s\n"
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v19.4s, v19.4s, v30.4s\n"
"sqrdmulh v20.4s, v20.4s, v31.4s\n"
"sqrdmulh v25.4s, v25.4s, v30.4s\n"
"sqrdmulh v26.4s, v26.4s, v31.4s\n"
"sqrshl v19.4s, v19.4s, v27.4s\n"
"sqrshl v20.4s, v20.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v27.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v19.4h, v19.4s\n"
@ -1294,10 +1313,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"sqadd v25.8h, v25.8h, v29.8h\n"
"sqxtn v19.8b, v19.8h\n"
"sqxtn2 v19.16b, v25.8h\n"
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"ld1 {v20.4s}, [x10]\n"
"smax v19.16b, v19.16b, v30.16b\n"
"smin v19.16b, v19.16b, v31.16b\n"
"smax v19.16b, v19.16b, v27.16b\n"
"smin v19.16b, v19.16b, v29.16b\n"
"ld1 {v26.4s}, [x10]\n"
"dup v29.8h, w2\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"st1 {v19.8b}, [x7], x5\n"
"saddw v10.8h, v28.8h, v10.8b\n"
@ -1399,14 +1421,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"smlal2 v26.4s, v2.8h, v13.8h\n"
"ld1 {v13.8b}, [x13]\n"
"dup v28.4s, w9\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrshl v21.4s, v21.4s, v28.4s\n"
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
"sqrshl v21.4s, v21.4s, v27.4s\n"
"sqrshl v22.4s, v22.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v28.4s\n"
"sqrshl v23.4s, v23.4s, v27.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v21.4h, v21.4s\n"
@ -1417,10 +1439,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v30.16b\n"
"smin v21.16b, v21.16b, v31.16b\n"
"smax v21.16b, v21.16b, v27.16b\n"
"smin v21.16b, v21.16b, v29.16b\n"
"ld1 {v24.4s}, [x10]\n"
"dup v29.8h, w2\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x5\n"
"saddw v10.8h, v28.8h, v10.8b\n"
@ -1460,14 +1485,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"smlal v25.4s, v5.4h, v18.4h\n"
"smlal2 v26.4s, v5.8h, v18.8h\n"
"dup v28.4s, w9\n"
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
"sqrshl v19.4s, v19.4s, v28.4s\n"
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v19.4s, v19.4s, v30.4s\n"
"sqrdmulh v20.4s, v20.4s, v31.4s\n"
"sqrdmulh v25.4s, v25.4s, v30.4s\n"
"sqrdmulh v26.4s, v26.4s, v31.4s\n"
"sqrshl v19.4s, v19.4s, v27.4s\n"
"sqrshl v20.4s, v20.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v27.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"dup v28.8h, w0\n"
"sqxtn v19.4h, v19.4s\n"
@ -1476,11 +1501,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"sqxtn2 v25.8h, v26.4s\n"
"sqadd v19.8h, v19.8h, v29.8h\n"
"sqadd v25.8h, v25.8h, v29.8h\n"
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"sqxtn v19.8b, v19.8h\n"
"sqxtn2 v19.16b, v25.8h\n"
"smax v19.16b, v19.16b, v30.16b\n"
"smin v19.16b, v19.16b, v31.16b\n"
"smax v19.16b, v19.16b, v27.16b\n"
"smin v19.16b, v19.16b, v29.16b\n"
"st1 {v19.8b}, [x7], x5\n"
"dup v29.8h, w2\n"
"mov v25.d[0], v19.d[1]\n"
"st1 {v25.8b}, [x7]\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
@ -1533,17 +1561,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"smlal v23.4s, v2.4h, v17.4h\n"
"smlal2 v24.4s, v2.8h, v17.8h\n"
"dup v26.4s, w9\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
"sqrshl v21.4s, v21.4s, v26.4s\n"
"sqrshl v22.4s, v22.4s, v26.4s\n"
"sqrshl v22.4s, v22.4s, v27.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"dup v26.16b, w3\n"
"dup v27.16b, w4\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"smax v21.8b, v21.8b, v30.8b\n"
"smin v21.8b, v21.8b, v31.8b\n"
"smax v21.8b, v21.8b, v26.8b\n"
"smin v21.8b, v21.8b, v27.8b\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"st1 {v21.8b}, [x6]\n"
"saddw v10.8h, v28.8h, v10.8b\n"
@ -1566,16 +1596,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"smlal v23.4s, v8.4h, v16.4h\n"
"smlal2 v24.4s, v8.8h, v16.8h\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v26.4s\n"
"sqrshl v24.4s, v24.4s, v26.4s\n"
"sqrshl v24.4s, v24.4s, v27.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"dup v26.16b, w3\n"
"dup v27.16b, w4\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v23.8b, v23.8h\n"
"smax v23.8b, v23.8b, v30.8b\n"
"smin v23.8b, v23.8b, v31.8b\n"
"smax v23.8b, v23.8b, v26.8b\n"
"smin v23.8b, v23.8b, v27.8b\n"
"st1 {v23.8b}, [x7]\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
@ -1696,17 +1729,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
"dup v28.4s, w1\n"
"dup v29.4s, w9\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
"dup v28.8h, w2\n"
"sqrshl v24.4s, v24.4s, v29.4s\n"
"ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
"sqrdmulh v26.4s, v26.4s, v30.4s\n"
"sqrdmulh v27.4s, v27.4s, v31.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v29.4s\n"
"sqrshl v26.4s, v26.4s, v29.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"sqrshl v27.4s, v27.4s, v29.4s\n"
"dup v28.8h, w2\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqxtn v26.4h, v26.4s\n"
@ -1716,12 +1748,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"sqxtn v24.8b, v24.8h\n"
"sqxtn2 v24.16b, v26.8h\n"
"dup v28.8h, w0\n"
"dup v27.16b, w3\n"
"dup v29.16b, w4\n"
"ld1 {v25.4s}, [x10]\n"
"smax v24.16b, v24.16b, v30.16b\n"
"smin v24.16b, v24.16b, v31.16b\n"
"ld1 {v27.4s}, [x10]\n"
"smax v24.16b, v24.16b, v27.16b\n"
"smin v24.16b, v24.16b, v29.16b\n"
"saddw v9.8h, v28.8h, v9.8b\n"
"st1 {v24.8b}, [x6], x5\n"
"ld1 {v27.4s}, [x10]\n"
"saddw v10.8h, v28.8h, v10.8b\n"
"mov v26.d[0], v24.d[1]\n"
"st1 {v26.8b}, [x6], x5\n"
@ -1794,17 +1828,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"smlal v26.4s, v8.4h, v23.4h\n"
"smlal2 v27.4s, v8.8h, v23.8h\n"
"dup v28.4s, w1\n"
"dup v29.4s, w9\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
"dup v28.8h, w2\n"
"sqrshl v24.4s, v24.4s, v29.4s\n"
"ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
"sqrdmulh v26.4s, v26.4s, v30.4s\n"
"sqrdmulh v27.4s, v27.4s, v31.4s\n"
"sqrshl v24.4s, v24.4s, v28.4s\n"
"sqrshl v25.4s, v25.4s, v29.4s\n"
"sqrshl v26.4s, v26.4s, v29.4s\n"
"sqrshl v26.4s, v26.4s, v28.4s\n"
"sqrshl v27.4s, v27.4s, v29.4s\n"
"dup v28.8h, w2\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"sqxtn v26.4h, v26.4s\n"
@ -1812,19 +1845,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"sqadd v24.8h, v24.8h, v28.8h\n"
"sqadd v26.8h, v26.8h, v28.8h\n"
"sqxtn v24.8b, v24.8h\n"
"dup v28.16b, w3\n"
"dup v29.16b, w4\n"
"sqxtn2 v24.16b, v26.8h\n"
"dup v28.8h, w0\n"
"smax v24.16b, v24.16b, v30.16b\n"
"smin v24.16b, v24.16b, v31.16b\n"
"smax v24.16b, v24.16b, v28.16b\n"
"smin v24.16b, v24.16b, v29.16b\n"
"st1 {v24.8b}, [x6], x5\n"
"mov v26.d[0], v24.d[1]\n"
"st1 {v26.8b}, [x6]\n"
"dup v28.8h, w0\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// Handle bottom right output if exists.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
"dup v26.4s, w9\n"
"dup v27.4s, w1\n"
"dup v29.8h, w2\n"
"smlal v24.4s, v0.4h, v9.4h\n"
@ -1846,16 +1879,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"smlal v24.4s, v8.4h, v17.4h\n"
"smlal2 v25.4s, v8.8h, v17.8h\n"
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
"sqrshl v24.4s, v24.4s, v26.4s\n"
"sqrshl v25.4s, v25.4s, v26.4s\n"
"sqrshl v25.4s, v25.4s, v27.4s\n"
"sqxtn v24.4h, v24.4s\n"
"sqxtn2 v24.8h, v25.4s\n"
"dup v26.16b, w3\n"
"dup v27.16b, w4\n"
"sqadd v24.8h, v24.8h, v29.8h\n"
"sqxtn v24.8b, v24.8h\n"
"smax v24.8b, v24.8b, v30.8b\n"
"smin v24.8b, v24.8b, v31.8b\n"
"smax v24.8b, v24.8b, v26.8b\n"
"smin v24.8b, v24.8b, v27.8b\n"
"st1 {v24.8b}, [x6]\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
@ -1866,6 +1902,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
[output_window_height] "+r"(output_window_height)
:
// Inputs.
[output_multiplier_ptr] "r"(output_multiplier_ptr),
[output_shift_ptr] "r"(output_shift_ptr),
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
@ -1882,8 +1920,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
// We use these general-purpose registers.
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x0", "x2", "x3", "x4", "x5", "x6", "x7",
"x10", "x11", "x12", "x13", "x14", "x15",
"x19", "x20");
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP