Handle per-channel for stride == 2 && non-padding case.
PiperOrigin-RevId: 249852734
This commit is contained in:
parent
5735726ed6
commit
36147a36e5
@ -126,28 +126,16 @@ void PickReasonableMultiplier(
|
|||||||
bias_shape_inference, bias_data, output_shape_inference,
|
bias_shape_inference, bias_data, output_shape_inference,
|
||||||
&output_multiplier);
|
&output_multiplier);
|
||||||
|
|
||||||
bool should_use_per_channel = true;
|
|
||||||
|
|
||||||
// TODO(b/132879305): Support stride == 2 per-channel case.
|
|
||||||
if (params.stride_width == 2 || params.stride_height == 2) {
|
|
||||||
should_use_per_channel = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
int base_multiplier;
|
int base_multiplier;
|
||||||
int base_shift;
|
int base_shift;
|
||||||
QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
|
QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
|
||||||
for (int i = 0; i < output_depth; ++i) {
|
for (int i = 0; i < output_depth; ++i) {
|
||||||
if (should_use_per_channel) {
|
// multipliers typically range in [2^30 ; 2^31 - 1].
|
||||||
// multipliers typically range in [2^30 ; 2^31 - 1].
|
// Values in [0, 2^30 - 1] are normally unused, but harmless.
|
||||||
// Values in [0, 2^30 - 1] are normally unused, but harmless.
|
// Thus a good way to randomize multipliers is to subtract from them
|
||||||
// Thus a good way to randomize multipliers is to subtract from them
|
// a random value smaller than 2^30 but still significant compared to it.
|
||||||
// a random value smaller than 2^30 but still significant compared to it.
|
output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
|
||||||
output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
|
output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
|
||||||
output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
|
|
||||||
} else {
|
|
||||||
output_multiplier_ptr[i] = base_multiplier;
|
|
||||||
output_shift_ptr[i] = base_shift;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -411,15 +411,6 @@ inline bool Fast3x3FilterKernelSupported(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (quantization_type == QuantizationType::kPerChannelInt8) {
|
if (quantization_type == QuantizationType::kPerChannelInt8) {
|
||||||
// TODO(b/132879305): Support stride == 2 per-channel case.
|
|
||||||
if (stride_height == 2 || stride_width == 2) {
|
|
||||||
for (int i = 0; i < output_depth; ++i) {
|
|
||||||
if (output_shift_ptr[i] != output_shift_ptr[0]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < output_depth; ++i) {
|
for (int i = 0; i < output_depth; ++i) {
|
||||||
if (output_shift_ptr[i] > 0) {
|
if (output_shift_ptr[i] > 0) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -1031,28 +1031,44 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
// interleaved with arithmetic operations to take advantage of
|
// interleaved with arithmetic operations to take advantage of
|
||||||
// dual-issue pipelines. We also add input offsets as far from the loads
|
// dual-issue pipelines. We also add input offsets as far from the loads
|
||||||
// as possible to give loads enough cycles to fetch data from memory.
|
// as possible to give loads enough cycles to fetch data from memory.
|
||||||
|
//
|
||||||
|
// This logic is copied and modified from the non-per-channel quantized
|
||||||
|
// part.
|
||||||
|
// The register planning here is really tricky:
|
||||||
|
// v0-v29 are all used at least once for either filter/input/output,
|
||||||
|
// some of them are used for output shift and output mulitplier, or
|
||||||
|
// input/output offset.
|
||||||
|
// Only v30 & v31 are only used for output activation min/max.
|
||||||
|
// For per-channel case, we need 4 registers to hold output shift &
|
||||||
|
// output multiplier. However, given the reality, we simply cannot do
|
||||||
|
// that without reloading.
|
||||||
|
//
|
||||||
|
// So here's the plan:
|
||||||
|
// We hold output_multiplier in v30 & v31, and we will load output_shift
|
||||||
|
// into two consecutive registers each time before use.
|
||||||
|
// We will duplicate output min & max before needed.
|
||||||
|
// Sometimes we may borrow registers from input offset or bias, we will
|
||||||
|
// dup them back after use.
|
||||||
|
//
|
||||||
|
|
||||||
// Set "constant" registers. These registers may be replaced with temp
|
// Set "constant" registers. These registers may be replaced with temp
|
||||||
// values from time to time when there are not enough NEON registers.
|
// values from time to time when there are not enough NEON registers.
|
||||||
// We use x9--x15 general purpose registers as they are caller-saved
|
// We use x9--x15 general purpose registers as they are caller-saved
|
||||||
// temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT
|
// temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT
|
||||||
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
|
|
||||||
"ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
|
"ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
|
||||||
"cmp %w[output_window_height], #2\n"
|
"cmp %w[output_window_height], #2\n"
|
||||||
"dup v28.8h, w0\n"
|
"dup v28.8h, w0\n"
|
||||||
"ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
|
|
||||||
"dup v26.4s, w9\n"
|
|
||||||
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
|
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
|
||||||
"dup v27.4s, w1\n"
|
|
||||||
"ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
|
"ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
|
||||||
"dup v29.8h, w2\n"
|
"dup v29.8h, w2\n"
|
||||||
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
|
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
|
||||||
"dup v30.16b, w3\n"
|
|
||||||
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
|
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
|
||||||
"dup v31.16b, w4\n"
|
|
||||||
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
|
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
|
||||||
"ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
|
"ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
|
||||||
|
|
||||||
|
// Deal with output multiplier.
|
||||||
|
"ld1 {v30.4s, v31.4s}, [%[output_multiplier_ptr]]\n"
|
||||||
|
|
||||||
// Load filters and add offsets.
|
// Load filters and add offsets.
|
||||||
"add x10, %[bias_ptr], #16\n"
|
"add x10, %[bias_ptr], #16\n"
|
||||||
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
|
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
|
||||||
@ -1207,14 +1223,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"add x13, x12, %[input_row_size]\n"
|
"add x13, x12, %[input_row_size]\n"
|
||||||
"add x15, x13, %[input_row_size]\n"
|
"add x15, x13, %[input_row_size]\n"
|
||||||
|
|
||||||
"dup v28.4s, w9\n"
|
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
|
||||||
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
|
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
|
||||||
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
|
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
|
||||||
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
|
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
|
||||||
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
|
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
|
||||||
"sqrshl v21.4s, v21.4s, v28.4s\n"
|
"sqrshl v21.4s, v21.4s, v27.4s\n"
|
||||||
"sqrshl v22.4s, v22.4s, v28.4s\n"
|
"sqrshl v22.4s, v22.4s, v28.4s\n"
|
||||||
"sqrshl v23.4s, v23.4s, v28.4s\n"
|
"sqrshl v23.4s, v23.4s, v27.4s\n"
|
||||||
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
||||||
"dup v28.8h, w0\n"
|
"dup v28.8h, w0\n"
|
||||||
"sqxtn v21.4h, v21.4s\n"
|
"sqxtn v21.4h, v21.4s\n"
|
||||||
@ -1225,10 +1241,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"sqadd v23.8h, v23.8h, v29.8h\n"
|
"sqadd v23.8h, v23.8h, v29.8h\n"
|
||||||
"sqxtn v21.8b, v21.8h\n"
|
"sqxtn v21.8b, v21.8h\n"
|
||||||
"sqxtn2 v21.16b, v23.8h\n"
|
"sqxtn2 v21.16b, v23.8h\n"
|
||||||
|
"dup v27.16b, w3\n"
|
||||||
|
"dup v29.16b, w4\n"
|
||||||
"ld1 {v22.4s}, [x10]\n"
|
"ld1 {v22.4s}, [x10]\n"
|
||||||
"smax v21.16b, v21.16b, v30.16b\n"
|
"smax v21.16b, v21.16b, v27.16b\n"
|
||||||
"smin v21.16b, v21.16b, v31.16b\n"
|
"smin v21.16b, v21.16b, v29.16b\n"
|
||||||
"ld1 {v24.4s}, [x10]\n"
|
"ld1 {v24.4s}, [x10]\n"
|
||||||
|
"dup v29.8h, w2\n"
|
||||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||||
"st1 {v21.8b}, [x6], x5\n"
|
"st1 {v21.8b}, [x6], x5\n"
|
||||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||||
@ -1276,14 +1295,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"smlal v25.4s, v5.4h, v18.4h\n"
|
"smlal v25.4s, v5.4h, v18.4h\n"
|
||||||
"smlal2 v26.4s, v5.8h, v18.8h\n"
|
"smlal2 v26.4s, v5.8h, v18.8h\n"
|
||||||
|
|
||||||
"dup v28.4s, w9\n"
|
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
|
||||||
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
|
"sqrdmulh v19.4s, v19.4s, v30.4s\n"
|
||||||
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
|
"sqrdmulh v20.4s, v20.4s, v31.4s\n"
|
||||||
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
|
"sqrdmulh v25.4s, v25.4s, v30.4s\n"
|
||||||
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
|
"sqrdmulh v26.4s, v26.4s, v31.4s\n"
|
||||||
"sqrshl v19.4s, v19.4s, v28.4s\n"
|
"sqrshl v19.4s, v19.4s, v27.4s\n"
|
||||||
"sqrshl v20.4s, v20.4s, v28.4s\n"
|
"sqrshl v20.4s, v20.4s, v28.4s\n"
|
||||||
"sqrshl v25.4s, v25.4s, v28.4s\n"
|
"sqrshl v25.4s, v25.4s, v27.4s\n"
|
||||||
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
||||||
"dup v28.8h, w0\n"
|
"dup v28.8h, w0\n"
|
||||||
"sqxtn v19.4h, v19.4s\n"
|
"sqxtn v19.4h, v19.4s\n"
|
||||||
@ -1294,10 +1313,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"sqadd v25.8h, v25.8h, v29.8h\n"
|
"sqadd v25.8h, v25.8h, v29.8h\n"
|
||||||
"sqxtn v19.8b, v19.8h\n"
|
"sqxtn v19.8b, v19.8h\n"
|
||||||
"sqxtn2 v19.16b, v25.8h\n"
|
"sqxtn2 v19.16b, v25.8h\n"
|
||||||
|
"dup v27.16b, w3\n"
|
||||||
|
"dup v29.16b, w4\n"
|
||||||
"ld1 {v20.4s}, [x10]\n"
|
"ld1 {v20.4s}, [x10]\n"
|
||||||
"smax v19.16b, v19.16b, v30.16b\n"
|
"smax v19.16b, v19.16b, v27.16b\n"
|
||||||
"smin v19.16b, v19.16b, v31.16b\n"
|
"smin v19.16b, v19.16b, v29.16b\n"
|
||||||
"ld1 {v26.4s}, [x10]\n"
|
"ld1 {v26.4s}, [x10]\n"
|
||||||
|
"dup v29.8h, w2\n"
|
||||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||||
"st1 {v19.8b}, [x7], x5\n"
|
"st1 {v19.8b}, [x7], x5\n"
|
||||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||||
@ -1399,14 +1421,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"smlal2 v26.4s, v2.8h, v13.8h\n"
|
"smlal2 v26.4s, v2.8h, v13.8h\n"
|
||||||
"ld1 {v13.8b}, [x13]\n"
|
"ld1 {v13.8b}, [x13]\n"
|
||||||
|
|
||||||
"dup v28.4s, w9\n"
|
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
|
||||||
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
|
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
|
||||||
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
|
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
|
||||||
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
|
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
|
||||||
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
|
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
|
||||||
"sqrshl v21.4s, v21.4s, v28.4s\n"
|
"sqrshl v21.4s, v21.4s, v27.4s\n"
|
||||||
"sqrshl v22.4s, v22.4s, v28.4s\n"
|
"sqrshl v22.4s, v22.4s, v28.4s\n"
|
||||||
"sqrshl v23.4s, v23.4s, v28.4s\n"
|
"sqrshl v23.4s, v23.4s, v27.4s\n"
|
||||||
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
||||||
"dup v28.8h, w0\n"
|
"dup v28.8h, w0\n"
|
||||||
"sqxtn v21.4h, v21.4s\n"
|
"sqxtn v21.4h, v21.4s\n"
|
||||||
@ -1417,10 +1439,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"sqadd v23.8h, v23.8h, v29.8h\n"
|
"sqadd v23.8h, v23.8h, v29.8h\n"
|
||||||
"sqxtn v21.8b, v21.8h\n"
|
"sqxtn v21.8b, v21.8h\n"
|
||||||
"sqxtn2 v21.16b, v23.8h\n"
|
"sqxtn2 v21.16b, v23.8h\n"
|
||||||
|
"dup v27.16b, w3\n"
|
||||||
|
"dup v29.16b, w4\n"
|
||||||
"ld1 {v22.4s}, [x10]\n"
|
"ld1 {v22.4s}, [x10]\n"
|
||||||
"smax v21.16b, v21.16b, v30.16b\n"
|
"smax v21.16b, v21.16b, v27.16b\n"
|
||||||
"smin v21.16b, v21.16b, v31.16b\n"
|
"smin v21.16b, v21.16b, v29.16b\n"
|
||||||
"ld1 {v24.4s}, [x10]\n"
|
"ld1 {v24.4s}, [x10]\n"
|
||||||
|
"dup v29.8h, w2\n"
|
||||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||||
"st1 {v21.8b}, [x6], x5\n"
|
"st1 {v21.8b}, [x6], x5\n"
|
||||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||||
@ -1460,14 +1485,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"smlal v25.4s, v5.4h, v18.4h\n"
|
"smlal v25.4s, v5.4h, v18.4h\n"
|
||||||
"smlal2 v26.4s, v5.8h, v18.8h\n"
|
"smlal2 v26.4s, v5.8h, v18.8h\n"
|
||||||
|
|
||||||
"dup v28.4s, w9\n"
|
"ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
|
||||||
"sqrdmulh v19.4s, v19.4s, v27.4s\n"
|
"sqrdmulh v19.4s, v19.4s, v30.4s\n"
|
||||||
"sqrdmulh v20.4s, v20.4s, v27.4s\n"
|
"sqrdmulh v20.4s, v20.4s, v31.4s\n"
|
||||||
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
|
"sqrdmulh v25.4s, v25.4s, v30.4s\n"
|
||||||
"sqrdmulh v26.4s, v26.4s, v27.4s\n"
|
"sqrdmulh v26.4s, v26.4s, v31.4s\n"
|
||||||
"sqrshl v19.4s, v19.4s, v28.4s\n"
|
"sqrshl v19.4s, v19.4s, v27.4s\n"
|
||||||
"sqrshl v20.4s, v20.4s, v28.4s\n"
|
"sqrshl v20.4s, v20.4s, v28.4s\n"
|
||||||
"sqrshl v25.4s, v25.4s, v28.4s\n"
|
"sqrshl v25.4s, v25.4s, v27.4s\n"
|
||||||
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
||||||
"dup v28.8h, w0\n"
|
"dup v28.8h, w0\n"
|
||||||
"sqxtn v19.4h, v19.4s\n"
|
"sqxtn v19.4h, v19.4s\n"
|
||||||
@ -1476,11 +1501,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"sqxtn2 v25.8h, v26.4s\n"
|
"sqxtn2 v25.8h, v26.4s\n"
|
||||||
"sqadd v19.8h, v19.8h, v29.8h\n"
|
"sqadd v19.8h, v19.8h, v29.8h\n"
|
||||||
"sqadd v25.8h, v25.8h, v29.8h\n"
|
"sqadd v25.8h, v25.8h, v29.8h\n"
|
||||||
|
"dup v27.16b, w3\n"
|
||||||
|
"dup v29.16b, w4\n"
|
||||||
"sqxtn v19.8b, v19.8h\n"
|
"sqxtn v19.8b, v19.8h\n"
|
||||||
"sqxtn2 v19.16b, v25.8h\n"
|
"sqxtn2 v19.16b, v25.8h\n"
|
||||||
"smax v19.16b, v19.16b, v30.16b\n"
|
"smax v19.16b, v19.16b, v27.16b\n"
|
||||||
"smin v19.16b, v19.16b, v31.16b\n"
|
"smin v19.16b, v19.16b, v29.16b\n"
|
||||||
"st1 {v19.8b}, [x7], x5\n"
|
"st1 {v19.8b}, [x7], x5\n"
|
||||||
|
"dup v29.8h, w2\n"
|
||||||
"mov v25.d[0], v19.d[1]\n"
|
"mov v25.d[0], v19.d[1]\n"
|
||||||
"st1 {v25.8b}, [x7]\n"
|
"st1 {v25.8b}, [x7]\n"
|
||||||
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
|
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
|
||||||
@ -1533,17 +1561,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"smlal v23.4s, v2.4h, v17.4h\n"
|
"smlal v23.4s, v2.4h, v17.4h\n"
|
||||||
"smlal2 v24.4s, v2.8h, v17.8h\n"
|
"smlal2 v24.4s, v2.8h, v17.8h\n"
|
||||||
|
|
||||||
"dup v26.4s, w9\n"
|
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
|
||||||
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
|
"sqrdmulh v21.4s, v21.4s, v30.4s\n"
|
||||||
"sqrdmulh v22.4s, v22.4s, v27.4s\n"
|
"sqrdmulh v22.4s, v22.4s, v31.4s\n"
|
||||||
"sqrshl v21.4s, v21.4s, v26.4s\n"
|
"sqrshl v21.4s, v21.4s, v26.4s\n"
|
||||||
"sqrshl v22.4s, v22.4s, v26.4s\n"
|
"sqrshl v22.4s, v22.4s, v27.4s\n"
|
||||||
"sqxtn v21.4h, v21.4s\n"
|
"sqxtn v21.4h, v21.4s\n"
|
||||||
"sqxtn2 v21.8h, v22.4s\n"
|
"sqxtn2 v21.8h, v22.4s\n"
|
||||||
|
"dup v26.16b, w3\n"
|
||||||
|
"dup v27.16b, w4\n"
|
||||||
"sqadd v21.8h, v21.8h, v29.8h\n"
|
"sqadd v21.8h, v21.8h, v29.8h\n"
|
||||||
"sqxtn v21.8b, v21.8h\n"
|
"sqxtn v21.8b, v21.8h\n"
|
||||||
"smax v21.8b, v21.8b, v30.8b\n"
|
"smax v21.8b, v21.8b, v26.8b\n"
|
||||||
"smin v21.8b, v21.8b, v31.8b\n"
|
"smin v21.8b, v21.8b, v27.8b\n"
|
||||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||||
"st1 {v21.8b}, [x6]\n"
|
"st1 {v21.8b}, [x6]\n"
|
||||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||||
@ -1566,16 +1596,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"smlal v23.4s, v8.4h, v16.4h\n"
|
"smlal v23.4s, v8.4h, v16.4h\n"
|
||||||
"smlal2 v24.4s, v8.8h, v16.8h\n"
|
"smlal2 v24.4s, v8.8h, v16.8h\n"
|
||||||
|
|
||||||
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
|
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
|
||||||
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
|
"sqrdmulh v23.4s, v23.4s, v30.4s\n"
|
||||||
|
"sqrdmulh v24.4s, v24.4s, v31.4s\n"
|
||||||
"sqrshl v23.4s, v23.4s, v26.4s\n"
|
"sqrshl v23.4s, v23.4s, v26.4s\n"
|
||||||
"sqrshl v24.4s, v24.4s, v26.4s\n"
|
"sqrshl v24.4s, v24.4s, v27.4s\n"
|
||||||
"sqxtn v23.4h, v23.4s\n"
|
"sqxtn v23.4h, v23.4s\n"
|
||||||
"sqxtn2 v23.8h, v24.4s\n"
|
"sqxtn2 v23.8h, v24.4s\n"
|
||||||
|
"dup v26.16b, w3\n"
|
||||||
|
"dup v27.16b, w4\n"
|
||||||
"sqadd v23.8h, v23.8h, v29.8h\n"
|
"sqadd v23.8h, v23.8h, v29.8h\n"
|
||||||
"sqxtn v23.8b, v23.8h\n"
|
"sqxtn v23.8b, v23.8h\n"
|
||||||
"smax v23.8b, v23.8b, v30.8b\n"
|
"smax v23.8b, v23.8b, v26.8b\n"
|
||||||
"smin v23.8b, v23.8b, v31.8b\n"
|
"smin v23.8b, v23.8b, v27.8b\n"
|
||||||
"st1 {v23.8b}, [x7]\n"
|
"st1 {v23.8b}, [x7]\n"
|
||||||
|
|
||||||
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
|
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
|
||||||
@ -1696,17 +1729,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"smlal v26.4s, v8.4h, v23.4h\n"
|
"smlal v26.4s, v8.4h, v23.4h\n"
|
||||||
"smlal2 v27.4s, v8.8h, v23.8h\n"
|
"smlal2 v27.4s, v8.8h, v23.8h\n"
|
||||||
|
|
||||||
"dup v28.4s, w1\n"
|
"ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
|
||||||
"dup v29.4s, w9\n"
|
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
|
||||||
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
|
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
|
||||||
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
|
"sqrdmulh v26.4s, v26.4s, v30.4s\n"
|
||||||
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
|
"sqrdmulh v27.4s, v27.4s, v31.4s\n"
|
||||||
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
|
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
||||||
"dup v28.8h, w2\n"
|
|
||||||
"sqrshl v24.4s, v24.4s, v29.4s\n"
|
|
||||||
"sqrshl v25.4s, v25.4s, v29.4s\n"
|
"sqrshl v25.4s, v25.4s, v29.4s\n"
|
||||||
"sqrshl v26.4s, v26.4s, v29.4s\n"
|
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
||||||
"sqrshl v27.4s, v27.4s, v29.4s\n"
|
"sqrshl v27.4s, v27.4s, v29.4s\n"
|
||||||
|
"dup v28.8h, w2\n"
|
||||||
"sqxtn v24.4h, v24.4s\n"
|
"sqxtn v24.4h, v24.4s\n"
|
||||||
"sqxtn2 v24.8h, v25.4s\n"
|
"sqxtn2 v24.8h, v25.4s\n"
|
||||||
"sqxtn v26.4h, v26.4s\n"
|
"sqxtn v26.4h, v26.4s\n"
|
||||||
@ -1716,12 +1748,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"sqxtn v24.8b, v24.8h\n"
|
"sqxtn v24.8b, v24.8h\n"
|
||||||
"sqxtn2 v24.16b, v26.8h\n"
|
"sqxtn2 v24.16b, v26.8h\n"
|
||||||
"dup v28.8h, w0\n"
|
"dup v28.8h, w0\n"
|
||||||
|
"dup v27.16b, w3\n"
|
||||||
|
"dup v29.16b, w4\n"
|
||||||
"ld1 {v25.4s}, [x10]\n"
|
"ld1 {v25.4s}, [x10]\n"
|
||||||
"smax v24.16b, v24.16b, v30.16b\n"
|
"smax v24.16b, v24.16b, v27.16b\n"
|
||||||
"smin v24.16b, v24.16b, v31.16b\n"
|
"smin v24.16b, v24.16b, v29.16b\n"
|
||||||
"ld1 {v27.4s}, [x10]\n"
|
|
||||||
"saddw v9.8h, v28.8h, v9.8b\n"
|
"saddw v9.8h, v28.8h, v9.8b\n"
|
||||||
"st1 {v24.8b}, [x6], x5\n"
|
"st1 {v24.8b}, [x6], x5\n"
|
||||||
|
"ld1 {v27.4s}, [x10]\n"
|
||||||
"saddw v10.8h, v28.8h, v10.8b\n"
|
"saddw v10.8h, v28.8h, v10.8b\n"
|
||||||
"mov v26.d[0], v24.d[1]\n"
|
"mov v26.d[0], v24.d[1]\n"
|
||||||
"st1 {v26.8b}, [x6], x5\n"
|
"st1 {v26.8b}, [x6], x5\n"
|
||||||
@ -1794,17 +1828,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"smlal v26.4s, v8.4h, v23.4h\n"
|
"smlal v26.4s, v8.4h, v23.4h\n"
|
||||||
"smlal2 v27.4s, v8.8h, v23.8h\n"
|
"smlal2 v27.4s, v8.8h, v23.8h\n"
|
||||||
|
|
||||||
"dup v28.4s, w1\n"
|
"ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
|
||||||
"dup v29.4s, w9\n"
|
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
|
||||||
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
|
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
|
||||||
"sqrdmulh v25.4s, v25.4s, v28.4s\n"
|
"sqrdmulh v26.4s, v26.4s, v30.4s\n"
|
||||||
"sqrdmulh v26.4s, v26.4s, v28.4s\n"
|
"sqrdmulh v27.4s, v27.4s, v31.4s\n"
|
||||||
"sqrdmulh v27.4s, v27.4s, v28.4s\n"
|
"sqrshl v24.4s, v24.4s, v28.4s\n"
|
||||||
"dup v28.8h, w2\n"
|
|
||||||
"sqrshl v24.4s, v24.4s, v29.4s\n"
|
|
||||||
"sqrshl v25.4s, v25.4s, v29.4s\n"
|
"sqrshl v25.4s, v25.4s, v29.4s\n"
|
||||||
"sqrshl v26.4s, v26.4s, v29.4s\n"
|
"sqrshl v26.4s, v26.4s, v28.4s\n"
|
||||||
"sqrshl v27.4s, v27.4s, v29.4s\n"
|
"sqrshl v27.4s, v27.4s, v29.4s\n"
|
||||||
|
"dup v28.8h, w2\n"
|
||||||
"sqxtn v24.4h, v24.4s\n"
|
"sqxtn v24.4h, v24.4s\n"
|
||||||
"sqxtn2 v24.8h, v25.4s\n"
|
"sqxtn2 v24.8h, v25.4s\n"
|
||||||
"sqxtn v26.4h, v26.4s\n"
|
"sqxtn v26.4h, v26.4s\n"
|
||||||
@ -1812,19 +1845,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"sqadd v24.8h, v24.8h, v28.8h\n"
|
"sqadd v24.8h, v24.8h, v28.8h\n"
|
||||||
"sqadd v26.8h, v26.8h, v28.8h\n"
|
"sqadd v26.8h, v26.8h, v28.8h\n"
|
||||||
"sqxtn v24.8b, v24.8h\n"
|
"sqxtn v24.8b, v24.8h\n"
|
||||||
|
"dup v28.16b, w3\n"
|
||||||
|
"dup v29.16b, w4\n"
|
||||||
"sqxtn2 v24.16b, v26.8h\n"
|
"sqxtn2 v24.16b, v26.8h\n"
|
||||||
"dup v28.8h, w0\n"
|
"smax v24.16b, v24.16b, v28.16b\n"
|
||||||
"smax v24.16b, v24.16b, v30.16b\n"
|
"smin v24.16b, v24.16b, v29.16b\n"
|
||||||
"smin v24.16b, v24.16b, v31.16b\n"
|
|
||||||
"st1 {v24.8b}, [x6], x5\n"
|
"st1 {v24.8b}, [x6], x5\n"
|
||||||
"mov v26.d[0], v24.d[1]\n"
|
"mov v26.d[0], v24.d[1]\n"
|
||||||
"st1 {v26.8b}, [x6]\n"
|
"st1 {v26.8b}, [x6]\n"
|
||||||
|
"dup v28.8h, w0\n"
|
||||||
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
|
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
|
||||||
|
|
||||||
// Handle bottom right output if exists.
|
// Handle bottom right output if exists.
|
||||||
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
|
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
|
||||||
"dup v26.4s, w9\n"
|
|
||||||
"dup v27.4s, w1\n"
|
|
||||||
"dup v29.8h, w2\n"
|
"dup v29.8h, w2\n"
|
||||||
|
|
||||||
"smlal v24.4s, v0.4h, v9.4h\n"
|
"smlal v24.4s, v0.4h, v9.4h\n"
|
||||||
@ -1846,16 +1879,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"smlal v24.4s, v8.4h, v17.4h\n"
|
"smlal v24.4s, v8.4h, v17.4h\n"
|
||||||
"smlal2 v25.4s, v8.8h, v17.8h\n"
|
"smlal2 v25.4s, v8.8h, v17.8h\n"
|
||||||
|
|
||||||
"sqrdmulh v24.4s, v24.4s, v27.4s\n"
|
"ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
|
||||||
"sqrdmulh v25.4s, v25.4s, v27.4s\n"
|
"sqrdmulh v24.4s, v24.4s, v30.4s\n"
|
||||||
|
"sqrdmulh v25.4s, v25.4s, v31.4s\n"
|
||||||
"sqrshl v24.4s, v24.4s, v26.4s\n"
|
"sqrshl v24.4s, v24.4s, v26.4s\n"
|
||||||
"sqrshl v25.4s, v25.4s, v26.4s\n"
|
"sqrshl v25.4s, v25.4s, v27.4s\n"
|
||||||
"sqxtn v24.4h, v24.4s\n"
|
"sqxtn v24.4h, v24.4s\n"
|
||||||
"sqxtn2 v24.8h, v25.4s\n"
|
"sqxtn2 v24.8h, v25.4s\n"
|
||||||
|
"dup v26.16b, w3\n"
|
||||||
|
"dup v27.16b, w4\n"
|
||||||
"sqadd v24.8h, v24.8h, v29.8h\n"
|
"sqadd v24.8h, v24.8h, v29.8h\n"
|
||||||
"sqxtn v24.8b, v24.8h\n"
|
"sqxtn v24.8b, v24.8h\n"
|
||||||
"smax v24.8b, v24.8b, v30.8b\n"
|
"smax v24.8b, v24.8b, v26.8b\n"
|
||||||
"smin v24.8b, v24.8b, v31.8b\n"
|
"smin v24.8b, v24.8b, v27.8b\n"
|
||||||
"st1 {v24.8b}, [x6]\n"
|
"st1 {v24.8b}, [x6]\n"
|
||||||
|
|
||||||
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
|
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
|
||||||
@ -1866,6 +1902,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
[output_window_height] "+r"(output_window_height)
|
[output_window_height] "+r"(output_window_height)
|
||||||
:
|
:
|
||||||
// Inputs.
|
// Inputs.
|
||||||
|
[output_multiplier_ptr] "r"(output_multiplier_ptr),
|
||||||
|
[output_shift_ptr] "r"(output_shift_ptr),
|
||||||
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
|
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
|
||||||
[input_depth] "r"(input_depth),
|
[input_depth] "r"(input_depth),
|
||||||
[output_window_width] "r"(output_window_width),
|
[output_window_width] "r"(output_window_width),
|
||||||
@ -1882,8 +1920,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
|||||||
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
|
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
|
||||||
"v30", "v31",
|
"v30", "v31",
|
||||||
// We use these general-purpose registers.
|
// We use these general-purpose registers.
|
||||||
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
|
"x0", "x2", "x3", "x4", "x5", "x6", "x7",
|
||||||
"x9", "x10", "x11", "x12", "x13", "x14", "x15",
|
"x10", "x11", "x12", "x13", "x14", "x15",
|
||||||
"x19", "x20");
|
"x19", "x20");
|
||||||
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
|
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
|
||||||
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
|
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
|
||||||
|
Loading…
Reference in New Issue
Block a user