Change the depthwise_conv 3x3 filter offset add logic, since in the per-channel case we always use symmetric quantization.
PiperOrigin-RevId: 250415602
This commit is contained in:
parent
a0be4ae064
commit
ee65ff1690
@ -280,6 +280,7 @@ void TryTestOneDepthwiseConv3x3Filter() {
|
||||
params.depth_multiplier = depth_multiplier;
|
||||
params.input_offset = input_offset;
|
||||
params.output_offset = output_offset;
|
||||
params.weights_offset = 0;
|
||||
params.quantized_activation_min = output_activation_min;
|
||||
params.quantized_activation_max = output_activation_max;
|
||||
|
||||
|
@ -46,7 +46,6 @@ namespace depthwise_conv {
|
||||
#define OFFSET_FILTER_ROW_SIZE 32
|
||||
#define OFFSET_INPUT_OFFSET 40
|
||||
#define OFFSET_OUTPUT_OFFSET 44
|
||||
#define OFFSET_FILTER_OFFSET 48
|
||||
#define OFFSET_OUTPUT_MULTIPLIER 52
|
||||
#define OFFSET_OUTPUT_ACTIVATION_MIN 56
|
||||
#define OFFSET_OUTPUT_ACTIVATION_MAX 60
|
||||
@ -78,9 +77,6 @@ static_assert(offsetof(DepthwiseConvParams, input_offset) ==
|
||||
static_assert(offsetof(DepthwiseConvParams, output_offset) ==
|
||||
OFFSET_OUTPUT_OFFSET,
|
||||
"");
|
||||
static_assert(offsetof(DepthwiseConvParams, filter_offset) ==
|
||||
OFFSET_FILTER_OFFSET,
|
||||
"");
|
||||
static_assert(offsetof(DepthwiseConvParams, output_multiplier) ==
|
||||
OFFSET_OUTPUT_MULTIPLIER,
|
||||
"");
|
||||
@ -131,6 +127,7 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
|
||||
const int64_t input_width_increment = 2 * input_depth;
|
||||
const int64_t input_height_increment = 2 * input_row_size;
|
||||
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
|
||||
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
|
||||
|
||||
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
|
||||
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
|
||||
@ -208,10 +205,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
|
||||
"dup v29.8h, w2\n"
|
||||
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
|
||||
"ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
|
||||
"ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
|
||||
"add x10, %[bias_ptr], #16\n"
|
||||
"ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
|
||||
"dup v9.8h, w9\n"
|
||||
"dup v25.16b, w4\n"
|
||||
|
||||
// Deal with output multiplier & output shift.
|
||||
@ -221,22 +216,22 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
|
||||
// Load filters and add offsets.
|
||||
"ld1 {v0.8b}, [%[filter_ptr]], x3\n"
|
||||
"ld1 {v1.8b}, [%[filter_ptr]], x3\n"
|
||||
"saddw v0.8h, v9.8h, v0.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"ld1 {v2.8b}, [%[filter_ptr]], x3\n"
|
||||
"saddw v1.8h, v9.8h, v1.8b\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"ld1 {v3.8b}, [%[filter_ptr]], x3\n"
|
||||
"saddw v2.8h, v9.8h, v2.8b\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"ld1 {v4.8b}, [%[filter_ptr]], x3\n"
|
||||
"saddw v3.8h, v9.8h, v3.8b\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
"ld1 {v5.8b}, [%[filter_ptr]], x3\n"
|
||||
"saddw v4.8h, v9.8h, v4.8b\n"
|
||||
"sshll v4.8h, v4.8b, #0\n"
|
||||
"ld1 {v6.8b}, [%[filter_ptr]], x3\n"
|
||||
"saddw v5.8h, v9.8h, v5.8b\n"
|
||||
"sshll v5.8h, v5.8b, #0\n"
|
||||
"ld1 {v7.8b}, [%[filter_ptr]], x3\n"
|
||||
"saddw v6.8h, v9.8h, v6.8b\n"
|
||||
"sshll v6.8h, v6.8b, #0\n"
|
||||
"ld1 {v8.8b}, [%[filter_ptr]], x3\n"
|
||||
"saddw v7.8h, v9.8h, v7.8b\n"
|
||||
"saddw v8.8h, v9.8h, v8.8b\n"
|
||||
"sshll v7.8h, v7.8b, #0\n"
|
||||
"sshll v8.8h, v8.8b, #0\n"
|
||||
|
||||
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
|
||||
|
||||
@ -987,6 +982,7 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
const int64_t input_width_increment = 4 * input_depth;
|
||||
const int64_t input_height_increment = 4 * input_row_size;
|
||||
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
|
||||
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
|
||||
|
||||
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
|
||||
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
|
||||
@ -1064,7 +1060,6 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
|
||||
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
|
||||
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
|
||||
"ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
|
||||
|
||||
// Deal with output multiplier.
|
||||
"ld1 {v30.4s, v31.4s}, [%[output_multiplier_ptr]]\n"
|
||||
@ -1072,24 +1067,23 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
|
||||
// Load filters and add offsets.
|
||||
"add x10, %[bias_ptr], #16\n"
|
||||
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
|
||||
"dup v9.8h, w20\n"
|
||||
"ld1 {v1.8b}, [%[filter_ptr]], x5\n"
|
||||
"saddw v0.8h, v9.8h, v0.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"ld1 {v2.8b}, [%[filter_ptr]], x5\n"
|
||||
"saddw v1.8h, v9.8h, v1.8b\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"ld1 {v3.8b}, [%[filter_ptr]], x5\n"
|
||||
"saddw v2.8h, v9.8h, v2.8b\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"ld1 {v4.8b}, [%[filter_ptr]], x5\n"
|
||||
"saddw v3.8h, v9.8h, v3.8b\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
"ld1 {v5.8b}, [%[filter_ptr]], x5\n"
|
||||
"saddw v4.8h, v9.8h, v4.8b\n"
|
||||
"sshll v4.8h, v4.8b, #0\n"
|
||||
"ld1 {v6.8b}, [%[filter_ptr]], x5\n"
|
||||
"saddw v5.8h, v9.8h, v5.8b\n"
|
||||
"sshll v5.8h, v5.8b, #0\n"
|
||||
"ld1 {v7.8b}, [%[filter_ptr]], x5\n"
|
||||
"saddw v6.8h, v9.8h, v6.8b\n"
|
||||
"sshll v6.8h, v6.8b, #0\n"
|
||||
"ld1 {v8.8b}, [%[filter_ptr]]\n"
|
||||
"saddw v7.8h, v9.8h, v7.8b\n"
|
||||
"saddw v8.8h, v9.8h, v8.8b\n"
|
||||
"sshll v7.8h, v7.8b, #0\n"
|
||||
"sshll v8.8h, v8.8b, #0\n"
|
||||
|
||||
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
|
||||
|
||||
@ -1945,6 +1939,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
const int8* filter_ptr, const int32* bias_ptr,
|
||||
int8* output_ptr,
|
||||
const DepthwiseConvParams* params_ptr) {
|
||||
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
|
||||
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
|
||||
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
|
||||
asm volatile(
|
||||
@ -1964,14 +1959,12 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
|
||||
"ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
|
||||
"dup v30.16b, w9\n"
|
||||
"ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
|
||||
"dup v31.16b, w10\n"
|
||||
"dup v25.8h, w9\n"
|
||||
|
||||
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
|
||||
"saddw v8.8h, v26.8h, v8.8b\n"
|
||||
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
|
||||
"saddw v0.8h, v25.8h, v0.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
|
||||
// Loads output_multiplier & output_shift.
|
||||
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
@ -2003,7 +1996,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"st1 {v16.8b}, [%[output_ptr]], #8\n"
|
||||
"saddw v8.8h, v26.8h, v8.8b\n"
|
||||
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
|
||||
"saddw v0.8h, v25.8h, v0.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
|
||||
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
"ld1 {v10.4s}, [%[output_shift_ptr]], #16\n"
|
||||
@ -2041,7 +2034,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
// Clobbers.
|
||||
"cc", "memory",
|
||||
// We use these NEON registers.
|
||||
"v0", "v6", "v7", "v8", "v10", "v11", "v16", "v17", "v18", "v19", "v25",
|
||||
"v0", "v6", "v7", "v8", "v10", "v11", "v16", "v17", "v18", "v19",
|
||||
"v26", "v28", "v30", "v31",
|
||||
// We use these general-purpose registers.
|
||||
"x9", "x10", "x11");
|
||||
@ -2058,6 +2051,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
const int8* filter_ptr, const int32* bias_ptr,
|
||||
int8* output_ptr,
|
||||
const DepthwiseConvParams* params_ptr) {
|
||||
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
|
||||
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
|
||||
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
|
||||
asm volatile(
|
||||
@ -2097,9 +2091,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
|
||||
"ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
|
||||
"dup v30.16b, w6\n"
|
||||
"ldr w6, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
|
||||
"dup v31.16b, w7\n"
|
||||
"dup v25.8h, w6\n"
|
||||
|
||||
// Loads output_multiplier & output_shift.
|
||||
"ld1 {v4.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
@ -2115,10 +2107,10 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"saddw v10.8h, v26.8h, v10.8b\n"
|
||||
"saddw v11.8h, v26.8h, v11.8b\n"
|
||||
|
||||
"saddw v0.8h, v25.8h, v0.8b\n"
|
||||
"saddw v1.8h, v25.8h, v1.8b\n"
|
||||
"saddw v2.8h, v25.8h, v2.8b\n"
|
||||
"saddw v3.8h, v25.8h, v3.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
|
||||
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
|
||||
|
||||
@ -2160,10 +2152,10 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
|
||||
"saddw v10.8h, v26.8h, v10.8b\n"
|
||||
"saddw v11.8h, v26.8h, v11.8b\n"
|
||||
"saddw v0.8h, v25.8h, v0.8b\n"
|
||||
"saddw v1.8h, v25.8h, v1.8b\n"
|
||||
"saddw v2.8h, v25.8h, v2.8b\n"
|
||||
"saddw v3.8h, v25.8h, v3.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
"ld1 {v4.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
"ld1 {v6.4s}, [%[output_shift_ptr]], #16\n"
|
||||
"ld1 {v5.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
@ -2207,7 +2199,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"cc", "memory",
|
||||
// We use these NEON registers.
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
||||
"v11", "v16", "v17","v18", "v19", "v25", "v26", "v28", "v30", "v31",
|
||||
"v11", "v16", "v17","v18", "v19", "v26", "v28", "v30", "v31",
|
||||
// We use these general-purpose registers.
|
||||
"x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
|
||||
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
|
||||
@ -2223,6 +2215,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
const int8* filter_ptr, const int32* bias_ptr,
|
||||
int8* output_ptr,
|
||||
const DepthwiseConvParams* params_ptr) {
|
||||
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
|
||||
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
|
||||
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
|
||||
asm volatile(
|
||||
@ -2268,9 +2261,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
|
||||
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
|
||||
"dup v30.8b, w12\n"
|
||||
"ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
|
||||
"dup v31.8b, w13\n"
|
||||
"dup v25.8h, w12\n"
|
||||
|
||||
// Loads output_multiplier & output_shift.
|
||||
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
@ -2288,12 +2279,12 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"saddw v12.8h, v26.8h, v12.8b\n"
|
||||
"saddw v13.8h, v26.8h, v13.8b\n"
|
||||
|
||||
"saddw v0.8h, v25.8h, v0.8b\n"
|
||||
"saddw v1.8h, v25.8h, v1.8b\n"
|
||||
"saddw v2.8h, v25.8h, v2.8b\n"
|
||||
"saddw v3.8h, v25.8h, v3.8b\n"
|
||||
"saddw v4.8h, v25.8h, v4.8b\n"
|
||||
"saddw v5.8h, v25.8h, v5.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
"sshll v4.8h, v4.8b, #0\n"
|
||||
"sshll v5.8h, v5.8b, #0\n"
|
||||
|
||||
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
|
||||
|
||||
@ -2351,14 +2342,14 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"saddw v12.8h, v26.8h, v12.8b\n"
|
||||
"saddw v13.8h, v26.8h, v13.8b\n"
|
||||
|
||||
"saddw v0.8h, v25.8h, v0.8b\n"
|
||||
"saddw v1.8h, v25.8h, v1.8b\n"
|
||||
"saddw v2.8h, v25.8h, v2.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
|
||||
"saddw v3.8h, v25.8h, v3.8b\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
|
||||
"saddw v4.8h, v25.8h, v4.8b\n"
|
||||
"saddw v5.8h, v25.8h, v5.8b\n"
|
||||
"sshll v4.8h, v4.8b, #0\n"
|
||||
"sshll v5.8h, v5.8b, #0\n"
|
||||
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
"ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
|
||||
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
@ -2405,7 +2396,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"cc", "memory",
|
||||
// We use these NEON registers.
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
||||
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v25",
|
||||
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
|
||||
"v26", "v28", "v30", "v31",
|
||||
// We use these general-purpose registers.
|
||||
"x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
|
||||
@ -2421,6 +2412,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
const int8* filter_ptr, const int32* bias_ptr,
|
||||
int8* output_ptr,
|
||||
const DepthwiseConvParams* params_ptr) {
|
||||
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
|
||||
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
|
||||
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
|
||||
asm volatile(
|
||||
@ -2468,9 +2460,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
|
||||
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
|
||||
"dup v30.8b, w12\n"
|
||||
"ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
|
||||
"dup v31.8b, w13\n"
|
||||
"dup v25.8h, w12\n"
|
||||
|
||||
// Loads output_multiplier & output_shift.
|
||||
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
@ -2488,12 +2478,12 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"saddw v12.8h, v26.8h, v12.8b\n"
|
||||
"saddw v13.8h, v26.8h, v13.8b\n"
|
||||
|
||||
"saddw v0.8h, v25.8h, v0.8b\n"
|
||||
"saddw v1.8h, v25.8h, v1.8b\n"
|
||||
"saddw v2.8h, v25.8h, v2.8b\n"
|
||||
"saddw v3.8h, v25.8h, v3.8b\n"
|
||||
"saddw v4.8h, v25.8h, v4.8b\n"
|
||||
"saddw v5.8h, v25.8h, v5.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
"sshll v4.8h, v4.8b, #0\n"
|
||||
"sshll v5.8h, v5.8b, #0\n"
|
||||
|
||||
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
|
||||
|
||||
@ -2553,14 +2543,14 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"saddw v12.8h, v26.8h, v12.8b\n"
|
||||
"saddw v13.8h, v26.8h, v13.8b\n"
|
||||
|
||||
"saddw v0.8h, v25.8h, v0.8b\n"
|
||||
"saddw v1.8h, v25.8h, v1.8b\n"
|
||||
"saddw v2.8h, v25.8h, v2.8b\n"
|
||||
"sshll v0.8h, v0.8b, #0\n"
|
||||
"sshll v1.8h, v1.8b, #0\n"
|
||||
"sshll v2.8h, v2.8b, #0\n"
|
||||
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
|
||||
"saddw v3.8h, v25.8h, v3.8b\n"
|
||||
"sshll v3.8h, v3.8b, #0\n"
|
||||
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
|
||||
"saddw v4.8h, v25.8h, v4.8b\n"
|
||||
"saddw v5.8h, v25.8h, v5.8b\n"
|
||||
"sshll v4.8h, v4.8b, #0\n"
|
||||
"sshll v5.8h, v5.8b, #0\n"
|
||||
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
"ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
|
||||
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
|
||||
@ -2608,7 +2598,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
"cc", "memory",
|
||||
// We use these NEON registers.
|
||||
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
|
||||
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v25",
|
||||
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
|
||||
"v26", "v28", "v30", "v31",
|
||||
// We use these general-purpose registers.
|
||||
"x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
|
||||
@ -2623,7 +2613,6 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
|
||||
#undef OFFSET_OUTPUT_ROW_SIZE
|
||||
#undef OFFSET_INPUT_OFFSET
|
||||
#undef OFFSET_OUTPUT_OFFSET
|
||||
#undef OFFSET_FILTER_OFFSET
|
||||
#undef OFFSET_OUTPUT_MULTIPLIER
|
||||
#undef OFFSET_OUTPUT_ACTIVATION_MIN
|
||||
#undef OFFSET_OUTPUT_ACTIVATION_MAX
|
||||
|
Loading…
Reference in New Issue
Block a user