Change the depthwise_conv 3x3 filter offset-add logic: the per-channel case always uses symmetric quantization, so the filter offset is always zero and the filters only need to be sign-extended.

PiperOrigin-RevId: 250415602
This commit is contained in:
Renjie Liu 2019-05-28 20:36:28 -07:00 committed by TensorFlower Gardener
parent a0be4ae064
commit ee65ff1690
2 changed files with 63 additions and 73 deletions

View File

@ -280,6 +280,7 @@ void TryTestOneDepthwiseConv3x3Filter() {
params.depth_multiplier = depth_multiplier;
params.input_offset = input_offset;
params.output_offset = output_offset;
params.weights_offset = 0;
params.quantized_activation_min = output_activation_min;
params.quantized_activation_max = output_activation_max;

View File

@ -46,7 +46,6 @@ namespace depthwise_conv {
#define OFFSET_FILTER_ROW_SIZE 32
#define OFFSET_INPUT_OFFSET 40
#define OFFSET_OUTPUT_OFFSET 44
#define OFFSET_FILTER_OFFSET 48
#define OFFSET_OUTPUT_MULTIPLIER 52
#define OFFSET_OUTPUT_ACTIVATION_MIN 56
#define OFFSET_OUTPUT_ACTIVATION_MAX 60
@ -78,9 +77,6 @@ static_assert(offsetof(DepthwiseConvParams, input_offset) ==
static_assert(offsetof(DepthwiseConvParams, output_offset) ==
OFFSET_OUTPUT_OFFSET,
"");
static_assert(offsetof(DepthwiseConvParams, filter_offset) ==
OFFSET_FILTER_OFFSET,
"");
static_assert(offsetof(DepthwiseConvParams, output_multiplier) ==
OFFSET_OUTPUT_MULTIPLIER,
"");
@ -131,6 +127,7 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
const int64_t input_width_increment = 2 * input_depth;
const int64_t input_height_increment = 2 * input_row_size;
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
@ -208,10 +205,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
"dup v29.8h, w2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"add x10, %[bias_ptr], #16\n"
"ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"dup v9.8h, w9\n"
"dup v25.16b, w4\n"
// Deal with output multiplier & output shift.
@ -221,22 +216,22 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
// Load filters and add offsets.
"ld1 {v0.8b}, [%[filter_ptr]], x3\n"
"ld1 {v1.8b}, [%[filter_ptr]], x3\n"
"saddw v0.8h, v9.8h, v0.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v2.8b}, [%[filter_ptr]], x3\n"
"saddw v1.8h, v9.8h, v1.8b\n"
"sshll v1.8h, v1.8b, #0\n"
"ld1 {v3.8b}, [%[filter_ptr]], x3\n"
"saddw v2.8h, v9.8h, v2.8b\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v4.8b}, [%[filter_ptr]], x3\n"
"saddw v3.8h, v9.8h, v3.8b\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v5.8b}, [%[filter_ptr]], x3\n"
"saddw v4.8h, v9.8h, v4.8b\n"
"sshll v4.8h, v4.8b, #0\n"
"ld1 {v6.8b}, [%[filter_ptr]], x3\n"
"saddw v5.8h, v9.8h, v5.8b\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v7.8b}, [%[filter_ptr]], x3\n"
"saddw v6.8h, v9.8h, v6.8b\n"
"sshll v6.8h, v6.8b, #0\n"
"ld1 {v8.8b}, [%[filter_ptr]], x3\n"
"saddw v7.8h, v9.8h, v7.8b\n"
"saddw v8.8h, v9.8h, v8.8b\n"
"sshll v7.8h, v7.8b, #0\n"
"sshll v8.8h, v8.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
@ -987,6 +982,7 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
const int64_t input_width_increment = 4 * input_depth;
const int64_t input_height_increment = 4 * input_row_size;
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
@ -1064,7 +1060,6 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
"ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
// Deal with output multiplier.
"ld1 {v30.4s, v31.4s}, [%[output_multiplier_ptr]]\n"
@ -1072,24 +1067,23 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
// Load filters and add offsets.
"add x10, %[bias_ptr], #16\n"
"ld1 {v0.8b}, [%[filter_ptr]], x5\n"
"dup v9.8h, w20\n"
"ld1 {v1.8b}, [%[filter_ptr]], x5\n"
"saddw v0.8h, v9.8h, v0.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v2.8b}, [%[filter_ptr]], x5\n"
"saddw v1.8h, v9.8h, v1.8b\n"
"sshll v1.8h, v1.8b, #0\n"
"ld1 {v3.8b}, [%[filter_ptr]], x5\n"
"saddw v2.8h, v9.8h, v2.8b\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v4.8b}, [%[filter_ptr]], x5\n"
"saddw v3.8h, v9.8h, v3.8b\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v5.8b}, [%[filter_ptr]], x5\n"
"saddw v4.8h, v9.8h, v4.8b\n"
"sshll v4.8h, v4.8b, #0\n"
"ld1 {v6.8b}, [%[filter_ptr]], x5\n"
"saddw v5.8h, v9.8h, v5.8b\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v7.8b}, [%[filter_ptr]], x5\n"
"saddw v6.8h, v9.8h, v6.8b\n"
"sshll v6.8h, v6.8b, #0\n"
"ld1 {v8.8b}, [%[filter_ptr]]\n"
"saddw v7.8h, v9.8h, v7.8b\n"
"saddw v8.8h, v9.8h, v8.8b\n"
"sshll v7.8h, v7.8b, #0\n"
"sshll v8.8h, v8.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
@ -1945,6 +1939,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr,
const DepthwiseConvParams* params_ptr) {
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
asm volatile(
@ -1964,14 +1959,12 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w9\n"
"ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.16b, w10\n"
"dup v25.8h, w9\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"saddw v0.8h, v25.8h, v0.8b\n"
"sshll v0.8h, v0.8b, #0\n"
// Loads output_multiplier & output_shift.
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2003,7 +1996,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"st1 {v16.8b}, [%[output_ptr]], #8\n"
"saddw v8.8h, v26.8h, v8.8b\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"saddw v0.8h, v25.8h, v0.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v10.4s}, [%[output_shift_ptr]], #16\n"
@ -2041,7 +2034,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
// Clobbers.
"cc", "memory",
// We use these NEON registers.
"v0", "v6", "v7", "v8", "v10", "v11", "v16", "v17", "v18", "v19", "v25",
"v0", "v6", "v7", "v8", "v10", "v11", "v16", "v17", "v18", "v19",
"v26", "v28", "v30", "v31",
// We use these general-purpose registers.
"x9", "x10", "x11");
@ -2058,6 +2051,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr,
const DepthwiseConvParams* params_ptr) {
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
asm volatile(
@ -2097,9 +2091,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.16b, w6\n"
"ldr w6, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.16b, w7\n"
"dup v25.8h, w6\n"
// Loads output_multiplier & output_shift.
"ld1 {v4.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2115,10 +2107,10 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v0.8h, v25.8h, v0.8b\n"
"saddw v1.8h, v25.8h, v1.8b\n"
"saddw v2.8h, v25.8h, v2.8b\n"
"saddw v3.8h, v25.8h, v3.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
@ -2160,10 +2152,10 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v0.8h, v25.8h, v0.8b\n"
"saddw v1.8h, v25.8h, v1.8b\n"
"saddw v2.8h, v25.8h, v2.8b\n"
"saddw v3.8h, v25.8h, v3.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v4.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v6.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v5.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2207,7 +2199,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"cc", "memory",
// We use these NEON registers.
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v16", "v17","v18", "v19", "v25", "v26", "v28", "v30", "v31",
"v11", "v16", "v17","v18", "v19", "v26", "v28", "v30", "v31",
// We use these general-purpose registers.
"x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
@ -2223,6 +2215,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr,
const DepthwiseConvParams* params_ptr) {
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
asm volatile(
@ -2268,9 +2261,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.8b, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.8b, w13\n"
"dup v25.8h, w12\n"
// Loads output_multiplier & output_shift.
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2288,12 +2279,12 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v0.8h, v25.8h, v0.8b\n"
"saddw v1.8h, v25.8h, v1.8b\n"
"saddw v2.8h, v25.8h, v2.8b\n"
"saddw v3.8h, v25.8h, v3.8b\n"
"saddw v4.8h, v25.8h, v4.8b\n"
"saddw v5.8h, v25.8h, v5.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
@ -2351,14 +2342,14 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v0.8h, v25.8h, v0.8b\n"
"saddw v1.8h, v25.8h, v1.8b\n"
"saddw v2.8h, v25.8h, v2.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"saddw v3.8h, v25.8h, v3.8b\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"saddw v4.8h, v25.8h, v4.8b\n"
"saddw v5.8h, v25.8h, v5.8b\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2405,7 +2396,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"cc", "memory",
// We use these NEON registers.
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v25",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v26", "v28", "v30", "v31",
// We use these general-purpose registers.
"x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
@ -2421,6 +2412,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr,
const DepthwiseConvParams* params_ptr) {
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
asm volatile(
@ -2468,9 +2460,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
"dup v30.8b, w12\n"
"ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
"dup v31.8b, w13\n"
"dup v25.8h, w12\n"
// Loads output_multiplier & output_shift.
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2488,12 +2478,12 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v0.8h, v25.8h, v0.8b\n"
"saddw v1.8h, v25.8h, v1.8b\n"
"saddw v2.8h, v25.8h, v2.8b\n"
"saddw v3.8h, v25.8h, v3.8b\n"
"saddw v4.8h, v25.8h, v4.8b\n"
"saddw v5.8h, v25.8h, v5.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"sshll v3.8h, v3.8b, #0\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
@ -2553,14 +2543,14 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v0.8h, v25.8h, v0.8b\n"
"saddw v1.8h, v25.8h, v1.8b\n"
"saddw v2.8h, v25.8h, v2.8b\n"
"sshll v0.8h, v0.8b, #0\n"
"sshll v1.8h, v1.8b, #0\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v16.4s}, [%[bias_ptr]], #16\n"
"saddw v3.8h, v25.8h, v3.8b\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v17.4s}, [%[bias_ptr]], #16\n"
"saddw v4.8h, v25.8h, v4.8b\n"
"saddw v5.8h, v25.8h, v5.8b\n"
"sshll v4.8h, v4.8b, #0\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
"ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
"ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2608,7 +2598,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
"cc", "memory",
// We use these NEON registers.
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v25",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v26", "v28", "v30", "v31",
// We use these general-purpose registers.
"x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
@ -2623,7 +2613,6 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
#undef OFFSET_OUTPUT_ROW_SIZE
#undef OFFSET_INPUT_OFFSET
#undef OFFSET_OUTPUT_OFFSET
#undef OFFSET_FILTER_OFFSET
#undef OFFSET_OUTPUT_MULTIPLIER
#undef OFFSET_OUTPUT_ACTIVATION_MIN
#undef OFFSET_OUTPUT_ACTIVATION_MAX