Change the depthwise_conv 3x3 filter offset-add logic: the per-channel case always uses symmetric quantization, so the filter offset is always zero and the filter values only need to be sign-extended.

PiperOrigin-RevId: 250415602
2019-05-28 20:36:28 -07:00 · 2019-05-28 20:36:28 -07:00 · ee65ff1690
commit ee65ff1690
parent a0be4ae064
2 changed files with 63 additions and 73 deletions
--- a/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
@ -280,6 +280,7 @@ void TryTestOneDepthwiseConv3x3Filter() {
  params.depth_multiplier = depth_multiplier;
  params.input_offset = input_offset;
  params.output_offset = output_offset;
+  params.weights_offset = 0;
  params.quantized_activation_min = output_activation_min;
  params.quantized_activation_max = output_activation_max;

--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h
@ -46,7 +46,6 @@ namespace depthwise_conv {
 #define OFFSET_FILTER_ROW_SIZE 32
 #define OFFSET_INPUT_OFFSET 40
 #define OFFSET_OUTPUT_OFFSET 44
-#define OFFSET_FILTER_OFFSET 48
 #define OFFSET_OUTPUT_MULTIPLIER 52
 #define OFFSET_OUTPUT_ACTIVATION_MIN 56
 #define OFFSET_OUTPUT_ACTIVATION_MAX 60
@ -78,9 +77,6 @@ static_assert(offsetof(DepthwiseConvParams, input_offset) ==
 static_assert(offsetof(DepthwiseConvParams, output_offset) ==
                  OFFSET_OUTPUT_OFFSET,
              "");
-static_assert(offsetof(DepthwiseConvParams, filter_offset) ==
-                  OFFSET_FILTER_OFFSET,
-              "");
 static_assert(offsetof(DepthwiseConvParams, output_multiplier) ==
                  OFFSET_OUTPUT_MULTIPLIER,
              "");
@ -131,6 +127,7 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
    const int64_t input_width_increment = 2 * input_depth;
    const int64_t input_height_increment = 2 * input_row_size;
    const int64_t output_height_increment = 2 * params_ptr->output_row_size;
+    TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);

 #define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
 #define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
@ -208,10 +205,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
        "dup v29.8h, w2\n"
        "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
        "ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
-        "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
        "add x10, %[bias_ptr], #16\n"
        "ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
-        "dup v9.8h, w9\n"
        "dup v25.16b, w4\n"

        // Deal with output multiplier & output shift.
@ -221,22 +216,22 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
        // Load filters and add offsets.
        "ld1 {v0.8b}, [%[filter_ptr]], x3\n"
        "ld1 {v1.8b}, [%[filter_ptr]], x3\n"
-        "saddw v0.8h, v9.8h, v0.8b\n"
+        "sshll v0.8h, v0.8b, #0\n"
        "ld1 {v2.8b}, [%[filter_ptr]], x3\n"
-        "saddw v1.8h, v9.8h, v1.8b\n"
+        "sshll v1.8h, v1.8b, #0\n"
        "ld1 {v3.8b}, [%[filter_ptr]], x3\n"
-        "saddw v2.8h, v9.8h, v2.8b\n"
+        "sshll v2.8h, v2.8b, #0\n"
        "ld1 {v4.8b}, [%[filter_ptr]], x3\n"
-        "saddw v3.8h, v9.8h, v3.8b\n"
+        "sshll v3.8h, v3.8b, #0\n"
        "ld1 {v5.8b}, [%[filter_ptr]], x3\n"
-        "saddw v4.8h, v9.8h, v4.8b\n"
+        "sshll v4.8h, v4.8b, #0\n"
        "ld1 {v6.8b}, [%[filter_ptr]], x3\n"
-        "saddw v5.8h, v9.8h, v5.8b\n"
+        "sshll v5.8h, v5.8b, #0\n"
        "ld1 {v7.8b}, [%[filter_ptr]], x3\n"
-        "saddw v6.8h, v9.8h, v6.8b\n"
+        "sshll v6.8h, v6.8b, #0\n"
        "ld1 {v8.8b}, [%[filter_ptr]], x3\n"
-        "saddw v7.8h, v9.8h, v7.8b\n"
-        "saddw v8.8h, v9.8h, v8.8b\n"
+        "sshll v7.8h, v7.8b, #0\n"
+        "sshll v8.8h, v8.8b, #0\n"

        "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"

@ -987,6 +982,7 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
    const int64_t input_width_increment = 4 * input_depth;
    const int64_t input_height_increment = 4 * input_row_size;
    const int64_t output_height_increment = 2 * params_ptr->output_row_size;
+    TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);

 #define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
 #define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
@ -1064,7 +1060,6 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
        "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
        "ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
        "ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
-        "ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"

        // Deal with output multiplier.
        "ld1 {v30.4s, v31.4s}, [%[output_multiplier_ptr]]\n"
@ -1072,24 +1067,23 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
        // Load filters and add offsets.
        "add x10, %[bias_ptr], #16\n"
        "ld1 {v0.8b}, [%[filter_ptr]], x5\n"
-        "dup v9.8h, w20\n"
        "ld1 {v1.8b}, [%[filter_ptr]], x5\n"
-        "saddw v0.8h, v9.8h, v0.8b\n"
+        "sshll v0.8h, v0.8b, #0\n"
        "ld1 {v2.8b}, [%[filter_ptr]], x5\n"
-        "saddw v1.8h, v9.8h, v1.8b\n"
+        "sshll v1.8h, v1.8b, #0\n"
        "ld1 {v3.8b}, [%[filter_ptr]], x5\n"
-        "saddw v2.8h, v9.8h, v2.8b\n"
+        "sshll v2.8h, v2.8b, #0\n"
        "ld1 {v4.8b}, [%[filter_ptr]], x5\n"
-        "saddw v3.8h, v9.8h, v3.8b\n"
+        "sshll v3.8h, v3.8b, #0\n"
        "ld1 {v5.8b}, [%[filter_ptr]], x5\n"
-        "saddw v4.8h, v9.8h, v4.8b\n"
+        "sshll v4.8h, v4.8b, #0\n"
        "ld1 {v6.8b}, [%[filter_ptr]], x5\n"
-        "saddw v5.8h, v9.8h, v5.8b\n"
+        "sshll v5.8h, v5.8b, #0\n"
        "ld1 {v7.8b}, [%[filter_ptr]], x5\n"
-        "saddw v6.8h, v9.8h, v6.8b\n"
+        "sshll v6.8h, v6.8b, #0\n"
        "ld1 {v8.8b}, [%[filter_ptr]]\n"
-        "saddw v7.8h, v9.8h, v7.8b\n"
-        "saddw v8.8h, v9.8h, v8.8b\n"
+        "sshll v7.8h, v7.8b, #0\n"
+        "sshll v8.8h, v8.8b, #0\n"

        "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"

@ -1945,6 +1939,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
                         const int8* filter_ptr, const int32* bias_ptr,
                         int8* output_ptr,
                         const DepthwiseConvParams* params_ptr) {
+    TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
 #define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
 #define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
    asm volatile(
@ -1964,14 +1959,12 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
        "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
        "dup v30.16b, w9\n"
-        "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
        "dup v31.16b, w10\n"
-        "dup v25.8h, w9\n"

        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
        "saddw v8.8h, v26.8h, v8.8b\n"
        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
-        "saddw v0.8h, v25.8h, v0.8b\n"
+        "sshll v0.8h, v0.8b, #0\n"

        // Loads output_multiplier & output_shift.
        "ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2003,7 +1996,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
          "st1 {v16.8b}, [%[output_ptr]], #8\n"
          "saddw v8.8h, v26.8h, v8.8b\n"
          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
-          "saddw v0.8h, v25.8h, v0.8b\n"
+          "sshll v0.8h, v0.8b, #0\n"
          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
          "ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
          "ld1 {v10.4s}, [%[output_shift_ptr]], #16\n"
@ -2041,7 +2034,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        // Clobbers.
        "cc", "memory",
        // We use these NEON registers.
-        "v0", "v6", "v7", "v8", "v10", "v11", "v16", "v17", "v18", "v19", "v25",
+        "v0", "v6", "v7", "v8", "v10", "v11", "v16", "v17", "v18", "v19",
        "v26", "v28", "v30", "v31",
        // We use these general-purpose registers.
        "x9", "x10", "x11");
@ -2058,6 +2051,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
                         const int8* filter_ptr, const int32* bias_ptr,
                         int8* output_ptr,
                         const DepthwiseConvParams* params_ptr) {
+    TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
 #define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
 #define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
    asm volatile(
@ -2097,9 +2091,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
        "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
        "dup v30.16b, w6\n"
-        "ldr w6, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
        "dup v31.16b, w7\n"
-        "dup v25.8h, w6\n"

        // Loads output_multiplier & output_shift.
        "ld1 {v4.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2115,10 +2107,10 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "saddw v10.8h, v26.8h, v10.8b\n"
        "saddw v11.8h, v26.8h, v11.8b\n"

-        "saddw v0.8h, v25.8h, v0.8b\n"
-        "saddw v1.8h, v25.8h, v1.8b\n"
-        "saddw v2.8h, v25.8h, v2.8b\n"
-        "saddw v3.8h, v25.8h, v3.8b\n"
+        "sshll v0.8h, v0.8b, #0\n"
+        "sshll v1.8h, v1.8b, #0\n"
+        "sshll v2.8h, v2.8b, #0\n"
+        "sshll v3.8h, v3.8b, #0\n"

        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"

@ -2160,10 +2152,10 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
          "saddw v10.8h, v26.8h, v10.8b\n"
          "saddw v11.8h, v26.8h, v11.8b\n"
-          "saddw v0.8h, v25.8h, v0.8b\n"
-          "saddw v1.8h, v25.8h, v1.8b\n"
-          "saddw v2.8h, v25.8h, v2.8b\n"
-          "saddw v3.8h, v25.8h, v3.8b\n"
+          "sshll v0.8h, v0.8b, #0\n"
+          "sshll v1.8h, v1.8b, #0\n"
+          "sshll v2.8h, v2.8b, #0\n"
+          "sshll v3.8h, v3.8b, #0\n"
          "ld1 {v4.4s}, [%[output_multiplier_ptr]], #16\n"
          "ld1 {v6.4s}, [%[output_shift_ptr]], #16\n"
          "ld1 {v5.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2207,7 +2199,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "cc", "memory",
        // We use these NEON registers.
        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
-        "v11", "v16", "v17","v18", "v19", "v25", "v26", "v28", "v30", "v31",
+        "v11", "v16", "v17","v18", "v19", "v26", "v28", "v30", "v31",
        // We use these general-purpose registers.
        "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
 #undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
@ -2223,6 +2215,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
                         const int8* filter_ptr, const int32* bias_ptr,
                         int8* output_ptr,
                         const DepthwiseConvParams* params_ptr) {
+    TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
 #define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
 #define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
    asm volatile(
@ -2268,9 +2261,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
        "dup v30.8b, w12\n"
-        "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
        "dup v31.8b, w13\n"
-        "dup v25.8h, w12\n"

        // Loads output_multiplier & output_shift.
        "ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2288,12 +2279,12 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "saddw v12.8h, v26.8h, v12.8b\n"
        "saddw v13.8h, v26.8h, v13.8b\n"

-        "saddw v0.8h, v25.8h, v0.8b\n"
-        "saddw v1.8h, v25.8h, v1.8b\n"
-        "saddw v2.8h, v25.8h, v2.8b\n"
-        "saddw v3.8h, v25.8h, v3.8b\n"
-        "saddw v4.8h, v25.8h, v4.8b\n"
-        "saddw v5.8h, v25.8h, v5.8b\n"
+        "sshll v0.8h, v0.8b, #0\n"
+        "sshll v1.8h, v1.8b, #0\n"
+        "sshll v2.8h, v2.8b, #0\n"
+        "sshll v3.8h, v3.8b, #0\n"
+        "sshll v4.8h, v4.8b, #0\n"
+        "sshll v5.8h, v5.8b, #0\n"

        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"

@ -2351,14 +2342,14 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
          "saddw v12.8h, v26.8h, v12.8b\n"
          "saddw v13.8h, v26.8h, v13.8b\n"

-          "saddw v0.8h, v25.8h, v0.8b\n"
-          "saddw v1.8h, v25.8h, v1.8b\n"
-          "saddw v2.8h, v25.8h, v2.8b\n"
+          "sshll v0.8h, v0.8b, #0\n"
+          "sshll v1.8h, v1.8b, #0\n"
+          "sshll v2.8h, v2.8b, #0\n"
          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
-          "saddw v3.8h, v25.8h, v3.8b\n"
+          "sshll v3.8h, v3.8b, #0\n"
          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
-          "saddw v4.8h, v25.8h, v4.8b\n"
-          "saddw v5.8h, v25.8h, v5.8b\n"
+          "sshll v4.8h, v4.8b, #0\n"
+          "sshll v5.8h, v5.8b, #0\n"
          "ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
          "ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
          "ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2405,7 +2396,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "cc", "memory",
        // We use these NEON registers.
        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
-        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v25",
+        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
        "v26", "v28", "v30", "v31",
        // We use these general-purpose registers.
        "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
@ -2421,6 +2412,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
                         const int8* filter_ptr, const int32* bias_ptr,
                         int8* output_ptr,
                         const DepthwiseConvParams* params_ptr) {
+    TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
 #define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
 #define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
    asm volatile(
@ -2468,9 +2460,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
        "dup v30.8b, w12\n"
-        "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
        "dup v31.8b, w13\n"
-        "dup v25.8h, w12\n"

        // Loads output_multiplier & output_shift.
        "ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2488,12 +2478,12 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "saddw v12.8h, v26.8h, v12.8b\n"
        "saddw v13.8h, v26.8h, v13.8b\n"

-        "saddw v0.8h, v25.8h, v0.8b\n"
-        "saddw v1.8h, v25.8h, v1.8b\n"
-        "saddw v2.8h, v25.8h, v2.8b\n"
-        "saddw v3.8h, v25.8h, v3.8b\n"
-        "saddw v4.8h, v25.8h, v4.8b\n"
-        "saddw v5.8h, v25.8h, v5.8b\n"
+        "sshll v0.8h, v0.8b, #0\n"
+        "sshll v1.8h, v1.8b, #0\n"
+        "sshll v2.8h, v2.8b, #0\n"
+        "sshll v3.8h, v3.8b, #0\n"
+        "sshll v4.8h, v4.8b, #0\n"
+        "sshll v5.8h, v5.8b, #0\n"

        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"

@ -2553,14 +2543,14 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
          "saddw v12.8h, v26.8h, v12.8b\n"
          "saddw v13.8h, v26.8h, v13.8b\n"

-          "saddw v0.8h, v25.8h, v0.8b\n"
-          "saddw v1.8h, v25.8h, v1.8b\n"
-          "saddw v2.8h, v25.8h, v2.8b\n"
+          "sshll v0.8h, v0.8b, #0\n"
+          "sshll v1.8h, v1.8b, #0\n"
+          "sshll v2.8h, v2.8b, #0\n"
          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
-          "saddw v3.8h, v25.8h, v3.8b\n"
+          "sshll v3.8h, v3.8b, #0\n"
          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
-          "saddw v4.8h, v25.8h, v4.8b\n"
-          "saddw v5.8h, v25.8h, v5.8b\n"
+          "sshll v4.8h, v4.8b, #0\n"
+          "sshll v5.8h, v5.8b, #0\n"
          "ld1 {v6.4s}, [%[output_multiplier_ptr]], #16\n"
          "ld1 {v14.4s}, [%[output_shift_ptr]], #16\n"
          "ld1 {v7.4s}, [%[output_multiplier_ptr]], #16\n"
@ -2608,7 +2598,7 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
        "cc", "memory",
        // We use these NEON registers.
        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
-        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v25",
+        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
        "v26", "v28", "v30", "v31",
        // We use these general-purpose registers.
        "x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
@ -2623,7 +2613,6 @@ struct DepthwiseConvPartialPerChannel<DepthwiseConvOutputRounding::kUpward,
 #undef OFFSET_OUTPUT_ROW_SIZE
 #undef OFFSET_INPUT_OFFSET
 #undef OFFSET_OUTPUT_OFFSET
-#undef OFFSET_FILTER_OFFSET
 #undef OFFSET_OUTPUT_MULTIPLIER
 #undef OFFSET_OUTPUT_ACTIVATION_MIN
 #undef OFFSET_OUTPUT_ACTIVATION_MAX