Handle per-channel for stride == 2 && non-padding case.

PiperOrigin-RevId: 249852734
2019-05-24 10:10:13 -07:00 · 2019-05-24 10:10:13 -07:00 · 36147a36e5
commit 36147a36e5
parent 5735726ed6
3 changed files with 130 additions and 113 deletions
--- a/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
@ -126,28 +126,16 @@ void PickReasonableMultiplier(
                       bias_shape_inference, bias_data, output_shape_inference,
                       &output_multiplier);

-  bool should_use_per_channel = true;
-
-  // TODO(b/132879305): Support stride == 2 per-channel case.
-  if (params.stride_width == 2 || params.stride_height == 2) {
-    should_use_per_channel = false;
-  }
-
  int base_multiplier;
  int base_shift;
  QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
  for (int i = 0; i < output_depth; ++i) {
-    if (should_use_per_channel) {
-      // multipliers typically range in [2^30 ; 2^31 - 1].
-      // Values in [0, 2^30 - 1] are normally unused, but harmless.
-      // Thus a good way to randomize multipliers is to subtract from them
-      // a random value smaller than 2^30 but still significant compared to it.
-      output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
-      output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
-    } else {
-      output_multiplier_ptr[i] = base_multiplier;
-      output_shift_ptr[i] = base_shift;
-    }
+    // multipliers typically range in [2^30 ; 2^31 - 1].
+    // Values in [0, 2^30 - 1] are normally unused, but harmless.
+    // Thus a good way to randomize multipliers is to subtract from them
+    // a random value smaller than 2^30 but still significant compared to it.
+    output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
+    output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
  }
 }

--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h
@ -411,15 +411,6 @@ inline bool Fast3x3FilterKernelSupported(
  }

  if (quantization_type == QuantizationType::kPerChannelInt8) {
-    // TODO(b/132879305): Support stride == 2 per-channel case.
-    if (stride_height == 2 || stride_width == 2) {
-      for (int i = 0; i < output_depth; ++i) {
-        if (output_shift_ptr[i] != output_shift_ptr[0]) {
-          return false;
-        }
-      }
-    }
-
    for (int i = 0; i < output_depth; ++i) {
      if (output_shift_ptr[i] > 0) {
        return false;
--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h
@ -1031,28 +1031,44 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
        // interleaved with arithmetic operations to take advantage of
        // dual-issue pipelines. We also add input offsets as far from the loads
        // as possible to give loads enough cycles to fetch data from memory.
+        //
+        // This logic is copied and modified from the non-per-channel quantized
+        // part.
+        // The register planning here is really tricky:
+        // v0-v29 are all used at least once for either filter/input/output,
+        // some of them are used for output shift and output mulitplier, or
+        // input/output offset.
+        // Only v30 & v31 are only used for output activation min/max.
+        // For per-channel case, we need 4 registers to hold output shift &
+        // output multiplier. However, given the reality, we simply cannot do
+        // that without reloading.
+        //
+        // So here's the plan:
+        // We hold output_multiplier in v30 & v31, and we will load output_shift
+        // into two consecutive registers each time before use.
+        // We will duplicate output min & max before needed.
+        // Sometimes we may borrow registers from input offset or bias, we will
+        // dup them back after use.
+        //

        // Set "constant" registers. These registers may be replaced with temp
        // values from time to time when there are not enough NEON registers.
        // We use x9--x15 general purpose registers as they are caller-saved
        // temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf).  // NOLINT
-        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
        "ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
        "cmp %w[output_window_height], #2\n"
        "dup v28.8h, w0\n"
-        "ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
-        "dup v26.4s, w9\n"
        "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
-        "dup v27.4s, w1\n"
        "ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
        "dup v29.8h, w2\n"
        "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
-        "dup v30.16b, w3\n"
        "ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
-        "dup v31.16b, w4\n"
        "ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
        "ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"

+        // Deal with output multiplier.
+        "ld1 {v30.4s, v31.4s}, [%[output_multiplier_ptr]]\n"
+
        // Load filters and add offsets.
        "add x10, %[bias_ptr], #16\n"
        "ld1 {v0.8b}, [%[filter_ptr]], x5\n"
@ -1207,14 +1223,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
            "add x13, x12, %[input_row_size]\n"
            "add x15, x13, %[input_row_size]\n"

-            "dup v28.4s, w9\n"
-            "sqrdmulh v21.4s, v21.4s, v27.4s\n"
-            "sqrdmulh v22.4s, v22.4s, v27.4s\n"
-            "sqrdmulh v23.4s, v23.4s, v27.4s\n"
-            "sqrdmulh v24.4s, v24.4s, v27.4s\n"
-            "sqrshl v21.4s, v21.4s, v28.4s\n"
+            "ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
+            "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+            "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+            "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+            "sqrdmulh v24.4s, v24.4s, v31.4s\n"
+            "sqrshl v21.4s, v21.4s, v27.4s\n"
            "sqrshl v22.4s, v22.4s, v28.4s\n"
-            "sqrshl v23.4s, v23.4s, v28.4s\n"
+            "sqrshl v23.4s, v23.4s, v27.4s\n"
            "sqrshl v24.4s, v24.4s, v28.4s\n"
            "dup v28.8h, w0\n"
            "sqxtn v21.4h, v21.4s\n"
@ -1225,10 +1241,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
            "sqadd v23.8h, v23.8h, v29.8h\n"
            "sqxtn v21.8b, v21.8h\n"
            "sqxtn2 v21.16b, v23.8h\n"
+            "dup v27.16b, w3\n"
+            "dup v29.16b, w4\n"
            "ld1 {v22.4s}, [x10]\n"
-            "smax v21.16b, v21.16b, v30.16b\n"
-            "smin v21.16b, v21.16b, v31.16b\n"
+            "smax v21.16b, v21.16b, v27.16b\n"
+            "smin v21.16b, v21.16b, v29.16b\n"
            "ld1 {v24.4s}, [x10]\n"
+            "dup v29.8h, w2\n"
            "saddw v9.8h, v28.8h, v9.8b\n"
            "st1 {v21.8b}, [x6], x5\n"
            "saddw v10.8h, v28.8h, v10.8b\n"
@ -1276,14 +1295,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
            "smlal v25.4s, v5.4h, v18.4h\n"
            "smlal2 v26.4s, v5.8h, v18.8h\n"

-            "dup v28.4s, w9\n"
-            "sqrdmulh v19.4s, v19.4s, v27.4s\n"
-            "sqrdmulh v20.4s, v20.4s, v27.4s\n"
-            "sqrdmulh v25.4s, v25.4s, v27.4s\n"
-            "sqrdmulh v26.4s, v26.4s, v27.4s\n"
-            "sqrshl v19.4s, v19.4s, v28.4s\n"
+            "ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
+            "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+            "sqrdmulh v20.4s, v20.4s, v31.4s\n"
+            "sqrdmulh v25.4s, v25.4s, v30.4s\n"
+            "sqrdmulh v26.4s, v26.4s, v31.4s\n"
+            "sqrshl v19.4s, v19.4s, v27.4s\n"
            "sqrshl v20.4s, v20.4s, v28.4s\n"
-            "sqrshl v25.4s, v25.4s, v28.4s\n"
+            "sqrshl v25.4s, v25.4s, v27.4s\n"
            "sqrshl v26.4s, v26.4s, v28.4s\n"
            "dup v28.8h, w0\n"
            "sqxtn v19.4h, v19.4s\n"
@ -1294,10 +1313,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
            "sqadd v25.8h, v25.8h, v29.8h\n"
            "sqxtn v19.8b, v19.8h\n"
            "sqxtn2 v19.16b, v25.8h\n"
+            "dup v27.16b, w3\n"
+            "dup v29.16b, w4\n"
            "ld1 {v20.4s}, [x10]\n"
-            "smax v19.16b, v19.16b, v30.16b\n"
-            "smin v19.16b, v19.16b, v31.16b\n"
+            "smax v19.16b, v19.16b, v27.16b\n"
+            "smin v19.16b, v19.16b, v29.16b\n"
            "ld1 {v26.4s}, [x10]\n"
+            "dup v29.8h, w2\n"
            "saddw v9.8h, v28.8h, v9.8b\n"
            "st1 {v19.8b}, [x7], x5\n"
            "saddw v10.8h, v28.8h, v10.8b\n"
@ -1399,14 +1421,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
          "smlal2 v26.4s, v2.8h, v13.8h\n"
          "ld1 {v13.8b}, [x13]\n"

-          "dup v28.4s, w9\n"
-          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
-          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
-          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
-          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
-          "sqrshl v21.4s, v21.4s, v28.4s\n"
+          "ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+          "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v31.4s\n"
+          "sqrshl v21.4s, v21.4s, v27.4s\n"
          "sqrshl v22.4s, v22.4s, v28.4s\n"
-          "sqrshl v23.4s, v23.4s, v28.4s\n"
+          "sqrshl v23.4s, v23.4s, v27.4s\n"
          "sqrshl v24.4s, v24.4s, v28.4s\n"
          "dup v28.8h, w0\n"
          "sqxtn v21.4h, v21.4s\n"
@ -1417,10 +1439,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
          "sqadd v23.8h, v23.8h, v29.8h\n"
          "sqxtn v21.8b, v21.8h\n"
          "sqxtn2 v21.16b, v23.8h\n"
+          "dup v27.16b, w3\n"
+          "dup v29.16b, w4\n"
          "ld1 {v22.4s}, [x10]\n"
-          "smax v21.16b, v21.16b, v30.16b\n"
-          "smin v21.16b, v21.16b, v31.16b\n"
+          "smax v21.16b, v21.16b, v27.16b\n"
+          "smin v21.16b, v21.16b, v29.16b\n"
          "ld1 {v24.4s}, [x10]\n"
+          "dup v29.8h, w2\n"
          "saddw v9.8h, v28.8h, v9.8b\n"
          "st1 {v21.8b}, [x6], x5\n"
          "saddw v10.8h, v28.8h, v10.8b\n"
@ -1460,14 +1485,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
          "smlal v25.4s, v5.4h, v18.4h\n"
          "smlal2 v26.4s, v5.8h, v18.8h\n"

-          "dup v28.4s, w9\n"
-          "sqrdmulh v19.4s, v19.4s, v27.4s\n"
-          "sqrdmulh v20.4s, v20.4s, v27.4s\n"
-          "sqrdmulh v25.4s, v25.4s, v27.4s\n"
-          "sqrdmulh v26.4s, v26.4s, v27.4s\n"
-          "sqrshl v19.4s, v19.4s, v28.4s\n"
+          "ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+          "sqrdmulh v20.4s, v20.4s, v31.4s\n"
+          "sqrdmulh v25.4s, v25.4s, v30.4s\n"
+          "sqrdmulh v26.4s, v26.4s, v31.4s\n"
+          "sqrshl v19.4s, v19.4s, v27.4s\n"
          "sqrshl v20.4s, v20.4s, v28.4s\n"
-          "sqrshl v25.4s, v25.4s, v28.4s\n"
+          "sqrshl v25.4s, v25.4s, v27.4s\n"
          "sqrshl v26.4s, v26.4s, v28.4s\n"
          "dup v28.8h, w0\n"
          "sqxtn v19.4h, v19.4s\n"
@ -1476,11 +1501,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
          "sqxtn2 v25.8h, v26.4s\n"
          "sqadd v19.8h, v19.8h, v29.8h\n"
          "sqadd v25.8h, v25.8h, v29.8h\n"
+          "dup v27.16b, w3\n"
+          "dup v29.16b, w4\n"
          "sqxtn v19.8b, v19.8h\n"
          "sqxtn2 v19.16b, v25.8h\n"
-          "smax v19.16b, v19.16b, v30.16b\n"
-          "smin v19.16b, v19.16b, v31.16b\n"
+          "smax v19.16b, v19.16b, v27.16b\n"
+          "smin v19.16b, v19.16b, v29.16b\n"
          "st1 {v19.8b}, [x7], x5\n"
+          "dup v29.8h, w2\n"
          "mov v25.d[0], v19.d[1]\n"
          "st1 {v25.8b}, [x7]\n"
          "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
@ -1533,17 +1561,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
          "smlal v23.4s, v2.4h, v17.4h\n"
          "smlal2 v24.4s, v2.8h, v17.8h\n"

-          "dup v26.4s, w9\n"
-          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
-          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+          "ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v31.4s\n"
          "sqrshl v21.4s, v21.4s, v26.4s\n"
-          "sqrshl v22.4s, v22.4s, v26.4s\n"
+          "sqrshl v22.4s, v22.4s, v27.4s\n"
          "sqxtn v21.4h, v21.4s\n"
          "sqxtn2 v21.8h, v22.4s\n"
+          "dup v26.16b, w3\n"
+          "dup v27.16b, w4\n"
          "sqadd v21.8h, v21.8h, v29.8h\n"
          "sqxtn v21.8b, v21.8h\n"
-          "smax v21.8b, v21.8b, v30.8b\n"
-          "smin v21.8b, v21.8b, v31.8b\n"
+          "smax v21.8b, v21.8b, v26.8b\n"
+          "smin v21.8b, v21.8b, v27.8b\n"
          "saddw v9.8h, v28.8h, v9.8b\n"
          "st1 {v21.8b}, [x6]\n"
          "saddw v10.8h, v28.8h, v10.8b\n"
@ -1566,16 +1596,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
          "smlal v23.4s, v8.4h, v16.4h\n"
          "smlal2 v24.4s, v8.8h, v16.8h\n"

-          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
-          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+          "ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v31.4s\n"
          "sqrshl v23.4s, v23.4s, v26.4s\n"
-          "sqrshl v24.4s, v24.4s, v26.4s\n"
+          "sqrshl v24.4s, v24.4s, v27.4s\n"
          "sqxtn v23.4h, v23.4s\n"
          "sqxtn2 v23.8h, v24.4s\n"
+          "dup v26.16b, w3\n"
+          "dup v27.16b, w4\n"
          "sqadd v23.8h, v23.8h, v29.8h\n"
          "sqxtn v23.8b, v23.8h\n"
-          "smax v23.8b, v23.8b, v30.8b\n"
-          "smin v23.8b, v23.8b, v31.8b\n"
+          "smax v23.8b, v23.8b, v26.8b\n"
+          "smin v23.8b, v23.8b, v27.8b\n"
          "st1 {v23.8b}, [x7]\n"

          DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
@ -1696,17 +1729,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
          "smlal v26.4s, v8.4h, v23.4h\n"
          "smlal2 v27.4s, v8.8h, v23.8h\n"

-          "dup v28.4s, w1\n"
-          "dup v29.4s, w9\n"
-          "sqrdmulh v24.4s, v24.4s, v28.4s\n"
-          "sqrdmulh v25.4s, v25.4s, v28.4s\n"
-          "sqrdmulh v26.4s, v26.4s, v28.4s\n"
-          "sqrdmulh v27.4s, v27.4s, v28.4s\n"
-          "dup v28.8h, w2\n"
-          "sqrshl v24.4s, v24.4s, v29.4s\n"
+          "ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+          "sqrdmulh v25.4s, v25.4s, v31.4s\n"
+          "sqrdmulh v26.4s, v26.4s, v30.4s\n"
+          "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+          "sqrshl v24.4s, v24.4s, v28.4s\n"
          "sqrshl v25.4s, v25.4s, v29.4s\n"
-          "sqrshl v26.4s, v26.4s, v29.4s\n"
+          "sqrshl v26.4s, v26.4s, v28.4s\n"
          "sqrshl v27.4s, v27.4s, v29.4s\n"
+          "dup v28.8h, w2\n"
          "sqxtn v24.4h, v24.4s\n"
          "sqxtn2 v24.8h, v25.4s\n"
          "sqxtn v26.4h, v26.4s\n"
@ -1716,12 +1748,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
          "sqxtn v24.8b, v24.8h\n"
          "sqxtn2 v24.16b, v26.8h\n"
          "dup v28.8h, w0\n"
+          "dup v27.16b, w3\n"
+          "dup v29.16b, w4\n"
          "ld1 {v25.4s}, [x10]\n"
-          "smax v24.16b, v24.16b, v30.16b\n"
-          "smin v24.16b, v24.16b, v31.16b\n"
-          "ld1 {v27.4s}, [x10]\n"
+          "smax v24.16b, v24.16b, v27.16b\n"
+          "smin v24.16b, v24.16b, v29.16b\n"
          "saddw v9.8h, v28.8h, v9.8b\n"
          "st1 {v24.8b}, [x6], x5\n"
+          "ld1 {v27.4s}, [x10]\n"
          "saddw v10.8h, v28.8h, v10.8b\n"
          "mov v26.d[0], v24.d[1]\n"
          "st1 {v26.8b}, [x6], x5\n"
@ -1794,17 +1828,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
        "smlal v26.4s, v8.4h, v23.4h\n"
        "smlal2 v27.4s, v8.8h, v23.8h\n"

-        "dup v28.4s, w1\n"
-        "dup v29.4s, w9\n"
-        "sqrdmulh v24.4s, v24.4s, v28.4s\n"
-        "sqrdmulh v25.4s, v25.4s, v28.4s\n"
-        "sqrdmulh v26.4s, v26.4s, v28.4s\n"
-        "sqrdmulh v27.4s, v27.4s, v28.4s\n"
-        "dup v28.8h, w2\n"
-        "sqrshl v24.4s, v24.4s, v29.4s\n"
+        "ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
+        "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+        "sqrdmulh v25.4s, v25.4s, v31.4s\n"
+        "sqrdmulh v26.4s, v26.4s, v30.4s\n"
+        "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+        "sqrshl v24.4s, v24.4s, v28.4s\n"
        "sqrshl v25.4s, v25.4s, v29.4s\n"
-        "sqrshl v26.4s, v26.4s, v29.4s\n"
+        "sqrshl v26.4s, v26.4s, v28.4s\n"
        "sqrshl v27.4s, v27.4s, v29.4s\n"
+        "dup v28.8h, w2\n"
        "sqxtn v24.4h, v24.4s\n"
        "sqxtn2 v24.8h, v25.4s\n"
        "sqxtn v26.4h, v26.4s\n"
@ -1812,19 +1845,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
        "sqadd v24.8h, v24.8h, v28.8h\n"
        "sqadd v26.8h, v26.8h, v28.8h\n"
        "sqxtn v24.8b, v24.8h\n"
+        "dup v28.16b, w3\n"
+        "dup v29.16b, w4\n"
        "sqxtn2 v24.16b, v26.8h\n"
-        "dup v28.8h, w0\n"
-        "smax v24.16b, v24.16b, v30.16b\n"
-        "smin v24.16b, v24.16b, v31.16b\n"
+        "smax v24.16b, v24.16b, v28.16b\n"
+        "smin v24.16b, v24.16b, v29.16b\n"
        "st1 {v24.8b}, [x6], x5\n"
        "mov v26.d[0], v24.d[1]\n"
        "st1 {v26.8b}, [x6]\n"
+        "dup v28.8h, w0\n"
        "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"

        // Handle bottom right output if exists.
        DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
-        "dup v26.4s, w9\n"
-        "dup v27.4s, w1\n"
        "dup v29.8h, w2\n"

        "smlal v24.4s, v0.4h, v9.4h\n"
@ -1846,16 +1879,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
        "smlal v24.4s, v8.4h, v17.4h\n"
        "smlal2 v25.4s, v8.8h, v17.8h\n"

-        "sqrdmulh v24.4s, v24.4s, v27.4s\n"
-        "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+        "ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
+        "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+        "sqrdmulh v25.4s, v25.4s, v31.4s\n"
        "sqrshl v24.4s, v24.4s, v26.4s\n"
-        "sqrshl v25.4s, v25.4s, v26.4s\n"
+        "sqrshl v25.4s, v25.4s, v27.4s\n"
        "sqxtn v24.4h, v24.4s\n"
        "sqxtn2 v24.8h, v25.4s\n"
+        "dup v26.16b, w3\n"
+        "dup v27.16b, w4\n"
        "sqadd v24.8h, v24.8h, v29.8h\n"
        "sqxtn v24.8b, v24.8h\n"
-        "smax v24.8b, v24.8b, v30.8b\n"
-        "smin v24.8b, v24.8b, v31.8b\n"
+        "smax v24.8b, v24.8b, v26.8b\n"
+        "smin v24.8b, v24.8b, v27.8b\n"
        "st1 {v24.8b}, [x6]\n"

        DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
@ -1866,6 +1902,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
    [output_window_height] "+r"(output_window_height)
    :
    // Inputs.
+    [output_multiplier_ptr] "r"(output_multiplier_ptr),
+    [output_shift_ptr] "r"(output_shift_ptr),
    [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
    [input_depth] "r"(input_depth),
    [output_window_width] "r"(output_window_width),
@ -1882,8 +1920,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
    "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
    "v30", "v31",
    // We use these general-purpose registers.
-    "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
-    "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+    "x0", "x2", "x3", "x4", "x5", "x6", "x7",
+    "x10", "x11", "x12", "x13", "x14", "x15",
    "x19", "x20");
 #undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
 #undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP