From 36147a36e50c0a8132231784d0e31cbb7b32d376 Mon Sep 17 00:00:00 2001
From: Renjie Liu <renjieliu@google.com>
Date: Fri, 24 May 2019 10:10:13 -0700
Subject: [PATCH] Handle per-channel for stride == 2 && non-padding case.

PiperOrigin-RevId: 249852734
---
 ...epthwiseconv_per_channel_quantized_test.cc |  24 +-
 .../depthwiseconv_3x3_filter_common.h         |   9 -
 .../integer_ops/depthwise_conv_3x3_filter.h   | 210 +++++++++++-------
 3 files changed, 130 insertions(+), 113 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
index 3699d804d67..794d9b280b0 100644
--- a/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_per_channel_quantized_test.cc
@@ -126,28 +126,16 @@ void PickReasonableMultiplier(
                        bias_shape_inference, bias_data, output_shape_inference,
                        &output_multiplier);
 
-  bool should_use_per_channel = true;
-
-  // TODO(b/132879305): Support stride == 2 per-channel case.
-  if (params.stride_width == 2 || params.stride_height == 2) {
-    should_use_per_channel = false;
-  }
-
   int base_multiplier;
   int base_shift;
   QuantizeMultiplier(output_multiplier, &base_multiplier, &base_shift);
   for (int i = 0; i < output_depth; ++i) {
-    if (should_use_per_channel) {
-      // multipliers typically range in [2^30 ; 2^31 - 1].
-      // Values in [0, 2^30 - 1] are normally unused, but harmless.
-      // Thus a good way to randomize multipliers is to subtract from them
-      // a random value smaller than 2^30 but still significant compared to it.
-      output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
-      output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
-    } else {
-      output_multiplier_ptr[i] = base_multiplier;
-      output_shift_ptr[i] = base_shift;
-    }
+    // multipliers typically range in [2^30 ; 2^31 - 1].
+    // Values in [0, 2^30 - 1] are normally unused, but harmless.
+    // Thus a good way to randomize multipliers is to subtract from them
+    // a random value smaller than 2^30 but still significant compared to it.
+    output_multiplier_ptr[i] = base_multiplier - (std::rand() % (1 << 26));
+    output_shift_ptr[i] = base_shift - 1 + (std::rand() % 4);
   }
 }
 
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h
index d6395de135f..bfa071d9a44 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h
@@ -411,15 +411,6 @@ inline bool Fast3x3FilterKernelSupported(
   }
 
   if (quantization_type == QuantizationType::kPerChannelInt8) {
-    // TODO(b/132879305): Support stride == 2 per-channel case.
-    if (stride_height == 2 || stride_width == 2) {
-      for (int i = 0; i < output_depth; ++i) {
-        if (output_shift_ptr[i] != output_shift_ptr[0]) {
-          return false;
-        }
-      }
-    }
-
     for (int i = 0; i < output_depth; ++i) {
       if (output_shift_ptr[i] > 0) {
         return false;
diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h
index ee495ddaa9d..40638454be9 100644
--- a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h
@@ -1031,28 +1031,44 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
         // interleaved with arithmetic operations to take advantage of
         // dual-issue pipelines. We also add input offsets as far from the loads
         // as possible to give loads enough cycles to fetch data from memory.
+        //
+        // This logic is copied and modified from the non-per-channel quantized
+        // part.
+        // The register planning here is really tricky:
+        // v0-v29 are all used at least once for either filter/input/output,
+        // some of them are used for output shift and output mulitplier, or
+        // input/output offset.
+        // Only v30 & v31 are only used for output activation min/max.
+        // For per-channel case, we need 4 registers to hold output shift &
+        // output multiplier. However, given the reality, we simply cannot do
+        // that without reloading.
+        //
+        // So here's the plan:
+        // We hold output_multiplier in v30 & v31, and we will load output_shift
+        // into two consecutive registers each time before use.
+        // We will duplicate output min & max before needed.
+        // Sometimes we may borrow registers from input offset or bias, we will
+        // dup them back after use.
+        //
 
         // Set "constant" registers. These registers may be replaced with temp
         // values from time to time when there are not enough NEON registers.
         // We use x9--x15 general purpose registers as they are caller-saved
         // temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf).  // NOLINT
-        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_RIGHT_SHIFT) "]\n"
         "ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
         "cmp %w[output_window_height], #2\n"
         "dup v28.8h, w0\n"
-        "ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
-        "dup v26.4s, w9\n"
         "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
-        "dup v27.4s, w1\n"
         "ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
         "dup v29.8h, w2\n"
         "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
-        "dup v30.16b, w3\n"
         "ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
-        "dup v31.16b, w4\n"
         "ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
         "ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
 
+        // Deal with output multiplier.
+        "ld1 {v30.4s, v31.4s}, [%[output_multiplier_ptr]]\n"
+
         // Load filters and add offsets.
         "add x10, %[bias_ptr], #16\n"
         "ld1 {v0.8b}, [%[filter_ptr]], x5\n"
@@ -1207,14 +1223,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
             "add x13, x12, %[input_row_size]\n"
             "add x15, x13, %[input_row_size]\n"
 
-            "dup v28.4s, w9\n"
-            "sqrdmulh v21.4s, v21.4s, v27.4s\n"
-            "sqrdmulh v22.4s, v22.4s, v27.4s\n"
-            "sqrdmulh v23.4s, v23.4s, v27.4s\n"
-            "sqrdmulh v24.4s, v24.4s, v27.4s\n"
-            "sqrshl v21.4s, v21.4s, v28.4s\n"
+            "ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
+            "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+            "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+            "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+            "sqrdmulh v24.4s, v24.4s, v31.4s\n"
+            "sqrshl v21.4s, v21.4s, v27.4s\n"
             "sqrshl v22.4s, v22.4s, v28.4s\n"
-            "sqrshl v23.4s, v23.4s, v28.4s\n"
+            "sqrshl v23.4s, v23.4s, v27.4s\n"
             "sqrshl v24.4s, v24.4s, v28.4s\n"
             "dup v28.8h, w0\n"
             "sqxtn v21.4h, v21.4s\n"
@@ -1225,10 +1241,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
             "sqadd v23.8h, v23.8h, v29.8h\n"
             "sqxtn v21.8b, v21.8h\n"
             "sqxtn2 v21.16b, v23.8h\n"
+            "dup v27.16b, w3\n"
+            "dup v29.16b, w4\n"
             "ld1 {v22.4s}, [x10]\n"
-            "smax v21.16b, v21.16b, v30.16b\n"
-            "smin v21.16b, v21.16b, v31.16b\n"
+            "smax v21.16b, v21.16b, v27.16b\n"
+            "smin v21.16b, v21.16b, v29.16b\n"
             "ld1 {v24.4s}, [x10]\n"
+            "dup v29.8h, w2\n"
             "saddw v9.8h, v28.8h, v9.8b\n"
             "st1 {v21.8b}, [x6], x5\n"
             "saddw v10.8h, v28.8h, v10.8b\n"
@@ -1276,14 +1295,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
             "smlal v25.4s, v5.4h, v18.4h\n"
             "smlal2 v26.4s, v5.8h, v18.8h\n"
 
-            "dup v28.4s, w9\n"
-            "sqrdmulh v19.4s, v19.4s, v27.4s\n"
-            "sqrdmulh v20.4s, v20.4s, v27.4s\n"
-            "sqrdmulh v25.4s, v25.4s, v27.4s\n"
-            "sqrdmulh v26.4s, v26.4s, v27.4s\n"
-            "sqrshl v19.4s, v19.4s, v28.4s\n"
+            "ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
+            "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+            "sqrdmulh v20.4s, v20.4s, v31.4s\n"
+            "sqrdmulh v25.4s, v25.4s, v30.4s\n"
+            "sqrdmulh v26.4s, v26.4s, v31.4s\n"
+            "sqrshl v19.4s, v19.4s, v27.4s\n"
             "sqrshl v20.4s, v20.4s, v28.4s\n"
-            "sqrshl v25.4s, v25.4s, v28.4s\n"
+            "sqrshl v25.4s, v25.4s, v27.4s\n"
             "sqrshl v26.4s, v26.4s, v28.4s\n"
             "dup v28.8h, w0\n"
             "sqxtn v19.4h, v19.4s\n"
@@ -1294,10 +1313,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
             "sqadd v25.8h, v25.8h, v29.8h\n"
             "sqxtn v19.8b, v19.8h\n"
             "sqxtn2 v19.16b, v25.8h\n"
+            "dup v27.16b, w3\n"
+            "dup v29.16b, w4\n"
             "ld1 {v20.4s}, [x10]\n"
-            "smax v19.16b, v19.16b, v30.16b\n"
-            "smin v19.16b, v19.16b, v31.16b\n"
+            "smax v19.16b, v19.16b, v27.16b\n"
+            "smin v19.16b, v19.16b, v29.16b\n"
             "ld1 {v26.4s}, [x10]\n"
+            "dup v29.8h, w2\n"
             "saddw v9.8h, v28.8h, v9.8b\n"
             "st1 {v19.8b}, [x7], x5\n"
             "saddw v10.8h, v28.8h, v10.8b\n"
@@ -1399,14 +1421,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
           "smlal2 v26.4s, v2.8h, v13.8h\n"
           "ld1 {v13.8b}, [x13]\n"
 
-          "dup v28.4s, w9\n"
-          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
-          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
-          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
-          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
-          "sqrshl v21.4s, v21.4s, v28.4s\n"
+          "ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v31.4s\n"
+          "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v31.4s\n"
+          "sqrshl v21.4s, v21.4s, v27.4s\n"
           "sqrshl v22.4s, v22.4s, v28.4s\n"
-          "sqrshl v23.4s, v23.4s, v28.4s\n"
+          "sqrshl v23.4s, v23.4s, v27.4s\n"
           "sqrshl v24.4s, v24.4s, v28.4s\n"
           "dup v28.8h, w0\n"
           "sqxtn v21.4h, v21.4s\n"
@@ -1417,10 +1439,13 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
           "sqadd v23.8h, v23.8h, v29.8h\n"
           "sqxtn v21.8b, v21.8h\n"
           "sqxtn2 v21.16b, v23.8h\n"
+          "dup v27.16b, w3\n"
+          "dup v29.16b, w4\n"
           "ld1 {v22.4s}, [x10]\n"
-          "smax v21.16b, v21.16b, v30.16b\n"
-          "smin v21.16b, v21.16b, v31.16b\n"
+          "smax v21.16b, v21.16b, v27.16b\n"
+          "smin v21.16b, v21.16b, v29.16b\n"
           "ld1 {v24.4s}, [x10]\n"
+          "dup v29.8h, w2\n"
           "saddw v9.8h, v28.8h, v9.8b\n"
           "st1 {v21.8b}, [x6], x5\n"
           "saddw v10.8h, v28.8h, v10.8b\n"
@@ -1460,14 +1485,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
           "smlal v25.4s, v5.4h, v18.4h\n"
           "smlal2 v26.4s, v5.8h, v18.8h\n"
 
-          "dup v28.4s, w9\n"
-          "sqrdmulh v19.4s, v19.4s, v27.4s\n"
-          "sqrdmulh v20.4s, v20.4s, v27.4s\n"
-          "sqrdmulh v25.4s, v25.4s, v27.4s\n"
-          "sqrdmulh v26.4s, v26.4s, v27.4s\n"
-          "sqrshl v19.4s, v19.4s, v28.4s\n"
+          "ld1 {v27.4s, v28.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+          "sqrdmulh v20.4s, v20.4s, v31.4s\n"
+          "sqrdmulh v25.4s, v25.4s, v30.4s\n"
+          "sqrdmulh v26.4s, v26.4s, v31.4s\n"
+          "sqrshl v19.4s, v19.4s, v27.4s\n"
           "sqrshl v20.4s, v20.4s, v28.4s\n"
-          "sqrshl v25.4s, v25.4s, v28.4s\n"
+          "sqrshl v25.4s, v25.4s, v27.4s\n"
           "sqrshl v26.4s, v26.4s, v28.4s\n"
           "dup v28.8h, w0\n"
           "sqxtn v19.4h, v19.4s\n"
@@ -1476,11 +1501,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
           "sqxtn2 v25.8h, v26.4s\n"
           "sqadd v19.8h, v19.8h, v29.8h\n"
           "sqadd v25.8h, v25.8h, v29.8h\n"
+          "dup v27.16b, w3\n"
+          "dup v29.16b, w4\n"
           "sqxtn v19.8b, v19.8h\n"
           "sqxtn2 v19.16b, v25.8h\n"
-          "smax v19.16b, v19.16b, v30.16b\n"
-          "smin v19.16b, v19.16b, v31.16b\n"
+          "smax v19.16b, v19.16b, v27.16b\n"
+          "smin v19.16b, v19.16b, v29.16b\n"
           "st1 {v19.8b}, [x7], x5\n"
+          "dup v29.8h, w2\n"
           "mov v25.d[0], v19.d[1]\n"
           "st1 {v25.8b}, [x7]\n"
           "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
@@ -1533,17 +1561,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
           "smlal v23.4s, v2.4h, v17.4h\n"
           "smlal2 v24.4s, v2.8h, v17.8h\n"
 
-          "dup v26.4s, w9\n"
-          "sqrdmulh v21.4s, v21.4s, v27.4s\n"
-          "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+          "ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+          "sqrdmulh v22.4s, v22.4s, v31.4s\n"
           "sqrshl v21.4s, v21.4s, v26.4s\n"
-          "sqrshl v22.4s, v22.4s, v26.4s\n"
+          "sqrshl v22.4s, v22.4s, v27.4s\n"
           "sqxtn v21.4h, v21.4s\n"
           "sqxtn2 v21.8h, v22.4s\n"
+          "dup v26.16b, w3\n"
+          "dup v27.16b, w4\n"
           "sqadd v21.8h, v21.8h, v29.8h\n"
           "sqxtn v21.8b, v21.8h\n"
-          "smax v21.8b, v21.8b, v30.8b\n"
-          "smin v21.8b, v21.8b, v31.8b\n"
+          "smax v21.8b, v21.8b, v26.8b\n"
+          "smin v21.8b, v21.8b, v27.8b\n"
           "saddw v9.8h, v28.8h, v9.8b\n"
           "st1 {v21.8b}, [x6]\n"
           "saddw v10.8h, v28.8h, v10.8b\n"
@@ -1566,16 +1596,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
           "smlal v23.4s, v8.4h, v16.4h\n"
           "smlal2 v24.4s, v8.8h, v16.8h\n"
 
-          "sqrdmulh v23.4s, v23.4s, v27.4s\n"
-          "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+          "ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+          "sqrdmulh v24.4s, v24.4s, v31.4s\n"
           "sqrshl v23.4s, v23.4s, v26.4s\n"
-          "sqrshl v24.4s, v24.4s, v26.4s\n"
+          "sqrshl v24.4s, v24.4s, v27.4s\n"
           "sqxtn v23.4h, v23.4s\n"
           "sqxtn2 v23.8h, v24.4s\n"
+          "dup v26.16b, w3\n"
+          "dup v27.16b, w4\n"
           "sqadd v23.8h, v23.8h, v29.8h\n"
           "sqxtn v23.8b, v23.8h\n"
-          "smax v23.8b, v23.8b, v30.8b\n"
-          "smin v23.8b, v23.8b, v31.8b\n"
+          "smax v23.8b, v23.8b, v26.8b\n"
+          "smin v23.8b, v23.8b, v27.8b\n"
           "st1 {v23.8b}, [x7]\n"
 
           DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
@@ -1696,17 +1729,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
           "smlal v26.4s, v8.4h, v23.4h\n"
           "smlal2 v27.4s, v8.8h, v23.8h\n"
 
-          "dup v28.4s, w1\n"
-          "dup v29.4s, w9\n"
-          "sqrdmulh v24.4s, v24.4s, v28.4s\n"
-          "sqrdmulh v25.4s, v25.4s, v28.4s\n"
-          "sqrdmulh v26.4s, v26.4s, v28.4s\n"
-          "sqrdmulh v27.4s, v27.4s, v28.4s\n"
-          "dup v28.8h, w2\n"
-          "sqrshl v24.4s, v24.4s, v29.4s\n"
+          "ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
+          "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+          "sqrdmulh v25.4s, v25.4s, v31.4s\n"
+          "sqrdmulh v26.4s, v26.4s, v30.4s\n"
+          "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+          "sqrshl v24.4s, v24.4s, v28.4s\n"
           "sqrshl v25.4s, v25.4s, v29.4s\n"
-          "sqrshl v26.4s, v26.4s, v29.4s\n"
+          "sqrshl v26.4s, v26.4s, v28.4s\n"
           "sqrshl v27.4s, v27.4s, v29.4s\n"
+          "dup v28.8h, w2\n"
           "sqxtn v24.4h, v24.4s\n"
           "sqxtn2 v24.8h, v25.4s\n"
           "sqxtn v26.4h, v26.4s\n"
@@ -1716,12 +1748,14 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
           "sqxtn v24.8b, v24.8h\n"
           "sqxtn2 v24.16b, v26.8h\n"
           "dup v28.8h, w0\n"
+          "dup v27.16b, w3\n"
+          "dup v29.16b, w4\n"
           "ld1 {v25.4s}, [x10]\n"
-          "smax v24.16b, v24.16b, v30.16b\n"
-          "smin v24.16b, v24.16b, v31.16b\n"
-          "ld1 {v27.4s}, [x10]\n"
+          "smax v24.16b, v24.16b, v27.16b\n"
+          "smin v24.16b, v24.16b, v29.16b\n"
           "saddw v9.8h, v28.8h, v9.8b\n"
           "st1 {v24.8b}, [x6], x5\n"
+          "ld1 {v27.4s}, [x10]\n"
           "saddw v10.8h, v28.8h, v10.8b\n"
           "mov v26.d[0], v24.d[1]\n"
           "st1 {v26.8b}, [x6], x5\n"
@@ -1794,17 +1828,16 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
         "smlal v26.4s, v8.4h, v23.4h\n"
         "smlal2 v27.4s, v8.8h, v23.8h\n"
 
-        "dup v28.4s, w1\n"
-        "dup v29.4s, w9\n"
-        "sqrdmulh v24.4s, v24.4s, v28.4s\n"
-        "sqrdmulh v25.4s, v25.4s, v28.4s\n"
-        "sqrdmulh v26.4s, v26.4s, v28.4s\n"
-        "sqrdmulh v27.4s, v27.4s, v28.4s\n"
-        "dup v28.8h, w2\n"
-        "sqrshl v24.4s, v24.4s, v29.4s\n"
+        "ld1 {v28.4s, v29.4s}, [%[output_shift_ptr]]\n"
+        "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+        "sqrdmulh v25.4s, v25.4s, v31.4s\n"
+        "sqrdmulh v26.4s, v26.4s, v30.4s\n"
+        "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+        "sqrshl v24.4s, v24.4s, v28.4s\n"
         "sqrshl v25.4s, v25.4s, v29.4s\n"
-        "sqrshl v26.4s, v26.4s, v29.4s\n"
+        "sqrshl v26.4s, v26.4s, v28.4s\n"
         "sqrshl v27.4s, v27.4s, v29.4s\n"
+        "dup v28.8h, w2\n"
         "sqxtn v24.4h, v24.4s\n"
         "sqxtn2 v24.8h, v25.4s\n"
         "sqxtn v26.4h, v26.4s\n"
@@ -1812,19 +1845,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
         "sqadd v24.8h, v24.8h, v28.8h\n"
         "sqadd v26.8h, v26.8h, v28.8h\n"
         "sqxtn v24.8b, v24.8h\n"
+        "dup v28.16b, w3\n"
+        "dup v29.16b, w4\n"
         "sqxtn2 v24.16b, v26.8h\n"
-        "dup v28.8h, w0\n"
-        "smax v24.16b, v24.16b, v30.16b\n"
-        "smin v24.16b, v24.16b, v31.16b\n"
+        "smax v24.16b, v24.16b, v28.16b\n"
+        "smin v24.16b, v24.16b, v29.16b\n"
         "st1 {v24.8b}, [x6], x5\n"
         "mov v26.d[0], v24.d[1]\n"
         "st1 {v26.8b}, [x6]\n"
+        "dup v28.8h, w0\n"
         "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
 
         // Handle bottom right output if exists.
         DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
-        "dup v26.4s, w9\n"
-        "dup v27.4s, w1\n"
         "dup v29.8h, w2\n"
 
         "smlal v24.4s, v0.4h, v9.4h\n"
@@ -1846,16 +1879,19 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
         "smlal v24.4s, v8.4h, v17.4h\n"
         "smlal2 v25.4s, v8.8h, v17.8h\n"
 
-        "sqrdmulh v24.4s, v24.4s, v27.4s\n"
-        "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+        "ld1 {v26.4s, v27.4s}, [%[output_shift_ptr]]\n"
+        "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+        "sqrdmulh v25.4s, v25.4s, v31.4s\n"
         "sqrshl v24.4s, v24.4s, v26.4s\n"
-        "sqrshl v25.4s, v25.4s, v26.4s\n"
+        "sqrshl v25.4s, v25.4s, v27.4s\n"
         "sqxtn v24.4h, v24.4s\n"
         "sqxtn2 v24.8h, v25.4s\n"
+        "dup v26.16b, w3\n"
+        "dup v27.16b, w4\n"
         "sqadd v24.8h, v24.8h, v29.8h\n"
         "sqxtn v24.8b, v24.8h\n"
-        "smax v24.8b, v24.8b, v30.8b\n"
-        "smin v24.8b, v24.8b, v31.8b\n"
+        "smax v24.8b, v24.8b, v26.8b\n"
+        "smin v24.8b, v24.8b, v27.8b\n"
         "st1 {v24.8b}, [x6]\n"
 
         DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
@@ -1866,6 +1902,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
     [output_window_height] "+r"(output_window_height)
     :
     // Inputs.
+    [output_multiplier_ptr] "r"(output_multiplier_ptr),
+    [output_shift_ptr] "r"(output_shift_ptr),
     [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
     [input_depth] "r"(input_depth),
     [output_window_width] "r"(output_window_width),
@@ -1882,8 +1920,8 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
     "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
     "v30", "v31",
     // We use these general-purpose registers.
-    "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
-    "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+    "x0", "x2", "x3", "x4", "x5", "x6", "x7",
+    "x10", "x11", "x12", "x13", "x14", "x15",
     "x19", "x20");
 #undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
 #undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP