From 1f9d744e66ac342dce2b4c2c2badd84ab83d31f4 Mon Sep 17 00:00:00 2001
From: Robert David <lrdx@google.com>
Date: Thu, 9 Apr 2020 11:53:15 -0700
Subject: [PATCH] HiFi intrinsics: Inline the confusingly named
 SaturatingMultiply function.

24-bit x 24-bit => 56-bit multiplications never saturate.

PiperOrigin-RevId: 305728765
Change-Id: Id77a6414dad99810fc87a2820713f43bca1cee98
---
 .../xtensa_hifimini/fixedpoint_utils.h        | 31 +++++--------------
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h
index 74112463f3b..4ffb3653f50 100644
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h
+++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h
@@ -31,22 +31,6 @@ namespace micro {
 namespace xtensa {
 namespace hifimini {
 
-//
-// Product of two fixed-point 24bit integers with right shift.
-//
-// Two 24bit integers from the HH side of a PR register entry are MAC into a QR
-// register. That value will be right shifted if |shift_length| is greater than
-// 0.
-//
-inline ae_q56s SaturatingMultiply(ae_p24x2s a_56, ae_p24x2s b_56,
-                                  int shift_length) {
-  ae_q56s result_56 = AE_MULP24S_HH(a_56, b_56);
-  if (shift_length > 0) {
-    return AE_Q56S_SRA(result_56, shift_length);
-  }
-  return result_56;
-}
-
 //
 // Multiply 32bit value by a quantized multiplier (w/ shift) and returns a 48bit
 // aligned value in the QR register.
@@ -57,7 +41,7 @@ inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
   // These boolean factors will carry an additional 2^8 (e.g 256) factor
   // throughout the equation to cover the missing 8 bits of precision when a
   // 32bit integer is outside the bounds of INT24. The additional scaling factor
-  // will be adjusted on the final SaturatingMultiply() call in this method.
+  // will be removed by the shift after the final multiplication in this method.
   //
   // The Q-notation comments in this method describe the calculations that take
   // place when both |x| and the shifted value of |1| overflow the INT24 limits.
@@ -110,8 +94,10 @@ inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
   // to 48bit aligned.
   // (Q23.0 / 2^16) * Q23.0 = Q47.0 / 2^16
   // (Q47.0 / 2^16) >> 7 = Q47.0
-  ae_q56s result_56 =
-      SaturatingMultiply(x_24x2, quantized_multiplier_24x2, shift_amount);
+  ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2);
+  if (shift_amount > 0) {
+    result_56 = AE_Q56S_SRA(result_56, shift_amount);
+  }
 
   if (shift < 0) {
     // Handle any negative shift directly on the 48 bit value.
@@ -134,8 +120,7 @@ inline ae_q56s MultiplyByQuantizedMultiplier(ae_p24x2s x_24x2,
   // the limits of INT24, which requires |AE_CONVERT_INT32_24x2()| to load the
   // left-most 24 bits of a 32bit integer. When this occurs, all Q values here
   // carry an additional division of 2^8 to account for this loss in precision.
-  // This division will be applied to the final shift of the result in
-  // |SaturatingMultiply()|.
+  // This division is accounted for in the final shift after the multiplication.
   //
   // The Q-notation comments in this method describe the calculations that take
   // place when both |x| and the shifted value of |1| overflow the INT24 limits.
@@ -176,8 +161,8 @@ inline ae_q56s MultiplyByQuantizedMultiplier(ae_p24x2s x_24x2,
   // function:
   // (Q23.0 / 2^8) * Q23.0 = Q47.0 / 2^8
   // (Q47.0 / 2^8) >> 7 = Q47.0
-  ae_q56s result = SaturatingMultiply(x_shifted_24x2, quantized_multiplier_24x2,
-                                      shift_exceeds_24bits ? 15 : 23);
+  ae_q56s result = AE_MULP24S_HH(x_shifted_24x2, quantized_multiplier_24x2);
+  result = AE_Q56S_SRA(result, shift_exceeds_24bits ? 15 : 23);
 
   if (shift < 0) {
     // Handle any negative shift directly on the 48 bit value.