Simplify, speed up, and improve accuracy for Xtensa fixed-point utils.

PiperOrigin-RevId: 309153806
Change-Id: I3ae4196fdaad56bdc7f6fd236d0f3c05c0ca50e0
This commit is contained in:
Nat Jeffries 2020-04-29 20:48:39 -07:00 committed by TensorFlower Gardener
parent 645cd8445f
commit 3c520614a3
8 changed files with 67 additions and 205 deletions

View File

@ -69,7 +69,6 @@ cc_library(
"xtensa_hifimini/quantize.cc",
"xtensa_hifimini/softmax.cc",
"xtensa_hifimini/svdf.cc",
"xtensa_hifimini/utils.h",
],
}),
hdrs = ["micro_ops.h"],

View File

@ -25,7 +25,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -66,7 +65,7 @@ void ConvPerChannel(const ConvParams& params, const int32* output_multiplier,
const int output_width = output_shape.Dims(2);
const int output_depth = output_shape.Dims(3);
ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
ae_q56s output_activation_min_56 = AE_CVTQ48A32S(output_activation_min);
ae_q56s output_activation_max_56 = AE_CVTQ48A32S(output_activation_max);
@ -150,9 +149,6 @@ void ConvPerChannel(const ConvParams& params, const int32* output_multiplier,
acc_24x2, output_multiplier[out_channel],
output_shift[out_channel]);
// Shift from 48bit aligned to 32bit:
acc_56 = AE_Q56S_SLAI(acc_56, 16);
// Add output offset, cap activation, and assign to the output:
acc_56 = AE_ADDQ56(acc_56, output_offset_56);
acc_56 = AE_MINQ56S(acc_56, output_activation_max_56);
@ -178,7 +174,7 @@ inline void Conv1x32Input32x32Filter(
const RuntimeShape& filter_shape, const int8* filter_data,
const RuntimeShape& bias_shape, const int32* bias_data,
const RuntimeShape& output_shape, int8* output_data) {
ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
ae_q56s output_activation_max_56 = AE_CVTQ48A32S(quantized_activation_max);
ae_q56s output_activation_min_56 = AE_CVTQ48A32S(quantized_activation_min);
@ -227,13 +223,10 @@ inline void Conv1x32Input32x32Filter(
acc_56 = AE_Q56S_SLAI(acc_56, 8);
ae_p24x2s acc_24x2 = AE_TRUNCP24Q48(acc_56);
// Apply quantized multiplier and accumulate result at 48bit
// alignment:
// Apply quantized multiplier and accumulate result at 48bit alignment.
// Convert the (unsigned) 32-bit multiplier down to a 24-bit multiplier.
acc_56 = micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
acc_24x2, output_multiplier[ch], output_shift[ch]);
// Shift from 48bit aligned to 32bit:
acc_56 = AE_Q56S_SLAI(acc_56, 16);
acc_24x2, output_multiplier[ch] >> 8, output_shift[ch]);
// Add output offset, cap activation, and assign to the output:
acc_56 = AE_ADDQ56(acc_56, output_offset_56);

View File

@ -25,7 +25,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -69,7 +68,7 @@ inline void DepthwiseConvPerChannel(
const int output_width = output_shape.Dims(2);
const int output_depth = output_shape.Dims(3);
ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
ae_q56s output_activation_min_56 = AE_CVTQ48A32S(output_activation_min);
ae_q56s output_activation_max_56 = AE_CVTQ48A32S(output_activation_max);
@ -114,14 +113,14 @@ inline void DepthwiseConvPerChannel(
// shift into 24bit space. Note: value is duplicated in the HH
// and LL register - but all calculations are done on the HH
// side.
ae_p24x2s input_val_24x2 = AE_CONVERT_INT32_24x2(input_val);
ae_p24x2s input_val_24x2 = AE_MOVPA24(input_val);
// Add input offset (24bit aligned):
input_val_24x2 =
AE_P24S_ADDS_P24X2S(input_val_24x2, input_offset_24x2);
// Load filter 8bit value into 24bit alignment:
ae_p24x2s filter_val_24x2 = AE_CONVERT_INT32_24x2(filter_val);
ae_p24x2s filter_val_24x2 = AE_MOVPA24(filter_val);
// Multiply and accumulate the HH side of each 24x24 PR
// register:
@ -150,9 +149,6 @@ inline void DepthwiseConvPerChannel(
acc_24x2, output_multiplier[output_channel],
output_shift[output_channel]);
// Shift from 48bit aligned to 32bit:
acc_56 = AE_Q56S_SLAI(acc_56, 16);
// Add output offset, cap activation, and assign to the output:
acc_56 = AE_ADDQ56(acc_56, output_offset_56);
acc_56 = AE_MINQ56S(acc_56, output_activation_max_56);
@ -181,9 +177,10 @@ inline void DepthwiseConv4x32MatchingInputAndFilter(
const RuntimeShape& filter_shape, const int8* filter_data,
const RuntimeShape& bias_shape, const int32* bias_data,
const RuntimeShape& output_shape, int8* output_data) {
const int32_t mult = output_multiplier[0];
// Convert the (unsigned) 32-bit multiplier down to a 24-bit multiplier.
const int32_t mult = output_multiplier[0] >> 8;
const int32_t shift = output_shift[0];
ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
ae_q56s output_activation_min_56 = AE_CVTQ48A32S(quantized_activation_min);
ae_q56s output_activation_max_56 = AE_CVTQ48A32S(quantized_activation_max);
@ -270,10 +267,6 @@ inline void DepthwiseConv4x32MatchingInputAndFilter(
block_1_acc = micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
acc_24x2_1, mult, shift);
// Shift from 48bit aligned to 32bit:
block_0_acc = AE_Q56S_SLAI(block_0_acc, 16);
block_1_acc = AE_Q56S_SLAI(block_1_acc, 16);
// Add output offset, cap activation, and assign to the output:
block_0_acc = AE_ADDQ56(block_0_acc, output_offset_56);
block_1_acc = AE_ADDQ56(block_1_acc, output_offset_56);

View File

@ -23,7 +23,6 @@ limitations under the License.
#include <cstdint>
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -31,80 +30,9 @@ namespace micro {
namespace xtensa {
namespace hifimini {
//
// Multiply 32bit value by a quantized multiplier (w/ shift) and returns a 48bit
// aligned value in the QR register.
//
inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
int32_t quantized_multiplier,
int shift) {
// Multiplies a 32-bit value |x| by a Q0.23-style |quantized_multiplier| with
// an additional power-of-two |shift|, returning the product 48-bit aligned in
// a QR register. Intermediate math is done on 24-bit PR registers, so any
// operand wider than INT24 loses its low 8 bits on load; the bookkeeping
// below compensates for that lost factor of 2^8 at the final shift.
//
// These boolean factors will carry an additional 2^8 (e.g 256) factor
// throughout the equation to cover the missing 8 bits of precision when a
// 32bit integer is outside the bounds of INT24. The additional scaling factor
// will be adjusted after the final multiplication in this method.
//
// The Q-notation comments in this method describe the calculations that take
// place when both |x| and the shifted value of |1| overflow the INT24 limits.
bool x_exceeds_24bits = (x <= INT24_MIN || x >= INT24_MAX);
bool shift_exceeds_24bits = false;
// Q31.0 -> Q23.0 / 2^8
ae_p24x2s x_24x2 = AE_CONVERT_INT32_24x2(x);
if (shift > 0) {
// Positive shift: pre-multiply |x| by 2^shift in 24-bit space.
int shifted = 1 << shift;
if (shifted <= INT24_MIN || shifted >= INT24_MAX) {
shift_exceeds_24bits = true;
}
// Load the shifted value into the PR register:
// Q31.0 -> Q23.0 / 2^8
ae_p24x2s shifted_24x2 = AE_CONVERT_INT32_24x2(shifted);
// (Q23.0 / 2^8) * (Q23.0 / 2^8) = Q47.0 / 2^16
ae_q56s sum_56 = AE_MULP24S_HH(x_24x2, shifted_24x2);
// Shift left into 24bit space:
// ((Q47.0 / 2^16) << 24) = Q23.24 / 2^16
sum_56 = AE_Q56S_SLAI(sum_56, 24);
// Truncate and place on the PR register:
// (Q23.24 / 2^16) -> Q23.0 / 2^16
x_24x2 = AE_TRUNCP24Q48(sum_56);
}
// Load the quantized multiplier into the PR register.
// NOTE: This method assumes that this param has been calculated for 24bit
// space - not 32bits.
// Q0.31 -> Q0.23
ae_p24x2s quantized_multiplier_24x2 =
AE_CONVERT_INT32_24x2(quantized_multiplier);
// Adjust for the additional 8 bits of lost precision throughout this
// function:
int shift_amount = 23;
if (x_exceeds_24bits) {
shift_amount = shift_amount - 8;
}
if (shift_exceeds_24bits) {
shift_amount = shift_amount - 8;
}
// Find the product of x and the quantized_multiplier and right shift
// to 48bit aligned.
// (Q23.0 / 2^16) * Q23.0 = Q47.0 / 2^16
// (Q47.0 / 2^16) >> 7 = Q47.0
ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2);
if (shift_amount > 0) {
result_56 = AE_Q56S_SRA(result_56, shift_amount);
}
if (shift < 0) {
// Handle any negative shift directly on the 48 bit value.
result_56 = AE_Q56S_SRA(result_56, -shift);
}
return result_56;
}
// INT24 MIN/MAX
#define INT24_MIN -8388608
#define INT24_MAX 8388607
//
// Multiply 24bit value by a quantized multiplier (w/ shift) and returns a 48bit
@ -113,62 +41,62 @@ inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
inline ae_q56s MultiplyByQuantizedMultiplier(ae_p24x2s x_24x2,
                                             int32_t quantized_multiplier,
                                             int shift) {
  // Multiplies a 24-bit PR value |x_24x2| by a 24-bit |quantized_multiplier|
  // (a Q1.23 value representing [0.5, 1.0)) with an additional power-of-two
  // |shift|, returning the symmetric-rounded product 48-bit aligned in a QR
  // register.
  //
  // A value with 1 sign bit, N integer bits and M fractional bits is
  // represented as QN+1.M since the sign bit is included in the integer bits.
  //
  // The Q notation in this method explains the values represented in each
  // variable, along with an implicit division since the quantized_multiplier
  // represents a value between 0.5 and 1.0 (Q1.X-1 where X is the bit
  // precision of the type).
  //
  // Load the quantized multiplier into the PR register.
  // NOTE: This method assumes that this param has been calculated for 24bit
  // space - not 32bits.
  // Q32.0 / 2^23 -> Q24.0 / 2^23 representing a Q1.23 multiplier.
  ae_p24x2s quantized_multiplier_24x2 = AE_MOVPA24(quantized_multiplier);
  // Shift right by 23 - 16 bits minus the specified shift. This is because we
  // keep 16 fractional bits until the end to perform rounding. Subtract shift
  // since shift is a left shift, and the 23-16 is a right shift.
  int shift_amount = 7 - shift;
  // Find the product of x and the quantized_multiplier.
  // Q24.0 / 2^23 * Q24.0 = Q48.0 / 2^23
  ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2);
  // Shift right if shift amount is positive, left if shift amount is negative.
  if (shift_amount >= 0) {
    // Q48.0 / 2^23 >> shift_amount = Q48.0 / 2^16
    result_56 = AE_Q56S_SRA(result_56, shift_amount);
  } else {
    result_56 = AE_Q56S_SLA(result_56, -shift_amount);
  }
  // Round off the bottom 16 bits.
  // Q48.0 / 2^16 -> Q32.0 aligned to 48 bits.
  result_56 = AE_ROUNDSQ32SYM(result_56);
  return result_56;
}
//
// Multiply 32bit value by a quantized multiplier (w/ shift) and returns a 48bit
// aligned value in the QR register.
//
inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
                                             int32_t quantized_multiplier,
                                             int shift) {
  // Load |x| into a 2x24bit PR register file before delegating to the 24-bit
  // overload. Per the conversion convention here: a value outside the INT24
  // numerical limits keeps only its upper 24 bits (the lower 8 "fractional"
  // bits are discarded, compensated by bumping |shift| by 8); a value inside
  // the range keeps its lower 24 bits (the upper 8 "signed" bits are
  // discarded).
  ae_p24x2s x_24x2;
  const bool outside_24bit_range = (x <= INT24_MIN || x >= INT24_MAX);
  if (outside_24bit_range) {
    // Reinterpret the top 24 bits and account for the dropped precision.
    x_24x2 = static_cast<ae_p24s>(*reinterpret_cast<ae_p24f*>(&x));
    shift += 8;
  } else {
    // Fits in 24 bits: move directly with full precision.
    x_24x2 = AE_MOVPA24(x);
  }
  return MultiplyByQuantizedMultiplier(x_24x2, quantized_multiplier, shift);
}
//
@ -193,6 +121,8 @@ inline void QuantizeMultiplier(float multiplier, int32_t* quantized_multiplier,
}
TFLITE_CHECK_LE(q_fixed, INT24_MAX);
// Ensure shift does not exceed 24-bit range.
TFLITE_CHECK_LE(*shift, 23);
if (*shift < -23) {
*shift = 0;
q_fixed = 0;

View File

@ -25,7 +25,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -108,9 +107,6 @@ inline void FullyConnected(
sum_56 = MultiplyByQuantizedMultiplier(sum_24x2, output_multiplier,
output_shift);
// Align from 48bit to 32bit on the QR register:
sum_56 = AE_Q56S_SLAI(sum_56, 16);
// Add output_offset and cap min/max values:
sum_56 = AE_ADDQ56(sum_56, output_offset_56);
sum_56 = AE_MINQ56S(sum_56, output_activation_max_56);

View File

@ -22,7 +22,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -43,7 +42,7 @@ void AffineQuantize(int scale_multiplier,
const ae_p16x2s* input_data_ptr = (const ae_p16x2s*)(input_data - 2);
ae_p24x2s scale_multiplier_24x2 = AE_CONVERT_INT32_24x2(scale_multiplier);
ae_p24x2s scale_multiplier_24x2 = AE_MOVPA24(scale_multiplier);
int iters = flat_size / 2;
for (int i = 0; i < iters; i++) {

View File

@ -25,8 +25,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/activation_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
#include "tensorflow/lite/micro/micro_utils.h"
namespace tflite {
namespace ops {
@ -99,7 +97,7 @@ void EvalIntegerSVDF(
ae_q56s output_int16_max_56 = AE_CVTQ48A32S(INT16_MAX);
ae_q56s output_int16_min_56 = AE_CVTQ48A32S(INT16_MIN);
ae_p24x2s input_zp_24x2 = AE_CONVERT_INT32_24x2(input_zp);
ae_p24x2s input_zp_24x2 = AE_MOVPA24(input_zp);
for (int b = 0; b < n_batch; b++) {
const int8_t* weight_feature_ptr = weight_feature - 2;
@ -140,8 +138,6 @@ void EvalIntegerSVDF(
tflite::ops::micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
dot_prod_24x2, scale_1_a, scale_1_b);
// Align from 48bit to 32bit on the QR register
dot_prod_56 = AE_Q56S_SLAI(dot_prod_56, 16);
// Cap min/max and convert to int32:
dot_prod_56 = AE_MAXQ56S(dot_prod_56, output_int16_min_56);
dot_prod_56 = AE_MINQ56S(dot_prod_56, output_int16_max_56);
@ -232,8 +228,6 @@ void EvalIntegerSVDF(
ae_q56s x_56 =
tflite::ops::micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
scratch_output_tensor[i], scale_2_a, scale_2_b);
// Align from 48bit to 32bit on the QR register:
x_56 = AE_Q56S_SLAI(x_56, 16);
// Add output adjustment:
x_56 = AE_ADDQ56(x_56, output_zp_56);
// Cap min/max and convert to int32 (already aligned to 32bit):

View File

@ -1,42 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_HIFIMINI_UTILS_H_
#define TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_HIFIMINI_UTILS_H_
#include <xtensa/tie/xt_hifi2.h>
#include <cstdint>
// INT24 MIN/MAX
#define INT24_MIN -8388608
#define INT24_MAX 8388607
// Converts an int32 value into a 2x24bit PR register file. If the int32 value
// is outside the numerical limits of a 24bit integer, the "fractional" or lower
// 8bits are discarded. If the value is within the range of a 24 bit integer,
// the "signed" or upper 8bits are discarded.
inline ae_p24x2s AE_CONVERT_INT32_24x2(int32_t v) {
  // Guard clause: values at or beyond the INT24 numerical limits keep their
  // upper 24 bits (the lower 8 "fractional" bits are discarded).
  const bool outside_24bit_range = (v <= INT24_MIN || v >= INT24_MAX);
  if (outside_24bit_range) {
    return static_cast<ae_p24s>(*reinterpret_cast<ae_p24f*>(&v));
  }
  // In-range values keep their lower 24 bits (the upper 8 "signed" bits are
  // discarded).
  return *reinterpret_cast<ae_p24s*>(&v);
}
// Shifts a 48bit accumulator value into 32bit space and returns the value.
#define AE_CONVERT_Q56_INT32(v) AE_TRUNCA32Q48(AE_Q56S_SLAI(v, 16))
#endif // TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_HIFIMINI_UTILS_H_