Simplify, speed up and improve accuracy for xtensa fixedpoint utils.
PiperOrigin-RevId: 309153806
Change-Id: I3ae4196fdaad56bdc7f6fd236d0f3c05c0ca50e0
This commit is contained in:
parent 645cd8445f
commit 3c520614a3
@@ -69,7 +69,6 @@ cc_library(
 "xtensa_hifimini/quantize.cc",
 "xtensa_hifimini/softmax.cc",
 "xtensa_hifimini/svdf.cc",
-"xtensa_hifimini/utils.h",
 ],
 }),
 hdrs = ["micro_ops.h"],
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
 #include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"

 namespace tflite {
 namespace ops {
@@ -66,7 +65,7 @@ void ConvPerChannel(const ConvParams& params, const int32* output_multiplier,
 const int output_width = output_shape.Dims(2);
 const int output_depth = output_shape.Dims(3);

-ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
+ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
 ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
 ae_q56s output_activation_min_56 = AE_CVTQ48A32S(output_activation_min);
 ae_q56s output_activation_max_56 = AE_CVTQ48A32S(output_activation_max);
@@ -150,9 +149,6 @@ void ConvPerChannel(const ConvParams& params, const int32* output_multiplier,
 acc_24x2, output_multiplier[out_channel],
 output_shift[out_channel]);

-// Shift from 48bit aligned to 32bit:
-acc_56 = AE_Q56S_SLAI(acc_56, 16);
-
 // Add output offset, cap activation, and assign to the output:
 acc_56 = AE_ADDQ56(acc_56, output_offset_56);
 acc_56 = AE_MINQ56S(acc_56, output_activation_max_56);
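Note: this same deletion of the 48-bit-to-32-bit realignment repeats in every kernel in this change, because the rewritten MultiplyByQuantizedMultiplier (see the fixedpoint_utils.h hunks further down) now rounds its result onto the 32-bit-aligned position of the 56-bit accumulator itself. A rough scalar sketch of the bookkeeping, with a plain int64_t standing in for the ae_q56s register (illustrative only, not the intrinsic code):

    #include <cstdint>

    // Before: the helper handed back an integer sitting in the low 48 bits, so
    // each kernel shifted it up by 16 to match the 32-bit-aligned offset
    // produced by AE_CVTQ48A32S before adding and saturating.
    int64_t OldKernelAlign(int64_t helper_result) {
      return helper_result << 16;  // the AE_Q56S_SLAI(acc_56, 16) being removed
    }

    // After: the helper already returns the value in that 32-bit-aligned
    // position (it rounds off its 16 working fraction bits with
    // AE_ROUNDSQ32SYM), so the kernel adds output_offset_56 directly.
    int64_t NewKernelAlign(int64_t helper_result) { return helper_result; }

The saturation behaviour of the real ae_q56s register is not modelled here.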
@@ -178,7 +174,7 @@ inline void Conv1x32Input32x32Filter(
 const RuntimeShape& filter_shape, const int8* filter_data,
 const RuntimeShape& bias_shape, const int32* bias_data,
 const RuntimeShape& output_shape, int8* output_data) {
-ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
+ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
 ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
 ae_q56s output_activation_max_56 = AE_CVTQ48A32S(quantized_activation_max);
 ae_q56s output_activation_min_56 = AE_CVTQ48A32S(quantized_activation_min);
@@ -227,13 +223,10 @@ inline void Conv1x32Input32x32Filter(
 acc_56 = AE_Q56S_SLAI(acc_56, 8);
 ae_p24x2s acc_24x2 = AE_TRUNCP24Q48(acc_56);

-// Apply quantized multiplier and accumulate result at 48bit
-// alignment:
+// Apply quantized multiplier and accumulate result at 48bit alignment.
+// Convert the (unsigned) 32-bit multiplier down to a 24-bit multiplier.
 acc_56 = micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
-acc_24x2, output_multiplier[ch], output_shift[ch]);
-
-// Shift from 48bit aligned to 32bit:
-acc_56 = AE_Q56S_SLAI(acc_56, 16);
+acc_24x2, output_multiplier[ch] >> 8, output_shift[ch]);

 // Add output offset, cap activation, and assign to the output:
 acc_56 = AE_ADDQ56(acc_56, output_offset_56);
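Note: the new >> 8 is how the kernels adapt TFLite's standard 32-bit (Q1.31-style) per-channel multipliers to the 24-bit multiplier the rewritten helper expects. A minimal portable illustration of why dropping the low 8 bits preserves the represented value; the numbers are assumed for illustration, not taken from this change:

    #include <cstdint>
    #include <iostream>

    int main() {
      // A typical TFLite output multiplier: a Q1.31-style value in [2^30, 2^31).
      const int32_t multiplier_q31 = 1518500250;  // roughly 0.7071 * 2^31

      // Dropping the low 8 bits leaves the same real value in Q1.23 form,
      // which fits the 24-bit P registers used by AE_MULP24S_HH.
      const int32_t multiplier_q23 = multiplier_q31 >> 8;

      std::cout << multiplier_q31 / 2147483648.0 << "\n";  // ~0.707107
      std::cout << multiplier_q23 / 8388608.0 << "\n";     // ~0.707107, 8 fewer fraction bits
      return 0;
    }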
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
 #include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"

 namespace tflite {
 namespace ops {
@@ -69,7 +68,7 @@ inline void DepthwiseConvPerChannel(
 const int output_width = output_shape.Dims(2);
 const int output_depth = output_shape.Dims(3);

-ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
+ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
 ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
 ae_q56s output_activation_min_56 = AE_CVTQ48A32S(output_activation_min);
 ae_q56s output_activation_max_56 = AE_CVTQ48A32S(output_activation_max);
@@ -114,14 +113,14 @@ inline void DepthwiseConvPerChannel(
 // shift into 24bit space. Note: value is duplicated in the HH
 // and LL register - but all calculations are done on the HH
 // side.
-ae_p24x2s input_val_24x2 = AE_CONVERT_INT32_24x2(input_val);
+ae_p24x2s input_val_24x2 = AE_MOVPA24(input_val);

 // Add input offset (24bit aligned):
 input_val_24x2 =
 AE_P24S_ADDS_P24X2S(input_val_24x2, input_offset_24x2);

 // Load filter 8bit value into 24bit alignment:
-ae_p24x2s filter_val_24x2 = AE_CONVERT_INT32_24x2(filter_val);
+ae_p24x2s filter_val_24x2 = AE_MOVPA24(filter_val);

 // Multiply and accumulate the HH side of each 24x24 PR
 // register:
@@ -150,9 +149,6 @@ inline void DepthwiseConvPerChannel(
 acc_24x2, output_multiplier[output_channel],
 output_shift[output_channel]);

-// Shift from 48bit aligned to 32bit:
-acc_56 = AE_Q56S_SLAI(acc_56, 16);
-
 // Add output offset, cap activation, and assign to the output:
 acc_56 = AE_ADDQ56(acc_56, output_offset_56);
 acc_56 = AE_MINQ56S(acc_56, output_activation_max_56);
@@ -181,9 +177,10 @@ inline void DepthwiseConv4x32MatchingInputAndFilter(
 const RuntimeShape& filter_shape, const int8* filter_data,
 const RuntimeShape& bias_shape, const int32* bias_data,
 const RuntimeShape& output_shape, int8* output_data) {
-const int32_t mult = output_multiplier[0];
+// Convert the (unsigned) 32-bit multiplier down to a 24-bit multiplier.
+const int32_t mult = output_multiplier[0] >> 8;
 const int32_t shift = output_shift[0];
-ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
+ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
 ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
 ae_q56s output_activation_min_56 = AE_CVTQ48A32S(quantized_activation_min);
 ae_q56s output_activation_max_56 = AE_CVTQ48A32S(quantized_activation_max);
@@ -270,10 +267,6 @@ inline void DepthwiseConv4x32MatchingInputAndFilter(
 block_1_acc = micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
 acc_24x2_1, mult, shift);

-// Shift from 48bit aligned to 32bit:
-block_0_acc = AE_Q56S_SLAI(block_0_acc, 16);
-block_1_acc = AE_Q56S_SLAI(block_1_acc, 16);
-
 // Add output offset, cap activation, and assign to the output:
 block_0_acc = AE_ADDQ56(block_0_acc, output_offset_56);
 block_1_acc = AE_ADDQ56(block_1_acc, output_offset_56);
@@ -23,7 +23,6 @@ limitations under the License.
 #include <cstdint>

 #include "tensorflow/lite/kernels/internal/compatibility.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"

 namespace tflite {
 namespace ops {
@@ -31,80 +30,9 @@ namespace micro {
 namespace xtensa {
 namespace hifimini {

-//
-// Multiply 32bit value by a quantized multiplier (w/ shift) and returns a 48bit
-// aligned value in the QR register.
-//
-inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
-int32_t quantized_multiplier,
-int shift) {
-// These boolean factors will carry an additional 2^8 (e.g 256) factor
-// throughout the equation to cover the missing 8 bits of precision when a
-// 32bit integer is outside the bounds of INT24. The additional scaling factor
-// will be adjusted after the final multiplication in this method.
-//
-// The Q-notation comments in this method describe the calculations that take
-// place when both |x| and the shifted value of |1| overflow the INT24 limits.
-bool x_exceeds_24bits = (x <= INT24_MIN || x >= INT24_MAX);
-bool shift_exceeds_24bits = false;
-
-// Q31.0 -> Q23.0 / 2^8
-ae_p24x2s x_24x2 = AE_CONVERT_INT32_24x2(x);
-
-if (shift > 0) {
-int shifted = 1 << shift;
-if (shifted <= INT24_MIN || shifted >= INT24_MAX) {
-shift_exceeds_24bits = true;
-}
-
-// Load the shifted value into the PR register:
-// Q31.0 -> Q23.0 / 2^8
-ae_p24x2s shifted_24x2 = AE_CONVERT_INT32_24x2(shifted);
-
-// (Q23.0 / 2^8) * (Q23.0 / 2^8) = Q47.0 / 2^16
-ae_q56s sum_56 = AE_MULP24S_HH(x_24x2, shifted_24x2);
-
-// Shift left into 24bit space:
-// ((Q47.0 / 2^16) << 24) = Q23.24 / 2^16
-sum_56 = AE_Q56S_SLAI(sum_56, 24);
-
-// Truncate and place on the PR register:
-// (Q23.24 / 2^16) -> Q23.0 / 2^16
-x_24x2 = AE_TRUNCP24Q48(sum_56);
-}
-
-// Load the quantized multiplier into the PR register.
-// NOTE: This method assumes that this param has been calculated for 24bit
-// space - not 32bits.
-// Q0.31 -> Q0.23
-ae_p24x2s quantized_multiplier_24x2 =
-AE_CONVERT_INT32_24x2(quantized_multiplier);
-
-// Adjust for the additional 8 bits of lost precision throughout this
-// function:
-int shift_amount = 23;
-if (x_exceeds_24bits) {
-shift_amount = shift_amount - 8;
-}
-if (shift_exceeds_24bits) {
-shift_amount = shift_amount - 8;
-}
-
-// Find the product of x and the quantized_multiplier and right shift
-// to 48bit aligned.
-// (Q23.0 / 2^16) * Q23.0 = Q47.0 / 2^16
-// (Q47.0 / 2^16) >> 7 = Q47.0
-ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2);
-if (shift_amount > 0) {
-result_56 = AE_Q56S_SRA(result_56, shift_amount);
-}
-
-if (shift < 0) {
-// Handle any negative shift directly on the 48 bit value.
-result_56 = AE_Q56S_SRA(result_56, -shift);
-}
-return result_56;
-}
+// INT24 MIN/MAX
+#define INT24_MIN -8388608
+#define INT24_MAX 8388607

 //
 // Multiply 24bit value by a quantized multiplier (w/ shift) and returns a 48bit
@@ -113,62 +41,62 @@ inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
 inline ae_q56s MultiplyByQuantizedMultiplier(ae_p24x2s x_24x2,
 int32_t quantized_multiplier,
 int shift) {
-// NOTE: x_24x2 = Q23.0
-
-// This is an optimized version of a 32 bit MultiplyByQuantizedMultiplier
-// operation of TFLite. Sometimes, the shifted value of |x_24x2| can exceed
-// the limits of INT24, which requires |AE_CONVERT_INT32_24x2()| to load the
-// left-most 24 bits of a 32bit integer. When this occurs, all Q values here
-// carry an additional division of 2^8 to account for this loss in precision.
-// This division will be applied to the final shift after multiplication.
+// A value with 1 sign bit, N integer bits and M fractional bits is
+// represented as QN+1.M since the sign bit is included in the integer bits.
 //
-// The Q-notation comments in this method describe the calculations that take
-// place when both |x| and the shifted value of |1| overflow the INT24 limits.
-bool shift_exceeds_24bits = false;
-
-ae_p24x2s x_shifted_24x2 = x_24x2;
-if (shift > 0) {
-int shifted = 1 << shift;
-if (shifted <= INT24_MIN || shifted >= INT24_MAX) {
-shift_exceeds_24bits = true;
-}
-// Load the shifted value into the PR register:
-// Q31.0 -> Q23.0 / 2^8
-ae_p24x2s shifted_24x2 = AE_CONVERT_INT32_24x2(shifted);
-
-// Q23.0 * (Q23.0 / 2^8) = Q47.0 / 2^8
-ae_q56s sum_56 = AE_MULP24S_HH(x_24x2, shifted_24x2);
-
-// Shift left into 24bit space:
-// ((Q47.0 / 2^8) << 24) = Q23.24 / 2^8
-sum_56 = AE_Q56S_SLAI(sum_56, 24);
-
-// Truncate and place on the PR register:
-// (Q23.24 / 2^8) -> Q23.0 / 2^8
-x_shifted_24x2 = AE_ROUNDSP24Q48SYM(sum_56);
-}
-
+// The Q notation in this method explains the values represented in each
+// variable, along with an implicit division since the quantized_multiplier
+// represents a value between 0.5 and 1.0 (Q1.X-1 where X is the bit precision
+// of the type).
+//
 // Load the quantized multiplier into the PR register.
 // NOTE: This method assumes that this param has been calculated for 24bit
 // space - not 32bits.
-// Q0.31 -> Q0.23
-ae_p24x2s quantized_multiplier_24x2 =
-AE_CONVERT_INT32_24x2(quantized_multiplier);
+// Q32.0 / 2^23 -> Q24.0 / 2^23 representing a Q1.23 multiplier.
+ae_p24x2s quantized_multiplier_24x2 = AE_MOVPA24(quantized_multiplier);
+// Shift right by 23 - 16 bits minus the specified shift. This is because we
+// keep 16 fractional bits until the end to perform rounding. Subtract shift
+// since shift is a left shift, and the 23-16 is a right shift.
+int shift_amount = 7 - shift;

-// Find the product of x and the quantized_multiplier and right shift
-// to 48bit aligned.
-// NOTE: Adjust for the additional 8 bits of lost precision throughout this
-// function:
-// (Q23.0 / 2^8) * Q23.0 = Q47.0 / 2^8
-// (Q47.0 / 2^8) >> 7 = Q47.0
-ae_q56s result = AE_MULP24S_HH(x_shifted_24x2, quantized_multiplier_24x2);
-result = AE_Q56S_SRA(result, shift_exceeds_24bits ? 15 : 23);
+// Find the product of x and the quantized_multiplier.
+// Q24.0 / 2^23 * Q24.0 = Q48.0 / 2^23
+// Q48.0 / 2^23 >> 7 = Q48.0 / 2^16
+ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2);

-if (shift < 0) {
-// Handle any negative shift directly on the 48 bit value.
-result = AE_Q56S_SRA(result, -shift);
-}
-return result;
-}
+// Shift right if shift amount is positive, left if shift amount is negative.
+if (shift_amount >= 0) {
+result_56 = AE_Q56S_SRA(result_56, shift_amount);
+} else {
+result_56 = AE_Q56S_SLA(result_56, -shift_amount);
+}
+
+// Round off the bottom 16 bits.
+// Q48.0 / 2^16 -> Q32.0 aligned to 48 bits.
+result_56 = AE_ROUNDSQ32SYM(result_56);
+return result_56;
+}
+
+//
+// Multiply 32bit value by a quantized multiplier (w/ shift) and returns a 48bit
+// aligned value in the QR register.
+//
+inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
+int32_t quantized_multiplier,
+int shift) {
+// Convert x into a 2x24bit PR register file. If x is outside the numerical
+// limits of a 24bit integer, the "fractional" or lower 8bits are discarded.
+// If x is within the range of a 24 bit integer, the "signed" or upper 8bits
+// are discarded.
+ae_p24x2s x_24x2;
+if (x > INT24_MIN && x < INT24_MAX) {
+x_24x2 = AE_MOVPA24(x);
+} else {
+x_24x2 = static_cast<ae_p24s>(*reinterpret_cast<ae_p24f*>(&x));
+shift += 8;
+}
+
+return MultiplyByQuantizedMultiplier(x_24x2, quantized_multiplier, shift);
+}

 //
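Note: a rough, portable model of what the rewritten 24-bit helper computes, using int64_t in place of the ae_q56s accumulator; this is an illustrative sketch of the arithmetic, not the intrinsic implementation, and the saturation the hardware register provides is not modelled:

    #include <cstdint>

    // Computes round(x * multiplier_q23 * 2^(shift - 23)): x scaled by a Q1.23
    // multiplier and a power-of-two shift, keeping 16 fraction bits until the
    // final symmetric rounding, mirroring the AE_ROUNDSQ32SYM step above.
    int64_t MultiplyByQuantizedMultiplierModel(int32_t x, int32_t multiplier_q23,
                                               int shift) {
      // Q24.0 * (Q24.0 / 2^23) = Q48.0 / 2^23
      int64_t product = static_cast<int64_t>(x) * multiplier_q23;

      // >> (23 - 16 - shift): carry 16 fraction bits through the shift.
      const int shift_amount = 7 - shift;
      product = (shift_amount >= 0) ? (product >> shift_amount)
                                    : (product << -shift_amount);

      // Round away the remaining 16 fraction bits, ties away from zero.
      return (product >= 0) ? (product + (1 << 15)) >> 16
                            : -((-product + (1 << 15)) >> 16);
    }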
@@ -193,6 +121,8 @@ inline void QuantizeMultiplier(float multiplier, int32_t* quantized_multiplier,
 }
 TFLITE_CHECK_LE(q_fixed, INT24_MAX);

+// Ensure shift does not exceed 24-bit range.
+TFLITE_CHECK_LE(*shift, 23);
 if (*shift < -23) {
 *shift = 0;
 q_fixed = 0;
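Note: for context, a sketch of the kind of 24-bit quantization this check guards. The helper below is hypothetical (not the TFLM QuantizeMultiplier itself): it splits a positive float multiplier into a Q1.23 fixed-point value and a left shift, and the new TFLITE_CHECK_LE(*shift, 23) above simply asserts that the shift stays within the range representable alongside 23 fraction bits.

    #include <cmath>
    #include <cstdint>

    void QuantizeMultiplier24Sketch(float multiplier, int32_t* q_fixed, int* shift) {
      if (multiplier == 0.0f) { *q_fixed = 0; *shift = 0; return; }
      // multiplier = q * 2^shift with q in [0.5, 1) for positive inputs.
      const float q = std::frexp(multiplier, shift);
      int64_t fixed = static_cast<int64_t>(std::round(q * (1 << 23)));  // Q1.23
      if (fixed == (1 << 23)) {  // q rounded up to exactly 1.0
        fixed /= 2;
        ++(*shift);
      }
      if (*shift < -23) {  // too small to represent; flush to zero
        *shift = 0;
        fixed = 0;
      }
      *q_fixed = static_cast<int32_t>(fixed);
    }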
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"

 namespace tflite {
 namespace ops {
@@ -108,9 +107,6 @@ inline void FullyConnected(
 sum_56 = MultiplyByQuantizedMultiplier(sum_24x2, output_multiplier,
 output_shift);

-// Align from 48bit to 32bit on the QR register:
-sum_56 = AE_Q56S_SLAI(sum_56, 16);
-
 // Add output_offset and cap min/max values:
 sum_56 = AE_ADDQ56(sum_56, output_offset_56);
 sum_56 = AE_MINQ56S(sum_56, output_activation_max_56);
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"

 namespace tflite {
 namespace ops {
@@ -43,7 +42,7 @@ void AffineQuantize(int scale_multiplier,

 const ae_p16x2s* input_data_ptr = (const ae_p16x2s*)(input_data - 2);

-ae_p24x2s scale_multiplier_24x2 = AE_CONVERT_INT32_24x2(scale_multiplier);
+ae_p24x2s scale_multiplier_24x2 = AE_MOVPA24(scale_multiplier);

 int iters = flat_size / 2;
 for (int i = 0; i < iters; i++) {
@@ -25,8 +25,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/op_macros.h"
 #include "tensorflow/lite/micro/kernels/activation_utils.h"
 #include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
 #include "tensorflow/lite/micro/micro_utils.h"

 namespace tflite {
 namespace ops {
@@ -99,7 +97,7 @@ void EvalIntegerSVDF(

 ae_q56s output_int16_max_56 = AE_CVTQ48A32S(INT16_MAX);
 ae_q56s output_int16_min_56 = AE_CVTQ48A32S(INT16_MIN);
-ae_p24x2s input_zp_24x2 = AE_CONVERT_INT32_24x2(input_zp);
+ae_p24x2s input_zp_24x2 = AE_MOVPA24(input_zp);

 for (int b = 0; b < n_batch; b++) {
 const int8_t* weight_feature_ptr = weight_feature - 2;
@@ -140,8 +138,6 @@ void EvalIntegerSVDF(
 tflite::ops::micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
 dot_prod_24x2, scale_1_a, scale_1_b);

-// Align from 48bit to 32bit on the QR register
-dot_prod_56 = AE_Q56S_SLAI(dot_prod_56, 16);
 // Cap min/max and convert to int32:
 dot_prod_56 = AE_MAXQ56S(dot_prod_56, output_int16_min_56);
 dot_prod_56 = AE_MINQ56S(dot_prod_56, output_int16_max_56);
@@ -232,8 +228,6 @@ void EvalIntegerSVDF(
 ae_q56s x_56 =
 tflite::ops::micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
 scratch_output_tensor[i], scale_2_a, scale_2_b);
-// Align from 48bit to 32bit on the QR register:
-x_56 = AE_Q56S_SLAI(x_56, 16);
 // Add output adjustment:
 x_56 = AE_ADDQ56(x_56, output_zp_56);
 // Cap min/max and convert to int32 (already aligned to 32bit):
@@ -1,42 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_HIFIMINI_UTILS_H_
-#define TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_HIFIMINI_UTILS_H_
-
-#include <xtensa/tie/xt_hifi2.h>
-
-#include <cstdint>
-
-// INT24 MIN/MAX
-#define INT24_MIN -8388608
-#define INT24_MAX 8388607
-
-// Converts an int32 value into a 2x24bit PR register file. If the int32 value
-// is outside the numerical limits of a 24bit integer, the "fractional" or lower
-// 8bits are discarded. If the value is within the range of a 24 bit integer,
-// the "signed" or upper 8bits are discarded.
-inline ae_p24x2s AE_CONVERT_INT32_24x2(int32_t v) {
-if (v > INT24_MIN && v < INT24_MAX) {
-return *reinterpret_cast<ae_p24s*>(&v);
-} else {
-return static_cast<ae_p24s>(*reinterpret_cast<ae_p24f*>(&v));
-}
-}
-
-// Shifts a 48bit accumulator value into 32bit space and returns the value.
-#define AE_CONVERT_Q56_INT32(v) AE_TRUNCA32Q48(AE_Q56S_SLAI(v, 16))
-
-#endif  // TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_HIFIMINI_UTILS_H_
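Note: the deleted AE_CONVERT_INT32_24x2() helper silently chose between two behaviours depending on the input's magnitude; the new 32-bit MultiplyByQuantizedMultiplier overload above makes that choice, and the required shift compensation, explicit. A portable illustration of the same idea using plain integers rather than the PR-register intrinsics (an assumption-laden sketch, not TFLM code):

    #include <cstdint>

    constexpr int32_t kInt24Min = -8388608;  // -2^23, matches INT24_MIN above
    constexpr int32_t kInt24Max = 8388607;   //  2^23 - 1, matches INT24_MAX above

    // Returns a value that fits in 24 bits. Small values are kept exactly; large
    // values keep only their top 24 bits, and *shift is bumped by 8 so a later
    // multiply-and-shift can undo the lost scaling (the "shift += 8" above).
    int32_t NarrowTo24Bits(int32_t v, int* shift) {
      if (v > kInt24Min && v < kInt24Max) {
        return v;      // exact: the whole value fits in 24 bits
      }
      *shift += 8;     // compensate for dropping the 8 low bits
      return v >> 8;   // keep the most significant 24 bits
    }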