Simplify, speed up, and improve accuracy for Xtensa fixed-point utils.

PiperOrigin-RevId: 309153806
Change-Id: I3ae4196fdaad56bdc7f6fd236d0f3c05c0ca50e0
This commit is contained in:
Nat Jeffries 2020-04-29 20:48:39 -07:00 committed by TensorFlower Gardener
parent 645cd8445f
commit 3c520614a3
8 changed files with 67 additions and 205 deletions

View File

@ -69,7 +69,6 @@ cc_library(
"xtensa_hifimini/quantize.cc",
"xtensa_hifimini/softmax.cc",
"xtensa_hifimini/svdf.cc",
"xtensa_hifimini/utils.h",
],
}),
hdrs = ["micro_ops.h"],

View File

@ -25,7 +25,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -66,7 +65,7 @@ void ConvPerChannel(const ConvParams& params, const int32* output_multiplier,
const int output_width = output_shape.Dims(2);
const int output_depth = output_shape.Dims(3);
ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
ae_q56s output_activation_min_56 = AE_CVTQ48A32S(output_activation_min);
ae_q56s output_activation_max_56 = AE_CVTQ48A32S(output_activation_max);
@ -150,9 +149,6 @@ void ConvPerChannel(const ConvParams& params, const int32* output_multiplier,
acc_24x2, output_multiplier[out_channel],
output_shift[out_channel]);
// Shift from 48bit aligned to 32bit:
acc_56 = AE_Q56S_SLAI(acc_56, 16);
// Add output offset, cap activation, and assign to the output:
acc_56 = AE_ADDQ56(acc_56, output_offset_56);
acc_56 = AE_MINQ56S(acc_56, output_activation_max_56);
@ -178,7 +174,7 @@ inline void Conv1x32Input32x32Filter(
const RuntimeShape& filter_shape, const int8* filter_data,
const RuntimeShape& bias_shape, const int32* bias_data,
const RuntimeShape& output_shape, int8* output_data) {
ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
ae_q56s output_activation_max_56 = AE_CVTQ48A32S(quantized_activation_max);
ae_q56s output_activation_min_56 = AE_CVTQ48A32S(quantized_activation_min);
@ -227,13 +223,10 @@ inline void Conv1x32Input32x32Filter(
acc_56 = AE_Q56S_SLAI(acc_56, 8);
ae_p24x2s acc_24x2 = AE_TRUNCP24Q48(acc_56);
// Apply quantized multiplier and accumulate result at 48bit
// alignment:
// Apply quantized multiplier and accumulate result at 48bit alignment.
// Convert the (unsigned) 32-bit multiplier down to a 24-bit multiplier.
acc_56 = micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
acc_24x2, output_multiplier[ch], output_shift[ch]);
// Shift from 48bit aligned to 32bit:
acc_56 = AE_Q56S_SLAI(acc_56, 16);
acc_24x2, output_multiplier[ch] >> 8, output_shift[ch]);
// Add output offset, cap activation, and assign to the output:
acc_56 = AE_ADDQ56(acc_56, output_offset_56);

View File

@ -25,7 +25,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -69,7 +68,7 @@ inline void DepthwiseConvPerChannel(
const int output_width = output_shape.Dims(2);
const int output_depth = output_shape.Dims(3);
ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
ae_q56s output_activation_min_56 = AE_CVTQ48A32S(output_activation_min);
ae_q56s output_activation_max_56 = AE_CVTQ48A32S(output_activation_max);
@ -114,14 +113,14 @@ inline void DepthwiseConvPerChannel(
// shift into 24bit space. Note: value is duplicated in the HH
// and LL register - but all calculations are done on the HH
// side.
ae_p24x2s input_val_24x2 = AE_CONVERT_INT32_24x2(input_val);
ae_p24x2s input_val_24x2 = AE_MOVPA24(input_val);
// Add input offset (24bit aligned):
input_val_24x2 =
AE_P24S_ADDS_P24X2S(input_val_24x2, input_offset_24x2);
// Load filter 8bit value into 24bit alignment:
ae_p24x2s filter_val_24x2 = AE_CONVERT_INT32_24x2(filter_val);
ae_p24x2s filter_val_24x2 = AE_MOVPA24(filter_val);
// Multiply and accumulate the HH side of each 24x24 PR
// register:
@ -150,9 +149,6 @@ inline void DepthwiseConvPerChannel(
acc_24x2, output_multiplier[output_channel],
output_shift[output_channel]);
// Shift from 48bit aligned to 32bit:
acc_56 = AE_Q56S_SLAI(acc_56, 16);
// Add output offset, cap activation, and assign to the output:
acc_56 = AE_ADDQ56(acc_56, output_offset_56);
acc_56 = AE_MINQ56S(acc_56, output_activation_max_56);
@ -181,9 +177,10 @@ inline void DepthwiseConv4x32MatchingInputAndFilter(
const RuntimeShape& filter_shape, const int8* filter_data,
const RuntimeShape& bias_shape, const int32* bias_data,
const RuntimeShape& output_shape, int8* output_data) {
const int32_t mult = output_multiplier[0];
// Convert the (unsigned) 32-bit multiplier down to a 24-bit multiplier.
const int32_t mult = output_multiplier[0] >> 8;
const int32_t shift = output_shift[0];
ae_p24x2s input_offset_24x2 = AE_CONVERT_INT32_24x2(input_offset);
ae_p24x2s input_offset_24x2 = AE_MOVPA24(input_offset);
ae_q56s output_offset_56 = AE_CVTQ48A32S(output_offset);
ae_q56s output_activation_min_56 = AE_CVTQ48A32S(quantized_activation_min);
ae_q56s output_activation_max_56 = AE_CVTQ48A32S(quantized_activation_max);
@ -270,10 +267,6 @@ inline void DepthwiseConv4x32MatchingInputAndFilter(
block_1_acc = micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
acc_24x2_1, mult, shift);
// Shift from 48bit aligned to 32bit:
block_0_acc = AE_Q56S_SLAI(block_0_acc, 16);
block_1_acc = AE_Q56S_SLAI(block_1_acc, 16);
// Add output offset, cap activation, and assign to the output:
block_0_acc = AE_ADDQ56(block_0_acc, output_offset_56);
block_1_acc = AE_ADDQ56(block_1_acc, output_offset_56);

View File

@ -23,7 +23,6 @@ limitations under the License.
#include <cstdint>
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -31,80 +30,9 @@ namespace micro {
namespace xtensa {
namespace hifimini {
//
// Multiply 32bit value by a quantized multiplier (w/ shift) and returns a 48bit
// aligned value in the QR register.
//
inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
int32_t quantized_multiplier,
int shift) {
// Multiplies a 32-bit value |x| by a Q0.23-style |quantized_multiplier| with
// an additional power-of-two |shift|, returning the product 48-bit aligned in
// a QR register. Intermediate math is done on 24-bit PR registers, so any
// operand wider than INT24 loses its low 8 bits on load; the bookkeeping
// below compensates for that lost factor of 2^8 at the final shift.
//
// These boolean factors will carry an additional 2^8 (e.g 256) factor
// throughout the equation to cover the missing 8 bits of precision when a
// 32bit integer is outside the bounds of INT24. The additional scaling factor
// will be adjusted after the final multiplication in this method.
//
// The Q-notation comments in this method describe the calculations that take
// place when both |x| and the shifted value of |1| overflow the INT24 limits.
bool x_exceeds_24bits = (x <= INT24_MIN || x >= INT24_MAX);
bool shift_exceeds_24bits = false;
// Q31.0 -> Q23.0 / 2^8
ae_p24x2s x_24x2 = AE_CONVERT_INT32_24x2(x);
if (shift > 0) {
// Positive shift: pre-multiply |x| by 2^shift in 24-bit space.
int shifted = 1 << shift;
if (shifted <= INT24_MIN || shifted >= INT24_MAX) {
shift_exceeds_24bits = true;
}
// Load the shifted value into the PR register:
// Q31.0 -> Q23.0 / 2^8
ae_p24x2s shifted_24x2 = AE_CONVERT_INT32_24x2(shifted);
// (Q23.0 / 2^8) * (Q23.0 / 2^8) = Q47.0 / 2^16
ae_q56s sum_56 = AE_MULP24S_HH(x_24x2, shifted_24x2);
// Shift left into 24bit space:
// ((Q47.0 / 2^16) << 24) = Q23.24 / 2^16
sum_56 = AE_Q56S_SLAI(sum_56, 24);
// Truncate and place on the PR register:
// (Q23.24 / 2^16) -> Q23.0 / 2^16
x_24x2 = AE_TRUNCP24Q48(sum_56);
}
// Load the quantized multiplier into the PR register.
// NOTE: This method assumes that this param has been calculated for 24bit
// space - not 32bits.
// Q0.31 -> Q0.23
ae_p24x2s quantized_multiplier_24x2 =
AE_CONVERT_INT32_24x2(quantized_multiplier);
// Adjust for the additional 8 bits of lost precision throughout this
// function:
int shift_amount = 23;
if (x_exceeds_24bits) {
shift_amount = shift_amount - 8;
}
if (shift_exceeds_24bits) {
shift_amount = shift_amount - 8;
}
// Find the product of x and the quantized_multiplier and right shift
// to 48bit aligned.
// (Q23.0 / 2^16) * Q23.0 = Q47.0 / 2^16
// (Q47.0 / 2^16) >> 7 = Q47.0
ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2);
if (shift_amount > 0) {
result_56 = AE_Q56S_SRA(result_56, shift_amount);
}
if (shift < 0) {
// Handle any negative shift directly on the 48 bit value.
result_56 = AE_Q56S_SRA(result_56, -shift);
}
return result_56;
}
// INT24 MIN/MAX
#define INT24_MIN -8388608
#define INT24_MAX 8388607
//
// Multiply 24bit value by a quantized multiplier (w/ shift) and returns a 48bit
@ -113,62 +41,62 @@ inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
inline ae_q56s MultiplyByQuantizedMultiplier(ae_p24x2s x_24x2,
                                             int32_t quantized_multiplier,
                                             int shift) {
  // Multiplies a 24-bit PR value |x_24x2| by a 24-bit |quantized_multiplier|
  // (a Q1.23 value representing [0.5, 1.0)) with an additional power-of-two
  // |shift|, returning the symmetric-rounded product 48-bit aligned in a QR
  // register.
  //
  // A value with 1 sign bit, N integer bits and M fractional bits is
  // represented as QN+1.M since the sign bit is included in the integer bits.
  //
  // The Q notation in this method explains the values represented in each
  // variable, along with an implicit division since the quantized_multiplier
  // represents a value between 0.5 and 1.0 (Q1.X-1 where X is the bit
  // precision of the type).
  //
  // Load the quantized multiplier into the PR register.
  // NOTE: This method assumes that this param has been calculated for 24bit
  // space - not 32bits.
  // Q32.0 / 2^23 -> Q24.0 / 2^23 representing a Q1.23 multiplier.
  ae_p24x2s quantized_multiplier_24x2 = AE_MOVPA24(quantized_multiplier);
  // Shift right by 23 - 16 bits minus the specified shift. This is because we
  // keep 16 fractional bits until the end to perform rounding. Subtract shift
  // since shift is a left shift, and the 23-16 is a right shift.
  int shift_amount = 7 - shift;
  // Find the product of x and the quantized_multiplier.
  // Q24.0 / 2^23 * Q24.0 = Q48.0 / 2^23
  ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2);
  // Shift right if shift amount is positive, left if shift amount is negative.
  if (shift_amount >= 0) {
    // Q48.0 / 2^23 >> shift_amount = Q48.0 / 2^16
    result_56 = AE_Q56S_SRA(result_56, shift_amount);
  } else {
    result_56 = AE_Q56S_SLA(result_56, -shift_amount);
  }
  // Round off the bottom 16 bits.
  // Q48.0 / 2^16 -> Q32.0 aligned to 48 bits.
  result_56 = AE_ROUNDSQ32SYM(result_56);
  return result_56;
}
//
// Multiply 32bit value by a quantized multiplier (w/ shift) and returns a 48bit
// aligned value in the QR register.
//
inline ae_q56s MultiplyByQuantizedMultiplier(int32_t x,
                                             int32_t quantized_multiplier,
                                             int shift) {
  // Load |x| into a 2x24bit PR register file before delegating to the 24-bit
  // overload. Per the conversion convention here: a value outside the INT24
  // numerical limits keeps only its upper 24 bits (the lower 8 "fractional"
  // bits are discarded, compensated by bumping |shift| by 8); a value inside
  // the range keeps its lower 24 bits (the upper 8 "signed" bits are
  // discarded).
  ae_p24x2s x_24x2;
  const bool outside_24bit_range = (x <= INT24_MIN || x >= INT24_MAX);
  if (outside_24bit_range) {
    // Reinterpret the top 24 bits and account for the dropped precision.
    x_24x2 = static_cast<ae_p24s>(*reinterpret_cast<ae_p24f*>(&x));
    shift += 8;
  } else {
    // Fits in 24 bits: move directly with full precision.
    x_24x2 = AE_MOVPA24(x);
  }
  return MultiplyByQuantizedMultiplier(x_24x2, quantized_multiplier, shift);
}
//
@ -193,6 +121,8 @@ inline void QuantizeMultiplier(float multiplier, int32_t* quantized_multiplier,
}
TFLITE_CHECK_LE(q_fixed, INT24_MAX);
// Ensure shift does not exceed 24-bit range.
TFLITE_CHECK_LE(*shift, 23);
if (*shift < -23) {
*shift = 0;
q_fixed = 0;

View File

@ -25,7 +25,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -108,9 +107,6 @@ inline void FullyConnected(
sum_56 = MultiplyByQuantizedMultiplier(sum_24x2, output_multiplier,
output_shift);
// Align from 48bit to 32bit on the QR register:
sum_56 = AE_Q56S_SLAI(sum_56, 16);
// Add output_offset and cap min/max values:
sum_56 = AE_ADDQ56(sum_56, output_offset_56);
sum_56 = AE_MINQ56S(sum_56, output_activation_max_56);

View File

@ -22,7 +22,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
namespace tflite {
namespace ops {
@ -43,7 +42,7 @@ void AffineQuantize(int scale_multiplier,
const ae_p16x2s* input_data_ptr = (const ae_p16x2s*)(input_data - 2);
ae_p24x2s scale_multiplier_24x2 = AE_CONVERT_INT32_24x2(scale_multiplier);
ae_p24x2s scale_multiplier_24x2 = AE_MOVPA24(scale_multiplier);
int iters = flat_size / 2;
for (int i = 0; i < iters; i++) {

View File

@ -25,8 +25,6 @@ limitations under the License.
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/activation_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifimini/utils.h"
#include "tensorflow/lite/micro/micro_utils.h"
namespace tflite {
namespace ops {
@ -99,7 +97,7 @@ void EvalIntegerSVDF(
ae_q56s output_int16_max_56 = AE_CVTQ48A32S(INT16_MAX);
ae_q56s output_int16_min_56 = AE_CVTQ48A32S(INT16_MIN);
ae_p24x2s input_zp_24x2 = AE_CONVERT_INT32_24x2(input_zp);
ae_p24x2s input_zp_24x2 = AE_MOVPA24(input_zp);
for (int b = 0; b < n_batch; b++) {
const int8_t* weight_feature_ptr = weight_feature - 2;
@ -140,8 +138,6 @@ void EvalIntegerSVDF(
tflite::ops::micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
dot_prod_24x2, scale_1_a, scale_1_b);
// Align from 48bit to 32bit on the QR register
dot_prod_56 = AE_Q56S_SLAI(dot_prod_56, 16);
// Cap min/max and convert to int32:
dot_prod_56 = AE_MAXQ56S(dot_prod_56, output_int16_min_56);
dot_prod_56 = AE_MINQ56S(dot_prod_56, output_int16_max_56);
@ -232,8 +228,6 @@ void EvalIntegerSVDF(
ae_q56s x_56 =
tflite::ops::micro::xtensa::hifimini::MultiplyByQuantizedMultiplier(
scratch_output_tensor[i], scale_2_a, scale_2_b);
// Align from 48bit to 32bit on the QR register:
x_56 = AE_Q56S_SLAI(x_56, 16);
// Add output adjustment:
x_56 = AE_ADDQ56(x_56, output_zp_56);
// Cap min/max and convert to int32 (already aligned to 32bit):

View File

@ -1,42 +0,0 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_HIFIMINI_UTILS_H_
#define TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_HIFIMINI_UTILS_H_
#include <xtensa/tie/xt_hifi2.h>
#include <cstdint>
// INT24 MIN/MAX
#define INT24_MIN -8388608
#define INT24_MAX 8388607
// Converts an int32 value into a 2x24bit PR register file. If the int32 value
// is outside the numerical limits of a 24bit integer, the "fractional" or lower
// 8bits are discarded. If the value is within the range of a 24 bit integer,
// the "signed" or upper 8bits are discarded.
inline ae_p24x2s AE_CONVERT_INT32_24x2(int32_t v) {
  // Guard clause: values at or beyond the INT24 numerical limits keep their
  // upper 24 bits (the lower 8 "fractional" bits are discarded).
  const bool outside_24bit_range = (v <= INT24_MIN || v >= INT24_MAX);
  if (outside_24bit_range) {
    return static_cast<ae_p24s>(*reinterpret_cast<ae_p24f*>(&v));
  }
  // In-range values keep their lower 24 bits (the upper 8 "signed" bits are
  // discarded).
  return *reinterpret_cast<ae_p24s*>(&v);
}
// Shifts a 48bit accumulator value into 32bit space and returns the value.
#define AE_CONVERT_Q56_INT32(v) AE_TRUNCA32Q48(AE_Q56S_SLAI(v, 16))
#endif // TENSORFLOW_LITE_MICRO_KERNELS_XTENSA_HIFIMINI_UTILS_H_