Added a 16-bit version of the ADD/SUB operators. Broadcasting is included.

Elena Zhelezina 2019-12-19 09:09:38 +00:00
parent a0c6417678
commit b94cb4732a
5 changed files with 147 additions and 44 deletions
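For orientation before the per-file diffs: the "general rescaling" path that this change opens up to 16-bit works the same way as the existing uint8/int8 path. Each input is zero-point-adjusted, promoted by a left shift (20 for 8-bit, 15 for 16-bit), scaled to a common scale, summed in a 32-bit accumulator, then rescaled, offset and clamped to the output range. The sketch below is a simplified standalone illustration of that flow, not code from this commit: it uses double-precision multipliers where the real kernels use fixed-point multiplier/shift pairs, and all names are illustrative.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Simplified quantized add: float multipliers stand in for TFLite's
    // fixed-point multiplier/shift pairs. left_shift is 20 (8-bit) or 15 (16-bit).
    template <typename T>
    std::vector<T> QuantizedAddSketch(
        const std::vector<T>& in1, const std::vector<T>& in2,
        int32_t in1_offset, int32_t in2_offset, double in1_multiplier,
        double in2_multiplier, double out_multiplier, int32_t out_offset,
        int left_shift, int32_t act_min, int32_t act_max) {
      std::vector<T> out(in1.size());
      for (std::size_t i = 0; i < in1.size(); ++i) {
        // Zero-point adjust and promote; this fits in int32 because
        // 65535 << 15 (and 255 << 20 for 8-bit) is below 1 << 31.
        const int32_t shifted1 = (in1_offset + in1[i]) * (1 << left_shift);
        const int32_t shifted2 = (in2_offset + in2[i]) * (1 << left_shift);
        // Bring both operands to a common scale; each input multiplier is
        // at most 0.5, so the 32-bit sum cannot overflow.
        const int32_t scaled1 = static_cast<int32_t>(shifted1 * in1_multiplier);
        const int32_t scaled2 = static_cast<int32_t>(shifted2 * in2_multiplier);
        // Rescale the sum to the output scale, add the output zero point, clamp.
        const int32_t raw_out =
            static_cast<int32_t>((scaled1 + scaled2) * out_multiplier) +
            out_offset;
        out[i] = static_cast<T>(std::min(act_max, std::max(act_min, raw_out)));
      }
      return out;
    }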

View File

@@ -93,12 +93,24 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
-    // 8bit -> 8bit general quantized path, with general rescalings
+  // 8bit -> 8bit general quantized path, with general rescalings
+  // as well as, 16bit -> 16bit with general rescalings
+  bool general_16bit = input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16 &&
+                       output->type == kTfLiteInt16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      general_16bit) {
+    // 8bit -> 8bit general quantized path, with general rescalings
+    // as well as, 16bit -> 16bit with general rescalings
     data->input1_offset = -input1->params.zero_point;
     data->input2_offset = -input2->params.zero_point;
     data->output_offset = output->params.zero_point;
-    data->left_shift = 20;
+
+    // The shift is set to 15 for 16-bit and 20 in case of 8-bit, accordingly.
+    // In case of 16-bit we have 65535 << 15 which is less than 1 << 31,
+    // therefore the addition will still fit in a 32 bit accumulator.
+    data->left_shift = general_16bit ? 15 : 20;
+
     const double twice_max_input_scale =
         2 * std::max(input1->params.scale, input2->params.scale);
     const double real_input1_multiplier =
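The new comment condenses the overflow argument for the smaller shift: a zero-point-adjusted 16-bit operand is at most 65535 in magnitude, and 65535 << 15 = 2147450880, which is still below 1 << 31 = 2147483648, so the shifted operands (and their sum, since each per-input multiplier is at most 0.5) stay inside a 32-bit accumulator. A compile-time restatement of that arithmetic, purely illustrative and not part of the commit:

    // 16-bit path: (2^16 - 1) << 15 = 2147450880 < 2^31.
    static_assert((65535LL << 15) < (1LL << 31),
                  "16-bit operands fit in int32 after a 15-bit shift");
    // 8-bit path: (2^8 - 1) << 20 = 267386880 < 2^31.
    static_assert((255LL << 20) < (1LL << 31),
                  "8-bit operands fit in int32 after a 20-bit shift");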
@@ -221,7 +233,12 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input1,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+  bool general_16bit = input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16 &&
+                       output->type == kTfLiteInt16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      general_16bit) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;
@@ -256,6 +273,12 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
         TF_LITE_ADD(optimized_integer_ops, Add, int8_t);
       }
     }
+  } else if (output->type == kTfLiteInt16) {
+    if (need_broadcast) {
+      TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int16_t);
+    } else {
+      TF_LITE_ADD(reference_ops, Add, int16_t);
+    }
   } else {
     if (kernel_type == kReference) {
       if (need_broadcast) {
@@ -286,7 +309,7 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
     // The quantized version of Add doesn't support activations, so we
     // always use BroadcastAdd.
     if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops, Add);
+      TF_LITE_ADD(reference_ops, AddLSTM);
     } else {
       TF_LITE_ADD(optimized_ops, Add);
     }

View File

@@ -306,15 +306,18 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
   float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
-  std::vector<std::vector<float>> inputs1 = {
-      {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
-  std::vector<std::vector<float>> inputs2 = {
-      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}};
-  std::vector<std::vector<float>> results = {
-      {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
+  std::vector<std::vector<float>> inputs1 = {{0.1, 0.2, 0.3, 0.4, 0.9, 0.7},
+                                             {-0.8, 0.2, 0.4, 0.7, 0.1, 0.0},
+                                             {-0.8, 0.2, 0.7, 0.3, 0.9, 0.1}};
+  std::vector<std::vector<float>> inputs2 = {{0.6, 0.4, 0.3, 0.1, -0.1, 0.3},
+                                             {0.6, 0.4, 0.5, -0.8, 0.0, -1.0},
+                                             {0.6, 0.4, -0.8, 0.5, -0.9, 0.1}};
+  std::vector<std::vector<float>> results = {{0.7, 0.6, 0.6, 0.5, 0.8, 1.0},
+                                             {-0.2, 0.6, 0.9, -0.1, 0.1, -1.0},
+                                             {-0.2, 0.6, -0.1, 0.8, 0.0, 0.2}};
   for (size_t i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
-                          {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+    QuantizedAddOpModel m({TensorType_INT16, {1, 2, 3, 1}, kMin, kMax},
+                          {TensorType_INT16, {1, 2, 3, 1}, kMin, kMax},
                           {TensorType_INT16, {}, kMin, kMax},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<int16_t>(m.input1(), inputs1[i]);
@@ -435,6 +438,10 @@ TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt8) {
   QuantizedWithScalarBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt16) {
+  QuantizedWithScalarBroadcast<TensorType_INT16, int16_t>();
+}
+
 template <enum TensorType tensor_type, typename integer_dtype>
 void QuantizedWithMixedBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
@@ -497,6 +504,10 @@ TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt8) {
   QuantizedWithMixedBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt16) {
+  QuantizedWithMixedBroadcast<TensorType_INT16, int16_t>();
+}
+
 template <enum TensorType tensor_type, typename integer_dtype>
 void QuantizedWithGenericBroadcast() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
@@ -523,5 +534,9 @@ TEST(QuantizedAddOpModel, QuantizedWithGenericdBroadcastInt8) {
   QuantizedWithGenericBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithGenericdBroadcastInt16) {
+  QuantizedWithGenericBroadcast<TensorType_INT16, int16_t>();
+}
+
 }  // namespace
 }  // namespace tflite
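The Int16 tests above pin the range to [-1, 32767/32768], which yields the symmetric quantization the 16-bit kernels expect: scale = 1/32768 and zero_point = 0, so the quantized grid step is exactly 2^-15 and the tolerance from GetToleranceInt16 is on the order of one such step. A minimal sketch of that mapping, with illustrative helper names that are not part of the test:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Symmetric int16 quantization over [-1, 32767/32768]:
    // scale = 1/32768, zero_point = 0.
    constexpr float kInt16Scale = 1.0f / 32768.0f;

    int16_t QuantizeInt16(float value) {
      const float clamped =
          std::min(32767.0f * kInt16Scale, std::max(-1.0f, value));
      return static_cast<int16_t>(std::round(clamped / kInt16Scale));
    }

    float DequantizeInt16(int16_t value) { return value * kInt16Scale; }

    // Example: 0.7 quantizes to round(0.7 * 32768) = 22938, which dequantizes
    // to ~0.69995 -- well within the tolerance of the expected results.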

View File

@@ -51,13 +51,18 @@ inline void Add(const ArithmeticParams& params,
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
+
+// This function is used for 8-bit as well as for 16-bit, but the accumulator
+// is 32-bit for both cases. The overflow does not happen due to the
+// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
+template <typename T>
 inline void AddElementwise(int size, const ArithmeticParams& params,
-                           const uint8* input1_data, const uint8* input2_data,
-                           uint8* output_data) {
-  TFLITE_DCHECK_GT(params.input1_offset, -256);
-  TFLITE_DCHECK_GT(params.input2_offset, -256);
-  TFLITE_DCHECK_LT(params.input1_offset, 256);
-  TFLITE_DCHECK_LT(params.input2_offset, 256);
+                           const T* input1_data, const T* input2_data,
+                           T* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
 
   for (int i = 0; i < size; ++i) {
     const int32 input1_val = params.input1_offset + input1_data[i];
@@ -78,7 +83,7 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
     const int32 clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
-    output_data[i] = static_cast<uint8>(clamped_output);
+    output_data[i] = static_cast<T>(clamped_output);
   }
 }
@@ -138,6 +143,24 @@ inline void Add(const ArithmeticParams& params,
                 const RuntimeShape& output_shape, int16* output_data) {
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
+
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  int max_value = std::numeric_limits<int16>::max();
+
+  TFLITE_DCHECK_GT(params.input1_offset, -max_value);
+  TFLITE_DCHECK_GT(params.input2_offset, -max_value);
+  TFLITE_DCHECK_LT(params.input1_offset, max_value);
+  TFLITE_DCHECK_LT(params.input2_offset, max_value);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void AddLSTM(const ArithmeticParams& params,
+                    const RuntimeShape& input1_shape, const int16* input1_data,
+                    const RuntimeShape& input2_shape, const int16* input2_data,
+                    const RuntimeShape& output_shape, int16* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
 
   const int input1_shift = params.input1_shift;
   const int flat_size =
@@ -257,13 +280,14 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
   }
 }
 
-inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
-                               const RuntimeShape& input1_shape,
-                               const uint8* input1_data,
-                               const RuntimeShape& input2_shape,
-                               const uint8* input2_data,
-                               const RuntimeShape& output_shape,
-                               uint8* output_data) {
+// This function is used for 8-bit as well as for 16-bit, but the accumulator
+// is 32-bit for both cases. The overflow does not happen due to the
+// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
+template <typename T>
+inline void BroadcastAdd4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -313,7 +337,7 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
               std::min(params.quantized_activation_max,
                        std::max(params.quantized_activation_min, raw_output));
           output_data[Offset(extended_output_shape, b, y, x, c)] =
-              static_cast<uint8>(clamped_output);
+              static_cast<T>(clamped_output);
         }
       }
     }

View File

@@ -72,13 +72,14 @@ void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
 
-TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
-                              const TfLiteTensor* input_1,
-                              const TfLiteTensor* input_2, TfLiteTensor* output,
-                              TfLiteSubParams* params, OpData* op_params,
-                              int op_sign) {
-  TF_LITE_ENSURE(context,
-                 output->type == kTfLiteUInt8 || output->type == kTfLiteInt8);
+TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
+                                 const TfLiteTensor* input_1,
+                                 const TfLiteTensor* input_2,
+                                 TfLiteTensor* output, TfLiteSubParams* params,
+                                 OpData* op_params, int op_sign) {
+  TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
+                              output->type == kTfLiteInt8 ||
+                              output->type == kTfLiteInt16);
   const auto& input1_quantization_params = input_1->params;
   const auto& input2_quantization_params = input_2->params;
   const auto& output_quantization_params = output->params;
@@ -87,6 +88,9 @@ TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
   if (output->type == kTfLiteUInt8) {
     integer_type_min = std::numeric_limits<uint8_t>::min();
     integer_type_max = std::numeric_limits<uint8_t>::max();
+  } else if (output->type == kTfLiteInt16) {
+    integer_type_min = std::numeric_limits<int16_t>::min();
+    integer_type_max = std::numeric_limits<int16_t>::max();
   } else {
     // output->type == kTfLiteInt8
     integer_type_min = std::numeric_limits<int8_t>::min();
@@ -109,7 +113,11 @@ TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
   op_params->input1_offset = -input1_quantization_params.zero_point;
   op_params->input2_offset = -input2_quantization_params.zero_point;
   op_params->output_offset = output_quantization_params.zero_point;
-  op_params->left_shift = 20;
+
+  // The shift is set to 15 in case of 16-bit and 20 in case of 8-bit,
+  // accordingly. In case of 16-bit we have 65535 << 15 which is less than 1 <<
+  // 31, therefore the addition will still fit in a 32 bit accumulator.
+  op_params->left_shift = output->type == kTfLiteInt16 ? 15 : 20;
   const double twice_max_input_scale =
       2 * std::max(input1_quantization_params.scale,
                    input2_quantization_params.scale);
@@ -135,10 +143,11 @@ TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
   TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
       context, params->activation, output, &op_params->output_activation_min,
       &op_params->output_activation_max));
   return kTfLiteOk;
 }
 
-TfLiteStatus PrepareInt16SubOp(TfLiteContext* context,
-                               const TfLiteTensor* input1,
-                               const TfLiteTensor* input2, TfLiteTensor* output,
-                               TfLiteSubParams* params, OpData* data) {
+TfLiteStatus PrepareLSTMSubOp(TfLiteContext* context,
+                              const TfLiteTensor* input1,
+                              const TfLiteTensor* input2, TfLiteTensor* output,
+                              TfLiteSubParams* params, OpData* data) {
@@ -208,12 +217,21 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_OK(context, Prepare8BitSubOp(context, input1, input2, output,
-                                                params, data, -1));
+  // 8bit -> 8bit general quantized path, with general rescalings
+  // as well as, 16bit -> 16bit with general rescalings
+  bool general_16bit = output->type == kTfLiteInt16 &&
+                       input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      general_16bit) {
+    TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
+                                                   output, params, data, -1));
   } else if (output->type == kTfLiteInt16) {
-    TF_LITE_ENSURE_OK(context, PrepareInt16SubOp(context, input1, input2,
-                                                 output, params, data));
+    // LSTM-special case with scale parameter of POT
+    TF_LITE_ENSURE_OK(context, PrepareLSTMSubOp(context, input1, input2, output,
+                                                params, data));
   }
 
   return context->ResizeTensor(context, output, output_size);
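"POT" in the new comment means power-of-two. The pre-existing int16 path, now renamed PrepareLSTMSubOp, is kept for the LSTM-style case where every scale is an exact power of two (for example 2^-15), so the rescaling reduces to bit shifts; other int16 tensors now go through PrepareGeneralSubOp with ordinary multipliers. One way to recognize a power-of-two scale, shown here only as an illustration and not as code from this commit:

    #include <cmath>

    // True if 'scale' is an exact power of two (e.g. 1/32768 = 2^-15), so the
    // rescaling can be expressed as a pure bit shift; '*shift' gets the exponent.
    bool ScaleIsPowerOfTwo(double scale, int* shift) {
      if (scale <= 0.0) return false;
      int exponent = 0;
      const double fraction = std::frexp(scale, &exponent);  // scale == fraction * 2^exponent
      if (fraction != 0.5) return false;  // exact powers of two give fraction == 0.5
      *shift = exponent - 1;              // scale == 2^(exponent - 1)
      return true;
    }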
@@ -288,6 +306,11 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
       GetTensorShape(input1), GetTensorShape(input2), &op_params);
 
+  // 16bit -> 16bit with general rescaling
+  bool general_16bit = output->type == kTfLiteInt16 &&
+                       input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16;
+
 #define TF_LITE_SUB(type, opname, data_type)                             \
   type::opname(op_params, GetTensorShape(input1),                        \
                GetTensorData<data_type>(input1), GetTensorShape(input2), \
@@ -301,6 +324,12 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     } else {
       TF_LITE_SUB(reference_integer_ops, Add, int8_t);
     }
+  } else if (general_16bit) {
+    if (need_broadcast) {
+      TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, int16_t);
+    } else {
+      TF_LITE_SUB(reference_ops, Add, int16_t);
+    }
   } else if (output->type == kTfLiteUInt8) {
     if (kernel_type == kReference) {
       if (need_broadcast) {

View File

@@ -226,6 +226,10 @@ TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt8) {
   QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16Generic) {
+  QuantizedTestsNoActivation<TensorType_INT16, int16_t>();
+}
+
 template <TensorType tensor_type, typename integer_dtype>
 void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
@@ -287,6 +291,10 @@ TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt8) {
   QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt16) {
+  QuantizedVariousInputShapes<TensorType_INT16, int16_t>();
+}
+
 template <TensorType tensor_type, typename integer_dtype>
 void QuantizedWithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
@@ -315,6 +323,10 @@ TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt8) {
   QuantizedWithBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt16) {
+  QuantizedWithBroadcast<TensorType_INT16, int16_t>();
+}
+
 TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax =