Support quantized Mul with a rescale multiplier > 1, even though that is a pathological case.

It occurs in dummy-quantized models.

While doing this, we noticed some vmovn instructions that should have been the saturating vqmovn.

Bump the version of Mul when the rescale multiplier is greater than 1.

PiperOrigin-RevId: 261140129
Benoit Jacob 2019-08-01 09:52:13 -07:00 committed by TensorFlower Gardener
parent 4d4df97469
commit 01ebab4044
12 changed files with 133 additions and 42 deletions
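
For context on the change: the quantized Mul rescale is real_multiplier = input1_scale * input2_scale / output_scale. When the output scale is much narrower than the product of the input scales (as in dummy-quantized models), that multiplier can be >= 1, i.e. the quantized multiplier comes with a non-negative output_shift. The diffs below split that shift into a left shift applied before the fixed-point multiply and a rounding right shift applied after, and switch the narrowing from vmovn to the saturating vqmovn because the widened intermediate can now exceed int16 range. Below is a minimal scalar sketch of that rescale, assuming a Q31 quantized_multiplier in [2^30, 2^31); RescaleSketch is a hypothetical name, not the actual TFLite helper.

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical scalar sketch of rescaling by a quantized multiplier whose
// exponent may be positive (real multiplier >= 1) or negative (< 1). It
// mirrors the left_shift / right_shift split used in the NEON paths below;
// the real kernels use vqrdmulhq_n_s32 and gemmlowp::RoundingDivideByPOT.
int32_t RescaleSketch(int32_t x, int32_t quantized_multiplier, int shift) {
  const int left_shift = std::max(0, shift);
  const int right_shift = std::max(0, -shift);
  // Pre-shift; assumed not to overflow for the small products seen in Mul.
  const int32_t shifted_x = x * (1 << left_shift);
  // Rounding doubling high multiply: roughly round(shifted_x * mult / 2^31).
  const int64_t prod = static_cast<int64_t>(shifted_x) * quantized_multiplier;
  int32_t high = static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);
  // Rounding divide by 2^right_shift, rounding half away from zero.
  if (right_shift > 0) {
    const int32_t mask = (int32_t{1} << right_shift) - 1;
    const int32_t remainder = high & mask;
    const int32_t threshold = (mask >> 1) + (high < 0 ? 1 : 0);
    high = (high >> right_shift) + (remainder > threshold ? 1 : 0);
  }
  return high;
}
```

For example, a real multiplier of 2.0 quantizes to roughly 1 << 30 with shift = +2 (left shift only), while 0.25 quantizes to the same fixed-point value with shift = -1 (right shift only).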

View File

@@ -44,6 +44,9 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
vdup_n_s8(params.quantized_activation_min);
const auto output_activation_max_vector =
vdup_n_s8(params.quantized_activation_max);
const int left_shift = std::max(0, params.output_shift);
const int right_shift = std::max(0, -params.output_shift);
const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
for (; i <= size - 8; i += 8) {
// We load / store 8 at a time, multiplying as two sets of 4 int32s.
const auto input1_val_original = vld1_s8(input1_data + i);
@@ -61,14 +64,16 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
auto p1 = vmull_s16(input2_val_low, input1_val_low);
auto p2 = vmull_s16(input2_val_high, input1_val_high);
p1 = vshlq_s32(p1, left_shift_vec);
p2 = vshlq_s32(p2, left_shift_vec);
p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
using gemmlowp::RoundingDivideByPOT;
p1 = RoundingDivideByPOT(p1, -params.output_shift);
p2 = RoundingDivideByPOT(p2, -params.output_shift);
p1 = RoundingDivideByPOT(p1, right_shift);
p2 = RoundingDivideByPOT(p2, right_shift);
const auto p1_narrowed = vmovn_s32(p1);
const auto p2_narrowed = vmovn_s32(p2);
const auto p1_narrowed = vqmovn_s32(p1);
const auto p2_narrowed = vqmovn_s32(p2);
const auto p =
vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
const auto clamped =
@@ -83,7 +88,7 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
@@ -114,6 +119,9 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
vdup_n_s8(params.quantized_activation_min);
const auto output_activation_max_vector =
vdup_n_s8(params.quantized_activation_max);
const int left_shift = std::max(0, params.output_shift);
const int right_shift = std::max(0, -params.output_shift);
const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
for (; i <= size - 8; i += 8) {
// We load / store 8 at a time, multiplying as two sets of 4 int32s.
const auto input2_val_original = vld1_s8(input2_data + i);
@@ -126,14 +134,16 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
auto p1 = vmull_n_s16(input2_val_low, input1_val);
auto p2 = vmull_n_s16(input2_val_high, input1_val);
p1 = vshlq_s32(p1, left_shift_vec);
p2 = vshlq_s32(p2, left_shift_vec);
p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
using gemmlowp::RoundingDivideByPOT;
p1 = RoundingDivideByPOT(p1, -params.output_shift);
p2 = RoundingDivideByPOT(p2, -params.output_shift);
p1 = RoundingDivideByPOT(p1, right_shift);
p2 = RoundingDivideByPOT(p2, right_shift);
const auto p1_narrowed = vmovn_s32(p1);
const auto p2_narrowed = vmovn_s32(p2);
const auto p1_narrowed = vqmovn_s32(p1);
const auto p2_narrowed = vqmovn_s32(p2);
const auto p =
vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
const auto clamped =
@@ -147,7 +157,7 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =

View File

@@ -2055,6 +2055,9 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
vdup_n_u8(params.quantized_activation_min);
const auto output_activation_max_vector =
vdup_n_u8(params.quantized_activation_max);
const int left_shift = std::max(0, params.output_shift);
const int right_shift = std::max(0, -params.output_shift);
const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
for (; i <= size - 8; i += 8) {
// We load / store 8 at a time, multiplying as two sets of 4 int32s.
const auto input1_val_original = vld1_u8(input1_data + i);
@@ -2074,14 +2077,16 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
auto p1 = vmull_s16(input2_val_low, input1_val_low);
auto p2 = vmull_s16(input2_val_high, input1_val_high);
p1 = vshlq_s32(p1, left_shift_vec);
p2 = vshlq_s32(p2, left_shift_vec);
p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
using gemmlowp::RoundingDivideByPOT;
p1 = RoundingDivideByPOT(p1, -params.output_shift);
p2 = RoundingDivideByPOT(p2, -params.output_shift);
p1 = RoundingDivideByPOT(p1, right_shift);
p2 = RoundingDivideByPOT(p2, right_shift);
const auto p1_narrowed = vmovn_s32(p1);
const auto p2_narrowed = vmovn_s32(p2);
const auto p1_narrowed = vqmovn_s32(p1);
const auto p2_narrowed = vqmovn_s32(p2);
const auto p =
vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
const auto clamped =
@@ -2096,7 +2101,7 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
@@ -2126,6 +2131,9 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
vdup_n_u8(params.quantized_activation_min);
const auto output_activation_max_vector =
vdup_n_u8(params.quantized_activation_max);
const int left_shift = std::max(0, params.output_shift);
const int right_shift = std::max(0, -params.output_shift);
const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
for (; i <= size - 8; i += 8) {
// We load / store 8 at a time, multiplying as two sets of 4 int32s.
const auto input2_val_original = vld1_u8(input2_data + i);
@@ -2139,11 +2147,13 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
auto p1 = vmull_n_s16(input2_val_low, input1_val);
auto p2 = vmull_n_s16(input2_val_high, input1_val);
p1 = vshlq_s32(p1, left_shift_vec);
p2 = vshlq_s32(p2, left_shift_vec);
p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
using gemmlowp::RoundingDivideByPOT;
p1 = RoundingDivideByPOT(p1, -params.output_shift);
p2 = RoundingDivideByPOT(p2, -params.output_shift);
p1 = RoundingDivideByPOT(p1, right_shift);
p2 = RoundingDivideByPOT(p2, right_shift);
const auto p1_narrowed = vmovn_s32(p1);
const auto p2_narrowed = vmovn_s32(p2);

View File

@@ -30,7 +30,7 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
@@ -112,8 +112,8 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params,
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplierSmallerThanOneExp(
input1_val * input2_val, params.output_multiplier,
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output = std::min(
params.quantized_activation_max,

View File

@@ -899,7 +899,7 @@ inline void MulElementwise(int size, const ArithmeticParams& params,
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output =
@@ -1003,8 +1003,8 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params,
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 unclamped_result =
params.output_offset +
MultiplyByQuantizedMultiplierSmallerThanOneExp(
input1_val * input2_val, params.output_multiplier,
MultiplyByQuantizedMultiplier(input1_val * input2_val,
params.output_multiplier,
params.output_shift);
const int32 clamped_output = std::min(
params.quantized_activation_max,

View File

@@ -101,8 +101,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
output->type == kTfLiteInt16) {
double real_multiplier =
input1->params.scale * input2->params.scale / output->params.scale;
QuantizeMultiplierSmallerThanOneExp(
real_multiplier, &data->output_multiplier, &data->output_shift);
QuantizeMultiplier(real_multiplier, &data->output_multiplier,
&data->output_shift);
}
return context->ResizeTensor(context, output, output_size);
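
The Prepare change above stops assuming the real multiplier is below 1: QuantizeMultiplierSmallerThanOneExp only handles multipliers < 1, while QuantizeMultiplier accepts any non-negative value and can emit a positive shift. As a rough sketch of what such a frexp-based quantizer looks like (an illustrative sketch, not necessarily the exact TFLite implementation; the function name is hypothetical):

```cpp
#include <cmath>
#include <cstdint>

// Illustrative frexp-based multiplier quantization: represents
// real_multiplier as q * 2^shift with q a Q31 fixed-point value in
// [2^30, 2^31). shift may be positive when real_multiplier >= 1.
void QuantizeMultiplierSketch(double real_multiplier,
                              int32_t* quantized_multiplier, int* shift) {
  if (real_multiplier == 0.0) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(real_multiplier, shift);  // q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {  // rounding pushed q up to exactly 1.0
    q_fixed /= 2;
    ++*shift;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
```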

View File

@@ -206,12 +206,37 @@ void NoActivation() {
kQuantizedTolerance)));
}
template <TensorType tensor_type, typename integer_dtype>
void NoActivationLargeMultiplier() {
// TODO(b/138722124): Remove this after setting the appropriate op version (3)
// for dependent tests.
if (SingleOpModel::GetForceUseNnapi()) {
// NNAPI doesn't currently support Mul with multiplier>1.
return;
}
// Intentionally pathological output range much narrower than needed
// to represent input values to exercise the multiplier>1 case.
QuantizedMulOpModel m({tensor_type, {1, 2, 2, 1}, -100, 100},
{tensor_type, {1, 2, 2, 1}, -100, 100},
{tensor_type, {}, -10, 10},
ActivationFunctionType_NONE);
m.QuantizeAndPopulate<integer_dtype>(m.input1(), {-4, 2, 3, 1});
m.QuantizeAndPopulate<integer_dtype>(m.input2(), {-1, -3, 4, 2});
m.Invoke();
// Note the large tolerance. This computation is inherently inaccurate.
const float kTolerance = 1.4f;
EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
ElementsAreArray(ArrayFloatNear({4, -6, 10, 2}, kTolerance)));
}
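For reference on why this test exercises the new path (assuming the test helper derives the 8-bit scale as (max - min) / 255): the input scale is 200/255 ≈ 0.784 and the output scale is 20/255 ≈ 0.078, so the rescale value is 0.784 * 0.784 / 0.078 ≈ 7.8, well above 1, which the old SmallerThanOneExp quantization could not represent.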
TEST(QuantizedMulOpTest, NoActivationUInt8) {
NoActivation<TensorType_UINT8, uint8_t>();
NoActivationLargeMultiplier<TensorType_UINT8, uint8_t>();
}
TEST(QuantizedMulOpTest, NoActivationInt8) {
NoActivation<TensorType_INT8, int8_t>();
NoActivationLargeMultiplier<TensorType_INT8, int8_t>();
}
TEST(QuantizedMulOpTest, NoActivationInt16) {

View File

@@ -219,7 +219,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
/* min_version */ 1,
/* max_version */ 2);
AddBuiltin(BuiltinOperator_MUL, Register_MUL(), /* min_version */ 1,
/* max_version */ 2);
/* max_version */ 3);
AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION(),
/* min_version */ 1,
/* max_version */ 2);

View File

@@ -198,6 +198,9 @@ void SingleOpModel::SetForceUseNnapi(bool use_nnapi) {
force_use_nnapi = use_nnapi;
}
// static
bool SingleOpModel::GetForceUseNnapi() { return force_use_nnapi; }
int32_t SingleOpModel::GetTensorSize(int index) const {
TfLiteTensor* t = interpreter_->tensor(index);
CHECK(t);

View File

@@ -358,6 +358,7 @@ class SingleOpModel {
// Enables NNAPI delegate application during interpreter creation.
static void SetForceUseNnapi(bool use_nnapi);
static bool GetForceUseNnapi();
protected:
int32_t GetTensorSize(int index) const;

View File

@@ -78,6 +78,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) {
{{OperatorType::kMinimum, 2}, "1.14.0"},
{{OperatorType::kMul, 1}, "1.5.0"},
{{OperatorType::kMul, 2}, "1.14.0"},
{{OperatorType::kMul, 3}, kPendingReleaseOpVersion},
{{OperatorType::kPad, 1}, "1.5.0"},
{{OperatorType::kPad, 2}, "1.14.0"},
{{OperatorType::kTile, 1}, "1.10.1"},

View File

@@ -778,10 +778,23 @@ class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
}
int GetVersion(const OperatorSignature& op_signature) const override {
const string& input_name = op_signature.op->inputs[0];
const Array& input_array = op_signature.model->GetArray(input_name);
const string& input1_name = op_signature.op->inputs[0];
const string& input2_name = op_signature.op->inputs[1];
const string& output_name = op_signature.op->outputs[0];
const Array& input1_array = op_signature.model->GetArray(input1_name);
const Array& input2_array = op_signature.model->GetArray(input2_name);
const Array& output_array = op_signature.model->GetArray(output_name);
const auto& input1_quant = input1_array.quantization_params;
const auto& input2_quant = input2_array.quantization_params;
const auto& output_quant = output_array.quantization_params;
// Version 3 supports a rescale value greater than or equal to 1.
if (input1_quant && input2_quant && output_quant &&
(input1_quant->scale * input2_quant->scale / output_quant->scale) >=
1.0) {
return 3;
}
// Version 2 supports signed int8 input types.
if (input_array.data_type == ArrayDataType::kInt8) {
if (input1_array.data_type == ArrayDataType::kInt8) {
return 2;
}
return 1;
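As a concrete example of this versioning rule (the same configuration the new VersioningMulTest below uses): with both input scales equal to 1.0 and an output scale of 0.5, the rescale value is 1.0 * 1.0 / 0.5 = 2.0 >= 1, so GetVersion returns 3 regardless of input type; with an output scale of 2.0 the rescale value is 0.5, and the int8 check then decides between versions 2 and 1.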

View File

@@ -917,7 +917,35 @@ TEST_F(OperatorTest, VersioningAddTest) { SimpleVersioningTest<AddOperator>(); }
TEST_F(OperatorTest, VersioningSubTest) { SimpleVersioningTest<SubOperator>(); }
TEST_F(OperatorTest, VersioningMulTest) { SimpleVersioningTest<MulOperator>(); }
void SimpleMulVersioningTest(ArrayDataType data_type, float multiplier,
int version) {
MulOperator op;
op.inputs = {"input1", "input2"};
op.outputs = {"output"};
auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
const BaseOperator* base_op = operator_by_type_map.at(op.type).get();
Model model;
Array& input0 = model.GetOrCreateArray(op.inputs[0]);
Array& input1 = model.GetOrCreateArray(op.inputs[1]);
Array& output = model.GetOrCreateArray(op.outputs[0]);
input0.data_type = data_type;
input0.GetOrCreateQuantizationParams().scale = 1.0f;
input1.data_type = data_type;
input1.GetOrCreateQuantizationParams().scale = 1.0f;
output.data_type = data_type;
output.GetOrCreateQuantizationParams().scale = 1.0f / multiplier;
OperatorSignature signature = {.op = &op, .model = &model};
EXPECT_EQ(base_op->GetVersion(signature), version);
}
TEST_F(OperatorTest, VersioningMulTest) {
SimpleMulVersioningTest(ArrayDataType::kUint8, 0.5f, 1);
SimpleMulVersioningTest(ArrayDataType::kInt8, 0.5f, 2);
SimpleMulVersioningTest(ArrayDataType::kInt8, 2.0f, 3);
}
TEST_F(OperatorTest, VersioningPadTest) { SimpleVersioningTest<PadOperator>(); }