From 01ebab40444bc20202336a36012aedc02210661e Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 1 Aug 2019 09:52:13 -0700 Subject: [PATCH] Support quantized Mul with multiplier>1, even if that's a pathological case. It occurs in dummy-quantized models. While doing this, noticed some vmovn that should be saturating vqmovn. Bump version of mul when rescale multiplier is greater than 1. PiperOrigin-RevId: 261140129 --- .../internal/optimized/integer_ops/mul.h | 38 ++++++++++++------- .../internal/optimized/optimized_ops.h | 28 +++++++++----- .../internal/reference/integer_ops/mul.h | 12 +++--- .../internal/reference/reference_ops.h | 12 +++--- tensorflow/lite/kernels/mul.cc | 4 +- tensorflow/lite/kernels/mul_test.cc | 25 ++++++++++++ tensorflow/lite/kernels/register.cc | 2 +- tensorflow/lite/kernels/test_util.cc | 3 ++ tensorflow/lite/kernels/test_util.h | 1 + tensorflow/lite/toco/tflite/op_version.cc | 1 + tensorflow/lite/toco/tflite/operator.cc | 19 ++++++++-- tensorflow/lite/toco/tflite/operator_test.cc | 30 ++++++++++++++- 12 files changed, 133 insertions(+), 42 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h index fa95f098d63..08b8da09915 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h @@ -44,6 +44,9 @@ inline void MulElementwise(int size, const ArithmeticParams& params, vdup_n_s8(params.quantized_activation_min); const auto output_activation_max_vector = vdup_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); for (; i <= size - 8; i += 8) { // We load / store 8 at a time, multiplying as two sets of 4 int32s. 
const auto input1_val_original = vld1_s8(input1_data + i); @@ -61,14 +64,16 @@ inline void MulElementwise(int size, const ArithmeticParams& params, auto p1 = vmull_s16(input2_val_low, input1_val_low); auto p2 = vmull_s16(input2_val_high, input1_val_high); + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - p1 = RoundingDivideByPOT(p1, -params.output_shift); - p2 = RoundingDivideByPOT(p2, -params.output_shift); + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); - const auto p1_narrowed = vmovn_s32(p1); - const auto p2_narrowed = vmovn_s32(p2); + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); const auto clamped = @@ -83,9 +88,9 @@ inline void MulElementwise(int size, const ArithmeticParams& params, const int32 input2_val = params.input2_offset + input2_data[i]; const int32 unclamped_result = params.output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32 clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); @@ -114,6 +119,9 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, vdup_n_s8(params.quantized_activation_min); const auto output_activation_max_vector = vdup_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); for (; i <= size - 8; i += 8) { // We load / store 8 at a time, multiplying as two sets of 4 int32s. 
const auto input2_val_original = vld1_s8(input2_data + i); @@ -126,14 +134,16 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, auto p1 = vmull_n_s16(input2_val_low, input1_val); auto p2 = vmull_n_s16(input2_val_high, input1_val); + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - p1 = RoundingDivideByPOT(p1, -params.output_shift); - p2 = RoundingDivideByPOT(p2, -params.output_shift); + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); - const auto p1_narrowed = vmovn_s32(p1); - const auto p2_narrowed = vmovn_s32(p2); + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); const auto clamped = @@ -147,9 +157,9 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, const int32 input2_val = params.input2_offset + input2_data[i]; const int32 unclamped_result = params.output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32 clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index 5798ea319de..9e44541369e 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -2055,6 +2055,9 @@ inline void MulElementwise(int size, const ArithmeticParams& params, vdup_n_u8(params.quantized_activation_min); const auto output_activation_max_vector = vdup_n_u8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); for (; i <= size - 8; i += 8) { // We load / store 8 at a time, multiplying as two sets of 4 int32s. 
const auto input1_val_original = vld1_u8(input1_data + i); @@ -2074,14 +2077,16 @@ inline void MulElementwise(int size, const ArithmeticParams& params, auto p1 = vmull_s16(input2_val_low, input1_val_low); auto p2 = vmull_s16(input2_val_high, input1_val_high); + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - p1 = RoundingDivideByPOT(p1, -params.output_shift); - p2 = RoundingDivideByPOT(p2, -params.output_shift); + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); - const auto p1_narrowed = vmovn_s32(p1); - const auto p2_narrowed = vmovn_s32(p2); + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); const auto clamped = @@ -2096,9 +2101,9 @@ inline void MulElementwise(int size, const ArithmeticParams& params, const int32 input2_val = params.input2_offset + input2_data[i]; const int32 unclamped_result = params.output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32 clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); @@ -2126,6 +2131,9 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, vdup_n_u8(params.quantized_activation_min); const auto output_activation_max_vector = vdup_n_u8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); for (; i <= size - 8; i += 8) { // We load / store 8 at a time, multiplying as two sets of 4 int32s. 
const auto input2_val_original = vld1_u8(input2_data + i); @@ -2139,11 +2147,13 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, auto p1 = vmull_n_s16(input2_val_low, input1_val); auto p2 = vmull_n_s16(input2_val_high, input1_val); + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - p1 = RoundingDivideByPOT(p1, -params.output_shift); - p2 = RoundingDivideByPOT(p2, -params.output_shift); + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); const auto p1_narrowed = vmovn_s32(p1); const auto p2_narrowed = vmovn_s32(p2); diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h index dad17fb7f4a..9c629ff2b8e 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h @@ -30,9 +30,9 @@ inline void MulElementwise(int size, const ArithmeticParams& params, const int32 input2_val = params.input2_offset + input2_data[i]; const int32 unclamped_result = params.output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32 clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); @@ -112,9 +112,9 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params, input2_data[SubscriptToIndex(desc2, b, y, x, c)]; const int32 unclamped_result = params.output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp( - input1_val * input2_val, params.output_multiplier, - params.output_shift); + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32 clamped_output = std::min( params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index 31a337a71a3..01eddfa04f7 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -899,9 +899,9 @@ inline void MulElementwise(int size, const ArithmeticParams& params, const int32 input2_val = params.input2_offset + input2_data[i]; const int32 unclamped_result = params.output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32 clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); @@ -1003,9 +1003,9 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params, input2_data[SubscriptToIndex(desc2, b, y, x, c)]; const int32 unclamped_result = params.output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp( - input1_val * input2_val, params.output_multiplier, - params.output_shift); + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32 clamped_output = std::min( 
params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); diff --git a/tensorflow/lite/kernels/mul.cc b/tensorflow/lite/kernels/mul.cc index f11a1f3c426..8c7eaf85cae 100644 --- a/tensorflow/lite/kernels/mul.cc +++ b/tensorflow/lite/kernels/mul.cc @@ -101,8 +101,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { output->type == kTfLiteInt16) { double real_multiplier = input1->params.scale * input2->params.scale / output->params.scale; - QuantizeMultiplierSmallerThanOneExp( - real_multiplier, &data->output_multiplier, &data->output_shift); + QuantizeMultiplier(real_multiplier, &data->output_multiplier, + &data->output_shift); } return context->ResizeTensor(context, output, output_size); diff --git a/tensorflow/lite/kernels/mul_test.cc b/tensorflow/lite/kernels/mul_test.cc index b6a7700aee1..30f9c526659 100644 --- a/tensorflow/lite/kernels/mul_test.cc +++ b/tensorflow/lite/kernels/mul_test.cc @@ -206,12 +206,37 @@ void NoActivation() { kQuantizedTolerance))); } +template <TensorType tensor_type, typename integer_dtype> +void NoActivationLargeMultiplier() { + // TODO(b/138722124): Remove this after setting the appropriate op version (3) + // for dependent tests. + if (SingleOpModel::GetForceUseNnapi()) { + // NNAPI doesn't currently support Mul with multiplier>1. + return; + } + // Intentionally pathological output range much narrower than needed + // to represent input values to exercise the multiplier>1 case. + QuantizedMulOpModel m({tensor_type, {1, 2, 2, 1}, -100, 100}, + {tensor_type, {1, 2, 2, 1}, -100, 100}, + {tensor_type, {}, -10, 10}, + ActivationFunctionType_NONE); + m.QuantizeAndPopulate<integer_dtype>(m.input1(), {-4, 2, 3, 1}); + m.QuantizeAndPopulate<integer_dtype>(m.input2(), {-1, -3, 4, 2}); + m.Invoke(); + // Note the large tolerance. This computation is inherently inaccurate. 
+ const float kTolerance = 1.4f; + EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(), + ElementsAreArray(ArrayFloatNear({4, -6, 10, 2}, kTolerance))); +} + TEST(QuantizedMulOpTest, NoActivationUInt8) { NoActivation<TensorType_UINT8, uint8_t>(); + NoActivationLargeMultiplier<TensorType_UINT8, uint8_t>(); } TEST(QuantizedMulOpTest, NoActivationInt8) { NoActivation<TensorType_INT8, int8_t>(); + NoActivationLargeMultiplier<TensorType_INT8, int8_t>(); } TEST(QuantizedMulOpTest, NoActivationInt16) { diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index 6832ac73f6d..cbb501165ff 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -219,7 +219,7 @@ BuiltinOpResolver::BuiltinOpResolver() { /* min_version */ 1, /* max_version */ 2); AddBuiltin(BuiltinOperator_MUL, Register_MUL(), /* min_version */ 1, - /* max_version */ 2); + /* max_version */ 3); AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION(), /* min_version */ 1, /* max_version */ 2); diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc index 9c4dead65f3..218d76a36b8 100644 --- a/tensorflow/lite/kernels/test_util.cc +++ b/tensorflow/lite/kernels/test_util.cc @@ -198,6 +198,9 @@ void SingleOpModel::SetForceUseNnapi(bool use_nnapi) { force_use_nnapi = use_nnapi; } +// static +bool SingleOpModel::GetForceUseNnapi() { return force_use_nnapi; } + int32_t SingleOpModel::GetTensorSize(int index) const { TfLiteTensor* t = interpreter_->tensor(index); CHECK(t); diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h index 1faae708340..472f27e413b 100644 --- a/tensorflow/lite/kernels/test_util.h +++ b/tensorflow/lite/kernels/test_util.h @@ -358,6 +358,7 @@ class SingleOpModel { // Enables NNAPI delegate application during interpreter creation. static void SetForceUseNnapi(bool use_nnapi); + static bool GetForceUseNnapi(); protected: int32_t GetTensorSize(int index) const; diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc index 1937f3efeb8..61419780989 100644 --- a/tensorflow/lite/toco/tflite/op_version.cc +++ b/tensorflow/lite/toco/tflite/op_version.cc @@ -78,6 +78,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kMinimum, 2}, "1.14.0"}, {{OperatorType::kMul, 1}, "1.5.0"}, {{OperatorType::kMul, 2}, "1.14.0"}, + {{OperatorType::kMul, 3}, kPendingReleaseOpVersion}, {{OperatorType::kPad, 1}, "1.5.0"}, {{OperatorType::kPad, 2}, "1.14.0"}, {{OperatorType::kTile, 1}, "1.10.1"}, diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc index b064ea396e1..a3db7a97fe7 100644 --- a/tensorflow/lite/toco/tflite/operator.cc +++ b/tensorflow/lite/toco/tflite/operator.cc @@ -778,10 +778,23 @@ class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions, int GetVersion(const OperatorSignature& op_signature) const override { - const string& input_name = op_signature.op->inputs[0]; - const Array& input_array = op_signature.model->GetArray(input_name); + const string& input1_name = op_signature.op->inputs[0]; + const string& input2_name = op_signature.op->inputs[1]; + const string& output_name = op_signature.op->outputs[0]; + const Array& input1_array = op_signature.model->GetArray(input1_name); + const Array& input2_array = op_signature.model->GetArray(input2_name); + const Array& output_array = op_signature.model->GetArray(output_name); + const auto& input1_quant = input1_array.quantization_params; + const auto& input2_quant = input2_array.quantization_params; + const auto& output_quant = output_array.quantization_params; + // Version 3 supports a rescale value greater than or equal to 1. 
+ if (input1_quant && input2_quant && output_quant && + (input1_quant->scale * input2_quant->scale / output_quant->scale) >= + 1.0) { + return 3; + } // Version 2 supports signed int8 input types. - if (input_array.data_type == ArrayDataType::kInt8) { + if (input1_array.data_type == ArrayDataType::kInt8) { return 2; } return 1; diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc index 3b007cb2514..557fa196daa 100644 --- a/tensorflow/lite/toco/tflite/operator_test.cc +++ b/tensorflow/lite/toco/tflite/operator_test.cc @@ -917,7 +917,35 @@ TEST_F(OperatorTest, VersioningAddTest) { SimpleVersioningTest<AddOperator>(); } TEST_F(OperatorTest, VersioningSubTest) { SimpleVersioningTest<SubOperator>(); } -TEST_F(OperatorTest, VersioningMulTest) { SimpleVersioningTest<MulOperator>(); } +void SimpleMulVersioningTest(ArrayDataType data_type, float multiplier, + int version) { + MulOperator op; + op.inputs = {"input1", "input2"}; + op.outputs = {"output"}; + auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/); + const BaseOperator* base_op = operator_by_type_map.at(op.type).get(); + + Model model; + Array& input0 = model.GetOrCreateArray(op.inputs[0]); + Array& input1 = model.GetOrCreateArray(op.inputs[1]); + Array& output = model.GetOrCreateArray(op.outputs[0]); + + input0.data_type = data_type; + input0.GetOrCreateQuantizationParams().scale = 1.0f; + input1.data_type = data_type; + input1.GetOrCreateQuantizationParams().scale = 1.0f; + output.data_type = data_type; + output.GetOrCreateQuantizationParams().scale = 1.0f / multiplier; + + OperatorSignature signature = {.op = &op, .model = &model}; + EXPECT_EQ(base_op->GetVersion(signature), version); +} + +TEST_F(OperatorTest, VersioningMulTest) { + SimpleMulVersioningTest(ArrayDataType::kUint8, 0.5f, 1); + SimpleMulVersioningTest(ArrayDataType::kInt8, 0.5f, 2); + SimpleMulVersioningTest(ArrayDataType::kInt8, 2.0f, 3); +} TEST_F(OperatorTest, VersioningPadTest) { SimpleVersioningTest<PadOperator>(); }
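Background on the rescale change, as a standalone sketch rather than part of the patch itself: the Mul rescale multiplier (input1_scale * input2_scale / output_scale) may now exceed 1, so the output shift can be positive. The scalar code below illustrates the split the patch relies on: apply the non-negative part of the shift as a plain left shift before the Q31 fixed-point multiply, and the non-positive part as a rounding right shift after it. All function names ending in "Sketch" are invented for this illustration and are simplified stand-ins for the TFLite/gemmlowp helpers (QuantizeMultiplier, SaturatingRoundingDoublingHighMul, RoundingDivideByPOT, MultiplyByQuantizedMultiplier); saturation and range guards present in the real code are omitted, and the example scales are assumptions mirroring the ranges used in the new test (inputs over [-100, 100], output over [-10, 10]).

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for QuantizeMultiplier: represent a positive real
// multiplier as a Q31 fixed-point value in [0.5, 1.0) times 2^shift.
// shift > 0 means the real multiplier is >= 1.
void QuantizeMultiplierSketch(double real_multiplier, int32_t* quantized_multiplier, int* shift) {
  if (real_multiplier <= 0.0) { *quantized_multiplier = 0; *shift = 0; return; }
  const double q = std::frexp(real_multiplier, shift);  // q in [0.5, 1.0)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
  if (q_fixed == (1ll << 31)) { q_fixed /= 2; ++(*shift); }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}

// Rounding-doubling high multiply, the scalar analogue of vqrdmulhq_n_s32.
// (Saturation of the INT32_MIN * INT32_MIN corner case is omitted here.)
int32_t RoundingDoublingHighMulSketch(int32_t a, int32_t b) {
  const int64_t ab = static_cast<int64_t>(a) * b;
  const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

// Rounding arithmetic right shift, in the spirit of gemmlowp's RoundingDivideByPOT.
int32_t RoundingRightShiftSketch(int32_t x, int exponent) {
  if (exponent == 0) return x;
  const int64_t mask = (1ll << exponent) - 1;
  const int64_t remainder = x & mask;
  const int64_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
  return static_cast<int32_t>((x >> exponent) + (remainder > threshold ? 1 : 0));
}

// The point of the patch: left-shift before the fixed-point multiply,
// rounding right shift after it, so multipliers >= 1 (positive shift) work.
int32_t MultiplyByQuantizedMultiplierSketch(int32_t x, int32_t quantized_multiplier, int shift) {
  const int left_shift = std::max(0, shift);
  const int right_shift = std::max(0, -shift);
  return RoundingRightShiftSketch(
      RoundingDoublingHighMulSketch(x * (1 << left_shift), quantized_multiplier), right_shift);
}

int main() {
  // Assumed scales mirroring the new test: inputs quantized over [-100, 100],
  // output over [-10, 10], so the real rescale multiplier is about 7.84 (> 1).
  const double input1_scale = 200.0 / 255;
  const double input2_scale = 200.0 / 255;
  const double output_scale = 20.0 / 255;
  const double real_multiplier = input1_scale * input2_scale / output_scale;
  int32_t quantized_multiplier = 0;
  int shift = 0;
  QuantizeMultiplierSketch(real_multiplier, &quantized_multiplier, &shift);
  // Offset-adjusted quantized values for the real inputs -4 and -1.
  const int32_t q1 = static_cast<int32_t>(std::lround(-4 / input1_scale));
  const int32_t q2 = static_cast<int32_t>(std::lround(-1 / input2_scale));
  const int32_t rescaled = MultiplyByQuantizedMultiplierSketch(q1 * q2, quantized_multiplier, shift);
  // Prints roughly 39, i.e. about 3.1 in real units versus the exact 4.0;
  // this coarseness is what the test's large tolerance (1.4) accounts for.
  std::printf("rescaled = %d (~%.2f real)\n", rescaled, rescaled * output_scale);
  return 0;
}

The NEON kernels in the patch follow the same split: vshlq_s32 by the non-negative left shift before vqrdmulhq_n_s32, RoundingDivideByPOT by the non-negative right shift after it, and a saturating vqmovn_s32 narrow so that any out-of-range intermediate saturates instead of wrapping.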