diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 44934fc1971..8527ec648ad 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -312,6 +312,7 @@ cc_library(
         "reference/integer_ops/dequantize.h",
         "reference/integer_ops/fully_connected.h",
         "reference/integer_ops/logistic.h",
+        "reference/integer_ops/mul.h",
        "reference/integer_ops/pooling.h",
         "reference/integer_ops/softmax.h",
         "reference/integer_ops/tanh.h",
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
new file mode 100644
index 00000000000..5e33d089945
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
@@ -0,0 +1,130 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void MulElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data,
+                           const int8_t* input2_data, int8_t* output_data) {
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+                                                       params.output_multiplier,
+                                                       params.output_shift);
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+// Mul with 16 bit inputs and int8_t outputs.
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16* input1_data,
+                const RuntimeShape& input2_shape, const int16* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Mul/Int16Int8");
+  int32 output_offset = params.output_offset;
+  int32 output_activation_min = params.quantized_activation_min;
+  int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 unclamped_result =
+        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+    int16 rescaled_result =
+        gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+    int16 clamped_result =
+        std::min<int16>(output_activation_max - output_offset, rescaled_result);
+    clamped_result =
+        std::max<int16>(output_activation_min - output_offset, clamped_result);
+    output_data[i] = output_offset + clamped_result;
+  }
+}
+
+inline void BroadcastMul4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  // The input shapes are extended as part of NdArrayDesc initialization.
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32 input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32 input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32 unclamped_result =
+              params.output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  input1_val * input2_val, params.output_multiplier,
+                  params.output_shift);
+          const int32 clamped_output = std::min(
+              params.quantized_activation_max,
+              std::max(params.quantized_activation_min, unclamped_result));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<int8_t>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
diff --git a/tensorflow/lite/kernels/mul.cc b/tensorflow/lite/kernels/mul.cc
index 01039a70543..e0ff6724ea2 100644
--- a/tensorflow/lite/kernels/mul.cc
+++ b/tensorflow/lite/kernels/mul.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
@@ -87,8 +88,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                                   &data->output_activation_min,
                                   &data->output_activation_max);
   }
+  if (output->type == kTfLiteInt8) {
+    CalculateActivationRangeInt8(params->activation, output,
+                                 &data->output_activation_min,
+                                 &data->output_activation_max);
+  }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      output->type == kTfLiteInt16) {
     double real_multiplier =
         input1->params.scale * input2->params.scale / output->params.scale;
     QuantizeMultiplierSmallerThanOneExp(
@@ -151,8 +158,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            TfLiteMulParams* params, const OpData* data,
                            const TfLiteTensor* input1,
                            const TfLiteTensor* input2, TfLiteTensor* output) {
-  if (input1->type == kTfLiteUInt8 && input2->type == kTfLiteUInt8 &&
-      output->type == kTfLiteUInt8) {
+  if (input1->type == input2->type && input1->type == output->type &&
+      (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8)) {
     tflite::ArithmeticParams op_params;
     SetActivationParams(data->output_activation_min,
                         data->output_activation_max, &op_params);
@@ -163,23 +170,31 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     op_params.output_shift = data->output_shift;
     bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
         GetTensorShape(input1), GetTensorShape(input2), &op_params);
-#define TF_LITE_MUL(type, opname)                                      \
-  type::opname(op_params, GetTensorShape(input1),                      \
-               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
-               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-
-    if (kernel_type == kReference) {
+#define TF_LITE_MUL(type, opname, dtype)                             \
+  type::opname(op_params, GetTensorShape(input1),                    \
+               GetTensorData<dtype>(input1), GetTensorShape(input2), \
+               GetTensorData<dtype>(input2), GetTensorShape(output), \
+               GetTensorData<dtype>(output))
+    if (input1->type == kTfLiteInt8) {
       if (need_broadcast) {
-        TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
+        TF_LITE_MUL(reference_integer_ops, BroadcastMul4DSlow, int8_t);
       } else {
-        TF_LITE_MUL(reference_ops, Mul);
+        TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
       }
     } else {
-      if (need_broadcast) {
-        TF_LITE_MUL(optimized_ops, BroadcastMulFivefold);
+      // type == kTfLiteUInt8
+      if (kernel_type == kReference) {
+        if (need_broadcast) {
+          TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, uint8_t);
+        } else {
+          TF_LITE_MUL(reference_ops, Mul, uint8_t);
+        }
       } else {
-        TF_LITE_MUL(optimized_ops, Mul);
+        if (need_broadcast) {
+          TF_LITE_MUL(optimized_ops, BroadcastMulFivefold, uint8_t);
+        } else {
+          TF_LITE_MUL(optimized_ops, Mul, uint8_t);
+        }
       }
     }
 #undef TF_LITE_MUL
@@ -198,8 +213,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     }
 #undef TF_LITE_MUL
   } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
-             output->type == kTfLiteUInt8) {
-#define TF_LITE_MUL(type, opname)                                  \
+             (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8)) {
+#define TF_LITE_MUL(type, opname, output_dtype)                    \
   tflite::ArithmeticParams op_params;                              \
   SetActivationParams(data->output_activation_min,                 \
                       data->output_activation_max, &op_params);    \
@@ -207,11 +222,15 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<int16_t>(input1), GetTensorShape(input2), \
                GetTensorData<int16_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-    if (kernel_type == kReference) {
-      TF_LITE_MUL(reference_ops, Mul);
+               GetTensorData<output_dtype>(output))
+    if (output->type == kTfLiteInt8) {
+      TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
     } else {
-      TF_LITE_MUL(optimized_ops, Mul);
+      if (kernel_type == kReference) {
+        TF_LITE_MUL(reference_ops, Mul, uint8_t);
+      } else {
+        TF_LITE_MUL(optimized_ops, Mul, uint8_t);
+      }
     }
 #undef TF_LITE_MUL
   } else {
@@ -233,14 +252,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalMul<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     TF_LITE_ENSURE_OK(
         context, EvalQuantized<kernel_type>(context, node, params, data,
                                             input1, input2, output));
   } else {
     context->ReportError(context,
-                         "Mul only supports FLOAT32, INT32 and quantized UINT8 "
-                         "and INT16 now, got %d.",
+                         "Mul only supports FLOAT32, INT32 and quantized UINT8,"
+                         " INT8 and INT16 now, got %d.",
                          output->type);
     return kTfLiteError;
   }
diff --git a/tensorflow/lite/kernels/mul_test.cc b/tensorflow/lite/kernels/mul_test.cc
index 200cc26dadc..96f5a8a0e07 100644
--- a/tensorflow/lite/kernels/mul_test.cc
+++ b/tensorflow/lite/kernels/mul_test.cc
@@ -73,9 +73,10 @@ class QuantizedMulOpModel : public BaseMulOpModel {
  public:
   using BaseMulOpModel::BaseMulOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
   }
 
   std::vector<float> GetDequantizedOutputInt16() {
@@ -191,19 +192,28 @@ TEST(IntegerMulOpTest, WithBroadcast) {
   }
 }
 
-TEST(QuantizedMulOpTest, NoActivation) {
-  QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                        {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                        {TensorType_UINT8, {}, -1.0, 1.0},
+template <TensorType tensor_type, typename integer_dtype>
+void NoActivation() {
+  QuantizedMulOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                        {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                        {tensor_type, {}, -1.0, 1.0},
                         ActivationFunctionType_NONE);
-  m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
-  m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
+  m.QuantizeAndPopulate<integer_dtype>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
+  m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.6, 0.4, 0.9, 0.8});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
               ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
                                               kQuantizedTolerance)));
 }
 
+TEST(QuantizedMulOpTest, NoActivationUInt8) {
+  NoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, NoActivationInt8) {
+  NoActivation<TensorType_INT8, int8_t>();
+}
+
 TEST(QuantizedMulOpTest, NoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
@@ -219,23 +229,32 @@ TEST(QuantizedMulOpTest, NoActivationInt16) {
                                               kQuantizedToleranceInt16)));
 }
 
-TEST(QuantizedMulOpTest, NoActivationInt16WithUint8Output) {
+template <TensorType tensor_type, typename integer_dtype>
+void NoActivationInt16With8BitOutput() {
   const float kMinInt16 = -1.f;
   const float kMaxInt16 = 32767.f / 32768.f;
   const float kMinUint8 = -1.f;
   const float kMaxUint8 = 127.f / 128.f;
   QuantizedMulOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16},
                         {TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16},
-                        {TensorType_UINT8, {}, kMinUint8, kMaxUint8},
+                        {tensor_type, {}, kMinUint8, kMaxUint8},
                         ActivationFunctionType_NONE);
   m.QuantizeAndPopulate<int16_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
   m.QuantizeAndPopulate<int16_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
               ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
                                               kQuantizedTolerance)));
 }
 
+TEST(QuantizedMulOpTest, NoActivationInt16WithUint8Output) {
+  NoActivationInt16With8BitOutput<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, NoActivationInt16Withint8Output) {
+  NoActivationInt16With8BitOutput<TensorType_INT8, int8_t>();
+}
+
 // for quantized Mul, the error shouldn't exceed 2*step
 float GetTolerance(int min, int max) {
   float kQuantizedStep = (max - min) / 255.0;
@@ -243,25 +262,35 @@ float GetTolerance(int min, int max) {
   return kQuantizedTolerance;
 }
 
-TEST(QuantizedMulOpTest, WithBroadcast) {
+template <TensorType tensor_type, typename integer_dtype>
+void WithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},  // always a scalar
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedMulOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},  // always a scalar
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
+TEST(QuantizedMulOpTest, WithBroadcastUInt8) {
+  WithBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, WithBroadcastInt8) {
+  WithBroadcast<TensorType_INT8, int8_t>();
+}
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 6f8e48bb58f..e7dbe341618 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -764,6 +764,12 @@ class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
   }
 
   int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index 434922e6168..f898cc5bc46 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -841,6 +841,8 @@ TEST_F(OperatorTest, VersioningAddTest) { SimpleVersioningTest<AddOperator>(); }
 
 TEST_F(OperatorTest, VersioningSubTest) { SimpleVersioningTest<SubOperator>(); }
 
+TEST_F(OperatorTest, VersioningMulTest) { SimpleVersioningTest<MulOperator>(); }
+
 TEST_F(OperatorTest, VersioningPadTest) { SimpleVersioningTest<PadOperator>(); }
 
 TEST_F(OperatorTest, VersioningPadV2Test) {
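
Usage sketch (not part of the patch above): the new int8 path in reference/integer_ops/mul.h can be driven directly by filling in tflite::ArithmeticParams the same way EvalQuantized in mul.cc does: negated input zero points as the input offsets, the output zero point as the output offset, and the combined scale folded into output_multiplier and output_shift via QuantizeMultiplierSmallerThanOneExp. The scales, zero points, and tensor values below are made-up illustrations, and the snippet assumes the TF Lite source tree and gemmlowp are on the include path.

// Illustrative only; not part of the patch. Exercises the int8 reference
// kernel with hand-picked quantization parameters.
#include <cstdint>
#include <cstdio>
#include <limits>

#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
#include "tensorflow/lite/kernels/internal/types.h"

int main() {
  // Assumed symmetric int8 quantization: real_value = scale * quantized_value.
  const float input1_scale = 1.0f / 128.0f;
  const float input2_scale = 1.0f / 128.0f;
  const float output_scale = 1.0f / 128.0f;

  tflite::ArithmeticParams op_params;
  op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
  op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
  // As in mul.cc: input offsets are the negated zero points (zero here), and
  // the output offset is the output zero point.
  op_params.input1_offset = 0;
  op_params.input2_offset = 0;
  op_params.output_offset = 0;
  // Fold input1_scale * input2_scale / output_scale into a fixed-point
  // multiplier plus shift, as Prepare() does.
  const double real_multiplier =
      static_cast<double>(input1_scale) * input2_scale / output_scale;
  tflite::QuantizeMultiplierSmallerThanOneExp(
      real_multiplier, &op_params.output_multiplier, &op_params.output_shift);

  const tflite::RuntimeShape shape({1, 2, 2, 1});
  const int8_t input1[4] = {-102, 26, 115, 90};  // roughly {-0.8, 0.2, 0.9, 0.7}
  const int8_t input2[4] = {77, 51, 115, 102};   // roughly {0.6, 0.4, 0.9, 0.8}
  int8_t output[4] = {};

  tflite::reference_integer_ops::Mul(op_params, shape, input1, shape, input2,
                                     shape, output);
  for (int i = 0; i < 4; ++i) {
    // Dequantize with output_scale to compare against the float products.
    std::printf("%d (%.3f)\n", output[i], output[i] * output_scale);
  }
  return 0;
}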