From a858c19b0c10a89639a4897155952d8c3bbd26de Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Wed, 18 Sep 2019 17:56:26 +0100
Subject: [PATCH 1/8] New implementation of TANH/Sigmoid 16-bit activation
 functions using LUT.

We think the reference functions for 16-bit activations are too complex
for efficient implementation on resource-constrained platforms and
propose to replace them with a lookup table approach as follows:
First rescale the input data to the fixed range of -10.7 to +10.7.
Then use a 256-entry lookup table for Sigmoid, followed by linear
interpolation, to efficiently derive the result. The Sigmoid LUT is
also used for the TANH function, because tanh(x) = 2*sigmoid(2*x) - 1
and the symmetry of both functions is taken into account.

The proposed reference kernel implementation also has higher accuracy
than the existing one. For the current functions we measure a
difference of up to 6.3 quantized units for sigmoid and 11.7 for tanh
compared to the floating-point reference implementation over the
16-bit input range (representing -8.0 to +8.0). With the
implementation in this patch the error is reduced to less than 1.5
quantized units compared to the floating-point reference for both
tanh and sigmoid.

Change-Id: I4d1406928db65740c1750c9cd7bfffab30771419
---
 tensorflow/lite/kernels/activations.cc      | 148 +++++++++++++++++++-
 tensorflow/lite/kernels/activations_test.cc |  18 ++-
 2 files changed, 153 insertions(+), 13 deletions(-)

diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index f43f61128ec..d0babbbcf0a 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -60,7 +60,8 @@ struct OpData {
   int input_left_shift = 0;
   int32_t input_range_radius = 0;
   int diff_min = 0;
-  uint8_t table[256] = {0};
+  uint16_t table[256] = {0};
+  uint16_t* table_zero = nullptr;
 };
 
 struct SoftmaxOpData {
@@ -154,6 +155,54 @@ inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4],
 #endif
 
 // TODO(b/143696793): move this to optimized_ops.
+// We use combined sigmoid and tanh look-up table, since
+// tanh(x) = 2*sigmoid(2*x) -1.
+// Both functions are symmetric, so the LUT table is only needed
+// for the absolute value of the input.
+void PopulateLookupTableSigmoid(struct OpData* data) {
+
+  // Table of sigmoid(i/24) at 0.16 format - 256 elements.
+
+  auto table = std::initializer_list<uint16_t>({
+      32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513,
+      38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688,
+      43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369,
+      47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409,
+      51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755,
+      55125, 55485, 55834, 56174, 56503, 56823, 57133, 57433,
+      57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519,
+      59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110,
+      61279, 61441, 61599, 61750, 61896, 62036, 62172, 62302,
+      62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186,
+      63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835,
+      63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308,
+      64357, 64405, 64450, 64494, 64536, 64576, 64614, 64652,
+      64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900,
+      64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079,
+      65097, 65115, 65132, 65149, 65164, 65179, 65194, 65208,
+      65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301,
+      65310, 65319, 65328, 65337, 65345, 65352, 65360, 65367,
+      65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415,
+      65420, 65425, 65429, 65433, 65438, 65442, 65445, 65449,
+      65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474,
+      65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491,
+      65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504,
+      65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513,
+      65514, 65515, 65516, 65517, 65517, 65518, 65519, 65520,
+      65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524,
+      65525, 65525, 65526, 65526, 65526, 65527, 65527, 65528,
+      65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530,
+      65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532,
+      65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533,
+      65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534,
+      65534, 65534, 65534, 65534, 65534, 65534, 65534, 65535
+  });
+
+  std::copy(table.begin(), table.end(), data->table);
+
+  data->table_zero = &data->table[0];
+}
+
 void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input,
                           TfLiteTensor* output) {
   const int size =
@@ -211,6 +260,89 @@ void QuantizedReluX(float act_min, float act_max, const TfLiteTensor* input,
                        GetTensorShape(output), GetTensorData<T>(output));
 }
 
+void EvalUsingLookupTableSigmoid16Bit(struct OpData* data, const TfLiteTensor* input,
+                                      TfLiteTensor* output) {
+
+  const int size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
+
+  int16_t* ptr_output_data = GetTensorData<int16_t>(output);
+  const int16_t* ptr_input_data = GetTensorData<int16_t>(input);
+
+  for (int i = 0; i < size; ++i, ptr_output_data++, ptr_input_data++) {
+    int32_t input_data = *ptr_input_data;
+
+    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
+    // we do interpolation on unsigned values.
+    uint32_t abs_input_data = 3*abs(input_data);
+
+    // We divide by 2 power of 9, because
+    // we need to divide by 2 in power of 7 for
+    // the input conversion + 1/4 from the scale above.
+    uint8_t uh = abs_input_data >> 9;
+    uint32_t ua = data->table_zero[uh];
+    uint32_t ub = data->table_zero[uh+1];
+    uint32_t ut = abs_input_data & 0x1ff;
+
+    // Interpolation is done using the fractional bit.
+    uint32_t result = (ua << 9) + ut * (ub - ua);
+
+    result = (input_data >=0) ? (result + (1 << 9)) :
+                                ((1 << (16 + 9)) - result + (1 << 9) - 1);
+
+    // Back to 16-bit.
+    result >>= 10;
+
+    *ptr_output_data = result;
+  }
+}
+
+void EvalUsingLookupTableTanh16Bit(struct OpData* data, const TfLiteTensor* input,
+                                   TfLiteTensor* output) {
+
+  const int size =
+      MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
+
+  const int16_t* ptr_input_data = GetTensorData<int16_t>(input);
+  int16_t* ptr_output_data = GetTensorData<int16_t>(output);
+
+  // We use the LUT for sigmoid and take into account, that
+  // tanh(x) = 2*sigmoid(2*x) - 1
+  for (int i=0; i < size; ++i, ptr_input_data++, ptr_output_data++) {
+
+    int32_t input_data = *ptr_input_data;
+
+    if (data->input_left_shift == 1) {
+      input_data = gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data);
+    }
+
+    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
+    uint32_t abs_input_data = 3*abs(input_data);
+    uint32_t uh = abs_input_data >> 8;
+    int32_t result;
+
+    if (uh >= 255) {
+      // Saturate to maximum.
+      result = 0xFFFF<<8;
+    } else {
+
+      uint32_t ua = data->table_zero[uh];
+      uint32_t ub = data->table_zero[uh+1];
+
+      uint8_t ut = abs_input_data & 0xFF;
+
+      result = (ua<<8) + ut*(ub-ua);
+    }
+
+    result = (input_data>=0) ? (result - (1<<(14+9)) + (1<<(9-2))) :
+                               (-result + (1<<(14+9)) + (1<<(9-2))-1);
+
+    // Convert back to 16-bit.
+    result >>= (9-1);
+
+    *ptr_output_data = result;
+  }
+}
+
 }  // namespace
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -418,6 +550,8 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   } else if (input->type == kTfLiteInt8) {
     PopulateLookupTable<int8_t>(data, input, output,
                                 [](float value) { return std::tanh(value); });
+  } else if (input->type == kTfLiteInt16 && kernel_type == kReference) {
+    PopulateLookupTableSigmoid(data);
   }
 }
 
@@ -509,6 +643,10 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
     PopulateLookupTable<int8_t>(data, input, output, [](float value) {
       return 1.0f / (1.0f + std::exp(-value));
     });
+  } else if (input->type == kTfLiteInt16) {
+    TF_LITE_ENSURE(context, output->params.scale == 1. / 32768);
+    TF_LITE_ENSURE(context, output->params.zero_point == 0.);
+    PopulateLookupTableSigmoid(data);
   }
 }
 
@@ -799,9 +937,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       TanhParams params;
      params.input_left_shift = data->input_left_shift;
       if (kernel_type == kReference) {
-        reference_ops::Tanh(
-            params, GetTensorShape(input), GetTensorData<int16_t>(input),
-            GetTensorShape(output), GetTensorData<int16_t>(output));
+        EvalUsingLookupTableTanh16Bit(data, input, output);
       } else {
         optimized_ops::Tanh(
             params, GetTensorShape(input), GetTensorData<int16_t>(input),
@@ -871,9 +1007,7 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteInt16: {
       LogisticParams params;
       if (kernel_type == kReference) {
-        reference_ops::Logistic(
-            params, GetTensorShape(input), GetTensorData<int16_t>(input),
-            GetTensorShape(output), GetTensorData<int16_t>(output));
+        EvalUsingLookupTableSigmoid16Bit(data, input, output);
       } else {
         optimized_ops::Logistic(
             params, GetTensorShape(input), GetTensorData<int16_t>(input),
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index e80adce9c4c..134af8239b9 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -741,11 +741,13 @@ TEST_P(TanhOpTest, TanhInt16) {
   const float kMax = 32767.f / 32768.f;
   QuantizedActivationsOpModel m(
       GetRegistration(), BuiltinOperator_TANH,
-      /*input=*/{TensorType_INT16, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
-      /*output=*/{TensorType_INT16, {1, 2, 4, 1}, kMin, kMax});
+      /*input=*/{TensorType_INT16, {1, 2, 8, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT16, {1, 2, 8, 1}, kMin, kMax});
   m.SetInput<int16_t>({
       0, -6, 2, 4,   //
       -4, -2, 8, 1,  //
+      7, -8, 3, -5,  //
+      6, -1, -3, 5
   });
   m.Invoke();
   EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       0.0, -0.999987, 0.964027, 0.999329,     //
                       -0.999329, -0.96402, 0.99999, 0.76159,  //
+                      0.999998337, -0.99999, 0.995054754, -0.999909204,  //
+                      0.999999996, -0.76159, -0.995054754, 0.999909204
                   },
                   kQuantizedToleranceInt16)));
 }
@@ -882,18 +886,20 @@ TEST_P(LogisticOpTest, SigmoidInt16) {
   const float kMax = 32767.f / 32768.f;
   QuantizedActivationsOpModel m(
       GetRegistration(), BuiltinOperator_LOGISTIC,
-      /*input=*/{TensorType_INT16, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
-      /*output=*/{TensorType_INT16, {1, 2, 4, 1}, kMin, kMax});
+      /*input=*/{TensorType_INT16, {1, 2, 6, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT16, {1, 2, 6, 1}, kMin, kMax});
   m.SetInput<int16_t>({
       0, -6, 2, 4,  //
-      3, -2, 10, 1,  //
+      3, -2, 8, 1,  //
+      5, -8, 7, -3
   });
   m.Invoke();
   EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       0.5, 0.002473, 0.880797, 0.982014,  //
-                      0.952574, 0.119203, 0.999955, 0.731059,  //
+                      0.952574, 0.119203, 0.9995, 0.731059,  //
+                      0.993307, 0.0003535, 0.999089, 0.047426  //
                   },
                   kQuantizedToleranceInt16)));
 }

From 279f9264c0503b975ee91e6070f8aed2698b51b6 Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Thu, 28 Nov 2019 16:28:35 +0000
Subject: [PATCH 2/8] Small improvement to TANH/Sigmoid implementation.
Change-Id: Ia9fa7e70e15a5174a045ee5f98cf4f78e6a43ef6
---
 tensorflow/lite/kernels/activations.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index d0babbbcf0a..06da2c9a15d 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -312,7 +312,7 @@ void EvalUsingLookupTableTanh16Bit(struct OpData* data, const TfLiteTensor* inpu
     int32_t input_data = *ptr_input_data;
 
     if (data->input_left_shift == 1) {
-      input_data = gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data);
+      input_data <<= 1;
     }
 
     // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].

From eaac6ea535cd2be0b33b0a2cd6664daab096364b Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Tue, 24 Dec 2019 09:25:25 +0000
Subject: [PATCH 3/8] Addressed review comments for TANH/Sigmoid function.

---
 tensorflow/lite/kernels/activations.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 06da2c9a15d..30d1ae6d402 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -550,7 +550,7 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   } else if (input->type == kTfLiteInt8) {
     PopulateLookupTable<int8_t>(data, input, output,
                                 [](float value) { return std::tanh(value); });
-  } else if (input->type == kTfLiteInt16 && kernel_type == kReference) {
+  } else if (input->type == kTfLiteInt16) {
     PopulateLookupTableSigmoid(data);
   }
 }
@@ -936,12 +936,12 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       TanhParams params;
       params.input_left_shift = data->input_left_shift;
-      if (kernel_type == kReference) {
-        EvalUsingLookupTableTanh16Bit(data, input, output);
-      } else {
+      if (kernel_type == kFixedPointOptimized) {
         optimized_ops::Tanh(
             params, GetTensorShape(input), GetTensorData<int16_t>(input),
             GetTensorShape(output), GetTensorData<int16_t>(output));
+      } else {
+        EvalUsingLookupTableTanh16Bit(data, input, output);
       }
       return kTfLiteOk;
     } break;
@@ -1006,12 +1006,12 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
     }
     case kTfLiteInt16: {
       LogisticParams params;
-      if (kernel_type == kReference) {
-        EvalUsingLookupTableSigmoid16Bit(data, input, output);
-      } else {
+      if (kernel_type == kFixedPointOptimized) {
         optimized_ops::Logistic(
             params, GetTensorShape(input), GetTensorData<int16_t>(input),
             GetTensorShape(output), GetTensorData<int16_t>(output));
+      } else {
+        EvalUsingLookupTableSigmoid16Bit(data, input, output);
       }
       break;
     }

From 38eeb4f5d18c6772886b1f41093f4681bb522108 Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Mon, 17 Feb 2020 12:55:02 +0000
Subject: [PATCH 4/8] Addressed reviewer comments.
--- tensorflow/lite/kernels/activations.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 30d1ae6d402..495789e6306 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -60,8 +60,9 @@ struct OpData { int input_left_shift = 0; int32_t input_range_radius = 0; int diff_min = 0; - uint16_t table[256] = {0}; - uint16_t* table_zero = nullptr; + uint8_t table[256] = {0}; + uint16_t table_uint16[256] = {0}; + uint16_t* table_zero_uint16 = nullptr; }; struct SoftmaxOpData { @@ -198,9 +199,9 @@ void PopulateLookupTableSigmoid(struct OpData* data) { 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65535 }); - std::copy(table.begin(), table.end(), data->table); + std::copy(table.begin(), table.end(), data->table_uint16); - data->table_zero = &data->table[0]; + data->table_zero_uint16 = &data->table_uint16[0]; } void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input, @@ -211,7 +212,6 @@ void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input, const uint8_t* input_data = GetTensorData(input); int i = 0; #if __aarch64__ && __clang__ - // This code uses ARM64-only instructions. // TODO(b/143709993): Port to ARMv7 // Load the tables into registers. (4*4 128-bit registers) @@ -279,8 +279,8 @@ void EvalUsingLookupTableSigmoid16Bit(struct OpData* data, const TfLiteTensor* i // we need to divide by 2 in power of 7 for // the input conversion + 1/4 from the scale above. uint8_t uh = abs_input_data >> 9; - uint32_t ua = data->table_zero[uh]; - uint32_t ub = data->table_zero[uh+1]; + uint32_t ua = data->table_zero_uint16[uh]; + uint32_t ub = data->table_zero_uint16[uh+1]; uint32_t ut = abs_input_data & 0x1ff; // Interpolation is done using the fractional bit. @@ -325,8 +325,8 @@ void EvalUsingLookupTableTanh16Bit(struct OpData* data, const TfLiteTensor* inpu result = 0xFFFF<<8; } else { - uint32_t ua = data->table_zero[uh]; - uint32_t ub = data->table_zero[uh+1]; + uint32_t ua = data->table_zero_uint16[uh]; + uint32_t ub = data->table_zero_uint16[uh+1]; uint8_t ut = abs_input_data & 0xFF; @@ -645,7 +645,7 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) { }); } else if (input->type == kTfLiteInt16) { TF_LITE_ENSURE(context, output->params.scale == 1. / 32768); - TF_LITE_ENSURE(context, output->params.zero_point == 0.); + TF_LITE_ENSURE(context, output->params.zero_point == 0); PopulateLookupTableSigmoid(data); } } From e8ea83ab58aa63d9b0b86ff26e30293017022029 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 2 Mar 2020 16:41:18 +0000 Subject: [PATCH 5/8] Moved implementation of Tanh/Sigmoid to integer_reference_ops per discussion. --- tensorflow/lite/kernels/activations.cc | 164 +++--------------- tensorflow/lite/kernels/internal/common.h | 32 ++++ .../internal/reference/integer_ops/logistic.h | 32 ++++ .../internal/reference/integer_ops/tanh.h | 39 +++++ 4 files changed, 125 insertions(+), 142 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 495789e6306..305e2e7f6f6 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -29,6 +29,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h" #include "tensorflow/lite/kernels/internal/reference/logistic.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" @@ -61,8 +62,6 @@ struct OpData { int32_t input_range_radius = 0; int diff_min = 0; uint8_t table[256] = {0}; - uint16_t table_uint16[256] = {0}; - uint16_t* table_zero_uint16 = nullptr; }; struct SoftmaxOpData { @@ -156,54 +155,6 @@ inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4], #endif // TODO(b/143696793): move this to optimized_ops. -// We use combined sigmoid and tanh look-up table, since -// tanh(x) = 2*sigmoid(2*x) -1. -// Both functions are symmetric, so the LUT table is only needed -// for the absolute value of the input. -void PopulateLookupTableSigmoid(struct OpData* data) { - - // Table of sigmoid(i/24) at 0.16 format - 256 elements. - - auto table = std::initializer_list({ - 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, - 38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688, - 43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369, - 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, - 51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755, - 55125, 55485, 55834, 56174, 56503, 56823, 57133, 57433, - 57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519, - 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, - 61279, 61441, 61599, 61750, 61896, 62036, 62172, 62302, - 62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186, - 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835, - 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, - 64357, 64405, 64450, 64494, 64536, 64576, 64614, 64652, - 64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900, - 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, - 65097, 65115, 65132, 65149, 65164, 65179, 65194, 65208, - 65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301, - 65310, 65319, 65328, 65337, 65345, 65352, 65360, 65367, - 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, - 65420, 65425, 65429, 65433, 65438, 65442, 65445, 65449, - 65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474, - 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, - 65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, - 65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513, - 65514, 65515, 65516, 65517, 65517, 65518, 65519, 65520, - 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, - 65525, 65525, 65526, 65526, 65526, 65527, 65527, 65528, - 65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530, - 65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532, - 65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533, - 65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534, - 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65535 - }); - - std::copy(table.begin(), table.end(), data->table_uint16); - - data->table_zero_uint16 = &data->table_uint16[0]; -} - void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input, TfLiteTensor* output) { const int size = @@ -260,89 +211,6 @@ void QuantizedReluX(float act_min, float act_max, const TfLiteTensor* input, GetTensorShape(output), GetTensorData(output)); } -void EvalUsingLookupTableSigmoid16Bit(struct OpData* data, const TfLiteTensor* input, 
- TfLiteTensor* output) { - - const int size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); - - int16_t* ptr_output_data = GetTensorData(output); - const int16_t* ptr_input_data = GetTensorData(input); - - for (int i = 0; i < size; ++i, ptr_output_data++, ptr_input_data++) { - int32_t input_data = *ptr_input_data; - - // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and - // we do interpolation on unsigned values. - uint32_t abs_input_data = 3*abs(input_data); - - // We divide by 2 power of 9, because - // we need to divide by 2 in power of 7 for - // the input conversion + 1/4 from the scale above. - uint8_t uh = abs_input_data >> 9; - uint32_t ua = data->table_zero_uint16[uh]; - uint32_t ub = data->table_zero_uint16[uh+1]; - uint32_t ut = abs_input_data & 0x1ff; - - // Interpolation is done using the fractional bit. - uint32_t result = (ua << 9) + ut * (ub - ua); - - result = (input_data >=0) ? (result + (1 << 9)) : - ((1 << (16 + 9)) - result + (1 << 9) - 1); - - // Back to 16-bit. - result >>= 10; - - *ptr_output_data = result; - } -} - -void EvalUsingLookupTableTanh16Bit(struct OpData* data, const TfLiteTensor* input, - TfLiteTensor* output) { - - const int size = - MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); - - const int16_t* ptr_input_data = GetTensorData(input); - int16_t* ptr_output_data = GetTensorData(output); - - // We use the LUT for sigmoid and take into account, that - // tanh(x) = 2*sigmoid(2*x) - 1 - for (int i=0; i < size; ++i, ptr_input_data++, ptr_output_data++) { - - int32_t input_data = *ptr_input_data; - - if (data->input_left_shift == 1) { - input_data <<= 1; - } - - // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7]. - uint32_t abs_input_data = 3*abs(input_data); - uint32_t uh = abs_input_data >> 8; - int32_t result; - - if (uh >= 255) { - // Saturate to maximum. - result = 0xFFFF<<8; - } else { - - uint32_t ua = data->table_zero_uint16[uh]; - uint32_t ub = data->table_zero_uint16[uh+1]; - - uint8_t ut = abs_input_data & 0xFF; - - result = (ua<<8) + ut*(ub-ua); - } - - result = (input_data>=0) ? (result - (1<<(14+9)) + (1<<(9-2))) : - (-result + (1<<(14+9)) + (1<<(9-2))-1); - - // Convert back to 16-bit. - result >>= (9-1); - - *ptr_output_data = result; - } -} - } // namespace void* Init(TfLiteContext* context, const char* buffer, size_t length) { @@ -550,8 +418,6 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) { } else if (input->type == kTfLiteInt8) { PopulateLookupTable(data, input, output, [](float value) { return std::tanh(value); }); - } else if (input->type == kTfLiteInt16) { - PopulateLookupTableSigmoid(data); } } @@ -646,7 +512,6 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) { } else if (input->type == kTfLiteInt16) { TF_LITE_ENSURE(context, output->params.scale == 1. 
/ 32768); TF_LITE_ENSURE(context, output->params.zero_point == 0); - PopulateLookupTableSigmoid(data); } } @@ -936,12 +801,20 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt16: { TanhParams params; params.input_left_shift = data->input_left_shift; - if (kernel_type == kFixedPointOptimized) { + if (kernel_type == kReference) { + const int size = + MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); + + const int16_t* ptr_input_data = GetTensorData(input); + int16_t* ptr_output_data = GetTensorData(output); + + reference_integer_ops::Tanh(data->input_left_shift, size, + GetTensorData(input), + GetTensorData(output)); + } else { optimized_ops::Tanh( params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - } else { - EvalUsingLookupTableTanh16Bit(data, input, output); } return kTfLiteOk; } break; @@ -1006,12 +879,19 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) { } case kTfLiteInt16: { LogisticParams params; - if (kernel_type == kFixedPointOptimized) { + if (kernel_type == kReference) { + const int size = + MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); + + int16_t* ptr_output_data = GetTensorData(output); + const int16_t* ptr_input_data = GetTensorData(input); + + reference_integer_ops::Logistic(size, GetTensorData(input), + GetTensorData(output)); + } else { optimized_ops::Logistic( params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - } else { - EvalUsingLookupTableSigmoid16Bit(data, input, output); } break; } diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index 0c4fbc1e84e..2f014062287 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -195,6 +195,38 @@ inline int CountLeadingSignBits(T integer_input) { #endif } +// Table of sigmoid(i/24) at 0.16 format - 256 elements. + +// We use combined sigmoid and tanh look-up table, since +// tanh(x) = 2*sigmoid(2*x) -1. +// Both functions are symmetric, so the LUT table is only needed +// for the absolute value of the input. 
+static uint16_t sigmoid_table_uint16[256] = { + 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, 38180, 38841, 39498, + 40149, 40794, 41432, 42064, 42688, 43304, 43912, 44511, 45102, 45683, 46255, + 46817, 47369, 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, 51865, + 52311, 52745, 53169, 53581, 53983, 54374, 54755, 55125, 55485, 55834, 56174, + 56503, 56823, 57133, 57433, 57724, 58007, 58280, 58544, 58800, 59048, 59288, + 59519, 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, 61279, 61441, + 61599, 61750, 61896, 62036, 62172, 62302, 62428, 62549, 62666, 62778, 62886, + 62990, 63090, 63186, 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835, + 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, 64357, 64405, 64450, + 64494, 64536, 64576, 64614, 64652, 64687, 64721, 64754, 64786, 64816, 64845, + 64873, 64900, 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, 65097, + 65115, 65132, 65149, 65164, 65179, 65194, 65208, 65221, 65234, 65246, 65258, + 65269, 65280, 65291, 65301, 65310, 65319, 65328, 65337, 65345, 65352, 65360, + 65367, 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, 65420, 65425, + 65429, 65433, 65438, 65442, 65445, 65449, 65453, 65456, 65459, 65462, 65465, + 65468, 65471, 65474, 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, + 65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, 65505, 65507, 65508, + 65509, 65510, 65511, 65512, 65513, 65514, 65515, 65516, 65517, 65517, 65518, + 65519, 65520, 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, 65525, + 65525, 65526, 65526, 65526, 65527, 65527, 65528, 65528, 65528, 65529, 65529, + 65529, 65529, 65530, 65530, 65530, 65530, 65531, 65531, 65531, 65531, 65531, + 65532, 65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533, 65533, 65533, + 65533, 65533, 65533, 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65534, + 65534, 65534, 65535}; + // TODO(b/77858996): Add these to gemmlowp. template IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) { diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h index 8277c3b3d56..aa626f43f19 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h @@ -58,6 +58,38 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius, } } +inline void Logistic(int32_t input_size, const int16_t* ptr_input_data, + int16_t* ptr_output_data) { + // We use the LUT for sigmoid and take into account, that + // tanh(x) = 2*sigmoid(2*x) - 1 + for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) { + int32_t input_data = *ptr_input_data; + + // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and + // we do interpolation on unsigned values. + uint32_t abs_input_data = 3 * abs(input_data); + + // We divide by 2 power of 9, because + // we need to divide by 2 in power of 7 for + // the input conversion + 1/4 from the scale above. + uint8_t uh = abs_input_data >> 9; + uint32_t ua = sigmoid_table_uint16[uh]; + uint32_t ub = sigmoid_table_uint16[uh + 1]; + uint32_t ut = abs_input_data & 0x1ff; + + // Interpolation is done using the fractional bit. + uint32_t result = (ua << 9) + ut * (ub - ua); + + result = (input_data >= 0) ? (result + (1 << 9)) + : ((1 << (16 + 9)) - result + (1 << 9) - 1); + + // Back to 16-bit. 
+ result >>= 10; + + *ptr_output_data = result; + } +} + } // namespace reference_integer_ops } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h index cc704387f38..95dc969319d 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h @@ -59,6 +59,45 @@ inline void Tanh(int32_t input_zero_point, int32_t input_range_radius, } } +inline void Tanh(int32_t input_left_shift, int32_t input_size, + const int16_t* ptr_input_data, int16_t* ptr_output_data) { + // We use the LUT for sigmoid and take into account, that + // tanh(x) = 2*sigmoid(2*x) - 1 + for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) { + int32_t input_data = *ptr_input_data; + + if (input_left_shift == 1) { + input_data <<= 1; + } + + // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7]. + uint32_t abs_input_data = 3 * abs(input_data); + uint32_t uh = abs_input_data >> 8; + int32_t result; + + if (uh >= 255) { + // Saturate to maximum. + result = 0xFFFF << 8; + } else { + uint32_t ua = sigmoid_table_uint16[uh]; + uint32_t ub = sigmoid_table_uint16[uh + 1]; + + uint8_t ut = abs_input_data & 0xFF; + + result = (ua << 8) + ut * (ub - ua); + } + + result = (input_data >= 0) + ? (result - (1 << (14 + 9)) + (1 << (9 - 2))) + : (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1); + + // Convert back to 16-bit. + result >>= (9 - 1); + + *ptr_output_data = result; + } +} + } // namespace reference_integer_ops } // namespace tflite From 00879a5cdf00ffbfa2c02d1ff75e09f1e5569d88 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 3 Mar 2020 15:16:41 +0000 Subject: [PATCH 6/8] Tidy up. --- tensorflow/lite/kernels/activations.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 305e2e7f6f6..bc47b5fb32c 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -29,7 +29,6 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h" -#include "tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h" #include "tensorflow/lite/kernels/internal/reference/logistic.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" @@ -163,6 +162,7 @@ void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input, const uint8_t* input_data = GetTensorData(input); int i = 0; #if __aarch64__ && __clang__ + // This code uses ARM64-only instructions. // TODO(b/143709993): Port to ARMv7 // Load the tables into registers. (4*4 128-bit registers) From 9140684f7adaddcdb3a377bbe62e4556bbfd4b44 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 18 Mar 2020 12:11:57 +0000 Subject: [PATCH 7/8] Fix for unused variable warning. 
--- tensorflow/lite/kernels/internal/BUILD | 1 + tensorflow/lite/kernels/internal/common.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 8c320720a31..d1ba076e41f 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -196,6 +196,7 @@ cc_library( ":cpu_check", ":types", "@gemmlowp//:fixedpoint", + "//tensorflow/core/platform:macros", ], ) diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index 2f014062287..7f391adb437 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -24,6 +24,7 @@ limitations under the License. #include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/core/platform/macros.h" namespace tflite { @@ -201,7 +202,7 @@ inline int CountLeadingSignBits(T integer_input) { // tanh(x) = 2*sigmoid(2*x) -1. // Both functions are symmetric, so the LUT table is only needed // for the absolute value of the input. -static uint16_t sigmoid_table_uint16[256] = { +TF_ATTRIBUTE_UNUSED static uint16_t sigmoid_table_uint16[256] = { 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, 38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688, 43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369, 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, 51865, From 913a78794dd01b5f7e7bdb36fd7f566712fc11b3 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 25 Mar 2020 10:33:07 +0000 Subject: [PATCH 8/8] Fix for the error with buildifier. --- tensorflow/lite/kernels/internal/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index d1ba076e41f..66a429d9475 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -195,8 +195,8 @@ cc_library( deps = [ ":cpu_check", ":types", - "@gemmlowp//:fixedpoint", "//tensorflow/core/platform:macros", + "@gemmlowp//:fixedpoint", ], )
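
To sanity-check the LUT arithmetic outside the TFLite build, here is a minimal
standalone sketch. It is not part of the patch series: it mirrors the final
16-bit kernels from reference_integer_ops and sweeps the whole int16 input
range against a floating-point reference. The table is regenerated from
sigmoid(i/24) rather than copied, so borderline entries may round one unit
differently from the hardcoded values, and all file and function names here
are illustrative.

// lut_check.cc - standalone sketch of the 16-bit sigmoid/tanh LUT kernels.
// Not part of the patches; names are illustrative.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

namespace {

uint16_t sigmoid_table[256];

// Regenerates the 256-entry table of sigmoid(i/24) in 0.16 format. The patch
// pins the last entry to 65535 (instead of the rounded 65534) so that
// interpolation can reach the maximum.
void BuildTable() {
  for (int i = 0; i < 256; ++i) {
    const double s = 65536.0 / (1.0 + std::exp(-i / 24.0));
    sigmoid_table[i] = static_cast<uint16_t>(std::min(65535.0, std::round(s)));
  }
  sigmoid_table[255] = 65535;
}

// Mirrors the 16-bit sigmoid kernel: input int16 with scale 8/32768,
// output int16 with scale 1/32768.
int16_t SigmoidLut(int16_t input) {
  const int32_t input_data = input;
  // 3*|x| aligns the 1/24 table step with the input scale:
  // |real_value| * 24 == (3 * |x|) / 2^9.
  const uint32_t abs_input = 3 * std::abs(input_data);
  const uint32_t uh = abs_input >> 9;     // integer part: table index (<= 192)
  const uint32_t ut = abs_input & 0x1ff;  // 9 fractional bits
  const uint32_t ua = sigmoid_table[uh];
  const uint32_t ub = sigmoid_table[uh + 1];
  // Linear interpolation; result is sigmoid(|v|) in 0.16 format times 2^9.
  uint32_t result = (ua << 9) + ut * (ub - ua);
  // sigmoid(-v) = 1 - sigmoid(v); the (1 << 9) terms round the final shift
  // down to the 1/32768 output scale.
  result = (input_data >= 0) ? (result + (1 << 9))
                             : ((1 << (16 + 9)) - result + (1 << 9) - 1);
  return static_cast<int16_t>(result >> 10);
}

// Mirrors the 16-bit tanh kernel via tanh(v) = 2*sigmoid(2*v) - 1. The left
// shift (patch 2 replaces the saturating multiply) applies when the input is
// quantized to [-4, 4]; the [-8, 8] quantization used below passes 0.
int16_t TanhLut(int16_t input, int input_left_shift) {
  int32_t input_data = input;
  if (input_left_shift == 1) input_data <<= 1;
  const uint32_t abs_input = 3 * std::abs(input_data);
  const uint32_t uh = abs_input >> 8;  // only 8 fractional bits: absorbs 2*v
  int32_t result;
  if (uh >= 255) {
    result = 0xFFFF << 8;  // saturate sigmoid(2*|v|) at 1.0
  } else {
    const uint32_t ua = sigmoid_table[uh];
    const uint32_t ub = sigmoid_table[uh + 1];
    const uint32_t ut = abs_input & 0xFF;
    result = (ua << 8) + ut * (ub - ua);
  }
  // 2*s - 1 on the 1/32768 output scale: subtract 0.5 (= 1 << (14 + 9)),
  // round (= 1 << (9 - 2)), and use odd symmetry for negative inputs.
  result = (input_data >= 0) ? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
                             : (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);
  return static_cast<int16_t>(result >> (9 - 1));  // arithmetic shift, as in the patch
}

}  // namespace

int main() {
  BuildTable();
  double sigmoid_err = 0.0;
  double tanh_err = 0.0;
  for (int x = -32768; x <= 32767; ++x) {
    const int16_t xi = static_cast<int16_t>(x);
    const double v = x / 4096.0;  // real value represented by the input
    const double s_ref =
        std::min(std::round(32768.0 / (1.0 + std::exp(-v))), 32767.0);
    const double t_ref = std::min(std::round(32768.0 * std::tanh(v)), 32767.0);
    sigmoid_err = std::max(sigmoid_err, std::abs(SigmoidLut(xi) - s_ref));
    tanh_err = std::max(tanh_err, std::abs(TanhLut(xi, 0) - t_ref));
  }
  std::printf("max error: sigmoid %.2f, tanh %.2f (in 1/32768 units)\n",
              sigmoid_err, tanh_err);
  return 0;
}

Built with any C++11 compiler (e.g. g++ -std=c++11 -O2 lut_check.cc), this
should report maximum errors consistent with the "less than 1.5 quantized
units" figure quoted in the commit message of patch 1/8.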