From a858c19b0c10a89639a4897155952d8c3bbd26de Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Wed, 18 Sep 2019 17:56:26 +0100
Subject: [PATCH 1/8] New implementation of TANH/Sigmoid 16-bit activation
 functions using LUT.

We think the reference functions for 16-bit activations are too complex
for efficient implementation on resource-constrained platforms and
propose to replace them with a lookup table approach as follows:
First rescale the input data to the fixed range of -10.7 to +10.7.
Then use a 256-entry lookup table for Sigmoid, followed by linear
interpolation, to efficiently derive the result. The Sigmoid LUT is
also used for the TANH function, because tanh(x) = 2*sigmoid(2*x) - 1
and the symmetry of both functions is taken into account.

The proposed reference kernel implementation also has higher accuracy
than the existing one. For the current functions we measure a
difference of up to 6.3 quantized units for sigmoid and 11.7 for tanh
compared to the floating-point reference implementation over the
16-bit input range (representing -8.0 to +8.0). With the
implementation in this patch the error is reduced to less than 1.5
quantized units compared to the floating-point reference for both
tanh and sigmoid.

Change-Id: I4d1406928db65740c1750c9cd7bfffab30771419
---
 tensorflow/lite/kernels/activations.cc      | 148 +++++++++++++++++++-
 tensorflow/lite/kernels/activations_test.cc |  18 ++-
 2 files changed, 153 insertions(+), 13 deletions(-)

diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index f43f61128ec..d0babbbcf0a 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -60,7 +60,8 @@ struct OpData {
   int input_left_shift = 0;
   int32_t input_range_radius = 0;
   int diff_min = 0;
-  uint8_t table[256] = {0};
+  uint16_t table[256] = {0};
+  uint16_t* table_zero = nullptr;
 };
 
 struct SoftmaxOpData {
@@ -154,6 +155,54 @@ inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4],
 #endif
 
 // TODO(b/143696793): move this to optimized_ops.
+// We use combined sigmoid and tanh look-up table, since
+// tanh(x) = 2*sigmoid(2*x) -1.
+// Both functions are symmetric, so the LUT table is only needed
+// for the absolute value of the input.
+void PopulateLookupTableSigmoid(struct OpData* data) {
+
+  // Table of sigmoid(i/24) at 0.16 format - 256 elements.
+
+  auto table = std::initializer_list<uint16_t>({
+      32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513,
+      38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688,
+      43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369,
+      47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409,
+      51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755,
+      55125, 55485, 55834, 56174, 56503, 56823, 57133, 57433,
+      57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519,
+      59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110,
+      61279, 61441, 61599, 61750, 61896, 62036, 62172, 62302,
+      62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186,
+      63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835,
+      63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308,
+      64357, 64405, 64450, 64494, 64536, 64576, 64614, 64652,
+      64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900,
+      64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079,
+      65097, 65115, 65132, 65149, 65164, 65179, 65194, 65208,
+      65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301,
+      65310, 65319, 65328, 65337, 65345, 65352, 65360, 65367,
+      65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415,
+      65420, 65425, 65429, 65433, 65438, 65442, 65445, 65449,
+      65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474,
+      65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491,
+      65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504,
+      65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513,
+      65514, 65515, 65516, 65517, 65517, 65518, 65519, 65520,
+      65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524,
+      65525, 65525, 65526, 65526, 65526, 65527, 65527, 65528,
+      65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530,
+      65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532,
+      65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533,
+      65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534,
+      65534, 65534, 65534, 65534, 65534, 65534, 65534, 65535
+  });
+
+  std::copy(table.begin(), table.end(), data->table);
+
+  data->table_zero = &data->table[0];
+}
+
 void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input,
                           TfLiteTensor* output) {
   const int size =
@@ -211,6 +260,89 @@ void QuantizedReluX(float act_min, float act_max, const TfLiteTensor* input,
                        GetTensorShape(output), GetTensorData<T>(output));
 }
 
+void EvalUsingLookupTableSigmoid16Bit(struct OpData* data, const TfLiteTensor* input,
+                                      TfLiteTensor* output) {
+
+  const int size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
+
+  int16_t* ptr_output_data = GetTensorData<int16_t>(output);
+  const int16_t* ptr_input_data = GetTensorData<int16_t>(input);
+
+  for (int i = 0; i < size; ++i, ptr_output_data++, ptr_input_data++) {
+    int32_t input_data = *ptr_input_data;
+
+    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
+    // we do interpolation on unsigned values.
+    uint32_t abs_input_data = 3*abs(input_data);
+
+    // We divide by 2 power of 9, because
+    // we need to divide by 2 in power of 7 for
+    // the input conversion + 1/4 from the scale above.
+    uint8_t uh = abs_input_data >> 9;
+    uint32_t ua = data->table_zero[uh];
+    uint32_t ub = data->table_zero[uh+1];
+    uint32_t ut = abs_input_data & 0x1ff;
+
+    // Interpolation is done using the fractional bit.
+    uint32_t result = (ua << 9) + ut * (ub - ua);
+
+    result = (input_data >=0) ? (result + (1 << 9)) :
+                                ((1 << (16 + 9)) - result + (1 << 9) - 1);
+
+    // Back to 16-bit.
+    result >>= 10;
+
+    *ptr_output_data = result;
+  }
+}
+
+void EvalUsingLookupTableTanh16Bit(struct OpData* data, const TfLiteTensor* input,
+                                   TfLiteTensor* output) {
+
+  const int size =
+      MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
+
+  const int16_t* ptr_input_data = GetTensorData<int16_t>(input);
+  int16_t* ptr_output_data = GetTensorData<int16_t>(output);
+
+  // We use the LUT for sigmoid and take into account, that
+  // tanh(x) = 2*sigmoid(2*x) - 1
+  for (int i=0; i < size; ++i, ptr_input_data++, ptr_output_data++) {
+
+    int32_t input_data = *ptr_input_data;
+
+    if (data->input_left_shift == 1) {
+      input_data = gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data);
+    }
+
+    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
+    uint32_t abs_input_data = 3*abs(input_data);
+    uint32_t uh = abs_input_data >> 8;
+    int32_t result;
+
+    if (uh >= 255) {
+      // Saturate to maximum.
+      result = 0xFFFF<<8;
+    } else {
+
+      uint32_t ua = data->table_zero[uh];
+      uint32_t ub = data->table_zero[uh+1];
+
+      uint8_t ut = abs_input_data & 0xFF;
+
+      result = (ua<<8) + ut*(ub-ua);
+    }
+
+    result = (input_data>=0) ? (result - (1<<(14+9)) + (1<<(9-2))) :
+                               (-result + (1<<(14+9)) + (1<<(9-2))-1);
+
+    // Convert back to 16-bit.
+    result >>= (9-1);
+
+    *ptr_output_data = result;
+  }
+}
+
 }  // namespace
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -418,6 +550,8 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   } else if (input->type == kTfLiteInt8) {
     PopulateLookupTable<int8_t>(data, input, output,
                                 [](float value) { return std::tanh(value); });
+  } else if (input->type == kTfLiteInt16 && kernel_type == kReference) {
+    PopulateLookupTableSigmoid(data);
   }
 }
 
@@ -509,6 +643,10 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
     PopulateLookupTable<int8_t>(data, input, output, [](float value) {
       return 1.0f / (1.0f + std::exp(-value));
     });
+  } else if (input->type == kTfLiteInt16) {
+    TF_LITE_ENSURE(context, output->params.scale == 1. / 32768);
+    TF_LITE_ENSURE(context, output->params.zero_point == 0.);
+    PopulateLookupTableSigmoid(data);
   }
 }
 
@@ -799,9 +937,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       TanhParams params;
      params.input_left_shift = data->input_left_shift;
       if (kernel_type == kReference) {
-        reference_ops::Tanh(
-            params, GetTensorShape(input), GetTensorData<int16_t>(input),
-            GetTensorShape(output), GetTensorData<int16_t>(output));
+        EvalUsingLookupTableTanh16Bit(data, input, output);
       } else {
         optimized_ops::Tanh(
             params, GetTensorShape(input), GetTensorData<int16_t>(input),
@@ -871,9 +1007,7 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteInt16: {
       LogisticParams params;
       if (kernel_type == kReference) {
-        reference_ops::Logistic(
-            params, GetTensorShape(input), GetTensorData<int16_t>(input),
-            GetTensorShape(output), GetTensorData<int16_t>(output));
+        EvalUsingLookupTableSigmoid16Bit(data, input, output);
       } else {
         optimized_ops::Logistic(
             params, GetTensorShape(input), GetTensorData<int16_t>(input),
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index e80adce9c4c..134af8239b9 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -741,11 +741,13 @@ TEST_P(TanhOpTest, TanhInt16) {
   const float kMax = 32767.f / 32768.f;
   QuantizedActivationsOpModel m(
       GetRegistration(), BuiltinOperator_TANH,
-      /*input=*/{TensorType_INT16, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
-      /*output=*/{TensorType_INT16, {1, 2, 4, 1}, kMin, kMax});
+      /*input=*/{TensorType_INT16, {1, 2, 8, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT16, {1, 2, 8, 1}, kMin, kMax});
   m.SetInput<int16_t>({
       0, -6, 2, 4,   //
       -4, -2, 8, 1,  //
+      7, -8, 3, -5,  //
+      6, -1, -3, 5
   });
   m.Invoke();
   EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       0.0, -0.999987, 0.964027, 0.999329,     //
                       -0.999329, -0.96402, 0.99999, 0.76159,  //
+                      0.999998337, -0.99999, 0.995054754, -0.999909204,  //
+                      0.999999996, -0.76159, -0.995054754, 0.999909204
                   },
                   kQuantizedToleranceInt16)));
 }
@@ -882,18 +886,20 @@ TEST_P(LogisticOpTest, SigmoidInt16) {
   const float kMax = 32767.f / 32768.f;
   QuantizedActivationsOpModel m(
       GetRegistration(), BuiltinOperator_LOGISTIC,
-      /*input=*/{TensorType_INT16, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
-      /*output=*/{TensorType_INT16, {1, 2, 4, 1}, kMin, kMax});
+      /*input=*/{TensorType_INT16, {1, 2, 6, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT16, {1, 2, 6, 1}, kMin, kMax});
   m.SetInput<int16_t>({
       0, -6, 2, 4,  //
-      3, -2, 10, 1,  //
+      3, -2, 8, 1,  //
+      5, -8, 7, -3
   });
   m.Invoke();
   EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
               ElementsAreArray(ArrayFloatNear(
                   {
                       0.5, 0.002473, 0.880797, 0.982014,  //
-                      0.952574, 0.119203, 0.999955, 0.731059,  //
+                      0.952574, 0.119203, 0.9995, 0.731059,  //
+                      0.993307, 0.0003535, 0.999089, 0.047426  //
                   },
                   kQuantizedToleranceInt16)));
 }

From 279f9264c0503b975ee91e6070f8aed2698b51b6 Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Thu, 28 Nov 2019 16:28:35 +0000
Subject: [PATCH 2/8] Small improvement to TANH/Sigmoid implementation.
Change-Id: Ia9fa7e70e15a5174a045ee5f98cf4f78e6a43ef6
---
 tensorflow/lite/kernels/activations.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index d0babbbcf0a..06da2c9a15d 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -312,7 +312,7 @@ void EvalUsingLookupTableTanh16Bit(struct OpData* data, const TfLiteTensor* inpu
     int32_t input_data = *ptr_input_data;
 
     if (data->input_left_shift == 1) {
-      input_data = gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data);
+      input_data <<= 1;
     }
 
     // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].

From eaac6ea535cd2be0b33b0a2cd6664daab096364b Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Tue, 24 Dec 2019 09:25:25 +0000
Subject: [PATCH 3/8] Addressed review comments for TANH/Sigmoid function.

---
 tensorflow/lite/kernels/activations.cc | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 06da2c9a15d..30d1ae6d402 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -550,7 +550,7 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   } else if (input->type == kTfLiteInt8) {
     PopulateLookupTable<int8_t>(data, input, output,
                                 [](float value) { return std::tanh(value); });
-  } else if (input->type == kTfLiteInt16 && kernel_type == kReference) {
+  } else if (input->type == kTfLiteInt16) {
     PopulateLookupTableSigmoid(data);
   }
 }
@@ -936,12 +936,12 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       TanhParams params;
       params.input_left_shift = data->input_left_shift;
-      if (kernel_type == kReference) {
-        EvalUsingLookupTableTanh16Bit(data, input, output);
-      } else {
+      if (kernel_type == kFixedPointOptimized) {
         optimized_ops::Tanh(
             params, GetTensorShape(input), GetTensorData<int16_t>(input),
             GetTensorShape(output), GetTensorData<int16_t>(output));
+      } else {
+        EvalUsingLookupTableTanh16Bit(data, input, output);
       }
       return kTfLiteOk;
     } break;
@@ -1006,12 +1006,12 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
     }
     case kTfLiteInt16: {
       LogisticParams params;
-      if (kernel_type == kReference) {
-        EvalUsingLookupTableSigmoid16Bit(data, input, output);
-      } else {
+      if (kernel_type == kFixedPointOptimized) {
         optimized_ops::Logistic(
             params, GetTensorShape(input), GetTensorData<int16_t>(input),
             GetTensorShape(output), GetTensorData<int16_t>(output));
+      } else {
+        EvalUsingLookupTableSigmoid16Bit(data, input, output);
       }
       break;
     }

From 38eeb4f5d18c6772886b1f41093f4681bb522108 Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Mon, 17 Feb 2020 12:55:02 +0000
Subject: [PATCH 4/8] Addressed reviewer comments.
--- tensorflow/lite/kernels/activations.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 30d1ae6d402..495789e6306 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -60,8 +60,9 @@ struct OpData { int input_left_shift = 0; int32_t input_range_radius = 0; int diff_min = 0; - uint16_t table[256] = {0}; - uint16_t* table_zero = nullptr; + uint8_t table[256] = {0}; + uint16_t table_uint16[256] = {0}; + uint16_t* table_zero_uint16 = nullptr; }; struct SoftmaxOpData { @@ -198,9 +199,9 @@ void PopulateLookupTableSigmoid(struct OpData* data) { 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65535 }); - std::copy(table.begin(), table.end(), data->table); + std::copy(table.begin(), table.end(), data->table_uint16); - data->table_zero = &data->table[0]; + data->table_zero_uint16 = &data->table_uint16[0]; } void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input, @@ -211,7 +212,6 @@ void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input, const uint8_t* input_data = GetTensorData(input); int i = 0; #if __aarch64__ && __clang__ - // This code uses ARM64-only instructions. // TODO(b/143709993): Port to ARMv7 // Load the tables into registers. (4*4 128-bit registers) @@ -279,8 +279,8 @@ void EvalUsingLookupTableSigmoid16Bit(struct OpData* data, const TfLiteTensor* i // we need to divide by 2 in power of 7 for // the input conversion + 1/4 from the scale above. uint8_t uh = abs_input_data >> 9; - uint32_t ua = data->table_zero[uh]; - uint32_t ub = data->table_zero[uh+1]; + uint32_t ua = data->table_zero_uint16[uh]; + uint32_t ub = data->table_zero_uint16[uh+1]; uint32_t ut = abs_input_data & 0x1ff; // Interpolation is done using the fractional bit. @@ -325,8 +325,8 @@ void EvalUsingLookupTableTanh16Bit(struct OpData* data, const TfLiteTensor* inpu result = 0xFFFF<<8; } else { - uint32_t ua = data->table_zero[uh]; - uint32_t ub = data->table_zero[uh+1]; + uint32_t ua = data->table_zero_uint16[uh]; + uint32_t ub = data->table_zero_uint16[uh+1]; uint8_t ut = abs_input_data & 0xFF; @@ -645,7 +645,7 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) { }); } else if (input->type == kTfLiteInt16) { TF_LITE_ENSURE(context, output->params.scale == 1. / 32768); - TF_LITE_ENSURE(context, output->params.zero_point == 0.); + TF_LITE_ENSURE(context, output->params.zero_point == 0); PopulateLookupTableSigmoid(data); } } From e8ea83ab58aa63d9b0b86ff26e30293017022029 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 2 Mar 2020 16:41:18 +0000 Subject: [PATCH 5/8] Moved implementation of Tanh/Sigmoid to integer_reference_ops per discussion. --- tensorflow/lite/kernels/activations.cc | 164 +++--------------- tensorflow/lite/kernels/internal/common.h | 32 ++++ .../internal/reference/integer_ops/logistic.h | 32 ++++ .../internal/reference/integer_ops/tanh.h | 39 +++++ 4 files changed, 125 insertions(+), 142 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 495789e6306..305e2e7f6f6 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -29,6 +29,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h" +#include "tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h" #include "tensorflow/lite/kernels/internal/reference/logistic.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" @@ -61,8 +62,6 @@ struct OpData { int32_t input_range_radius = 0; int diff_min = 0; uint8_t table[256] = {0}; - uint16_t table_uint16[256] = {0}; - uint16_t* table_zero_uint16 = nullptr; }; struct SoftmaxOpData { @@ -156,54 +155,6 @@ inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4], #endif // TODO(b/143696793): move this to optimized_ops. -// We use combined sigmoid and tanh look-up table, since -// tanh(x) = 2*sigmoid(2*x) -1. -// Both functions are symmetric, so the LUT table is only needed -// for the absolute value of the input. -void PopulateLookupTableSigmoid(struct OpData* data) { - - // Table of sigmoid(i/24) at 0.16 format - 256 elements. - - auto table = std::initializer_list({ - 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, - 38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688, - 43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369, - 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, - 51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755, - 55125, 55485, 55834, 56174, 56503, 56823, 57133, 57433, - 57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519, - 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, - 61279, 61441, 61599, 61750, 61896, 62036, 62172, 62302, - 62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186, - 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835, - 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, - 64357, 64405, 64450, 64494, 64536, 64576, 64614, 64652, - 64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900, - 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, - 65097, 65115, 65132, 65149, 65164, 65179, 65194, 65208, - 65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301, - 65310, 65319, 65328, 65337, 65345, 65352, 65360, 65367, - 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, - 65420, 65425, 65429, 65433, 65438, 65442, 65445, 65449, - 65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474, - 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, - 65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, - 65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513, - 65514, 65515, 65516, 65517, 65517, 65518, 65519, 65520, - 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, - 65525, 65525, 65526, 65526, 65526, 65527, 65527, 65528, - 65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530, - 65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532, - 65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533, - 65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534, - 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65535 - }); - - std::copy(table.begin(), table.end(), data->table_uint16); - - data->table_zero_uint16 = &data->table_uint16[0]; -} - void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input, TfLiteTensor* output) { const int size = @@ -260,89 +211,6 @@ void QuantizedReluX(float act_min, float act_max, const TfLiteTensor* input, GetTensorShape(output), GetTensorData(output)); } -void EvalUsingLookupTableSigmoid16Bit(struct OpData* data, const TfLiteTensor* input, 
- TfLiteTensor* output) { - - const int size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); - - int16_t* ptr_output_data = GetTensorData(output); - const int16_t* ptr_input_data = GetTensorData(input); - - for (int i = 0; i < size; ++i, ptr_output_data++, ptr_input_data++) { - int32_t input_data = *ptr_input_data; - - // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and - // we do interpolation on unsigned values. - uint32_t abs_input_data = 3*abs(input_data); - - // We divide by 2 power of 9, because - // we need to divide by 2 in power of 7 for - // the input conversion + 1/4 from the scale above. - uint8_t uh = abs_input_data >> 9; - uint32_t ua = data->table_zero_uint16[uh]; - uint32_t ub = data->table_zero_uint16[uh+1]; - uint32_t ut = abs_input_data & 0x1ff; - - // Interpolation is done using the fractional bit. - uint32_t result = (ua << 9) + ut * (ub - ua); - - result = (input_data >=0) ? (result + (1 << 9)) : - ((1 << (16 + 9)) - result + (1 << 9) - 1); - - // Back to 16-bit. - result >>= 10; - - *ptr_output_data = result; - } -} - -void EvalUsingLookupTableTanh16Bit(struct OpData* data, const TfLiteTensor* input, - TfLiteTensor* output) { - - const int size = - MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); - - const int16_t* ptr_input_data = GetTensorData(input); - int16_t* ptr_output_data = GetTensorData(output); - - // We use the LUT for sigmoid and take into account, that - // tanh(x) = 2*sigmoid(2*x) - 1 - for (int i=0; i < size; ++i, ptr_input_data++, ptr_output_data++) { - - int32_t input_data = *ptr_input_data; - - if (data->input_left_shift == 1) { - input_data <<= 1; - } - - // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7]. - uint32_t abs_input_data = 3*abs(input_data); - uint32_t uh = abs_input_data >> 8; - int32_t result; - - if (uh >= 255) { - // Saturate to maximum. - result = 0xFFFF<<8; - } else { - - uint32_t ua = data->table_zero_uint16[uh]; - uint32_t ub = data->table_zero_uint16[uh+1]; - - uint8_t ut = abs_input_data & 0xFF; - - result = (ua<<8) + ut*(ub-ua); - } - - result = (input_data>=0) ? (result - (1<<(14+9)) + (1<<(9-2))) : - (-result + (1<<(14+9)) + (1<<(9-2))-1); - - // Convert back to 16-bit. - result >>= (9-1); - - *ptr_output_data = result; - } -} - } // namespace void* Init(TfLiteContext* context, const char* buffer, size_t length) { @@ -550,8 +418,6 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) { } else if (input->type == kTfLiteInt8) { PopulateLookupTable(data, input, output, [](float value) { return std::tanh(value); }); - } else if (input->type == kTfLiteInt16) { - PopulateLookupTableSigmoid(data); } } @@ -646,7 +512,6 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) { } else if (input->type == kTfLiteInt16) { TF_LITE_ENSURE(context, output->params.scale == 1. 
/ 32768); TF_LITE_ENSURE(context, output->params.zero_point == 0); - PopulateLookupTableSigmoid(data); } } @@ -936,12 +801,20 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt16: { TanhParams params; params.input_left_shift = data->input_left_shift; - if (kernel_type == kFixedPointOptimized) { + if (kernel_type == kReference) { + const int size = + MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); + + const int16_t* ptr_input_data = GetTensorData(input); + int16_t* ptr_output_data = GetTensorData(output); + + reference_integer_ops::Tanh(data->input_left_shift, size, + GetTensorData(input), + GetTensorData(output)); + } else { optimized_ops::Tanh( params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - } else { - EvalUsingLookupTableTanh16Bit(data, input, output); } return kTfLiteOk; } break; @@ -1006,12 +879,19 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) { } case kTfLiteInt16: { LogisticParams params; - if (kernel_type == kFixedPointOptimized) { + if (kernel_type == kReference) { + const int size = + MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); + + int16_t* ptr_output_data = GetTensorData(output); + const int16_t* ptr_input_data = GetTensorData(input); + + reference_integer_ops::Logistic(size, GetTensorData(input), + GetTensorData(output)); + } else { optimized_ops::Logistic( params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - } else { - EvalUsingLookupTableSigmoid16Bit(data, input, output); } break; } diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index 0c4fbc1e84e..2f014062287 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -195,6 +195,38 @@ inline int CountLeadingSignBits(T integer_input) { #endif } +// Table of sigmoid(i/24) at 0.16 format - 256 elements. + +// We use combined sigmoid and tanh look-up table, since +// tanh(x) = 2*sigmoid(2*x) -1. +// Both functions are symmetric, so the LUT table is only needed +// for the absolute value of the input. 
+static uint16_t sigmoid_table_uint16[256] = { + 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, 38180, 38841, 39498, + 40149, 40794, 41432, 42064, 42688, 43304, 43912, 44511, 45102, 45683, 46255, + 46817, 47369, 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, 51865, + 52311, 52745, 53169, 53581, 53983, 54374, 54755, 55125, 55485, 55834, 56174, + 56503, 56823, 57133, 57433, 57724, 58007, 58280, 58544, 58800, 59048, 59288, + 59519, 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, 61279, 61441, + 61599, 61750, 61896, 62036, 62172, 62302, 62428, 62549, 62666, 62778, 62886, + 62990, 63090, 63186, 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835, + 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, 64357, 64405, 64450, + 64494, 64536, 64576, 64614, 64652, 64687, 64721, 64754, 64786, 64816, 64845, + 64873, 64900, 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, 65097, + 65115, 65132, 65149, 65164, 65179, 65194, 65208, 65221, 65234, 65246, 65258, + 65269, 65280, 65291, 65301, 65310, 65319, 65328, 65337, 65345, 65352, 65360, + 65367, 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, 65420, 65425, + 65429, 65433, 65438, 65442, 65445, 65449, 65453, 65456, 65459, 65462, 65465, + 65468, 65471, 65474, 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, + 65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, 65505, 65507, 65508, + 65509, 65510, 65511, 65512, 65513, 65514, 65515, 65516, 65517, 65517, 65518, + 65519, 65520, 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, 65525, + 65525, 65526, 65526, 65526, 65527, 65527, 65528, 65528, 65528, 65529, 65529, + 65529, 65529, 65530, 65530, 65530, 65530, 65531, 65531, 65531, 65531, 65531, + 65532, 65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533, 65533, 65533, + 65533, 65533, 65533, 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65534, + 65534, 65534, 65535}; + // TODO(b/77858996): Add these to gemmlowp. template IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) { diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h index 8277c3b3d56..aa626f43f19 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h @@ -58,6 +58,38 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius, } } +inline void Logistic(int32_t input_size, const int16_t* ptr_input_data, + int16_t* ptr_output_data) { + // We use the LUT for sigmoid and take into account, that + // tanh(x) = 2*sigmoid(2*x) - 1 + for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) { + int32_t input_data = *ptr_input_data; + + // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and + // we do interpolation on unsigned values. + uint32_t abs_input_data = 3 * abs(input_data); + + // We divide by 2 power of 9, because + // we need to divide by 2 in power of 7 for + // the input conversion + 1/4 from the scale above. + uint8_t uh = abs_input_data >> 9; + uint32_t ua = sigmoid_table_uint16[uh]; + uint32_t ub = sigmoid_table_uint16[uh + 1]; + uint32_t ut = abs_input_data & 0x1ff; + + // Interpolation is done using the fractional bit. + uint32_t result = (ua << 9) + ut * (ub - ua); + + result = (input_data >= 0) ? (result + (1 << 9)) + : ((1 << (16 + 9)) - result + (1 << 9) - 1); + + // Back to 16-bit. 
+ result >>= 10; + + *ptr_output_data = result; + } +} + } // namespace reference_integer_ops } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h index cc704387f38..95dc969319d 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h @@ -59,6 +59,45 @@ inline void Tanh(int32_t input_zero_point, int32_t input_range_radius, } } +inline void Tanh(int32_t input_left_shift, int32_t input_size, + const int16_t* ptr_input_data, int16_t* ptr_output_data) { + // We use the LUT for sigmoid and take into account, that + // tanh(x) = 2*sigmoid(2*x) - 1 + for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) { + int32_t input_data = *ptr_input_data; + + if (input_left_shift == 1) { + input_data <<= 1; + } + + // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7]. + uint32_t abs_input_data = 3 * abs(input_data); + uint32_t uh = abs_input_data >> 8; + int32_t result; + + if (uh >= 255) { + // Saturate to maximum. + result = 0xFFFF << 8; + } else { + uint32_t ua = sigmoid_table_uint16[uh]; + uint32_t ub = sigmoid_table_uint16[uh + 1]; + + uint8_t ut = abs_input_data & 0xFF; + + result = (ua << 8) + ut * (ub - ua); + } + + result = (input_data >= 0) + ? (result - (1 << (14 + 9)) + (1 << (9 - 2))) + : (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1); + + // Convert back to 16-bit. + result >>= (9 - 1); + + *ptr_output_data = result; + } +} + } // namespace reference_integer_ops } // namespace tflite From 00879a5cdf00ffbfa2c02d1ff75e09f1e5569d88 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 3 Mar 2020 15:16:41 +0000 Subject: [PATCH 6/8] Tidy up. --- tensorflow/lite/kernels/activations.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 305e2e7f6f6..bc47b5fb32c 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -29,7 +29,6 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h" -#include "tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h" #include "tensorflow/lite/kernels/internal/reference/logistic.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" @@ -163,6 +162,7 @@ void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input, const uint8_t* input_data = GetTensorData(input); int i = 0; #if __aarch64__ && __clang__ + // This code uses ARM64-only instructions. // TODO(b/143709993): Port to ARMv7 // Load the tables into registers. (4*4 128-bit registers) From 9140684f7adaddcdb3a377bbe62e4556bbfd4b44 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 18 Mar 2020 12:11:57 +0000 Subject: [PATCH 7/8] Fix for unused variable warning. 
--- tensorflow/lite/kernels/internal/BUILD | 1 + tensorflow/lite/kernels/internal/common.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 8c320720a31..d1ba076e41f 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -196,6 +196,7 @@ cc_library( ":cpu_check", ":types", "@gemmlowp//:fixedpoint", + "//tensorflow/core/platform:macros", ], ) diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index 2f014062287..7f391adb437 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -24,6 +24,7 @@ limitations under the License. #include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/kernels/internal/optimized/neon_check.h" #include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/core/platform/macros.h" namespace tflite { @@ -201,7 +202,7 @@ inline int CountLeadingSignBits(T integer_input) { // tanh(x) = 2*sigmoid(2*x) -1. // Both functions are symmetric, so the LUT table is only needed // for the absolute value of the input. -static uint16_t sigmoid_table_uint16[256] = { +TF_ATTRIBUTE_UNUSED static uint16_t sigmoid_table_uint16[256] = { 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, 38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688, 43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369, 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, 51865, From 913a78794dd01b5f7e7bdb36fd7f566712fc11b3 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 25 Mar 2020 10:33:07 +0000 Subject: [PATCH 8/8] Fix for the error with buildifier. --- tensorflow/lite/kernels/internal/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index d1ba076e41f..66a429d9475 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -195,8 +195,8 @@ cc_library( deps = [ ":cpu_check", ":types", - "@gemmlowp//:fixedpoint", "//tensorflow/core/platform:macros", + "@gemmlowp//:fixedpoint", ], )
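
To sanity-check the LUT arithmetic outside the TFLite build, here is a minimal
standalone sketch. It is not part of the patch series: it mirrors the final
16-bit kernels from reference_integer_ops and sweeps the whole int16 input
range against a floating-point reference. The table is regenerated from
sigmoid(i/24) rather than copied, so borderline entries may round one unit
differently from the hardcoded values, and all file and function names here
are illustrative.

// lut_check.cc - standalone sketch of the 16-bit sigmoid/tanh LUT kernels.
// Not part of the patches; names are illustrative.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

namespace {

uint16_t sigmoid_table[256];

// Regenerates the 256-entry table of sigmoid(i/24) in 0.16 format. The patch
// pins the last entry to 65535 (instead of the rounded 65534) so that
// interpolation can reach the maximum.
void BuildTable() {
  for (int i = 0; i < 256; ++i) {
    const double s = 65536.0 / (1.0 + std::exp(-i / 24.0));
    sigmoid_table[i] = static_cast<uint16_t>(std::min(65535.0, std::round(s)));
  }
  sigmoid_table[255] = 65535;
}

// Mirrors the 16-bit sigmoid kernel: input int16 with scale 8/32768,
// output int16 with scale 1/32768.
int16_t SigmoidLut(int16_t input) {
  const int32_t input_data = input;
  // 3*|x| aligns the 1/24 table step with the input scale:
  // |real_value| * 24 == (3 * |x|) / 2^9.
  const uint32_t abs_input = 3 * std::abs(input_data);
  const uint32_t uh = abs_input >> 9;     // integer part: table index (<= 192)
  const uint32_t ut = abs_input & 0x1ff;  // 9 fractional bits
  const uint32_t ua = sigmoid_table[uh];
  const uint32_t ub = sigmoid_table[uh + 1];
  // Linear interpolation; result is sigmoid(|v|) in 0.16 format times 2^9.
  uint32_t result = (ua << 9) + ut * (ub - ua);
  // sigmoid(-v) = 1 - sigmoid(v); the (1 << 9) terms round the final shift
  // down to the 1/32768 output scale.
  result = (input_data >= 0) ? (result + (1 << 9))
                             : ((1 << (16 + 9)) - result + (1 << 9) - 1);
  return static_cast<int16_t>(result >> 10);
}

// Mirrors the 16-bit tanh kernel via tanh(v) = 2*sigmoid(2*v) - 1. The left
// shift (patch 2 replaces the saturating multiply) applies when the input is
// quantized to [-4, 4]; the [-8, 8] quantization used below passes 0.
int16_t TanhLut(int16_t input, int input_left_shift) {
  int32_t input_data = input;
  if (input_left_shift == 1) input_data <<= 1;
  const uint32_t abs_input = 3 * std::abs(input_data);
  const uint32_t uh = abs_input >> 8;  // only 8 fractional bits: absorbs 2*v
  int32_t result;
  if (uh >= 255) {
    result = 0xFFFF << 8;  // saturate sigmoid(2*|v|) at 1.0
  } else {
    const uint32_t ua = sigmoid_table[uh];
    const uint32_t ub = sigmoid_table[uh + 1];
    const uint32_t ut = abs_input & 0xFF;
    result = (ua << 8) + ut * (ub - ua);
  }
  // 2*s - 1 on the 1/32768 output scale: subtract 0.5 (= 1 << (14 + 9)),
  // round (= 1 << (9 - 2)), and use odd symmetry for negative inputs.
  result = (input_data >= 0) ? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
                             : (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);
  return static_cast<int16_t>(result >> (9 - 1));  // arithmetic shift, as in the patch
}

}  // namespace

int main() {
  BuildTable();
  double sigmoid_err = 0.0;
  double tanh_err = 0.0;
  for (int x = -32768; x <= 32767; ++x) {
    const int16_t xi = static_cast<int16_t>(x);
    const double v = x / 4096.0;  // real value represented by the input
    const double s_ref =
        std::min(std::round(32768.0 / (1.0 + std::exp(-v))), 32767.0);
    const double t_ref = std::min(std::round(32768.0 * std::tanh(v)), 32767.0);
    sigmoid_err = std::max(sigmoid_err, std::abs(SigmoidLut(xi) - s_ref));
    tanh_err = std::max(tanh_err, std::abs(TanhLut(xi, 0) - t_ref));
  }
  std::printf("max error: sigmoid %.2f, tanh %.2f (in 1/32768 units)\n",
              sigmoid_err, tanh_err);
  return 0;
}

Built with any C++11 compiler (e.g. g++ -std=c++11 -O2 lut_check.cc), this
should report maximum errors consistent with the "less than 1.5 quantized
units" figure quoted in the commit message of patch 1/8.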