From c69c80248848f97969293012f98f5eae571a7207 Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Wed, 1 Jul 2020 14:18:52 +0100
Subject: [PATCH 1/2] Fixes for tanh and logistic activation functions, 16x8.

Change-Id: I2d8dc5c706ad834ce2331ad0f77cce41986bf477
---
 tensorflow/lite/kernels/activations.cc        | 47 ++++++++++++-------
 tensorflow/lite/kernels/activations_test.cc   | 28 ++++++-----
 .../internal/reference/integer_ops/logistic.h | 43 ++++++++++++-----
 .../internal/reference/integer_ops/tanh.h     | 22 +++++----
 4 files changed, 88 insertions(+), 52 deletions(-)

diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 654ccbc27ec..d894a28bd90 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -298,7 +298,6 @@ void HardSwishFree(TfLiteContext* context, void* buffer) {
   delete static_cast<HardSwishData*>(buffer);
 }
 
-
 TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_STATUS(GenericPrepare(context, node));
   TfLiteTensor* output = GetOutput(context, node, 0);
@@ -426,13 +425,19 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
       (data->input_left_shift == 0 || data->input_left_shift == 1);
 
   if (!param_scale_pot) {
-    // In case of general scale parameter, we need to do a rescaling.
-    // Magic constant 4096:
-    // We need to scale down to (-2^3, 2^3) / 3 is kInputIntegerBits/ interval
-    // from 16-bit (-2^15, 2^15),
-    // so we need to multiply by
-    // 2^(15 - kInputIntegerBits) = 2^12 = 4096.
-    data->input_multiplier = static_cast<int32_t>(input->params.scale * 4096);
+    // Calculate multiplier to change input scale to 1/(3*4096)
+    // as required by the table lookup.
+    // In this scaling +/-2^17 represents +/-10.7
+
+    double multiplier = input->params.scale * 4096.0 * 3.0;
+    data->input_left_shift = 0;
+
+    while (multiplier <= 32767.0 / 2.0 && data->input_left_shift <= 30) {
+        data->input_left_shift++;
+        multiplier = multiplier * 2.0;
+    }
+
+    data->input_multiplier = static_cast<int32_t>(multiplier);
   }
 
   int output_scale_log2_rounded;
@@ -521,13 +526,19 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
   param_scale_pot &= (data->input_left_shift == 0);
 
   if (!param_scale_pot) {
-    // In case of general scale parameter, we need to do a rescaling.
-    // Magic constant 4096:
-    // We need to scale down to (-2^3, 2^3) / 3 is kInputIntegerBits/ interval
-    // from 16-bit (-2^15, 2^15),
-    // so we need to multiply by
-    // 2^(15 - kInputIntegerBits) = 2^12 = 4096.
-    data->input_multiplier = static_cast<int32_t>(input->params.scale * 4096);
+    // Calculate multiplier to change input scale to 1/(3*4096)
+    // as required by the table lookup.
+    // In this scaling +/-2^17 represents +/-10.7
+    double multiplier = input->params.scale * 4096.0 * 3.0;
+
+    data->input_left_shift = 0;
+
+    while (multiplier <= 32767.0 / 2.0 && data->input_left_shift <= 30) {
+        data->input_left_shift++;
+        multiplier = multiplier * 2.0;
+    }
+
+    data->input_multiplier = static_cast<int32_t>(multiplier);
   }
 
   int output_scale_log2_rounded;
@@ -943,9 +954,9 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
         const int size =
             MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
 
-        reference_integer_ops::Logistic(data->input_multiplier, size,
-                                        GetTensorData<int16_t>(input),
-                                        GetTensorData<int16_t>(output));
+        reference_integer_ops::Logistic(
+            data->input_multiplier, data->input_left_shift, size,
+            GetTensorData<int16_t>(input), GetTensorData<int16_t>(output));
       } else {
         optimized_ops::Logistic(
             params, GetTensorShape(input), GetTensorData<int16_t>(input),
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index d8f883b9c1d..9473b367706 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -892,13 +892,15 @@ TEST_P(TanhOpTest, TanhInt16General) {
   const float kMax = 32767.f / 32768.f;
   QuantizedActivationsOpModel m(
       GetRegistration(), BuiltinOperator_TANH,
-      /*input=*/{TensorType_INT16, {6}, 11 * kMin, 11 * kMax},
-      /*output=*/{TensorType_INT16, {5}, kMin, kMax});
-  m.SetInput<int16_t>({-10, -4, 0, 6, 7.0909090909, 8});
+      /*input=*/{TensorType_INT16, {10}, 11 * kMin, 11 * kMax},
+      /*output=*/{TensorType_INT16, {10}, kMin, kMax});
+  m.SetInput<int16_t>({-10, -4, 1, 0.5, 0.25,  //
+                       0, -0.1, 6, 7.0909090909, 8});
   m.Invoke();
   EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
               ElementsAreArray(ArrayFloatNear(
-                  {-0.999969, -0.99408, 0, 0.999664, 0.999939, 0.999969},
+                  {-1.0, -0.999329, 0.761594, 0.462117, 0.244919,  //
+                   0.0, -0.099668, 0.999988, 0.999999, 1.0},
                   kQuantizedToleranceInt16)));
 }
 
@@ -1083,18 +1085,18 @@ TEST_P(LogisticOpTest, SigmoidInt16General) {
   const float kMax = 32767.f / 32768.f;
   QuantizedActivationsOpModel m(
       GetRegistration(), BuiltinOperator_LOGISTIC,
-      /*input=*/{TensorType_INT16, {8}, 10 * kMin, 10 * kMax},
-      /*output=*/{TensorType_INT16, {8}, kMin, kMax});
+      /*input=*/{TensorType_INT16, {12}, 13 * kMin, 13 * kMax},
+      /*output=*/{TensorType_INT16, {12}, kMin, kMax});
   m.SetInput<int16_t>({
-      0, -6, 2, 4,  //
-      3, -2, 10, 1,  //
+      0, -6, 2, 4, 0.1, 12,    //
+      3, -2, 10, 1, 0.25, -12  //
   });
   m.Invoke();
-  EXPECT_THAT(
-      m.GetDequantizedOutput<int16_t>(),
-      ElementsAreArray(ArrayFloatNear({0.5, 0.00814819, 0.832031, 0.960846,  //
-                                       0.916809, 0.167969, 0.999664, 0.689972},
-                                      kQuantizedToleranceInt16)));
+  EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {0.5, 0.002473, 0.880797, 0.982014, 0.524979, 0.999994,  //
+                   0.952574, 0.119203, 0.999955, 0.731059, 0.562177, 0},
+                  kQuantizedToleranceInt16)));
 }
 
 TEST(FloatActivationsOpTest, Softmax4D) {
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
index e315683c0cd..b1a970396d3 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
@@ -58,30 +58,47 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
   }
 }
 
-inline void Logistic(int32_t input_multiplier, int32_t input_size,
-                     const int16_t* ptr_input_data, int16_t* ptr_output_data) {
+inline void Logistic(int32_t input_multiplier, int32_t input_left_shift,
+                     int32_t input_size, const int16_t* ptr_input_data,
+                     int16_t* ptr_output_data) {
   // We use the LUT for sigmoid and take into account that
   // tanh(x) = 2*sigmoid(2*x) - 1
 
-  int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
+  // We scale by 3/4 to expand the range [-8,8] -> [-10.7,10.7].
+  // In case of a general parameter scale, the multiplier 3 is taken
+  // into account in the SigmoidPrepare function and is already
+  // included in input_multiplier.
+
+  if (input_multiplier == 0) {  // power-of-two case
+    input_multiplier = 3 << input_left_shift;
+    input_left_shift = 0;
+  }
+
+  int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
 
   for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
-    int32_t input_data = (*ptr_input_data) * input_data_mul;
+    int32_t input_data =
+        ((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
 
-    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and
-    // we do interpolation on unsigned values.
-    uint32_t abs_input_data = 3 * abs(input_data);
+    // We do interpolation on unsigned values.
+    uint32_t abs_input_data = abs(input_data);
 
     // We divide by 2^9, because we need to
     // divide by 2^7 for the input conversion,
    // plus 1/4 from the scaling above.
-    uint8_t uh = abs_input_data >> 9;
-    uint32_t ua = sigmoid_table_uint16[uh];
-    uint32_t ub = sigmoid_table_uint16[uh + 1];
-    uint32_t ut = abs_input_data & 0x1ff;
+    uint32_t uh = abs_input_data >> 9;
 
-    // Interpolation is done using the fractional bit.
-    uint32_t result = (ua << 9) + ut * (ub - ua);
+    uint32_t result;
+    if (uh >= 255) {
+      result = 0xfffe << 9;
+    } else {
+      uint32_t ua = sigmoid_table_uint16[uh];
+      uint32_t ub = sigmoid_table_uint16[uh + 1];
+      uint32_t ut = abs_input_data & 0x1ff;
+
+      // Interpolation is done using the fractional bit.
+      result = (ua << 9) + ut * (ub - ua);
+    }
 
     result = (input_data >= 0) ? (result + (1 << 9))
                                : ((1 << (16 + 9)) - result + (1 << 9) - 1);
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
index baae65ab30e..ade3d958ccc 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
@@ -63,17 +63,23 @@ inline void Tanh(int32_t input_multiplier, int32_t input_left_shift,
   // We use the LUT for sigmoid and take into account that
   // tanh(x) = 2*sigmoid(2*x) - 1
 
-  int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1;
+  // We scale by 3/4 to expand the range [-8,8] -> [-10.7,10.7].
+  // In case of a general parameter scale, the multiplier 3 is taken
+  // into account in the TanhPrepare function and is already
+  // included in input_multiplier.
+
+  if (input_multiplier == 0) {  // power-of-two case
+    input_multiplier = 3 << input_left_shift;
+    input_left_shift = 0;
+  }
+
+  int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
 
   for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) {
-    int32_t input_data = (*ptr_input_data) * input_data_mul;
+    int32_t input_data =
+        ((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
 
-    if (input_left_shift == 1) {
-      input_data <<= 1;
-    }
-
-    // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
-    uint32_t abs_input_data = 3 * abs(input_data);
+    uint32_t abs_input_data = abs(input_data);
 
     uint32_t uh = abs_input_data >> 8;
     int32_t result;

From a2b53623a2c0f81c32b02e2d4b3bac153daaacf9 Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Tue, 22 Sep 2020 10:35:42 +0100
Subject: [PATCH 2/2] Addressed reviewer's comments.

Change-Id: I798bf7919b6a268a4631984ed07a242943ca0b72
---
 tensorflow/lite/kernels/activations.cc                     | 8 +++++---
 .../lite/kernels/internal/reference/integer_ops/logistic.h | 8 ++++----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 30665095c60..a951ff8dad4 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -437,14 +437,16 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   if (!param_scale_pot) {
     // Calculate multiplier to change input scale to 1/(3*4096)
     // as required by the table lookup.
-    // In this scaling +/-2^17 represents +/-10.7
+    // The factor 3.0 in the multiplier comes from the interval
+    // being [-10.7, 10.7] instead of [-8, 8].
+    // So, in this scaling +/-2^17 represents +/-10.7.
 
     double multiplier = input->params.scale * 4096.0 * 3.0;
 
     data->input_left_shift = 0;
 
     while (multiplier <= 32767.0 / 2.0 && data->input_left_shift <= 30) {
-        data->input_left_shift++;
-        multiplier = multiplier * 2.0;
+      data->input_left_shift++;
+      multiplier = multiplier * 2.0;
     }
 
     data->input_multiplier = static_cast<int32_t>(multiplier);

diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
index b1a970396d3..07eb732e2bd 100644
--- a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
@@ -86,14 +86,14 @@ inline void Logistic(int32_t input_multiplier, int32_t input_left_shift,
     // We divide by 2^9, because we need to
     // divide by 2^7 for the input conversion,
     // plus 1/4 from the scaling above.
-    uint32_t uh = abs_input_data >> 9;
+    uint32_t u_table = abs_input_data >> 9;
 
     uint32_t result;
-    if (uh >= 255) {
+    if (u_table >= 255) {
       result = 0xfffe << 9;
     } else {
-      uint32_t ua = sigmoid_table_uint16[uh];
-      uint32_t ub = sigmoid_table_uint16[uh + 1];
+      uint32_t ua = sigmoid_table_uint16[u_table];
+      uint32_t ub = sigmoid_table_uint16[u_table + 1];
       uint32_t ut = abs_input_data & 0x1ff;
 
       // Interpolation is done using the fractional bit.
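
For reference, below is a minimal standalone sketch of the rescaling scheme the two patches implement: the Prepare functions convert an arbitrary int16 input scale into a (multiplier, left_shift) pair targeting the 1/(3*4096) grid expected by the sigmoid/tanh lookup tables, and the kernels then rescale each element before indexing the table. The helper names PrepareRescale and ApplyRescale are illustrative only and are not part of the TFLite API; the constants are taken from the patches above.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Mirror of the loop in TanhPrepare/SigmoidPrepare: start from
// scale * 3 * 4096 and double the multiplier into (16383.5, 32767],
// counting the doublings as the left shift.
void PrepareRescale(double input_scale, int32_t* multiplier,
                    int32_t* left_shift) {
  double m = input_scale * 4096.0 * 3.0;
  *left_shift = 0;
  while (m <= 32767.0 / 2.0 && *left_shift <= 30) {
    ++*left_shift;
    m *= 2.0;
  }
  *multiplier = static_cast<int32_t>(m);
}

// Mirror of the per-element rescale in the patched Logistic/Tanh loops:
// multiply, add the rounding term, then shift back down.
int32_t ApplyRescale(int16_t input, int32_t multiplier, int32_t left_shift) {
  int32_t round = (left_shift > 0) ? 1 << (left_shift - 1) : 0;
  return (static_cast<int32_t>(input) * multiplier + round) >> left_shift;
}

int main() {
  // Input quantized with scale 11/32768, as in TanhInt16General above.
  const double scale = 11.0 / 32768.0;
  int32_t multiplier = 0;
  int32_t left_shift = 0;
  PrepareRescale(scale, &multiplier, &left_shift);
  std::printf("multiplier=%d left_shift=%d\n", static_cast<int>(multiplier),
              static_cast<int>(left_shift));

  // A real value x should land at roughly x * 3 * 4096 on the new grid,
  // so +/-2^17 corresponds to +/-10.67 (131072 / 12288 = 10.67).
  const double x = 4.0;
  const int16_t q = static_cast<int16_t>(std::lround(x / scale));
  std::printf("x=%g -> %d (expected ~%ld)\n", x,
              static_cast<int>(ApplyRescale(q, multiplier, left_shift)),
              std::lround(x * 3 * 4096));
  return 0;
}

With scale = 11/32768 this prints multiplier=16896 and left_shift=12, and maps x = 4.0 to 49154, within rounding error of the ideal 4 * 3 * 4096 = 49152. The power-of-two branch in the kernels (input_multiplier == 0) reduces to the same grid by folding the factor 3 into the multiplier as 3 << input_left_shift.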