From 924d0b72c568f249f2fd224a942f8922524bfede Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Thu, 19 Mar 2020 15:49:04 +0000
Subject: [PATCH] Addressed reviewer comments.

---
 tensorflow/lite/kernels/add.cc                | 73 ++++++++++++-------
 .../lite/kernels/internal/reference/add.h     | 26 +++++--
 tensorflow/lite/kernels/sub.cc                | 61 +++++++++++-----
 3 files changed, 107 insertions(+), 53 deletions(-)

diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
index 7ad744b4910..731c2fb6289 100644
--- a/tensorflow/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -58,6 +58,11 @@ struct OpData {
   int32 input1_offset;
   int32 input2_offset;
   int32 output_offset;
+
+  // This parameter is used to indicate whether
+  // the scale parameter is a power of two.
+  // It is used in 16-bit -> 16-bit quantization.
+  bool pot_scale_16bit;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -95,12 +100,36 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // 8bit -> 8bit general quantized path, with general rescalings
   // as well as, 16bit -> 16bit with general rescalings
-  bool general_16bit = input1->type == kTfLiteInt16 &&
-                       input2->type == kTfLiteInt16 &&
-                       output->type == kTfLiteInt16;
+  bool pot_scale_16bit = false;
+
+  bool input1_scale_is_pot = false;
+  bool input2_scale_is_pot = false;
+  bool output_scale_is_pot = false;
+
+  int input1_scale_log2_rounded;
+  int input2_scale_log2_rounded;
+  int output_scale_log2_rounded;
+
+  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
+      output->type == kTfLiteInt16) {
+    // Check that param scale is POT
+    input1_scale_is_pot =
+        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
+
+    input2_scale_is_pot =
+        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
+
+    output_scale_is_pot =
+        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
+
+    pot_scale_16bit =
+        input1_scale_is_pot && input2_scale_is_pot && output_scale_is_pot;
+  }
+
+  data->pot_scale_16bit = pot_scale_16bit;
 
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
-      general_16bit) {
+      pot_scale_16bit) {
     // 8bit -> 8bit general quantized path, with general rescalings
     // as well as, 16bit -> 16bit with general rescalings
     data->input1_offset = -input1->params.zero_point;
@@ -110,7 +139,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     // The shift is set to 15 for 16-bit and 20 in case of 8-bit, accordingly.
     // In case of 16-bit we have 65535 << 15 which is less than 1 << 31,
     // therefore the addition will still fit in a 32 bit accumulator.
-    data->left_shift = general_16bit ? 15 : 20;
+    data->left_shift = pot_scale_16bit ? 15 : 20;
     const double twice_max_input_scale =
         2 * std::max(input1->params.scale, input2->params.scale);
     const double real_input1_multiplier =
@@ -146,19 +175,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
     TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
 
-    int input1_scale_log2_rounded;
-    bool input1_scale_is_pot =
-        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
     TF_LITE_ENSURE(context, input1_scale_is_pot);
-
-    int input2_scale_log2_rounded;
-    bool input2_scale_is_pot =
-        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
     TF_LITE_ENSURE(context, input2_scale_is_pot);
-
-    int output_scale_log2_rounded;
-    bool output_scale_is_pot =
-        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
     TF_LITE_ENSURE(context, output_scale_is_pot);
 
     data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
@@ -233,12 +251,8 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input1,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
-  bool general_16bit = input1->type == kTfLiteInt16 &&
-                       input2->type == kTfLiteInt16 &&
-                       output->type == kTfLiteInt16;
-
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
-      general_16bit) {
+      data->pot_scale_16bit) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;
@@ -277,7 +291,10 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
       if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int16_t);
       } else {
-        TF_LITE_ADD(reference_ops, Add, int16_t);
+        reference_ops::Add(
+            op_params, GetTensorShape(input1), GetTensorData<int16_t>(input1),
+            GetTensorShape(input2), GetTensorData<int16_t>(input2),
+            GetTensorShape(output), GetTensorData<int16_t>(output), false);
       }
     } else {
       if (kernel_type == kReference) {
@@ -296,12 +313,12 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
     }
 #undef TF_LITE_ADD
   } else if (output->type == kTfLiteInt16) {
+    tflite::ArithmeticParams op_params;
+    op_params.input1_shift = data->input1_shift;
+    op_params.input2_shift = data->input2_shift;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
 #define TF_LITE_ADD(type, opname)                                      \
-  tflite::ArithmeticParams op_params;                                  \
-  op_params.input1_shift = data->input1_shift;                         \
-  op_params.input2_shift = data->input2_shift;                         \
-  SetActivationParams(data->output_activation_min,                     \
-                      data->output_activation_max, &op_params);        \
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<int16_t>(input1), GetTensorShape(input2), \
                GetTensorData<int16_t>(input2), GetTensorShape(output), \
                GetTensorData<int16_t>(output))
@@ -309,7 +326,7 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
     // The quantized version of Add doesn't support activations, so we
     // always use BroadcastAdd.
     if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops, AddLSTM);
+      TF_LITE_ADD(reference_ops, Add);
     } else {
       TF_LITE_ADD(optimized_ops, Add);
     }
diff --git a/tensorflow/lite/kernels/internal/reference/add.h b/tensorflow/lite/kernels/internal/reference/add.h
index c1b0163640b..741f4e684c5 100644
--- a/tensorflow/lite/kernels/internal/reference/add.h
+++ b/tensorflow/lite/kernels/internal/reference/add.h
@@ -137,10 +137,13 @@ inline void Add(const ArithmeticParams& params,
   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
 
-inline void Add(const ArithmeticParams& params,
-                const RuntimeShape& input1_shape, const int16* input1_data,
-                const RuntimeShape& input2_shape, const int16* input2_data,
-                const RuntimeShape& output_shape, int16* output_data) {
+inline void AddGeneralParamScale(const ArithmeticParams& params,
+                                 const RuntimeShape& input1_shape,
+                                 const int16* input1_data,
+                                 const RuntimeShape& input2_shape,
+                                 const int16* input2_data,
+                                 const RuntimeShape& output_shape,
+                                 int16* output_data) {
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   const int flat_size =
@@ -155,10 +158,17 @@ inline void Add(const ArithmeticParams& params,
   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
 
-inline void AddLSTM(const ArithmeticParams& params,
-                    const RuntimeShape& input1_shape, const int16* input1_data,
-                    const RuntimeShape& input2_shape, const int16* input2_data,
-                    const RuntimeShape& output_shape, int16* output_data) {
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16* input1_data,
+                const RuntimeShape& input2_shape, const int16* input2_data,
+                const RuntimeShape& output_shape, int16* output_data,
+                bool pot_scale = true) {
+  if (!pot_scale) {
+    AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
+                         input2_data, output_shape, output_data);
+    return;
+  }
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
index 077533c7338..c314289604d 100644
--- a/tensorflow/lite/kernels/sub.cc
+++ b/tensorflow/lite/kernels/sub.cc
@@ -60,6 +60,11 @@ struct OpData {
   int32 input1_offset;
   int32 input2_offset;
   int32 output_offset;
+
+  // This parameter is used to indicate whether
+  // the scale parameter is a power of two.
+  // It is used in 16-bit -> 16-bit quantization.
+  bool pot_scale_16bit;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -147,10 +152,11 @@ TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
   return kTfLiteOk;
 }
 
-TfLiteStatus PrepareLSTMSubOp(TfLiteContext* context,
-                              const TfLiteTensor* input1,
-                              const TfLiteTensor* input2, TfLiteTensor* output,
-                              TfLiteSubParams* params, OpData* data) {
+TfLiteStatus PrepareInt16SubOpPOT(TfLiteContext* context,
+                                  const TfLiteTensor* input1,
+                                  const TfLiteTensor* input2,
+                                  TfLiteTensor* output, TfLiteSubParams* params,
+                                  OpData* data) {
   // 16bit -> 16bit special quantized path, supporting only a rather
   // narrow case of quantization parameters: zero_points must all be 0
   // ("symmetric quantization") and scales must be power-of-two (which
@@ -219,19 +225,42 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // 8bit -> 8bit general quantized path, with general rescalings
   // as well as, 16bit -> 16bit with general rescalings
+  bool pot_scale_16bit = false;
 
-  bool general_16bit = output->type == kTfLiteInt16 &&
-                       input1->type == kTfLiteInt16 &&
-                       input2->type == kTfLiteInt16;
+  bool input1_scale_is_pot = false;
+  bool input2_scale_is_pot = false;
+  bool output_scale_is_pot = false;
+
+  int input1_scale_log2_rounded;
+  int input2_scale_log2_rounded;
+  int output_scale_log2_rounded;
+
+  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
+      output->type == kTfLiteInt16) {
+    // Check that param scale is POT
+    input1_scale_is_pot =
+        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
+
+    input2_scale_is_pot =
+        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
+
+    output_scale_is_pot =
+        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
+
+    pot_scale_16bit =
+        input1_scale_is_pot && input2_scale_is_pot && output_scale_is_pot;
+  }
+
+  data->pot_scale_16bit = pot_scale_16bit;
 
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
-      general_16bit) {
+      pot_scale_16bit) {
     TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
                                                    output, params, data, -1));
   } else if (output->type == kTfLiteInt16) {
     // LSTM-special case with scale parameter of POT
-    TF_LITE_ENSURE_OK(context, PrepareLSTMSubOp(context, input1, input2, output,
-                                                params, data));
+    TF_LITE_ENSURE_OK(context, PrepareInt16SubOpPOT(context, input1, input2,
+                                                    output, params, data));
   }
 
   return context->ResizeTensor(context, output, output_size);
@@ -306,11 +335,6 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
       GetTensorShape(input1), GetTensorShape(input2), &op_params);
-  // 16bit -> 16bit with general rescaling
-  bool general_16bit = output->type == kTfLiteInt16 &&
-                       input1->type == kTfLiteInt16 &&
-                       input2->type == kTfLiteInt16;
-
 #define TF_LITE_SUB(type, opname, data_type)                             \
   type::opname(op_params, GetTensorShape(input1),                        \
                GetTensorData<data_type>(input1), GetTensorShape(input2), \
                GetTensorData<data_type>(input2), GetTensorShape(output), \
@@ -324,11 +348,14 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     } else {
       TF_LITE_SUB(reference_integer_ops, Add, int8_t);
     }
-  } else if (general_16bit) {
+  } else if (data->pot_scale_16bit) {
     if (need_broadcast) {
       TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, int16_t);
     } else {
-      TF_LITE_SUB(reference_ops, Add, int16_t);
+      reference_ops::Add(op_params, GetTensorShape(input1),
+                         GetTensorData<int16_t>(input1), GetTensorShape(input2),
+                         GetTensorData<int16_t>(input2), GetTensorShape(output),
+                         GetTensorData<int16_t>(output), false);
     }
   } else if (output->type == kTfLiteUInt8) {
     if (kernel_type == kReference) {
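
Aside, not part of the patch above: the comment kept next to data->left_shift argues that a shift of 15 keeps 16-bit additions inside a 32-bit accumulator because 65535 << 15 is below 1 << 31. A minimal standalone C++ check of that arithmetic, independent of the TFLite sources (file name and main() harness are illustrative only):

// headroom_check.cc - verifies the accumulator-headroom arithmetic behind
// left_shift = 15 for the 16-bit path: 65535 << 15 must stay below 1 << 31.
#include <cstdint>
#include <iostream>

int main() {
  const std::int64_t max_16bit_span = 65535;               // value span cited in the comment
  const std::int64_t shifted = max_16bit_span << 15;       // 2147450880
  const std::int64_t int32_limit = std::int64_t{1} << 31;  // 2147483648
  std::cout << "65535 << 15 = " << shifted << "\n"
            << "1 << 31     = " << int32_limit << "\n"
            << "fits in a 32-bit accumulator: "
            << (shifted < int32_limit ? "yes" : "no") << "\n";
  return 0;
}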