diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h
index 232f5f95928..e205f075b43 100644
--- a/tensorflow/lite/c/builtin_op_data.h
+++ b/tensorflow/lite/c/builtin_op_data.h
@@ -199,6 +199,8 @@ typedef struct {
 
 typedef struct {
   TfLiteFusedActivation activation;
+  // Parameter added for version 4.
+  bool pot_scale_int16;
 } TfLiteAddParams;
 
 typedef struct {
@@ -220,6 +222,8 @@ typedef struct {
 
 typedef struct {
   TfLiteFusedActivation activation;
+  // Parameter added for version 5.
+  bool pot_scale_int16;
 } TfLiteSubParams;
 
 typedef struct {
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index 0652c64f6c2..7fb04f5b89e 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -896,6 +896,7 @@ TfLiteStatus ParseAdd(const Operator* op, ErrorReporter* error_reporter,
   if (schema_params != nullptr) {
     params->activation =
         ConvertActivation(schema_params->fused_activation_function());
+    params->pot_scale_int16 = schema_params->pot_scale_int16();
   } else {
     // TODO(b/157480169): We should either return kTfLiteError or fill in some
     // reasonable defaults in the params struct. We are not doing so until we
@@ -1631,6 +1632,7 @@ TfLiteStatus ParseSub(const Operator* op, ErrorReporter* error_reporter,
   if (schema_params != nullptr) {
     params->activation =
         ConvertActivation(schema_params->fused_activation_function());
+    params->pot_scale_int16 = schema_params->pot_scale_int16();
   } else {
     // TODO(b/157480169): We should either return kTfLiteError or fill in some
     // reasonable defaults in the params struct. We are not doing so until we
diff --git a/tensorflow/lite/experimental/writer/writer_lib_test.cc b/tensorflow/lite/experimental/writer/writer_lib_test.cc
index fb59482f705..bf50d4944f1 100644
--- a/tensorflow/lite/experimental/writer/writer_lib_test.cc
+++ b/tensorflow/lite/experimental/writer/writer_lib_test.cc
@@ -47,6 +47,7 @@ TEST(Writer, FloatModelTest) {
   TfLiteAddParams* builtin_data =
       reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
   builtin_data->activation = kTfLiteActNone;
+  builtin_data->pot_scale_int16 = false;
   const TfLiteRegistration* reg = resolver.FindOp(BuiltinOperator_ADD, 1);
   interpreter.AddNodeWithParameters({0, 1}, {2}, initial_data, 0,
                                     reinterpret_cast<void*>(builtin_data), reg);
@@ -84,6 +85,7 @@ TEST(Writer, CustomInputOutputTest) {
   TfLiteAddParams* builtin_data =
       reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
   builtin_data->activation = kTfLiteActNone;
+  builtin_data->pot_scale_int16 = false;
   const TfLiteRegistration* reg = resolver.FindOp(BuiltinOperator_ADD, 1);
   interpreter.AddNodeWithParameters({0, 1}, {2}, initial_data, 0,
                                     reinterpret_cast<void*>(builtin_data), reg);
@@ -131,6 +133,7 @@ TEST(Writer, CustomInputOutputErrorCasesTest) {
   TfLiteAddParams* builtin_data =
       reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
   builtin_data->activation = kTfLiteActNone;
+  builtin_data->pot_scale_int16 = false;
   const TfLiteRegistration* reg = resolver.FindOp(BuiltinOperator_ADD, 1);
   interpreter.AddNodeWithParameters({0, 1}, {2}, initial_data, 0,
                                     reinterpret_cast<void*>(builtin_data), reg);
@@ -173,6 +176,7 @@ TEST(Writer, PerTensorQuantizedModelTest) {
   TfLiteAddParams* builtin_data =
       reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
   builtin_data->activation = kTfLiteActNone;
+  builtin_data->pot_scale_int16 = false;
   const TfLiteRegistration* reg = resolver.FindOp(BuiltinOperator_ADD, 1);
   interpreter.AddNodeWithParameters({0, 1}, {2}, initial_data, 0,
                                     reinterpret_cast<void*>(builtin_data), reg);
diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
index bda475bdc35..7692ae9e54b 100644
--- a/tensorflow/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -68,6 +68,11 @@ struct OpData {
   int32 input1_offset;
   int32 input2_offset;
   int32 output_offset;
+
+  // This parameter indicates whether the scale parameters of the inputs
+  // and output are a power of two. It is used in the
+  // 16-bit -> 16-bit quantization path.
+  bool pot_scale_int16;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -103,12 +108,55 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+  // 8-bit -> 8-bit general quantized path, with general rescalings,
+  // as well as 16-bit -> 16-bit with general rescalings.
+  bool pot_scale_int16 = true;
+
+  bool input1_scale_is_pot = false;
+  bool input2_scale_is_pot = false;
+  bool output_scale_is_pot = false;
+
+  int input1_scale_log2_rounded{0};
+  int input2_scale_log2_rounded{0};
+  int output_scale_log2_rounded{0};
+
+  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
+      output->type == kTfLiteInt16) {
+    // In the 16-bit case there are two implementations:
+    // one where the scale parameter is a general number, and
+    // one where the scale parameter is a power of two (POT) and
+    // zero_point is zero for inputs/output.
+    pot_scale_int16 = (input1->params.zero_point == 0) &&
+                      (input2->params.zero_point == 0) &&
+                      (output->params.zero_point == 0);
+
+    input1_scale_is_pot =
+        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
+
+    input2_scale_is_pot =
+        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
+
+    output_scale_is_pot =
+        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
+
+    pot_scale_int16 &=
+        input1_scale_is_pot && input2_scale_is_pot && output_scale_is_pot;
+  }
+
+  data->pot_scale_int16 = pot_scale_int16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      !pot_scale_int16) {
     // 8bit -> 8bit general quantized path, with general rescalings
+    // as well as 16-bit -> 16-bit with general rescalings.
     data->input1_offset = -input1->params.zero_point;
     data->input2_offset = -input2->params.zero_point;
     data->output_offset = output->params.zero_point;
-    data->left_shift = 20;
+
+    // The shift is set to 15 for 16-bit inputs and to 20 for 8-bit inputs.
+    // For 16-bit, the largest shifted value is 65535 << 15, which is less
+    // than 1 << 31, so the addition still fits in a 32-bit accumulator.
+    data->left_shift = !pot_scale_int16 ? 15 : 20;
     const double twice_max_input_scale =
         2 * std::max(input1->params.scale, input2->params.scale);
     const double real_input1_multiplier =
@@ -144,19 +192,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
     TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
 
-    int input1_scale_log2_rounded;
-    bool input1_scale_is_pot =
-        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
     TF_LITE_ENSURE(context, input1_scale_is_pot);
-
-    int input2_scale_log2_rounded;
-    bool input2_scale_is_pot =
-        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
     TF_LITE_ENSURE(context, input2_scale_is_pot);
-
-    int output_scale_log2_rounded;
-    bool output_scale_is_pot =
-        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
     TF_LITE_ENSURE(context, output_scale_is_pot);
 
     data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
@@ -231,7 +268,8 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input1,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      !data->pot_scale_int16) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;
@@ -266,6 +304,15 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
           TF_LITE_ADD(optimized_integer_ops, Add, int8_t);
         }
       }
+    } else if (output->type == kTfLiteInt16) {
+      if (need_broadcast) {
+        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int16_t);
+      } else {
+        reference_ops::Add(
+            op_params, GetTensorShape(input1), GetTensorData<int16_t>(input1),
+            GetTensorShape(input2), GetTensorData<int16_t>(input2),
+            GetTensorShape(output), GetTensorData<int16_t>(output), false);
+      }
     } else {
       if (kernel_type == kReference) {
         if (need_broadcast) {
@@ -283,12 +330,12 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
     }
 #undef TF_LITE_ADD
   } else if (output->type == kTfLiteInt16) {
+    tflite::ArithmeticParams op_params;
+    op_params.input1_shift = data->input1_shift;
+    op_params.input2_shift = data->input2_shift;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
 #define TF_LITE_ADD(type, opname)                                      \
-  tflite::ArithmeticParams op_params;                                  \
-  op_params.input1_shift = data->input1_shift;                         \
-  op_params.input2_shift = data->input2_shift;                         \
-  SetActivationParams(data->output_activation_min,                     \
-                      data->output_activation_max, &op_params);        \
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<int16_t>(input1), GetTensorShape(input2), \
                GetTensorData<int16_t>(input2), GetTensorShape(output), \
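
A minimal standalone sketch of the new path selection in Prepare(), for orientation: ScaleIsPowerOfTwo below is a hypothetical stand-in for CheckedLog2 (implemented here with std::frexp), so the exact tolerance and rounding behaviour are assumptions rather than the kernel's implementation.

    #include <cmath>
    #include <cstdio>

    // Hypothetical stand-in for the CheckedLog2 helper used in add.cc/sub.cc
    // (assumption: the real helper also allows a small rounding tolerance).
    bool ScaleIsPowerOfTwo(float scale, int* log2_result) {
      int exponent = 0;
      const float mantissa = std::frexp(scale, &exponent);
      *log2_result = exponent - 1;  // scale == mantissa * 2^exponent, mantissa in [0.5, 1)
      return mantissa == 0.5f;      // mantissa of exactly 0.5 means scale == 2^(exponent - 1)
    }

    // Mirrors the decision in Prepare(): take the POT int16 path only when all
    // zero points are zero and every scale is a power of two.
    bool UsePotInt16Path(float s1, int zp1, float s2, int zp2, float so, int zpo) {
      int unused = 0;
      bool pot = (zp1 == 0) && (zp2 == 0) && (zpo == 0);
      pot &= ScaleIsPowerOfTwo(s1, &unused) && ScaleIsPowerOfTwo(s2, &unused) &&
             ScaleIsPowerOfTwo(so, &unused);
      return pot;
    }

    int main() {
      // 1/32768 is a power of two: the existing POT path still applies.
      std::printf("%d\n", UsePotInt16Path(1.f / 32768, 0, 1.f / 32768, 0, 1.f / 32768, 0));
      // A general scale such as 0.003 now selects the general-rescaling path.
      std::printf("%d\n", UsePotInt16Path(0.003f, 0, 0.003f, 0, 0.003f, 0));
    }
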
diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc
index bb883dd9b05..fc78f930897 100644
--- a/tensorflow/lite/kernels/add_test.cc
+++ b/tensorflow/lite/kernels/add_test.cc
@@ -310,15 +310,18 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
   float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
-  std::vector<std::vector<float>> inputs1 = {
-      {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
-  std::vector<std::vector<float>> inputs2 = {
-      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}};
-  std::vector<std::vector<float>> results = {
-      {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
+  std::vector<std::vector<float>> inputs1 = {{0.1, 0.2, 0.3, 0.4, 0.9, 0.7},
+                                             {-0.8, 0.2, 0.4, 0.7, 0.1, 0.0},
+                                             {-0.8, 0.2, 0.7, 0.3, 0.9, 0.1}};
+  std::vector<std::vector<float>> inputs2 = {{0.6, 0.4, 0.3, 0.1, -0.1, 0.3},
+                                             {0.6, 0.4, 0.5, -0.8, 0.0, -1.0},
+                                             {0.6, 0.4, -0.8, 0.5, -0.9, 0.1}};
+  std::vector<std::vector<float>> results = {{0.7, 0.6, 0.6, 0.5, 0.8, 1.0},
+                                             {-0.2, 0.6, 0.9, -0.1, 0.1, -1.0},
+                                             {-0.2, 0.6, -0.1, 0.8, 0.0, 0.2}};
   for (size_t i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
-                          {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+    QuantizedAddOpModel m({TensorType_INT16, {1, 2, 3, 1}, kMin, kMax},
+                          {TensorType_INT16, {1, 2, 3, 1}, kMin, kMax},
                           {TensorType_INT16, {}, kMin, kMax},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<int16_t>(m.input1(), inputs1[i]);
@@ -439,6 +442,10 @@ TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt8) {
   QuantizedWithScalarBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt16) {
+  QuantizedWithScalarBroadcast<TensorType_INT16, int16_t>();
+}
+
 template <enum TensorType tensor_type, typename integer_dtype>
 void QuantizedWithMixedBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
@@ -501,6 +508,10 @@ TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt8) {
   QuantizedWithMixedBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt16) {
+  QuantizedWithMixedBroadcast<TensorType_INT16, int16_t>();
+}
+
 template <enum TensorType tensor_type, typename integer_dtype>
 void QuantizedWithGenericBroadcast() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
@@ -527,5 +538,9 @@ TEST(QuantizedAddOpModel, QuantizedWithGenericdBroadcastInt8) {
   QuantizedWithGenericBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithGenericdBroadcastInt16) {
+  QuantizedWithGenericBroadcast<TensorType_INT16, int16_t>();
+}
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/reference/add.h b/tensorflow/lite/kernels/internal/reference/add.h
index 94c58097154..5be7ab4dc0c 100644
--- a/tensorflow/lite/kernels/internal/reference/add.h
+++ b/tensorflow/lite/kernels/internal/reference/add.h
@@ -51,13 +51,18 @@ inline void Add(const ArithmeticParams& params,
 
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
+
+// This function is used for both 8-bit and 16-bit inputs; the accumulator is
+// 32-bit in either case. Overflow cannot occur thanks to the choice of the
+// left shift (20 or 15, respectively - see add.cc for details).
+template <typename T>
 inline void AddElementwise(int size, const ArithmeticParams& params,
-                           const uint8_t* input1_data,
-                           const uint8_t* input2_data, uint8_t* output_data) {
-  TFLITE_DCHECK_GT(params.input1_offset, -256);
-  TFLITE_DCHECK_GT(params.input2_offset, -256);
-  TFLITE_DCHECK_LT(params.input1_offset, 256);
-  TFLITE_DCHECK_LT(params.input2_offset, 256);
+                           const T* input1_data, const T* input2_data,
+                           T* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
 
   for (int i = 0; i < size; ++i) {
     const int32_t input1_val = params.input1_offset + input1_data[i];
@@ -78,7 +83,7 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
     const int32_t clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
-    output_data[i] = static_cast<uint8_t>(clamped_output);
+    output_data[i] = static_cast<T>(clamped_output);
   }
 }
 
@@ -132,10 +137,38 @@ inline void Add(const ArithmeticParams& params,
   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
 
+inline void AddGeneralParamScale(const ArithmeticParams& params,
+                                 const RuntimeShape& input1_shape,
+                                 const int16_t* input1_data,
+                                 const RuntimeShape& input2_shape,
+                                 const int16_t* input2_data,
+                                 const RuntimeShape& output_shape,
+                                 int16_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  int max_value = std::numeric_limits<int16_t>::max();
+
+  TFLITE_DCHECK_GT(params.input1_offset, -max_value);
+  TFLITE_DCHECK_GT(params.input2_offset, -max_value);
+  TFLITE_DCHECK_LT(params.input1_offset, max_value);
+  TFLITE_DCHECK_LT(params.input2_offset, max_value);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
 inline void Add(const ArithmeticParams& params,
                 const RuntimeShape& input1_shape, const int16_t* input1_data,
                 const RuntimeShape& input2_shape, const int16_t* input2_data,
-                const RuntimeShape& output_shape, int16_t* output_data) {
+                const RuntimeShape& output_shape, int16_t* output_data,
+                bool pot_scale = true) {
+  if (!pot_scale) {
+    AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
+                         input2_data, output_shape, output_data);
+    return;
+  }
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
 
@@ -258,13 +291,14 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
   }
 }
 
-inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
-                               const RuntimeShape& input1_shape,
-                               const uint8_t* input1_data,
-                               const RuntimeShape& input2_shape,
-                               const uint8_t* input2_data,
-                               const RuntimeShape& output_shape,
-                               uint8_t* output_data) {
+// This function is used for both 8-bit and 16-bit inputs; the accumulator is
+// 32-bit in either case. Overflow cannot occur thanks to the choice of the
+// left shift (20 or 15, respectively - see add.cc for details).
+template <typename T>
+inline void BroadcastAdd4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -314,7 +348,7 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
               std::min(params.quantized_activation_max,
                        std::max(params.quantized_activation_min, raw_output));
           output_data[Offset(extended_output_shape, b, y, x, c)] =
-              static_cast<uint8_t>(clamped_output);
+              static_cast<T>(clamped_output);
         }
       }
     }
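
The accumulator bound stated in the comments above can be checked with a couple of compile-time assertions; the input magnitudes (|offset + value| at most 65535 for 16-bit and roughly 510 for 8-bit) are the only assumption here.

    #include <cstdint>

    // Worked check of the 32-bit accumulator bound: with a left shift of 15 the
    // largest shifted 16-bit term stays below 2^31, and with a shift of 20 the
    // largest shifted 8-bit term does as well, so the addition cannot overflow.
    static_assert((static_cast<int64_t>(65535) << 15) < (int64_t{1} << 31),
                  "16-bit shifted input fits in a 32-bit accumulator");
    static_assert((static_cast<int64_t>(510) << 20) < (int64_t{1} << 31),
                  "8-bit shifted input fits in a 32-bit accumulator");

    int main() { return 0; }
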
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index adffa19c4e1..1d1db9e0403 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -89,8 +89,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
              /* min_version = */ 1,
              /* max_version = */ 3);
   AddBuiltin(BuiltinOperator_ADD, Register_ADD(),
-             /* min_version = */ 1,
-             /* max_version = */ 2);
+             /* min_version = */ 1,
+             /* max_version = */ 4);
   AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND(),
              /* min_version = */ 1,
              /* max_version = */ 3);
@@ -143,7 +143,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
              /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SUB, Register_SUB(),
              /* min_version = */ 1,
-             /* max_version = */ 4);
+             /* max_version = */ 5);
   AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT(),
              /* min_version = */ 1,
              /* max_version = */ 4);
diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
index 4cd9dd7ff60..f93ebecd46d 100644
--- a/tensorflow/lite/kernels/sub.cc
+++ b/tensorflow/lite/kernels/sub.cc
@@ -71,6 +71,11 @@ struct OpData {
   int32 input1_offset;
   int32 input2_offset;
   int32 output_offset;
+
+  // This parameter indicates whether the scale parameters of the inputs
+  // and output are a power of two. It is used in the
+  // 16-bit -> 16-bit quantization path.
+  bool pot_scale_int16;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -83,13 +88,14 @@ void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
 
-TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
-                              const TfLiteTensor* input_1,
-                              const TfLiteTensor* input_2, TfLiteTensor* output,
-                              TfLiteSubParams* params, OpData* op_params,
-                              int op_sign) {
-  TF_LITE_ENSURE(context,
-                 output->type == kTfLiteUInt8 || output->type == kTfLiteInt8);
+TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
+                                 const TfLiteTensor* input_1,
+                                 const TfLiteTensor* input_2,
+                                 TfLiteTensor* output, TfLiteSubParams* params,
+                                 OpData* op_params, int op_sign) {
+  TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
+                              output->type == kTfLiteInt8 ||
+                              output->type == kTfLiteInt16);
   const auto& input1_quantization_params = input_1->params;
   const auto& input2_quantization_params = input_2->params;
   const auto& output_quantization_params = output->params;
@@ -98,6 +104,9 @@ TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
   if (output->type == kTfLiteUInt8) {
     integer_type_min = std::numeric_limits<uint8_t>::min();
     integer_type_max = std::numeric_limits<uint8_t>::max();
+  } else if (output->type == kTfLiteInt16) {
+    integer_type_min = std::numeric_limits<int16_t>::min();
+    integer_type_max = std::numeric_limits<int16_t>::max();
   } else {
     // output->type == kTfLiteInt8
     integer_type_min = std::numeric_limits<int8_t>::min();
@@ -120,7 +129,11 @@ TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
   op_params->input1_offset = -input1_quantization_params.zero_point;
   op_params->input2_offset = -input2_quantization_params.zero_point;
   op_params->output_offset = output_quantization_params.zero_point;
-  op_params->left_shift = 20;
+
+  // The shift is set to 15 for 16-bit inputs and to 20 for 8-bit inputs.
+  // For 16-bit, the largest shifted value is 65535 << 15, which is less than
+  // 1 << 31, so the addition still fits in a 32-bit accumulator.
+  op_params->left_shift = output->type == kTfLiteInt16 ? 15 : 20;
   const double twice_max_input_scale =
       2 * std::max(input1_quantization_params.scale,
                    input2_quantization_params.scale);
@@ -146,13 +159,15 @@ TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
   TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
       context, params->activation, output, &op_params->output_activation_min,
       &op_params->output_activation_max));
+
   return kTfLiteOk;
 }
 
-TfLiteStatus PrepareInt16SubOp(TfLiteContext* context,
-                               const TfLiteTensor* input1,
-                               const TfLiteTensor* input2, TfLiteTensor* output,
-                               TfLiteSubParams* params, OpData* data) {
+TfLiteStatus PrepareInt16SubOpPOT(TfLiteContext* context,
+                                  const TfLiteTensor* input1,
+                                  const TfLiteTensor* input2,
+                                  TfLiteTensor* output, TfLiteSubParams* params,
+                                  OpData* data) {
   // 16bit -> 16bit special quantized path, supporting only a rather
   // narrow case of quantization parameters: zero_points must all be 0
   // ("symmetric quantization") and scales must be power-of-two (which
@@ -219,12 +234,51 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_OK(context, Prepare8BitSubOp(context, input1, input2, output,
-                                                params, data, -1));
+  // 8-bit -> 8-bit general quantized path, with general rescalings,
+  // as well as 16-bit -> 16-bit with general rescalings.
+  bool pot_scale_int16 = true;
+
+  bool input1_scale_is_pot = false;
+  bool input2_scale_is_pot = false;
+  bool output_scale_is_pot = false;
+
+  int input1_scale_log2_rounded{0};
+  int input2_scale_log2_rounded{0};
+  int output_scale_log2_rounded{0};
+
+  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
+      output->type == kTfLiteInt16) {
+    // In the 16-bit case there are two implementations:
+    // one where the scale parameter is a general number, and
+    // one where the scale parameter is a power of two (POT) and
+    // zero_point is zero for inputs/output.
+    pot_scale_int16 = (input1->params.zero_point == 0) &&
+                      (input2->params.zero_point == 0) &&
+                      (output->params.zero_point == 0);
+
+    input1_scale_is_pot =
+        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
+
+    input2_scale_is_pot =
+        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
+
+    output_scale_is_pot =
+        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
+
+    pot_scale_int16 &=
+        input1_scale_is_pot && input2_scale_is_pot && output_scale_is_pot;
+  }
+
+  data->pot_scale_int16 = pot_scale_int16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      !pot_scale_int16) {
+    TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
+                                                   output, params, data, -1));
   } else if (output->type == kTfLiteInt16) {
-    TF_LITE_ENSURE_OK(context, PrepareInt16SubOp(context, input1, input2,
-                                                 output, params, data));
+    // Special case for LSTM: the scale parameters are a power of two (POT).
+    TF_LITE_ENSURE_OK(context, PrepareInt16SubOpPOT(context, input1, input2,
+                                                    output, params, data));
   }
 
   return context->ResizeTensor(context, output, output_size);
@@ -332,6 +386,15 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     } else {
       TF_LITE_SUB(reference_integer_ops, Add, int8_t);
     }
+  } else if (!data->pot_scale_int16) {
+    if (need_broadcast) {
+      TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, int16_t);
+    } else {
+      reference_ops::Add(op_params, GetTensorShape(input1),
+                         GetTensorData<int16_t>(input1), GetTensorShape(input2),
+                         GetTensorData<int16_t>(input2), GetTensorShape(output),
+                         GetTensorData<int16_t>(output), false);
+    }
   } else if (output->type == kTfLiteUInt8) {
     if (kernel_type == kReference) {
       if (need_broadcast) {
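
For orientation, the general int16 SUB path above reuses the Add reference kernel (EvalQuantized calls reference_ops::Add and BroadcastAdd4DSlow); PrepareGeneralSubOp is invoked with op_sign = -1, which, per the existing sub.cc multiplier setup not shown in this hunk, flips the sign applied to the second input. A toy sketch of that idea, using floats and made-up scales in place of the real fixed-point multipliers:

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: a scalar version of the quantized Add inner loop, where
    // each input is offset-free here, left-shifted, and scaled by a signed
    // multiplier. Real kernels use fixed-point multipliers, not floats.
    int32_t ScaledSum(int16_t a, int16_t b, float mul_a, float mul_b, int left_shift) {
      const int32_t shifted_a = static_cast<int32_t>(a) << left_shift;
      const int32_t shifted_b = static_cast<int32_t>(b) << left_shift;
      const float scaled = shifted_a * mul_a + shifted_b * mul_b;
      return static_cast<int32_t>(scaled) >> left_shift;
    }

    int main() {
      // With mul_b > 0 the kernel adds; flipping the sign of mul_b (op_sign = -1)
      // makes the same kernel compute a subtraction.
      std::printf("add: %d\n", ScaledSum(100, 40, 0.5f, 0.5f, 15));   // prints 70
      std::printf("sub: %d\n", ScaledSum(100, 40, 0.5f, -0.5f, 15));  // prints 30
    }
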
diff --git a/tensorflow/lite/kernels/sub_test.cc b/tensorflow/lite/kernels/sub_test.cc
index 67054fe4903..24d9c251afb 100644
--- a/tensorflow/lite/kernels/sub_test.cc
+++ b/tensorflow/lite/kernels/sub_test.cc
@@ -304,6 +304,10 @@ TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt8) {
   QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationGenericInt16) {
+  QuantizedTestsNoActivation<TensorType_INT16, int16_t>();
+}
+
 template <TensorType tensor_type, typename integer_dtype>
 void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
@@ -365,6 +369,10 @@ TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt8) {
   QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt16) {
+  QuantizedVariousInputShapes<TensorType_INT16, int16_t>();
+}
+
 template <TensorType tensor_type, typename integer_dtype>
 void QuantizedWithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
@@ -393,6 +401,10 @@ TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt8) {
   QuantizedWithBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt16) {
+  QuantizedWithBroadcast<TensorType_INT16, int16_t>();
+}
+
 TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax =
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 878acde1e16..baeb49f7b7a 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -583,6 +583,8 @@ table ConcatenationOptions {
 
 table AddOptions {
   fused_activation_function:ActivationFunctionType;
+  // Parameters supported by version 4.
+  pot_scale_int16:bool = true;
 }
 
 table MulOptions {
@@ -704,6 +706,8 @@ table DepthToSpaceOptions {
 
 table SubOptions {
   fused_activation_function:ActivationFunctionType;
+  // Parameters supported by version 5.
+  pot_scale_int16:bool = true;
 }
 
 table DivOptions {
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index a6117dc72ab..a4691b70e49 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -4742,22 +4742,29 @@ flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(flatbuffers
 
 struct AddOptionsT : public flatbuffers::NativeTable {
   typedef AddOptions TableType;
+  bool pot_scale_int16;
   tflite::ActivationFunctionType fused_activation_function;
   AddOptionsT()
-      : fused_activation_function(tflite::ActivationFunctionType_NONE) {
+      : pot_scale_int16(true),
+        fused_activation_function(tflite::ActivationFunctionType_NONE) {
   }
 };
 
 struct AddOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef AddOptionsT NativeTableType;
   enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
+    VT_FUSED_ACTIVATION_FUNCTION = 4,
+    VT_POT_SCALE_INT16 = 6
   };
+  bool pot_scale_int16() const {
+    return GetField<uint8_t>(VT_POT_SCALE_INT16, 1) != 0;
+  }
   tflite::ActivationFunctionType fused_activation_function() const {
     return static_cast<tflite::ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_POT_SCALE_INT16) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
@@ -5907,22 +5914,29 @@ flatbuffers::Offset<DepthToSpaceOptions> CreateDepthToSpaceOptions(flatbuffers::
 
 struct SubOptionsT : public flatbuffers::NativeTable {
   typedef SubOptions TableType;
+  bool pot_scale_int16;
   tflite::ActivationFunctionType fused_activation_function;
   SubOptionsT()
-      : fused_activation_function(tflite::ActivationFunctionType_NONE) {
+      : pot_scale_int16(true),
+        fused_activation_function(tflite::ActivationFunctionType_NONE) {
   }
 };
 
 struct SubOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef SubOptionsT NativeTableType;
   enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_FUSED_ACTIVATION_FUNCTION = 4
+    VT_FUSED_ACTIVATION_FUNCTION = 4,
+    VT_POT_SCALE_INT16 = 6
   };
+  bool pot_scale_int16() const {
+    return GetField<uint8_t>(VT_POT_SCALE_INT16, 1) != 0;
+  }
   tflite::ActivationFunctionType fused_activation_function() const {
     return static_cast<tflite::ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_POT_SCALE_INT16) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            verifier.EndTable();
   }
diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc
index 567d000dab6..222be969560 100644
--- a/tensorflow/lite/toco/tflite/op_version.cc
+++ b/tensorflow/lite/toco/tflite/op_version.cc
@@ -53,12 +53,15 @@ std::string GetMinimumRuntimeVersionForModel(const Model& model) {
           {{OperatorType::kDepthwiseConv, 5}, kPendingReleaseOpVersion},
           {{OperatorType::kAdd, 1}, "1.5.0"},
           {{OperatorType::kAdd, 2}, "1.14.0"},
+          {{OperatorType::kAdd, 3}, kPendingReleaseOpVersion},
           {{OperatorType::kAddN, 1}, "1.14.0"},
           {{OperatorType::kSpaceToBatchND, 1}, "1.6.0"},
           {{OperatorType::kSpaceToBatchND, 2}, "1.14.0"},
           {{OperatorType::kSub, 1}, "1.6.0"},
           {{OperatorType::kSub, 2}, "1.14.0"},
+          {{OperatorType::kSub, 3}, "1.15.0"},
           {{OperatorType::kSub, 4}, kPendingReleaseOpVersion},
+          {{OperatorType::kSub, 5}, kPendingReleaseOpVersion},
           {{OperatorType::kDiv, 1}, "1.6.0"},
           {{OperatorType::kBatchToSpaceND, 1}, "1.6.0"},
           {{OperatorType::kBatchToSpaceND, 2}, "1.14.0"},
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 794691f5724..585b15bae2e 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -276,10 +276,10 @@ class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
     ::tflite::OpSignature op_sig =
         GetVersioningOpSig(builtin_op(), op_signature);
     if (input1_array.has_shape() && input2_array.has_shape()) {
-      op_sig.options.broadcast.num_dims =
+      op_sig.options.addsub.num_dims =
           std::max(input1_array.shape().dimensions_count(),
                    input2_array.shape().dimensions_count());
-      op_sig.options.broadcast.need_broadcast =
+      op_sig.options.addsub.need_broadcast =
           (input1_array.shape() != input2_array.shape());
     }
     return ::tflite::GetBuiltinOperatorVersion(op_sig);
diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc
index 92a3e68c644..ef4825c397e 100644
--- a/tensorflow/lite/tools/versioning/op_version.cc
+++ b/tensorflow/lite/tools/versioning/op_version.cc
@@ -450,13 +450,31 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) {
       }
       return 1;
 
+    case BuiltinOperator_ADD:
+      if (op_sig.input_types.at(0) == TensorType_INT16 &&
+          op_sig.output_types.at(0) == TensorType_INT16) {
+        if (!op_sig.options.addsub.pot_scale_int16) {
+          return 3;
+        }
+      }
+      if (op_sig.input_types.at(0) == TensorType_INT8) {
+        return 2;
+      }
+      return 1;
+
     case BuiltinOperator_SUB:
+      if (op_sig.input_types.at(0) == TensorType_INT16 &&
+          op_sig.output_types.at(0) == TensorType_INT16) {
+        if (!op_sig.options.addsub.pot_scale_int16) {
+          return 5;
+        }
+      }
       if (!op_sig.input_types.empty() &&
           op_sig.input_types.at(0) == TensorType_INT64) {
         return 4;
       }
-      if (op_sig.options.broadcast.need_broadcast &&
-          op_sig.options.broadcast.num_dims > 4) {
+      if (op_sig.options.addsub.need_broadcast &&
+          op_sig.options.addsub.num_dims > 4) {
         return 3;
       }
       if (op_sig.input_types.at(0) == TensorType_INT8) {
@@ -542,7 +560,7 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) {
         }
       }
       return 1;
-    case BuiltinOperator_ADD:
+
     case BuiltinOperator_SPACE_TO_DEPTH:
     case BuiltinOperator_SPLIT_V:
     case BuiltinOperator_SUM:
@@ -669,6 +687,26 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op,
       }
     } break;
 
+    case BuiltinOperator_ADD: {
+      auto add_option = op->builtin_options_as_AddOptions();
+      op_sig.options.addsub.pot_scale_int16 = true;
+      if (add_option) {
+        op_sig.options.addsub.pot_scale_int16 = add_option->pot_scale_int16();
+      }
+    } break;
+
+    case BuiltinOperator_SUB: {
+      auto sub_option = op->builtin_options_as_SubOptions();
+      op_sig.options.addsub.need_broadcast =
+          !HaveSameShapes(subgraph, op, 0, 1);
+      op_sig.options.addsub.num_dims =
+          std::max(GetNumDims(subgraph, op, 0), GetNumDims(subgraph, op, 1));
+      op_sig.options.addsub.pot_scale_int16 = true;
+      if (sub_option) {
+        op_sig.options.addsub.pot_scale_int16 = sub_option->pot_scale_int16();
+      }
+    } break;
+
     case BuiltinOperator_LSTM: {
       auto lstm_option = op->builtin_options_as_LSTMOptions();
       if (lstm_option) {
@@ -714,7 +752,7 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op,
     case BuiltinOperator_TRANSPOSE: {
       op_sig.options.single_input_op.num_dims = GetNumDims(subgraph, op, 0);
     } break;
-    case BuiltinOperator_SUB:
+
     case BuiltinOperator_DIV:
     case BuiltinOperator_MAXIMUM:
     case BuiltinOperator_MINIMUM: {
diff --git a/tensorflow/lite/tools/versioning/op_version.h b/tensorflow/lite/tools/versioning/op_version.h
index 71362001387..67a7b79fe38 100644
--- a/tensorflow/lite/tools/versioning/op_version.h
+++ b/tensorflow/lite/tools/versioning/op_version.h
@@ -63,6 +63,11 @@ typedef struct {
       int32_t num_dims;
       bool need_broadcast;
     } broadcast;
+    struct {
+      bool pot_scale_int16;
+      int32_t num_dims;
+      bool need_broadcast;
+    } addsub;
     struct {
       bool is_per_channel_quantized;
     } conv_2d;
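
To see how the new addsub options drive version selection, here is a minimal sketch in the style of op_version_test.cc; the OpSignature field names are taken from this header, and the expected versions mirror the BuiltinOperator_ADD case added in op_version.cc.

    #include <cassert>

    #include "tensorflow/lite/tools/versioning/op_version.h"

    // Sketch, assuming the OpSignature layout declared in op_version.h.
    int main() {
      tflite::OpSignature sig = {};
      sig.op = tflite::BuiltinOperator_ADD;
      sig.input_types = {tflite::TensorType_INT16, tflite::TensorType_INT16};
      sig.output_types = {tflite::TensorType_INT16};

      sig.options.addsub.pot_scale_int16 = false;
      assert(tflite::GetBuiltinOperatorVersion(sig) == 3);  // general-rescaling int16

      sig.options.addsub.pot_scale_int16 = true;
      assert(tflite::GetBuiltinOperatorVersion(sig) == 1);  // POT int16 stays at v1
      return 0;
    }
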
diff --git a/tensorflow/lite/tools/versioning/runtime_version.cc b/tensorflow/lite/tools/versioning/runtime_version.cc
index ccbbaa27d68..5a454224b92 100644
--- a/tensorflow/lite/tools/versioning/runtime_version.cc
+++ b/tensorflow/lite/tools/versioning/runtime_version.cc
@@ -72,6 +72,8 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code,
               {{BuiltinOperator_DEPTHWISE_CONV_2D, 6}, "2.3.0"},
               {{BuiltinOperator_ADD, 1}, "1.5.0"},
               {{BuiltinOperator_ADD, 2}, "1.14.0"},
+              {{BuiltinOperator_ADD, 3}, kPendingReleaseVersion},
+              {{BuiltinOperator_ADD, 4}, kPendingReleaseVersion},
               {{BuiltinOperator_ADD_N, 1}, "1.14.0"},
               {{BuiltinOperator_SPACE_TO_BATCH_ND, 1}, "1.6.0"},
               {{BuiltinOperator_SPACE_TO_BATCH_ND, 2}, "1.14.0"},
@@ -80,6 +82,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code,
               {{BuiltinOperator_SUB, 2}, "1.14.0"},
               {{BuiltinOperator_SUB, 3}, "2.3.0"},
               {{BuiltinOperator_SUB, 4}, kPendingReleaseVersion},
+              {{BuiltinOperator_SUB, 5}, kPendingReleaseVersion},
               {{BuiltinOperator_DENSIFY, 1}, "2.2.0"},
               {{BuiltinOperator_DIV, 1}, "1.6.0"},
               {{BuiltinOperator_DIV, 2}, "2.3.0"},