Rewrite quantized hardswish for good accuracy in all cases.
This is motivated by MobileNet v3 experiments, which exercised corner cases that the previous implementation did not anticipate (large quantization scales), and which showed that some fine arithmetic details have a significant impact on classification accuracy. Most importantly, bias must be minimized, as any increase in bias translates into a degradation of classification accuracy. That requirement has two specific consequences:

1. As HardSwish inherently requires forming the expression (x + 3)/6, it must, as a prerequisite step, rescale x onto a scale on which 3 is exactly representable.

2. There are 3 fixed-point multiplications. Using the usual rounding fixed-point multiplication primitive/instruction (e.g. NEON SQRDMULH) for all of them results in significant bias away from zero. This was fixed by suitably combining that usual rounding multiplication with a truncating multiplication (e.g. NEON SQDMULH), one feeding into the other, so that the two biases (away from zero, and toward zero) cancel each other.

A specific test case was added to the unit test to guard against regressions on this front. It relies on the fact that bias (which is visible at unit-test level) is empirically seen to be a sufficient proxy for classification accuracy (which is not visible at unit-test level, and too expensive to measure even for integration tests).

PiperOrigin-RevId: 257904120
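To illustrate point 2, here is a minimal scalar sketch (hypothetical helper names; not code from this commit) of the two 16-bit fixed-point multiplication primitives involved. The first follows gemmlowp::SaturatingRoundingDoublingHighMul, which closely mirrors NEON SQRDMULH: round-to-nearest, which on average biases results away from zero. The second mirrors the commit's new SaturatingDoublingHighMul helper, similar to NEON SQDMULH: rounding toward zero, which on average biases results toward zero.

#include <cstdint>
#include <limits>

// Like NEON SQRDMULH on int16: high 16 bits of 2*a*b, rounded to nearest
// (ties away from zero). The rounding gives an average bias away from zero.
inline std::int16_t RoundingDoublingHighMul16(std::int16_t a, std::int16_t b) {
  const bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
  const std::int32_t ab = static_cast<std::int32_t>(a) * b;
  const std::int32_t nudge = ab >= 0 ? (1 << 14) : (1 - (1 << 14));
  const auto result = static_cast<std::int16_t>((ab + nudge) / (1 << 15));
  return overflow ? std::numeric_limits<std::int16_t>::max() : result;
}

// Like NEON SQDMULH on int16: high 16 bits of 2*a*b, rounded toward zero.
// The truncation gives an average bias toward zero.
inline std::int16_t DoublingHighMul16(std::int16_t a, std::int16_t b) {
  const bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
  const std::int32_t ab = static_cast<std::int32_t>(a) * b;
  return overflow ? std::numeric_limits<std::int16_t>::max()
                  : static_cast<std::int16_t>(ab / (1 << 15));
}

Chaining one into the other, e.g. DoublingHighMul16(RoundingDoublingHighMul16(x, m1), m2), lets the away-from-zero and toward-zero biases approximately cancel; that is the structure used in the new reference implementation below, where SaturatingRoundingDoublingHighMul feeds into SaturatingDoublingHighMul. Point 1 is likewise visible in the new HardSwishPrepare below: the input is rescaled onto reluish_scale = 3.0f / 32768.0f, on which the real value 3.0 corresponds exactly to the fixed-point value 32768 (saturating to 32767 in int16, a negligible error).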
commit 7d4d60c36e
parent 42b8511f63
@@ -426,6 +426,7 @@ cc_library(
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels/internal:audio_utils",
         "//tensorflow/lite/kernels/internal:common",
+        "//tensorflow/lite/kernels/internal:compatibility",
        "//tensorflow/lite/kernels/internal:cpu_check",
         "//tensorflow/lite/kernels/internal:kernel_utils",
         "//tensorflow/lite/kernels/internal:optimized",
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/optimized/integer_ops/softmax.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
@@ -178,6 +179,22 @@ void HardSwishFree(TfLiteContext* context, void* buffer) {
   delete static_cast<HardSwishData*>(buffer);
 }
 
+void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32,
+                                     int16_t* multiplier_int16) {
+  TFLITE_DCHECK_GE(multiplier_int32, 0);
+  static constexpr int32_t kRoundingOffset = 1 << 15;
+  if (multiplier_int32 >=
+      std::numeric_limits<int32_t>::max() - kRoundingOffset) {
+    *multiplier_int16 = std::numeric_limits<int16_t>::max();
+    return;
+  }
+  const int32_t result = (multiplier_int32 + kRoundingOffset) >> 16;
+  TFLITE_DCHECK_LE(result << 16, multiplier_int32 + kRoundingOffset);
+  TFLITE_DCHECK_GT(result << 16, multiplier_int32 - kRoundingOffset);
+  *multiplier_int16 = result;
+  TFLITE_DCHECK_EQ(*multiplier_int16, result);
+}
+
 TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_STATUS(GenericPrepare(context, node));
   TfLiteTensor* output = GetOutput(context, node, 0);
@@ -186,40 +203,30 @@ TfLiteStatus HardSwishPrepare(TfLiteContext* context, TfLiteNode* node) {
     HardSwishData* data = static_cast<HardSwishData*>(node->user_data);
     HardSwishParams* params = &data->params;
     const TfLiteTensor* input = GetInput(context, node, 0);
-    // TODO(131260336): Maybe pick a better way to select the denominator shift.
-    // Include input shift into the shift.
-    static constexpr int32_t extra_input_shift = 3;
-    // Note: optimized implementations will rely on the ability to perform this
-    // left shift within int16 without overflow. The values being left-shifted
-    // range in [-255, 255] i.e. just under 2^8 in absolute value, and after the
-    // left shift they will still be added the 'three_input' value, which is
-    // safe if they're not greater than 2^14 in absolute value (since 2^15 is
-    // the magnitude of the boundaries of int16 range). 14-8 == 6, so we
-    // require extra_input_shift to be no greater than 6.
-    static_assert(extra_input_shift <= 6, "");
-    const auto in_scale = input->params.scale;
     params->input_zero_point = input->params.zero_point;
-    const auto out_scale = output->params.scale;
-    const int32_t out_zero_point = output->params.zero_point;
-    // Get 3 and 6 represented in input scale. We avoid intermediate conversion
-    // to the "true" scale, so all operations are done in input scale losslessly
-    // and then converted to the output scale.
-    // However 3 and 6 might not have exact representation in input scale.
-    // We use extra multiplier to avoid precision loss when converting
-    // 3 and 6 from input to output.
-    params->three_input = std::lround((3 << extra_input_shift) / in_scale);
-    params->six_input = std::lround((6 << extra_input_shift) / in_scale);
-    // Compensate for the fact that we multiply two numbers in in_scale
-    // and produce result in output format.
-    // NB: we fold 6 multiplier into the scaling factor here:
-    float from_in_to_out_sq = (in_scale * in_scale / out_scale / 6);
-    from_in_to_out_sq /= (1 << extra_input_shift);
-    QuantizeMultiplierSmallerThanOneExp(from_in_to_out_sq, &(params->scale),
-                                        &(params->shift));
-    params->output_offset = out_zero_point;
-    params->clip_input_shift = extra_input_shift;
+    params->output_zero_point = output->params.zero_point;
+    const float input_scale = input->params.scale;
+    const float hires_input_scale = (1.0f / 128.0f) * input_scale;
+    const float reluish_scale = 3.0f / 32768.0f;
+    const float output_scale = output->params.scale;
+    const float output_multiplier = hires_input_scale / output_scale;
+    int32_t output_multiplier_fixedpoint_int32;
+    QuantizeMultiplier(output_multiplier, &output_multiplier_fixedpoint_int32,
+                       &params->output_multiplier_exponent);
+    DownScaleInt32ToInt16Multiplier(
+        output_multiplier_fixedpoint_int32,
+        &params->output_multiplier_fixedpoint_int16);
+    TF_LITE_ENSURE(context, params->output_multiplier_exponent <= 0);
+    const float reluish_multiplier = hires_input_scale / reluish_scale;
+    int32_t reluish_multiplier_fixedpoint_int32;
+    QuantizeMultiplier(reluish_multiplier, &reluish_multiplier_fixedpoint_int32,
+                       &params->reluish_multiplier_exponent);
+    DownScaleInt32ToInt16Multiplier(
+        reluish_multiplier_fixedpoint_int32,
+        &params->reluish_multiplier_fixedpoint_int16);
   }
   return kTfLiteOk;
 }
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
+#include <limits>
 #include <random>
 
 #include <gtest/gtest.h>
@@ -124,7 +125,6 @@ class QuantizedActivationsOpModel : public BaseActivationsOpModel {
     QuantizeAndPopulate<T>(input_, data);
   }
   template <typename T>
-
   std::vector<T> GetOutput() {
     return ExtractVector<T>(output_);
   }
@@ -243,33 +243,88 @@ void TestFloatHardSwish(int size, std::minstd_rand* random_engine) {
 }
 
 template <typename QuantizedType>
-void TestQuantizedHardSwish(TensorType tensor_type, int size,
+void TestQuantizedHardSwish(TensorType tensor_type, int size, float input_min,
+                            float input_max, float output_min, float output_max,
                             std::minstd_rand* random_engine) {
   std::vector<float> float_input_values;
-  const float kMin = -10.0f;
-  const float kMax = 10.0f;
-  GenerateUniformRandomVector(size, kMin, kMax, random_engine,
+  GenerateUniformRandomVector(size, input_min, input_max, random_engine,
                               &float_input_values);
-  const float kOutMin = -3;
-  const float kOutMax = kMax;
   std::vector<float> float_ref_output_values;
   EvalTestReferenceHardSwish(size, float_input_values,
                              &float_ref_output_values);
+  for (float& val : float_ref_output_values) {
+    val = std::min(output_max, std::max(output_min, val));
+  }
   QuantizedActivationsOpModel m(
       BuiltinOperator_HARD_SWISH,
-      /*input=*/{tensor_type, {1, 1, 1, size}, kMin, kMax},
-      /*output=*/{tensor_type, {1, 1, 1, size}, kOutMin, kOutMax});
+      /*input=*/{tensor_type, {1, 1, 1, size}, input_min, input_max},
+      /*output=*/{tensor_type, {1, 1, 1, size}, output_min, output_max});
   m.SetInput<QuantizedType>(float_input_values);
 
   m.Invoke();
+  const std::vector<float>& dequantized_output =
+      m.GetDequantizedOutput<QuantizedType>();
   // The numerical error for any 8bit quantized function is at least one half
   // times the quantization step: 0.5 * (kOutMax - kOutMin) / 256.
   // To that we add again the quantization step (kOutMax - kOutMin) / 256
   // to allow for an off-by-one rounding error.
-  const float kTolerance = (kOutMax - kOutMin) * (1.5f / 256.f);
-  EXPECT_THAT(
-      m.GetDequantizedOutput<QuantizedType>(),
-      ElementsAreArray(ArrayFloatNear(float_ref_output_values, kTolerance)));
+  const float kTolerance =
+      std::max(input_max - input_min, output_max - output_min) * (1.5f / 256.f);
+  EXPECT_THAT(dequantized_output, ElementsAreArray(ArrayFloatNear(
+                                      float_ref_output_values, kTolerance)));
+}
+
+template <typename QuantizedType>
+void TestQuantizedHardSwishBias(TensorType tensor_type, float input_min,
+                                float input_max, float output_min,
+                                float output_max, float tolerated_bias) {
+  const float quantized_type_range =
+      static_cast<float>(std::numeric_limits<QuantizedType>::max()) -
+      static_cast<float>(std::numeric_limits<QuantizedType>::min());
+  const float input_scale = (input_max - input_min) / quantized_type_range;
+  const float output_scale = (output_max - output_min) / quantized_type_range;
+  const float max_scale = std::max(output_scale, input_scale);
+
+  // In this bias-focused test case, no need for randomly generated input
+  // values.
+  ASSERT_LE(input_min, -3.0f);
+  ASSERT_GE(input_max, 3.0f);
+  const int quantized_input_negative_three =
+      std::round(std::numeric_limits<QuantizedType>::min() +
+                 (-3.0f - input_min) / input_scale);
+  const int quantized_input_positive_three =
+      std::round(std::numeric_limits<QuantizedType>::min() +
+                 (3.0f - input_min) / input_scale);
+  std::vector<float> float_input_values;
+  for (int i = quantized_input_negative_three;
+       i <= quantized_input_positive_three; i++) {
+    float_input_values.push_back(
+        input_min +
+        (i - std::numeric_limits<QuantizedType>::min()) * input_scale);
+  }
+  const int size = float_input_values.size();
+  std::vector<float> float_ref_output_values;
+  EvalTestReferenceHardSwish(size, float_input_values,
+                             &float_ref_output_values);
+  for (float& val : float_ref_output_values) {
+    val = std::min(output_max, std::max(output_min, val));
+  }
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_HARD_SWISH,
+      /*input=*/{tensor_type, {1, 1, 1, size}, input_min, input_max},
+      /*output=*/{tensor_type, {1, 1, 1, size}, output_min, output_max});
+  m.SetInput<QuantizedType>(float_input_values);
+
+  m.Invoke();
+  const std::vector<float>& dequantized_output =
+      m.GetDequantizedOutput<QuantizedType>();
+
+  float sum_diff = 0;
+  for (int i = 0; i < size; i++) {
+    sum_diff += dequantized_output[i] - float_ref_output_values[i];
+  }
+  const float bias = sum_diff / (size * max_scale);
+  EXPECT_LE(std::abs(bias), tolerated_bias);
 }
 
 TEST(FloatActivationsOpTest, HardSwish) {
@@ -281,10 +336,34 @@ TEST(FloatActivationsOpTest, HardSwish) {
 
 TEST(QuantizedActivationsOpTest, HardSwish) {
   std::minstd_rand random_engine;
-  for (int size : {1, 2, 3, 4, 10, 20, 30, 40, 100}) {
-    TestQuantizedHardSwish<uint8_t>(TensorType_UINT8, size, &random_engine);
-    TestQuantizedHardSwish<int8_t>(TensorType_INT8, size, &random_engine);
+  std::vector<std::pair<float, float>> minmax_pairs{
+      {0.f, 1.f}, {-2.f, 1.f}, {-5.f, 10.f}, {-40.f, 60.f}};
+  for (const auto& input_minmax : minmax_pairs) {
+    for (const auto& output_minmax : minmax_pairs) {
+      float input_min = input_minmax.first;
+      float input_max = input_minmax.second;
+      float output_min = output_minmax.first;
+      float output_max = output_minmax.second;
+      for (int size : {1, 3, 10, 100}) {
+        TestQuantizedHardSwish<uint8_t>(TensorType_UINT8, size, input_min,
+                                        input_max, output_min, output_max,
+                                        &random_engine);
+        TestQuantizedHardSwish<int8_t>(TensorType_INT8, size, input_min,
+                                       input_max, output_min, output_max,
+                                       &random_engine);
+      }
+    }
   }
 }
 
+// See the comment in the reference implementation of quantized HardSwish:
+// A numerical issue significantly affecting ImageNet classification accuracy
+// with MobileNet v3 is only observable at the scale of HardSwish unit tests
+// if we monitor specifically bias. This testcase is extracted from one of the
+// HardSwish nodes in that MobileNet v3 that exhibited this issue.
+TEST(QuantizedActivationsOpTest, HardSwishBias) {
+  TestQuantizedHardSwishBias<uint8_t>(TensorType_UINT8, -11.654928f, 25.036512f,
+                                      -0.3905796f, 24.50887f, 0.035);
+}
+
 TEST(FloatActivationsOpTest, Tanh) {
@@ -5527,125 +5527,168 @@ inline void SaturateAndStore(int16x8_t src, std::int8_t* dst) {
 }
 #endif
 
-template <typename QuantizedType>
+template <typename T>
 inline void HardSwish(const HardSwishParams& params,
-                      const RuntimeShape& input_shape,
-                      const QuantizedType* input_data,
-                      const RuntimeShape& output_shape,
-                      QuantizedType* output_data) {
+                      const RuntimeShape& input_shape, const T* input_data,
+                      const RuntimeShape& output_shape, T* output_data) {
   gemmlowp::ScopedProfilingLabel label("HardSwish/Quantized");
-  // Goal: (x * relu6(x+3))/6
-  const int size = MatchingFlatSize(input_shape, output_shape);
-  const int32_t extra_input_shift = params.clip_input_shift;
-  const auto in_zero_point = params.input_zero_point;
-  const auto three_in = params.three_input;
-  const auto six_in = params.six_input;
-  const auto real_shift = params.shift;
-  const auto scale = params.scale;
-  const auto offset = params.output_offset;
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
   int i = 0;
-#ifdef USE_NEON
-  const int16x8_t extra_input_shift_vec = vdupq_n_s16(extra_input_shift);
-  const int16x8_t three_in_vec = vdupq_n_s16(three_in);
-  const int16x8_t six_in_vec = vdupq_n_s16(six_in);
-  // The quantization params of this op are designed around a reference
-  // implementation that performs plain integer multiplication, not
-  // fixed-point multiplication. The 16-bit fixed-point multiplications
-  // that we use here, vqrdmulhq_s16, differ from that by an (rounding)
-  // right shift by 15 bits. So in terms of scale and leaving aside
-  // accuracy considerations, we could simply compensate for that by
-  // adding 15 to real_shift. Doing so results in approximately correct results,
-  // but there is high inaccuracy in the low bits. That is because unlike
-  // the integer multiplications done in the reference code, our fixed-point
-  // multiplication are destructive of low bits. In order to have accurate
-  // enough results, we move some of that bit-shifting from being applied to
-  // the result to being applied to one of the operands of these fixed-point
-  // multiplications, before the information in the low bits is destroyed.
-  // Fortunately, one of the operands is by construction smaller than 2^8
-  // in absolute value, so it's safe to left-shift it by 7 bits.
-  static constexpr int left_shift_on_scaled_input = 7;
-  // We now adjust the tweak to real_shift accordingly: instead of adding 15,
-  // we only add (15 - left_shift_on_scaled_input).
-  const int16x8_t real_shift_vec =
-      vdupq_n_s16(15 - left_shift_on_scaled_input + real_shift);
-  const int16x8_t scale_vec = vdupq_n_s16((scale + (1 << 15)) >> 16);
-  const int16x8_t offset_vec = vdupq_n_s16(offset);
-  const int16x8_t zero = vdupq_n_s16(0);
-  for (; i <= size - 32; i += 32) {
+  // This code heavily uses NEON saturating left shifts (vqshl*) with shift
+  // amounts that can be zero, in which case we rely on the correct behavior
+  // of a left shift by zero returning just its first operand unmodified.
+  // Unfortunately, the Intel arm_neon_sse.h implementation of vqshl* is
+  // buggy in the case of zero shift amounts, see b/137199585. That is why
+  // this NEON code path is restricted to true ARM NEON, excluding
+  // arm_neon_sse.h. Anyway, the arm_neon_sse.h implementation of saturating
+  // left shifts is slow scalar code, so there may not be much benefit in
+  // running that over just plain reference code.
+  //
+  // TODO(b/137199585): revisit when this is fixed.
+#ifdef __ARM_NEON
+  const int16x8_t positive_reluish_multiplier_exponent_minus_one =
+      vdupq_n_s16(std::max(0, params.reluish_multiplier_exponent - 1));
+  const int16x8_t positive_reluish_multiplier_exponent_last_bit =
+      vdupq_n_s16(params.reluish_multiplier_exponent > 0 ? 1 : 0);
+  const int16x8_t negative_reluish_multiplier_exponent =
+      vdupq_n_s16(std::min(0, params.reluish_multiplier_exponent));
+  const int16x8_t constant_32767 = vdupq_n_s16(32767);
+  const int16x8_t output_multiplier_exponent =
+      vdupq_n_s16(params.output_multiplier_exponent);
+  const int16x8_t output_zero_point = vdupq_n_s16(params.output_zero_point);
+  // 4x unrolled version of the below NEON loop. Read that first.
+  for (; i <= flat_size - 32; i += 32) {
     using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint;
-    int16x8x2_t in_0_1 =
-        Load16AndSubtractZeroPoint(input_data + i + 0, in_zero_point);
-    int16x8x2_t in_2_3 =
-        Load16AndSubtractZeroPoint(input_data + i + 16, in_zero_point);
-    int16x8_t in_reluish_0 = vshlq_s16(in_0_1.val[0], extra_input_shift_vec);
-    int16x8_t in_reluish_1 = vshlq_s16(in_0_1.val[1], extra_input_shift_vec);
-    int16x8_t in_reluish_2 = vshlq_s16(in_2_3.val[0], extra_input_shift_vec);
-    int16x8_t in_reluish_3 = vshlq_s16(in_2_3.val[1], extra_input_shift_vec);
-    in_reluish_0 = vaddq_s16(in_reluish_0, three_in_vec);
-    in_reluish_1 = vaddq_s16(in_reluish_1, three_in_vec);
-    in_reluish_2 = vaddq_s16(in_reluish_2, three_in_vec);
-    in_reluish_3 = vaddq_s16(in_reluish_3, three_in_vec);
-    in_reluish_0 = vminq_s16(in_reluish_0, six_in_vec);
-    in_reluish_1 = vminq_s16(in_reluish_1, six_in_vec);
-    in_reluish_2 = vminq_s16(in_reluish_2, six_in_vec);
-    in_reluish_3 = vminq_s16(in_reluish_3, six_in_vec);
-    in_reluish_0 = vmaxq_s16(in_reluish_0, zero);
-    in_reluish_1 = vmaxq_s16(in_reluish_1, zero);
-    in_reluish_2 = vmaxq_s16(in_reluish_2, zero);
-    in_reluish_3 = vmaxq_s16(in_reluish_3, zero);
-    int16x8_t in_scaled_0 = vqrdmulhq_s16(
-        vshlq_n_s16(in_0_1.val[0], left_shift_on_scaled_input), scale_vec);
-    int16x8_t in_scaled_1 = vqrdmulhq_s16(
-        vshlq_n_s16(in_0_1.val[1], left_shift_on_scaled_input), scale_vec);
-    int16x8_t in_scaled_2 = vqrdmulhq_s16(
-        vshlq_n_s16(in_2_3.val[0], left_shift_on_scaled_input), scale_vec);
-    int16x8_t in_scaled_3 = vqrdmulhq_s16(
-        vshlq_n_s16(in_2_3.val[1], left_shift_on_scaled_input), scale_vec);
-    int16x8_t product_0 = vqrdmulhq_s16(in_scaled_0, in_reluish_0);
-    int16x8_t product_1 = vqrdmulhq_s16(in_scaled_1, in_reluish_1);
-    int16x8_t product_2 = vqrdmulhq_s16(in_scaled_2, in_reluish_2);
-    int16x8_t product_3 = vqrdmulhq_s16(in_scaled_3, in_reluish_3);
-    product_0 = vrshlq_s16(product_0, real_shift_vec);
-    product_1 = vrshlq_s16(product_1, real_shift_vec);
-    product_2 = vrshlq_s16(product_2, real_shift_vec);
-    product_3 = vrshlq_s16(product_3, real_shift_vec);
-    SaturateAndStore(vaddq_s16(product_0, offset_vec), output_data + i + 0);
-    SaturateAndStore(vaddq_s16(product_1, offset_vec), output_data + i + 8);
-    SaturateAndStore(vaddq_s16(product_2, offset_vec), output_data + i + 16);
-    SaturateAndStore(vaddq_s16(product_3, offset_vec), output_data + i + 24);
+    const int16x8x2_t input_value_0_1 =
+        Load16AndSubtractZeroPoint(input_data + i, params.input_zero_point);
+    const int16x8x2_t input_value_2_3 = Load16AndSubtractZeroPoint(
+        input_data + i + 16, params.input_zero_point);
+    const int16x8_t input_value_on_hires_input_scale_0 =
+        vshlq_n_s16(input_value_0_1.val[0], 7);
+    const int16x8_t input_value_on_hires_input_scale_1 =
+        vshlq_n_s16(input_value_0_1.val[1], 7);
+    const int16x8_t input_value_on_hires_input_scale_2 =
+        vshlq_n_s16(input_value_2_3.val[0], 7);
+    const int16x8_t input_value_on_hires_input_scale_3 =
+        vshlq_n_s16(input_value_2_3.val[1], 7);
+    const int16x8_t input_value_on_preshift_output_scale_0 =
+        vqrdmulhq_n_s16(input_value_on_hires_input_scale_0,
+                        params.output_multiplier_fixedpoint_int16);
+    const int16x8_t input_value_on_preshift_output_scale_1 =
+        vqrdmulhq_n_s16(input_value_on_hires_input_scale_1,
+                        params.output_multiplier_fixedpoint_int16);
+    const int16x8_t input_value_on_preshift_output_scale_2 =
+        vqrdmulhq_n_s16(input_value_on_hires_input_scale_2,
+                        params.output_multiplier_fixedpoint_int16);
+    const int16x8_t input_value_on_preshift_output_scale_3 =
+        vqrdmulhq_n_s16(input_value_on_hires_input_scale_3,
+                        params.output_multiplier_fixedpoint_int16);
+    int16x8_t reluish_value_0 = input_value_on_hires_input_scale_0;
+    int16x8_t reluish_value_1 = input_value_on_hires_input_scale_1;
+    int16x8_t reluish_value_2 = input_value_on_hires_input_scale_2;
+    int16x8_t reluish_value_3 = input_value_on_hires_input_scale_3;
+    reluish_value_0 = vqshlq_s16(
+        reluish_value_0, positive_reluish_multiplier_exponent_minus_one);
+    reluish_value_1 = vqshlq_s16(
+        reluish_value_1, positive_reluish_multiplier_exponent_minus_one);
+    reluish_value_2 = vqshlq_s16(
+        reluish_value_2, positive_reluish_multiplier_exponent_minus_one);
+    reluish_value_3 = vqshlq_s16(
+        reluish_value_3, positive_reluish_multiplier_exponent_minus_one);
+    reluish_value_0 = vqrdmulhq_n_s16(
+        reluish_value_0, params.reluish_multiplier_fixedpoint_int16);
+    reluish_value_1 = vqrdmulhq_n_s16(
+        reluish_value_1, params.reluish_multiplier_fixedpoint_int16);
+    reluish_value_2 = vqrdmulhq_n_s16(
+        reluish_value_2, params.reluish_multiplier_fixedpoint_int16);
+    reluish_value_3 = vqrdmulhq_n_s16(
+        reluish_value_3, params.reluish_multiplier_fixedpoint_int16);
+    reluish_value_0 = vqshlq_s16(reluish_value_0,
+                                 positive_reluish_multiplier_exponent_last_bit);
+    reluish_value_1 = vqshlq_s16(reluish_value_1,
+                                 positive_reluish_multiplier_exponent_last_bit);
+    reluish_value_2 = vqshlq_s16(reluish_value_2,
+                                 positive_reluish_multiplier_exponent_last_bit);
+    reluish_value_3 = vqshlq_s16(reluish_value_3,
+                                 positive_reluish_multiplier_exponent_last_bit);
+    reluish_value_0 =
+        vrshlq_s16(reluish_value_0, negative_reluish_multiplier_exponent);
+    reluish_value_1 =
+        vrshlq_s16(reluish_value_1, negative_reluish_multiplier_exponent);
+    reluish_value_2 =
+        vrshlq_s16(reluish_value_2, negative_reluish_multiplier_exponent);
+    reluish_value_3 =
+        vrshlq_s16(reluish_value_3, negative_reluish_multiplier_exponent);
+    reluish_value_0 = vrhaddq_s16(reluish_value_0, constant_32767);
+    reluish_value_1 = vrhaddq_s16(reluish_value_1, constant_32767);
+    reluish_value_2 = vrhaddq_s16(reluish_value_2, constant_32767);
+    reluish_value_3 = vrhaddq_s16(reluish_value_3, constant_32767);
+    const int16x8_t preshift_output_value_0 =
+        vqdmulhq_s16(reluish_value_0, input_value_on_preshift_output_scale_0);
+    const int16x8_t preshift_output_value_1 =
+        vqdmulhq_s16(reluish_value_1, input_value_on_preshift_output_scale_1);
+    const int16x8_t preshift_output_value_2 =
+        vqdmulhq_s16(reluish_value_2, input_value_on_preshift_output_scale_2);
+    const int16x8_t preshift_output_value_3 =
+        vqdmulhq_s16(reluish_value_3, input_value_on_preshift_output_scale_3);
+    int16x8_t output_value_0 =
+        vrshlq_s16(preshift_output_value_0, output_multiplier_exponent);
+    int16x8_t output_value_1 =
+        vrshlq_s16(preshift_output_value_1, output_multiplier_exponent);
+    int16x8_t output_value_2 =
+        vrshlq_s16(preshift_output_value_2, output_multiplier_exponent);
+    int16x8_t output_value_3 =
+        vrshlq_s16(preshift_output_value_3, output_multiplier_exponent);
+    output_value_0 = vaddq_s16(output_value_0, output_zero_point);
+    output_value_1 = vaddq_s16(output_value_1, output_zero_point);
+    output_value_2 = vaddq_s16(output_value_2, output_zero_point);
+    output_value_3 = vaddq_s16(output_value_3, output_zero_point);
+    SaturateAndStore(output_value_0, output_data + i);
+    SaturateAndStore(output_value_1, output_data + i + 8);
+    SaturateAndStore(output_value_2, output_data + i + 16);
+    SaturateAndStore(output_value_3, output_data + i + 24);
   }
-  for (; i <= size - 8; i += 8) {
+  // NEON version of reference_ops::HardSwish. Read that first.
+  for (; i <= flat_size - 8; i += 8) {
     using cpu_backend_gemm::detail::Load8AndSubtractZeroPoint;
-    // See comments in the float NEON HardSwish implementation.
-    int16x8_t in = Load8AndSubtractZeroPoint(input_data + i, in_zero_point);
-    int16x8_t in_reluish = vshlq_s16(in, extra_input_shift_vec);
-    in_reluish = vaddq_s16(in_reluish, three_in_vec);
-    in_reluish = vminq_s16(in_reluish, six_in_vec);
-    in_reluish = vmaxq_s16(zero, in_reluish);
-    int16x8_t in_scaled =
-        vqrdmulhq_s16(vshlq_n_s16(in, left_shift_on_scaled_input), scale_vec);
-    int16x8_t product = vqrdmulhq_s16(in_scaled, in_reluish);
-    product = vrshlq_s16(product, real_shift_vec);
-    SaturateAndStore(vaddq_s16(product, offset_vec), output_data + i);
+    const int16x8_t input_value =
+        Load8AndSubtractZeroPoint(input_data + i, params.input_zero_point);
+    const int16x8_t input_value_on_hires_input_scale =
+        vshlq_n_s16(input_value, 7);
+    const int16x8_t input_value_on_preshift_output_scale =
+        vqrdmulhq_n_s16(input_value_on_hires_input_scale,
+                        params.output_multiplier_fixedpoint_int16);
+    int16x8_t reluish_value = input_value_on_hires_input_scale;
+    reluish_value = vqshlq_s16(reluish_value,
+                               positive_reluish_multiplier_exponent_minus_one);
+    reluish_value = vqrdmulhq_n_s16(reluish_value,
+                                    params.reluish_multiplier_fixedpoint_int16);
+    reluish_value = vqshlq_s16(reluish_value,
+                               positive_reluish_multiplier_exponent_last_bit);
+    reluish_value =
+        vrshlq_s16(reluish_value, negative_reluish_multiplier_exponent);
+    reluish_value = vrhaddq_s16(reluish_value, constant_32767);
+    const int16x8_t preshift_output_value =
+        vqdmulhq_s16(reluish_value, input_value_on_preshift_output_scale);
+    int16x8_t output_value =
+        vrshlq_s16(preshift_output_value, output_multiplier_exponent);
+    output_value = vaddq_s16(output_value, output_zero_point);
+    SaturateAndStore(output_value, output_data + i);
   }
 #endif
-  for (; i < size; i++) {
-    int32_t v = static_cast<int32>(input_data[i]);
-    v -= in_zero_point;  // Make zeros - zero again!
-    // Computes x + 3 in input * 2^extra_input_shift scale.
-    //
-    // Note: three_in is in that scale already.
-    const int32_t v3 = (v << extra_input_shift) + three_in;
-    // Computes hard-swish up to a final scale
-    v *= std::min(six_in, std::max(0, v3));
-    // this converts from x * relu6(x+3) in input into x * relu6(x+3) / 6
-    // in output scale.
-    v = MultiplyByQuantizedMultiplierSmallerThanOneExp(v, scale, real_shift);
-    v += offset;
-    output_data[i] = reference_ops::Saturate<QuantizedType>(v);
+  // TODO(b/137208495): revisit when unit tests cover reference code.
+  // Fall back to reference_ops::HardSwish. In general we have preferred
+  // to duplicate such scalar code rather than call reference code to handle
+  // leftovers, thinking that code duplication was not a big concern.
+  // However, most of our unit tests happen to test only optimized code,
+  // and the quantized HardSwish implementation is nontrivial enough that
+  // I really want test coverage for the reference code.
+  if (i < flat_size) {
+    const RuntimeShape leftover_shape{flat_size - i};
+    reference_ops::HardSwish(params, leftover_shape, input_data + i,
+                             leftover_shape, output_data + i);
   }
 }
 
@@ -4580,11 +4580,23 @@ inline void HardSwish(const RuntimeShape& input_shape, const T* input_data,
   }
 }
 
-template <typename T>
-inline T Saturate(int32_t v) {
-  return static_cast<T>(std::min(
-      static_cast<int32_t>(std::numeric_limits<T>::max()),
-      std::max(static_cast<int32_t>(std::numeric_limits<T>::min()), v)));
+inline int16_t SaturatingLeftShift(int16_t value, int amount) {
+  int32_t result = static_cast<int32_t>(value) * (1 << amount);
+  result = std::min<int32_t>(result, std::numeric_limits<int16_t>::max());
+  result = std::max<int32_t>(result, std::numeric_limits<int16_t>::min());
+  return result;
+}
+
+// Similar to ARM instruction SQDMULH.
+// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except
+// rounding to zero instead of to nearest (SQRDMULH).
+inline std::int16_t SaturatingDoublingHighMul(std::int16_t a, std::int16_t b) {
+  bool overflow = a == b && a == std::numeric_limits<std::int16_t>::min();
+  std::int32_t a_32(a);
+  std::int32_t b_32(b);
+  std::int32_t ab_32 = a_32 * b_32;
+  std::int16_t ab_x2_high16 = static_cast<std::int16_t>((ab_32) / (1 << 15));
+  return overflow ? std::numeric_limits<std::int16_t>::max() : ab_x2_high16;
 }
 
 template <typename T>
@@ -4592,36 +4604,103 @@ inline void HardSwish(const HardSwishParams& params,
                       const RuntimeShape& input_shape, const T* input_data,
                       const RuntimeShape& output_shape, T* output_data) {
   gemmlowp::ScopedProfilingLabel label("ReferenceHardSwish/Quantized");
-  // Goal: (x * relu6(x+3))/6
-  const T* in = input_data;
-  T* out = output_data;
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
-  const T* in_end = in + flat_size;
-  const int32_t extra_input_shift = params.clip_input_shift;
-  const auto in_zero_point = params.input_zero_point;
-  const auto three_in = params.three_input;
-  const auto six_in = params.six_input;
-  const auto real_shift = params.shift;
-  const auto scale = params.scale;
-  const auto offset = params.output_offset;
-
-  for (; in < in_end; in++, out++) {
-    int32_t v = static_cast<int32>(*in);
-    v -= in_zero_point;  // Make zeros - zero again!
-
-    // Computes x + 3 in input * 2^extra_input_shift scale.
+  for (int i = 0; i < flat_size; i++) {
+    const int16_t input_value = input_data[i] - params.input_zero_point;
+    // Left-shift as much as we can without overflow/saturation to put
+    // significant bits in the high bits of our 16-bit fixedpoint values, so
+    // that fixed-point approximate computations below are as accurate as
+    // possible.
+    const int16_t input_value_on_hires_input_scale = input_value << 7;
+    // Compute the input value on essentially the output scale, just not
+    // right-shifted yet. This is the value that we'll use in the (x >= +3)
+    // case, and that in the general case we'll multiply against the "relu-ish"
+    // fixed-point multiplier in [0, 1].
+    const int16_t input_value_on_preshift_output_scale =
+        gemmlowp::SaturatingRoundingDoublingHighMul(
+            input_value_on_hires_input_scale,
+            params.output_multiplier_fixedpoint_int16);
+    // Now compute the "relu-ish multiplier". In the (-3 <= x <= +3) case, that
+    // is just an affine rescaling of x from [-3, 3] to [0, 1]. In the general
+    // case, it is just that plus saturation at the boundaries of [-3, 3].
+    // First, we rescale from [-3, 3] to [-1, 1], saturating.
+    // That is done by rescaling the input value with a fixed-point multiplier
+    // (reluish_multiplier_fixedpoint) and bit-shift such that we represent
+    // that input value on the scale where the real value 3.0f is represented
+    // by the quantized value 32768. (+32768 is actually not representable as
+    // int16, so this saturates at +32767, and that is seen empirically to be
+    // a negligible contribution to numerical error/bias).
     //
-    // Note: three_in is in that scale already.
-    const int32_t v3 = (v << extra_input_shift) + three_in;
-
-    // Computes hard-swish up to a final scale
-    v *= std::min(six_in, std::max(0, v3));
-
-    // this converts from x * relu6(x+3) in input into x * relu6(x+3) / 6
-    // in output scale.
-    v = MultiplyByQuantizedMultiplierSmallerThanOneExp(v, scale, real_shift);
-    v += offset;
-    *out = Saturate<uint8>(v);
+    // This code is careful to correctly implement any magnitude of multiplier,
+    // involving either a right shift or a left shift, with correct saturation
+    // behavior in the left-shift case. This forces this code to be more
+    // complicated, but is necessary for real applications: a partially
+    // trained quantized MobileNet v3-small model that motivated this code
+    // exhibits some large [min, max] range boundaries, of the order of
+    // magnitude of 10 or 100 depending on layers.
+    //
+    // The next few lines are basically just an ordinary
+    // MultiplyByQuantizedMultiplier, except that we are more careful here
+    // about the fine details of saturation when left-shifting, because here
+    // overflow in left-shift is a common case, not an anomaly as
+    // MultiplyByQuantizedMultiplier assumes.
+    int16_t reluish_value = input_value_on_hires_input_scale;
+    // Shift left, saturating, as much as we can while ensuring that this
+    // saturation will not contribute to the result. That is, left shift amount
+    // reduced by 1.
+    if (params.reluish_multiplier_exponent > 0) {
+      reluish_value = SaturatingLeftShift(
+          reluish_value, params.reluish_multiplier_exponent - 1);
+    }
+    // Apply the fixed-point multiplier, dividing the value by a divisor
+    // ranging in [1, 2].
+    reluish_value = gemmlowp::SaturatingRoundingDoublingHighMul(
+        reluish_value, params.reluish_multiplier_fixedpoint_int16);
+    // Apply the last bit of left-shift. Thus, in the left-shifting case, if
+    // any saturation affects the result, it is happening here --- any
+    // saturation having occurred above is overwritten here, not affecting the
+    // result.
+    if (params.reluish_multiplier_exponent > 0) {
+      reluish_value = SaturatingLeftShift(reluish_value, 1);
+    }
+    // Shift right, in the right-shifting case.
+    if (params.reluish_multiplier_exponent < 0) {
+      reluish_value = gemmlowp::RoundingDivideByPOT(
+          reluish_value, -params.reluish_multiplier_exponent);
+    }
+    // At this point we have rescaled the value into a 16bit fixedpoint
+    // reluish_value in [-1, 1].
+    // We now convert that to a 16bit fixedpoint value in [0, 1].
+    reluish_value = (reluish_value + (1 << 15)) >> 1;
+    // Use of SaturatingDoublingHighMul here is important to cancel the biases
+    // from the above SaturatingRoundingDoublingHighMul.
+    //
+    // On a partially trained MobileNet-v3-small,
+    //
+    //                                       | bias on    | ImageNet
+    //                                       | quantized  | Top-1
+    // Operation used here                   | values     | accuracy (50k)
+    // --------------------------------------+------------+-----------
+    // SaturatingDoublingHighMul             | -0.0024    | 58.920
+    // SaturatingRoundingDoublingHighMul     | -0.0067    | 58.064
+    //
+    // In activations_test, this is covered by this testcase:
+    // QuantizedActivationsOpTest.HardSwishBias
+    //
+    const int16_t preshift_output_value = SaturatingDoublingHighMul(
+        reluish_value, input_value_on_preshift_output_scale);
+    // We were so far operating on the pre-shift output scale. Now we finally
+    // apply that output shift, arriving at the final output scale.
+    int16_t output_value = gemmlowp::RoundingDivideByPOT(
+        preshift_output_value, -params.output_multiplier_exponent);
+    output_value += params.output_zero_point;
+    output_value =
+        std::min<int16_t>(output_value, std::numeric_limits<T>::max());
+    output_value =
+        std::max<int16_t>(output_value, std::numeric_limits<T>::min());
+    output_data[i] = output_value;
   }
 }
@@ -873,35 +873,24 @@ struct LocalResponseNormalizationParams {
 };
 
 struct HardSwishParams {
-  // uint8 inference params
-  // Contains input->params.zero_point
-  int32_t input_zero_point;
-  // when computing relu6(x+3), we scale input by using bit-shift
-  // to avoid loss of precision when doing computation in uint8 (and 6 might
-  // not be exactly representable in that scale).
-  // This flag contains number of bits to shift.
-  int32_t clip_input_shift;
-  // Added to the final output to bring the output's zero_point in order.
-  int32_t output_offset;
-  // Scale that converts x*relu6(x+3) computed in input range
-  // into output range x * relu6(x + 3) / 6.
-  // This takes into account that hardswish is quadratic
-  // so we have in_scale^2/out_scale.
-  // This is the integer numerator part of the multiplier.
-  int32_t scale;
-
-  // this is the denominator 2^shift of the multiplier
-  int shift;
-
-  // 3 in input 0-centered scale
-  int32_t three_input;
-
-  // 6 in input 0-centered scale
-  int32_t six_input;
+  // zero_point of the input activations.
+  int16_t input_zero_point;
+  // zero_point of the output activations.
+  int16_t output_zero_point;
+  // 16bit fixed-point component of the multiplier to apply to go from the
+  // "high-res input scale", which is the input scale multiplied by 2^7, to the
+  // "relu-ish scale", which is 3.0/32768.
+  // See the implementation of HardSwishPrepare.
+  int16_t reluish_multiplier_fixedpoint_int16;
+  // exponent/bit-shift component of the aforementioned multiplier.
+  int reluish_multiplier_exponent;
+  // 16bit fixed-point component of the multiplier to apply to go from the
+  // "high-res input scale", which is the input scale multiplied by 2^7, to the
+  // output scale.
+  // See the implementation of HardSwishPrepare.
+  int16_t output_multiplier_fixedpoint_int16;
+  // exponent/bit-shift component of the aforementioned multiplier.
+  int output_multiplier_exponent;
 };
 
 struct LogisticParams {