Merge pull request #43320 from tensorflow:revert-43127-revert-38873-tflu_softnax_int16_ref
PiperOrigin-RevId: 333185684
Change-Id: I4d8a2845286ba905b16cebda204ef006c1a8e535
Commit: 72c2f694be
Changed paths:
  tensorflow/lite/c
  tensorflow/lite/kernels/internal
  tensorflow/lite/micro/kernels
  tensorflow/lite/tools/benchmark/experimental/c
@@ -226,6 +226,17 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
     } \
   } while (0)
 
+#define TF_LITE_ENSURE_NEAR(context, a, b, epsilon)                          \
+  do {                                                                       \
+    auto delta = ((a) > (b)) ? ((a) - (b)) : ((b) - (a));                    \
+    if (delta > epsilon) {                                                   \
+      TF_LITE_KERNEL_LOG((context), "%s:%d %s not near %s (%f != %f)",       \
+                         __FILE__, __LINE__, #a, #b, static_cast<double>(a), \
+                         static_cast<double>(b));                            \
+      return kTfLiteError;                                                   \
+    }                                                                        \
+  } while (0)
+
 #define TF_LITE_ENSURE_OK(context, status) \
   do { \
     const TfLiteStatus s = (status); \
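The TF_LITE_ENSURE_NEAR macro added here gives kernels a tolerance check that logs the mismatch and returns kTfLiteError rather than aborting. A minimal usage sketch; the helper function below is hypothetical, not part of this commit:

#include "tensorflow/lite/c/common.h"

// Hypothetical helper: accept an output scale within 0.1% of 1/32768.
// TF_LITE_ENSURE_NEAR expands to an if/log/return block, so it can only be
// used inside functions that return TfLiteStatus.
TfLiteStatus CheckInt16OutputScale(TfLiteContext* context,
                                   const TfLiteTensor* output) {
  TF_LITE_ENSURE_NEAR(context, output->params.scale, 1.f / 32768,
                      (0.001f * 1.f / 32768));
  return kTfLiteOk;
}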
@@ -241,8 +241,12 @@ inline Integer FloorLog2(Integer n) {
 
 // generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
 // softmax
-inline void gen_lut(const std::function<double(double)>& func, double min,
-                    double max, int16_t* table, const int num) {
+// func - the function to build the LUT for (e.g exp(x))
+// min,max - table limits
+// table - pointer to buffer
+// num - number of elements in the LUT
+inline void gen_lut(double (*func)(double), double min, double max,
+                    int16_t* table, const int num) {
   // size of table should equal to num + 1
   // last element only for slope calculation
   double step = (max - min) / (num - 1);
@@ -263,6 +267,34 @@ inline void gen_lut(const std::function<double(double)>& func, double min,
       std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
 }
 
+// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
+// softmax
+// func - the function to build the LUT for (e.g exp(x))
+// min,max - table limits
+// table - pointer to buffer
+// num - number of elements in the LUT
+inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
+                    const int num) {
+  // size of table should equal to num + 1
+  // last element only for slope calculation
+  float step = (max - min) / (num - 1);
+  float half_step = step / 2.0f;
+  for (int i = 0; i < num - 1; i++) {
+    float sample_val = TfLiteRound(func(min + i * step) * 32768.0f);
+    float midpoint_interp_val =
+        TfLiteRound((func(min + (i + 1) * step) * 32768.0f +
+                     TfLiteRound(func(min + i * step) * 32768.0f)) /
+                    2.0f);
+    float midpoint_val =
+        TfLiteRound(func(min + i * step + half_step) * 32768.0f);
+    float midpoint_err = midpoint_interp_val - midpoint_val;
+    float bias = TfLiteRound(midpoint_err / 2.0f);
+    table[i] = std::min(std::max(sample_val - bias, -32768.0f), 32767.0f);
+  }
+  table[num - 1] = std::min(
+      std::max(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
+}
+
 // int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
 inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) {
   // 512 base value, lut[513] only for calculate slope
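Both gen_lut overloads store Q0.15 samples (the real value scaled by 32768 and clamped to [-32768, 32767]), and each stored sample is shifted by half of the interpolation error measured at the segment midpoint, so a linearly interpolated lookup spreads the error evenly over the segment. A standalone sketch of the same construction, assuming std::round as a stand-in for TfLiteRound:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Build a num-entry Q0.15 table for func over [min, max]; entries
// 0..num-2 are bias-corrected samples, entry num-1 is the endpoint used
// only for the last segment's slope.
void BuildLutSketch(float (*func)(float), float min, float max,
                    int16_t* table, int num) {
  const float step = (max - min) / (num - 1);
  const float half_step = step / 2.0f;
  for (int i = 0; i < num - 1; i++) {
    const float sample = std::round(func(min + i * step) * 32768.0f);
    const float next = std::round(func(min + (i + 1) * step) * 32768.0f);
    // Linear interpolation at the midpoint vs. the true midpoint value.
    const float midpoint_interp = std::round((next + sample) / 2.0f);
    const float midpoint =
        std::round(func(min + i * step + half_step) * 32768.0f);
    const float bias = std::round((midpoint_interp - midpoint) / 2.0f);
    // Shift the stored sample by half the midpoint error, then clamp.
    table[i] = static_cast<int16_t>(
        std::min(std::max(sample - bias, -32768.0f), 32767.0f));
  }
  table[num - 1] = static_cast<int16_t>(std::min(
      std::max(std::round(func(max) * 32768.0f), -32768.0f), 32767.0f));
}

int main() {
  static int16_t exp_lut[513];
  BuildLutSketch([](float x) { return std::exp(x); }, -10.0f, 0.0f,
                 exp_lut, 513);
  // exp(0) = 1.0 scales to 32768 and clamps to 32767.
  std::printf("last entry = %d\n", exp_lut[512]);
  return 0;
}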
@@ -1044,7 +1044,9 @@ struct SoftmaxParams {
   int32_t zero_point;
   float scale;
   float* table;
+  // int16 LUT for exp(x), where x uniform distributed between [-10.0 , 0.0]
   int16_t* exp_lut;
+  // int16 LUT for 1 / (1 + x), where x uniform distributed between [0.0 , 1.0]
   int16_t* one_over_one_plus_x_lut;
   uint8_t* uint8_table1;
   uint8_t* uint8_table2;
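The comment ranges match how an int16 softmax can use these tables: after the row maximum is subtracted, every exponent argument is non-positive (so exp only needs [-10.0, 0.0]), and since each exp value is at most 1, the row sum can be shifted into [1, 2), leaving a fraction x in [0, 1) for the 1/(1+x) table. A plain-float sketch of that reciprocal trick; the actual kernel performs the equivalent steps in fixed point:

#include <cmath>
#include <cstdio>

int main() {
  // exp of non-positive differences, as produced via the exp LUT.
  const double exps[3] = {std::exp(-0.5), std::exp(-1.5), std::exp(0.0)};
  double sum = exps[0] + exps[1] + exps[2];  // always >= 1
  int shift = 0;
  while (sum >= 2.0) {
    sum /= 2.0;  // bring the sum into [1, 2)
    ++shift;
  }
  const double x = sum - 1.0;  // in [0, 1): the 1/(1+x) table's domain
  const double reciprocal = (1.0 / (1.0 + x)) / (1 << shift);
  std::printf("softmax[0] = %f\n", exps[0] * reciprocal);  // ~0.33
  return 0;
}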
@@ -30,23 +30,30 @@ namespace micro {
 namespace activations {
 namespace {
 
+// Softmax parameter data that persists in user_data
+static constexpr int kInt16LUTArraySize = 513;
+
 TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
                                     const TfLiteTensor* input,
                                     TfLiteTensor* output,
                                     const TfLiteSoftmaxParams* params,
                                     SoftmaxParams* op_data) {
-  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8 ||
+      input->type == kTfLiteInt16) {
     if (input->type == kTfLiteUInt8) {
       TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteUInt8);
       TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    } else {
+    } else if (input->type == kTfLiteInt16) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+      TF_LITE_ENSURE_NEAR(context, output->params.scale, 1.f / 32768,
+                          (0.001f * 1.f / 32768));
+    } else {  // input->type == kTfLiteInt8
       TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);
       if (output->type == kTfLiteInt16) {
         TF_LITE_ENSURE_EQ(context, output->params.zero_point, -32768);
-        // NOTE: Current int16_t softmax output does not require symmetric
-        // scaling
-        // - so no need to verify scale here.
-      } else {
+        TF_LITE_ENSURE_NEAR(context, output->params.scale, 1.f / 65536,
+                            (0.001f * 1.f / 65536));
+      } else {  // output->type == kTfLiteint8
         TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
         TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128);
         TF_LITE_ENSURE(context, output->params.scale == 1.f / 256);
@@ -55,15 +62,28 @@ TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
 
     static const int kScaledDiffIntegerBits = 5;
 
-    int input_left_shift;
-    tflite::PreprocessSoftmaxScaling(
-        static_cast<double>(params->beta),
-        static_cast<double>(input->params.scale), kScaledDiffIntegerBits,
-        &op_data->input_multiplier, &input_left_shift);
-    op_data->input_left_shift = input_left_shift;
-    op_data->diff_min =
-        -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits,
-                                            op_data->input_left_shift);
+    // Calculate input_multiplier and input_left_shift
+    if (input->type == kTfLiteInt16) {
+      int input_left_shift;
+      double input_scale_beta_rescale =
+          static_cast<double>(input->params.scale) *
+          static_cast<double>(params->beta) /
+          (10.0 / 65535.0);  // scale the input_diff such that [-65535, 0]
+                             // correspond to [-10.0, 0.0]
+      QuantizeMultiplier(input_scale_beta_rescale, &op_data->input_multiplier,
+                         &input_left_shift);
+      op_data->input_left_shift = input_left_shift;
+    } else {
+      int input_left_shift;
+      tflite::PreprocessSoftmaxScaling(
+          static_cast<double>(params->beta),
+          static_cast<double>(input->params.scale), kScaledDiffIntegerBits,
+          &op_data->input_multiplier, &input_left_shift);
+      op_data->input_left_shift = input_left_shift;
+      op_data->diff_min =
+          -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits,
+                                              op_data->input_left_shift);
+    }
   } else {
     TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
     TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
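In the new int16 branch, multiplying beta * input_scale by 65535/10 makes a full-range quantized difference of -65535 land exactly on -10.0, the lower edge of the exp LUT's domain. A quick numeric check, with a hypothetical input scale:

#include <cstdio>

int main() {
  const double input_scale = 1.0 / 2048.0;  // hypothetical example scale
  const double beta = 1.0;
  const double rescale = input_scale * beta / (10.0 / 65535.0);
  // A quantized diff worth exactly -10.0 in real (beta-scaled) units:
  const double diff = -10.0 / (input_scale * beta);
  std::printf("rescaled diff = %f (expect -65535)\n", diff * rescale);
  return 0;
}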
@@ -91,7 +111,7 @@ void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
         tflite::micro::GetTensorData<uint8_t>(input),
         tflite::micro::GetTensorShape(output),
         tflite::micro::GetTensorData<uint8_t>(output));
-  } else {
+  } else if (input->type == kTfLiteInt8) {
     if (output->type == kTfLiteInt16) {
       tflite::reference_ops::Softmax(
           op_data, tflite::micro::GetTensorShape(input),
@@ -105,6 +125,12 @@ void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
           tflite::micro::GetTensorShape(output),
           tflite::micro::GetTensorData<int8_t>(output));
     }
+  } else {
+    tflite::reference_ops::SoftmaxInt16(
+        op_data, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<int16_t>(input),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<int16_t>(output));
   }
 }
 
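All of these quantized paths approximate the same row-wise float softmax, which subtracts the row maximum before exponentiating; that subtraction is what guarantees the non-positive arguments the int16 exp LUT relies on. A plain baseline sketch, not the TFLite reference implementation:

#include <cmath>
#include <cstdio>

// Row-wise softmax with max subtraction for numerical stability.
void SoftmaxRow(const float* in, float* out, int n) {
  float max_val = in[0];
  for (int i = 1; i < n; ++i) max_val = std::fmax(max_val, in[i]);
  float sum = 0.0f;
  for (int i = 0; i < n; ++i) {
    out[i] = std::exp(in[i] - max_val);  // argument is always <= 0
    sum += out[i];
  }
  for (int i = 0; i < n; ++i) out[i] /= sum;
}

int main() {
  const float in[3] = {1.0f, 2.0f, 3.0f};
  float out[3];
  SoftmaxRow(in, out, 3);
  std::printf("%f %f %f\n", out[0], out[1], out[2]);
  return 0;
}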
@@ -114,20 +140,52 @@ void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) {
 }
 
 TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
-
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   const TfLiteTensor* input = GetInput(context, node, 0);
   TF_LITE_ENSURE(context, input != nullptr);
   TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
 
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE(context, output != nullptr);
 
-  TFLITE_DCHECK(node->user_data != nullptr);
-  SoftmaxParams* data = static_cast<SoftmaxParams*>(node->user_data);
-  return CalculateSoftmaxParams(context, input, output, params, data);
+  TF_LITE_ENSURE(context, node->user_data != nullptr);
+  SoftmaxParams* op_data = static_cast<SoftmaxParams*>(node->user_data);
+  // Only allocate LUTs for KTfLiteInt16 data type
+  if (input->type == kTfLiteInt16) {
+    void* raw_exp_lut = context->AllocatePersistentBuffer(
+        context, sizeof(int16_t) * kInt16LUTArraySize);
+    TF_LITE_ENSURE(context, raw_exp_lut != nullptr);
+    op_data->exp_lut = reinterpret_cast<int16_t*>(raw_exp_lut);
+    void* one_over_one_plus_x_lut = context->AllocatePersistentBuffer(
+        context, sizeof(int16_t) * kInt16LUTArraySize);
+    TF_LITE_ENSURE(context, one_over_one_plus_x_lut != nullptr);
+    op_data->one_over_one_plus_x_lut =
+        reinterpret_cast<int16_t*>(one_over_one_plus_x_lut);
+  }
+
+  if (output->type == kTfLiteInt16) {
+    TF_LITE_ENSURE(context, input->type == kTfLiteInt8 ||
+                                input->type == kTfLiteUInt8 ||
+                                input->type == kTfLiteInt16);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  }
+
+  // Populate LUT if required
+  if (input->type == kTfLiteInt16) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    // exp LUT only used on negative values
+    // we consider exp(-10.0) is insignificant to accumulation
+    gen_lut([](float value) { return std::exp(value); }, -10.0f, 0.0f,
+            op_data->exp_lut, kInt16LUTArraySize);
+    gen_lut([](float value) { return 1.0f / (1.0f + value); }, 0.0f, 1.0f,
+            op_data->one_over_one_plus_x_lut, kInt16LUTArraySize);
+    op_data->zero_point = output->params.zero_point;
+    op_data->scale = output->params.scale;
+  }
+
+  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+  return CalculateSoftmaxParams(context, input, output, params, op_data);
 }
 
 TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
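A side effect of gen_lut taking a plain function pointer instead of std::function (see the kernels/internal change above) is that SoftmaxPrepare can pass captureless lambdas directly, with no std::function machinery or heap use, which matters on microcontroller targets. A minimal demonstration of the conversion rule being relied on:

#include <cmath>
#include <cstdio>

float Apply(float (*func)(float), float x) { return func(x); }

int main() {
  // OK: a captureless lambda converts implicitly to float (*)(float).
  std::printf("%f\n", Apply([](float v) { return std::exp(v); }, -1.0f));
  // A capturing lambda would not convert:
  //   float k = 2.0f;
  //   Apply([k](float v) { return k * v; }, 1.0f);  // compile error
  return 0;
}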
@@ -135,16 +193,17 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
 
   TFLITE_DCHECK(node->user_data != nullptr);
-  SoftmaxParams* data = static_cast<SoftmaxParams*>(node->user_data);
+  SoftmaxParams op_data = *static_cast<SoftmaxParams*>(node->user_data);
 
   switch (input->type) {
     case kTfLiteFloat32: {
-      SoftmaxFloat(input, output, *data);
+      SoftmaxFloat(input, output, op_data);
       return kTfLiteOk;
     }
     case kTfLiteInt8:
-    case kTfLiteUInt8: {
-      SoftmaxQuantized(input, output, *data);
+    case kTfLiteUInt8:
+    case kTfLiteInt16: {
+      SoftmaxQuantized(input, output, op_data);
       return kTfLiteOk;
     }
     default:
@@ -28,8 +28,13 @@ namespace {
 // quantization parameters.
 const float output_scale_int8 = 1.0f / 256.0f;
 const float output_scale_uint8 = 1.0f / 256.0f;
+const float output_scale_int16 = 1.0f / 32768.0f;
 const int output_zero_point_int8 = -128;
 const int output_zero_point_uint8 = 0;
+const int output_zero_point_int16 = 0;
+
+// Empirical tolerance in quantization space
+const float tolerance_int16 = 7.0;
 
 // 1-dimensional test data.
 const int flat_size_1d = 5;
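Here "tolerance in quantization space" means the int16 tests accept up to 7 quantized steps of deviation from the float golden, i.e. roughly 7 * (1/32768), about 2.1e-4, in real softmax units. A one-line check of that arithmetic:

#include <cstdio>

int main() {
  const float output_scale_int16 = 1.0f / 32768.0f;
  const float tolerance_int16 = 7.0f;
  std::printf("real-valued tolerance ~= %g\n",
              tolerance_int16 * output_scale_int16);
  return 0;
}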
@@ -291,7 +296,7 @@ void TestSoftmaxQuantized(const int* input_dims_data, const float* input_data,
                           int input_zero_point, const int* output_dims_data,
                           const float* golden, T* golden_quantized,
                           float output_scale, int output_zero_point,
-                          T* output_data) {
+                          T* output_data, float tolerance = 1.0) {
   TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
   TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
   const int output_dims_count = ElementCount(*output_dims);
@@ -310,7 +315,7 @@ void TestSoftmaxQuantized(const int* input_dims_data, const float* input_data,
                          output_zero_point);
 
   ValidateSoftmaxGoldens(tensors, tensors_size, output_data, golden_quantized,
-                         output_dims_count, 1.0);
+                         output_dims_count, tolerance);
 }
 
 }  // namespace
@@ -356,6 +361,21 @@ TF_LITE_MICRO_TEST(Softmax1DQuantizedInt8ShouldMatchGolden) {
       tflite::testing::output_zero_point_int8, output_data);
 }
 
+TF_LITE_MICRO_TEST(Softmax1DQuantizedInt16ShouldMatchGolden) {
+  const float input_scale = 0.1f;
+  const int input_zero_point = 0;
+
+  int16_t input_quantized[tflite::testing::flat_size_1d];
+  int16_t golden_quantized[tflite::testing::flat_size_1d];
+  int16_t output_data[tflite::testing::flat_size_1d];
+  tflite::testing::TestSoftmaxQuantized(
+      tflite::testing::shape_1d, tflite::testing::input_data_1d,
+      input_quantized, input_scale, input_zero_point, tflite::testing::shape_1d,
+      tflite::testing::golden_1d, golden_quantized,
+      tflite::testing::output_scale_int16,
+      tflite::testing::output_zero_point_int16, output_data);
+}
+
 TF_LITE_MICRO_TEST(Softmax2DFloatShouldMatchGolden) {
   float output_data[tflite::testing::flat_size_2d];
   tflite::testing::TestSoftmaxFloat(
@@ -393,6 +413,21 @@ TF_LITE_MICRO_TEST(Softmax2DQuantizedInt8ShouldMatchGolden) {
       tflite::testing::output_zero_point_int8, output_data);
 }
 
+TF_LITE_MICRO_TEST(Softmax2DQuantizedInt16ShouldMatchGolden) {
+  const float input_scale = 0.1f;
+  const int input_zero_point = 0;
+
+  int16_t input_quantized[tflite::testing::flat_size_2d];
+  int16_t golden_quantized[tflite::testing::flat_size_2d];
+  int16_t output_data[tflite::testing::flat_size_2d];
+  tflite::testing::TestSoftmaxQuantized(
+      tflite::testing::shape_2d, tflite::testing::input_data_2d,
+      input_quantized, input_scale, input_zero_point, tflite::testing::shape_2d,
+      tflite::testing::golden_2d, golden_quantized,
+      tflite::testing::output_scale_int16,
+      tflite::testing::output_zero_point_int16, output_data);
+}
+
 TF_LITE_MICRO_TEST(Softmax3DFloatShouldMatchGolden) {
   float output_data[tflite::testing::flat_size_3d];
   tflite::testing::TestSoftmaxFloat(
@@ -430,6 +465,22 @@ TF_LITE_MICRO_TEST(Softmax3DQuantizedInt8ShouldMatchGolden) {
       tflite::testing::output_zero_point_int8, output_data);
 }
 
+TF_LITE_MICRO_TEST(Softmax3DQuantizedInt16ShouldMatchGolden) {
+  const float input_scale = 0.1f;
+  const int input_zero_point = 0;
+
+  int16_t input_quantized[tflite::testing::flat_size_3d];
+  int16_t golden_quantized[tflite::testing::flat_size_3d];
+  int16_t output_data[tflite::testing::flat_size_3d];
+  tflite::testing::TestSoftmaxQuantized(
+      tflite::testing::shape_3d, tflite::testing::input_data_3d,
+      input_quantized, input_scale, input_zero_point, tflite::testing::shape_3d,
+      tflite::testing::golden_3d, golden_quantized,
+      tflite::testing::output_scale_int16,
+      tflite::testing::output_zero_point_int16, output_data,
+      tflite::testing::tolerance_int16);
+}
+
 TF_LITE_MICRO_TEST(Softmax4DFloatShouldMatchGolden) {
   float output_data[tflite::testing::flat_size_4d];
   tflite::testing::TestSoftmaxFloat(
@@ -467,4 +518,19 @@ TF_LITE_MICRO_TEST(Softmax4DQuantizedInt8ShouldMatchGolden) {
       tflite::testing::output_zero_point_int8, output_data);
 }
 
+TF_LITE_MICRO_TEST(Softmax4DQuantizedInt16ShouldMatchGolden) {
+  const float input_scale = 0.1f;
+  const int input_zero_point = 0;
+
+  int16_t input_quantized[tflite::testing::flat_size_4d];
+  int16_t golden_quantized[tflite::testing::flat_size_4d];
+  int16_t output_data[tflite::testing::flat_size_4d];
+  tflite::testing::TestSoftmaxQuantized(
+      tflite::testing::shape_4d, tflite::testing::input_data_4d,
+      input_quantized, input_scale, input_zero_point, tflite::testing::shape_4d,
+      tflite::testing::golden_4d, golden_quantized,
+      tflite::testing::output_scale_int16,
+      tflite::testing::output_zero_point_int16, output_data,
+      tflite::testing::tolerance_int16);
+}
 TF_LITE_MICRO_TESTS_END
The same macro addition is mirrored in the copied C API header under tools/benchmark/experimental/c:

@@ -226,6 +226,17 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
     } \
   } while (0)
 
+#define TF_LITE_ENSURE_NEAR(context, a, b, epsilon)                          \
+  do {                                                                       \
+    auto delta = ((a) > (b)) ? ((a) - (b)) : ((b) - (a));                    \
+    if (delta > epsilon) {                                                   \
+      TF_LITE_KERNEL_LOG((context), "%s:%d %s not near %s (%f != %f)",       \
+                         __FILE__, __LINE__, #a, #b, static_cast<double>(a), \
+                         static_cast<double>(b));                            \
+      return kTfLiteError;                                                   \
+    }                                                                        \
+  } while (0)
+
 #define TF_LITE_ENSURE_OK(context, status) \
   do { \
     const TfLiteStatus s = (status); \