From 4ddfd31b4e113dfbf7011b1d45bcc60289bc0fd8 Mon Sep 17 00:00:00 2001
From: Advait Jain <advaitjain@users.noreply.github.com>
Date: Thu, 17 Sep 2020 16:44:43 -0700
Subject: [PATCH 1/2] Revert "Revert "Add symmetric int16 support to tflu softmax reference kernel""

---
 tensorflow/lite/c/common.h                    |  11 ++
 tensorflow/lite/kernels/internal/common.h     |  24 ++++
 tensorflow/lite/kernels/internal/types.h      |   2 +
 tensorflow/lite/micro/kernels/softmax.cc      | 111 ++++++++++++++----
 tensorflow/lite/micro/kernels/softmax_test.cc |  70 ++++++++++-
 .../benchmark/experimental/c/c_api_types.h    |  11 ++
 6 files changed, 201 insertions(+), 28 deletions(-)

diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h
index d320a90d005..31405dfb998 100644
--- a/tensorflow/lite/c/common.h
+++ b/tensorflow/lite/c/common.h
@@ -226,6 +226,17 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
     }                                                                        \
   } while (0)
 
+#define TF_LITE_ENSURE_NEAR(context, a, b, epsilon)                          \
+  do {                                                                       \
+    auto delta = ((a) > (b)) ? ((a) - (b)) : ((b) - (a));                    \
+    if (delta > epsilon) {                                                   \
+      TF_LITE_KERNEL_LOG((context), "%s:%d %s not near %s (%f != %f)",       \
+                         __FILE__, __LINE__, #a, #b, static_cast<double>(a), \
+                         static_cast<double>(b));                            \
+      return kTfLiteError;                                                   \
+    }                                                                        \
+  } while (0)
+
 #define TF_LITE_ENSURE_OK(context, status) \
   do {                                     \
     const TfLiteStatus s = (status);       \
diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index 66a2d977f39..92bb01a2900 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -263,6 +263,30 @@ inline void gen_lut(const std::function<double(double)>& func, double min,
       std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0);
 }
 
+// generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
+// softmax
+inline void gen_lut(const std::function<float(float)>& func, float min,
+                    float max, int16_t* table, const int num) {
+  // size of table should equal to num + 1
+  // last element only for slope calculation
+  float step = (max - min) / (num - 1);
+  float half_step = step / 2.0f;
+  for (int i = 0; i < num - 1; i++) {
+    float sample_val = TfLiteRound(func(min + i * step) * 32768.0f);
+    float midpoint_interp_val =
+        TfLiteRound((func(min + (i + 1) * step) * 32768.0f +
+                     TfLiteRound(func(min + i * step) * 32768.0f)) /
+                    2.0f);
+    float midpoint_val =
+        TfLiteRound(func(min + i * step + half_step) * 32768.0f);
+    float midpoint_err = midpoint_interp_val - midpoint_val;
+    float bias = TfLiteRound(midpoint_err / 2.0f);
+    table[i] = std::min(std::max(sample_val - bias, -32768.0f), 32767.0f);
+  }
+  table[num - 1] = std::min(
+      std::max(TfLiteRound(func(max) * 32768.0f), -32768.0f), 32767.0f);
+}
+
 // int16_t func table lookup, e.g., lookup exp() and 1/(1+x) used in softmax
 inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) {
   // 512 base value, lut[513] only for calculate slope
diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h
index 9db742ddf03..0164f82f19e 100644
--- a/tensorflow/lite/kernels/internal/types.h
+++ b/tensorflow/lite/kernels/internal/types.h
@@ -1044,7 +1044,9 @@ struct SoftmaxParams {
   int32_t zero_point;
   float scale;
   float* table;
+  // int16 LUT for exp(x), where x uniform distributed between [-10.0 , 0.0]
   int16_t* exp_lut;
+  // int16 LUT for 1 / (1 + x), where x uniform distributed between [0.0 , 1.0]
   int16_t* one_over_one_plus_x_lut;
   uint8_t* uint8_table1;
   uint8_t* uint8_table2;
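
A quick illustration of the tables the new SoftmaxParams comments describe (not part of the patch): each LUT is a 513-entry int16 array, exp(x) sampled uniformly on [-10.0, 0.0] and 1/(1+x) sampled on [0.0, 1.0], with entries stored as round(f(x) * 32768) and the extra final entry kept only for slope interpolation. The sketch below assumes a TensorFlow Lite source tree with this patch applied (so that tflite::gen_lut() from internal/common.h is visible); the main() wrapper and printed values are illustrative only.

// Illustration only -- not part of the patch.  Builds the same 513-entry
// exp() table that SoftmaxPrepare sets up later in this patch.
#include <cmath>
#include <cstdint>
#include <cstdio>

#include "tensorflow/lite/kernels/internal/common.h"  // tflite::gen_lut()

int main() {
  constexpr int kInt16LUTArraySize = 513;  // 512 samples + 1 entry for slope
  int16_t exp_lut[kInt16LUTArraySize];

  // exp(x) sampled uniformly on [-10, 0]; entries are stored as Q0.15,
  // i.e. round(exp(x) * 32768) clamped to the int16 range.
  tflite::gen_lut([](float x) { return std::exp(x); }, -10.0f, 0.0f, exp_lut,
                  kInt16LUTArraySize);

  // exp(-10) is ~4.5e-5, so the first entry is only about one count above
  // zero; exp(0) is 1.0, so the final entry saturates at 32767.
  std::printf("exp_lut[0]=%d exp_lut[%d]=%d\n", exp_lut[0],
              kInt16LUTArraySize - 1, exp_lut[kInt16LUTArraySize - 1]);
  return 0;
}
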
diff --git a/tensorflow/lite/micro/kernels/softmax.cc b/tensorflow/lite/micro/kernels/softmax.cc
index e85c1a4a306..fa1b9caf077 100644
--- a/tensorflow/lite/micro/kernels/softmax.cc
+++ b/tensorflow/lite/micro/kernels/softmax.cc
@@ -30,23 +30,30 @@ namespace micro {
 namespace activations {
 namespace {
 
+// Softmax parameter data that persists in user_data
+static constexpr int kInt16LUTArraySize = 513;
+
 TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
                                     const TfLiteTensor* input,
                                     TfLiteTensor* output,
                                     const TfLiteSoftmaxParams* params,
                                     SoftmaxParams* op_data) {
-  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8 ||
+      input->type == kTfLiteInt16) {
     if (input->type == kTfLiteUInt8) {
       TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteUInt8);
       TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    } else {
+    } else if (input->type == kTfLiteInt16) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+      TF_LITE_ENSURE_NEAR(context, output->params.scale, 1.f / 32768,
+                          (0.001f * 1.f / 32768));
+    } else {  // input->type == kTfLiteInt8
       TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);
       if (output->type == kTfLiteInt16) {
         TF_LITE_ENSURE_EQ(context, output->params.zero_point, -32768);
-        // NOTE: Current int16_t softmax output does not require symmetric
-        // scaling
-        // - so no need to verify scale here.
-      } else {
+        TF_LITE_ENSURE_NEAR(context, output->params.scale, 1.f / 65536,
+                            (0.001f * 1.f / 65536));
+      } else {  // output->type == kTfLiteint8
         TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
         TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128);
         TF_LITE_ENSURE(context, output->params.scale == 1.f / 256);
@@ -55,15 +62,28 @@ TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
 
     static const int kScaledDiffIntegerBits = 5;
 
-    int input_left_shift;
-    tflite::PreprocessSoftmaxScaling(
-        static_cast<double>(params->beta),
-        static_cast<double>(input->params.scale), kScaledDiffIntegerBits,
-        &op_data->input_multiplier, &input_left_shift);
-    op_data->input_left_shift = input_left_shift;
-    op_data->diff_min =
-        -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits,
-                                            op_data->input_left_shift);
+    // Calculate input_multiplier and input_left_shift
+    if (input->type == kTfLiteInt16) {
+      int input_left_shift;
+      double input_scale_beta_rescale =
+          static_cast<double>(input->params.scale) *
+          static_cast<double>(params->beta) /
+          (10.0 / 65535.0);  // scale the input_diff such that [-65535, 0]
+                             // correspond to [-10.0, 0.0]
+      QuantizeMultiplier(input_scale_beta_rescale, &op_data->input_multiplier,
+                         &input_left_shift);
+      op_data->input_left_shift = input_left_shift;
+    } else {
+      int input_left_shift;
+      tflite::PreprocessSoftmaxScaling(
+          static_cast<double>(params->beta),
+          static_cast<double>(input->params.scale), kScaledDiffIntegerBits,
+          &op_data->input_multiplier, &input_left_shift);
+      op_data->input_left_shift = input_left_shift;
+      op_data->diff_min =
+          -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits,
+                                              op_data->input_left_shift);
+    }
   } else {
     TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
     TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
@@ -91,7 +111,7 @@ void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
         tflite::micro::GetTensorData<uint8_t>(input),
         tflite::micro::GetTensorShape(output),
         tflite::micro::GetTensorData<uint8_t>(output));
-  } else {
+  } else if (input->type == kTfLiteInt8) {
     if (output->type == kTfLiteInt16) {
       tflite::reference_ops::Softmax(
           op_data, tflite::micro::GetTensorShape(input),
@@ -105,6 +125,12 @@ void SoftmaxQuantized(const TfLiteEvalTensor* input, TfLiteEvalTensor* output,
           tflite::micro::GetTensorShape(output),
           tflite::micro::GetTensorData<int8_t>(output));
     }
+  } else {
+    tflite::reference_ops::SoftmaxInt16(
+        op_data, tflite::micro::GetTensorShape(input),
+        tflite::micro::GetTensorData<int16_t>(input),
+        tflite::micro::GetTensorShape(output),
+        tflite::micro::GetTensorData<int16_t>(output));
   }
 }
 
@@ -114,18 +140,50 @@ void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) {
 }
 
 TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
-
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   const TfLiteTensor* input = GetInput(context, node, 0);
   TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
-
   TfLiteTensor* output = GetOutput(context, node, 0);
 
-  TFLITE_DCHECK(node->user_data != nullptr);
-  SoftmaxParams* data = static_cast<SoftmaxParams*>(node->user_data);
-  return CalculateSoftmaxParams(context, input, output, params, data);
+  TF_LITE_ENSURE(context, node->user_data != nullptr);
+  SoftmaxParams* op_data = static_cast<SoftmaxParams*>(node->user_data);
+  // Only allocate LUTs for KTfLiteInt16 data type
+  if (input->type == kTfLiteInt16) {
+    void* raw_exp_lut = context->AllocatePersistentBuffer(
+        context, sizeof(int16_t) * kInt16LUTArraySize);
+    TF_LITE_ENSURE(context, raw_exp_lut != nullptr);
+    op_data->exp_lut = reinterpret_cast<int16_t*>(raw_exp_lut);
+    void* one_over_one_plus_x_lut = context->AllocatePersistentBuffer(
+        context, sizeof(int16_t) * kInt16LUTArraySize);
+    TF_LITE_ENSURE(context, one_over_one_plus_x_lut != nullptr);
+    op_data->one_over_one_plus_x_lut =
+        reinterpret_cast<int16_t*>(one_over_one_plus_x_lut);
+  }
+
+  if (output->type == kTfLiteInt16) {
+    TF_LITE_ENSURE(context, input->type == kTfLiteInt8 ||
+                                input->type == kTfLiteUInt8 ||
+                                input->type == kTfLiteInt16);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  }
+
+  // Populate LUT if required
+  if (input->type == kTfLiteInt16) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    // exp LUT only used on negative values
+    // we consider exp(-10.0) is insignificant to accumulation
+    gen_lut([](float value) { return std::exp(value); }, -10.0f, 0.0f,
+            op_data->exp_lut, kInt16LUTArraySize);
+    gen_lut([](float value) { return 1.0f / (1.0f + value); }, 0.0f, 1.0f,
+            op_data->one_over_one_plus_x_lut, kInt16LUTArraySize);
+    op_data->zero_point = output->params.zero_point;
+    op_data->scale = output->params.scale;
+  }
+
+  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+  return CalculateSoftmaxParams(context, input, output, params, op_data);
 }
 
 TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
@@ -133,16 +191,17 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0);
 
   TFLITE_DCHECK(node->user_data != nullptr);
-  SoftmaxParams* data = static_cast<SoftmaxParams*>(node->user_data);
+  SoftmaxParams op_data = *static_cast<SoftmaxParams*>(node->user_data);
 
   switch (input->type) {
     case kTfLiteFloat32: {
-      SoftmaxFloat(input, output, *data);
+      SoftmaxFloat(input, output, op_data);
       return kTfLiteOk;
     }
     case kTfLiteInt8:
-    case kTfLiteUInt8: {
-      SoftmaxQuantized(input, output, *data);
+    case kTfLiteUInt8:
+    case kTfLiteInt16: {
+      SoftmaxQuantized(input, output, op_data);
       return kTfLiteOk;
     }
     default:
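
A worked example of the int16 scaling above (illustration only, not part of the patch): for int16 input the kernel folds beta into the input scale and rescales so that the full quantized difference range [-65535, 0] lands on the exp table's real domain [-10.0, 0.0]. With the values the tests below use (input scale 0.1, default beta 1.0), one input count corresponds to 0.1 * 1.0 / (10.0 / 65535.0) = 655.35 table counts, and QuantizeMultiplier() stores that factor as a Q0.31 significand plus a left shift. The snippet only mimics that decomposition with frexp(); it does not call any TF Lite API.

// Illustration only -- not part of the patch.
#include <cmath>
#include <cstdio>

int main() {
  const double input_scale = 0.1;  // same value the int16 unit tests use
  const double beta = 1.0;         // default softmax beta

  // One input quantization step mapped onto the exp LUT domain, where
  // 65535 integer counts span the real range [-10.0, 0.0].
  const double input_scale_beta_rescale =
      input_scale * beta / (10.0 / 65535.0);  // == 655.35

  // Roughly what QuantizeMultiplier() does: split into a significand in
  // [0.5, 1) and a power-of-two left shift, then round the significand
  // into a Q0.31 integer.
  int left_shift = 0;
  const double significand = std::frexp(input_scale_beta_rescale, &left_shift);
  const long quantized_multiplier = std::lround(significand * (1ll << 31));

  std::printf("rescale=%.2f -> multiplier=%ld (Q0.31), left shift=%d\n",
              input_scale_beta_rescale, quantized_multiplier, left_shift);
  return 0;
}
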
diff --git a/tensorflow/lite/micro/kernels/softmax_test.cc b/tensorflow/lite/micro/kernels/softmax_test.cc
index 27828d2de34..808ea9396ba 100644
--- a/tensorflow/lite/micro/kernels/softmax_test.cc
+++ b/tensorflow/lite/micro/kernels/softmax_test.cc
@@ -28,8 +28,13 @@ namespace {
 // quantization parameters.
 const float output_scale_int8 = 1.0f / 256.0f;
 const float output_scale_uint8 = 1.0f / 256.0f;
+const float output_scale_int16 = 1.0f / 32768.0f;
 const int output_zero_point_int8 = -128;
 const int output_zero_point_uint8 = 0;
+const int output_zero_point_int16 = 0;
+
+// Empirical tolerance in quantization space
+const float tolerance_int16 = 7.0;
 
 // 1-dimensional test data.
 const int flat_size_1d = 5;
@@ -291,7 +296,7 @@ void TestSoftmaxQuantized(const int* input_dims_data, const float* input_data,
                           int input_zero_point, const int* output_dims_data,
                           const float* golden, T* golden_quantized,
                           float output_scale, int output_zero_point,
-                          T* output_data) {
+                          T* output_data, float tolerance = 1.0) {
   TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
   TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
   const int output_dims_count = ElementCount(*output_dims);
@@ -310,7 +315,7 @@ void TestSoftmaxQuantized(const int* input_dims_data, const float* input_data,
                    output_zero_point);
 
   ValidateSoftmaxGoldens(tensors, tensors_size, output_data, golden_quantized,
-                         output_dims_count, 1.0);
+                         output_dims_count, tolerance);
 }
 
 }  // namespace
@@ -356,6 +361,21 @@ TF_LITE_MICRO_TEST(Softmax1DQuantizedInt8ShouldMatchGolden) {
       tflite::testing::output_zero_point_int8, output_data);
 }
 
+TF_LITE_MICRO_TEST(Softmax1DQuantizedInt16ShouldMatchGolden) {
+  const float input_scale = 0.1f;
+  const int input_zero_point = 0;
+
+  int16_t input_quantized[tflite::testing::flat_size_1d];
+  int16_t golden_quantized[tflite::testing::flat_size_1d];
+  int16_t output_data[tflite::testing::flat_size_1d];
+  tflite::testing::TestSoftmaxQuantized(
+      tflite::testing::shape_1d, tflite::testing::input_data_1d,
+      input_quantized, input_scale, input_zero_point, tflite::testing::shape_1d,
+      tflite::testing::golden_1d, golden_quantized,
+      tflite::testing::output_scale_int16,
+      tflite::testing::output_zero_point_int16, output_data);
+}
+
 TF_LITE_MICRO_TEST(Softmax2DFloatShouldMatchGolden) {
   float output_data[tflite::testing::flat_size_2d];
   tflite::testing::TestSoftmaxFloat(
@@ -393,6 +413,21 @@ TF_LITE_MICRO_TEST(Softmax2DQuantizedInt8ShouldMatchGolden) {
       tflite::testing::output_zero_point_int8, output_data);
 }
 
+TF_LITE_MICRO_TEST(Softmax2DQuantizedInt16ShouldMatchGolden) {
+  const float input_scale = 0.1f;
+  const int input_zero_point = 0;
+
+  int16_t input_quantized[tflite::testing::flat_size_2d];
+  int16_t golden_quantized[tflite::testing::flat_size_2d];
+  int16_t output_data[tflite::testing::flat_size_2d];
+  tflite::testing::TestSoftmaxQuantized(
+      tflite::testing::shape_2d, tflite::testing::input_data_2d,
+      input_quantized, input_scale, input_zero_point, tflite::testing::shape_2d,
+      tflite::testing::golden_2d, golden_quantized,
+      tflite::testing::output_scale_int16,
+      tflite::testing::output_zero_point_int16, output_data);
+}
+
 TF_LITE_MICRO_TEST(Softmax3DFloatShouldMatchGolden) {
   float output_data[tflite::testing::flat_size_3d];
   tflite::testing::TestSoftmaxFloat(
@@ -430,6 +465,22 @@ TF_LITE_MICRO_TEST(Softmax3DQuantizedInt8ShouldMatchGolden) {
       tflite::testing::output_zero_point_int8, output_data);
 }
 
+TF_LITE_MICRO_TEST(Softmax3DQuantizedInt16ShouldMatchGolden) {
+  const float input_scale = 0.1f;
+  const int input_zero_point = 0;
+
+  int16_t input_quantized[tflite::testing::flat_size_3d];
+  int16_t golden_quantized[tflite::testing::flat_size_3d];
+  int16_t output_data[tflite::testing::flat_size_3d];
+  tflite::testing::TestSoftmaxQuantized(
+      tflite::testing::shape_3d, tflite::testing::input_data_3d,
+      input_quantized, input_scale, input_zero_point, tflite::testing::shape_3d,
+      tflite::testing::golden_3d, golden_quantized,
+      tflite::testing::output_scale_int16,
+      tflite::testing::output_zero_point_int16, output_data,
+      tflite::testing::tolerance_int16);
+}
+
 TF_LITE_MICRO_TEST(Softmax4DFloatShouldMatchGolden) {
   float output_data[tflite::testing::flat_size_4d];
   tflite::testing::TestSoftmaxFloat(
@@ -467,4 +518,19 @@ TF_LITE_MICRO_TEST(Softmax4DQuantizedInt8ShouldMatchGolden) {
       tflite::testing::output_zero_point_int8, output_data);
 }
 
+TF_LITE_MICRO_TEST(Softmax4DQuantizedInt16ShouldMatchGolden) {
+  const float input_scale = 0.1f;
+  const int input_zero_point = 0;
+
+  int16_t input_quantized[tflite::testing::flat_size_4d];
+  int16_t golden_quantized[tflite::testing::flat_size_4d];
+  int16_t output_data[tflite::testing::flat_size_4d];
+  tflite::testing::TestSoftmaxQuantized(
+      tflite::testing::shape_4d, tflite::testing::input_data_4d,
+      input_quantized, input_scale, input_zero_point, tflite::testing::shape_4d,
+      tflite::testing::golden_4d, golden_quantized,
+      tflite::testing::output_scale_int16,
+      tflite::testing::output_zero_point_int16, output_data,
+      tflite::testing::tolerance_int16);
+}
 TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h
index d320a90d005..31405dfb998 100644
--- a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h
+++ b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h
@@ -226,6 +226,17 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a);
     }                                                                        \
   } while (0)
 
+#define TF_LITE_ENSURE_NEAR(context, a, b, epsilon)                          \
+  do {                                                                       \
+    auto delta = ((a) > (b)) ? ((a) - (b)) : ((b) - (a));                    \
+    if (delta > epsilon) {                                                   \
+      TF_LITE_KERNEL_LOG((context), "%s:%d %s not near %s (%f != %f)",       \
+                         __FILE__, __LINE__, #a, #b, static_cast<double>(a), \
+                         static_cast<double>(b));                            \
+      return kTfLiteError;                                                   \
+    }                                                                        \
+  } while (0)
+
 #define TF_LITE_ENSURE_OK(context, status) \
   do {                                     \
     const TfLiteStatus s = (status);       \
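
Before the second commit, a brief note on why it is a drop-in change (illustration only, not part of either patch): it swaps the gen_lut() callable parameter from std::function to a plain function pointer, and the remaining hunks are incidental re-formatting. The gen_lut() call sites added by the first patch in softmax.cc pass capture-less lambdas, which convert implicitly to function pointers, so the kernel needs no edits; a capturing lambda would not convert. The stand-alone sketch below shows the same pattern and uses no TF Lite API.

// Illustration only -- a capture-less lambda decays to a plain function
// pointer, matching the post-patch gen_lut() parameter shape.
#include <cmath>
#include <cstdio>

using LutFn = float (*)(float);  // same shape as float (*func)(float)

float eval_at(LutFn func, float x) { return func(x); }

int main() {
  // The same callables the kernel registers for its two tables.
  LutFn exp_fn = [](float x) { return std::exp(x); };
  LutFn recip_fn = [](float x) { return 1.0f / (1.0f + x); };

  // A capturing lambda (e.g. [beta](float x) { ... }) would not convert and
  // would have required keeping the std::function overloads.
  std::printf("exp(-1)=%f  1/(1+0.5)=%f\n", eval_at(exp_fn, -1.0f),
              eval_at(recip_fn, 0.5f));
  return 0;
}
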
From b2e09087dd73471c7209d637f3e9c93fa24505d2 Mon Sep 17 00:00:00 2001
From: Yair Ehrenwald <yair.ehrenwald@ceva-dsp.com>
Date: Tue, 22 Sep 2020 00:17:45 +0300
Subject: [PATCH 2/2] Changed gen_lut params from std::function to function pointer

---
 tensorflow/lite/kernels/internal/common.h | 38 ++++++++++++++---------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index 92bb01a2900..fbd5c9ccf0f 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -219,9 +219,9 @@ inline int CountLeadingSignBits(T integer_input) {
   using U = typename std::make_unsigned<T>::type;
   return integer_input >= 0
              ? CountLeadingZeros(static_cast<U>(integer_input)) - 1
-         : integer_input != std::numeric_limits<T>::min()
-             ? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
-             : 0;
+             : integer_input != std::numeric_limits<T>::min()
+                   ? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1)
+                   : 0;
 #endif
 }
 
@@ -241,8 +241,12 @@ inline Integer FloorLog2(Integer n) {
 
 // generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
 // softmax
-inline void gen_lut(const std::function<double(double)>& func, double min,
-                    double max, int16_t* table, const int num) {
+// func - the function to build the LUT for (e.g exp(x))
+// min,max - table limits
+// table - pointer to buffer
+// num - number of elements in the LUT
+inline void gen_lut(double (*func)(double), double min, double max,
+                    int16_t* table, const int num) {
   // size of table should equal to num + 1
   // last element only for slope calculation
   double step = (max - min) / (num - 1);
@@ -265,8 +269,12 @@ inline void gen_lut(const std::function<double(double)>& func, double min,
 
 // generate INT16 LUT for function(), e.g., table exp(x) and 1/(1+x) used in
 // softmax
-inline void gen_lut(const std::function<float(float)>& func, float min,
-                    float max, int16_t* table, const int num) {
+// func - the function to build the LUT for (e.g exp(x))
+// min,max - table limits
+// table - pointer to buffer
+// num - number of elements in the LUT
+inline void gen_lut(float (*func)(float), float min, float max, int16_t* table,
+                    const int num) {
   // size of table should equal to num + 1
   // last element only for slope calculation
   float step = (max - min) / (num - 1);
@@ -463,13 +471,15 @@ inline void DownScaleInt32ToInt16Multiplier(int32_t multiplier_int32_t,
 // ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
 // ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
 constexpr int min_log_x_output_bits(int input_bits) {
-  return input_bits > 90 ? 7
-         : input_bits > 44 ? 6
-         : input_bits > 21 ? 5
-         : input_bits > 10 ? 4
-         : input_bits > 4 ? 3
-         : input_bits > 1 ? 2
-         : 1;
+  return input_bits > 90
+             ? 7
+             : input_bits > 44
+                   ? 6
+                   : input_bits > 21
+                         ? 5
+                         : input_bits > 10
+                               ? 4
+                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
 }
 
 // Although currently the name of this function says that it cannot handle