From 924d0b72c568f249f2fd224a942f8922524bfede Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Thu, 19 Mar 2020 15:49:04 +0000
Subject: [PATCH] Addressed reviewer comments.

---
 tensorflow/lite/kernels/add.cc                | 73 ++++++++++++-------
 .../lite/kernels/internal/reference/add.h     | 26 +++++--
 tensorflow/lite/kernels/sub.cc                | 61 +++++++++++-----
 3 files changed, 107 insertions(+), 53 deletions(-)

diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
index 7ad744b4910..731c2fb6289 100644
--- a/tensorflow/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -58,6 +58,11 @@ struct OpData {
   int32 input1_offset;
   int32 input2_offset;
   int32 output_offset;
+
+  // This parameter is used to indicate whether
+  // the scale parameter is a power of two.
+  // It is used in 16-bit -> 16-bit quantization.
+  bool pot_scale_16bit;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -95,12 +100,36 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // 8bit -> 8bit general quantized path, with general rescalings
   // as well as, 16bit -> 16bit with general rescalings
-  bool general_16bit = input1->type == kTfLiteInt16 &&
-                       input2->type == kTfLiteInt16 &&
-                       output->type == kTfLiteInt16;
+  bool pot_scale_16bit = false;
+
+  bool input1_scale_is_pot = false;
+  bool input2_scale_is_pot = false;
+  bool output_scale_is_pot = false;
+
+  int input1_scale_log2_rounded;
+  int input2_scale_log2_rounded;
+  int output_scale_log2_rounded;
+
+  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
+      output->type == kTfLiteInt16) {
+    // Check that param scale is POT
+    input1_scale_is_pot =
+        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
+
+    input2_scale_is_pot =
+        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
+
+    output_scale_is_pot =
+        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
+
+    pot_scale_16bit =
+        input1_scale_is_pot && input2_scale_is_pot && output_scale_is_pot;
+  }
+
+  data->pot_scale_16bit = pot_scale_16bit;
 
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
-      general_16bit) {
+      pot_scale_16bit) {
     // 8bit -> 8bit general quantized path, with general rescalings
     // as well as, 16bit -> 16bit with general rescalings
     data->input1_offset = -input1->params.zero_point;
@@ -110,7 +139,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     // The shift is set to 15 for 16-bit and 20 in case of 8-bit, accordingly.
     // In case of 16-bit we have 65535 << 15 which is less than 1 << 31,
     // therefore the addition will still fit in a 32 bit accumulator.
-    data->left_shift = general_16bit ? 15 : 20;
+    data->left_shift = pot_scale_16bit ? 15 : 20;
     const double twice_max_input_scale =
         2 * std::max(input1->params.scale, input2->params.scale);
     const double real_input1_multiplier =
@@ -146,19 +175,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
     TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
 
-    int input1_scale_log2_rounded;
-    bool input1_scale_is_pot =
-        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
     TF_LITE_ENSURE(context, input1_scale_is_pot);
-
-    int input2_scale_log2_rounded;
-    bool input2_scale_is_pot =
-        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
     TF_LITE_ENSURE(context, input2_scale_is_pot);
-
-    int output_scale_log2_rounded;
-    bool output_scale_is_pot =
-        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
     TF_LITE_ENSURE(context, output_scale_is_pot);
 
     data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
@@ -233,12 +251,8 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input1,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
-  bool general_16bit = input1->type == kTfLiteInt16 &&
-                       input2->type == kTfLiteInt16 &&
-                       output->type == kTfLiteInt16;
-
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
-      general_16bit) {
+      data->pot_scale_16bit) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;
@@ -277,7 +291,10 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
       if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int16_t);
       } else {
-        TF_LITE_ADD(reference_ops, Add, int16_t);
+        reference_ops::Add(
+            op_params, GetTensorShape(input1), GetTensorData<int16_t>(input1),
+            GetTensorShape(input2), GetTensorData<int16_t>(input2),
+            GetTensorShape(output), GetTensorData<int16_t>(output), false);
       }
     } else {
       if (kernel_type == kReference) {
@@ -296,12 +313,12 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
     }
 #undef TF_LITE_ADD
   } else if (output->type == kTfLiteInt16) {
+    tflite::ArithmeticParams op_params;
+    op_params.input1_shift = data->input1_shift;
+    op_params.input2_shift = data->input2_shift;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
 #define TF_LITE_ADD(type, opname)                                      \
-  tflite::ArithmeticParams op_params;                                  \
-  op_params.input1_shift = data->input1_shift;                         \
-  op_params.input2_shift = data->input2_shift;                         \
-  SetActivationParams(data->output_activation_min,                     \
-                      data->output_activation_max, &op_params);        \
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<int16_t>(input1), GetTensorShape(input2), \
                GetTensorData<int16_t>(input2), GetTensorShape(output), \
                GetTensorData<int16_t>(output))
@@ -309,7 +326,7 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
     // The quantized version of Add doesn't support activations, so we
     // always use BroadcastAdd.
     if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops, AddLSTM);
+      TF_LITE_ADD(reference_ops, Add);
     } else {
       TF_LITE_ADD(optimized_ops, Add);
     }
diff --git a/tensorflow/lite/kernels/internal/reference/add.h b/tensorflow/lite/kernels/internal/reference/add.h
index c1b0163640b..741f4e684c5 100644
--- a/tensorflow/lite/kernels/internal/reference/add.h
+++ b/tensorflow/lite/kernels/internal/reference/add.h
@@ -137,10 +137,13 @@ inline void Add(const ArithmeticParams& params,
   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
 
-inline void Add(const ArithmeticParams& params,
-                const RuntimeShape& input1_shape, const int16* input1_data,
-                const RuntimeShape& input2_shape, const int16* input2_data,
-                const RuntimeShape& output_shape, int16* output_data) {
+inline void AddGeneralParamScale(const ArithmeticParams& params,
+                                 const RuntimeShape& input1_shape,
+                                 const int16* input1_data,
+                                 const RuntimeShape& input2_shape,
+                                 const int16* input2_data,
+                                 const RuntimeShape& output_shape,
+                                 int16* output_data) {
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   const int flat_size =
@@ -155,10 +158,17 @@ inline void Add(const ArithmeticParams& params,
   AddElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
 
-inline void AddLSTM(const ArithmeticParams& params,
-                    const RuntimeShape& input1_shape, const int16* input1_data,
-                    const RuntimeShape& input2_shape, const int16* input2_data,
-                    const RuntimeShape& output_shape, int16* output_data) {
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16* input1_data,
+                const RuntimeShape& input2_shape, const int16* input2_data,
+                const RuntimeShape& output_shape, int16* output_data,
+                bool pot_scale = true) {
+  if (!pot_scale) {
+    AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
+                         input2_data, output_shape, output_data);
+    return;
+  }
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
index 077533c7338..c314289604d 100644
--- a/tensorflow/lite/kernels/sub.cc
+++ b/tensorflow/lite/kernels/sub.cc
@@ -60,6 +60,11 @@ struct OpData {
   int32 input1_offset;
   int32 input2_offset;
   int32 output_offset;
+
+  // This parameter is used to indicate whether
+  // the scale parameter is a power of two.
+  // It is used in 16-bit -> 16-bit quantization.
+  bool pot_scale_16bit;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -147,10 +152,11 @@ TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
   return kTfLiteOk;
 }
 
-TfLiteStatus PrepareLSTMSubOp(TfLiteContext* context,
-                              const TfLiteTensor* input1,
-                              const TfLiteTensor* input2, TfLiteTensor* output,
-                              TfLiteSubParams* params, OpData* data) {
+TfLiteStatus PrepareInt16SubOpPOT(TfLiteContext* context,
+                                  const TfLiteTensor* input1,
+                                  const TfLiteTensor* input2,
+                                  TfLiteTensor* output, TfLiteSubParams* params,
+                                  OpData* data) {
   // 16bit -> 16bit special quantized path, supporting only a rather
   // narrow case of quantization parameters: zero_points must all be 0
   // ("symmetric quantization") and scales must be power-of-two (which
@@ -219,19 +225,42 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // 8bit -> 8bit general quantized path, with general rescalings
   // as well as, 16bit -> 16bit with general rescalings
+  bool pot_scale_16bit = false;
 
-  bool general_16bit = output->type == kTfLiteInt16 &&
-                       input1->type == kTfLiteInt16 &&
-                       input2->type == kTfLiteInt16;
+  bool input1_scale_is_pot = false;
+  bool input2_scale_is_pot = false;
+  bool output_scale_is_pot = false;
+
+  int input1_scale_log2_rounded;
+  int input2_scale_log2_rounded;
+  int output_scale_log2_rounded;
+
+  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
+      output->type == kTfLiteInt16) {
+    // Check that param scale is POT
+    input1_scale_is_pot =
+        CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
+
+    input2_scale_is_pot =
+        CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
+
+    output_scale_is_pot =
+        CheckedLog2(output->params.scale, &output_scale_log2_rounded);
+
+    pot_scale_16bit =
+        input1_scale_is_pot && input2_scale_is_pot && output_scale_is_pot;
+  }
+
+  data->pot_scale_16bit = pot_scale_16bit;
 
   if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
-      general_16bit) {
+      pot_scale_16bit) {
     TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
                                                    output, params, data, -1));
   } else if (output->type == kTfLiteInt16) {
     // LSTM-special case with scale parameter of POT
-    TF_LITE_ENSURE_OK(context, PrepareLSTMSubOp(context, input1, input2, output,
-                                                params, data));
+    TF_LITE_ENSURE_OK(context, PrepareInt16SubOpPOT(context, input1, input2,
+                                                    output, params, data));
   }
 
   return context->ResizeTensor(context, output, output_size);
@@ -306,11 +335,6 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
       GetTensorShape(input1), GetTensorShape(input2), &op_params);
-  // 16bit -> 16bit with general rescaling
-  bool general_16bit = output->type == kTfLiteInt16 &&
-                       input1->type == kTfLiteInt16 &&
-                       input2->type == kTfLiteInt16;
-
 #define TF_LITE_SUB(type, opname, data_type)                             \
   type::opname(op_params, GetTensorShape(input1),                        \
                GetTensorData<data_type>(input1), GetTensorShape(input2), \
                GetTensorData<data_type>(input2), GetTensorShape(output), \
@@ -324,11 +348,14 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     } else {
       TF_LITE_SUB(reference_integer_ops, Add, int8_t);
     }
-  } else if (general_16bit) {
+  } else if (data->pot_scale_16bit) {
     if (need_broadcast) {
       TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, int16_t);
     } else {
-      TF_LITE_SUB(reference_ops, Add, int16_t);
+      reference_ops::Add(op_params, GetTensorShape(input1),
+                         GetTensorData<int16_t>(input1), GetTensorShape(input2),
+                         GetTensorData<int16_t>(input2), GetTensorShape(output),
+                         GetTensorData<int16_t>(output), false);
     }
   } else if (output->type == kTfLiteUInt8) {
     if (kernel_type == kReference) {
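
Aside, not part of the patch above: the comment kept next to data->left_shift argues that a shift of 15 keeps 16-bit additions inside a 32-bit accumulator because 65535 << 15 is below 1 << 31. A minimal standalone C++ check of that arithmetic, independent of the TFLite sources (file name and main() harness are illustrative only):

// headroom_check.cc - verifies the accumulator-headroom arithmetic behind
// left_shift = 15 for the 16-bit path: 65535 << 15 must stay below 1 << 31.
#include <cstdint>
#include <iostream>

int main() {
  const std::int64_t max_16bit_span = 65535;               // value span cited in the comment
  const std::int64_t shifted = max_16bit_span << 15;       // 2147450880
  const std::int64_t int32_limit = std::int64_t{1} << 31;  // 2147483648
  std::cout << "65535 << 15 = " << shifted << "\n"
            << "1 << 31     = " << int32_limit << "\n"
            << "fits in a 32-bit accumulator: "
            << (shifted < int32_limit ? "yes" : "no") << "\n";
  return 0;
}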