From d21270193b92b5a588c1bc4c4faa7e99c08ed988 Mon Sep 17 00:00:00 2001
From: Yunlu Li
Date: Mon, 28 Oct 2019 21:25:14 -0700
Subject: [PATCH] Fix implementation of quantized ReluX.

PiperOrigin-RevId: 277208160
Change-Id: I6f92a34cd07f7451baf6fe723f2001db647d00ac
---
 tensorflow/lite/kernels/activations.cc     | 66 +++++++++++++++----
 .../internal/reference/reference_ops.h     | 18 +++++
 tensorflow/lite/kernels/internal/types.h   |  7 ++
 tensorflow/lite/testing/BUILD              |  1 +
 tensorflow/lite/testing/op_tests/relu6.py  |  7 +-
 5 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 2fc9c2d6d67..e49c274e4c3 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -87,6 +87,11 @@ struct HardSwishData {
   HardSwishParams params;
 };
 
+struct ReluOpData : public OpData {
+  int32_t output_multiplier = 0;
+  int output_shift = 0;
+};
+
 namespace {
 TfLiteStatus CheckOutputQuantParams(TfLiteContext* context,
                                     const TfLiteTensor* input,
@@ -136,8 +141,8 @@ void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input,
 
 template <typename T>
 void QuantizedReluX(float act_min, float act_max, const TfLiteTensor* input,
-                    TfLiteTensor* output) {
-  ActivationParams params;
+                    TfLiteTensor* output, const ReluOpData* data) {
+  ReluParams params;
   params.quantized_activation_min =
       std::max(static_cast<int32_t>(std::numeric_limits<T>::min()),
                output->params.zero_point +
@@ -149,6 +154,10 @@ void QuantizedReluX(float act_min, float act_max, const TfLiteTensor* input,
                 static_cast<int32_t>(std::numeric_limits<T>::max()),
                 output->params.zero_point +
                     static_cast<int32_t>(roundf(act_max / output->params.scale)));
+  params.input_offset = input->params.zero_point;
+  params.output_offset = output->params.zero_point;
+  params.output_multiplier = data->output_multiplier;
+  params.output_shift = data->output_shift;
   optimized_ops::ReluX(params, GetTensorShape(input), GetTensorData<T>(input),
                        GetTensorShape(output), GetTensorData<T>(output));
 }
@@ -206,6 +215,32 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
                                TfLiteIntArrayCopy(input->dims));
 }
 
+void* ReluInit(TfLiteContext* context, const char* buffer, size_t length) {
+  return new ReluOpData;
+}
+
+void ReluFree(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<ReluOpData*>(buffer);
+}
+
+TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) {
+  ReluOpData* data = reinterpret_cast<ReluOpData*>(node->user_data);
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  if (input->type == kTfLiteInt8 || input->type == kTfLiteUInt8) {
+    double real_multiplier = input->params.scale / output->params.scale;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier,
+                       &data->output_shift);
+  }
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
 void* LeakyReluInit(TfLiteContext* context, const char* buffer, size_t length) {
   return new LeakyReluOpData;
 }
@@ -557,6 +592,7 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
 TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
+  const ReluOpData* data = reinterpret_cast<const ReluOpData*>(node->user_data);
   switch (input->type) {
     case kTfLiteFloat32: {
       optimized_ops::Relu(GetTensorShape(input), GetTensorData<float>(input),
@@ -566,11 +602,11 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
     // the unbounded upper limit is actually hard to quantize.
     case kTfLiteUInt8: {
       QuantizedReluX<uint8_t>(0.0f, std::numeric_limits<float>::infinity(),
-                              input, output);
+                              input, output, data);
     } break;
     case kTfLiteInt8: {
       QuantizedReluX<int8_t>(0.0f, std::numeric_limits<float>::infinity(),
-                             input, output);
+                             input, output, data);
     } break;
     default:
       context->ReportError(
@@ -584,6 +620,7 @@
 TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
+  const ReluOpData* data = reinterpret_cast<const ReluOpData*>(node->user_data);
   switch (input->type) {
     case kTfLiteFloat32: {
       optimized_ops::Relu1(GetTensorShape(input), GetTensorData<float>(input),
@@ -592,11 +629,11 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     case kTfLiteUInt8: {
-      QuantizedReluX<uint8_t>(-1.0f, 1.0f, input, output);
+      QuantizedReluX<uint8_t>(-1.0f, 1.0f, input, output, data);
       return kTfLiteOk;
     } break;
     case kTfLiteInt8: {
-      QuantizedReluX<int8_t>(-1, 1, input, output);
+      QuantizedReluX<int8_t>(-1, 1, input, output, data);
       return kTfLiteOk;
     } break;
     default:
@@ -665,6 +702,7 @@ TfLiteStatus HardSwishEval(TfLiteContext* context, TfLiteNode* node) {
 TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
+  ReluOpData* data = reinterpret_cast<ReluOpData*>(node->user_data);
   switch (input->type) {
     case kTfLiteFloat32: {
       size_t elements = input->bytes / sizeof(float);
@@ -675,10 +713,10 @@
       return kTfLiteOk;
     } break;
     case kTfLiteUInt8:
-      QuantizedReluX<uint8_t>(0.0f, 6.0f, input, output);
+      QuantizedReluX<uint8_t>(0.0f, 6.0f, input, output, data);
       return kTfLiteOk;
     case kTfLiteInt8: {
-      QuantizedReluX<int8_t>(0.0f, 6.0f, input, output);
+      QuantizedReluX<int8_t>(0.0f, 6.0f, input, output, data);
      return kTfLiteOk;
     } break;
     default:
@@ -1074,22 +1112,22 @@ TfLiteRegistration* Register_ELU() {
 }
 
 TfLiteRegistration* Register_RELU() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-                                 activations::GenericPrepare,
+  static TfLiteRegistration r = {activations::ReluInit, activations::ReluFree,
+                                 activations::ReluPrepare,
                                  activations::ReluEval};
   return &r;
 }
 
 TfLiteRegistration* Register_RELU_N1_TO_1() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-                                 activations::GenericPrepare,
+  static TfLiteRegistration r = {activations::ReluInit, activations::ReluFree,
+                                 activations::ReluPrepare,
                                  activations::Relu1Eval};
   return &r;
 }
 
 TfLiteRegistration* Register_RELU6() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-                                 activations::GenericPrepare,
+  static TfLiteRegistration r = {activations::ReluInit, activations::ReluFree,
+                                 activations::ReluPrepare,
                                  activations::Relu6Eval};
   return &r;
 }
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index fe6fbd7efc7..ed5d2b2674a 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -213,6 +213,24 @@ inline void Relu6(const RuntimeShape& input_shape, const float* input_data,
   }
 }
 
+template <typename T>
+inline void ReluX(const tflite::ReluParams& params,
+                  const RuntimeShape& input_shape, const T* input_data,
+                  const RuntimeShape& output_shape, T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)");
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const int32 val = static_cast<int32>(input_data[i]);
+    int32 clamped = params.output_offset +
+                    MultiplyByQuantizedMultiplier(val - params.input_offset,
+                                                  params.output_multiplier,
+                                                  params.output_shift);
+    clamped = std::max(params.quantized_activation_min, clamped);
+    clamped = std::min(params.quantized_activation_max, clamped);
+    output_data[i] = static_cast<T>(clamped);
+  }
+}
+
 template <typename T>
 inline void ReluX(const tflite::ActivationParams& params,
                   const RuntimeShape& input_shape, const T* input_data,
diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h
index 1a4a4ee84c3..38769d1bc0b 100644
--- a/tensorflow/lite/kernels/internal/types.h
+++ b/tensorflow/lite/kernels/internal/types.h
@@ -734,6 +734,13 @@ struct ActivationParams {
   int32 quantized_activation_max;
 };
 
+struct ReluParams : public ActivationParams {
+  int32 input_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int32 output_shift;
+};
+
 // Styles of resizing op usages. For example, kImageStyle can be used with a Pad
 // op for pattern-specific optimization.
 enum class ResizingCategory : uint8 {
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index e9889ac6586..769d17ce703 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -493,6 +493,7 @@ edgetpu_ops = [
     "max_pool",
     "mul",
     "pad",  # high error
+    "relu6",
     "reshape",
     "resize_bilinear",
     "sigmoid",
diff --git a/tensorflow/lite/testing/op_tests/relu6.py b/tensorflow/lite/testing/op_tests/relu6.py
index 8184c2f5cbd..db75c22895c 100644
--- a/tensorflow/lite/testing/op_tests/relu6.py
+++ b/tensorflow/lite/testing/op_tests/relu6.py
@@ -32,6 +32,8 @@ def make_relu6_tests(options):
   test_parameters = [{
       "input_shape": [[], [1, 1, 1, 1], [1, 3, 4, 3], [3, 15, 14, 3],
                       [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+      "fully_quantize": [True, False],
+      "input_range": [(-2, 8)]
   }]
 
   def build_graph(parameters):
@@ -41,8 +43,9 @@ def make_relu6_tests(options):
     return [input_tensor], [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(
-        np.float32, parameters["input_shape"], min_value=-3, max_value=10)
+    min_value, max_value = parameters["input_range"]
+    input_values = create_tensor_data(np.float32, parameters["input_shape"],
+                                      min_value, max_value)
     return [input_values], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_values])))
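
Note (not part of the patch): a minimal standalone sketch of the rescaling the fixed quantized ReluX path performs, written in the float domain for clarity. In the kernel itself, ReluPrepare precomputes input_scale / output_scale as an integer output_multiplier/output_shift pair and Eval applies it with MultiplyByQuantizedMultiplier; the helper name and float formulation below are illustrative assumptions, not TF Lite API.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical reference helper: requantize one value from the input's
// (scale, zero_point) to the output's, then clamp to the activation range
// that was pre-quantized into [quantized_activation_min,
// quantized_activation_max] (e.g. roughly [zero_point,
// zero_point + round(6 / output_scale)] for Relu6).
inline int8_t QuantizedReluXReference(int8_t input_q, float input_scale,
                                      int32_t input_zero_point,
                                      float output_scale,
                                      int32_t output_zero_point,
                                      int32_t quantized_activation_min,
                                      int32_t quantized_activation_max) {
  // Dequantize the input: real = input_scale * (q - zero_point).
  const float real = input_scale * (input_q - input_zero_point);
  // Requantize with the output parameters; the integer kernel approximates
  // real / output_scale using the precomputed quantized multiplier.
  int32_t out_q = output_zero_point +
                  static_cast<int32_t>(std::round(real / output_scale));
  // Clamping in the integer domain is what applies the ReLU bounds.
  out_q = std::max(out_q, quantized_activation_min);
  out_q = std::min(out_q, quantized_activation_max);
  return static_cast<int8_t>(out_q);
}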