Fix implementation of quantized ReluX: rescale values from the input to the output quantization parameters (zero points plus a precomputed multiplier/shift) before clamping, instead of clamping the raw quantized input.

PiperOrigin-RevId: 277208160
Change-Id: I6f92a34cd07f7451baf6fe723f2001db647d00ac
Author: Yunlu Li
Date: 2019-10-28 21:25:14 -07:00
Committed by: TensorFlower Gardener
Parent: f0570083f5
Commit: d21270193b
5 changed files with 83 additions and 16 deletions


@ -87,6 +87,11 @@ struct HardSwishData {
HardSwishParams params;
};
struct ReluOpData : public OpData {
int32_t output_multiplier = 0;
int output_shift = 0;
};
namespace {
TfLiteStatus CheckOutputQuantParams(TfLiteContext* context,
const TfLiteTensor* input,
@ -136,8 +141,8 @@ void EvalUsingLookupTable(struct OpData* data, const TfLiteTensor* input,
template <typename T>
void QuantizedReluX(float act_min, float act_max, const TfLiteTensor* input,
-TfLiteTensor* output) {
-ActivationParams params;
+TfLiteTensor* output, const ReluOpData* data) {
+ReluParams params;
params.quantized_activation_min =
std::max(static_cast<int32_t>(std::numeric_limits<T>::min()),
output->params.zero_point +
@ -149,6 +154,10 @@ void QuantizedReluX(float act_min, float act_max, const TfLiteTensor* input,
static_cast<int32_t>(std::numeric_limits<T>::max()),
output->params.zero_point +
static_cast<int32>(roundf(act_max / output->params.scale)));
params.input_offset = input->params.zero_point;
params.output_offset = output->params.zero_point;
params.output_multiplier = data->output_multiplier;
params.output_shift = data->output_shift;
optimized_ops::ReluX(params, GetTensorShape(input), GetTensorData<T>(input),
GetTensorShape(output), GetTensorData<T>(output));
}
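
For reference, the two clamp bounds above are the float activation limits re-expressed in the output tensor's quantized domain, zero_point + round(act / scale), clipped to the numeric range of T. A minimal standalone sketch of that mapping, not part of this commit (the example scale and zero point are assumed values):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

// Sketch: express a float activation bound in the quantized output domain,
// mirroring the std::max/std::min expressions in QuantizedReluX above.
template <typename T>
int32_t QuantizedBound(float act, float output_scale, int32_t output_zero_point) {
  const int32_t raw =
      output_zero_point + static_cast<int32_t>(std::round(act / output_scale));
  const int32_t lowest = static_cast<int32_t>(std::numeric_limits<T>::min());
  const int32_t highest = static_cast<int32_t>(std::numeric_limits<T>::max());
  return std::min(highest, std::max(lowest, raw));
}

int main() {
  // Assumed example: int8 Relu6 output with scale 6/255 and zero point -128.
  const float scale = 6.0f / 255.0f;
  const int32_t zero_point = -128;
  const int qmin = QuantizedBound<int8_t>(0.0f, scale, zero_point);  // -128
  const int qmax = QuantizedBound<int8_t>(6.0f, scale, zero_point);  // 127
  std::printf("quantized_activation_min=%d quantized_activation_max=%d\n", qmin, qmax);
  return 0;
}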
@ -206,6 +215,32 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
TfLiteIntArrayCopy(input->dims));
}
void* ReluInit(TfLiteContext* context, const char* buffer, size_t length) {
return new ReluOpData;
}
void ReluFree(TfLiteContext* context, void* buffer) {
delete reinterpret_cast<ReluOpData*>(buffer);
}
TfLiteStatus ReluPrepare(TfLiteContext* context, TfLiteNode* node) {
ReluOpData* data = reinterpret_cast<ReluOpData*>(node->user_data);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* output = GetOutput(context, node, 0);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
if (input->type == kTfLiteInt8 || input->type == kTfLiteUInt8) {
double real_multiplier = input->params.scale / output->params.scale;
QuantizeMultiplier(real_multiplier, &data->output_multiplier,
&data->output_shift);
}
return context->ResizeTensor(context, output,
TfLiteIntArrayCopy(input->dims));
}
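
ReluPrepare precomputes the requantization parameters once at Prepare time: the real multiplier input_scale / output_scale is decomposed by QuantizeMultiplier into a 32-bit fixed-point multiplier and a power-of-two shift. A simplified standalone sketch of that decomposition, not TFLite's actual QuantizeMultiplier implementation:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch: split a positive real multiplier into a Q31 fixed-point multiplier
// and a left shift, so that real ~= (multiplier / 2^31) * 2^shift.
void DecomposeMultiplier(double real_multiplier, int32_t* quantized_multiplier,
                         int* shift) {
  if (real_multiplier == 0.0) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(real_multiplier, shift);  // q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {  // rounding pushed q up to exactly 1.0
    q_fixed /= 2;
    ++*shift;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}

int main() {
  // Assumed example scales: input scale 10/255, output scale 6/255.
  const double real_multiplier = (10.0 / 255.0) / (6.0 / 255.0);  // ~1.667
  int32_t multiplier = 0;
  int shift = 0;
  DecomposeMultiplier(real_multiplier, &multiplier, &shift);
  std::printf("multiplier=%d shift=%d\n", static_cast<int>(multiplier), shift);
  return 0;
}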
void* LeakyReluInit(TfLiteContext* context, const char* buffer, size_t length) {
return new LeakyReluOpData;
}
@ -557,6 +592,7 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* output = GetOutput(context, node, 0);
const ReluOpData* data = reinterpret_cast<ReluOpData*>(node->user_data);
switch (input->type) {
case kTfLiteFloat32: {
optimized_ops::Relu(GetTensorShape(input), GetTensorData<float>(input),
@ -566,11 +602,11 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
// the unbounded upper limit is actually hard to quantize.
case kTfLiteUInt8: {
QuantizedReluX<uint8_t>(0.0f, std::numeric_limits<float>::infinity(),
-input, output);
+input, output, data);
} break;
case kTfLiteInt8: {
QuantizedReluX<int8_t>(0.0f, std::numeric_limits<float>::infinity(),
-input, output);
+input, output, data);
} break;
default:
context->ReportError(
@ -584,6 +620,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* output = GetOutput(context, node, 0);
const ReluOpData* data = reinterpret_cast<ReluOpData*>(node->user_data);
switch (input->type) {
case kTfLiteFloat32: {
optimized_ops::Relu1(GetTensorShape(input), GetTensorData<float>(input),
@ -592,11 +629,11 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteOk;
} break;
case kTfLiteUInt8: {
-QuantizedReluX<uint8_t>(-1.0f, 1.0f, input, output);
+QuantizedReluX<uint8_t>(-1.0f, 1.0f, input, output, data);
return kTfLiteOk;
} break;
case kTfLiteInt8: {
-QuantizedReluX<int8_t>(-1, 1, input, output);
+QuantizedReluX<int8_t>(-1, 1, input, output, data);
return kTfLiteOk;
} break;
default:
@ -665,6 +702,7 @@ TfLiteStatus HardSwishEval(TfLiteContext* context, TfLiteNode* node) {
TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, 0);
TfLiteTensor* output = GetOutput(context, node, 0);
ReluOpData* data = reinterpret_cast<ReluOpData*>(node->user_data);
switch (input->type) {
case kTfLiteFloat32: {
size_t elements = input->bytes / sizeof(float);
@ -675,10 +713,10 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteOk;
} break;
case kTfLiteUInt8:
-QuantizedReluX<uint8_t>(0.0f, 6.0f, input, output);
+QuantizedReluX<uint8_t>(0.0f, 6.0f, input, output, data);
return kTfLiteOk;
case kTfLiteInt8: {
-QuantizedReluX<int8_t>(0.0f, 6.0f, input, output);
+QuantizedReluX<int8_t>(0.0f, 6.0f, input, output, data);
return kTfLiteOk;
} break;
default:
@ -1074,22 +1112,22 @@ TfLiteRegistration* Register_ELU() {
}
TfLiteRegistration* Register_RELU() {
-static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-activations::GenericPrepare,
+static TfLiteRegistration r = {activations::ReluInit, activations::ReluFree,
+activations::ReluPrepare,
activations::ReluEval};
return &r;
}
TfLiteRegistration* Register_RELU_N1_TO_1() {
-static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-activations::GenericPrepare,
+static TfLiteRegistration r = {activations::ReluInit, activations::ReluFree,
+activations::ReluPrepare,
activations::Relu1Eval};
return &r;
}
TfLiteRegistration* Register_RELU6() {
-static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-activations::GenericPrepare,
+static TfLiteRegistration r = {activations::ReluInit, activations::ReluFree,
+activations::ReluPrepare,
activations::Relu6Eval};
return &r;
}


@ -213,6 +213,24 @@ inline void Relu6(const RuntimeShape& input_shape, const float* input_data,
}
}
template <typename T>
inline void ReluX(const tflite::ReluParams& params,
const RuntimeShape& input_shape, const T* input_data,
const RuntimeShape& output_shape, T* output_data) {
gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)");
const int flat_size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
const int32 val = static_cast<int32_t>(input_data[i]);
int32 clamped = params.output_offset +
MultiplyByQuantizedMultiplier(val - params.input_offset,
params.output_multiplier,
params.output_shift);
clamped = std::max(params.quantized_activation_min, clamped);
clamped = std::min(params.quantized_activation_max, clamped);
output_data[i] = static_cast<T>(clamped);
}
}
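
Each element is therefore processed as: subtract the input zero point, rescale by the precomputed multiplier and shift, add the output zero point, then clamp to the activation bounds. A rough standalone illustration of a single element, not part of this commit, with MultiplyByQuantizedMultiplier's rounding-doubling fixed-point arithmetic replaced by plain floating-point math and with assumed example quantization parameters:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of one element of the quantized ReluX above, with the fixed-point
// MultiplyByQuantizedMultiplier replaced by a double multiplication.
int8_t ReluXOneElement(int8_t input, int32_t input_offset, double real_multiplier,
                       int32_t output_offset, int32_t act_min, int32_t act_max) {
  const int32_t centered = static_cast<int32_t>(input) - input_offset;
  int32_t val = output_offset +
                static_cast<int32_t>(std::round(centered * real_multiplier));
  val = std::max(act_min, val);
  val = std::min(act_max, val);
  return static_cast<int8_t>(val);
}

int main() {
  // Assumed example: int8 Relu6, input scale 10/255 (zero point -77),
  // output scale 6/255 (zero point -128); real multiplier ~1.667.
  const int8_t q_in = 50;  // represents roughly (50 - (-77)) * 10/255 ~= 4.98
  const int8_t q_out = ReluXOneElement(q_in, /*input_offset=*/-77,
                                       /*real_multiplier=*/10.0 / 6.0,
                                       /*output_offset=*/-128,
                                       /*act_min=*/-128, /*act_max=*/127);
  // Dequantizing q_out with the output parameters should again give ~4.98.
  std::printf("q_out=%d dequantized=%f\n", q_out, (q_out + 128) * 6.0 / 255.0);
  return 0;
}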
template <typename T>
inline void ReluX(const tflite::ActivationParams& params,
const RuntimeShape& input_shape, const T* input_data,


@ -734,6 +734,13 @@ struct ActivationParams {
int32 quantized_activation_max;
};
struct ReluParams : public ActivationParams {
int32 input_offset;
int32 output_offset;
int32 output_multiplier;
int32 output_shift;
};
// Styles of resizing op usages. For example, kImageStyle can be used with a Pad
// op for pattern-specific optimization.
enum class ResizingCategory : uint8 {


@ -493,6 +493,7 @@ edgetpu_ops = [
"max_pool",
"mul",
"pad", # high error
"relu6",
"reshape",
"resize_bilinear",
"sigmoid",


@ -32,6 +32,8 @@ def make_relu6_tests(options):
test_parameters = [{
"input_shape": [[], [1, 1, 1, 1], [1, 3, 4, 3], [3, 15, 14, 3],
[3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
"fully_quantize": [True, False],
"input_range": [(-2, 8)]
}]
def build_graph(parameters):
@ -41,8 +43,9 @@ def make_relu6_tests(options):
return [input_tensor], [out]
def build_inputs(parameters, sess, inputs, outputs):
-input_values = create_tensor_data(
-np.float32, parameters["input_shape"], min_value=-3, max_value=10)
+min_value, max_value = parameters["input_range"]
+input_values = create_tensor_data(np.float32, parameters["input_shape"],
+min_value, max_value)
return [input_values], sess.run(
outputs, feed_dict=dict(zip(inputs, [input_values])))
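
The new test parameters exercise fully quantized Relu6 with an input range of (-2, 8), so the input tensor gets a different scale and zero point than the [0, 6] output and the kernel has to requantize rather than merely clamp, which is the case this commit fixes. A rough sketch, not part of this commit, of how such ranges map to asymmetric uint8 parameters and of the resulting requantization multiplier (the converter's exact calibration may differ):

#include <cmath>
#include <cstdio>

// Sketch: derive asymmetric uint8 scale/zero-point for a float range,
// then the Relu6 requantization multiplier input_scale / output_scale.
int main() {
  auto quantize_params = [](double min, double max, double* scale, int* zero_point) {
    *scale = (max - min) / 255.0;
    *zero_point = static_cast<int>(std::round(0.0 - min / *scale));
  };

  double in_scale, out_scale;
  int in_zp, out_zp;
  quantize_params(-2.0, 8.0, &in_scale, &in_zp);   // test input_range
  quantize_params(0.0, 6.0, &out_scale, &out_zp);  // Relu6 output range

  std::printf("input: scale=%f zero_point=%d\n", in_scale, in_zp);
  std::printf("output: scale=%f zero_point=%d\n", out_scale, out_zp);
  std::printf("real_multiplier=%f\n", in_scale / out_scale);  // ~1.667
  return 0;
}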