From fe10ef671df1ac9d240b3d587dc25ed4605ff887 Mon Sep 17 00:00:00 2001
From: Advait Jain
Date: Fri, 24 Jul 2020 10:24:26 -0700
Subject: [PATCH] Prefer the standard integral types over custom type-aliases.

PiperOrigin-RevId: 323021115
Change-Id: Ib934f346bcc86de959027b180e50c8eb0e6f8b7e
---
 .../lite/kernels/internal/reference/add.h     | 125 ++++++++--------
 .../kernels/internal/reference/batch_matmul.h |  16 +--
 .../kernels/internal/reference/comparisons.h  |  44 +++---
 .../internal/reference/concatenation.h        |  12 +-
 .../lite/kernels/internal/reference/conv.h    |  40 +++---
 .../internal/reference/depthwiseconv_uint8.h  |  88 ++++++------
 .../kernels/internal/reference/dequantize.h   |  14 +-
 .../internal/reference/fully_connected.h      | 133 +++++++++---------
 .../kernels/internal/reference/hard_swish.h   |   2 +-
 .../internal/reference/integer_ops/add.h      |  18 +--
 .../internal/reference/integer_ops/conv.h     |  58 ++++----
 .../reference/integer_ops/depthwise_conv.h    |  70 ++++-----
 .../reference/integer_ops/fully_connected.h   |  32 ++---
 .../reference/integer_ops/l2normalization.h   |   8 +-
 .../internal/reference/integer_ops/mul.h      |  36 ++---
 .../internal/reference/integer_ops/pooling.h  |  26 ++--
 .../internal/reference/l2normalization.h      |  27 ++--
 .../kernels/internal/reference/logistic.h     |  14 +-
 .../lite/kernels/internal/reference/mul.h     |  36 ++---
 .../lite/kernels/internal/reference/pad.h     |  14 +-
 .../lite/kernels/internal/reference/pooling.h |  21 +--
 .../reference/portable_tensor_utils.cc        |  76 +++++-----
 .../reference/portable_tensor_utils.h         |   2 +-
 .../lite/kernels/internal/reference/prelu.h   |  26 ++--
 .../kernels/internal/reference/quantize.h     |  12 +-
 .../lite/kernels/internal/reference/reduce.h  |  14 +-
 .../reference/resize_nearest_neighbor.h       |  43 +++---
 .../lite/kernels/internal/reference/softmax.h |  38 ++---
 .../lite/kernels/internal/reference/sub.h     |  97 ++++++-------
 .../lite/kernels/internal/reference/svdf.h    |   2 +-
 .../lite/kernels/internal/reference/tanh.h    |  36 ++---
 31 files changed, 594 insertions(+), 586 deletions(-)

diff --git a/tensorflow/lite/kernels/internal/reference/add.h b/tensorflow/lite/kernels/internal/reference/add.h
index d0c40912091..94c58097154 100644
--- a/tensorflow/lite/kernels/internal/reference/add.h
+++ b/tensorflow/lite/kernels/internal/reference/add.h
@@ -52,33 +52,33 @@ inline void Add(const ArithmeticParams& params,
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
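For illustration only, a minimal standalone sketch of what this rename amounts to. The alias definitions here are assumptions made for the example (the real aliases live elsewhere in the TFLite tree, likely tensorflow/lite/kernels/internal/compatibility.h); they and the <cstdint> types are the same types, so the substitution in this patch is purely textual and behavior-preserving:

#include <cstdint>
#include <type_traits>

// Legacy-style aliases as assumed for this sketch, not the exact TFLite
// declarations.
typedef int8_t int8;
typedef int16_t int16;
typedef int32_t int32;
typedef uint8_t uint8;

// The aliases and the standard fixed-width types are identical, so replacing
// int32 with int32_t (and so on) cannot change overload resolution or behavior.
static_assert(std::is_same<int32, int32_t>::value, "int32 is int32_t");
static_assert(std::is_same<uint8, uint8_t>::value, "uint8 is uint8_t");

int main() { return 0; }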
inline void AddElementwise(int size, const ArithmeticParams& params, - const uint8* input1_data, const uint8* input2_data, - uint8* output_data) { + const uint8_t* input1_data, + const uint8_t* input2_data, uint8_t* output_data) { TFLITE_DCHECK_GT(params.input1_offset, -256); TFLITE_DCHECK_GT(params.input2_offset, -256); TFLITE_DCHECK_LT(params.input1_offset, 256); TFLITE_DCHECK_LT(params.input2_offset, 256); for (int i = 0; i < size; ++i) { - const int32 input1_val = params.input1_offset + input1_data[i]; - const int32 input2_val = params.input2_offset + input2_data[i]; - const int32 shifted_input1_val = input1_val * (1 << params.left_shift); - const int32 shifted_input2_val = input2_val * (1 << params.left_shift); - const int32 scaled_input1_val = + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input1_val, params.input1_multiplier, params.input1_shift); - const int32 scaled_input2_val = + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input2_val, params.input2_multiplier, params.input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); - output_data[i] = static_cast(clamped_output); + output_data[i] = static_cast(clamped_output); } } @@ -86,40 +86,40 @@ inline void AddElementwise(int size, const ArithmeticParams& params, // broadcast add, so that, for example, scalar-broadcast with batch will still // be fast. 
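The fixed-point pipeline in AddElementwise above (left shift, per-input rescale, sum, output rescale, offset, clamp) approximates the real-valued sum under the usual uint8 affine-quantization convention, where a quantized value q stands for scale * (q - zero_point). The float reference below is a sketch of that intent with made-up sample values, not the TFLite implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Float reference for a quantized elementwise add: dequantize both inputs,
// add the real values, requantize with the output scale/zero point, clamp.
uint8_t AddQuantizedReference(uint8_t q1, float scale1, int zero1,
                              uint8_t q2, float scale2, int zero2,
                              float out_scale, int out_zero) {
  const float real_sum = scale1 * (q1 - zero1) + scale2 * (q2 - zero2);
  const int q = static_cast<int>(std::round(real_sum / out_scale)) + out_zero;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

int main() {
  // 0.1 + 0.6 = 0.7 in real terms, which requantizes to 135 with
  // output scale 0.1 and zero point 128.
  printf("%d\n", AddQuantizedReference(130, 0.05f, 128, 140, 0.05f, 128,
                                       0.1f, 128));
  return 0;
}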
inline void AddScalarBroadcast(int size, const ArithmeticParams& params, - uint8 input1_data, const uint8* input2_data, - uint8* output_data) { + uint8_t input1_data, const uint8_t* input2_data, + uint8_t* output_data) { TFLITE_DCHECK_GT(params.input1_offset, -256); TFLITE_DCHECK_GT(params.input2_offset, -256); TFLITE_DCHECK_LT(params.input1_offset, 256); TFLITE_DCHECK_LT(params.input2_offset, 256); - const int32 input1_val = params.input1_offset + input1_data; - const int32 shifted_input1_val = input1_val * (1 << params.left_shift); - const int32 scaled_input1_val = + const int32_t input1_val = params.input1_offset + input1_data; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input1_val, params.input1_multiplier, params.input1_shift); for (int i = 0; i < size; ++i) { - const int32 input2_val = params.input2_offset + input2_data[i]; - const int32 shifted_input2_val = input2_val * (1 << params.left_shift); - const int32 scaled_input2_val = + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input2_val, params.input2_multiplier, params.input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); - output_data[i] = static_cast(clamped_output); + output_data[i] = static_cast(clamped_output); } } inline void Add(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const uint8* input1_data, - const RuntimeShape& input2_shape, const uint8* input2_data, - const RuntimeShape& output_shape, uint8* output_data) { + const RuntimeShape& input1_shape, const uint8_t* input1_data, + const RuntimeShape& input2_shape, const uint8_t* input2_data, + const RuntimeShape& output_shape, uint8_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); const int flat_size = @@ -133,23 +133,24 @@ inline void Add(const ArithmeticParams& params, } inline void Add(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const int16* input1_data, - const RuntimeShape& input2_shape, const int16* input2_data, - const RuntimeShape& output_shape, int16* output_data) { + const RuntimeShape& input1_shape, const int16_t* input1_data, + const RuntimeShape& input2_shape, const int16_t* input2_data, + const RuntimeShape& output_shape, int16_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); const int input1_shift = params.input1_shift; const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - const int16 output_activation_min = params.quantized_activation_min; - const int16 output_activation_max = params.quantized_activation_max; + const int16_t output_activation_min = params.quantized_activation_min; + const int16_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0); TFLITE_DCHECK_LE(input1_shift, 
0); TFLITE_DCHECK_LE(params.input2_shift, 0); - const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data; - const int16* shift_input = input1_shift == 0 ? input2_data : input1_data; + const int16_t* not_shift_input = + input1_shift == 0 ? input1_data : input2_data; + const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data; const int input_right_shift = input1_shift == 0 ? -params.input2_shift : -input1_shift; @@ -161,8 +162,8 @@ inline void Add(const ArithmeticParams& params, F0 scaled_input = F0::FromRaw( gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift)); F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled); - const int16 raw_output = result.raw(); - const int16 clamped_output = std::min( + const int16_t raw_output = result.raw(); + const int16_t clamped_output = std::min( output_activation_max, std::max(output_activation_min, raw_output)); output_data[i] = clamped_output; } @@ -218,11 +219,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params, inline void BroadcastAdd4DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, - const int32* input1_data, + const int32_t* input1_data, const RuntimeShape& input2_shape, - const int32* input2_data, + const int32_t* input2_data, const RuntimeShape& output_shape, - int32* output_data) { + int32_t* output_data) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, @@ -259,11 +260,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params, inline void BroadcastAdd4DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, - const uint8* input1_data, + const uint8_t* input1_data, const RuntimeShape& input2_shape, - const uint8* input2_data, + const uint8_t* input2_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, @@ -286,34 +287,34 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params, for (int y = 0; y < extended_output_shape.Dims(1); ++y) { for (int x = 0; x < extended_output_shape.Dims(2); ++x) { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - const int32 input1_val = + const int32_t input1_val = params.input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; - const int32 input2_val = + const int32_t input2_val = params.input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; - const int32 shifted_input1_val = + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); - const int32 shifted_input2_val = + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); - const int32 scaled_input1_val = + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input1_val, params.input1_multiplier, params.input1_shift); - const int32 scaled_input2_val = + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input2_val, params.input2_multiplier, params.input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, 
std::max(params.quantized_activation_min, raw_output)); output_data[Offset(extended_output_shape, b, y, x, c)] = - static_cast(clamped_output); + static_cast(clamped_output); } } } @@ -322,11 +323,11 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params, inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, const RuntimeShape& unswitched_input1_shape, - const uint8* unswitched_input1_data, + const uint8_t* unswitched_input1_data, const RuntimeShape& unswitched_input2_shape, - const uint8* unswitched_input2_data, + const uint8_t* unswitched_input2_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { ArithmeticParams switched_params = unswitched_params; switched_params.input1_offset = unswitched_params.input2_offset; switched_params.input1_multiplier = unswitched_params.input2_multiplier; @@ -341,18 +342,18 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, const ArithmeticParams& params = use_unswitched ? unswitched_params : switched_params; - const uint8* input1_data = + const uint8_t* input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const uint8* input2_data = + const uint8_t* input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data; // Fivefold nested loops. The second input resets its position for each // iteration of the second loop. The first input resets its position at the // beginning of the fourth loop. The innermost loop is an elementwise add of // sections of the arrays. - uint8* output_data_ptr = output_data; - const uint8* input1_data_ptr = input1_data; - const uint8* input2_data_reset = input2_data; + uint8_t* output_data_ptr = output_data; + const uint8_t* input1_data_ptr = input1_data; + const uint8_t* input2_data_reset = input2_data; // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared // between input shapes. y3 for input 1 is always broadcast, and so the // dimension there is 1, whereas optionally y1 might be broadcast for input 2. @@ -368,7 +369,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner // dimension. for (int i0 = 0; i0 < y0; ++i0) { - const uint8* input2_data_ptr; + const uint8_t* input2_data_ptr; for (int i1 = 0; i1 < y1; ++i1) { input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { @@ -397,7 +398,7 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, // for y4 == 1 and the loop over y3 is contained within the // AddScalarBroadcast function. 
for (int i0 = 0; i0 < y0; ++i0) { - const uint8* input2_data_ptr; + const uint8_t* input2_data_ptr; for (int i1 = 0; i1 < y1; ++i1) { input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { diff --git a/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/tensorflow/lite/kernels/internal/reference/batch_matmul.h index 05caefaca5d..24c3ffe3d7e 100644 --- a/tensorflow/lite/kernels/internal/reference/batch_matmul.h +++ b/tensorflow/lite/kernels/internal/reference/batch_matmul.h @@ -266,13 +266,13 @@ inline void BatchMatMul(const FullyConnectedParams& params, const int rhs_cols = extended_rhs_shape.Dims(4); const int accum_depth = extended_lhs_shape.Dims(4); - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); for (int b0 = 0; b0 < batch_dim0; ++b0) { @@ -292,8 +292,8 @@ inline void BatchMatMul(const FullyConnectedParams& params, for (int i = 0; i < lhs_rows; ++i) { int32_t total = 0; for (int k = 0; k < accum_depth; ++k) { - int32 lhs_val = lhs_ptr2[accum_depth * i + k]; - int32 rhs_val = rhs_ptr2[accum_depth * j + k]; + int32_t lhs_val = lhs_ptr2[accum_depth * i + k]; + int32_t rhs_val = rhs_ptr2[accum_depth * j + k]; total += (lhs_val + filter_offset) * (rhs_val + input_offset); } total = MultiplyByQuantizedMultiplier(total, output_multiplier, diff --git a/tensorflow/lite/kernels/internal/reference/comparisons.h b/tensorflow/lite/kernels/internal/reference/comparisons.h index d9bc10a9390..49844ab1539 100644 --- a/tensorflow/lite/kernels/internal/reference/comparisons.h +++ b/tensorflow/lite/kernels/internal/reference/comparisons.h @@ -105,30 +105,30 @@ inline void Comparison(const ComparisonParams& op_params, input2_data, output_shape, output_data); } -template F> +template F> inline void ComparisonWithScaling( const ComparisonParams& op_params, const RuntimeShape& input1_shape, const T* input1_data, const RuntimeShape& input2_shape, const T* input2_data, const RuntimeShape& output_shape, bool* output_data) { int left_shift = op_params.left_shift; - int32 input1_offset = op_params.input1_offset; - int32 input1_multiplier = op_params.input1_multiplier; + int32_t input1_offset = op_params.input1_offset; + int32_t input1_multiplier = op_params.input1_multiplier; int input1_shift = op_params.input1_shift; - int32 input2_offset = op_params.input2_offset; - int32 input2_multiplier = op_params.input2_multiplier; + int32_t input2_offset = op_params.input2_offset; + int32_t input2_multiplier = op_params.input2_multiplier; int input2_shift = op_params.input2_shift; const int64_t flatsize = MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { - const int32 input1_val = input1_offset + input1_data[i]; - const int32 input2_val = input2_offset + input2_data[i]; - 
const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = + const int32_t input1_val = input1_offset + input1_data[i]; + const int32_t input2_val = input2_offset + input2_data[i]; + const int32_t shifted_input1_val = input1_val * (1 << left_shift); + const int32_t shifted_input2_val = input2_val * (1 << left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input2_val, input2_multiplier, input2_shift); output_data[i] = F(scaled_input1_val, scaled_input2_val); @@ -218,7 +218,7 @@ inline void BroadcastComparison4DSlow(const ComparisonParams& op_params, output_shape, output_data); } -template F> +template F> inline void BroadcastComparison4DSlowWithScaling( const ComparisonParams& op_params, const RuntimeShape& unextended_input1_shape, const T* input1_data, @@ -230,29 +230,29 @@ inline void BroadcastComparison4DSlowWithScaling( unextended_output_shape); int left_shift = op_params.left_shift; - int32 input1_offset = op_params.input1_offset; - int32 input1_multiplier = op_params.input1_multiplier; + int32_t input1_offset = op_params.input1_offset; + int32_t input1_multiplier = op_params.input1_multiplier; int input1_shift = op_params.input1_shift; - int32 input2_offset = op_params.input2_offset; - int32 input2_multiplier = op_params.input2_multiplier; + int32_t input2_offset = op_params.input2_offset; + int32_t input2_multiplier = op_params.input2_multiplier; int input2_shift = op_params.input2_shift; for (int b = 0; b < dims.output_shape.Dims(0); ++b) { for (int y = 0; y < dims.output_shape.Dims(1); ++y) { for (int x = 0; x < dims.output_shape.Dims(2); ++x) { for (int c = 0; c < dims.output_shape.Dims(3); ++c) { - const int32 input1_val = + const int32_t input1_val = input1_offset + input1_data[SubscriptToIndex(dims.desc1, b, y, x, c)]; - const int32 input2_val = + const int32_t input2_val = input2_offset + input2_data[SubscriptToIndex(dims.desc2, b, y, x, c)]; - const int32 shifted_input1_val = input1_val * (1 << left_shift); - const int32 shifted_input2_val = input2_val * (1 << left_shift); - const int32 scaled_input1_val = + const int32_t shifted_input1_val = input1_val * (1 << left_shift); + const int32_t shifted_input2_val = input2_val * (1 << left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input1_val, input1_multiplier, input1_shift); - const int32 scaled_input2_val = + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input2_val, input2_multiplier, input2_shift); output_data[Offset(dims.output_shape, b, y, x, c)] = diff --git a/tensorflow/lite/kernels/internal/reference/concatenation.h b/tensorflow/lite/kernels/internal/reference/concatenation.h index 958fe3ea249..25959793e9d 100644 --- a/tensorflow/lite/kernels/internal/reference/concatenation.h +++ b/tensorflow/lite/kernels/internal/reference/concatenation.h @@ -74,14 +74,14 @@ inline void Concatenation(const ConcatenationParams& params, // when optimizng this routine further. 
inline void ConcatenationWithScaling(const ConcatenationParams& params, const RuntimeShape* const* input_shapes, - const uint8* const* input_data, + const uint8_t* const* input_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { int axis = params.axis; - const int32* input_zeropoint = params.input_zeropoint; + const int32_t* input_zeropoint = params.input_zeropoint; const float* input_scale = params.input_scale; int inputs_count = params.inputs_count; - const int32 output_zeropoint = params.output_zeropoint; + const int32_t output_zeropoint = params.output_zeropoint; const float output_scale = params.output_scale; const int concat_dimensions = output_shape.DimensionsCount(); @@ -110,11 +110,11 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params, } const float inverse_output_scale = 1.f / output_scale; - uint8* output_ptr = output_data; + uint8_t* output_ptr = output_data; for (int k = 0; k < outer_size; k++) { for (int i = 0; i < inputs_count; ++i) { const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size; - const uint8* input_ptr = input_data[i] + k * copy_size; + const uint8_t* input_ptr = input_data[i] + k * copy_size; if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale) { memcpy(output_ptr, input_ptr, copy_size); diff --git a/tensorflow/lite/kernels/internal/reference/conv.h b/tensorflow/lite/kernels/internal/reference/conv.h index 55dd869a4b1..d4bf46a86b8 100644 --- a/tensorflow/lite/kernels/internal/reference/conv.h +++ b/tensorflow/lite/kernels/internal/reference/conv.h @@ -99,11 +99,11 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, } inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& filter_shape, - const uint8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - uint8* output_data, const RuntimeShape& im2col_shape, - uint8* im2col_data, void* cpu_backend_context) { + const uint8_t* input_data, const RuntimeShape& filter_shape, + const uint8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + uint8_t* output_data, const RuntimeShape& im2col_shape, + uint8_t* im2col_data, void* cpu_backend_context) { (void)cpu_backend_context; // only used in optimized code. (void)im2col_data; // only used in optimized code. (void)im2col_shape; // only used in optimized code. 
@@ -113,13 +113,13 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, const int dilation_height_factor = params.dilation_height_factor; const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); @@ -143,7 +143,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, for (int out_channel = 0; out_channel < output_depth; ++out_channel) { const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; - int32 acc = 0; + int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { for (int in_channel = 0; in_channel < input_depth; ++in_channel) { @@ -154,9 +154,9 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32 input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; - int32 filter_val = + int32_t input_val = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; acc += @@ -174,7 +174,7 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape, acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - static_cast(acc); + static_cast(acc); } } } @@ -220,7 +220,7 @@ inline void HybridConvPerChannel( for (int out_channel = 0; out_channel < output_depth; ++out_channel) { const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; - int32 acc = 0; + int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { for (int in_channel = 0; in_channel < input_depth; ++in_channel) { @@ -231,9 +231,9 @@ inline void HybridConvPerChannel( // use zero as a default value. 
if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32 input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; - int32 filter_val = + int32_t input_val = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; acc += filter_val * (input_val - input_offset[batch]); diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h index 70e5dd4012f..20bf83df3d8 100644 --- a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h +++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h @@ -62,21 +62,21 @@ namespace reference_ops { namespace depthwise_conv { template -inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier, - int shift) { +inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier, + int shift) { TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone); return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift); } template <> -inline int32 DepthwiseConvRound( - int32 x, int32 quantized_multiplier, int shift) { +inline int32_t DepthwiseConvRound( + int32_t x, int32_t quantized_multiplier, int shift) { return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift); } template <> -inline int32 DepthwiseConvRound( - int32 x, int32 quantized_multiplier, int shift) { +inline int32_t DepthwiseConvRound( + int32_t x, int32_t quantized_multiplier, int shift) { using gemmlowp::SaturatingRoundingDoublingHighMul; const int left_shift = shift > 0 ? shift : 0; const int right_shift = shift > 0 ? 0 : -shift; @@ -89,13 +89,12 @@ inline int32 DepthwiseConvRound( template struct DepthwiseConvBasicKernel { - static inline void Run(const DepthwiseParams& params, - const RuntimeShape& input_shape, - const uint8* input_data, - const RuntimeShape& filter_shape, - const uint8* filter_data, - const RuntimeShape& bias_shape, const int32* bias_data, - const RuntimeShape& output_shape, uint8* output_data) { + static inline void Run( + const DepthwiseParams& params, const RuntimeShape& input_shape, + const uint8_t* input_data, const RuntimeShape& filter_shape, + const uint8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + uint8_t* output_data) { const int stride_width = params.stride_width; const int stride_height = params.stride_height; const int dilation_width_factor = params.dilation_width_factor; @@ -103,12 +102,12 @@ struct DepthwiseConvBasicKernel { const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; const int depth_multiplier = params.depth_multiplier; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = 
params.output_multiplier; const int output_shift = params.output_shift; TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4); @@ -135,7 +134,7 @@ struct DepthwiseConvBasicKernel { const int oc = m + ic * depth_multiplier; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; - int32 acc = 0; + int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { const int in_x = @@ -146,9 +145,9 @@ struct DepthwiseConvBasicKernel { // use zero as a default value. if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) { - int32 input_val = + int32_t input_val = input_data[Offset(input_shape, b, in_y, in_x, ic)]; - int32 filter_val = filter_data[Offset( + int32_t filter_val = filter_data[Offset( filter_shape, 0, filter_y, filter_x, oc)]; acc += (filter_val + filter_offset) * (input_val + input_offset); @@ -164,7 +163,7 @@ struct DepthwiseConvBasicKernel { acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_data[Offset(output_shape, b, out_y, out_x, oc)] = - static_cast(acc); + static_cast(acc); } } } @@ -176,10 +175,10 @@ struct DepthwiseConvBasicKernel { // MultiplyByQuantizedMultiplier or DepthwiseConvRound function. static inline void RunPerChannel( const DepthwiseParams& params, const RuntimeShape& input_shape, - const int8* input_data, const RuntimeShape& filter_shape, - const int8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - int8* output_data) { + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { // Get parameters. // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro. const int stride_width = params.stride_width; @@ -189,12 +188,12 @@ struct DepthwiseConvBasicKernel { const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; const int depth_multiplier = params.depth_multiplier; - const int32 input_offset = params.input_offset; - const int32 output_offset = params.output_offset; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; - const int32* output_multiplier = params.output_multiplier_per_channel; - const int32* output_shift = params.output_shift_per_channel; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int32_t* output_multiplier = params.output_multiplier_per_channel; + const int32_t* output_shift = params.output_shift_per_channel; // Check dimensions of the tensors. 
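MultiplyByQuantizedMultiplier (and the DepthwiseConvRound wrappers above) rescales a 32-bit accumulator by a real-valued factor that was split offline into a Q31 mantissa and a power-of-two exponent. The floating-point model below sketches that contract under the common TFLite convention (quantized_multiplier = round(m * 2^31) with m in [0.5, 1)); it is an approximation for intuition only and deliberately ignores the saturating-doubling-high-multiply and rounding-shift details handled by the real code:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Approximate model: result ~= x * (quantized_multiplier / 2^31) * 2^shift.
// A sketch of the intended semantics, not the real implementation.
int32_t MultiplyByQuantizedMultiplierModel(int32_t x,
                                           int32_t quantized_multiplier,
                                           int shift) {
  const double m = quantized_multiplier / 2147483648.0;  // 2^31
  return static_cast<int32_t>(std::lround(x * m * std::pow(2.0, shift)));
}

int main() {
  // Encode an example scale of 0.0015 as m * 2^shift with m in [0.5, 1):
  // 0.0015 = 0.768 * 2^-9, so quantized_multiplier = round(0.768 * 2^31).
  const int32_t quantized_multiplier =
      static_cast<int32_t>(std::lround(0.768 * 2147483648.0));
  // 20000 * 0.0015 = 30.
  printf("%d\n",
         MultiplyByQuantizedMultiplierModel(20000, quantized_multiplier, -9));
  return 0;
}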
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); @@ -222,7 +221,7 @@ struct DepthwiseConvBasicKernel { const int output_channel = m + in_channel * depth_multiplier; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; - int32 acc = 0; + int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { const int in_x = @@ -234,17 +233,18 @@ struct DepthwiseConvBasicKernel { (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); if (is_point_inside_image) { - int32 input_val = input_data[Offset( + int32_t input_val = input_data[Offset( input_shape, batch, in_y, in_x, in_channel)]; - int32 filter_val = filter_data[Offset( + int32_t filter_val = filter_data[Offset( filter_shape, 0, filter_y, filter_x, output_channel)]; // Accumulate with 32 bits accumulator. // In the nudging process during model quantization, we // force real value of 0.0 be represented by a quantized - // value. This guarantees that the input_offset is a int8, - // even though it is represented using int32. int32 += int8 - // * (int8 - int8) so the highest value we can get from each - // accumulation is [-127, 127] * ([-128, 127] - + // value. This guarantees that the input_offset is a int8_t, + // even though it is represented using int32_t. int32_t += + // int8_t + // * (int8_t - int8_t) so the highest value we can get from + // each accumulation is [-127, 127] * ([-128, 127] - // [-128, 127]), which is [-32512, 32512]. log2(32512) // = 14.98, which means we can accumulate at least 2^16 // multiplications without overflow. The accumulator is @@ -279,10 +279,10 @@ struct DepthwiseConvBasicKernel { inline void DepthwiseConv( const DepthwiseParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& filter_shape, - const uint8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - uint8* output_data) { + const uint8_t* input_data, const RuntimeShape& filter_shape, + const uint8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + uint8_t* output_data) { return depthwise_conv::DepthwiseConvBasicKernel< DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape, input_data, filter_shape, diff --git a/tensorflow/lite/kernels/internal/reference/dequantize.h b/tensorflow/lite/kernels/internal/reference/dequantize.h index 286c9310799..b90951f96e8 100644 --- a/tensorflow/lite/kernels/internal/reference/dequantize.h +++ b/tensorflow/lite/kernels/internal/reference/dequantize.h @@ -32,12 +32,12 @@ inline void Dequantize(const tflite::DequantizationParams& op_params, const RuntimeShape& input_shape, const InputT* input_data, const RuntimeShape& output_shape, OutputT* output_data) { - int32 zero_point = op_params.zero_point; + int32_t zero_point = op_params.zero_point; const double scale = op_params.scale; const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { - const int32 val = input_data[i]; + const int32_t val = input_data[i]; const OutputT result = static_cast(scale * (val - zero_point)); output_data[i] = result; } @@ -52,11 +52,11 @@ inline void PerChannelDequantize( // Ensure flat size is same. 
MatchingFlatSize(input_shape, output_shape); - const int32* zero_point = op_params.zero_point; + const int32_t* zero_point = op_params.zero_point; const float* scale = op_params.scale; - const int32 quantized_dimension = op_params.quantized_dimension; - const int32 num_dims = input_shape.DimensionsCount(); - const int32* dims_data = input_shape.DimsData(); + const int32_t quantized_dimension = op_params.quantized_dimension; + const int32_t num_dims = input_shape.DimensionsCount(); + const int32_t* dims_data = input_shape.DimsData(); std::vector current_dim(num_dims, 0); do { @@ -64,7 +64,7 @@ inline void PerChannelDequantize( ReducedOutputOffset(num_dims, reinterpret_cast(dims_data), current_dim.data(), 0, nullptr); const int channel = current_dim[quantized_dimension]; - const int32 val = input_data[offset]; + const int32_t val = input_data[offset]; const float result = static_cast(scale[channel] * (val - zero_point[channel])); output_data[offset] = result; diff --git a/tensorflow/lite/kernels/internal/reference/fully_connected.h b/tensorflow/lite/kernels/internal/reference/fully_connected.h index 204a0fa0afa..39a9cd023d8 100644 --- a/tensorflow/lite/kernels/internal/reference/fully_connected.h +++ b/tensorflow/lite/kernels/internal/reference/fully_connected.h @@ -61,17 +61,17 @@ inline void FullyConnected( inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& filter_shape, - const uint8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - uint8* output_data) { - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; + const uint8_t* input_data, const RuntimeShape& filter_shape, + const uint8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + uint8_t* output_data) { + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); @@ -89,10 +89,10 @@ inline void FullyConnected( const int accum_depth = filter_shape.Dims(filter_dim_count - 1); for (int b = 0; b < batches; ++b) { for (int out_c = 0; out_c < output_depth; ++out_c) { - int32 acc = 0; + int32_t acc = 0; for (int d = 0; d < accum_depth; ++d) { - int32 input_val = input_data[b * accum_depth + d]; - int32 filter_val = filter_data[out_c * accum_depth + d]; + int32_t input_val = input_data[b * accum_depth + d]; + int32_t filter_val = filter_data[out_c * accum_depth + d]; acc += (filter_val + filter_offset) * (input_val + input_offset); } if (bias_data) { @@ -102,24 +102,24 @@ inline void FullyConnected( acc += output_offset; acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); - output_data[out_c + output_depth * b] = static_cast(acc); + output_data[out_c + 
output_depth * b] = static_cast(acc); } } } inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& filter_shape, - const uint8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - int16* output_data) { - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; + const uint8_t* input_data, const RuntimeShape& filter_shape, + const uint8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int16_t* output_data) { + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); TFLITE_DCHECK_EQ(output_offset, 0); @@ -138,20 +138,21 @@ inline void FullyConnected( for (int out_c = 0; out_c < output_depth; ++out_c) { // Internal accumulation. // Initialize accumulator with the bias-value. - int32 accum = bias_data[out_c]; + int32_t accum = bias_data[out_c]; // Accumulation loop. for (int d = 0; d < accum_depth; ++d) { - int16 input_val = input_data[b * accum_depth + d] + input_offset; - int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset; + int16_t input_val = input_data[b * accum_depth + d] + input_offset; + int16_t filter_val = + filter_data[out_c * accum_depth + d] + filter_offset; accum += filter_val * input_val; } - // Down-scale the final int32 accumulator to the scale used by our + // Down-scale the final int32_t accumulator to the scale used by our // (16-bit, typically 3 integer bits) fixed-point format. The quantized // multiplier and shift here have been pre-computed offline // (e.g. by toco). accum = MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift); - // Saturate, cast to int16, and store to output array. + // Saturate, cast to int16_t, and store to output array. 
accum = std::max(accum, output_activation_min - output_offset); accum = std::min(accum, output_activation_max - output_offset); accum += output_offset; @@ -162,14 +163,14 @@ inline void FullyConnected( inline void ShuffledFullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& weights_shape, - const uint8* shuffled_weights_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - int16* output_data, uint8* shuffled_input_workspace_data) { - const int32 output_multiplier = params.output_multiplier; + const uint8_t* input_data, const RuntimeShape& weights_shape, + const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int16_t* output_data, uint8_t* shuffled_input_workspace_data) { + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1); @@ -190,7 +191,7 @@ inline void ShuffledFullyConnected( TFLITE_DCHECK((output_depth % 4) == 0); // Shuffling and xoring of input activations into the workspace buffer - uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data; + uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data; if (batches == 1) { for (int i = 0; i < accum_depth; i++) { shuffled_input_workspace_data[i] = input_data[i] ^ 0x80; @@ -198,13 +199,13 @@ inline void ShuffledFullyConnected( } else if (batches == 4) { for (int c = 0; c < accum_depth; c += 16) { for (int b = 0; b < 4; b++) { - const uint8* src_data_ptr = input_data + b * accum_depth + c; + const uint8_t* src_data_ptr = input_data + b * accum_depth + c; for (int j = 0; j < 16; j++) { - uint8 src_val = *src_data_ptr++; + uint8_t src_val = *src_data_ptr++; // Flip the sign bit, so that the kernel will only need to - // reinterpret these uint8 values as int8, getting for free the + // reinterpret these uint8_t values as int8_t, getting for free the // subtraction of the zero_point value 128. - uint8 dst_val = src_val ^ 0x80; + uint8_t dst_val = src_val ^ 0x80; *shuffled_input_workspace_ptr++ = dst_val; } } @@ -216,62 +217,62 @@ inline void ShuffledFullyConnected( // Actual computation if (batches == 1) { - int16* output_ptr = output_data; + int16_t* output_ptr = output_data; // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to + // so that just reinterpreting them as int8_t values is equivalent to // subtracting 128 from them, thus implementing for free the subtraction of // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast(shuffled_weights_data); + const int8_t* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); // Likewise, we preshuffled and pre-xored the input data above. - const int8* shuffled_input_data = - reinterpret_cast(shuffled_input_workspace_data); + const int8_t* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); for (int c = 0; c < output_depth; c += 4) { // Internal accumulation. 
// Initialize accumulator with the bias-value. - int32 accum[4] = {0}; + int32_t accum[4] = {0}; // Accumulation loop. for (int d = 0; d < accum_depth; d += 16) { for (int i = 0; i < 4; i++) { for (int j = 0; j < 16; j++) { - int8 input_val = shuffled_input_data[d + j]; - int8 weights_val = *shuffled_weights_ptr++; + int8_t input_val = shuffled_input_data[d + j]; + int8_t weights_val = *shuffled_weights_ptr++; accum[i] += weights_val * input_val; } } } for (int i = 0; i < 4; i++) { // Add bias value - int32 acc = accum[i] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our + int32_t acc = accum[i] + bias_data[c + i]; + // Down-scale the final int32_t accumulator to the scale used by our // (16-bit, typically 3 integer bits) fixed-point format. The quantized // multiplier and shift here have been pre-computed offline // (e.g. by toco). acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); - // Saturate, cast to int16, and store to output array. + // Saturate, cast to int16_t, and store to output array. acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_ptr[c + i] = acc; } } } else if (batches == 4) { - int16* output_ptr = output_data; + int16_t* output_ptr = output_data; // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd) - // so that just reinterpreting them as int8 values is equivalent to + // so that just reinterpreting them as int8_t values is equivalent to // subtracting 128 from them, thus implementing for free the subtraction of // the zero_point value 128. - const int8* shuffled_weights_ptr = - reinterpret_cast(shuffled_weights_data); + const int8_t* shuffled_weights_ptr = + reinterpret_cast(shuffled_weights_data); // Likewise, we preshuffled and pre-xored the input data above. - const int8* shuffled_input_data = - reinterpret_cast(shuffled_input_workspace_data); + const int8_t* shuffled_input_data = + reinterpret_cast(shuffled_input_workspace_data); for (int c = 0; c < output_depth; c += 4) { - const int8* shuffled_input_ptr = shuffled_input_data; + const int8_t* shuffled_input_ptr = shuffled_input_data; // Accumulation loop. // Internal accumulation. // Initialize accumulator with the bias-value. - int32 accum[4][4]; + int32_t accum[4][4]; for (int i = 0; i < 4; i++) { for (int b = 0; b < 4; b++) { accum[i][b] = 0; @@ -281,8 +282,8 @@ inline void ShuffledFullyConnected( for (int i = 0; i < 4; i++) { for (int b = 0; b < 4; b++) { for (int j = 0; j < 16; j++) { - int8 input_val = shuffled_input_ptr[16 * b + j]; - int8 weights_val = shuffled_weights_ptr[16 * i + j]; + int8_t input_val = shuffled_input_ptr[16 * b + j]; + int8_t weights_val = shuffled_weights_ptr[16 * i + j]; accum[i][b] += weights_val * input_val; } } @@ -293,14 +294,14 @@ inline void ShuffledFullyConnected( for (int i = 0; i < 4; i++) { for (int b = 0; b < 4; b++) { // Add bias value - int32 acc = accum[i][b] + bias_data[c + i]; - // Down-scale the final int32 accumulator to the scale used by our + int32_t acc = accum[i][b] + bias_data[c + i]; + // Down-scale the final int32_t accumulator to the scale used by our // (16-bit, typically 3 integer bits) fixed-point format. The // quantized multiplier and shift here have been pre-computed offline // (e.g. by toco). acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); - // Saturate, cast to int16, and store to output array. + // Saturate, cast to int16_t, and store to output array. 
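The 0x80 XOR applied to the shuffled weights and to the pre-shuffled input workspace is a zero-point subtraction in disguise: flipping the sign bit of a uint8_t and reinterpreting the result as int8_t yields exactly the original value minus 128, which is why the kernel gets the zero-point-128 subtraction "for free". A small standalone check of that identity (an illustration, not TFLite code):

#include <cstdint>
#include <cstdio>

int main() {
  // Verify (int8_t)(q ^ 0x80) == q - 128 for every uint8_t value q.
  for (int v = 0; v < 256; ++v) {
    const uint8_t q = static_cast<uint8_t>(v);
    const int8_t flipped = static_cast<int8_t>(q ^ 0x80);
    if (flipped != v - 128) {
      printf("mismatch at %d\n", v);
      return 1;
    }
  }
  printf("sign-bit flip equals subtract-128 for all 256 values\n");
  return 0;
}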
acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_ptr[b * output_depth + c + i] = acc; diff --git a/tensorflow/lite/kernels/internal/reference/hard_swish.h b/tensorflow/lite/kernels/internal/reference/hard_swish.h index dd07b09c3b3..cda1b5cf0ad 100644 --- a/tensorflow/lite/kernels/internal/reference/hard_swish.h +++ b/tensorflow/lite/kernels/internal/reference/hard_swish.h @@ -86,7 +86,7 @@ inline void HardSwish(const HardSwishParams& params, // (reluish_multiplier_fixedpoint) and bit-shift such that we represent // that input value on the scale where the real value 3.0f is represented // by the quantized value 32768. (+32768 is actually not representable as - // int16, so this saturates at +32767, and that is seen empirically to be + // int16_t, so this saturates at +32767, and that is seen empirically to be // a negligible contribution to numerical error/bias). // // This code is careful to correctly implement any magnitude of multiplier, diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h index 69b42e08a6d..88ca246eaf4 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h @@ -35,22 +35,22 @@ inline void AddElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_LE(params.input2_offset, int8_max_value); for (int i = 0; i < size; ++i) { - const int32 input1_val = params.input1_offset + input1_data[i]; - const int32 input2_val = params.input2_offset + input2_data[i]; - const int32 shifted_input1_val = input1_val * (1 << params.left_shift); - const int32 shifted_input2_val = input2_val * (1 << params.left_shift); - const int32 scaled_input1_val = + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input1_val, params.input1_multiplier, params.input1_shift); - const int32 scaled_input2_val = + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( shifted_input2_val, params.input2_multiplier, params.input2_shift); - const int32 raw_sum = scaled_input1_val + scaled_input2_val; - const int32 raw_output = + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); output_data[i] = static_cast(clamped_output); diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h index df6b787338d..f4bcb2bd06e 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h @@ -22,25 +22,25 @@ namespace reference_integer_ops { // Fixed-point per-channel-quantization convolution reference kernel. 
inline void ConvPerChannel( - const ConvParams& params, const int32* output_multiplier, - const int32* output_shift, const RuntimeShape& input_shape, - const int8* input_data, const RuntimeShape& filter_shape, - const int8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - int8* output_data) { + const ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { // Get parameters. - const int32 input_offset = params.input_offset; // r = s(q - Z) + const int32_t input_offset = params.input_offset; // r = s(q - Z) const int stride_width = params.stride_width; const int stride_height = params.stride_height; const int dilation_width_factor = params.dilation_width_factor; const int dilation_height_factor = params.dilation_height_factor; const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; - const int32 output_offset = params.output_offset; + const int32_t output_offset = params.output_offset; // Set min and max value of the output. - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; // Consistency check. TFLITE_DCHECK_LE(output_activation_min, output_activation_max); @@ -67,7 +67,7 @@ inline void ConvPerChannel( for (int out_channel = 0; out_channel < output_depth; ++out_channel) { const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; - int32 acc = 0; + int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { for (int in_channel = 0; in_channel < input_depth; ++in_channel) { @@ -79,18 +79,18 @@ inline void ConvPerChannel( (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); if (is_point_inside_image) { - int32 input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; - int32 filter_val = + int32_t input_val = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; // Accumulate with 32 bits accumulator. // In the nudging process during model quantization, we force // real value of 0.0 be represented by a quantized value. This - // guarantees that the input_offset is a int8, even though it - // is represented using int32. - // int32 += int8 * (int8 - int8) so the highest value we can - // get from each accumulation is [-127, 127] * ([-128, 127] - + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - // [-128, 127]), which is [-32512, 32512]. log2(32512) // = 14.98, which means we can accumulate at least 2^16 // multiplications without overflow. The accumulator is @@ -125,12 +125,12 @@ inline void ConvPerChannel( // Fixed-point per-channel-quantization convolution reference kernel. 
// 16-bit data and 8-bit filter inline void ConvPerChannel( - const ConvParams& params, const int32* output_multiplier, - const int32* output_shift, const RuntimeShape& input_shape, - const int16* input_data, const RuntimeShape& filter_shape, - const int8* filter_data, const RuntimeShape& bias_shape, + const ConvParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int16_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, const std::int64_t* bias_data, const RuntimeShape& output_shape, - int16* output_data) { + int16_t* output_data) { // Get parameters. const int stride_width = params.stride_width; const int stride_height = params.stride_height; @@ -140,8 +140,8 @@ inline void ConvPerChannel( const int pad_height = params.padding_values.height; // Set min and max value of the output. - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; // Consistency check. TFLITE_DCHECK_LE(output_activation_min, output_activation_max); @@ -180,13 +180,13 @@ inline void ConvPerChannel( (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); if (is_point_inside_image) { - int32 input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; - int32 filter_val = + int32_t input_val = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; // Accumulate with 64 bits accumulator. - // int64 += int8 * int16 so the highest value we can + // int64_t += int8_t * int16_t so the highest value we can // get from each accumulation is [-127, 127] * ([-32768, // 32767] - // [-32768, 32767]), which is [-8322945, 8322945]. diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h index a4e00981367..6f54e47f344 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h @@ -20,12 +20,12 @@ limitations under the License. namespace tflite { namespace reference_integer_ops { inline void DepthwiseConvPerChannel( - const DepthwiseParams& params, const int32* output_multiplier, - const int32* output_shift, const RuntimeShape& input_shape, - const int8* input_data, const RuntimeShape& filter_shape, - const int8* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, - int8* output_data) { + const DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, + int8_t* output_data) { // Get parameters. // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro. 
const int stride_width = params.stride_width; @@ -35,10 +35,10 @@ inline void DepthwiseConvPerChannel( const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; const int depth_multiplier = params.depth_multiplier; - const int32 input_offset = params.input_offset; - const int32 output_offset = params.output_offset; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; // Check dimensions of the tensors. TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); @@ -66,7 +66,7 @@ inline void DepthwiseConvPerChannel( const int output_channel = m + in_channel * depth_multiplier; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; - int32 acc = 0; + int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { const int in_x = in_x_origin + dilation_width_factor * filter_x; @@ -77,17 +77,17 @@ inline void DepthwiseConvPerChannel( (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); if (is_point_inside_image) { - int32 input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; - int32 filter_val = filter_data[Offset( + int32_t input_val = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = filter_data[Offset( filter_shape, 0, filter_y, filter_x, output_channel)]; // Accumulate with 32 bits accumulator. // In the nudging process during model quantization, we force // real value of 0.0 be represented by a quantized value. This - // guarantees that the input_offset is a int8, even though it - // is represented using int32. - // int32 += int8 * (int8 - int8) so the highest value we can - // get from each accumulation is [-127, 127] * ([-128, 127] - + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - // [-128, 127]), which is [-32512, 32512]. log2(32512) // = 14.98, which means we can accumulate at least 2^16 // multiplications without overflow. The accumulator is @@ -120,12 +120,12 @@ inline void DepthwiseConvPerChannel( } inline void DepthwiseConvPerChannel( - const DepthwiseParams& params, const int32* output_multiplier, - const int32* output_shift, const RuntimeShape& input_shape, - const int16* input_data, const RuntimeShape& filter_shape, - const int8* filter_data, const RuntimeShape& bias_shape, + const DepthwiseParams& params, const int32_t* output_multiplier, + const int32_t* output_shift, const RuntimeShape& input_shape, + const int16_t* input_data, const RuntimeShape& filter_shape, + const int8_t* filter_data, const RuntimeShape& bias_shape, const std::int64_t* bias_data, const RuntimeShape& output_shape, - int16* output_data) { + int16_t* output_data) { // Get parameters. 
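A small worked example of the channel indexing used in DepthwiseConvPerChannel above: with output_channel = m + in_channel * depth_multiplier, an input_depth of 2 and a depth_multiplier of 3 give output channels 0..2 driven by input channel 0 and channels 3..5 by input channel 1, for an output_depth of 6; the filter's last dimension and the per-channel output_multiplier/output_shift arrays are indexed by that same output_channel.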
const int stride_width = params.stride_width; const int stride_height = params.stride_height; @@ -134,8 +134,8 @@ inline void DepthwiseConvPerChannel( const int pad_width = params.padding_values.width; const int pad_height = params.padding_values.height; const int depth_multiplier = params.depth_multiplier; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; // Check dimensions of the tensors. TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); @@ -174,9 +174,9 @@ inline void DepthwiseConvPerChannel( (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); if (is_point_inside_image) { - int32 input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; - int32 filter_val = filter_data[Offset( + int32_t input_val = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = filter_data[Offset( filter_shape, 0, filter_y, filter_x, output_channel)]; // Accumulate with 64 bits accumulator. // We assume maximum of 2^16 accumulations as with the 8-bit @@ -190,7 +190,7 @@ inline void DepthwiseConvPerChannel( if (bias_data) { acc += bias_data[output_channel]; } - int32 scaled_acc = MultiplyByQuantizedMultiplier( + int32_t scaled_acc = MultiplyByQuantizedMultiplier( acc, output_multiplier[output_channel], output_shift[output_channel]); scaled_acc = std::max(scaled_acc, output_activation_min); @@ -207,8 +207,8 @@ inline void DepthwiseConvPerChannel( inline void DepthwiseConvHybridPerChannel( const DepthwiseParams& params, float* scaling_factors_ptr, - const RuntimeShape& input_shape, const int8* input_data, - const RuntimeShape& filter_shape, const int8* filter_data, + const RuntimeShape& input_shape, const int8_t* input_data, + const RuntimeShape& filter_shape, const int8_t* filter_data, const RuntimeShape& bias_shape, const float* bias_data, const RuntimeShape& output_shape, float* output_data, const float* per_channel_scale, int32_t* input_offset) { @@ -247,7 +247,7 @@ inline void DepthwiseConvHybridPerChannel( const int output_channel = m + in_channel * depth_multiplier; const int in_x_origin = (out_x * stride_width) - pad_width; const int in_y_origin = (out_y * stride_height) - pad_height; - int32 acc = 0; + int32_t acc = 0; for (int filter_y = 0; filter_y < filter_height; ++filter_y) { for (int filter_x = 0; filter_x < filter_width; ++filter_x) { const int in_x = in_x_origin + dilation_width_factor * filter_x; @@ -258,9 +258,9 @@ inline void DepthwiseConvHybridPerChannel( (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); if (is_point_inside_image) { - int32 input_val = input_data[Offset(input_shape, batch, in_y, - in_x, in_channel)]; - int32 filter_val = filter_data[Offset( + int32_t input_val = input_data[Offset( + input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = filter_data[Offset( filter_shape, 0, filter_y, filter_x, output_channel)]; acc += filter_val * (input_val - input_offset[batch]); } diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h index fd9cb0180e1..2bc3e794855 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h @@ 
-24,15 +24,15 @@ inline void FullyConnected( const FullyConnectedParams& params, const RuntimeShape& input_shape, const int8_t* input_data, const RuntimeShape& filter_shape, const int8_t* filter_data, const RuntimeShape& bias_shape, - const int32* bias_data, const RuntimeShape& output_shape, + const int32_t* bias_data, const RuntimeShape& output_shape, int8_t* output_data) { - const int32 input_offset = params.input_offset; - const int32 filter_offset = params.weights_offset; - const int32 output_offset = params.output_offset; - const int32 output_multiplier = params.output_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2); @@ -44,10 +44,10 @@ inline void FullyConnected( const int accum_depth = filter_shape.Dims(filter_dim_count - 1); for (int b = 0; b < batches; ++b) { for (int out_c = 0; out_c < output_depth; ++out_c) { - int32 acc = 0; + int32_t acc = 0; for (int d = 0; d < accum_depth; ++d) { - int32 input_val = input_data[b * accum_depth + d]; - int32 filter_val = filter_data[out_c * accum_depth + d]; + int32_t input_val = input_data[b * accum_depth + d]; + int32_t filter_val = filter_data[out_c * accum_depth + d]; acc += (filter_val + filter_offset) * (input_val + input_offset); } if (bias_data) { @@ -68,11 +68,11 @@ inline void FullyConnected( const int8_t* filter_data, const RuntimeShape& bias_shape, const int64_t* bias_data, const RuntimeShape& output_shape, int16_t* output_data) { - const int32 filter_offset = params.weights_offset; - const int32 output_multiplier = params.output_multiplier; + const int32_t filter_offset = params.weights_offset; + const int32_t output_multiplier = params.output_multiplier; const int output_shift = params.output_shift; - const int32 output_activation_min = params.quantized_activation_min; - const int32 output_activation_max = params.quantized_activation_max; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2); @@ -86,8 +86,8 @@ inline void FullyConnected( for (int out_c = 0; out_c < output_depth; ++out_c) { int64_t acc = 0; for (int d = 0; d < accum_depth; ++d) { - int32 input_val = input_data[b * accum_depth + d]; - int32 filter_val = filter_data[out_c * accum_depth + d]; + int32_t input_val = input_data[b * accum_depth + d]; + int32_t filter_val = filter_data[out_c * accum_depth + d]; acc += (filter_val + filter_offset) * input_val; } if (bias_data) { diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h b/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h index 7488a2147c4..31f2de986c8 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h @@ -21,8 +21,8 @@ namespace tflite { 
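Most quantized kernels in this patch, the FullyConnected variants above included, push their raw accumulator through MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift) before clamping. The exact helper (with saturating rounding-doubling arithmetic) is defined elsewhere in kernels/internal and is not shown in this patch; the sketch below is only a simplified, non-saturating stand-in that illustrates what the (multiplier, shift) pair encodes: a real-valued rescale factor split into a Q0.31 mantissa and a power-of-two exponent.

  #include <cmath>
  #include <cstdint>
  #include <cstdio>

  // Splits a real rescale factor (< 1 for typical output rescales) into a
  // Q0.31 multiplier in [0.5, 1) and a power-of-two shift.
  void QuantizeScale(double real_scale, int32_t* multiplier, int* shift) {
    const double mantissa = std::frexp(real_scale, shift);  // real_scale = mantissa * 2^shift
    *multiplier = static_cast<int32_t>(std::lround(mantissa * (int64_t{1} << 31)));
  }

  // Non-saturating, round-half-up approximation of the rescale: returns
  // roughly x * multiplier * 2^(shift - 31). Assumes shift < 31.
  int32_t RescaleApprox(int32_t x, int32_t multiplier, int shift) {
    const int64_t product = static_cast<int64_t>(x) * multiplier;
    const int right_shift = 31 - shift;
    return static_cast<int32_t>(
        (product + (int64_t{1} << (right_shift - 1))) >> right_shift);
  }

  int main() {
    int32_t multiplier;
    int shift;
    QuantizeScale(0.0008, &multiplier, &shift);  // a plausible conv/FC rescale
    std::printf("%d\n", RescaleApprox(20000, multiplier, shift));  // prints 16
    return 0;
  }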
namespace reference_integer_ops { inline void L2Normalization(int32_t input_zero_point, int32_t outer_size, - int32_t depth, const int8* input_data, - int8* output_data) { + int32_t depth, const int8_t* input_data, + int8_t* output_data) { static constexpr int8_t kMinInt8 = std::numeric_limits::min(); static constexpr int8_t kMaxInt8 = std::numeric_limits::max(); // The output scale must be in sync with Prepare(). @@ -30,7 +30,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size, // to [-1, 127/128]. static constexpr int32_t kOutputScale = 7; for (int outer_index = 0; outer_index < outer_size; ++outer_index) { - // int32 = (int8 - int8) ^ 2. + // int32_t = (int8_t - int8_t) ^ 2. // ([-128, 127] - [-128, 127]) ^ 2 = [0, (2^8 - 1)^2] so the accumulator is // safe from overflowing in at least 2^16 steps. int32_t acc = 0; @@ -55,7 +55,7 @@ inline void L2Normalization(int32_t input_zero_point, int32_t outer_size, std::min(static_cast(kMaxInt8), std::max(static_cast(kMinInt8), output_in_q24)); output_data[depth * outer_index + inner_index] = - static_cast(output_in_q24); + static_cast(output_in_q24); } } } diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h index a815c3f5252..b80838aa089 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h @@ -27,14 +27,14 @@ inline void MulElementwise(int size, const ArithmeticParams& params, const T* input1_data, const T* input2_data, T* output_data) { for (int i = 0; i < size; ++i) { - const int32 input1_val = params.input1_offset + input1_data[i]; - const int32 input2_val = params.input2_offset + input2_data[i]; - const int32 unclamped_result = + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, params.output_multiplier, params.output_shift); - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); output_data[i] = static_cast(clamped_output); @@ -57,13 +57,13 @@ inline void Mul(const ArithmeticParams& params, // Mul with 16 bit inputs and int8_t outputs. 
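For reference, the single-multiplier form used by MulElementwise above falls straight out of the affine quantization convention real = scale * (q - zero_point): requiring S1*(q1 - Z1) * S2*(q2 - Z2) = S3*(q3 - Z3) and solving for q3 gives q3 = Z3 + (S1*S2/S3) * (q1 - Z1) * (q2 - Z2). The product (q1 - Z1) * (q2 - Z2) is exactly input1_val * input2_val once the offsets are folded in, S1*S2/S3 is pre-encoded as params.output_multiplier with params.output_shift, and Z3 is params.output_offset.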
inline void Mul(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const int16* input1_data, - const RuntimeShape& input2_shape, const int16* input2_data, + const RuntimeShape& input1_shape, const int16_t* input1_data, + const RuntimeShape& input2_shape, const int16_t* input2_data, const RuntimeShape& output_shape, int8_t* output_data) { ruy::profiler::ScopeLabel label("Mul/Int16Int8"); - int32 output_offset = params.output_offset; - int32 output_activation_min = params.quantized_activation_min; - int32 output_activation_max = params.quantized_activation_max; + int32_t output_offset = params.output_offset; + int32_t output_activation_min = params.quantized_activation_min; + int32_t output_activation_max = params.quantized_activation_max; TFLITE_DCHECK_LE(output_activation_min, output_activation_max); const int flat_size = @@ -75,12 +75,12 @@ inline void Mul(const ArithmeticParams& params, F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]); - int16 rescaled_result = + int16_t rescaled_result = gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8); - int16 clamped_result = - std::min(output_activation_max - output_offset, rescaled_result); - clamped_result = - std::max(output_activation_min - output_offset, clamped_result); + int16_t clamped_result = std::min( + output_activation_max - output_offset, rescaled_result); + clamped_result = std::max(output_activation_min - output_offset, + clamped_result); output_data[i] = output_offset + clamped_result; } } @@ -104,18 +104,18 @@ inline void BroadcastMul4DSlow( for (int y = 0; y < extended_output_shape.Dims(1); ++y) { for (int x = 0; x < extended_output_shape.Dims(2); ++x) { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - const int32 input1_val = + const int32_t input1_val = params.input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; - const int32 input2_val = + const int32_t input2_val = params.input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; - const int32 unclamped_result = + const int32_t unclamped_result = params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, params.output_multiplier, params.output_shift); - const int32 clamped_output = std::min( + const int32_t clamped_output = std::min( params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); output_data[Offset(extended_output_shape, b, y, x, c)] = diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h index 6b49d2b150b..17944bc47dd 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h @@ -22,8 +22,9 @@ namespace tflite { namespace reference_integer_ops { inline void AveragePool(const PoolParams& params, - const RuntimeShape& input_shape, const int8* input_data, - const RuntimeShape& output_shape, int8* output_data) { + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& output_shape, int8_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); @@ -52,7 +53,7 @@ inline void AveragePool(const PoolParams& params, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - int32 acc = 0; + int32_t acc = 0; int filter_count = 0; for (int filter_y = 
filter_y_start; filter_y < filter_y_end; ++filter_y) { @@ -71,7 +72,7 @@ inline void AveragePool(const PoolParams& params, acc = std::max(acc, params.quantized_activation_min); acc = std::min(acc, params.quantized_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, channel)] = - static_cast(acc); + static_cast(acc); } } } @@ -79,8 +80,8 @@ inline void AveragePool(const PoolParams& params, } inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, - const int8* input_data, const RuntimeShape& output_shape, - int8* output_data) { + const int8_t* input_data, const RuntimeShape& output_shape, + int8_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); TFLITE_DCHECK_GE(params.quantized_activation_min, @@ -137,8 +138,9 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, inline void AveragePool(const PoolParams& params, const RuntimeShape& input_shape, - const int16* input_data, - const RuntimeShape& output_shape, int16* output_data) { + const int16_t* input_data, + const RuntimeShape& output_shape, + int16_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); @@ -167,7 +169,7 @@ inline void AveragePool(const PoolParams& params, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - int32 acc = 0; + int32_t acc = 0; int filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { @@ -186,7 +188,7 @@ inline void AveragePool(const PoolParams& params, acc = std::max(acc, params.quantized_activation_min); acc = std::min(acc, params.quantized_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, channel)] = - static_cast(acc); + static_cast(acc); } } } @@ -194,8 +196,8 @@ inline void AveragePool(const PoolParams& params, } inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, - const int16* input_data, const RuntimeShape& output_shape, - int16* output_data) { + const int16_t* input_data, const RuntimeShape& output_shape, + int16_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); TFLITE_DCHECK_GE(params.quantized_activation_min, diff --git a/tensorflow/lite/kernels/internal/reference/l2normalization.h b/tensorflow/lite/kernels/internal/reference/l2normalization.h index 00697c2e548..7587d2b5c2e 100644 --- a/tensorflow/lite/kernels/internal/reference/l2normalization.h +++ b/tensorflow/lite/kernels/internal/reference/l2normalization.h @@ -52,40 +52,39 @@ inline void L2Normalization(const tflite::L2NormalizationParams& op_params, inline void L2Normalization(const tflite::L2NormalizationParams& op_params, const RuntimeShape& input_shape, - const uint8* input_data, + const uint8_t* input_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { const int trailing_dim = input_shape.DimensionsCount() - 1; const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); - const int32 input_zero_point = op_params.input_zero_point; + const int32_t input_zero_point = op_params.input_zero_point; for (int i = 0; i < outer_size; ++i) { - int32 square_l2_norm = 0; + int32_t square_l2_norm = 0; for (int c = 0; c < depth; c++) { - int32 diff = 
input_data[depth * i + c] - input_zero_point; + int32_t diff = input_data[depth * i + c] - input_zero_point; square_l2_norm += diff * diff; } - int32 inv_l2norm_multiplier; + int32_t inv_l2norm_multiplier; int inv_l2norm_shift; GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift, &inv_l2norm_multiplier, &inv_l2norm_shift); for (int c = 0; c < depth; c++) { - int32 diff = input_data[depth * i + c] - input_zero_point; - int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( + int32_t diff = input_data[depth * i + c] - input_zero_point; + int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); - int32 unclamped_output_val = 128 + rescaled_diff; - int32 output_val = - std::min(static_cast(255), - std::max(static_cast(0), unclamped_output_val)); - output_data[depth * i + c] = static_cast(output_val); + int32_t unclamped_output_val = 128 + rescaled_diff; + int32_t output_val = + std::min(static_cast(255), + std::max(static_cast(0), unclamped_output_val)); + output_data[depth * i + c] = static_cast(output_val); } } } - } // namespace reference_ops } // namespace tflite #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_L2NORMALIZATION_H_ diff --git a/tensorflow/lite/kernels/internal/reference/logistic.h b/tensorflow/lite/kernels/internal/reference/logistic.h index 8aba51896df..64b7133bec6 100644 --- a/tensorflow/lite/kernels/internal/reference/logistic.h +++ b/tensorflow/lite/kernels/internal/reference/logistic.h @@ -66,8 +66,8 @@ inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape, } inline void Logistic(const LogisticParams& params, - const RuntimeShape& input_shape, const int16* input_data, - const RuntimeShape& output_shape, int16* output_data) { + const RuntimeShape& input_shape, const int16_t* input_data, + const RuntimeShape& output_shape, int16_t* output_data) { const int flat_size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < flat_size; i++) { @@ -84,12 +84,12 @@ inline void Logistic(const LogisticParams& params, } } -// Quantized int8 logistic activation. Cheats by dequantizing and requantizing -// around the floating point logistic method. This implementation is slow on -// platforms without a floating point unit. +// Quantized int8_t logistic activation. Cheats by dequantizing and +// requantizing around the floating point logistic method. This implementation +// is slow on platforms without a floating point unit. -// TODO(b/141211002): Delete this int8 implementation once we can reuse the -// approach used in TFLite for int8 Logistic. +// TODO(b/141211002): Delete this int8_t implementation once we can reuse the +// approach used in TFLite for int8_t Logistic. inline void Logistic(const RuntimeShape& input_shape, const int8_t* input_data, float input_scale, int input_zero_point, const RuntimeShape& output_shape, int8_t* output_data, diff --git a/tensorflow/lite/kernels/internal/reference/mul.h b/tensorflow/lite/kernels/internal/reference/mul.h index 54e947db9ca..0578b81bfbc 100644 --- a/tensorflow/lite/kernels/internal/reference/mul.h +++ b/tensorflow/lite/kernels/internal/reference/mul.h @@ -24,20 +24,20 @@ namespace reference_ops { // Element-wise mul that can often be used for inner loop of broadcast Mul as // well as the non-broadcast Mul. 
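The int8_t Logistic above is documented as dequantize, float sigmoid, requantize. A self-contained sketch of that pattern, with local names and a hand-picked output quantization (TFLite conventionally fixes the int8_t logistic output to scale 1/256 with zero point -128, but treat those numbers as an assumption here):

  #include <algorithm>
  #include <cmath>
  #include <cstdint>
  #include <cstdio>

  int8_t QuantizedLogisticRef(int8_t q_in, float input_scale,
                              int32_t input_zero_point, float output_scale,
                              int32_t output_zero_point) {
    const float x = input_scale * static_cast<float>(q_in - input_zero_point);
    const float y = 1.0f / (1.0f + std::exp(-x));             // float sigmoid
    const int32_t q = static_cast<int32_t>(std::lround(y / output_scale)) +
                      output_zero_point;                      // requantize
    return static_cast<int8_t>(
        std::min<int32_t>(127, std::max<int32_t>(-128, q)));  // clamp to int8_t
  }

  int main() {
    // sigmoid(0) = 0.5 -> 0.5 * 256 - 128 = 0 in the assumed output encoding.
    std::printf("%d\n", QuantizedLogisticRef(0, 0.1f, 0, 1.0f / 256.0f, -128));
    return 0;
  }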
inline void MulElementwise(int size, const ArithmeticParams& params, - const uint8* input1_data, const uint8* input2_data, - uint8* output_data) { + const uint8_t* input1_data, + const uint8_t* input2_data, uint8_t* output_data) { for (int i = 0; i < size; ++i) { - const int32 input1_val = params.input1_offset + input1_data[i]; - const int32 input2_val = params.input2_offset + input2_data[i]; - const int32 unclamped_result = + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, params.output_multiplier, params.output_shift); - const int32 clamped_output = + const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); - output_data[i] = static_cast(clamped_output); + output_data[i] = static_cast(clamped_output); } } @@ -60,9 +60,9 @@ inline void Mul(const ArithmeticParams& params, } inline void Mul(const ArithmeticParams& params, - const RuntimeShape& input1_shape, const uint8* input1_data, - const RuntimeShape& input2_shape, const uint8* input2_data, - const RuntimeShape& output_shape, uint8* output_data) { + const RuntimeShape& input1_shape, const uint8_t* input1_data, + const RuntimeShape& input2_shape, const uint8_t* input2_data, + const RuntimeShape& output_shape, uint8_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); const int flat_size = @@ -73,11 +73,11 @@ inline void Mul(const ArithmeticParams& params, inline void BroadcastMul4DSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, - const uint8* input1_data, + const uint8_t* input1_data, const RuntimeShape& input2_shape, - const uint8* input2_data, + const uint8_t* input2_data, const RuntimeShape& output_shape, - uint8* output_data) { + uint8_t* output_data) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, @@ -89,22 +89,22 @@ inline void BroadcastMul4DSlow(const ArithmeticParams& params, for (int y = 0; y < extended_output_shape.Dims(1); ++y) { for (int x = 0; x < extended_output_shape.Dims(2); ++x) { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - const int32 input1_val = + const int32_t input1_val = params.input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; - const int32 input2_val = + const int32_t input2_val = params.input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; - const int32 unclamped_result = + const int32_t unclamped_result = params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, params.output_multiplier, params.output_shift); - const int32 clamped_output = std::min( + const int32_t clamped_output = std::min( params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); output_data[Offset(extended_output_shape, b, y, x, c)] = - static_cast(clamped_output); + static_cast(clamped_output); } } } diff --git a/tensorflow/lite/kernels/internal/reference/pad.h b/tensorflow/lite/kernels/internal/reference/pad.h index 20fe3434ae5..2a040cefc91 100644 --- a/tensorflow/lite/kernels/internal/reference/pad.h +++ b/tensorflow/lite/kernels/internal/reference/pad.h @@ -32,8 +32,8 @@ constexpr int PadKernelMaxDimensionCount() { return 4; } // equivalent to a simple input1_data. For Pad, it should point to a zero // value. 
// -// Note that two typenames are required, so that T=P=int32 is considered a -// specialization distinct from P=int32. +// Note that two typenames are required, so that T=P=int32_t is considered a +// specialization distinct from P=int32_t. template inline void PadImpl(const tflite::PadParams& op_params, const RuntimeShape& input_shape, const T* input_data, @@ -116,11 +116,11 @@ inline void Pad(const tflite::PadParams& op_params, output_data); } -// The second (pad-value) input can be int32 when, say, the first is uint8. +// The second (pad-value) input can be int32_t when, say, the first is uint8_t. template inline void Pad(const tflite::PadParams& op_params, const RuntimeShape& input_shape, const T* input_data, - const int32* pad_value_ptr, const RuntimeShape& output_shape, + const int32_t* pad_value_ptr, const RuntimeShape& output_shape, T* output_data) { const T converted_pad_value = static_cast(*pad_value_ptr); PadImpl(op_params, input_shape, input_data, &converted_pad_value, @@ -130,9 +130,9 @@ inline void Pad(const tflite::PadParams& op_params, // This version avoids conflicting template matching. template <> inline void Pad(const tflite::PadParams& op_params, - const RuntimeShape& input_shape, const int32* input_data, - const int32* pad_value_ptr, const RuntimeShape& output_shape, - int32* output_data) { + const RuntimeShape& input_shape, const int32_t* input_data, + const int32_t* pad_value_ptr, const RuntimeShape& output_shape, + int32_t* output_data) { PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape, output_data); } diff --git a/tensorflow/lite/kernels/internal/reference/pooling.h b/tensorflow/lite/kernels/internal/reference/pooling.h index a03359cda82..0872f5210c8 100644 --- a/tensorflow/lite/kernels/internal/reference/pooling.h +++ b/tensorflow/lite/kernels/internal/reference/pooling.h @@ -78,8 +78,9 @@ inline void AveragePool(const PoolParams& params, inline void AveragePool(const PoolParams& params, const RuntimeShape& input_shape, - const uint8* input_data, - const RuntimeShape& output_shape, uint8* output_data) { + const uint8_t* input_data, + const RuntimeShape& output_shape, + uint8_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); @@ -108,7 +109,7 @@ inline void AveragePool(const PoolParams& params, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - int32 acc = 0; + int32_t acc = 0; int filter_count = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { @@ -125,7 +126,7 @@ inline void AveragePool(const PoolParams& params, acc = std::max(acc, params.quantized_activation_min); acc = std::min(acc, params.quantized_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, channel)] = - static_cast(acc); + static_cast(acc); } } } @@ -237,8 +238,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, } inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, - const uint8* input_data, const RuntimeShape& output_shape, - uint8* output_data) { + const uint8_t* input_data, const RuntimeShape& output_shape, + uint8_t* output_data) { TFLITE_DCHECK_LE(params.quantized_activation_min, params.quantized_activation_max); TFLITE_DCHECK_GE(params.quantized_activation_min, 0); @@ -269,7 +270,7 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, const int 
filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - uint8 max = 0; + uint8_t max = 0; for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) { for (int filter_x = filter_x_start; filter_x < filter_x_end; @@ -281,10 +282,10 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape, input_data[Offset(input_shape, batch, in_y, in_x, channel)]); } } - max = std::max(max, params.quantized_activation_min); - max = std::min(max, params.quantized_activation_max); + max = std::max(max, params.quantized_activation_min); + max = std::min(max, params.quantized_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, channel)] = - static_cast(max); + static_cast(max); } } } diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 4a8d4b0fb6a..d257a170091 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -97,13 +97,13 @@ void PortableAsymmetricQuantizeFloats(const float* values, const int size, zero_point_from_min_error < zero_point_from_max_error ? zero_point_from_min : zero_point_from_max; - int8 nudged_zero_point = 0; + int8_t nudged_zero_point = 0; if (zero_point_double <= qmin_double) { nudged_zero_point = kMinScale; } else if (zero_point_double >= qmax_double) { nudged_zero_point = kMaxScale; } else { - nudged_zero_point = static_cast(round(zero_point_double)); + nudged_zero_point = static_cast(round(zero_point_double)); } *scaling_factor = scale; *offset = nudged_zero_point; @@ -303,8 +303,8 @@ void PortableMatrixBatchVectorMultiplyAccumulateImpl( for (int row = 0; row < n_output; ++row) { int32_t acc = bias[row]; for (int col = 0; col < n_input; ++col) { - int8 input_val = input[batch * n_input + col]; - int8 weights_val = input_to_gate_weights[row * n_input + col]; + int8_t input_val = input[batch * n_input + col]; + int8_t weights_val = input_to_gate_weights[row * n_input + col]; acc += input_val * weights_val; } acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift); @@ -349,8 +349,8 @@ void PortableMatrixBatchVectorMultiply(const int8_t* input, int32_t n_batch, int32_t n_input, int32_t n_cell, int8_t* gate_output, int8_t gate_output_zp) { - const int32_t int8_max = std::numeric_limits::max(); - const int32_t int8_min = std::numeric_limits::min(); + const int32_t int8_max = std::numeric_limits::max(); + const int32_t int8_min = std::numeric_limits::min(); for (int batch = 0; batch < n_batch; ++batch) { for (int row = 0; row < n_cell; ++row) { int32_t acc = 0; @@ -378,8 +378,8 @@ void PortableMatrixBatchVectorMultiply( int32_t proj_effective_scale_a, int32_t proj_effective_scale_b, const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden, int32_t n_output, int32_t output_zp, int8_t* proj_output) { - const int16_t int8_max = std::numeric_limits::max(); - const int16_t int8_min = std::numeric_limits::min(); + const int16_t int8_max = std::numeric_limits::max(); + const int16_t int8_min = std::numeric_limits::min(); for (int batch = 0; batch < n_batch; ++batch) { for (int row = 0; row < n_output; ++row) { int64_t acc = gate_bias[row]; @@ -389,10 +389,10 @@ void PortableMatrixBatchVectorMultiply( int64_t curr = acc; acc += input_val * weights_val; if (input_val * weights_val > 0 && acc < curr) { - acc = std::numeric_limits::max(); + acc = 
std::numeric_limits::max(); } if (input_val * weights_val < 0 && acc > curr) { - acc = std::numeric_limits::min(); + acc = std::numeric_limits::min(); } } acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a, @@ -429,10 +429,10 @@ void PortableApplyLayerNorm(const int16_t* input, int32_t mean = static_cast(static_cast(sum) * 1024 / n_input); // TODO(jianlijianli): Avoids overflow but only works for POT n_input. - int32 temp = kTwoToPower20 / n_input; + int32_t temp = kTwoToPower20 / n_input; int64_t variance = sum_sq * temp - static_cast(mean) * static_cast(mean); - int32_t variance2 = static_cast(variance / kTwoToPower20); + int32_t variance2 = static_cast(variance / kTwoToPower20); if (variance2 < 1) { variance2 = variance_limit; } @@ -442,17 +442,17 @@ void PortableApplyLayerNorm(const int16_t* input, &stddev_inverse_a, &stddev_inverse_b); for (int j = 0; j < n_input; ++j) { - const int32 index = i * n_input + j; - int32 val = static_cast(input[index]); - int32 shifted = 1024 * val - mean; - int32 rescaled = MultiplyByQuantizedMultiplier(shifted, stddev_inverse_a, - stddev_inverse_b); + const int32_t index = i * n_input + j; + int32_t val = static_cast(input[index]); + int32_t shifted = 1024 * val - mean; + int32_t rescaled = MultiplyByQuantizedMultiplier( + shifted, stddev_inverse_a, stddev_inverse_b); // TODO(jianlijianli): Saturate this. int64_t val3 = rescaled * layer_norm_weights[j] + bias[j]; - int32 val4 = - static_cast((val3 > 0 ? val3 + 512 : val3 - 512) / 1024); - int32 val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a, - layer_norm_scale_b + 12); + int32_t val4 = + static_cast((val3 > 0 ? val3 + 512 : val3 - 512) / 1024); + int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a, + layer_norm_scale_b + 12); val5 = std::min(std::max(kInt16Min, val5), kInt16Max); output[index] = static_cast(val5); } @@ -465,8 +465,8 @@ void PortableApplyLayerNormFloat(const int16_t* input, int32_t layer_norm_scale_b, const int32_t* bias, int n_batch, int n_input, int16_t* output) { - const int32_t int16_max = std::numeric_limits::max(); - const int32_t int16_min = std::numeric_limits::min(); + const int32_t int16_max = std::numeric_limits::max(); + const int32_t int16_min = std::numeric_limits::min(); // This is to surpress a lint warning. 
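The mixed powers of two in PortableApplyLayerNorm above are a single fixed-point trick: mean holds 1024 * x_bar, so mean * mean is 2^20 * x_bar^2, while sum_sq * (2^20 / n_input) is 2^20 * E[x^2]; their difference is 2^20 times the variance, which the division by kTwoToPower20 removes again (up to integer truncation, and exactly only when n_input is a power of two, which is what the TODO about POT n_input refers to).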
const double two = 2.0; const float layer_norm_scale = @@ -498,7 +498,7 @@ void PortableApplyLayerNormFloat(const int16_t* input, const float weighted_normalized_value = normalized_value * layer_norm_weights[i] * layer_norm_scale + bias[i] * bias_scale; - const int32_t quant_output = static_cast( + const int32_t quant_output = static_cast( std::round(weighted_normalized_value * std::pow(2, 12))); output[index] = std::min(int16_max, std::max(int16_min, quant_output)); } @@ -533,18 +533,18 @@ void PortableApplySigmoid(const int16_t* input, int32_t n_batch, void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input, int16_t* output) { - const int32_t int16_max = std::numeric_limits::max(); - const int32_t int16_min = std::numeric_limits::min(); + const int32_t int16_max = std::numeric_limits::max(); + const int32_t int16_min = std::numeric_limits::min(); for (int batch = 0; batch < n_batch; ++batch) { for (int i = 0; i < n_input; ++i) { const int index = batch * n_input + i; const float float_input = input[index] * std::pow(2, -12); const float float_output = 1.0f / (1.0f + std::exp(-float_input)); const int32_t quant_output = - static_cast(float_output * std::pow(2, 15)); + static_cast(float_output * std::pow(2, 15)); const int32_t quant_output_clamped = std::min(int16_max, std::max(int16_min, quant_output)); - output[index] = static_cast(quant_output_clamped); + output[index] = static_cast(quant_output_clamped); } } } @@ -588,8 +588,8 @@ void PortableApplyTanh(int32_t integer_bits, const int16_t* input, void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input, int32_t integer_bits, int16_t* output) { - const int32_t int16_max = std::numeric_limits::max(); - const int32_t int16_min = std::numeric_limits::min(); + const int32_t int16_max = std::numeric_limits::max(); + const int32_t int16_min = std::numeric_limits::min(); const double two = 2.0; for (int batch = 0; batch < n_batch; ++batch) { for (int i = 0; i < n_input; ++i) { @@ -598,10 +598,10 @@ void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch, input[index] * std::pow(two, static_cast(integer_bits)); const float float_output = std::tanh(float_input); const int32_t quant_output = - static_cast(float_output * std::pow(2, 15)); + static_cast(float_output * std::pow(2, 15)); const int32_t quant_output_clamped = std::min(int16_max, std::max(int16_min, quant_output)); - output[index] = static_cast(quant_output_clamped); + output[index] = static_cast(quant_output_clamped); } } } @@ -634,7 +634,7 @@ void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2, value = std::min(std::max(static_cast(-128), value), static_cast(127)); - output[index] = static_cast(value); + output[index] = static_cast(value); } } } @@ -645,7 +645,7 @@ void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2, for (int i = 0; i < n_input; ++i) { const int index = batch * n_input + i; int32_t sum = input_1[index] + input_2[index]; - const int32 sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum)); + const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum)); output[index] = static_cast(sum_clamped); } } @@ -793,12 +793,12 @@ void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp, int32_t recurrent_effective_scale_b, int32_t n_batch, int32_t n_cell, int16_t* output) { - const int32_t int16_max = std::numeric_limits::max(); - const int32_t int16_min = std::numeric_limits::min(); + const int32_t int16_max = std::numeric_limits::max(); + const 
int32_t int16_min = std::numeric_limits::min(); for (int i = 0; i < n_batch * n_cell; ++i) { - int32_t x = static_cast(input[i]) - static_cast(input_zp); + int32_t x = static_cast(input[i]) - static_cast(input_zp); int32_t h = - static_cast(recurrent[i]) - static_cast(recurrent_zp); + static_cast(recurrent[i]) - static_cast(recurrent_zp); int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a, input_effective_scale_b); int32_t h_scaled = MultiplyByQuantizedMultiplier( diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index 602576ca3db..054fa43243d 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -32,7 +32,7 @@ bool IsZeroVector(const float* vector, int v_size) { return PortableIsZeroVector(vector, v_size); } -// Check if all entries of a vector are zero for int8. +// Check if all entries of a vector are zero for int8_t. bool IsZeroVector(const int8_t* vector, int v_size) { return PortableIsZeroVector(vector, v_size); } diff --git a/tensorflow/lite/kernels/internal/reference/prelu.h b/tensorflow/lite/kernels/internal/reference/prelu.h index 4633cb9599a..02db5174ed6 100644 --- a/tensorflow/lite/kernels/internal/reference/prelu.h +++ b/tensorflow/lite/kernels/internal/reference/prelu.h @@ -23,7 +23,7 @@ namespace tflite { namespace reference_ops { -// Broadcast prelu to output_shape for quantized uint8/int8 data. +// Broadcast prelu to output_shape for quantized uint8_t/int8_t data. template inline void BroadcastPrelu4DSlow( const PreluParams& params, const RuntimeShape& input_shape, @@ -44,15 +44,15 @@ inline void BroadcastPrelu4DSlow( for (int c = 0; c < extended_output_shape.Dims(3); ++c) { int output_index = Offset(extended_output_shape, b, y, x, c); int input_index = SubscriptToIndex(desc1, b, y, x, c); - const int32 input_value = + const int32_t input_value = params.input_offset + input_data[input_index]; - int32 output_value; + int32_t output_value; if (input_value >= 0) { output_value = MultiplyByQuantizedMultiplier( input_value, params.output_multiplier_1, params.output_shift_1); } else { auto alpha_index = SubscriptToIndex(desc2, b, y, x, c); - const int32 alpha_value = + const int32_t alpha_value = params.alpha_offset + alpha_data[alpha_index]; output_value = MultiplyByQuantizedMultiplier( @@ -61,9 +61,9 @@ inline void BroadcastPrelu4DSlow( } output_value += params.output_offset; - const int32 quantized_min = std::numeric_limits::min(); - const int32 quantized_max = std::numeric_limits::max(); - const int32 clamped_output = + const int32_t quantized_min = std::numeric_limits::min(); + const int32_t quantized_max = std::numeric_limits::max(); + const int32_t clamped_output = std::min(quantized_max, std::max(quantized_min, output_value)); output_data[output_index] = static_cast(clamped_output); } @@ -77,19 +77,19 @@ inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape, const T* input_data, const RuntimeShape& alpha_shape, const T* alpha_data, const RuntimeShape& output_shape, T* output_data) { - const int32 quantized_min = std::numeric_limits::min(); - const int32 quantized_max = std::numeric_limits::max(); + const int32_t quantized_min = std::numeric_limits::min(); + const int32_t quantized_max = std::numeric_limits::max(); const int flat_size = MatchingElementsSize(input_shape, alpha_shape, output_shape); for (int i = 0; i < flat_size; 
++i) { - const int32 input_value = params.input_offset + input_data[i]; - int32 output_value; + const int32_t input_value = params.input_offset + input_data[i]; + int32_t output_value; if (input_value >= 0) { output_value = MultiplyByQuantizedMultiplier( input_value, params.output_multiplier_1, params.output_shift_1); } else { - const int32 alpha_value = params.alpha_offset + alpha_data[i]; + const int32_t alpha_value = params.alpha_offset + alpha_data[i]; output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value, params.output_multiplier_2, @@ -97,7 +97,7 @@ inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape, } output_value += params.output_offset; - const int32 clamped_output = + const int32_t clamped_output = std::min(quantized_max, std::max(quantized_min, output_value)); output_data[i] = static_cast(clamped_output); } diff --git a/tensorflow/lite/kernels/internal/reference/quantize.h b/tensorflow/lite/kernels/internal/reference/quantize.h index d36db06f2e0..6f3f9aeb419 100644 --- a/tensorflow/lite/kernels/internal/reference/quantize.h +++ b/tensorflow/lite/kernels/internal/reference/quantize.h @@ -33,18 +33,18 @@ inline void AffineQuantize(const tflite::QuantizationParams& op_params, const InputT* input_data, const RuntimeShape& output_shape, OutputT* output_data) { - const int32 zero_point = op_params.zero_point; + const int32_t zero_point = op_params.zero_point; const double scale = op_params.scale; const int flat_size = MatchingFlatSize(input_shape, output_shape); - static constexpr int32 min_val = std::numeric_limits::min(); - static constexpr int32 max_val = std::numeric_limits::max(); + static constexpr int32_t min_val = std::numeric_limits::min(); + static constexpr int32_t max_val = std::numeric_limits::max(); for (int i = 0; i < flat_size; i++) { const InputT val = input_data[i]; - int32 unclamped = - static_cast(TfLiteRound(val / static_cast(scale))) + + int32_t unclamped = + static_cast(TfLiteRound(val / static_cast(scale))) + zero_point; - int32 clamped = std::min(std::max(unclamped, min_val), max_val); + int32_t clamped = std::min(std::max(unclamped, min_val), max_val); output_data[i] = clamped; } } diff --git a/tensorflow/lite/kernels/internal/reference/reduce.h b/tensorflow/lite/kernels/internal/reference/reduce.h index 2e54928682a..597d015d0b1 100644 --- a/tensorflow/lite/kernels/internal/reference/reduce.h +++ b/tensorflow/lite/kernels/internal/reference/reduce.h @@ -251,9 +251,9 @@ inline void Mean(const tflite::MeanParams& op_params, inline void Mean(const tflite::MeanParams& op_params, const RuntimeShape& unextended_input_shape, - const uint8_t* input_data, int32 input_zero_point, + const uint8_t* input_data, int32_t input_zero_point, float input_scale, const RuntimeShape& unextended_output_shape, - uint8_t* output_data, int32 output_zero_point, + uint8_t* output_data, int32_t output_zero_point, float output_scale) { ruy::profiler::ScopeLabel label("Mean4D/Uint8"); @@ -282,9 +282,9 @@ inline void Mean(const tflite::MeanParams& op_params, constexpr int32_t kMinValue = std::numeric_limits::min(); constexpr int32_t kMaxValue = std::numeric_limits::max(); - int32 bias = + int32_t bias = output_zero_point - - static_cast(input_zero_point * input_scale / output_scale); + static_cast(input_zero_point * input_scale / output_scale); double real_scale = static_cast(input_scale / (num_elements_in_axis * output_scale)); @@ -293,7 +293,7 @@ inline void Mean(const tflite::MeanParams& op_params, QuantizeMultiplier(real_scale, &multiplier, 
&shift); for (int out_b = 0; out_b < output_batch; ++out_b) { for (int out_d = 0; out_d < output_depth; ++out_d) { - int32 acc = 0; + int32_t acc = 0; for (int in_h = 0; in_h < input_height; ++in_h) { for (int in_w = 0; in_w < input_width; ++in_w) { acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; @@ -312,10 +312,10 @@ inline void Mean(const tflite::MeanParams& op_params, // It does so in two stages, first calculates the sum of elements along the axis // then divides it by the number of element in axis for quantized values. template -inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point, +inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point, float input_scale, const int* input_dims, const int input_num_dims, T* output_data, - int32 output_zero_point, float output_scale, + int32_t output_zero_point, float output_scale, const int* output_dims, const int output_num_dims, const int* axis, const int num_axis_dimensions, bool keep_dims, diff --git a/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h b/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h index e76fc8b6931..95550abc145 100644 --- a/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h +++ b/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h @@ -24,22 +24,23 @@ namespace tflite { namespace reference_ops { -inline int32 GetNearestNeighbor(const int input_value, const int32 input_size, - const int32 output_size, - const bool align_corners, - const bool half_pixel_centers) { +inline int32_t GetNearestNeighbor(const int input_value, + const int32_t input_size, + const int32_t output_size, + const bool align_corners, + const bool half_pixel_centers) { const float scale = (align_corners && output_size > 1) ? (input_size - 1) / static_cast(output_size - 1) : input_size / static_cast(output_size); const float offset = half_pixel_centers ? 0.5f : 0.0f; - int32 output_value = std::min( + int32_t output_value = std::min( align_corners - ? static_cast(TfLiteRound((input_value + offset) * scale)) - : static_cast(std::floor((input_value + offset) * scale)), + ? 
static_cast(TfLiteRound((input_value + offset) * scale)) + : static_cast(std::floor((input_value + offset) * scale)), input_size - 1); if (half_pixel_centers) { - output_value = std::max(static_cast(0), output_value); + output_value = std::max(static_cast(0), output_value); } return output_value; } @@ -48,7 +49,7 @@ template inline void ResizeNearestNeighbor( const tflite::ResizeNearestNeighborParams& op_params, const RuntimeShape& unextended_input_shape, const T* input_data, - const RuntimeShape& output_size_shape, const int32* output_size_data, + const RuntimeShape& output_size_shape, const int32_t* output_size_data, const RuntimeShape& unextended_output_shape, T* output_data) { TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4); TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); @@ -58,16 +59,16 @@ inline void ResizeNearestNeighbor( const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape); - int32 batches = MatchingDim(input_shape, 0, output_shape, 0); - int32 input_height = input_shape.Dims(1); - int32 input_width = input_shape.Dims(2); - int32 depth = MatchingDim(input_shape, 3, output_shape, 3); + int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + int32_t input_height = input_shape.Dims(1); + int32_t input_width = input_shape.Dims(2); + int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); // The Tensorflow version of this op allows resize on the width and height // axis only. TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2); - int32 output_height = output_size_data[0]; - int32 output_width = output_size_data[1]; + int32_t output_height = output_size_data[0]; + int32_t output_width = output_size_data[1]; const int col_offset = input_shape.Dims(3); const int row_offset = input_shape.Dims(2) * col_offset; @@ -77,14 +78,14 @@ inline void ResizeNearestNeighbor( T* output_ptr = output_data; for (int b = 0; b < batches; ++b) { for (int y = 0; y < output_height; ++y) { - int32 in_y = GetNearestNeighbor(y, input_height, output_height, - op_params.align_corners, - op_params.half_pixel_centers); - const T* y_input_ptr = input_ptr + in_y * row_offset; - for (int x = 0; x < output_width; ++x) { - int32 in_x = GetNearestNeighbor(x, input_width, output_width, + int32_t in_y = GetNearestNeighbor(y, input_height, output_height, op_params.align_corners, op_params.half_pixel_centers); + const T* y_input_ptr = input_ptr + in_y * row_offset; + for (int x = 0; x < output_width; ++x) { + int32_t in_x = GetNearestNeighbor(x, input_width, output_width, + op_params.align_corners, + op_params.half_pixel_centers); const T* x_input_ptr = y_input_ptr + in_x * col_offset; memcpy(output_ptr, x_input_ptr, depth * sizeof(T)); output_ptr += depth; diff --git a/tensorflow/lite/kernels/internal/reference/softmax.h b/tensorflow/lite/kernels/internal/reference/softmax.h index dd44b3c7863..b035b433a0b 100644 --- a/tensorflow/lite/kernels/internal/reference/softmax.h +++ b/tensorflow/lite/kernels/internal/reference/softmax.h @@ -62,13 +62,14 @@ inline void Softmax(const SoftmaxParams& params, } } -// Quantized softmax with int8/uint8 input and int8/uint8/int16 output. +// Quantized softmax with int8_t/uint8_t input and int8_t/uint8_t/int16_t +// output. 
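The index mapping used by ResizeNearestNeighbor above is compact enough to exercise on its own. The sketch below repeats GetNearestNeighbor outside the TFLite headers (TfLiteRound swapped for std::round) and prints the source column for a 4 to 8 upscale with both flags off.

  #include <algorithm>
  #include <cmath>
  #include <cstdint>
  #include <cstdio>

  int32_t GetNearestNeighbor(int input_value, int32_t input_size,
                             int32_t output_size, bool align_corners,
                             bool half_pixel_centers) {
    const float scale =
        (align_corners && output_size > 1)
            ? (input_size - 1) / static_cast<float>(output_size - 1)
            : input_size / static_cast<float>(output_size);
    const float offset = half_pixel_centers ? 0.5f : 0.0f;
    int32_t output_value = std::min(
        align_corners
            ? static_cast<int32_t>(std::round((input_value + offset) * scale))
            : static_cast<int32_t>(std::floor((input_value + offset) * scale)),
        input_size - 1);
    if (half_pixel_centers) {
      output_value = std::max(static_cast<int32_t>(0), output_value);
    }
    return output_value;
  }

  int main() {
    for (int x = 0; x < 8; ++x) {
      std::printf("%d ", GetNearestNeighbor(x, 4, 8, false, false));
    }
    std::printf("\n");  // prints "0 0 1 1 2 2 3 3"
    return 0;
  }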
template inline void Softmax(const SoftmaxParams& params, const RuntimeShape& input_shape, const InputT* input_data, const RuntimeShape& output_shape, OutputT* output_data) { - const int32 input_beta_multiplier = params.input_multiplier; - const int32 input_beta_left_shift = params.input_left_shift; + const int32_t input_beta_multiplier = params.input_multiplier; + const int32_t input_beta_left_shift = params.input_left_shift; const int diff_min = params.diff_min; // The representation chosen for the input to the exp() function is Q5.26. // We need to leave extra space since values that we skip might be as large as @@ -78,9 +79,10 @@ inline void Softmax(const SoftmaxParams& params, static const int kScaledDiffIntegerBits = 5; static const int kAccumulationIntegerBits = 12; using FixedPointScaledDiff = - gemmlowp::FixedPoint; - using FixedPointAccum = gemmlowp::FixedPoint; - using FixedPoint0 = gemmlowp::FixedPoint; + gemmlowp::FixedPoint; + using FixedPointAccum = + gemmlowp::FixedPoint; + using FixedPoint0 = gemmlowp::FixedPoint; const int trailing_dim = input_shape.DimensionsCount() - 1; const int outer_size = @@ -96,10 +98,10 @@ inline void Softmax(const SoftmaxParams& params, FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); for (int c = 0; c < depth; ++c) { - int32 input_diff = - static_cast(input_data[i * depth + c]) - max_in_row; + int32_t input_diff = + static_cast(input_data[i * depth + c]) - max_in_row; if (input_diff >= diff_min) { - const int32 input_diff_rescaled = + const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = @@ -114,28 +116,28 @@ inline void Softmax(const SoftmaxParams& params, sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit)); for (int c = 0; c < depth; ++c) { - int32 input_diff = - static_cast(input_data[i * depth + c]) - max_in_row; + int32_t input_diff = + static_cast(input_data[i * depth + c]) - max_in_row; if (input_diff >= diff_min) { - const int32 input_diff_rescaled = + const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( input_diff, input_beta_multiplier, input_beta_left_shift); const FixedPointScaledDiff scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled); FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); - int32 unsat_output = gemmlowp::RoundingDivideByPOT( + int32_t unsat_output = gemmlowp::RoundingDivideByPOT( (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - (sizeof(OutputT) * 8)); - const int32 shifted_output = + const int32_t shifted_output = unsat_output + - static_cast(std::numeric_limits::min()); + static_cast(std::numeric_limits::min()); output_data[i * depth + c] = static_cast(std::max( std::min(shifted_output, - static_cast(std::numeric_limits::max())), - static_cast(std::numeric_limits::min()))); + static_cast(std::numeric_limits::max())), + static_cast(std::numeric_limits::min()))); } else { output_data[i * depth + c] = std::numeric_limits::min(); } @@ -143,7 +145,7 @@ inline void Softmax(const SoftmaxParams& params, } } -// Quantized softmax with int16 input and int16 output. +// Quantized softmax with int16_t input and int16_t output. 
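In effect, the templated Softmax above fixes the output quantization from the output type alone: the divide by 2^(num_bits_over_unit + 31 - 8 * sizeof(OutputT)) scales each normalized exponential to p * 2^N for an N-bit output type, and adding std::numeric_limits<OutputT>::min() shifts it into range, so a probability p is stored roughly as round(p * 256) - 128 for int8_t, round(p * 256) for uint8_t, and round(p * 65536) - 32768 for int16_t, with the clamp only needed when p rounds all the way up to 1.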
inline void SoftmaxInt16(const SoftmaxParams& params, const RuntimeShape& input_shape, const int16_t* input_data, diff --git a/tensorflow/lite/kernels/internal/reference/sub.h b/tensorflow/lite/kernels/internal/reference/sub.h index 91ef7f2c2fd..b27f251de6c 100644 --- a/tensorflow/lite/kernels/internal/reference/sub.h +++ b/tensorflow/lite/kernels/internal/reference/sub.h @@ -47,11 +47,11 @@ inline void SubNonBroadcast(const ArithmeticParams& params, inline void SubNonBroadcast(const ArithmeticParams& params, const RuntimeShape& input1_shape, - const int32* input1_data, + const int32_t* input1_data, const RuntimeShape& input2_shape, - const int32* input2_data, + const int32_t* input2_data, const RuntimeShape& output_shape, - int32* output_data) { + int32_t* output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < flat_size; ++i) { @@ -112,12 +112,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params, template inline void BroadcastSubSlow(const ArithmeticParams& params, const RuntimeShape& input1_shape, - const uint8* input1_data, + const uint8_t* input1_data, const RuntimeShape& input2_shape, - const uint8* input2_data, + const uint8_t* input2_data, const RuntimeShape& output_shape, - uint8* output_data) { - ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8"); + uint8_t* output_data) { + ruy::profiler::ScopeLabel label("BroadcastSubSlow/uint8_t"); TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N); TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N); TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N); @@ -140,28 +140,28 @@ inline void BroadcastSubSlow(const ArithmeticParams& params, // nesting loops such that the innermost loop has the smallest stride for the // best cache behavior. 
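BroadcastSubSlow above walks the output space and maps every output coordinate back into each input with SubscriptToIndex on the broadcast descriptors. The sketch below shows the underlying idea with plain arrays rather than NdArrayDesc (names and layout are illustrative only): a dimension of size 1 always contributes index 0, which is what makes the same element get reused across the broadcast axis.

  #include <cstdio>

  // Flat index of `coord` into a 4-D tensor with extents `dims`, reusing any
  // dimension of size 1 (numpy-style broadcasting).
  int FlatIndexWithBroadcast(const int coord[4], const int dims[4]) {
    int index = 0;
    for (int d = 0; d < 4; ++d) {
      const int c = (dims[d] == 1) ? 0 : coord[d];
      index = index * dims[d] + c;
    }
    return index;
  }

  int main() {
    const int out_dims[4] = {1, 2, 2, 3};
    const int vec_dims[4] = {1, 1, 1, 3};  // a per-channel vector operand
    const int coord[4] = {0, 1, 1, 2};     // one output coordinate
    std::printf("output %d <- vector %d\n",
                FlatIndexWithBroadcast(coord, out_dims),
                FlatIndexWithBroadcast(coord, vec_dims));  // output 11 <- vector 2
    return 0;
  }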
   auto sub_func = [&](int indexes[N]) {
-    const int32 input1_val =
+    const int32_t input1_val =
         params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
-    const int32 input2_val =
+    const int32_t input2_val =
         params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
-    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32 scaled_input1_val =
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32 scaled_input2_val =
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32 raw_sub = scaled_input1_val - scaled_input2_val;
-    const int32 raw_output =
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             raw_sub, params.output_multiplier, params.output_shift) +
         params.output_offset;
-    const int32 clamped_output =
+    const int32_t clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
     output_data[SubscriptToIndex(output_desc, indexes)] =
-        static_cast<uint8>(clamped_output);
+        static_cast<uint8_t>(clamped_output);
   };
   NDOpsHelper<N>(output_desc, sub_func);
 }
@@ -169,12 +169,12 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
 template <int N = 5>
 inline void BroadcastSubSlow(const ArithmeticParams& params,
                              const RuntimeShape& input1_shape,
-                             const int32* input1_data,
+                             const int32_t* input1_data,
                              const RuntimeShape& input2_shape,
-                             const int32* input2_data,
+                             const int32_t* input2_data,
                              const RuntimeShape& output_shape,
-                             int32* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32");
+                             int32_t* output_data) {
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int32_t");
   TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
   TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
   TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
@@ -214,7 +214,7 @@ inline void BroadcastSubSlow(const ArithmeticParams& params,
                              const int8_t* input2_data,
                              const RuntimeShape& output_shape,
                              int8_t* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8");
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int8_t");
   NdArrayDesc<N> desc1;
   NdArrayDesc<N> desc2;
   NdArrayDesc<N> output_desc;
@@ -267,7 +267,7 @@ void BroadcastSubSlow(const ArithmeticParams& params,
                       const RuntimeShape& input2_shape,
                       const int64_t* input2_data,
                       const RuntimeShape& output_shape, int64_t* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64");
+  ruy::profiler::ScopeLabel label("BroadcastSubSlow/int64_t");
   TFLITE_DCHECK_LE(input1_shape.DimensionsCount(), N);
   TFLITE_DCHECK_LE(input2_shape.DimensionsCount(), N);
   TFLITE_DCHECK_LE(output_shape.DimensionsCount(), N);
@@ -339,33 +339,33 @@ void BroadcastSubSlow(const ArithmeticParams& params,

 // Element-wise Sub that can often be used for inner loop of broadcast sub as
 // well as the non-broadcast sub.
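For reference, the sketch below walks through the per-element arithmetic used in sub_func above and in SubElementwise below: offset each operand, shift into higher precision, rescale, subtract, rescale the result, then clamp and narrow to uint8_t. RescaleQ31 is a simplified stand-in for MultiplyByQuantizedMultiplierSmallerThanOneExp (it omits the saturating doubling-high-multiply details), and the offsets and multipliers are made-up example values, not taken from this patch.

#include <algorithm>
#include <cstdint>

// Simplified rescale: multiply by a Q31 multiplier, then rounding right shift.
int32_t RescaleQ31(int32_t x, int32_t quantized_multiplier, int right_shift) {
  const int64_t prod = static_cast<int64_t>(x) * quantized_multiplier;
  const int total_shift = 31 + right_shift;
  const int64_t round = int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>((prod + round) >> total_shift);
}

uint8_t QuantizedSubOneElement(uint8_t a, uint8_t b) {
  // Example quantization parameters (hypothetical, for illustration only).
  const int32_t input1_offset = -128, input2_offset = -128;
  const int left_shift = 20;
  const int32_t input1_multiplier = 1 << 30, input2_multiplier = 1 << 30;
  const int32_t output_multiplier = 1 << 30;
  const int32_t output_offset = 128;

  // Center, widen, rescale each operand, subtract, rescale back, add offset.
  const int32_t v1 = (input1_offset + a) * (1 << left_shift);
  const int32_t v2 = (input2_offset + b) * (1 << left_shift);
  const int32_t s1 = RescaleQ31(v1, input1_multiplier, 0);
  const int32_t s2 = RescaleQ31(v2, input2_multiplier, 0);
  const int32_t raw =
      RescaleQ31(s1 - s2, output_multiplier, left_shift) + output_offset;
  // Clamp to the uint8_t range before narrowing.
  return static_cast<uint8_t>(std::min(255, std::max(0, raw)));
}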
 inline void SubElementwise(int size, const ArithmeticParams& params,
-                           const uint8* input1_data, const uint8* input2_data,
-                           uint8* output_data) {
+                           const uint8_t* input1_data,
+                           const uint8_t* input2_data, uint8_t* output_data) {
   TFLITE_DCHECK_GT(params.input1_offset, -256);
   TFLITE_DCHECK_GT(params.input2_offset, -256);
   TFLITE_DCHECK_LT(params.input1_offset, 256);
   TFLITE_DCHECK_LT(params.input2_offset, 256);

   for (int i = 0; i < size; ++i) {
-    const int32 input1_val = params.input1_offset + input1_data[i];
-    const int32 input2_val = params.input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32 scaled_input1_val =
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32 scaled_input2_val =
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32 raw_sub = scaled_input1_val - scaled_input2_val;
-    const int32 raw_output =
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             raw_sub, params.output_multiplier, params.output_shift) +
         params.output_offset;
-    const int32 clamped_output =
+    const int32_t clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
-    output_data[i] = static_cast<uint8>(clamped_output);
+    output_data[i] = static_cast<uint8_t>(clamped_output);
   }
 }

@@ -381,22 +381,22 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
   TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);

   for (int i = 0; i < size; ++i) {
-    const int32 input1_val = params.input1_offset + input1_data[i];
-    const int32 input2_val = params.input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
-    const int32 scaled_input1_val =
+    const int32_t input1_val = params.input1_offset + input1_data[i];
+    const int32_t input2_val = params.input2_offset + input2_data[i];
+    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32_t scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input1_val, params.input1_multiplier, params.input1_shift);
-    const int32 scaled_input2_val =
+    const int32_t scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             shifted_input2_val, params.input2_multiplier, params.input2_shift);
-    const int32 raw_sub = scaled_input1_val - scaled_input2_val;
-    const int32 raw_output =
+    const int32_t raw_sub = scaled_input1_val - scaled_input2_val;
+    const int32_t raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
             raw_sub, params.output_multiplier, params.output_shift) +
         params.output_offset;
-    const int32 clamped_output =
+    const int32_t clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
     output_data[i] = static_cast<int8_t>(clamped_output);
@@ -404,9 +404,9 @@ inline void SubElementwise(int size, const ArithmeticParams& params,
 }

 inline void Sub(const ArithmeticParams& params,
-                const RuntimeShape& input1_shape, const uint8* input1_data,
-                const RuntimeShape& input2_shape, const uint8* input2_data,
-                const RuntimeShape& output_shape, uint8* output_data) {
+                const RuntimeShape& input1_shape, const uint8_t* input1_data,
+                const RuntimeShape& input2_shape, const uint8_t* input2_data,
+                const RuntimeShape& output_shape, uint8_t* output_data) {
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   const int flat_size =
@@ -474,7 +474,8 @@ void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
 }

 inline void SetActivationMinMax(const ArithmeticParams& params,
-                                int32* activation_min, int32* activation_max) {
+                                int32_t* activation_min,
+                                int32_t* activation_max) {
   *activation_min = params.quantized_activation_min;
   *activation_max = params.quantized_activation_max;
 }
diff --git a/tensorflow/lite/kernels/internal/reference/svdf.h b/tensorflow/lite/kernels/internal/reference/svdf.h
index ffa46b8f422..bb986e4de0a 100644
--- a/tensorflow/lite/kernels/internal/reference/svdf.h
+++ b/tensorflow/lite/kernels/internal/reference/svdf.h
@@ -268,7 +268,7 @@ inline void EvalHybridSVDF(
     std::fill_n(scratch_ptr, batch_size * num_filters, 0.0f);

     if (!tensor_utils::IsZeroVector(input_ptr, batch_size * input_size)) {
-      // Quantize input from float to int8.
+      // Quantize input from float to int8_t.
       tensor_utils::BatchQuantizeFloats(input_ptr, batch_size, input_size,
                                         quantized_input_ptr,
                                         scaling_factors_ptr, zero_points_ptr,
diff --git a/tensorflow/lite/kernels/internal/reference/tanh.h b/tensorflow/lite/kernels/internal/reference/tanh.h
index 04c66989b48..3a05c474dd3 100644
--- a/tensorflow/lite/kernels/internal/reference/tanh.h
+++ b/tensorflow/lite/kernels/internal/reference/tanh.h
@@ -47,8 +47,8 @@ inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
 }

 inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
-                 const int16* input_data, const RuntimeShape& output_shape,
-                 int16* output_data) {
+                 const int16_t* input_data, const RuntimeShape& output_shape,
+                 int16_t* output_data) {
   const int input_left_shift = params.input_left_shift;
   // Support for shifts is limited until we have a parameterized version of
   // SaturatingRoundingMultiplyByPOT().
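The "Quantize input from float to int8_t" step in EvalHybridSVDF above refers to symmetric quantization of a float vector. A minimal sketch of that idea follows; the function name and signature are invented for illustration and are not the BatchQuantizeFloats API.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Scale each value by max|v|/127 so the largest magnitude maps to +/-127,
// then round and clamp into the int8_t range.
std::vector<int8_t> SymmetricQuantize(const std::vector<float>& values,
                                      float* scaling_factor) {
  float max_abs = 0.0f;
  for (float v : values) max_abs = std::max(max_abs, std::fabs(v));
  *scaling_factor = max_abs > 0.0f ? max_abs / 127.0f : 1.0f;
  std::vector<int8_t> quantized(values.size());
  for (std::size_t i = 0; i < values.size(); ++i) {
    const int32_t q =
        static_cast<int32_t>(std::round(values[i] / *scaling_factor));
    quantized[i] = static_cast<int8_t>(std::min(127, std::max(-128, q)));
  }
  return quantized;
}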
@@ -81,43 +81,43 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
 }

 inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
-                 const uint8* input_data, const RuntimeShape& output_shape,
-                 uint8* output_data) {
-  const int32 input_zero_point = params.input_zero_point;
-  const int32 input_range_radius = params.input_range_radius;
-  const int32 input_multiplier = params.input_multiplier;
+                 const uint8_t* input_data, const RuntimeShape& output_shape,
+                 uint8_t* output_data) {
+  const int32_t input_zero_point = params.input_zero_point;
+  const int32_t input_range_radius = params.input_range_radius;
+  const int32_t input_multiplier = params.input_multiplier;
   const int input_left_shift = params.input_left_shift;
-  const int32 output_zero_point = 128;
+  const int32_t output_zero_point = 128;
   const int flat_size = MatchingFlatSize(input_shape, output_shape);

   for (int i = 0; i < flat_size; i++) {
-    const uint8 input_val_u8 = input_data[i];
-    const int32 input_val_centered =
-        static_cast<int32>(input_val_u8) - input_zero_point;
-    uint8 output_val;
+    const uint8_t input_val_u8 = input_data[i];
+    const int32_t input_val_centered =
+        static_cast<int32_t>(input_val_u8) - input_zero_point;
+    uint8_t output_val;
     if (input_val_centered <= -input_range_radius) {
       output_val = 0;
     } else if (input_val_centered >= input_range_radius) {
       output_val = 255;
     } else {
-      const int32 input_val_rescaled =
+      const int32_t input_val_rescaled =
           MultiplyByQuantizedMultiplierGreaterThanOne(
               input_val_centered, input_multiplier, input_left_shift);
-      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      using FixedPoint4 = gemmlowp::FixedPoint<int32_t, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
       const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
       const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);

       // Convert from Q0.31 to Q24.7.
       using gemmlowp::RoundingDivideByPOT;
-      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+      int32_t output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
       output_val_s32 += output_zero_point;
       if (output_val_s32 == 256) {
         output_val_s32 = 255;
       }
-      // Reinterpret as Q0.7, encoded in uint8.
+      // Reinterpret as Q0.7, encoded in uint8_t.
       TFLITE_DCHECK_GE(output_val_s32, 0);
       TFLITE_DCHECK_LE(output_val_s32, 255);
-      output_val = static_cast<uint8>(output_val_s32);
+      output_val = static_cast<uint8_t>(output_val_s32);
     }
     output_data[i] = output_val;
   }
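As a float reference for the quantized Tanh above (the kernel itself works purely in fixed point; the helper name and parameters below are illustrative), the same mapping can be checked by dequantizing the uint8_t input, applying std::tanh, and requantizing with zero point 128 and scale 1/128.

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t TanhUint8Reference(uint8_t input, int32_t input_zero_point,
                           float input_scale) {
  // Dequantize, evaluate tanh in float, then map [-1, 1] onto [0, 256] with
  // zero point 128 and clamp so that +1.0 saturates to 255.
  const float x =
      input_scale * (static_cast<int32_t>(input) - input_zero_point);
  const int32_t q =
      128 + static_cast<int32_t>(std::round(std::tanh(x) * 128.0f));
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}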