Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/35985

This PR is one of the steps to extend 8-bit quantization to support symmetric 16-bit activations. Each activation is of type int16 and symmetric around zero. The weight tensor precision remains at 8-bit signed values. The bias is set to int64 precision.

In this PR we introduce the implementation and tests for the ADD/SUB reference kernel functions. The specification of these operators:

SUB
  Input 0:
    data_type  : int16
    range      : [-32768, 32767]
    granularity: per-tensor, zero_point=0
  Input 1:
    data_type  : int16
    range      : [-32768, 32767]
    granularity: per-tensor, zero_point=0
  Output 0:
    data_type  : int16
    range      : [-32768, 32767]
    granularity: per-tensor, zero_point=0

ADD
  Input 0:
    data_type  : int16
    range      : [-32768, 32767]
    granularity: per-tensor, zero_point=0
  Input 1:
    data_type  : int16
    range      : [-32768, 32767]
    granularity: per-tensor, zero_point=0
  Output 0:
    data_type  : int16
    range      : [-32768, 32767]
    granularity: per-tensor, zero_point=0

Copybara import of the project:

-- b94cb4732ab536828e565fd1c7b557f124432e29 by Elena Zhelezina <elena.zhelezina@arm.com>:
   Added 16-bit version of ADD/SUB operators. Broadcasting is included.

-- 924d0b72c568f249f2fd224a942f8922524bfede by Elena Zhelezina <elena.zhelezina@arm.com>:
   Addressed reviewer comments.

-- dd0d9e8f03d1fb1b887609fffb8ea5a86638c63e by Elena Zhelezina <elena.zhelezina@arm.com>:
   Added versioning to ADD/SUB + some rework of the existing code.

-- abae3fd9a9b894c07d13c9ef416092c9004bc913 by Elena Zhelezina <elena.zhelezina@arm.com>:
   Added versioning for ADD/SUB with a new option in schema.fbs. schema_generated.h is edited manually.

-- 24f3f5593a06d24fa1ca6be257f1265b5293d492 by Elena Zhelezina <elena.zhelezina@arm.com>:
   Fix for broken build.

-- d252fe175aef3a1a08c65155815efb706aa80afd by Elena Zhelezina <elena.zhelezina@arm.com>:
   Fix for the failing internal test for NN delegates.

-- 2223a5c380bb821eb05f8034703c687269353e32 by Elena Zhelezina <elena.zhelezina@arm.com>:
   Fix for asan failures.
   Change-Id: I2cf421ddda7f9e802202239136ab062bcd63b4aa

-- 3c219a46ce5888e8e402b64cc943ac6522156ef5 by Elena Zhelezina <elena.zhelezina@arm.com>:
   Added broadcast params to the addsub structure.
   Change-Id: I61d7d4a94087d052a782890799211031f6ed3015

-- 9131a38c776109cdbcfa60be602667ec7aafe00f by Elena Zhelezina <elena.zhelezina@arm.com>:
   Corrected defaults.
   Change-Id: I9ea50c75014cc03ac91fdef0f5b4fe11395f7074

PiperOrigin-RevId: 324865496
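As a quick illustration of the intended calling pattern for the new int16 path, here is a minimal, hypothetical sketch (not part of this PR's diff). It assumes both inputs already share the output scale, so both power-of-two shifts are zero; Int16AddExample is an invented name, and the includes reflect the usual TFLite source layout.

#include <cstdint>
#include <limits>

#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/kernels/internal/types.h"

void Int16AddExample() {
  const int16_t input1[4] = {100, -200, 300, -400};
  const int16_t input2[4] = {50, 50, -50, -50};
  int16_t output[4] = {};

  // Symmetric 16-bit quantization: zero_point is 0, so no offsets are set.
  // Only the fields read by the power-of-two path are populated here.
  tflite::ArithmeticParams params;
  params.input1_shift = 0;  // input1 scale == output scale
  params.input2_shift = 0;  // input2 scale == output scale
  params.quantized_activation_min = std::numeric_limits<int16_t>::min();
  params.quantized_activation_max = std::numeric_limits<int16_t>::max();

  const tflite::RuntimeShape shape({1, 1, 1, 4});
  // pot_scale defaults to true, selecting the power-of-two rescaling path.
  tflite::reference_ops::Add(params, shape, input1, shape, input2, shape,
                             output);
}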
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {

namespace reference_ops {

template <typename T>
inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] + input2_data[i], params.quantized_activation_min,
        params.quantized_activation_max);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const float* input1_data,
                const RuntimeShape& input2_shape, const float* input2_data,
                const RuntimeShape& output_shape, float* output_data) {
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; i++) {
    auto x = input1_data[i] + input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(
        x, params.float_activation_min, params.float_activation_max);
  }
}

// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.

// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void AddElementwise(int size, const ArithmeticParams& params,
                           const T* input1_data, const T* input2_data,
                           T* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
  TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
  TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
  TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());

  for (int i = 0; i < size; ++i) {
    const int32_t input1_val = params.input1_offset + input1_data[i];
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<T>(clamped_output);
  }
}

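// Added illustrative note (not from the original source): for the uint8 path
// add.cc chooses left_shift = 20, so an offset-adjusted input of at most
// +/-255 becomes at most roughly +/-2^28 after shifting; the per-input
// multipliers are effectively < 1, so the two scaled values sum to well under
// 2^31 and the int32 accumulator above cannot overflow. The int16 path uses
// left_shift = 15 with zero offsets, and the same headroom argument applies.
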
// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
                               uint8_t input1_data, const uint8_t* input2_data,
                               uint8_t* output_data) {
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  const int32_t input1_val = params.input1_offset + input1_data;
  const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  const int32_t scaled_input1_val =
      MultiplyByQuantizedMultiplierSmallerThanOneExp(
          shifted_input1_val, params.input1_multiplier, params.input1_shift);
  for (int i = 0; i < size; ++i) {
    const int32_t input2_val = params.input2_offset + input2_data[i];
    const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32_t scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
    const int32_t raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32_t clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8_t>(clamped_output);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8_t* input1_data,
                const RuntimeShape& input2_shape, const uint8_t* input2_data,
                const RuntimeShape& output_shape, uint8_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

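// Added usage sketch (hedged, not part of the original header): callers such
// as add.cc typically populate ArithmeticParams along these lines before
// invoking the uint8 Add above:
//
//   ArithmeticParams op_params;
//   op_params.left_shift = 20;                     // headroom for rescaling
//   op_params.input1_offset = -input1_zero_point;  // input offsets are the
//   op_params.input2_offset = -input2_zero_point;  // negated zero points
//   op_params.output_offset = output_zero_point;
//   // The per-input and output multipliers/shifts are derived from the input
//   // and output scales (e.g. via QuantizeMultiplierSmallerThanOneExp).
//   op_params.quantized_activation_min = 0;
//   op_params.quantized_activation_max = 255;
//   Add(op_params, input1_shape, input1_data, input2_shape, input2_data,
//       output_shape, output_data);
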
inline void AddGeneralParamScale(const ArithmeticParams& params,
                                 const RuntimeShape& input1_shape,
                                 const int16_t* input1_data,
                                 const RuntimeShape& input2_shape,
                                 const int16_t* input2_data,
                                 const RuntimeShape& output_shape,
                                 int16_t* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  int max_value = std::numeric_limits<int16_t>::max();

  TFLITE_DCHECK_GT(params.input1_offset, -max_value);
  TFLITE_DCHECK_GT(params.input2_offset, -max_value);
  TFLITE_DCHECK_LT(params.input1_offset, max_value);
  TFLITE_DCHECK_LT(params.input2_offset, max_value);
  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16_t* input1_data,
                const RuntimeShape& input2_shape, const int16_t* input2_data,
                const RuntimeShape& output_shape, int16_t* output_data,
                bool pot_scale = true) {
  if (!pot_scale) {
    AddGeneralParamScale(params, input1_shape, input1_data, input2_shape,
                         input2_data, output_shape, output_data);
    return;
  }

  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int input1_shift = params.input1_shift;
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  const int16_t output_activation_min = params.quantized_activation_min;
  const int16_t output_activation_max = params.quantized_activation_max;

  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
  TFLITE_DCHECK_LE(input1_shift, 0);
  TFLITE_DCHECK_LE(params.input2_shift, 0);
  const int16_t* not_shift_input =
      input1_shift == 0 ? input1_data : input2_data;
  const int16_t* shift_input = input1_shift == 0 ? input2_data : input1_data;
  const int input_right_shift =
      input1_shift == 0 ? -params.input2_shift : -input1_shift;

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
    F0 scaled_input = F0::FromRaw(
        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
    const int16_t raw_output = result.raw();
    const int16_t clamped_output = std::min(
        output_activation_max, std::max(output_activation_min, raw_output));
    output_data[i] = clamped_output;
  }
}

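// Added worked example (illustrative, not from the original source) of the
// power-of-two path above: if input2's scale is half the output scale, the
// caller passes input1_shift = 0 and input2_shift = -1. input1 is then used
// as-is, input2 is divided by 2^1 with rounding via RoundingDivideByPOT, and
// the two values, interpreted as Q0.15 fixed point, are combined with a
// saturating add so the result stays inside the int16 range before clamping.
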
// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const float* input1_data,
                               const RuntimeShape& input2_shape,
                               const float* input2_data,
                               const RuntimeShape& output_shape,
                               float* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  params.float_activation_min, params.float_activation_max);
        }
      }
    }
  }
}

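// Added illustrative note (not from the original source): with the NHWC layout
// described above, Offset(shape, b, y, x, c) resolves to
// ((b * H + y) * W + x) * C + c, so the innermost loop over c walks contiguous
// memory in the output.
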
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const int32_t* input1_data,
                               const RuntimeShape& input2_shape,
                               const int32_t* input2_data,
                               const RuntimeShape& output_shape,
                               int32_t* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  params.quantized_activation_min,
                  params.quantized_activation_max);
        }
      }
    }
  }
}

// This function is used for 8-bit as well as for 16-bit, but the accumulator
// is 32-bit for both cases. The overflow does not happen due to the
// choice of the shift (20 or 15, accordingly - see add.cc for more comments).
template <typename T>
inline void BroadcastAdd4DSlow(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          const int32_t input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32_t input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          const int32_t shifted_input1_val =
              input1_val * (1 << params.left_shift);
          const int32_t shifted_input2_val =
              input2_val * (1 << params.left_shift);
          const int32_t scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input1_val, params.input1_multiplier,
                  params.input1_shift);
          const int32_t scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  shifted_input2_val, params.input2_multiplier,
                  params.input2_shift);
          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
          const int32_t raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  raw_sum, params.output_multiplier, params.output_shift) +
              params.output_offset;
          const int32_t clamped_output =
              std::min(params.quantized_activation_max,
                       std::max(params.quantized_activation_min, raw_output));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<T>(clamped_output);
        }
      }
    }
  }
}

inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8_t* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8_t* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8_t* output_data) {
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
  switched_params.input1_shift = unswitched_params.input2_shift;
  switched_params.input2_offset = unswitched_params.input1_offset;
  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
  switched_params.input2_shift = unswitched_params.input1_shift;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8_t* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8_t* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise add of
  // sections of the arrays.
  uint8_t* output_data_ptr = output_data;
  const uint8_t* input1_data_ptr = input1_data;
  const uint8_t* input2_data_reset = input2_data;
  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
  // between input shapes. y3 for input 1 is always broadcast, and so the
  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
  // Put another way,
  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  if (y4 > 1) {
    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
    // dimension.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8_t* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          for (int i3 = 0; i3 < y3; ++i3) {
            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                           output_data_ptr);
            input2_data_ptr += y4;
            output_data_ptr += y4;
          }
          // We have broadcast y4 of input1 data y3 times, and now move on.
          input1_data_ptr += y4;
        }
      }
      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
      input2_data_reset = input2_data_ptr;
    }
  } else {
    // Special case of y4 == 1, in which the innermost loop is a single element
    // and can be combined with the next (y3) as an inner broadcast.
    //
    // Note that this handles the case of pure scalar broadcast when
    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
    // broadcast with batch (as y2 > 1).
    //
    // NOTE The process is the same as the above general case except simplified
    // for y4 == 1 and the loop over y3 is contained within the
    // AddScalarBroadcast function.
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8_t* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
                             output_data_ptr);
          input2_data_ptr += y3;
          output_data_ptr += y3;
          input1_data_ptr += 1;
        }
      }
      input2_data_reset = input2_data_ptr;
    }
  }
}

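// Added illustrative note (not from the original source): for a typical
// broadcast such as adding a [1, 1, 1, C] vector to an [N, H, W, C] tensor,
// the fivefold decomposition gives a non-broadcast inner run of y4 == C
// elements, so the loop above calls AddElementwise on C contiguous values at a
// time and simply rewinds one operand's pointer between runs, instead of
// recomputing a 4-D index per element as BroadcastAdd4DSlow does.
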
}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_