Migrate the float/uint8 broadcast fivefold add implementations to use the shared, templated BinaryBroadcastFiveFold helper.
PiperOrigin-RevId: 313707010 Change-Id: I800cbc4406bad709cedecbfa0e41b4e465254f75
Commit: 4c674a64c8
Parent: fba1187eda
@@ -234,6 +234,100 @@ inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
 }
 #endif
 
+template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
+inline void BinaryBroadcastFiveFold(const ArithmeticParams& unswitched_params,
+                                    const RuntimeShape& unswitched_input1_shape,
+                                    const T* unswitched_input1_data,
+                                    const RuntimeShape& unswitched_input2_shape,
+                                    const T* unswitched_input2_data,
+                                    const RuntimeShape& output_shape,
+                                    T* output_data, ElementwiseF elementwise_f,
+                                    ScalarBroadcastF scalar_broadcast_f) {
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+  switched_params.input1_shift = unswitched_params.input2_shift;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+  switched_params.input2_shift = unswitched_params.input1_shift;
+
+  const bool use_unswitched =
+      unswitched_params.broadcast_category ==
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams& params =
+      use_unswitched ? unswitched_params : switched_params;
+  const T* input1_data =
+      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const T* input2_data =
+      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop is an elementwise add of
+  // sections of the arrays.
+  T* output_data_ptr = output_data;
+  const T* input1_data_ptr = input1_data;
+  const T* input2_data_reset = input2_data;
+  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+  // between input shapes. y3 for input 1 is always broadcast, and so the
+  // dimension there is 1, whereas optionally y1 might be broadcast for
+  // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4,
+  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  if (y4 > 1) {
+    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+    // dimension.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const T* input2_data_ptr = nullptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          for (int i3 = 0; i3 < y3; ++i3) {
+            elementwise_f(y4, params, input1_data_ptr, input2_data_ptr,
+                          output_data_ptr);
+            input2_data_ptr += y4;
+            output_data_ptr += y4;
+          }
+          // We have broadcast y4 of input1 data y3 times, and now move on.
+          input1_data_ptr += y4;
+        }
+      }
+      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
+      input2_data_reset = input2_data_ptr;
+    }
+  } else {
+    // Special case of y4 == 1, in which the innermost loop is a single
+    // element and can be combined with the next (y3) as an inner broadcast.
+    //
+    // Note that this handles the case of pure scalar broadcast when
+    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
+    // broadcast with batch (as y2 > 1).
+    //
+    // NOTE The process is the same as the above general case except
+    // simplified for y4 == 1 and the loop over y3 is contained within the
+    // AddScalarBroadcast function.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const T* input2_data_ptr = nullptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr,
+                             output_data_ptr);
+          input2_data_ptr += y3;
+          output_data_ptr += y3;
+          input1_data_ptr += 1;
+        }
+      }
+      input2_data_reset = input2_data_ptr;
+    }
+  }
+}
+
 inline void AddBiasAndEvalActivationFunction(float output_activation_min,
                                              float output_activation_max,
                                              const RuntimeShape& bias_shape,
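For orientation, here is a minimal sketch (not part of the diff) of what the two callback parameters are expected to look like. The signatures mirror the static_cast targets used in BroadcastAddDispatch further down; the Toy* names and the ToyParams stub are hypothetical placeholders, while ArithmeticParams, AddElementwise, and AddScalarBroadcast are the real names this code uses.

// Sketch only: stand-in for tflite::ArithmeticParams (offsets, multipliers,
// broadcast_shape, ...).
struct ToyParams {};

// ElementwiseF: combines `size` contiguous elements of input1 and input2.
inline void ToyAddElementwise(int size, const ToyParams& /*params*/,
                              const float* input1, const float* input2,
                              float* output) {
  for (int i = 0; i < size; ++i) output[i] = input1[i] + input2[i];
}

// ScalarBroadcastF: broadcasts a single input1 value across `size` elements
// of input2. This is used on the y4 == 1 fast path, where the loop over y3
// lives inside the callback rather than in BinaryBroadcastFiveFold itself.
inline void ToyAddScalarBroadcast(int size, const ToyParams& /*params*/,
                                  float input1_value, const float* input2,
                                  float* output) {
  for (int i = 0; i < size; ++i) output[i] = input1_value + input2[i];
}

With the real types, a call looks roughly like BinaryBroadcastFiveFold(params, shape1, data1, shape2, data2, output_shape, output_data, AddElementwise, AddScalarBroadcast), with static_casts to pick the right overloads, as the BroadcastAddDispatch hunk below shows.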
@@ -2073,186 +2167,6 @@ inline void Add(const ArithmeticParams& params,
   output_map = output_map.cwiseMin(params.quantized_activation_max);
 }
 
-inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
-                                 const RuntimeShape& unswitched_input1_shape,
-                                 const uint8* unswitched_input1_data,
-                                 const RuntimeShape& unswitched_input2_shape,
-                                 const uint8* unswitched_input2_data,
-                                 const RuntimeShape& output_shape,
-                                 uint8* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastAddFivefold/8bit");
-
-  ArithmeticParams switched_params = unswitched_params;
-  switched_params.input1_offset = unswitched_params.input2_offset;
-  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
-  switched_params.input1_shift = unswitched_params.input2_shift;
-  switched_params.input2_offset = unswitched_params.input1_offset;
-  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
-  switched_params.input2_shift = unswitched_params.input1_shift;
-
-  const bool use_unswitched =
-      unswitched_params.broadcast_category ==
-      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
-
-  const ArithmeticParams& params =
-      use_unswitched ? unswitched_params : switched_params;
-  const uint8* input1_data =
-      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
-  const uint8* input2_data =
-      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
-
-  // Fivefold nested loops. The second input resets its position for each
-  // iteration of the second loop. The first input resets its position at the
-  // beginning of the fourth loop. The innermost loop is an elementwise add of
-  // sections of the arrays.
-  uint8* output_data_ptr = output_data;
-  const uint8* input1_data_ptr = input1_data;
-  const uint8* input2_data_reset = input2_data;
-  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
-  // between input shapes. y3 for input 1 is always broadcast, and so the
-  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
-  // Put another way,
-  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
-  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
-  int y0 = params.broadcast_shape[0];
-  int y1 = params.broadcast_shape[1];
-  int y2 = params.broadcast_shape[2];
-  int y3 = params.broadcast_shape[3];
-  int y4 = params.broadcast_shape[4];
-  if (y4 > 1) {
-    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
-    // dimension.
-    for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr = nullptr;
-      for (int i1 = 0; i1 < y1; ++i1) {
-        input2_data_ptr = input2_data_reset;
-        for (int i2 = 0; i2 < y2; ++i2) {
-          for (int i3 = 0; i3 < y3; ++i3) {
-            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
-                           output_data_ptr);
-            input2_data_ptr += y4;
-            output_data_ptr += y4;
-          }
-          // We have broadcast y4 of input1 data y3 times, and now move on.
-          input1_data_ptr += y4;
-        }
-      }
-      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
-      input2_data_reset = input2_data_ptr;
-    }
-  } else {
-    // Special case of y4 == 1, in which the innermost loop is a single element
-    // and can be combined with the next (y3) as an inner broadcast.
-    //
-    // Note that this handles the case of pure scalar broadcast when
-    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
-    // broadcast with batch (as y2 > 1).
-    //
-    // NOTE The process is the same as the above general case except simplified
-    // for y4 == 1 and the loop over y3 is contained within the
-    // AddScalarBroadcast function.
-    for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr = nullptr;
-      for (int i1 = 0; i1 < y1; ++i1) {
-        input2_data_ptr = input2_data_reset;
-        for (int i2 = 0; i2 < y2; ++i2) {
-          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
-                             output_data_ptr);
-          input2_data_ptr += y3;
-          output_data_ptr += y3;
-          input1_data_ptr += 1;
-        }
-      }
-      input2_data_reset = input2_data_ptr;
-    }
-  }
-}
-
-inline void BroadcastAddFivefold(const ArithmeticParams& params,
-                                 const RuntimeShape& unswitched_input1_shape,
-                                 const float* unswitched_input1_data,
-                                 const RuntimeShape& unswitched_input2_shape,
-                                 const float* unswitched_input2_data,
-                                 const RuntimeShape& output_shape,
-                                 float* output_data) {
-  ruy::profiler::ScopeLabel label("BroadcastAddFivefold/float");
-
-  const bool use_unswitched =
-      params.broadcast_category ==
-      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
-
-  const float* input1_data =
-      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
-  const float* input2_data =
-      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
-
-  // Fivefold nested loops. The second input resets its position for each
-  // iteration of the second loop. The first input resets its position at the
-  // beginning of the fourth loop. The innermost loop is an elementwise add of
-  // sections of the arrays.
-  float* output_data_ptr = output_data;
-  const float* input1_data_ptr = input1_data;
-  const float* input2_data_reset = input2_data;
-  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
-  // between input shapes. y3 for input 1 is always broadcast, and so the
-  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
-  // Put another way,
-  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
-  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
-  int y0 = params.broadcast_shape[0];
-  int y1 = params.broadcast_shape[1];
-  int y2 = params.broadcast_shape[2];
-  int y3 = params.broadcast_shape[3];
-  int y4 = params.broadcast_shape[4];
-  if (y4 > 1) {
-    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
-    // dimension.
-    for (int i0 = 0; i0 < y0; ++i0) {
-      const float* input2_data_ptr = nullptr;
-      for (int i1 = 0; i1 < y1; ++i1) {
-        input2_data_ptr = input2_data_reset;
-        for (int i2 = 0; i2 < y2; ++i2) {
-          for (int i3 = 0; i3 < y3; ++i3) {
-            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
-                           output_data_ptr);
-            input2_data_ptr += y4;
-            output_data_ptr += y4;
-          }
-          // We have broadcast y4 of input1 data y3 times, and now move on.
-          input1_data_ptr += y4;
-        }
-      }
-      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
-      input2_data_reset = input2_data_ptr;
-    }
-  } else {
-    // Special case of y4 == 1, in which the innermost loop is a single element
-    // and can be combined with the next (y3) as an inner broadcast.
-    //
-    // Note that this handles the case of pure scalar broadcast when
-    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
-    // broadcast with batch (as y2 > 1).
-    //
-    // NOTE The process is the same as the above general case except simplified
-    // for y4 == 1 and the loop over y3 is contained within the
-    // AddScalarBroadcast function.
-    for (int i0 = 0; i0 < y0; ++i0) {
-      const float* input2_data_ptr = nullptr;
-      for (int i1 = 0; i1 < y1; ++i1) {
-        input2_data_ptr = input2_data_reset;
-        for (int i2 = 0; i2 < y2; ++i2) {
-          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
-                             output_data_ptr);
-          input2_data_ptr += y3;
-          output_data_ptr += y3;
-          input1_data_ptr += 1;
-        }
-      }
-      input2_data_reset = input2_data_ptr;
-    }
-  }
-}
-
 template <typename T>
 inline void BroadcastAddDispatch(
     const ArithmeticParams& params, const RuntimeShape& input1_shape,
@@ -2263,8 +2177,37 @@ inline void BroadcastAddDispatch(
                               input2_data, output_shape, output_data);
   }
 
-  BroadcastAddFivefold(params, input1_shape, input1_data, input2_shape,
-                       input2_data, output_shape, output_data);
+  BinaryBroadcastFiveFold(
+      params, input1_shape, input1_data, input2_shape, input2_data,
+      output_shape, output_data,
+      static_cast<void (*)(int, const ArithmeticParams&, const T*, const T*,
+                           T*)>(AddElementwise),
+      static_cast<void (*)(int, const ArithmeticParams&, T, const T*, T*)>(
+          AddScalarBroadcast));
+}
+
+inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const uint8* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const uint8* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
+  BroadcastAddDispatch(unswitched_params, unswitched_input1_shape,
+                       unswitched_input1_data, unswitched_input2_shape,
+                       unswitched_input2_data, output_shape, output_data);
+}
+
+inline void BroadcastAddFivefold(const ArithmeticParams& params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const float* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const float* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 float* output_data) {
+  BroadcastAddDispatch(params, unswitched_input1_shape, unswitched_input1_data,
+                       unswitched_input2_shape, unswitched_input2_data,
+                       output_shape, output_data);
 }
 
 inline void MulElementwise(int size, const ArithmeticParams& params,
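A note on the static_cast wrappers above (explanatory, not from the diff): AddElementwise and AddScalarBroadcast are overloaded per element type, and an overloaded name cannot be deduced into the ElementwiseF / ScalarBroadcastF template parameters directly, so the casts pin down which overload is passed. A minimal, self-contained sketch of the same pattern, with hypothetical names:

#include <cstdio>

// Two overloads sharing one name, like the per-type AddElementwise overloads.
void Twice(int n, const float* in, float* out) {
  for (int i = 0; i < n; ++i) out[i] = 2.0f * in[i];
}
void Twice(int n, const int* in, int* out) {
  for (int i = 0; i < n; ++i) out[i] = 2 * in[i];
}

// A function template taking the callback as a deduced parameter, as
// BinaryBroadcastFiveFold does.
template <typename Fn, typename T>
void ApplyOnce(Fn fn, int n, const T* in, T* out) {
  fn(n, in, out);
}

int main() {
  float in[3] = {1.f, 2.f, 3.f};
  float out[3];
  // ApplyOnce(Twice, 3, in, out);  // error: `Twice` names an overload set
  ApplyOnce(static_cast<void (*)(int, const float*, float*)>(Twice), 3, in,
            out);  // OK: the cast selects one overload
  std::printf("%g %g %g\n", out[0], out[1], out[2]);  // prints: 2 4 6
  return 0;
}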
@@ -7979,101 +7922,6 @@ inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params,
   }
 }
 
-template <typename ElementwiseF, typename ScalarBroadcastF>
-inline void BinaryBroadcastFiveFold(const ArithmeticParams& unswitched_params,
-                                    const RuntimeShape& unswitched_input1_shape,
-                                    const int8* unswitched_input1_data,
-                                    const RuntimeShape& unswitched_input2_shape,
-                                    const int8* unswitched_input2_data,
-                                    const RuntimeShape& output_shape,
-                                    int8* output_data,
-                                    ElementwiseF elementwise_f,
-                                    ScalarBroadcastF scalar_broadcast_f) {
-  ArithmeticParams switched_params = unswitched_params;
-  switched_params.input1_offset = unswitched_params.input2_offset;
-  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
-  switched_params.input1_shift = unswitched_params.input2_shift;
-  switched_params.input2_offset = unswitched_params.input1_offset;
-  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
-  switched_params.input2_shift = unswitched_params.input1_shift;
-
-  const bool use_unswitched =
-      unswitched_params.broadcast_category ==
-      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
-
-  const ArithmeticParams& params =
-      use_unswitched ? unswitched_params : switched_params;
-  const int8* input1_data =
-      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
-  const int8* input2_data =
-      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
-
-  // Fivefold nested loops. The second input resets its position for each
-  // iteration of the second loop. The first input resets its position at the
-  // beginning of the fourth loop. The innermost loop is an elementwise add of
-  // sections of the arrays.
-  int8* output_data_ptr = output_data;
-  const int8* input1_data_ptr = input1_data;
-  const int8* input2_data_reset = input2_data;
-  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
-  // between input shapes. y3 for input 1 is always broadcast, and so the
-  // dimension there is 1, whereas optionally y1 might be broadcast for
-  // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4,
-  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
-  int y0 = params.broadcast_shape[0];
-  int y1 = params.broadcast_shape[1];
-  int y2 = params.broadcast_shape[2];
-  int y3 = params.broadcast_shape[3];
-  int y4 = params.broadcast_shape[4];
-  if (y4 > 1) {
-    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
-    // dimension.
-    for (int i0 = 0; i0 < y0; ++i0) {
-      const int8* input2_data_ptr = nullptr;
-      for (int i1 = 0; i1 < y1; ++i1) {
-        input2_data_ptr = input2_data_reset;
-        for (int i2 = 0; i2 < y2; ++i2) {
-          for (int i3 = 0; i3 < y3; ++i3) {
-            elementwise_f(y4, params, input1_data_ptr, input2_data_ptr,
-                          output_data_ptr);
-            input2_data_ptr += y4;
-            output_data_ptr += y4;
-          }
-          // We have broadcast y4 of input1 data y3 times, and now move on.
-          input1_data_ptr += y4;
-        }
-      }
-      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
-      input2_data_reset = input2_data_ptr;
-    }
-  } else {
-    // Special case of y4 == 1, in which the innermost loop is a single
-    // element and can be combined with the next (y3) as an inner broadcast.
-    //
-    // Note that this handles the case of pure scalar broadcast when
-    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
-    // broadcast with batch (as y2 > 1).
-    //
-    // NOTE The process is the same as the above general case except
-    // simplified for y4 == 1 and the loop over y3 is contained within the
-    // AddScalarBroadcast function.
-    for (int i0 = 0; i0 < y0; ++i0) {
-      const int8* input2_data_ptr = nullptr;
-      for (int i1 = 0; i1 < y1; ++i1) {
-        input2_data_ptr = input2_data_reset;
-        for (int i2 = 0; i2 < y2; ++i2) {
-          scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr,
-                             output_data_ptr);
-          input2_data_ptr += y3;
-          output_data_ptr += y3;
-          input1_data_ptr += 1;
-        }
-      }
-      input2_data_reset = input2_data_ptr;
-    }
-  }
-}
-
 template <typename Op>
 inline void BroadcastMaximumDispatch(const ArithmeticParams& params,
                                      const RuntimeShape& input1_shape,
@@ -76,6 +76,10 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
           BroadcastableOpCategory::kFirstInputBroadcastsFast &&
       params->broadcast_category !=
           BroadcastableOpCategory::kSecondInputBroadcastsFast) {
+    // This is unreachable because at least one else clause in the above loop
+    // must be reached.
+    TFLITE_DCHECK(false);
+    params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
     return false;
   }
 
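For intuition about the five values ProcessBroadcastShapes produces and the fivefold loops consume, here is a small worked example (illustrative only, not taken from the diff). With

  input1 shape: (2, 3, 4, 1, 5)   FlatSize = 2 * 3 * 4 * 5     = y0 * y1 * y2 * y4 = 120
  input2 shape: (2, 1, 4, 6, 5)   FlatSize = 2 * 4 * 6 * 5     = y0 * y2 * y3 * y4 = 240
  output shape: (2, 3, 4, 6, 5)   FlatSize = 2 * 3 * 4 * 6 * 5 = 720

the broadcast_shape entries are y0 = 2, y1 = 3, y2 = 4, y3 = 6, y4 = 5: y0, y2 and y4 are shared, input1 broadcasts along y3, and input2 broadcasts along y1. Since y4 > 1, BinaryBroadcastFiveFold takes the general path and calls elementwise_f on contiguous runs of y4 = 5 elements; if the trailing dimensions were both 1 (y4 == 1), it would instead call scalar_broadcast_f on runs of y3 elements.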
@@ -326,11 +326,11 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
       TF_LITE_SUB(reference_ops, Add, uint8_t);
     }
   } else {
-    if (op_params.broadcast_category ==
-        BroadcastableOpCategory::kGenericBroadcast) {
-      TF_LITE_SUB(optimized_ops, BroadcastAdd4DSlow, uint8_t);
-    } else if (need_broadcast) {
-      TF_LITE_SUB(optimized_ops, BroadcastAddFivefold, uint8_t);
+    if (need_broadcast) {
+      optimized_ops::BroadcastAddDispatch(
+          op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1),
+          GetTensorShape(input2), GetTensorData<uint8_t>(input2),
+          GetTensorShape(output), GetTensorData<uint8_t>(output));
     } else {
       TF_LITE_SUB(optimized_ops, Add, uint8_t);
     }
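Since BroadcastAddDispatch also covers the float path (the float BroadcastAddFivefold overload above forwards to it), a kernel's float branch could call it the same way. A hypothetical sketch, not part of this diff, assuming the same op_params and input/output tensors as the uint8 case above:

      // Hypothetical float analog of the uint8 call above; not taken from the diff.
      optimized_ops::BroadcastAddDispatch(
          op_params, GetTensorShape(input1), GetTensorData<float>(input1),
          GetTensorShape(input2), GetTensorData<float>(input2),
          GetTensorShape(output), GetTensorData<float>(output));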