Introduce fivefold broadcast Mul op kernels for uint8.
PiperOrigin-RevId: 208709648
Parent: af827be63a
Commit: ab53f852b7
@@ -294,6 +294,37 @@ void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
               output_data);
}

inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  BroadcastMul4DSlow(
      input1_data, input1_dims, input1_offset, input2_data, input2_dims,
      input2_offset, output_offset, output_multiplier,
      // This legacy version switches the sign of the output shift.
      kReverseShift * output_shift,
      // (Break to highlight preceding line.)
      output_activation_min, output_activation_max, output_data, output_dims);
}
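
The wrapper above bridges two shift conventions: legacy callers pass output_shift as a positive right-shift amount, while the new kernels take a signed left-shift exponent, and kReverseShift converts between them. A minimal sketch of the conversion, assuming kReverseShift == -1 as defined in TFLite's types.h, with illustrative values:

    // Sketch only; the numbers are assumptions, not from this commit.
    // Suppose QuantizeMultiplierSmallerThanOneExp produced a left-shift
    // exponent of -7 for some real multiplier < 1.
    const int kReverseShift = -1;   // matches TFLite's definition
    int legacy_output_shift = 7;    // old convention: positive right shift
    int new_output_shift = kReverseShift * legacy_output_shift;  // -7
    // The new kernels then right-shift by -new_output_shift == 7 inside
    // MultiplyByQuantizedMultiplierSmallerThanOneExp.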

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
               input2_dims, input2_offset, output_offset, output_multiplier,
               output_shift, output_activation_min, output_activation_max,
               output_data, output_dims);
}

inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
                        int stride_width, int stride_height, int pad_width,
                        int pad_height, int kwidth, int kheight,
@@ -47,6 +47,7 @@ using reference_ops::BroadcastGreater;
using reference_ops::BroadcastGreaterEqual;
using reference_ops::BroadcastLess;
using reference_ops::BroadcastLessEqual;
using reference_ops::BroadcastMul4DSlow;
using reference_ops::BroadcastSub4DSlow;
using reference_ops::Concatenation;
using reference_ops::DepthConcatenation;
@@ -2904,66 +2905,128 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
               output_dims);
}

inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          const int32 input1_val =
              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
          const int32 unclamped_result =
              output_offset + MultiplyByQuantizedMultiplierSmallerThanOneExp(
                                  input1_val * input2_val, output_multiplier,
                                  kReverseShift * output_shift);
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, unclamped_result));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
inline void MulElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  for (int i = 0; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
                                                       params.output_multiplier,
                                                       params.output_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}
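
MulElementwise does the whole quantized multiply in int32: the offsets recenter the uint8 inputs, the Q31 multiplier/shift pair rescales the raw product into the output's scale, and the activation bounds clamp the result. A self-contained sketch of that arithmetic with illustrative parameters; the helper below is a simplified stand-in for MultiplyByQuantizedMultiplierSmallerThanOneExp (plain rounding, no saturation), not TFLite's implementation:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Multiply by a Q31 fixed-point multiplier, then rounding-right-shift by
    // -shift. Assumes shift < 0 (a signed left-shift exponent).
    int32_t MultiplyByQ31(int32_t x, int32_t multiplier, int shift) {
      int64_t prod = (static_cast<int64_t>(x) * multiplier + (1LL << 30)) >> 31;
      const int right = -shift;
      return static_cast<int32_t>((prod + (1LL << (right - 1))) >> right);
    }

    int main() {
      // Hypothetical quantization parameters for one uint8 Mul.
      const int32_t input1_offset = -128, input2_offset = -128;
      const int32_t output_offset = 128;
      const int32_t output_multiplier = 1208925819;  // roughly 0.563 in Q31
      const int output_shift = -7;                   // signed left-shift exponent
      const int32_t act_min = 0, act_max = 255;

      const uint8_t in1 = 200, in2 = 50;
      const int32_t v1 = input1_offset + in1;  // 72
      const int32_t v2 = input2_offset + in2;  // -78
      const int32_t unclamped =
          output_offset + MultiplyByQ31(v1 * v2, output_multiplier, output_shift);
      const int32_t out = std::min(act_max, std::max(act_min, unclamped));
      std::printf("quantized product byte: %d\n", out);  // 103 with these values
      return 0;
    }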

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
               input2_dims, input2_offset, output_offset, output_multiplier,
               output_shift, output_activation_min, output_activation_max,
               output_data, output_dims);
}

// Broadcast mul that can often be used for inner loop of broadcast Mul.
inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
                               const uint8 broadcast_value,
                               const uint8* input2_data, uint8* output_data) {
  const int32 input1_val = params.input1_offset + broadcast_value;

  for (int i = 0; i < size; ++i) {
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
                                                       params.output_multiplier,
                                                       params.output_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}
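
MulSimpleBroadcast is the degenerate-inner-run companion to MulElementwise: when the innermost matching run of the fivefold decomposition collapses to a single element (y4 == 1), each value of the first input becomes a scalar multiplied against a whole run of the second input, which is exactly how the y4 == 1 branch of BroadcastMulFivefold below uses it.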

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8* input1_data,
                const RuntimeShape& input2_shape, const uint8* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);

  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMulFivefold/8bit");

  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input2_offset = unswitched_params.input1_offset;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
  // sections of the arrays.
  uint8* output_data_ptr = output_data;
  const uint8* input1_data_ptr = input1_data;
  const uint8* input2_data_reset = input2_data;
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  if (y4 > 1) {
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          for (int i3 = 0; i3 < y3; ++i3) {
            MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                           output_data_ptr);
            input2_data_ptr += y4;
            output_data_ptr += y4;
          }
          input1_data_ptr += y4;
        }
      }
      input2_data_reset = input2_data_ptr;
    }
  } else {
    for (int i0 = 0; i0 < y0; ++i0) {
      const uint8* input2_data_ptr;
      for (int i1 = 0; i1 < y1; ++i1) {
        input2_data_ptr = input2_data_reset;
        for (int i2 = 0; i2 < y2; ++i2) {
          MulSimpleBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
                             output_data_ptr);
          input2_data_ptr += y3;
          output_data_ptr += y3;
          ++input1_data_ptr;
        }
      }
      input2_data_reset = input2_data_ptr;
    }
  }
}
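
The five counters in params.broadcast_shape describe alternating matching and broadcast runs of the two flattened inputs; this decomposition is assumed to be precomputed wherever broadcast_category is set, outside these kernels. A worked example, with shapes chosen purely for illustration:

    // Illustrative decomposition (values assumed, not from this commit):
    //   input1 shape [2, 3, 1, 5], input2 shape [2, 1, 4, 5]
    //   -> output shape [2, 3, 4, 5]
    // One consistent fivefold assignment is
    //   y0 = 2, y1 = 3, y2 = 1, y3 = 4, y4 = 5
    // so input1 holds y0*y1*y2*y4 = 30 values and input2 holds
    // y0*y2*y3*y4 = 40 values, matching their flat sizes. In the loops
    // above, each y4 = 5 run of input1 is reused across the y3 = 4 inner
    // blocks (input1's third dimension is 1), and input2 rewinds for each
    // of the y1 = 3 middle iterations (input2's second dimension is 1).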

// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
@@ -316,6 +316,37 @@ inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
                  DimsToShape(output_dims), output_data);
}

inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  BroadcastMul4DSlow(
      input1_data, input1_dims, input1_offset, input2_data, input2_dims,
      input2_offset, output_offset, output_multiplier,
      //
      kReverseShift * output_shift,
      //
      output_activation_min, output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
               input2_dims, input2_offset, output_offset, output_multiplier,
               output_shift, output_activation_min, output_activation_max,
               output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const float* input_data, const Dims<4>& input_dims,
@@ -1374,13 +1374,143 @@ void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
               output_dims);
}

inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
inline void MulElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  for (int i = 0; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
                                                       params.output_multiplier,
                                                       params.output_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8* input1_data,
                const RuntimeShape& input2_shape, const uint8* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);

  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  ArithmeticParams switched_params = unswitched_params;
  switched_params.input1_offset = unswitched_params.input2_offset;
  switched_params.input2_offset = unswitched_params.input1_offset;

  const bool use_unswitched =
      unswitched_params.broadcast_category ==
      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;

  const ArithmeticParams& params =
      use_unswitched ? unswitched_params : switched_params;
  const uint8* input1_data =
      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
  const uint8* input2_data =
      use_unswitched ? unswitched_input2_data : unswitched_input1_data;

  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input resets its position at the
  // beginning of the fourth loop. The innermost loop is an elementwise Mul of
  // sections of the arrays.
  uint8* output_data_ptr = output_data;
  const uint8* input1_data_ptr = input1_data;
  const uint8* input2_data_reset = input2_data;
  int y0 = params.broadcast_shape[0];
  int y1 = params.broadcast_shape[1];
  int y2 = params.broadcast_shape[2];
  int y3 = params.broadcast_shape[3];
  int y4 = params.broadcast_shape[4];
  for (int i0 = 0; i0 < y0; ++i0) {
    const uint8* input2_data_ptr;
    for (int i1 = 0; i1 < y1; ++i1) {
      input2_data_ptr = input2_data_reset;
      for (int i2 = 0; i2 < y2; ++i2) {
        for (int i3 = 0; i3 < y3; ++i3) {
          MulElementwise(y4, params, input1_data_ptr, input2_data_ptr,
                         output_data_ptr);
          input2_data_ptr += y4;
          output_data_ptr += y4;
        }
        input1_data_ptr += y4;
      }
    }
    input2_data_reset = input2_data_ptr;
  }
}
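
Note that this reference version matches the optimized BroadcastMulFivefold above except that it carries no profiling label and no y4 == 1 fast path: it always drives MulElementwise, even for scalar-broadcast inner runs.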

inline void BroadcastMul4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const uint8* input1_data,
                               const RuntimeShape& input2_shape,
                               const uint8* input2_data,
                               const RuntimeShape& output_shape,
                               uint8* output_data) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          const int32 input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          const int32 unclamped_result =
              params.output_offset +
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  input1_val * input2_val, params.output_multiplier,
                  params.output_shift);
          const int32 clamped_output = std::min(
              params.quantized_activation_max,
              std::max(params.quantized_activation_min, unclamped_result));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
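
A hedged usage sketch for the new-style entry point above, wiring up an ArithmeticParams by hand. The field values are illustrative assumptions; only fields and constructors visible in this commit are used, and the TFLite internal headers defining ArithmeticParams, RuntimeShape, and BroadcastMul4DSlow are assumed to be included:

    #include <cstdint>

    void RunBroadcastMulExample(const uint8_t* input1,  // 15 values, [1,3,1,5]
                                const uint8_t* input2,  // 20 values, [1,1,4,5]
                                uint8_t* output) {      // 60 values, [1,3,4,5]
      tflite::ArithmeticParams op_params;
      op_params.input1_offset = -128;            // illustrative zero points
      op_params.input2_offset = -128;
      op_params.output_offset = 128;
      op_params.output_multiplier = 1208925819;  // Q31 multiplier, ~0.563
      op_params.output_shift = -7;               // signed left-shift exponent
      op_params.quantized_activation_min = 0;
      op_params.quantized_activation_max = 255;

      const int32_t dims1[] = {1, 3, 1, 5};
      const int32_t dims2[] = {1, 1, 4, 5};
      const int32_t dims_out[] = {1, 3, 4, 5};
      const tflite::RuntimeShape shape1(4, dims1);
      const tflite::RuntimeShape shape2(4, dims2);
      const tflite::RuntimeShape shape_out(4, dims_out);
      tflite::reference_ops::BroadcastMul4DSlow(
          op_params, shape1, input1, shape2, input2, shape_out, output);
    }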

// Transitional version that will be moved shortly to legacy_reference_ops, as
// part of RuntimeShape revisions.
inline void BroadcastMul4DSlow(const uint8* input1_data,
                               const Dims<4>& input1_dims, int32 input1_offset,
                               const uint8* input2_data,
                               const Dims<4>& input2_dims, int32 input2_offset,
                               int32 output_offset, int32 output_multiplier,
                               int output_shift, int32 output_activation_min,
                               int32 output_activation_max, uint8* output_data,
                               const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");

  NdArrayDesc<4> desc1;
@@ -1407,9 +1537,9 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
          const int32 unclamped_result =
              output_offset + MultiplyByQuantizedMultiplierSmallerThanOneExp(
                                  input1_val * input2_val, output_multiplier,
                                  kReverseShift * output_shift);
              output_offset +
              MultiplyByQuantizedMultiplierSmallerThanOneExp(
                  input1_val * input2_val, output_multiplier, output_shift);
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, unclamped_result));
@@ -1464,21 +1594,6 @@ inline void Mul(const int16* input1_data, const Dims<4>& input1_dims,
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
               input2_dims, input2_offset, output_offset, output_multiplier,
               output_shift, output_activation_min, output_activation_max,
               output_data, output_dims);
}

// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
@@ -129,6 +129,13 @@ class RuntimeShape {
    }
  }

  RuntimeShape(int shape_size, int32 value) : size_(0) {
    Resize(shape_size);
    for (int i = 0; i < shape_size; ++i) {
      SetDim(i, value);
    }
  }

  RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
    ReplaceWith(dimensions_count, dims_data);
  }
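
The new two-argument constructor fills every dimension with the same value, e.g. the rank-4 all-ones shape that ExtendedShape-style padding produces. A small sketch using only the constructors visible in this hunk (int32 as typedef'd in types.h):

    const int32 ones[] = {1, 1, 1, 1};
    tflite::RuntimeShape from_array(4, ones);  // dims {1, 1, 1, 1}
    tflite::RuntimeShape from_value(4, 1);     // same result via the new ctor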
@@ -93,7 +93,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
        input1->params.scale * input2->params.scale / output->params.scale;
    QuantizeMultiplierSmallerThanOneExp(
        real_multiplier, &data->output_multiplier, &data->output_shift);
    data->output_shift *= -1;
  }

  return context->ResizeTensor(context, output, output_size);
@@ -161,9 +160,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
    // The quantized version of Mul doesn't support activations, so we
    // always use BroadcastMul.
    if (kernel_type == kReference) {
      TF_LITE_MUL(reference_ops, BroadcastMul);
      TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
    } else {
      TF_LITE_MUL(optimized_ops, BroadcastMul);
      TF_LITE_MUL(optimized_ops, BroadcastMul4DSlow);
    }
#undef TF_LITE_MUL
  } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
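
With the `data->output_shift *= -1;` line removed, Prepare now stores the signed exponent exactly as QuantizeMultiplierSmallerThanOneExp returns it, and the renamed BroadcastMul4DSlow consumes it directly instead of re-negating via kReverseShift. A before/after sketch of the shift's journey (values illustrative):

    // Before this commit:
    //   QuantizeMultiplierSmallerThanOneExp(r, &data->output_multiplier,
    //                                       &data->output_shift);  // e.g. -7
    //   data->output_shift *= -1;                       // stored as +7
    //   kernels applied kReverseShift * output_shift    // back to -7
    // After this commit:
    //   the negation is gone; output_shift stays -7 and the new kernels use
    //   params.output_shift as a signed left-shift exponent directly.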