Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/32168

This PR ports the MUL kernel to TensorFlow Lite Micro. Supported data types are float, int8 and uint8.

Copybara import of the project:

-- b163a6795209c367d1368311da554881617bc2b7 by Jens Elofsson <jens.elofsson@arm.com>:
Move the Mul ops to their own header file to remove the gemmlowp dependency.

-- 3b6c71fdd0f816c121264f28d1888be91f19fa0e by Jens Elofsson <jens.elofsson@arm.com>:
Add the MUL kernel to TensorFlow Lite Micro.

-- e383f4fff2dcf713f012e842f3a027f6a8090d22 by Jens Elofsson <jens.elofsson@arm.com>:
Add tests for the Mul kernel in TensorFlow Lite Micro.

-- 1082b9e9101820b35c797a711e8c9677df129aaf by Jens Elofsson <jens.elofsson@arm.com>:
Remove the duplicated BroadcastMul4DSlow function.

-- 584725fedac52c8c9c16eb904562e2797e294d8d by Jens Elofsson <jens.elofsson@arm.com>:
Add Register_MUL to micro_ops.h and remove an include that's failing.

PiperOrigin-RevId: 279365723
Change-Id: Id64e22adbacb678f4d7b0034f506940e0540754d
167 lines · 7.0 KiB · C++
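The last commit in the list exposes Register_MUL through micro_ops.h. As a rough illustration of how an application would pick up the ported kernel, here is a minimal sketch of registering MUL with a micro op resolver; it assumes the TFLM resolver API of this era, and the include paths and surrounding setup are illustrative rather than taken from the PR:

// Sketch only (not part of this PR): registering the ported MUL kernel.
// Include paths and API names are assumptions based on TFLM at the time.
#include "tensorflow/lite/micro/kernels/micro_ops.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

void RegisterOps(tflite::MicroMutableOpResolver* resolver) {
  // Register_MUL is the entry point this PR adds to micro_ops.h.
  resolver->AddBuiltin(tflite::BuiltinOperator_MUL,
                       tflite::ops::micro::Register_MUL());
}

The reference implementation the kernel dispatches to is the header below.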
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_

#include "tensorflow/lite/kernels/internal/common.h"

namespace tflite {

namespace reference_ops {

// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
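//
// Note on the quantized (uint8) arithmetic below: input1_offset and
// input2_offset are the negated zero points of the inputs, so each
// (offset + data[i]) term is the input value re-centered on zero.
// MultiplyByQuantizedMultiplier rescales the 32-bit product by the
// fixed-point pair (output_multiplier, output_shift), which encodes
// input1_scale * input2_scale / output_scale, and the final clamp applies
// the fused activation range in the quantized domain.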
inline void MulElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  for (int i = 0; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplier(input1_val * input2_val,
                                      params.output_multiplier,
                                      params.output_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

template <typename T>
inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const T* input1_data,
                const RuntimeShape& input2_shape, const T* input2_data,
                const RuntimeShape& output_shape, T* output_data) {
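  // Non-broadcast path: MatchingFlatSize debug-checks that all three shapes
  // are identical and returns their common element count.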
  T output_activation_min;
  T output_activation_max;
  GetActivationParams(params, &output_activation_min, &output_activation_max);

  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] * input2_data[i], output_activation_min,
        output_activation_max);
  }
}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8* input1_data,
                const RuntimeShape& input2_shape, const uint8* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  const int flat_size =
      MatchingFlatSize(input1_shape, input2_shape, output_shape);

  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void BroadcastMul4DSlow(const ArithmeticParams& params,
                               const RuntimeShape& input1_shape,
                               const uint8* input1_data,
                               const RuntimeShape& input2_shape,
                               const uint8* input2_data,
                               const RuntimeShape& output_shape,
                               uint8* output_data) {
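  // desc1/desc2 describe how each input is walked over the broadcast output
  // space: NdArrayDescsForElementwiseBroadcast assigns a stride of zero to
  // any dimension where an input's extent is 1, so the same element is
  // re-read as the output traverses that dimension.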
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
                                      &desc2);
  const RuntimeShape extended_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);

  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
          const int32 input1_val =
              params.input1_offset +
              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
          const int32 input2_val =
              params.input2_offset +
              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
          const int32 unclamped_result =
              params.output_offset +
              MultiplyByQuantizedMultiplier(input1_val * input2_val,
                                            params.output_multiplier,
                                            params.output_shift);
          const int32 clamped_output = std::min(
              params.quantized_activation_max,
              std::max(params.quantized_activation_min, unclamped_result));
          output_data[Offset(extended_output_shape, b, y, x, c)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

template <typename T>
void BroadcastMul4DSlow(const ArithmeticParams& params,
                        const RuntimeShape& unextended_input1_shape,
                        const T* input1_data,
                        const RuntimeShape& unextended_input2_shape,
                        const T* input2_data,
                        const RuntimeShape& unextended_output_shape,
                        T* output_data) {
  T output_activation_min;
  T output_activation_max;
  GetActivationParams(params, &output_activation_min, &output_activation_max);

  TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
                                      unextended_input2_shape, &desc1, &desc2);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < output_shape.Dims(0); ++b) {
    for (int y = 0; y < output_shape.Dims(1); ++y) {
      for (int x = 0; x < output_shape.Dims(2); ++x) {
        for (int c = 0; c < output_shape.Dims(3); ++c) {
          output_data[Offset(output_shape, b, y, x, c)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
                  input2_data[SubscriptToIndex(desc2, b, y, x, c)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
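For reference, a minimal sketch of calling the float path of this header directly; the shapes, values, and ArithmeticParams setup are illustrative assumptions, not code from the PR or its tests:

// Sketch only: exercising reference_ops::Mul for float (no fused activation).
#include <limits>

#include "tensorflow/lite/kernels/internal/reference/mul.h"

void ExampleFloatMul() {
  // Matching (non-broadcast) 4-D shapes: 1x1x2x2.
  const tflite::RuntimeShape shape({1, 1, 2, 2});
  const float input1[] = {1.0f, 2.0f, 3.0f, 4.0f};
  const float input2[] = {0.5f, 0.5f, 2.0f, 2.0f};
  float output[4];

  tflite::ArithmeticParams params;
  // Leave the clamp range wide open, i.e. no fused activation.
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  tflite::reference_ops::Mul(params, shape, input1, shape, input2, shape,
                             output);
  // output now holds {0.5f, 1.0f, 6.0f, 8.0f}.
}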