Merge pull request #40943 from pnikam-cad:hifi4_nnlib_v2_2_0_update

PiperOrigin-RevId: 322616499
Change-Id: I6a08cb4f11abe33c38c91a72cc45c635d0f78797
TensorFlower Gardener 2020-07-22 11:30:39 -07:00
commit bf3b14ffcb
14 changed files with 1353 additions and 647 deletions

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -41,8 +41,8 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/micro_utils.h"
#include "xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -109,6 +109,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
switch (input->type) {
case kTfLiteFloat32: {
#if HIFI_VFPU
int err;
const float* inp_data_ptr;
float* out_data_ptr;
@ -119,11 +120,13 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
inp_data_ptr = GetTensorData<float>(input);
out_data_ptr = GetTensorData<float>(output);
const float f32_pos_inf = 0x7F800000;
err = xa_nn_vec_relu_f32_f32(out_data_ptr, inp_data_ptr, f32_pos_inf,
flat_size);
err = xa_nn_vec_relu_std_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu1_f32_f32 failed");
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu_std_f32_f32 failed");
#else
ReluFloat(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
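One note on the constant removed above: 0x7F800000 is the IEEE-754 bit pattern of +infinity, but the expression const float f32_pos_inf = 0x7F800000; converts the integer 2139095040 to float instead of reinterpreting its bits, so the old upper bound was a large finite value rather than infinity. A minimal, illustrative way to build such a bound correctly (not part of this change) would be:

#include <cstring>
#include <limits>

// Illustrative only: form +inf from its bit pattern with a bit-cast,
// or simply use std::numeric_limits<float>::infinity().
static float PositiveInfinityFromBits() {
  const unsigned int bits = 0x7F800000u;  // IEEE-754 single-precision +inf
  float value;
  std::memcpy(&value, &bits, sizeof(value));
  return value;
}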
case kTfLiteInt8: {
@ -140,14 +143,17 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
const RuntimeShape& input_shape = GetTensorShape(input);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size = MatchingFlatSize(input_shape, output_shape);
const uint8_t zero = input->params.zero_point;
inp_data_ptr = GetTensorData<uint8_t>(input);
out_data_ptr = GetTensorData<uint8_t>(output);
err = xa_nn_vec_activation_min_max_asym8_asym8(
out_data_ptr, inp_data_ptr, 0, 255, flat_size); // Is 255 right?
out_data_ptr, inp_data_ptr, zero, std::numeric_limits<uint8_t>::max(),
flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_activation_min_max_8_8 failed");
CHECK_ERR_HIFI_NNLIB_KER(
err, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
return kTfLiteOk;
}
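Assuming the NNLib routine above is a plain element-wise clamp, the asym8 ReLU reduces to bounding each value below by the input zero point (the quantized encoding of 0.0f); the 255 upper bound is a no-op for uint8 data. A scalar sketch of that behaviour:

#include <algorithm>
#include <cstdint>

// Scalar equivalent assumed for the quantized ReLU path above:
// clamp every element to [zero_point, 255].
static void ReluAsym8Sketch(uint8_t* out, const uint8_t* in, int size,
                            uint8_t zero_point) {
  for (int i = 0; i < size; ++i) {
    out[i] = std::max(in[i], zero_point);  // upper bound 255 is implicit
  }
}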
default: {
@ -168,6 +174,7 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
switch (input->type) {
case kTfLiteFloat32: {
#if HIFI_VFPU
int err;
const float* inp_data_ptr;
float* out_data_ptr;
@ -180,7 +187,11 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
err = xa_nn_vec_relu6_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu1_f32_f32 failed");
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu6_f32_f32 failed");
#else
Relu6Float(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
case kTfLiteInt8: {
@ -209,7 +220,8 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
err = xa_nn_vec_activation_min_max_asym8_asym8(out_data_ptr, inp_data_ptr,
zero, six, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_activation_min_max_8_8 failed");
CHECK_ERR_HIFI_NNLIB_KER(
err, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
return kTfLiteOk;
}
default: {

View File

@ -0,0 +1,273 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/memory_helpers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace add {
constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;
struct OpData {
bool requires_broadcast;
// These fields are used in both the general 8-bit -> 8bit quantized path,
// and the special 16-bit -> 16bit quantized path
int input1_shift;
int input2_shift;
int32 output_activation_min;
int32 output_activation_max;
// These fields are used only in the general 8-bit -> 8bit quantized path
int32 input1_multiplier;
int32 input2_multiplier;
int32 output_multiplier;
int output_shift;
int left_shift;
int32 input1_offset;
int32 input2_offset;
int32 output_offset;
};
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteAddParams* params,
const TfLiteTensor* input1,
const TfLiteTensor* input2, TfLiteTensor* output,
OpData* data) {
data->requires_broadcast = !HaveSameShapes(input1, input2);
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
// 8bit -> 8bit general quantized path, with general rescalings
data->input1_offset = -input1->params.zero_point;
data->input2_offset = -input2->params.zero_point;
data->output_offset = output->params.zero_point;
data->left_shift = 20;
const double twice_max_input_scale =
2 * static_cast<double>(
std::max(input1->params.scale, input2->params.scale));
const double real_input1_multiplier =
static_cast<double>(input1->params.scale) / twice_max_input_scale;
const double real_input2_multiplier =
static_cast<double>(input2->params.scale) / twice_max_input_scale;
const double real_output_multiplier =
twice_max_input_scale /
((1 << data->left_shift) * static_cast<double>(output->params.scale));
QuantizeMultiplierSmallerThanOneExp(
real_input1_multiplier, &data->input1_multiplier, &data->input1_shift);
QuantizeMultiplierSmallerThanOneExp(
real_input2_multiplier, &data->input2_multiplier, &data->input2_shift);
QuantizeMultiplierSmallerThanOneExp(
real_output_multiplier, &data->output_multiplier, &data->output_shift);
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
}
return kTfLiteOk;
}
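The offsets, multipliers and shifts computed above feed the usual TFLite quantized-add recipe: both inputs are re-centred, given left_shift (20) bits of headroom, rescaled to a common scale, summed, then rescaled to the output scale and clamped. A scalar sketch of that recipe, mirroring the reference kernels (broadcasting omitted):

#include <algorithm>
#include "tensorflow/lite/kernels/internal/common.h"

// One element of the quantized add parameterized by the OpData above.
static int32_t QuantizedAddOneElement(int32_t in1, int32_t in2,
                                      const OpData& d) {
  const int32_t shifted1 = (in1 + d.input1_offset) * (1 << d.left_shift);
  const int32_t shifted2 = (in2 + d.input2_offset) * (1 << d.left_shift);
  const int32_t scaled1 = MultiplyByQuantizedMultiplierSmallerThanOneExp(
      shifted1, d.input1_multiplier, d.input1_shift);
  const int32_t scaled2 = MultiplyByQuantizedMultiplierSmallerThanOneExp(
      shifted2, d.input2_multiplier, d.input2_shift);
  const int32_t raw = MultiplyByQuantizedMultiplierSmallerThanOneExp(
                          scaled1 + scaled2, d.output_multiplier,
                          d.output_shift) +
                      d.output_offset;
  return std::min(d.output_activation_max,
                  std::max(d.output_activation_min, raw));
}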
TfLiteStatus EvalAdd(TfLiteContext* context, TfLiteNode* node,
TfLiteAddParams* params, const OpData* data,
const TfLiteTensor* input1, const TfLiteTensor* input2,
TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::ArithmeticParams op_params;
SetActivationParams(output_activation_min, output_activation_max, &op_params);
#define TF_LITE_ADD(opname) \
reference_ops::opname(op_params, GetTensorShape(input1), \
GetTensorData<float>(input1), GetTensorShape(input2), \
GetTensorData<float>(input2), GetTensorShape(output), \
GetTensorData<float>(output))
if (data->requires_broadcast) {
TF_LITE_ADD(BroadcastAdd4DSlow);
} else {
#if HIFI_VFPU
int err;
const RuntimeShape& input1_shape = GetTensorShape(input1);
const RuntimeShape& input2_shape = GetTensorShape(input2);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
err = xa_nn_elm_add_f32xf32_f32(GetTensorData<float>(output),
GetTensorData<float>(input1),
GetTensorData<float>(input2), flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_add_f32xf32_f32 failed");
err = xa_nn_vec_activation_min_max_f32_f32(
GetTensorData<float>(output), GetTensorData<float>(output),
output_activation_min, output_activation_max, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err,
"xa_nn_vec_activation_min_max_f32_f32 failed");
#else
TF_LITE_ADD(Add);
#endif /* HIFI_VFPU */
}
#undef TF_LITE_ADD
return kTfLiteOk;
}
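When HIFI_VFPU is defined, the float path above performs the element-wise add and the activation clamp as two NNLib passes over the output buffer; the reference TF_LITE_ADD(Add) branch fuses both. A scalar equivalent of the non-broadcast case:

#include <algorithm>

// Fused scalar form of the float add with activation clamp.
static void AddFloatSketch(const float* in1, const float* in2, float* out,
                           int size, float act_min, float act_max) {
  for (int i = 0; i < size; ++i) {
    out[i] = std::min(act_max, std::max(act_min, in1[i] + in2[i]));
  }
}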
TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteAddParams* params, const OpData* data,
const TfLiteTensor* input1,
const TfLiteTensor* input2,
TfLiteTensor* output) {
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
tflite::ArithmeticParams op_params;
op_params.left_shift = data->left_shift;
op_params.input1_offset = data->input1_offset;
op_params.input1_multiplier = data->input1_multiplier;
op_params.input1_shift = data->input1_shift;
op_params.input2_offset = data->input2_offset;
op_params.input2_multiplier = data->input2_multiplier;
op_params.input2_shift = data->input2_shift;
op_params.output_offset = data->output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = data->output_shift;
SetActivationParams(data->output_activation_min,
data->output_activation_max, &op_params);
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_ADD(type, opname, dtype) \
type::opname(op_params, GetTensorShape(input1), \
GetTensorData<dtype>(input1), GetTensorShape(input2), \
GetTensorData<dtype>(input2), GetTensorShape(output), \
GetTensorData<dtype>(output));
if (output->type == kTfLiteInt8) {
if (need_broadcast) {
TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
} else {
TF_LITE_ADD(reference_integer_ops, Add, int8_t);
}
} else {
if (need_broadcast) {
TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
} else {
int err;
const RuntimeShape& input1_shape = GetTensorShape(input1);
const RuntimeShape& input2_shape = GetTensorShape(input2);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
err = xa_nn_elm_add_asym8xasym8_asym8(
GetTensorData<uint8_t>(output), op_params.output_offset,
op_params.output_shift, op_params.output_multiplier,
op_params.quantized_activation_min,
op_params.quantized_activation_max, GetTensorData<uint8_t>(input1),
op_params.input1_offset, op_params.input1_shift,
op_params.input1_multiplier, GetTensorData<uint8_t>(input2),
op_params.input2_offset, op_params.input2_shift,
op_params.input2_multiplier, op_params.left_shift, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_add_asym8xasym8_asym8 failed");
}
}
#undef TF_LITE_ADD
}
return kTfLiteOk;
}
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
OpData* data = static_cast<OpData*>(node->user_data);
auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
TF_LITE_ENSURE_STATUS(
CalculateOpData(context, params, input1, input2, output, data));
return kTfLiteOk;
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
if (output->type == kTfLiteFloat32) {
TF_LITE_ENSURE_OK(
context, EvalAdd(context, node, params, data, input1, input2, output));
} else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data,
input1, input2, output));
} else {
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(output->type), output->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace add
TfLiteRegistration Register_ADD() {
return {/*init=*/add::Init,
/*free=*/nullptr,
/*prepare=*/add::Prepare,
/*invoke=*/add::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -44,7 +44,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -55,7 +55,6 @@ constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
// Conv is quantized along dimension 0:
// https://www.tensorflow.org/lite/performance/quantization_spec
@ -71,9 +70,8 @@ struct OpData {
int output_shift;
// Per channel output multiplier and shift.
// (b/141139247): Allocate these dynamically when possible.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
int32_t* per_channel_output_multiplier;
int32_t* per_channel_output_shift;
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
@ -94,10 +92,10 @@ inline PaddingType RuntimePaddingType(TfLitePadding padding) {
}
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, int width, int height,
int filter_width, int filter_height, int out_width,
int out_height, const TfLiteType data_type,
OpData* data) {
const TfLiteConvParams* params, int width,
int height, int filter_width, int filter_height,
int out_width, int out_height,
const TfLiteType data_type, OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
@ -131,8 +129,69 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
return kTfLiteOk;
}
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
const auto params = static_cast<const TfLiteConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
int input_width = input->dims->data[2];
int input_height = input->dims->data[1];
int filter_width = filter->dims->data[2];
int filter_height = filter->dims->data[1];
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
// Dynamically allocate per-channel quantization parameters.
const int num_channels = filter->dims->data[kConvQuantizedDimension];
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),
reinterpret_cast<void**>(&data->per_channel_output_multiplier)));
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),
reinterpret_cast<void**>(&data->per_channel_output_shift)));
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(context,
affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
return CalculateOpData(context, node, params, input_width, input_height,
filter_width, filter_height, output_width,
output_height, input->type, data);
} // namespace conv
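The two per-channel buffers allocated in Prepare are filled from one effective rescale factor per output channel (in the reference kernels this is done by PopulateConvolutionQuantizationParams, as the depthwise kernel below shows explicitly). Roughly, and ignoring the legacy single-scale case:

#include <cstdint>
#include "tensorflow/lite/kernels/internal/quantization_util.h"

// Sketch of how per-channel multipliers/shifts are derived: for each output
// channel c, effective_scale = input_scale * filter_scale[c] / output_scale.
static void FillPerChannelParamsSketch(float input_scale, float output_scale,
                                       const float* filter_scales,
                                       int num_channels, int32_t* multipliers,
                                       int32_t* shifts) {
  for (int c = 0; c < num_channels; ++c) {
    const double effective_scale =
        static_cast<double>(input_scale) * filter_scales[c] / output_scale;
    int exponent;
    tflite::QuantizeMultiplier(effective_scale, &multipliers[c], &exponent);
    shifts[c] = exponent;
  }
}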
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
TfLiteConvParams* params, const OpData& data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
@ -143,9 +202,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
if ((params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
const uint8 *input_data, *filter_data;
const uint8_t *input_data, *filter_data;
const int32_t* bias_data;
uint8* output_data;
uint8_t* output_data;
const RuntimeShape& input_shape = GetTensorShape(input);
const RuntimeShape& filter_shape = GetTensorShape(filter);
const RuntimeShape& output_shape = GetTensorShape(output);
@ -158,14 +217,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;
const int dilation_width_factor = 1;
const int dilation_height_factor = 1;
const int pad_width = data->padding.width;
const int pad_height = data->padding.height;
const int32 output_activation_min = data->output_activation_min;
const int32 output_activation_max = data->output_activation_max;
const int32 output_multiplier = data->output_multiplier;
const int output_shift = -data->output_shift;
const int pad_width = data.padding.width;
const int pad_height = data.padding.height;
const int32 output_activation_min = data.output_activation_min;
const int32 output_activation_max = data.output_activation_max;
const int32 output_multiplier = data.output_multiplier;
const int output_shift = -data.output_shift;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
@ -186,13 +243,14 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
const int filter_depth = filter_shape.Dims(3);
int err, output_data_format = 0;
void* p_scratch;
uint8 *p_filter, *p_out_scratch;
uint8_t* p_scratch;
uint8_t* p_filter;
// Calculate filter_depth_padded as the next multiple of 4
int filter_depth_padded = (filter_depth + 3) & (~3);
int out_length = output_height * output_width * output_depth;
int filter_size_padded = filter_height * filter_width * filter_depth_padded;
int required_scratch, input_precision = PREC_ASYM8;
int h, w, c;
int h, c;
required_scratch = xa_nn_conv2d_std_getsize(
input_height, input_depth, filter_height, filter_width, stride_height,
@ -207,19 +265,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
p_scratch = xtensa_nnlib_scratch_buf;
p_filter = (uint8*)p_scratch;
p_out_scratch =
(p_filter +
ALIGNED_SIZE((sizeof(uint8_t) * filter_height * filter_width *
filter_depth_padded * output_depth),
8));
p_filter = p_scratch;
required_scratch +=
ALIGNED_SIZE((sizeof(uint8_t) * filter_height * filter_width *
filter_depth_padded * output_depth),
8);
p_scratch =
(uint8*)(p_out_scratch + ALIGNED_SIZE(sizeof(uint8_t) * out_length, 8));
required_scratch += ALIGNED_SIZE(sizeof(uint8_t) * out_length, 8);
ALIGNED_SIZE((sizeof(uint8_t) * filter_size_padded * output_depth), 8);
p_scratch +=
ALIGNED_SIZE(sizeof(uint8_t) * filter_size_padded * output_depth, 8);
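ALIGNED_SIZE above, and the (filter_depth + 3) & ~3 padding earlier in this function, are both the standard round-up-to-a-power-of-two-multiple trick. The real macros live in xtensa_tf_micro_common.h; their assumed shape is:

#include <cstddef>
#include <cstdint>

// Assumed equivalents of the NNLib helper macros (illustrative only): for a
// power-of-two align, adding (align - 1) and masking the low bits rounds up
// to the next multiple of align.
inline size_t AlignedSizeSketch(size_t size, size_t align) {
  return (size + align - 1) & ~(align - 1);
}
inline uint8_t* AlignPtrSketch(uint8_t* p, uintptr_t align) {
  return reinterpret_cast<uint8_t*>(
      (reinterpret_cast<uintptr_t>(p) + align - 1) & ~(align - 1));
}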
if (required_scratch > (int)XTENSA_NNLIB_MAX_SCRATCH_SIZE) {
TF_LITE_KERNEL_LOG(context,
@ -240,9 +290,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
for (int batch = 0; batch < batches; ++batch) {
uint8* p_out_temp;
p_out_temp = (uint8*)&p_out_scratch[0];
p_out_temp = (uint8*)ALIGN_PTR(p_out_temp, 8);
uint8_t* p_out_temp;
p_out_temp = &output_data[batch * out_length];
err = xa_nn_conv2d_std_asym8xasym8(
p_out_temp,
@ -252,24 +301,24 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
filter_width, output_depth, stride_width, stride_height, pad_width,
pad_height, output_height, output_width, input_offset, filter_offset,
output_multiplier, output_shift, output_offset, output_data_format,
p_scratch);
static_cast<void*>(p_scratch));
CHECK_ERR_HIFI_NNLIB_KER(
err, "conv2d_std_asym8: xa_nn_conv2d_std_asym8xasym8 failed");
for (int i = 0; i < out_length; i++) {
uint8* p_temp;
p_temp = &output_data[batch * out_length];
err = xa_nn_vec_activation_min_max_asym8_asym8(
p_out_temp, p_out_temp, output_activation_min, output_activation_max,
out_length);
ACTIVATION_MIN_MAX_ASYM8(p_temp[i], p_out_temp[i],
output_activation_min, output_activation_max)
}
CHECK_ERR_HIFI_NNLIB_KER(
err, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
}
} else {
// TODO(b/154032858): Investigate removing extra copies.
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data.padding.width;
op_params.padding_values.height = data.padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
@ -277,10 +326,10 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.output_multiplier = data.output_multiplier;
op_params.output_shift = -data.output_shift;
op_params.quantized_activation_min = data.output_activation_min;
op_params.quantized_activation_max = data.output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(filter),
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
@ -292,11 +341,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
TfLiteConvParams* params, const OpData& data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
TfLiteTensor* im2col) {
// TODO(b/154032858): Investigate removing extra copies.
ConvParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.output_offset = output->params.zero_point;
@ -304,14 +354,14 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
op_params.stride_width = params->stride_width;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.padding_values.height = data.padding.height;
op_params.padding_values.width = data.padding.width;
op_params.quantized_activation_min = data.output_activation_min;
op_params.quantized_activation_max = data.output_activation_max;
reference_integer_ops::ConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
op_params, data.per_channel_output_multiplier,
data.per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
@ -319,7 +369,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
TfLiteConvParams* params, const OpData& data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* im2col,
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
@ -327,6 +377,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
#if HIFI_VFPU
if ((params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
const float *input_data, *filter_data;
@ -344,10 +395,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;
const int dilation_width_factor = 1;
const int dilation_height_factor = 1;
const int pad_width = data->padding.width;
const int pad_height = data->padding.height;
const int pad_width = data.padding.width;
const int pad_height = data.padding.height;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
@ -366,13 +415,14 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
const int output_width = output_shape.Dims(2);
const int filter_depth = filter_shape.Dims(3);
int err, output_data_format = 0;
void* p_scratch;
float *p_filter, *p_out_scratch;
uint8_t* p_scratch;
float* p_filter;
// Calculate filter_depth_padded as the next multiple of 2
int filter_depth_padded = (filter_depth + 1) & (~1);
int out_length = output_height * output_width * output_depth;
int filter_size_padded = filter_height * filter_width * filter_depth_padded;
int required_scratch, input_precision = PREC_F32;
int h, w, c;
int h, c;
required_scratch = xa_nn_conv2d_std_getsize(
input_height, input_depth, filter_height, filter_width, stride_height,
@ -387,19 +437,11 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
p_scratch = xtensa_nnlib_scratch_buf;
p_filter = (float*)p_scratch;
p_out_scratch =
(float*)((uint8_t*)p_filter +
ALIGNED_SIZE((sizeof(float) * filter_height * filter_width *
filter_depth_padded * output_depth),
8));
p_filter = reinterpret_cast<float*>(p_scratch);
p_scratch +=
ALIGNED_SIZE((sizeof(float) * filter_size_padded * output_depth), 8);
required_scratch +=
ALIGNED_SIZE((sizeof(float) * filter_height * filter_width *
filter_depth_padded * output_depth),
8);
p_scratch = (float*)((uint8_t*)p_out_scratch +
ALIGNED_SIZE(sizeof(float) * out_length, 8));
required_scratch += ALIGNED_SIZE(sizeof(float) * out_length, 8);
ALIGNED_SIZE((sizeof(float) * filter_size_padded * output_depth), 8);
if (required_scratch > (int)XTENSA_NNLIB_MAX_SCRATCH_SIZE) {
TF_LITE_KERNEL_LOG(context,
@ -420,8 +462,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
for (int batch = 0; batch < batches; ++batch) {
float* p_out_temp;
p_out_temp = (float*)&p_out_scratch[0];
p_out_temp = (float*)ALIGN_PTR(p_out_temp, 8);
p_out_temp = &output_data[batch * out_length];
err = xa_nn_conv2d_std_f32(
p_out_temp,
@ -429,23 +470,26 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
p_filter, bias_data, input_height, input_width, input_depth,
filter_height, filter_width, output_depth, stride_width,
stride_height, pad_width, pad_height, output_height, output_width,
output_data_format, p_scratch);
output_data_format, static_cast<void*>(p_scratch));
CHECK_ERR_HIFI_NNLIB_KER(
err, "conv2d_std_f32: xa_nn_conv2d_std_f32xf32 failed");
for (int i = 0; i < out_length; i++) {
float* p_temp;
p_temp = &output_data[batch * out_length];
ACTIVATION_MIN_MAX(float, p_temp[i], p_out_temp[i],
output_activation_min, output_activation_max)
err = xa_nn_vec_activation_min_max_f32_f32(
p_out_temp, p_out_temp, output_activation_min, output_activation_max,
out_length);
CHECK_ERR_HIFI_NNLIB_KER(err,
"xa_nn_vec_activation_min_max_f32_f32 failed");
}
}
} else {
} else
#endif /* HIFI_VFPU */
{
// TODO(b/154032858): Investigate removing extra copies.
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data.padding.width;
op_params.padding_values.height = data.padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
@ -471,50 +515,20 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
int input_width = input->dims->data[2];
int input_height = input->dims->data[1];
int filter_width = filter->dims->data[2];
int filter_height = filter->dims->data[1];
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(context,
affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpData(
context, node, params, input_width, input_height, filter_width,
filter_height, output_width, output_height, input->type, &data));
TFLITE_DCHECK(node->user_data != nullptr);
const OpData& data = *(static_cast<const OpData*>(node->user_data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
EvalFloat(context, node, params, &data, input, filter, bias, nullptr,
EvalFloat(context, node, params, data, input, filter, bias, nullptr,
nullptr, output);
break;
case kTfLiteInt8:
EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
output, nullptr);
break;
case kTfLiteUInt8:
EvalQuantized(context, node, params, &data, input, filter, bias, nullptr,
EvalQuantized(context, node, params, data, input, filter, bias, nullptr,
nullptr, output);
break;
default:
@ -528,9 +542,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} // namespace conv
TfLiteRegistration Register_CONV_2D() {
return {/*init=*/nullptr,
return {/*init=*/conv::Init,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*prepare=*/conv::Prepare,
/*invoke=*/conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -45,7 +45,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -57,8 +57,6 @@ constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
// Per channel quantization is not needed for any model on xtensa.
constexpr int kMaxChannels = 256;
// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
@ -72,10 +70,8 @@ struct OpData {
int output_shift;
// Per channel output multiplier and shift.
// (b/141139247): Allocate these dynamically when possible.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
int32_t* per_channel_output_multiplier;
int32_t* per_channel_output_shift;
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
@ -107,26 +103,88 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
return tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift), num_channels));
reinterpret_cast<int*>(data->per_channel_output_shift), num_channels);
}
return kTfLiteOk;
}
} // namespace
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* params =
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
OpData* data = static_cast<OpData*>(node->user_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteType data_type = input->type;
int width = SizeOfDimension(input, 2);
int height = SizeOfDimension(input, 1);
int filter_width = SizeOfDimension(filter, 2);
int filter_height = SizeOfDimension(filter, 1);
// Per channel quantization is only needed for int8 inference. For other
// quantized types, only a single scale and zero point is needed.
const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
// Dynamically allocate per-channel quantization parameters.
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),
reinterpret_cast<void**>(&data->per_channel_output_multiplier)));
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),
reinterpret_cast<void**>(&data->per_channel_output_shift)));
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(
context, affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kDepthwiseConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
return CalculateOpData(context, node, params, width, height, filter_width,
filter_height, data_type, data);
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
TfLiteDepthwiseConvParams* params, const OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
#if HIFI_VFPU
if ((params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
const float *input_data, *filter_data, *bias_data;
@ -143,10 +201,6 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;
const int dilation_width_factor = 1;
const int dilation_height_factor = 1;
// const int dilation_width_factor = params->dilation_width_factor;;
// const int dilation_height_factor = params->dilation_height_factor;
const int pad_width = data->padding.width;
const int pad_height = data->padding.height;
const int depth_multiplier = params->depth_multiplier;
@ -168,7 +222,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
int32_t err, input_data_format = 0, output_data_format = 0;
void* p_scratch;
uint8_t* p_scratch;
float* p_filter;
int filter_depth_padded, filter_size_padded, required_scratch;
int input_precision = PREC_F32;
@ -198,9 +252,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
return kTfLiteError;
}
p_filter = (float*)p_scratch;
p_scratch = (void*)((uint8_t*)p_filter +
ALIGNED_SIZE(sizeof(float) * filter_size_padded, 8));
p_filter = reinterpret_cast<float*>(p_scratch);
p_scratch += ALIGNED_SIZE(sizeof(float) * filter_size_padded, 8);
for (h = 0; h < filter_height * filter_width; h++) {
for (c = 0; c < filter_depth; c++) {
@ -220,37 +273,22 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
input_height, input_width, input_depth, filter_height, filter_width,
depth_multiplier, stride_width, stride_height, pad_width, pad_height,
output_height, output_width, input_data_format, output_data_format,
p_scratch);
static_cast<void*>(p_scratch));
CHECK_ERR_HIFI_NNLIB_KER(
err, "DepthwiseConvFloat: xa_nn_conv2d_depthwise_f32 failed");
}
// pre loop for activation_min_max to handle alignment
int out_length = batches * output_height * output_width * output_depth;
uint32 p_unalign_val = (uint32)output_data, p_align_val;
p_align_val = (p_unalign_val + 7) & (~7);
int pre_loop_count = p_align_val - p_unalign_val;
pre_loop_count = MIN(pre_loop_count, out_length);
for (i = 0; i < pre_loop_count; i++) {
ACTIVATION_MIN_MAX(float, output_data[i], output_data[i],
output_activation_min, output_activation_max)
}
out_length = out_length - pre_loop_count;
if (out_length) {
err = xa_nn_vec_activation_min_max_f32_f32(
&output_data[i], &output_data[i], output_activation_min,
output_activation_max, out_length);
output_data, output_data, output_activation_min, output_activation_max,
out_length);
CHECK_ERR_HIFI_NNLIB_KER(
err,
"DepthwiseConvFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
}
} else {
err, "DepthwiseConvFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
} else
#endif /* HIFI_VFPU */
{
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
@ -274,8 +312,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
}
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input,
TfLiteDepthwiseConvParams* params,
const OpData* data, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
DepthwiseParams op_params;
@ -290,7 +328,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = 0;
op_params.output_offset = output->params.zero_point;
// (b/130439627): Use calculated value for clamping.
// TODO(b/130439627): Use calculated value for clamping.
op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
@ -304,8 +342,8 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input,
TfLiteDepthwiseConvParams* params,
const OpData* data, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
@ -314,9 +352,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
if ((params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
const uint8 *input_data, *filter_data;
const uint8_t *input_data, *filter_data;
const int32_t* bias_data;
uint8* output_data;
uint8_t* output_data;
const RuntimeShape& input_shape = GetTensorShape(input);
const RuntimeShape& filter_shape = GetTensorShape(filter);
const RuntimeShape& output_shape = GetTensorShape(output);
@ -329,10 +367,6 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;
const int dilation_width_factor = 1;
const int dilation_height_factor = 1;
// const int dilation_width_factor = params->dilation_width_factor;
// const int dilation_height_factor = params->dilation_height_factor;
const int pad_width = data->padding.width;
const int pad_height = data->padding.height;
const int depth_multiplier = params->depth_multiplier;
@ -360,11 +394,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
int32_t err, i, input_data_format = 0, output_data_format = 0;
void* p_scratch;
uint8* p_filter;
uint8_t* p_scratch;
uint8_t* p_filter;
int filter_depth_padded, filter_size_padded, required_scratch;
int input_precision = PREC_ASYM8;
int h, c;
int h;
ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
p_scratch = xtensa_nnlib_scratch_buf;
@ -390,18 +424,15 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
return kTfLiteError;
}
p_filter = (uint8*)p_scratch;
p_scratch = (void*)(p_filter +
ALIGNED_SIZE(sizeof(uint8_t) * filter_size_padded, 8));
p_filter = p_scratch;
p_scratch += ALIGNED_SIZE(sizeof(uint8_t) * filter_size_padded, 8);
int pad_value = filter_depth_padded - filter_depth;
for (h = 0; h < filter_height * filter_width; h++) {
for (c = 0; c < filter_depth; c++) {
p_filter[h * filter_depth_padded + c] =
filter_data[h * filter_depth + c];
}
for (c = filter_depth; c < filter_depth_padded; c++) {
p_filter[h * filter_depth_padded + c] = -filter_offset;
}
memcpy(&p_filter[h * filter_depth_padded], &filter_data[h * filter_depth],
filter_depth);
memset(&p_filter[h * filter_depth_padded + filter_depth], -filter_offset,
pad_value);
}
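Because filter_offset is the negated filter zero point, the memset value -filter_offset is the zero point itself, so the padded lanes encode a quantized 0.0 and, assuming the NNLib kernel subtracts the filter offset before multiplying, contribute nothing to the accumulation. A scalar equivalent of the repacking above:

#include <cstdint>
#include <cstring>

// Copy one filter row and pad it to filter_depth_padded with the filter
// zero point so padded taps multiply as exact zeros inside the MAC loop.
static void PadFilterRowSketch(uint8_t* dst, const uint8_t* src,
                               int filter_depth, int filter_depth_padded,
                               int32_t filter_offset) {
  const uint8_t zero_point = static_cast<uint8_t>(-filter_offset);
  std::memcpy(dst, src, filter_depth);
  std::memset(dst + filter_depth, zero_point,
              filter_depth_padded - filter_depth);
}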
for (i = 0; i < batches; i++) {
@ -413,37 +444,22 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
depth_multiplier, stride_width, stride_height, pad_width, pad_height,
output_height, output_width, input_offset, filter_offset,
output_multiplier, output_shift, output_offset, input_data_format,
output_data_format, p_scratch);
output_data_format, static_cast<void*>(p_scratch));
CHECK_ERR_HIFI_NNLIB_KER(
err, "DepthwiseConvAsym8: xa_nn_conv2d_depthwise_asym8xasym8 failed");
}
// pre loop for activation_min_max to handle alignment
int out_length = batches * output_height * output_width * output_depth;
uint32 p_unalign_val = (uint32)output_data, p_align_val;
p_align_val = (p_unalign_val + 7) & (~7);
int pre_loop_count = p_align_val - p_unalign_val;
pre_loop_count = MIN(pre_loop_count, out_length);
for (i = 0; i < pre_loop_count; i++) {
ACTIVATION_MIN_MAX_ASYM8(output_data[i], output_data[i],
output_activation_min, output_activation_max)
}
out_length = out_length - pre_loop_count;
if (out_length > 0) {
err = xa_nn_vec_activation_min_max_asym8_asym8(
&output_data[i], &output_data[i], output_activation_min,
output_activation_max, out_length);
output_data, output_data, output_activation_min, output_activation_max,
out_length);
CHECK_ERR_HIFI_NNLIB_KER(
err,
"DepthwiseConvAsym8: xa_nn_vec_activation_min_max_asym8_asym8 "
"failed");
}
} else {
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
@ -474,8 +490,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* params =
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
const OpData& data = *(static_cast<const OpData*>(node->user_data));
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
@ -483,38 +503,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* bias =
(NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
const TfLiteType data_type = input->type;
int width = SizeOfDimension(input, 2);
int height = SizeOfDimension(input, 1);
int filter_width = SizeOfDimension(filter, 2);
int filter_height = SizeOfDimension(filter, 1);
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(
context, affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kDepthwiseConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
filter_width, filter_height, data_type,
&data));
// (aselle): Consider whether float conv and quantized conv should be
// TODO(aselle): Consider whether float conv and quantized conv should be
// separate ops to avoid dispatch overhead here.
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
@ -538,9 +527,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} // namespace depthwise_conv
TfLiteRegistration Register_DEPTHWISE_CONV_2D() {
return {/*init=*/nullptr,
return {/*init=*/depthwise_conv::Init,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*prepare=*/depthwise_conv::Prepare,
/*invoke=*/depthwise_conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -39,7 +39,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -53,6 +53,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
#if HIFI_VFPU
int err;
const float* inp_data_ptr;
float* out_data_ptr;
@ -66,6 +67,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
err = xa_nn_elm_floor_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_floor_f32_f32 failed");
#else
reference_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
} // namespace floor

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -43,7 +43,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -70,7 +70,7 @@ constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
TfLiteStatus CalculateOpData(TfLiteContext* context,
TfLiteFullyConnectedParams* params,
TfLiteFusedActivation activation,
TfLiteType data_type, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
@ -84,7 +84,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context,
QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
data->output_shift = -exponent;
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
context, activation, output, &data->output_activation_min,
&data->output_activation_max));
}
return status;
@ -92,20 +92,50 @@ TfLiteStatus CalculateOpData(TfLiteContext* context,
} // namespace
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
const auto params =
static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
TF_LITE_ENSURE_MSG(context, input->type == filter->type,
"Hybrid models are not supported on TFLite Micro.");
return CalculateOpData(context, params->activation, input->type, input,
filter, bias, output, data);
}
TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const OpData& data, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
FullyConnectedParams op_params;
tflite::FullyConnectedParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = -filter->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.output_multiplier = data->output_multiplier;
// (b/138810107): Figure out whether output shift should be inverted
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.output_multiplier = data.output_multiplier;
// TODO(b/138810107): Figure out whether output shift should be inverted
op_params.output_shift = -data.output_shift;
op_params.quantized_activation_min = data.output_activation_min;
op_params.quantized_activation_max = data.output_activation_max;
reference_integer_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
@ -116,8 +146,7 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const OpData& data, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
@ -128,11 +157,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_multiplier = data.output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.output_shift = -data.output_shift;
op_params.quantized_activation_min = data.output_activation_min;
op_params.quantized_activation_max = data.output_activation_max;
#define TF_LITE_FULLY_CONNECTED(output_data_type) \
reference_ops::FullyConnected( \
@ -162,11 +191,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
CHECK_ERR_HIFI_NNLIB_KER(
ret, "xa_nn_fully_connected_asym8xasym8_asym8 failed");
}
for (int i = 0; i < batches * out_depth; i++) {
ACTIVATION_MIN_MAX_ASYM8(p_out[i], p_out[i],
data->output_activation_min,
data->output_activation_max)
}
ret = xa_nn_vec_activation_min_max_asym8_asym8(
p_out, p_out, data.output_activation_min, data.output_activation_max,
batches * out_depth);
CHECK_ERR_HIFI_NNLIB_KER(
ret, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
break;
}
case kTfLiteInt16:
@ -182,15 +212,16 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
TfLiteFusedActivation activation,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
CalculateActivationRange(activation, &output_activation_min,
&output_activation_max);
tflite::FullyConnectedParams op_params;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
#if HIFI_VFPU
int ret, b, weight_depth, out_depth, batches;
weight_depth =
GetTensorShape(filter).Dims(GetTensorShape(filter).DimensionsCount() - 1);
@ -208,43 +239,48 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
CHECK_ERR_HIFI_NNLIB_KER(ret, "xa_nn_fully_connected_f32 failed.");
}
float* p_out = GetTensorData<float>(output);
for (int i = 0; i < batches * out_depth; i++) {
ACTIVATION_MIN_MAX(float, p_out[i], p_out[i], output_activation_min,
output_activation_max)
}
ret = xa_nn_vec_activation_min_max_f32_f32(
p_out, p_out, output_activation_min, output_activation_max,
batches * out_depth);
CHECK_ERR_HIFI_NNLIB_KER(ret, "xa_nn_vec_activation_min_max_f32_f32 failed");
#else
tflite::reference_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
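Note: both branches of EvalFloat above compute the same thing: for every batch, a dot product of the input row with each filter row, plus the optional bias, clamped afterwards to the fused-activation range. A minimal reference sketch of that computation, illustrative only (dimension names follow the weight_depth/out_depth/batches variables above; no broadcasting or padding is handled):

/* Sketch of the float fully-connected computation performed by
 * xa_nn_fully_connected_f32 / reference_ops::FullyConnected above. */
#include <algorithm>

static void FullyConnectedFloatSketch(
    const float* input,   // [batches, weight_depth]
    const float* filter,  // [out_depth, weight_depth]
    const float* bias,    // [out_depth] or nullptr
    float* output,        // [batches, out_depth]
    int batches, int weight_depth, int out_depth,
    float activation_min, float activation_max) {
  for (int b = 0; b < batches; ++b) {
    for (int o = 0; o < out_depth; ++o) {
      float acc = bias ? bias[o] : 0.0f;
      for (int d = 0; d < weight_depth; ++d) {
        acc += input[b * weight_depth + d] * filter[o * weight_depth + d];
      }
      // Fused activation applied as a min/max clamp afterwards
      // (the role of xa_nn_vec_activation_min_max_f32_f32 above).
      output[b * out_depth + o] =
          std::min(std::max(acc, activation_min), activation_max);
    }
  }
}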
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
TFLITE_DCHECK(node->builtin_data != nullptr);
const auto* params =
static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteType data_type = input->type;
OpData local_data_object;
OpData* data = &local_data_object;
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
filter, bias, output, data));
TFLITE_DCHECK(node->user_data != nullptr);
const OpData& data = *(static_cast<const OpData*>(node->user_data));
switch (filter->type) { // Already know in/out types are same.
// Checks in Prepare ensure input, output and filter types are all the same.
switch (input->type) {
case kTfLiteFloat32:
return EvalFloat(context, node, params, data, input, filter, bias,
return EvalFloat(context, node, params->activation, input, filter, bias,
output);
case kTfLiteInt8:
return EvalQuantizedInt8(context, node, params, data, input, filter, bias,
return EvalQuantizedInt8(context, node, data, input, filter, bias,
output);
case kTfLiteUInt8:
return EvalQuantized(context, node, params, data, input, filter, bias,
output);
return EvalQuantized(context, node, data, input, filter, bias, output);
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(filter->type), filter->type);
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
@ -253,9 +289,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} // namespace fully_connected
TfLiteRegistration Register_FULLY_CONNECTED() {
return {/*init=*/nullptr,
return {/*init=*/fully_connected::Init,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*prepare=*/fully_connected::Prepare,
/*invoke=*/fully_connected::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,


@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -34,32 +34,68 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/logistic.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/logistic.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
namespace micro {
namespace activations {
namespace {
constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
struct OpData {
int32_t input_zero_point;
int32_t input_range_radius;
int32_t input_multiplier;
int input_left_shift;
};
TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node,
OpData* data) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, output->params.zero_point,
std::numeric_limits<int8_t>::min());
static constexpr int kInputIntegerBits = 4;
const double input_real_multiplier =
static_cast<double>(input->params.scale) *
static_cast<double>(1 << (31 - kInputIntegerBits));
const double q = std::frexp(input_real_multiplier, &data->input_left_shift);
data->input_multiplier = static_cast<int32_t>(TfLiteRound(q * (1ll << 31)));
data->input_range_radius =
CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31);
}
return kTfLiteOk;
}
} // namespace
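Note: CalculateArithmeticOpData above precomputes the fixed-point parameters that reference_integer_ops::Logistic consumes in the int8 branch below: the zero-point-shifted input is rescaled via input_multiplier / input_left_shift, and anything whose magnitude exceeds input_range_radius saturates straight to the output limits. A rough per-element sketch of how those fields are used; the fixed-point sigmoid core is stood in for by a float computation purely for illustration, and the 1/256 output scale is the usual int8 logistic convention rather than something stated in this diff:

/* Illustrative per-element skeleton of the int8 logistic dispatch below. */
#include <algorithm>
#include <cmath>
#include <cstdint>

static int8_t LogisticInt8Sketch(int8_t input_value, int32_t input_zero_point,
                                 int32_t input_range_radius, float input_scale) {
  const int32_t centered = static_cast<int32_t>(input_value) - input_zero_point;
  // Saturation short-cut driven by input_range_radius.
  if (centered <= -input_range_radius) return -128;  // output zero point is -128
  if (centered >= input_range_radius) return 127;
  // Stand-in for the fixed-point core (which uses input_multiplier /
  // input_left_shift instead of the float scale).
  const float x = centered * input_scale;
  const float y = 1.0f / (1.0f + std::exp(-x));
  // Output is conventionally quantized with scale 1/256 and zero point -128.
  const int32_t q = static_cast<int32_t>(std::lround(y * 256.0f)) - 128;
  return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, q)));
}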
TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
OpData data;
CalculateArithmeticOpData(context, node, &data);
if (input->type == kTfLiteFloat32) {
switch (output->type) {
case kTfLiteFloat32: {
#if HIFI_VFPU
int err;
const float* inp_data_ptr;
float* out_data_ptr;
@ -73,6 +109,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
err = xa_nn_vec_sigmoid_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_sigmoid_f32_f32 failed");
#else
reference_ops::Logistic(
GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
default:
@ -84,11 +125,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} else if (input->type == kTfLiteInt8) {
switch (output->type) {
case kTfLiteInt8: {
reference_ops::Logistic(
GetTensorShape(input), GetTensorData<int8_t>(input),
input->params.scale, input->params.zero_point,
GetTensorShape(output), GetTensorData<int8_t>(output),
output->params.scale, output->params.zero_point);
reference_integer_ops::Logistic(
input->params.zero_point, data.input_range_radius,
data.input_multiplier, data.input_left_shift,
NumElements(input->dims), GetTensorData<int8_t>(input),
GetTensorData<int8_t>(output));
return kTfLiteOk;
}
default:
@ -98,7 +139,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteError;
}
} else {
// (b/141211002): Also support other data types once we have supported
// TODO(b/141211002): Also support other data types once we have supported
// temporary tensors in TFLM.
TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
TfLiteTypeGetName(input->type),
@ -114,7 +155,7 @@ TfLiteRegistration Register_LOGISTIC() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/activations::Eval,
/*invoke=*/activations::LogisticEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,


@ -0,0 +1,229 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/mul.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/memory_helpers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace mul {
constexpr int kInput1Tensor = 0;
constexpr int kInput2Tensor = 1;
constexpr int kOutputTensor = 0;
struct OpData {
int32_t output_activation_min;
int32_t output_activation_max;
int32_t output_multiplier;
int output_shift;
};
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpData* data) {
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
double real_multiplier = static_cast<double>(input1->params.scale) *
static_cast<double>(input2->params.scale) /
static_cast<double>(output->params.scale);
QuantizeMultiplier(real_multiplier, &data->output_multiplier,
&data->output_shift);
}
return kTfLiteOk;
}
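Note: CalculateOpData above follows the usual quantized-multiply recipe: the product of the two input scales divided by the output scale becomes a single output_multiplier/output_shift pair, and the activation range is precomputed as integer limits. A per-element sketch of the requantization that both the reference kernels and xa_nn_elm_mul_asym8xasym8_asym8 perform; double arithmetic stands in for the fixed-point multiplier, and all scales and zero points are made-up examples:

/* Sketch: one element of a quantized (asymmetric uint8) multiply. */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical quantization parameters.
  const float s1 = 0.5f, s2 = 0.25f, s_out = 1.0f;
  const int32_t z1 = 128, z2 = 128, z_out = 128;
  const double real_multiplier = static_cast<double>(s1) * s2 / s_out;  // 0.125

  const uint8_t q1 = 140, q2 = 100;  // example inputs
  const int32_t raw =
      (static_cast<int32_t>(q1) - z1) * (static_cast<int32_t>(q2) - z2);
  int32_t q = static_cast<int32_t>(std::lround(raw * real_multiplier)) + z_out;
  q = std::min<int32_t>(255, std::max<int32_t>(0, q));  // uint8 activation range
  std::printf("q_out=%d\n", q);  // (140-128)*(100-128) = -336 -> -42 + 128 = 86
  return 0;
}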
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
if (output->dims->size == 0) {
return AllocateOutputDimensionsFromInput(context, input1, input2, output);
}
return kTfLiteOk;
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpData* data,
const TfLiteTensor* input1,
const TfLiteTensor* input2, TfLiteTensor* output) {
if (output->type == kTfLiteInt8 || output->type == kTfLiteUInt8) {
tflite::ArithmeticParams op_params;
SetActivationParams(data->output_activation_min,
data->output_activation_max, &op_params);
op_params.input1_offset = -input1->params.zero_point;
op_params.input2_offset = -input2->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = data->output_shift;
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_MUL(type, opname, dtype) \
type::opname(op_params, GetTensorShape(input1), \
GetTensorData<dtype>(input1), GetTensorShape(input2), \
GetTensorData<dtype>(input2), GetTensorShape(output), \
GetTensorData<dtype>(output));
if (output->type == kTfLiteInt8) {
if (need_broadcast) {
TF_LITE_MUL(reference_integer_ops, BroadcastMul4DSlow, int8_t);
} else {
TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
}
} else if (output->type == kTfLiteUInt8) {
if (need_broadcast) {
TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, uint8_t);
} else {
int err;
const RuntimeShape& input1_shape = GetTensorShape(input1);
const RuntimeShape& input2_shape = GetTensorShape(input2);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
err = xa_nn_elm_mul_asym8xasym8_asym8(
GetTensorData<uint8_t>(output), op_params.output_offset,
op_params.output_shift, op_params.output_multiplier,
op_params.quantized_activation_min,
op_params.quantized_activation_max, GetTensorData<uint8_t>(input1),
op_params.input1_offset, GetTensorData<uint8_t>(input2),
op_params.input2_offset, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_mul_asym8xasym8_asym8 failed");
}
}
#undef TF_LITE_MUL
}
return kTfLiteOk;
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpData* data,
const TfLiteTensor* input1, const TfLiteTensor* input2,
TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::ArithmeticParams op_params;
SetActivationParams(output_activation_min, output_activation_max, &op_params);
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_MUL(opname) \
reference_ops::opname(op_params, GetTensorShape(input1), \
GetTensorData<float>(input1), GetTensorShape(input2), \
GetTensorData<float>(input2), GetTensorShape(output), \
GetTensorData<float>(output));
if (need_broadcast) {
TF_LITE_MUL(BroadcastMul4DSlow);
} else {
#if HIFI_VFPU
int err;
const RuntimeShape& input1_shape = GetTensorShape(input1);
const RuntimeShape& input2_shape = GetTensorShape(input2);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
err = xa_nn_elm_mul_f32xf32_f32(GetTensorData<float>(output),
GetTensorData<float>(input1),
GetTensorData<float>(input2), flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_mul_f32xf32_f32 failed");
err = xa_nn_vec_activation_min_max_f32_f32(
GetTensorData<float>(output), GetTensorData<float>(output),
output_activation_min, output_activation_max, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err,
"xa_nn_vec_activation_min_max_f32_f32 failed");
#else
TF_LITE_MUL(Mul);
#endif /* HIFI_VFPU */
}
#undef TF_LITE_MUL
return kTfLiteOk;
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, &data));
switch (input1->type) {
case kTfLiteUInt8:
case kTfLiteInt8:
TF_LITE_ENSURE_OK(context, EvalQuantized(context, node, params, &data,
input1, input2, output));
break;
case kTfLiteFloat32:
TF_LITE_ENSURE_OK(context, EvalFloat(context, node, params, &data, input1,
input2, output));
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input1->type), input1->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace mul
TfLiteRegistration Register_MUL() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/mul::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace micro
} // namespace ops
} // namespace tflite


@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -40,7 +40,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -83,6 +83,7 @@ TfLiteStatus AverageEvalFloat(TfLiteContext* context, const TfLiteNode* node,
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
#if HIFI_VFPU
const int stride_height = params->stride_height;
const int stride_width = params->stride_width;
const int pad_width = data->padding.width;
@ -168,6 +169,20 @@ TfLiteStatus AverageEvalFloat(TfLiteContext* context, const TfLiteNode* node,
CHECK_ERR_HIFI_NNLIB_KER(
err, "AveragepoolFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
}
#else
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
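Note: the #else branch added above routes the float case to reference_ops::AveragePool when the HiFi vector FPU path is compiled out. For one output position, both paths average the in-bounds pixels of the filter window and clamp the result to the activation range. A minimal single-window sketch, illustrative only (one channel and one batch, with the same stride/filter/padding fields as the PoolParams filled in above):

/* Sketch: one output element of float average pooling. */
#include <algorithm>

static float AveragePoolOneOutput(
    const float* input, int input_height, int input_width,
    int out_y, int out_x, int stride_height, int stride_width,
    int filter_height, int filter_width, int pad_height, int pad_width,
    float activation_min, float activation_max) {
  const int in_y_origin = out_y * stride_height - pad_height;
  const int in_x_origin = out_x * stride_width - pad_width;
  float sum = 0.0f;
  int count = 0;  // only in-bounds pixels contribute to the average
  for (int fy = 0; fy < filter_height; ++fy) {
    for (int fx = 0; fx < filter_width; ++fx) {
      const int in_y = in_y_origin + fy;
      const int in_x = in_x_origin + fx;
      if (in_y < 0 || in_y >= input_height || in_x < 0 || in_x >= input_width) {
        continue;
      }
      sum += input[in_y * input_width + in_x];
      ++count;
    }
  }
  const float avg = count > 0 ? sum / count : 0.0f;
  return std::min(std::max(avg, activation_min), activation_max);
}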
@ -177,7 +192,6 @@ TfLiteStatus AverageEvalQuantized(TfLiteContext* context,
const OpData* data, const TfLiteTensor* input,
TfLiteTensor* output) {
TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8);
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
@ -295,6 +309,7 @@ TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
#if HIFI_VFPU
const int stride_height = params->stride_height;
const int stride_width = params->stride_width;
const int pad_width = data->padding.width;
@ -378,6 +393,20 @@ TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
CHECK_ERR_HIFI_NNLIB_KER(
err, "MaxpoolFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
}
#else
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::MaxPool(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(output),
GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
@ -491,7 +520,6 @@ TfLiteStatus MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
return kTfLiteOk;
}
} // namespace
@ -504,7 +532,7 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
// Inputs and outputs share the same type, guarenteed by the converter.
// Inputs and outputs share the same type, guaranteed by the converter.
switch (input->type) {
case kTfLiteFloat32:
AverageEvalFloat(context, node, params, &data, input, output);


@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -43,7 +43,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
namespace micro {
@ -105,6 +105,7 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
// Takes a tensor and performs softmax along the last dimension.
TfLiteStatus SoftmaxFloat(TfLiteContext* context, const TfLiteTensor* input,
TfLiteTensor* output, const SoftmaxParams& op_data) {
#if HIFI_VFPU
const RuntimeShape& input_shape = GetTensorShape(input);
const float* input_data = GetTensorData<float>(input);
const RuntimeShape& output_shape = GetTensorShape(output);
@ -133,6 +134,11 @@ TfLiteStatus SoftmaxFloat(TfLiteContext* context, const TfLiteTensor* input,
xa_nn_vec_softmax_f32_f32(&output_data[i * depth], p_scratch, depth);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_softmax_f32_f32 failed");
}
#else
tflite::reference_ops::Softmax(
op_data, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
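Note: SoftmaxFloat above either runs the NNLIB routine per row (a scratch pass followed by xa_nn_vec_softmax_f32_f32) or falls back to reference_ops::Softmax; both compute a numerically stable softmax along the last dimension. A minimal per-row sketch, illustrative only (the beta scaling carried in SoftmaxParams is left out):

/* Sketch: numerically stable softmax over one row of `depth` floats. */
#include <algorithm>
#include <cmath>

static void SoftmaxRowSketch(const float* input, float* output, int depth) {
  // Subtract the row max first so exp() cannot overflow.
  const float max_val = *std::max_element(input, input + depth);
  float sum = 0.0f;
  for (int i = 0; i < depth; ++i) {
    output[i] = std::exp(input[i] - max_val);
    sum += output[i];
  }
  const float inv_sum = 1.0f / sum;
  for (int i = 0; i < depth; ++i) output[i] *= inv_sum;
}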


@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -18,7 +18,6 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -44,8 +43,8 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/activation_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/micro_utils.h"
#include "xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -53,10 +52,6 @@ namespace micro {
namespace svdf {
namespace {
// These constants represent constants specific to the hotword "OK G" model.
// They exist until (b/132070898) is fixed.
constexpr int kScratchTensorMaxSize = 64;
struct OpData {
int32 effective_scale_1_a;
int32 effective_scale_2_a;
@ -64,6 +59,8 @@ struct OpData {
// shift value - typically between [-32, 32].
int effective_scale_1_b;
int effective_scale_2_b;
int scratch_tensor_index;
int scratch_output_tensor_index;
};
/**
@ -84,6 +81,7 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
float* const __restrict__ output_ptr) {
// Compute matmul(activation_state, weights_time).
#if HIFI_VFPU
float* scratch_bias = scratch_ptr;
if (bias_ptr) {
const float* bias_data = bias_ptr;
@ -111,6 +109,51 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
weights_time_vec += memory_size * rank;
}
}
#else
for (int b = 0; b < batch_size; ++b) {
// Perform batched vector dot product:
float* scratch_ptr_batch = scratch_ptr + b * num_filters;
const float* vector1_ptr = weights_time_ptr;
const float* vector2_ptr = state_ptr + b * memory_size * num_filters;
for (int i = 0; i < num_filters; ++i) {
*scratch_ptr_batch = 0.f;
for (int j = 0; j < memory_size; ++j) {
*scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++;
}
scratch_ptr_batch++;
}
}
// Initialize output with bias if provided.
if (bias_ptr) {
// VectorBatchVectorAssign
for (int i = 0; i < batch_size; ++i) {
float* output_data = output_ptr + i * num_units;
const float* bias_data = bias_ptr;
for (int j = 0; j < num_units; ++j) {
*output_data++ = *bias_data++;
}
}
} else {
float* output_data = output_ptr;
for (int i = 0; i < batch_size * num_units; ++i) {
*output_data++ = 0.0f;
}
}
// Reduction sum.
for (int b = 0; b < batch_size; ++b) {
float* output_ptr_batch = output_ptr + b * num_units;
float* scratch_ptr_batch = scratch_ptr + b * num_filters;
// Reduction sum vector
for (int i = 0; i < num_units; ++i) {
for (int j = 0; j < rank; j++) {
output_ptr_batch[i] += *scratch_ptr_batch++;
}
}
}
#endif /* HIFI_VFPU */
// Apply activation.
for (int b = 0; b < batch_size; ++b) {
@ -127,7 +170,8 @@ inline TfLiteStatus EvalFloatSVDF(
TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
const TfLiteTensor* bias, const TfLiteSVDFParams* params,
TfLiteTensor* activation_state, TfLiteTensor* output) {
int scratch_tensor_index, TfLiteTensor* activation_state,
TfLiteTensor* output) {
const int rank = params->rank;
const int batch_size = input->dims->data[0];
const int input_size = input->dims->data[1];
@ -142,10 +186,11 @@ inline TfLiteStatus EvalFloatSVDF(
float* state_ptr = GetTensorData<float>(activation_state);
// TODO(b/132070898): Move this temp variable to the new scratch buffer API
// when ready.
float scratch_tensor[kScratchTensorMaxSize];
float* scratch_ptr = scratch_tensor;
TFLITE_DCHECK(context != nullptr);
TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
float* scratch_ptr = static_cast<float*>(
context->GetScratchBuffer(context, scratch_tensor_index));
float* output_ptr = GetTensorData<float>(output);
@ -174,6 +219,7 @@ inline TfLiteStatus EvalFloatSVDF(
float* result = &state_ptr[memory_size - 1];
float* result_in_batch = result;
#if HIFI_VFPU
float* out_scratch = scratch_ptr;
float* bias_scratch = output_ptr;
for (int i = 0; i < num_units; i++) bias_scratch[i] = 0.0f;
@ -195,6 +241,20 @@ inline TfLiteStatus EvalFloatSVDF(
result_in_batch += memory_size;
}
}
#else
for (int i = 0; i < batch_size; ++i) {
const float* matrix_ptr = matrix;
for (int j = 0; j < num_filters; ++j) {
float dot_prod = 0.0f;
const float* vector_in_batch = vector + i * input_size;
for (int k = 0; k < input_size; ++k) {
dot_prod += *matrix_ptr++ * *vector_in_batch++;
}
*result_in_batch = dot_prod;
result_in_batch += memory_size;
}
}
#endif /* HIFI_VFPU */
}
return ApplyTimeWeightsBiasAndActivation(
@ -203,13 +263,15 @@ inline TfLiteStatus EvalFloatSVDF(
output_ptr);
}
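Note: the SVDF changes in this file replace the fixed kScratchTensorMaxSize stack arrays with the TFLM scratch-buffer API: Prepare() requests arena space through RequestScratchBufferInArena() and stores the returned index in OpData, and Eval-time code resolves that index to a pointer with GetScratchBuffer(). A condensed sketch of that pattern as it appears in this kernel; the function and struct names are illustrative, the API calls and sizes follow the diff:

/* Sketch of the TFLM scratch-buffer pattern used by this kernel. */
#include "tensorflow/lite/c/common.h"

struct SketchOpData {
  int scratch_tensor_index;
};

// In Prepare(): request arena space and remember the returned index.
TfLiteStatus PrepareScratchSketch(TfLiteContext* context, SketchOpData* data,
                                  int batch_size, int num_filters) {
  return context->RequestScratchBufferInArena(
      context, batch_size * num_filters * sizeof(float),
      &data->scratch_tensor_index);
}

// In Eval(): resolve the index to a pointer valid for this invocation only.
float* GetScratchSketch(TfLiteContext* context, const SketchOpData& data) {
  return static_cast<float*>(
      context->GetScratchBuffer(context, data.scratch_tensor_index));
}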
void EvalIntegerSVDF(
TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input_tensor,
void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
const TfLiteTensor* input_tensor,
const TfLiteTensor* weights_feature_tensor,
const TfLiteTensor* weights_time_tensor, const TfLiteTensor* bias_tensor,
const TfLiteSVDFParams* params, TfLiteTensor* activation_state_tensor,
TfLiteTensor* output_tensor, int32_t scale_1_a, int scale_1_b,
int32_t scale_2_a, int scale_2_b, int32_t input_zp, int32_t output_zp) {
const TfLiteTensor* weights_time_tensor,
const TfLiteTensor* bias_tensor,
const TfLiteSVDFParams* params,
TfLiteTensor* activation_state_tensor,
TfLiteTensor* output_tensor, const OpData& data,
int32_t input_zp, int32_t output_zp) {
const int n_rank = params->rank;
const int n_batch = input_tensor->dims->data[0];
const int n_input = input_tensor->dims->data[1];
@ -217,10 +279,13 @@ void EvalIntegerSVDF(
const int n_unit = n_filter / n_rank;
const int n_memory = weights_time_tensor->dims->data[1];
// TODO(b/132070898): Move these temp variables to the new scratch buffer API
// when ready.
int32_t scratch_tensor[kScratchTensorMaxSize];
int32_t scratch_output_tensor[kScratchTensorMaxSize];
TFLITE_DCHECK(context != nullptr);
TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
int32_t* scratch_tensor = static_cast<int32_t*>(
context->GetScratchBuffer(context, data.scratch_tensor_index));
int32_t* scratch_output_tensor = static_cast<int32_t*>(
context->GetScratchBuffer(context, data.scratch_output_tensor_index));
// Shift states.
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
@ -254,8 +319,8 @@ void EvalIntegerSVDF(
for (int c = 0; c < n_input; c++) {
dot_prod += *matrix_ptr++ * (*vector_in_batch++ - input_zp);
}
dot_prod =
MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
dot_prod = MultiplyByQuantizedMultiplier(
dot_prod, data.effective_scale_1_a, data.effective_scale_1_b);
dot_prod = std::min(std::max(output_min, dot_prod), output_max);
// This assumes state is symmetrically quantized. Otherwise last bit of
// state should be initialized to its zero point and accumulate the
@ -328,7 +393,8 @@ void EvalIntegerSVDF(
const int32_t output_min = std::numeric_limits<int8_t>::min();
for (int i = 0; i < n_batch * n_unit; ++i) {
int32_t x1 = scratch_output_tensor[i];
int32_t x2 = MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b);
int32_t x2 = MultiplyByQuantizedMultiplier(x1, data.effective_scale_2_a,
data.effective_scale_2_b);
int32_t x3 = x2 + output_zp;
int32_t x4 = std::min(std::max(output_min, x3), output_max);
GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
@ -349,8 +415,20 @@ constexpr int kInputActivationStateTensor = 4;
// Output tensor.
constexpr int kOutputTensor = 0;
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
TFLITE_DCHECK(node->builtin_data != nullptr);
const auto* params = static_cast<const TfLiteSVDFParams*>(node->builtin_data);
// Validate Tensor Inputs (dtype depends on quantization):
// [0] = Input, {2, batch_size, input_size}
@ -359,7 +437,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// [3] = Bias (optional), {1, num_units}
// [4] = Activation State (variable),
// {2, batch_size, memory_size * num_filters}
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* weights_feature =
GetInput(context, node, kWeightsFeatureTensor);
@ -378,8 +455,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const int num_units = num_filters / rank;
const int memory_size = weights_time->dims->data[1];
const bool is_full_integer = input->type == kTfLiteInt8;
// Validate Input Tensor:
TF_LITE_ENSURE(context,
input->type == kTfLiteFloat32 || input->type == kTfLiteInt8);
@ -403,7 +478,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, weights_time->dims->data[1], memory_size);
// Validate Optional Bias Input Tensor:
if (bias) {
if (bias != nullptr) {
TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units);
}
@ -413,51 +488,75 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1],
memory_size * num_filters);
if (is_full_integer) {
TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8);
TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16);
if (bias) {
TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
if (bias != nullptr) {
TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
}
TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
// Validate Scratch Tensors:
// [0] = (shared - see float block below for usage)
// [1] = Output Temp, int8_t, {2, num_units, batch_size}
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().
// Validate output tensor:
TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
} else {
TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
// Validate Input Tensor dtypes:
const auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
input->quantization.params);
const auto* weights_feature_params =
static_cast<const TfLiteAffineQuantization*>(
weights_feature->quantization.params);
const auto* state_params = static_cast<const TfLiteAffineQuantization*>(
activation_state->quantization.params);
const auto* weight_time_params =
static_cast<const TfLiteAffineQuantization*>(
weights_time->quantization.params);
const auto* output_params = static_cast<const TfLiteAffineQuantization*>(
output->quantization.params);
const double effective_scale_1 =
static_cast<double>(input_params->scale->data[0] *
weights_feature_params->scale->data[0] /
state_params->scale->data[0]);
const double effective_scale_2 = static_cast<double>(
state_params->scale->data[0] * weight_time_params->scale->data[0] /
output_params->scale->data[0]);
TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
QuantizeMultiplier(effective_scale_1, &(data->effective_scale_1_a),
&(data->effective_scale_1_b));
QuantizeMultiplier(effective_scale_2, &(data->effective_scale_2_a),
&(data->effective_scale_2_b));
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
context, batch_size * num_filters * sizeof(int32_t),
&(data->scratch_tensor_index));
TF_LITE_ENSURE_OK(context, scratch_status);
const TfLiteStatus scratch_output_status =
context->RequestScratchBufferInArena(
context, batch_size * num_units * sizeof(int32_t),
&(data->scratch_output_tensor_index));
TF_LITE_ENSURE_OK(context, scratch_output_status);
} else {
TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteFloat32);
TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteFloat32);
if (bias) {
if (bias != nullptr) {
TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
}
// Validate shared Scratch Tensor:
// [0] = Holds dot-product of time-forward calculations in
// ApplyTimeWeightsBiasAndActivation():
// float/int32, {2, batch_size, num_filters}
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().
// Full-float SVDF only uses the one shared scratch tensor (see above for
// usage).
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1);
TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
context, batch_size * num_filters * sizeof(float),
&(data->scratch_tensor_index));
TF_LITE_ENSURE_OK(context, scratch_status);
}
return kTfLiteOk;
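Note: for the integer path, Prepare() above now derives both effective scales once: effective_scale_1 = input_scale * weights_feature_scale / state_scale feeds the feature matmul, and effective_scale_2 = state_scale * weights_time_scale / output_scale feeds the final output requantization; each is turned into a multiplier/shift pair with QuantizeMultiplier and cached in OpData. A small numeric sketch of that derivation with made-up scales, double arithmetic only, for illustration:

/* Sketch: the two effective scales the integer SVDF Prepare() computes. */
#include <cstdio>

int main() {
  // Hypothetical tensor scales.
  const double input_scale = 0.024;
  const double weights_feature_scale = 0.007;
  const double state_scale = 0.008;        // activation_state
  const double weights_time_scale = 0.005;
  const double output_scale = 0.023;

  const double effective_scale_1 =
      input_scale * weights_feature_scale / state_scale;  // feature matmul
  const double effective_scale_2 =
      state_scale * weights_time_scale / output_scale;    // time matmul + output
  // Each value is then passed to QuantizeMultiplier() to obtain the
  // effective_scale_*_a (Q31 multiplier) / effective_scale_*_b (shift)
  // pairs stored in OpData above.
  std::printf("scale_1=%f scale_2=%f\n", effective_scale_1, effective_scale_2);
  return 0;
}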
@ -476,56 +575,24 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
GetVariableInput(context, node, kInputActivationStateTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const bool is_full_integer = input->type == kTfLiteInt8;
TFLITE_DCHECK(node->user_data != nullptr);
const OpData& data = *(static_cast<const OpData*>(node->user_data));
switch (weights_feature->type) {
case kTfLiteFloat32: {
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
return EvalFloatSVDF(context, node, input, weights_feature, weights_time,
bias, params, activation_state, output);
bias, params, data.scratch_tensor_index,
activation_state, output);
break;
}
case kTfLiteInt8: {
if (is_full_integer) {
// TODO(b/132070898): Store these values in ::Prepare() instead of
// ::Eval():
// Calculate effective scales.
OpData op_data;
auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
input->quantization.params);
auto* weights_feature_params =
reinterpret_cast<TfLiteAffineQuantization*>(
weights_feature->quantization.params);
auto* state_params = reinterpret_cast<TfLiteAffineQuantization*>(
activation_state->quantization.params);
auto* weight_time_params = reinterpret_cast<TfLiteAffineQuantization*>(
weights_time->quantization.params);
auto* output_params = reinterpret_cast<TfLiteAffineQuantization*>(
output->quantization.params);
const double effective_scale_1 =
static_cast<double>(input_params->scale->data[0] *
weights_feature_params->scale->data[0] /
state_params->scale->data[0]);
const double effective_scale_2 = static_cast<double>(
state_params->scale->data[0] * weight_time_params->scale->data[0] /
output_params->scale->data[0]);
QuantizeMultiplier(effective_scale_1, &op_data.effective_scale_1_a,
&op_data.effective_scale_1_b);
QuantizeMultiplier(effective_scale_2, &op_data.effective_scale_2_a,
&op_data.effective_scale_2_b);
TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActRelu);
EvalIntegerSVDF(
context, node, input, weights_feature, weights_time, bias, params,
activation_state, output, op_data.effective_scale_1_a,
op_data.effective_scale_1_b, op_data.effective_scale_2_a,
op_data.effective_scale_2_b, input->params.zero_point,
output->params.zero_point);
EvalIntegerSVDF(context, node, input, weights_feature, weights_time, bias,
params, activation_state, output, data,
input->params.zero_point, output->params.zero_point);
return kTfLiteOk;
}
break;
}
@ -540,7 +607,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} // namespace svdf
TfLiteRegistration Register_SVDF() {
return {/*init=*/nullptr,
return {/*init=*/svdf::Init,
/*free=*/nullptr,
/*prepare=*/svdf::Prepare,
/*invoke=*/svdf::Eval,


@ -4,6 +4,8 @@ ifneq ($(filter xtensa_hifi, $(ALL_TAGS)),)
ifneq (,$(filter hifi4%, $(TARGET_ARCH)))
NNLIB = xa_nnlib_hifi4
CCFLAGS += -DNNLIB_V2 \
-DXTENSA_NNLIB_MAX_SCRATCH_SIZE=70*1024
@ -11,56 +13,60 @@ ifneq ($(filter xtensa_hifi, $(ALL_TAGS)),)
-DXTENSA_NNLIB_MAX_SCRATCH_SIZE=70*1024
MICROLITE_CC_SRCS += \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_f32_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_asym8_asym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_32_16.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_32_8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_softmax_asym8_asym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_floor_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_circ_buf.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_asym8xasym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_matXvec_asym8xasym8_asym8_circ.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_matXvec_f32_circ.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_asym8xasym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_circ_buf.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/fc/hifi4/xa_nn_fully_connected.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_16x16.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x16.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_asym8xasym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_f32_nhwc.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8_nhwc.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_f32_nhwc.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8_nhwc.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_inv_256_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_sigmoidf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_tanhf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_reluf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_softmaxf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_alognf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/scl_sigmoidf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/scl_tanhf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/expf_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/pow2f_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/inff_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/tanhf_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/nanf_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_f32_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_asym8_asym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_32_16.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_32_8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_softmax_asym8_asym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_floor_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_add_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_add_quant8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_mul_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_mul_quant8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_circ_buf.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_asym8xasym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_matXvec_asym8xasym8_asym8_circ.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_matXvec_f32_circ.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_asym8xasym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_circ_buf.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/fc/hifi4/xa_nn_fully_connected.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_16x16.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x16.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_asym8xasym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_f32_nhwc.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8_nhwc.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_f32_nhwc.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8_nhwc.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_inv_256_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_sigmoidf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_tanhf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_reluf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_softmaxf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_alognf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/scl_sigmoidf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/scl_tanhf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/expf_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/pow2f_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/inff_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/tanhf_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/nanf_tbl.c \
INCLUDES += -I$(XTENSA_PATH)/xa_nnlib/algo/kernels/ \
-I$(XTENSA_PATH)/xa_nnlib/include/nnlib/ \
-I$(XTENSA_PATH)/xa_nnlib/include/ \
-I$(XTENSA_PATH)/xa_nnlib/algo/common/include/ \
-I$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/include/ \
INCLUDES += -I$(XTENSA_PATH)/$(NNLIB)/algo/kernels/ \
-I$(XTENSA_PATH)/$(NNLIB)/include/nnlib/ \
-I$(XTENSA_PATH)/$(NNLIB)/include/ \
-I$(XTENSA_PATH)/$(NNLIB)/algo/common/include/ \
-I$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/include/ \
endif


@ -5,7 +5,7 @@
ifeq ($(TARGET), xtensa_hifi)
TARGET_ARCH := hifi3_bd5
$(eval $(call add_third_party_download,$(XTENSA_HIFI4_URL),$(XTENSA_HIFI4_MD5),xa_nnlib,))
$(eval $(call add_third_party_download,$(XTENSA_HIFI4_URL),$(XTENSA_HIFI4_MD5),xa_nnlib_hifi4,))
PLATFORM_ARGS = \
-mno-mul16 \


@ -80,8 +80,8 @@ EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6"
ZEPHYR_URL := "https://github.com/antmicro/zephyr/archive/55e36b9.zip"
ZEPHYR_MD5 := "755622eb4812fde918a6382b65d50c3b"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_06_27.zip"
XTENSA_HIFI4_MD5 :="45fdc1209a8da62ab568aa6040f7eabf"
ETHOSU_URL := "https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git/snapshot/ethos-u-core-driver-bcb5aaa99756f1b5c1295b079ebdd60996bc75a5.tar.gz"
ETHOSU_MD5 := "d2073c8d88fc167fd5c46b5dcda58ea1"