Cadence NNLib: Fixed various unit test failures
Fixed person detection test, max pool and SVDF unit tests. Signed-off-by: Bhanu Prakash Bandaru Venkata <bhanup@cadence.com>
This commit is contained in:
parent
5d08924cda
commit
cb4f8412f8
@ -55,7 +55,7 @@ constexpr int kInputTensor = 0;
|
||||
constexpr int kFilterTensor = 1;
|
||||
constexpr int kBiasTensor = 2;
|
||||
constexpr int kOutputTensor = 0;
|
||||
constexpr int kMaxChannels = 8;
|
||||
constexpr int kMaxChannels = 256;
|
||||
|
||||
// Conv is quantized along dimension 0:
|
||||
// https://www.tensorflow.org/lite/performance/quantization_spec
|
||||
|
@ -58,7 +58,7 @@ constexpr int kFilterTensor = 1;
|
||||
constexpr int kBiasTensor = 2;
|
||||
constexpr int kOutputTensor = 0;
|
||||
// Per channel quantization is not needed for any model on xtensa.
|
||||
constexpr int kMaxChannels = 8;
|
||||
constexpr int kMaxChannels = 256;
|
||||
|
||||
// Depthwise conv is quantized along dimension 3:
|
||||
// https://www.tensorflow.org/lite/performance/quantization_spec
|
||||
|
@ -78,14 +78,15 @@ struct OpData {
|
||||
*/
|
||||
|
||||
static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
|
||||
TfLiteContext* context, int batch_size, int memory_size, int num_filters,
|
||||
int num_units, int rank, const TfLiteTensor* weights_time,
|
||||
const TfLiteTensor* bias, TfLiteFusedActivation activation,
|
||||
TfLiteTensor* activation_state, TfLiteTensor* scratch,
|
||||
TfLiteTensor* output) {
|
||||
float* scratch_bias = GetTensorData<float>(scratch);
|
||||
if (bias) {
|
||||
const float* bias_data = GetTensorData<float>(bias);
|
||||
TfLiteContext* context, int batch_size, int memory_size, int num_filters, int num_units, int rank,
|
||||
const float* const __restrict__ weights_time_ptr,
|
||||
const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
|
||||
float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
|
||||
float* const __restrict__ output_ptr) {
|
||||
// Compute matmul(activation_state, weights_time).
|
||||
float* scratch_bias = scratch_ptr;
|
||||
if (bias_ptr) {
|
||||
const float* bias_data = bias_ptr;
|
||||
for (int j = 0; j < num_units; ++j) {
|
||||
scratch_bias[j] = *bias_data++;
|
||||
}
|
||||
@ -96,15 +97,16 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
|
||||
}
|
||||
int err = 0;
|
||||
for (int b = 0; b < batch_size; ++b) {
|
||||
const float* weights_time_vec = GetTensorData<float>(weights_time);
|
||||
const float* weights_time_vec = weights_time_ptr;
|
||||
const float* mat_ptr =
|
||||
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
|
||||
float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
|
||||
state_ptr + b * memory_size * num_filters;
|
||||
float* output_ptr_batch = output_ptr + b * num_units;
|
||||
for (int j = 0; j < num_units; j++) {
|
||||
err = xa_nn_matXvec_f32xf32_f32(
|
||||
output_ptr_batch, mat_ptr, NULL, weights_time_vec, NULL, scratch_bias,
|
||||
1, memory_size * rank, 0, memory_size * rank, 0);
|
||||
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_matXvec_f32xf32_f32 failed");
|
||||
|
||||
output_ptr_batch++;
|
||||
mat_ptr += memory_size * rank;
|
||||
weights_time_vec += memory_size * rank;
|
||||
@ -113,30 +115,12 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
|
||||
|
||||
// Apply activation.
|
||||
for (int b = 0; b < batch_size; ++b) {
|
||||
float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
|
||||
float* output_ptr_batch = output_ptr + b * num_units;
|
||||
for (int i = 0; i < num_units; ++i) {
|
||||
*output_ptr_batch = ActivationValFloat(activation, *output_ptr_batch);
|
||||
++output_ptr_batch;
|
||||
}
|
||||
}
|
||||
|
||||
// Left shift the activation_state to make room for next cycle's activation.
|
||||
// (alanchiao): explore collapsing this into a single loop.
|
||||
for (int b = 0; b < batch_size; ++b) {
|
||||
float* state_ptr_batch =
|
||||
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
|
||||
for (int f = 0; f < num_filters; ++f) {
|
||||
// Shift the vector left:
|
||||
float* batch_ptr = state_ptr_batch;
|
||||
float* batch_start = state_ptr_batch + 1;
|
||||
float* batch_end = state_ptr_batch + memory_size;
|
||||
while (batch_start != batch_end) {
|
||||
*batch_ptr++ = *batch_start++;
|
||||
}
|
||||
state_ptr_batch[memory_size - 1] = 0.0f;
|
||||
state_ptr_batch += memory_size;
|
||||
}
|
||||
}
|
||||
return kTfLiteOk;
|
||||
}
|
||||
|
||||
@ -144,8 +128,7 @@ inline TfLiteStatus EvalFloatSVDF(
|
||||
TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
|
||||
const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
|
||||
const TfLiteTensor* bias, const TfLiteSVDFParams* params,
|
||||
TfLiteTensor* scratch, TfLiteTensor* activation_state,
|
||||
TfLiteTensor* output) {
|
||||
TfLiteTensor* activation_state, TfLiteTensor* output) {
|
||||
const int rank = params->rank;
|
||||
const int batch_size = input->dims->data[0];
|
||||
const int input_size = input->dims->data[1];
|
||||
@ -153,30 +136,47 @@ inline TfLiteStatus EvalFloatSVDF(
|
||||
const int num_units = num_filters / rank;
|
||||
const int memory_size = weights_time->dims->data[1];
|
||||
|
||||
// Clear the activation (activation_state's leftmost column).
|
||||
// (ghodrat): Add a test which initialize activation_state with invalid
|
||||
// values in leftmost column and make sure it passes.
|
||||
for (int b = 0; b < batch_size; ++b) {
|
||||
float* state_ptr_batch =
|
||||
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
|
||||
const float* weights_feature_ptr = GetTensorData<float>(weights_feature);
|
||||
const float* weights_time_ptr = GetTensorData<float>(weights_time);
|
||||
const float* bias_ptr = GetTensorData<float>(bias);
|
||||
const float* input_ptr = GetTensorData<float>(input);
|
||||
|
||||
float* state_ptr = GetTensorData<float>(activation_state);
|
||||
|
||||
// TODO(b/132070898): Move this temp variable to the new scratch buffer API
|
||||
// when ready.
|
||||
float scratch_tensor[kScratchTensorMaxSize];
|
||||
float* scratch_ptr = scratch_tensor;
|
||||
|
||||
float* output_ptr = GetTensorData<float>(output);
|
||||
|
||||
// Left shift the activation_state.
|
||||
{
|
||||
float* new_state_start = state_ptr;
|
||||
const float* old_state_start = state_ptr + 1;
|
||||
const float* old_state_end =
|
||||
state_ptr + batch_size * num_filters * memory_size;
|
||||
while (old_state_start != old_state_end) {
|
||||
*new_state_start++ = *old_state_start++;
|
||||
}
|
||||
}
|
||||
|
||||
// Note: no need to clear the latest activation, matmul is not accumulative.
|
||||
|
||||
// Compute conv1d(inputs, weights_feature).
|
||||
// The activation_state's rightmost column is used to save current cycle
|
||||
// activation. This is achieved by starting at
|
||||
// GetTensorData<float>(activation_state)[memory_size - 1] and having the
|
||||
// stride equal to memory_size.
|
||||
// activation. This is achieved by starting at state_ptr[memory_size - 1] and
|
||||
// having the stride equal to memory_size.
|
||||
|
||||
const float* matrix = GetTensorData<float>(weights_feature);
|
||||
const float* vector = GetTensorData<float>(input);
|
||||
float* out_scratch = GetTensorData<float>(scratch);
|
||||
/* NNLib matXvec needs a bias buffer, so using output buffer to
|
||||
avoid need for extra memory, output buffer size is batch * num_units,
|
||||
batch is at least 1 so we use size num_units of it */
|
||||
float* bias_scratch = GetTensorData<float>(output);
|
||||
float* result = &GetTensorData<float>(activation_state)[memory_size - 1];
|
||||
float* result_in_batch = result;
|
||||
// Perform batched matrix vector multiply operation:
|
||||
{
|
||||
const float* matrix = weights_feature_ptr;
|
||||
const float* vector = input_ptr;
|
||||
float* result = &state_ptr[memory_size - 1];
|
||||
float* result_in_batch = result;
|
||||
|
||||
float* out_scratch = scratch_ptr;
|
||||
float* bias_scratch = output_ptr;
|
||||
for (int i = 0; i < num_units; i++) bias_scratch[i] = 0.0f;
|
||||
|
||||
int err = 0;
|
||||
@ -196,11 +196,11 @@ inline TfLiteStatus EvalFloatSVDF(
|
||||
result_in_batch += memory_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ApplyTimeWeightsBiasAndActivation(
|
||||
context, batch_size, memory_size, num_filters, num_units, rank,
|
||||
weights_time, bias, params->activation, activation_state, scratch,
|
||||
output);
|
||||
context, batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr,
|
||||
bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr);
|
||||
}
|
||||
|
||||
void EvalIntegerSVDF(
|
||||
@ -217,24 +217,26 @@ void EvalIntegerSVDF(
|
||||
const int n_unit = n_filter / n_rank;
|
||||
const int n_memory = weights_time_tensor->dims->data[1];
|
||||
|
||||
// (b/132070898): Move these temp variables to the new scratch buffer API
|
||||
// TODO(b/132070898): Move these temp variables to the new scratch buffer API
|
||||
// when ready.
|
||||
int32_t scratch_tensor[kScratchTensorMaxSize];
|
||||
int32_t scratch_output_tensor[kScratchTensorMaxSize];
|
||||
|
||||
// Rewrite last bit of state.
|
||||
// Shift states.
|
||||
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
|
||||
|
||||
// Left shift the activation_state.
|
||||
{
|
||||
for (int b = 0; b < n_batch; ++b) {
|
||||
int16_t* state_ptr_batch =
|
||||
GetTensorData<int16_t>(activation_state_tensor) +
|
||||
b * n_memory * n_filter;
|
||||
for (int c = 0; c < n_filter; ++c) {
|
||||
int16_t* state_ptr = state_ptr_batch + c * n_memory;
|
||||
state_ptr[n_memory - 1] = 0;
|
||||
}
|
||||
int16_t* new_state_start = state_ptr;
|
||||
const int16_t* old_state_start = state_ptr + 1;
|
||||
const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
|
||||
while (old_state_start != old_state_end) {
|
||||
*new_state_start++ = *old_state_start++;
|
||||
}
|
||||
}
|
||||
|
||||
// Note: no need to clear the latest activation, matmul is not accumulative.
|
||||
|
||||
// Feature matmul.
|
||||
{
|
||||
int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
|
||||
@ -255,6 +257,12 @@ void EvalIntegerSVDF(
|
||||
dot_prod =
|
||||
MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
|
||||
dot_prod = std::min(std::max(output_min, dot_prod), output_max);
|
||||
// This assumes state is symmetrically quantized. Otherwise last bit of
|
||||
// state should be initialized to its zero point and accumulate the
|
||||
// dot_prod.
|
||||
// Equivalent as the following:
|
||||
// result_in_batch = zero point, which happens to be zero.
|
||||
// result_in_batch += dot_prod_56.
|
||||
*result_in_batch = dot_prod;
|
||||
result_in_batch += n_memory;
|
||||
}
|
||||
@ -326,26 +334,6 @@ void EvalIntegerSVDF(
|
||||
GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
|
||||
}
|
||||
}
|
||||
|
||||
// Shift state.
|
||||
{
|
||||
for (int b = 0; b < n_batch; ++b) {
|
||||
int16_t* state_ptr_batch =
|
||||
GetTensorData<int16_t>(activation_state_tensor) +
|
||||
b * n_memory * n_filter;
|
||||
for (int f = 0; f < n_filter; ++f) {
|
||||
// Shift the vector left:
|
||||
int16_t* batch_ptr = state_ptr_batch;
|
||||
int16_t* batch_start = state_ptr_batch + 1;
|
||||
int16_t* batch_end = state_ptr_batch + n_memory;
|
||||
while (batch_start != batch_end) {
|
||||
*batch_ptr++ = *batch_start++;
|
||||
}
|
||||
state_ptr_batch[n_memory - 1] = 0;
|
||||
state_ptr_batch += n_memory;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@ -385,12 +373,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
const int rank = params->rank;
|
||||
const int input_size = input->dims->data[1];
|
||||
const int batch_size = input->dims->data[0];
|
||||
// Ensure the input size is a multiple of two. This is necessary since
|
||||
// optimized kernels access the memory in chunks of two, and all accesses
|
||||
// must be aligned to 16 bits.
|
||||
// TODO(b/153202598): Remove when padding is allowed in TFLite tensors.
|
||||
TF_LITE_ENSURE_EQ(context, input_size % 2, 0);
|
||||
|
||||
const int num_filters = weights_feature->dims->data[0];
|
||||
TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
|
||||
const int num_units = num_filters / rank;
|
||||
@ -446,13 +428,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
// Validate Scratch Tensors:
|
||||
// [0] = (shared - see float block below for usage)
|
||||
// [1] = Output Temp, int8_t, {2, num_units, batch_size}
|
||||
// (b/132070898): Scratch values are used as stack variables in
|
||||
// TODO(b/132070898): Scratch values are used as stack variables in
|
||||
// EvalIntegerSVDF().
|
||||
|
||||
// Validate output tensor:
|
||||
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8);
|
||||
} else {
|
||||
TF_LITE_ENSURE_EQ(context, node->inputs->size, 6);
|
||||
TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
|
||||
|
||||
// Validate Input Tensor dtypes:
|
||||
TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
|
||||
@ -467,19 +449,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||
// [0] = Holds dot-product of time-forward calculations in
|
||||
// ApplyTimeWeightsBiasAndActivation():
|
||||
// float/int32, {2, batch_size, num_filters}
|
||||
// (b/132070898): Use input tensor as variable until scratch tensor
|
||||
// allocation has been implemented (b/132070898) TfLiteTensor*
|
||||
// scratch_tensor = GetTemporary(context, node, 0);
|
||||
TfLiteTensor* scratch_tensor = &context->tensors[node->inputs->data[5]];
|
||||
TF_LITE_ENSURE_EQ(context, scratch_tensor->type, kTfLiteFloat32);
|
||||
|
||||
TF_LITE_ENSURE_EQ(context, NumDimensions(scratch_tensor), 2);
|
||||
TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[0], batch_size);
|
||||
TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[1], num_filters);
|
||||
// TODO(b/132070898): Scratch values are used as stack variables in
|
||||
// EvalIntegerSVDF().
|
||||
|
||||
// Full-float SVDF only uses the one shared scratch tensor (see above for
|
||||
// usage).
|
||||
// (b/132070898): Use input tensor as variable until scratch tensor
|
||||
// TODO(b/132070898): Use input tensor as variable until scratch tensor
|
||||
// allocation has been implemented.
|
||||
// TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1);
|
||||
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
|
||||
@ -505,18 +480,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||
|
||||
switch (weights_feature->type) {
|
||||
case kTfLiteFloat32: {
|
||||
// (b/132070898): Use input tensor as variable until scratch tensor
|
||||
// allocation has been implemented. TfLiteTensor* scratch =
|
||||
// GetTemporary(context, node, /*index=*/0);
|
||||
TfLiteTensor* scratch = &context->tensors[node->inputs->data[5]];
|
||||
return EvalFloatSVDF(context, node, input, weights_feature, weights_time,
|
||||
bias, params, scratch, activation_state, output);
|
||||
// TODO(b/132070898): Use input tensor as variable until scratch tensor
|
||||
// allocation has been implemented.
|
||||
// TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
|
||||
return EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias,
|
||||
params, activation_state, output);
|
||||
break;
|
||||
}
|
||||
|
||||
case kTfLiteInt8: {
|
||||
if (is_full_integer) {
|
||||
// (b/132070898): Store these values in ::Prepare() instead of
|
||||
// TODO(b/132070898): Store these values in ::Prepare() instead of
|
||||
// ::Eval():
|
||||
// Calculate effective scales.
|
||||
OpData op_data;
|
||||
@ -574,7 +548,6 @@ TfLiteRegistration* Register_SVDF() {
|
||||
/*builtin_code=*/0,
|
||||
/*custom_name=*/nullptr,
|
||||
/*version=*/0};
|
||||
|
||||
return &r;
|
||||
}
|
||||
|
||||
|
@ -63,6 +63,6 @@ EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776"
|
||||
EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip"
|
||||
EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56"
|
||||
|
||||
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib.zip"
|
||||
XTENSA_HIFI4_MD5 :="a517b653a75b96d0271e1b99ee2a8c14"
|
||||
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
|
||||
XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user