Cadence NNLib: Fixed various unit test failures

Fixed the person detection, max pool, and SVDF unit tests: kMaxChannels is
raised from 8 to 256 in the conv and depthwise conv kernels, the SVDF state
handling is reworked for both the float and integer paths, and the HiFi4
NNLib archive is updated to xa_nnlib_04_07.zip.

Signed-off-by: Bhanu Prakash Bandaru Venkata <bhanup@cadence.com>
Niranjan Yadla 2020-04-08 14:14:15 -07:00
parent 5d08924cda
commit cb4f8412f8
4 changed files with 86 additions and 113 deletions

@@ -55,7 +55,7 @@ constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 8;
constexpr int kMaxChannels = 256;
// Conv is quantized along dimension 0:
// https://www.tensorflow.org/lite/performance/quantization_spec

@@ -58,7 +58,7 @@ constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
// Per channel quantization is not needed for any model on xtensa.
constexpr int kMaxChannels = 8;
constexpr int kMaxChannels = 256;
// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
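
For context, kMaxChannels sizes the kernels' per-channel quantization arrays
(the OpData definitions are not shown in these hunks), so raising it from 8 to
256 in the two hunks above lets conv and depthwise conv handle layers with more
than 8 output channels, presumably what the person detection test required.
A minimal sketch of that pattern, assuming the usual TFLite Micro OpData
layout; the struct and field names below are illustrative, not the file's
actual definitions:

#include <cstdint>

constexpr int kMaxChannels = 256;

// Hypothetical sketch: per-channel quantization parameters live in fixed-size
// arrays, so kMaxChannels caps how many output channels a layer may have.
struct OpDataSketch {
  int32_t per_channel_output_multiplier[kMaxChannels];
  int32_t per_channel_output_shift[kMaxChannels];
  int32_t output_activation_min;
  int32_t output_activation_max;
};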

@@ -78,14 +78,15 @@ struct OpData {
*/
static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
TfLiteContext* context, int batch_size, int memory_size, int num_filters,
int num_units, int rank, const TfLiteTensor* weights_time,
const TfLiteTensor* bias, TfLiteFusedActivation activation,
TfLiteTensor* activation_state, TfLiteTensor* scratch,
TfLiteTensor* output) {
float* scratch_bias = GetTensorData<float>(scratch);
if (bias) {
const float* bias_data = GetTensorData<float>(bias);
TfLiteContext* context, int batch_size, int memory_size, int num_filters, int num_units, int rank,
const float* const __restrict__ weights_time_ptr,
const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
float* const __restrict__ output_ptr) {
// Compute matmul(activation_state, weights_time).
float* scratch_bias = scratch_ptr;
if (bias_ptr) {
const float* bias_data = bias_ptr;
for (int j = 0; j < num_units; ++j) {
scratch_bias[j] = *bias_data++;
}
@@ -96,15 +97,16 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
}
int err = 0;
for (int b = 0; b < batch_size; ++b) {
const float* weights_time_vec = GetTensorData<float>(weights_time);
const float* weights_time_vec = weights_time_ptr;
const float* mat_ptr =
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
state_ptr + b * memory_size * num_filters;
float* output_ptr_batch = output_ptr + b * num_units;
for (int j = 0; j < num_units; j++) {
err = xa_nn_matXvec_f32xf32_f32(
output_ptr_batch, mat_ptr, NULL, weights_time_vec, NULL, scratch_bias,
1, memory_size * rank, 0, memory_size * rank, 0);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_matXvec_f32xf32_f32 failed");
output_ptr_batch++;
mat_ptr += memory_size * rank;
weights_time_vec += memory_size * rank;
@@ -113,30 +115,12 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
// Apply activation.
for (int b = 0; b < batch_size; ++b) {
float* output_ptr_batch = GetTensorData<float>(output) + b * num_units;
float* output_ptr_batch = output_ptr + b * num_units;
for (int i = 0; i < num_units; ++i) {
*output_ptr_batch = ActivationValFloat(activation, *output_ptr_batch);
++output_ptr_batch;
}
}
// Left shift the activation_state to make room for next cycle's activation.
// (alanchiao): explore collapsing this into a single loop.
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch =
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
for (int f = 0; f < num_filters; ++f) {
// Shift the vector left:
float* batch_ptr = state_ptr_batch;
float* batch_start = state_ptr_batch + 1;
float* batch_end = state_ptr_batch + memory_size;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch[memory_size - 1] = 0.0f;
state_ptr_batch += memory_size;
}
}
return kTfLiteOk;
}
@@ -144,8 +128,7 @@ inline TfLiteStatus EvalFloatSVDF(
TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
const TfLiteTensor* bias, const TfLiteSVDFParams* params,
TfLiteTensor* scratch, TfLiteTensor* activation_state,
TfLiteTensor* output) {
TfLiteTensor* activation_state, TfLiteTensor* output) {
const int rank = params->rank;
const int batch_size = input->dims->data[0];
const int input_size = input->dims->data[1];
@@ -153,30 +136,47 @@ inline TfLiteStatus EvalFloatSVDF(
const int num_units = num_filters / rank;
const int memory_size = weights_time->dims->data[1];
// Clear the activation (activation_state's leftmost column).
// (ghodrat): Add a test which initialize activation_state with invalid
// values in leftmost column and make sure it passes.
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch =
GetTensorData<float>(activation_state) + b * memory_size * num_filters;
const float* weights_feature_ptr = GetTensorData<float>(weights_feature);
const float* weights_time_ptr = GetTensorData<float>(weights_time);
const float* bias_ptr = GetTensorData<float>(bias);
const float* input_ptr = GetTensorData<float>(input);
float* state_ptr = GetTensorData<float>(activation_state);
// TODO(b/132070898): Move this temp variable to the new scratch buffer API
// when ready.
float scratch_tensor[kScratchTensorMaxSize];
float* scratch_ptr = scratch_tensor;
float* output_ptr = GetTensorData<float>(output);
// Left shift the activation_state.
{
float* new_state_start = state_ptr;
const float* old_state_start = state_ptr + 1;
const float* old_state_end =
state_ptr + batch_size * num_filters * memory_size;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Compute conv1d(inputs, weights_feature).
// The activation_state's rightmost column is used to save current cycle
// activation. This is achieved by starting at
// GetTensorData<float>(activation_state)[memory_size - 1] and having the
// stride equal to memory_size.
// activation. This is achieved by starting at state_ptr[memory_size - 1] and
// having the stride equal to memory_size.
const float* matrix = GetTensorData<float>(weights_feature);
const float* vector = GetTensorData<float>(input);
float* out_scratch = GetTensorData<float>(scratch);
/* NNLib matXvec needs a bias buffer, so using output buffer to
avoid need for extra memory, output buffer size is batch * num_units,
batch is at least 1 so we use size num_units of it */
float* bias_scratch = GetTensorData<float>(output);
float* result = &GetTensorData<float>(activation_state)[memory_size - 1];
float* result_in_batch = result;
// Perform batched matrix vector multiply operation:
{
const float* matrix = weights_feature_ptr;
const float* vector = input_ptr;
float* result = &state_ptr[memory_size - 1];
float* result_in_batch = result;
float* out_scratch = scratch_ptr;
float* bias_scratch = output_ptr;
for (int i = 0; i < num_units; i++) bias_scratch[i] = 0.0f;
int err = 0;
@@ -196,11 +196,11 @@ inline TfLiteStatus EvalFloatSVDF(
result_in_batch += memory_size;
}
}
}
return ApplyTimeWeightsBiasAndActivation(
context, batch_size, memory_size, num_filters, num_units, rank,
weights_time, bias, params->activation, activation_state, scratch,
output);
context, batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr,
bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr);
}
void EvalIntegerSVDF(
@@ -217,24 +217,26 @@ void EvalIntegerSVDF(
const int n_unit = n_filter / n_rank;
const int n_memory = weights_time_tensor->dims->data[1];
// (b/132070898): Move these temp variables to the new scratch buffer API
// TODO(b/132070898): Move these temp variables to the new scratch buffer API
// when ready.
int32_t scratch_tensor[kScratchTensorMaxSize];
int32_t scratch_output_tensor[kScratchTensorMaxSize];
// Rewrite last bit of state.
// Shift states.
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
// Left shift the activation_state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int c = 0; c < n_filter; ++c) {
int16_t* state_ptr = state_ptr_batch + c * n_memory;
state_ptr[n_memory - 1] = 0;
}
int16_t* new_state_start = state_ptr;
const int16_t* old_state_start = state_ptr + 1;
const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Feature matmul.
{
int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
@@ -255,6 +257,12 @@ void EvalIntegerSVDF(
dot_prod =
MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
dot_prod = std::min(std::max(output_min, dot_prod), output_max);
// This assumes state is symmetrically quantized. Otherwise last bit of
// state should be initialized to its zero point and accumulate the
// dot_prod.
// Equivalent as the following:
// result_in_batch = zero point, which happens to be zero.
// result_in_batch += dot_prod_56.
*result_in_batch = dot_prod;
result_in_batch += n_memory;
}
@@ -326,26 +334,6 @@ void EvalIntegerSVDF(
GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
}
}
// Shift state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int f = 0; f < n_filter; ++f) {
// Shift the vector left:
int16_t* batch_ptr = state_ptr_batch;
int16_t* batch_start = state_ptr_batch + 1;
int16_t* batch_end = state_ptr_batch + n_memory;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch[n_memory - 1] = 0;
state_ptr_batch += n_memory;
}
}
}
}
} // namespace
@@ -385,12 +373,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const int rank = params->rank;
const int input_size = input->dims->data[1];
const int batch_size = input->dims->data[0];
// Ensure the input size is a multiple of two. This is necessary since
// optimized kernels access the memory in chunks of two, and all accesses
// must be aligned to 16 bits.
// TODO(b/153202598): Remove when padding is allowed in TFLite tensors.
TF_LITE_ENSURE_EQ(context, input_size % 2, 0);
const int num_filters = weights_feature->dims->data[0];
TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
const int num_units = num_filters / rank;
@@ -446,13 +428,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Validate Scratch Tensors:
// [0] = (shared - see float block below for usage)
// [1] = Output Temp, int8_t, {2, num_units, batch_size}
// (b/132070898): Scratch values are used as stack variables in
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().
// Validate output tensor:
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8);
} else {
TF_LITE_ENSURE_EQ(context, node->inputs->size, 6);
TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
// Validate Input Tensor dtypes:
TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
@@ -467,19 +449,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// [0] = Holds dot-product of time-forward calculations in
// ApplyTimeWeightsBiasAndActivation():
// float/int32, {2, batch_size, num_filters}
// (b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented (b/132070898) TfLiteTensor*
// scratch_tensor = GetTemporary(context, node, 0);
TfLiteTensor* scratch_tensor = &context->tensors[node->inputs->data[5]];
TF_LITE_ENSURE_EQ(context, scratch_tensor->type, kTfLiteFloat32);
TF_LITE_ENSURE_EQ(context, NumDimensions(scratch_tensor), 2);
TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[0], batch_size);
TF_LITE_ENSURE_EQ(context, scratch_tensor->dims->data[1], num_filters);
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().
// Full-float SVDF only uses the one shared scratch tensor (see above for
// usage).
// (b/132070898): Use input tensor as variable until scratch tensor
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1);
TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
@@ -505,18 +480,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
switch (weights_feature->type) {
case kTfLiteFloat32: {
// (b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented. TfLiteTensor* scratch =
// GetTemporary(context, node, /*index=*/0);
TfLiteTensor* scratch = &context->tensors[node->inputs->data[5]];
return EvalFloatSVDF(context, node, input, weights_feature, weights_time,
bias, params, scratch, activation_state, output);
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
return EvalFloatSVDF(context, node, input, weights_feature, weights_time, bias,
params, activation_state, output);
break;
}
case kTfLiteInt8: {
if (is_full_integer) {
// (b/132070898): Store these values in ::Prepare() instead of
// TODO(b/132070898): Store these values in ::Prepare() instead of
// ::Eval():
// Calculate effective scales.
OpData op_data;
@@ -574,7 +548,6 @@ TfLiteRegistration* Register_SVDF() {
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
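
Both the float and the integer SVDF paths above now perform the same single
left shift over the entire activation state before the feature matmul, instead
of shifting each filter's window in a separate loop after the output is
written. A minimal standalone sketch of that shift, assuming a float state laid
out as [batch, filter, memory]; ShiftStateLeft is an illustrative name, not a
function in the file:

// Shift every element of the activation state one slot toward the front.
// The rightmost slot of each filter window is overwritten afterwards by the
// current cycle's activation (stored with stride memory_size), so nothing
// needs to be cleared here.
void ShiftStateLeft(float* state, int batch_size, int num_filters,
                    int memory_size) {
  float* dst = state;
  const float* src = state + 1;
  const float* end = state + batch_size * num_filters * memory_size;
  while (src != end) {
    *dst++ = *src++;
  }
}

The integer path does the same over the int16_t state before its feature
matmul.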

@@ -63,6 +63,6 @@ EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776"
EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip"
EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib.zip"
XTENSA_HIFI4_MD5 :="a517b653a75b96d0271e1b99ee2a8c14"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"