SVDF: Do state shifting and clearing of last state separately, so state shifting can be done in a single loop.
PiperOrigin-RevId: 298417000
Change-Id: Ia8a6756ffcc1bb7f761755461c085c55511602bc
parent 7c12dad5fd
commit ae2fe82448
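A rough standalone sketch of the approach described in the commit message (the helper names, the sizes, and the small driver below are illustrative only, not the kernel code): the flattened [n_batch, n_filter, n_memory] state buffer is shifted left by one element in a single pass, and the newest (rightmost) slot of each memory row is cleared in a separate loop where the kernel needs it.

// Minimal sketch of the two-step approach, assuming the same
// [n_batch, n_filter, n_memory] state layout as the kernels below.
#include <algorithm>
#include <cstdio>
#include <vector>

// Shift the whole flattened state left by one element in a single loop.
// std::copy is safe here because the destination starts before the source.
void ShiftStateLeft(float* state, int n_batch, int n_filter, int n_memory) {
  std::copy(state + 1, state + n_batch * n_filter * n_memory, state);
}

// Separately clear the newest (rightmost) slot of every memory row.
void ClearLastState(float* state, int n_batch, int n_filter, int n_memory) {
  for (int i = 0; i < n_batch * n_filter; ++i) {
    state[i * n_memory + n_memory - 1] = 0.0f;
  }
}

int main() {
  const int n_batch = 1, n_filter = 2, n_memory = 3;
  std::vector<float> state = {1, 2, 3, 4, 5, 6};  // two rows of three
  ShiftStateLeft(state.data(), n_batch, n_filter, n_memory);
  ClearLastState(state.data(), n_batch, n_filter, n_memory);
  for (float v : state) std::printf("%g ", v);  // prints: 2 3 0 5 6 0
  std::printf("\n");
}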
@@ -85,28 +85,23 @@ inline void EvalIntegerSVDF(
   const int n_unit = n_filter / n_rank;
   const int n_memory = weights_time_tensor->dims->data[1];
 
-  // Shift state.
-  {
-    int16_t zero = 0;
-    for (int b = 0; b < n_batch; ++b) {
-      int16_t* state_ptr_batch =
-          GetTensorData<int16_t>(state_tensor) + b * n_memory * n_filter;
-      for (int f = 0; f < n_filter; ++f) {
-        tensor_utils::VectorShiftLeft(state_ptr_batch, n_memory, zero);
-        state_ptr_batch += n_memory;
-      }
-    }
-  }
+  int16_t* const state_ptr = GetTensorData<int16_t>(state_tensor);
+
+  // Left shift the activation_state.
+  // std::copy is fine for overlapping ranges if the output is outside of the
+  // input range. (This is not true for copy_n.)
+  std::copy(state_ptr + 1, state_ptr + n_batch * n_memory * n_filter,
+            state_ptr);
 
   // Feature matmul.
+  // Note: no need to clear the latest activation, matmul is not accumulative.
   {
-    int16_t* state = GetTensorData<int16_t>(state_tensor);
     const int8_t* input = GetTensorData<int8_t>(input_tensor);
     const int8_t* weight_feature =
         GetTensorData<int8_t>(weights_feature_tensor);
     const int32_t output_max = std::numeric_limits<int16_t>::max();
     const int32_t output_min = std::numeric_limits<int16_t>::min();
-    int16_t* result_in_batch = state + (n_memory - 1);
+    int16_t* result_in_batch = state_ptr + (n_memory - 1);
     for (int b = 0; b < n_batch; b++) {
       const int8_t* matrix_ptr = weight_feature;
       for (int r = 0; r < n_filter; r++) {
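The std::copy comment above relies on the guarantee that std::copy may be used on overlapping ranges as long as the start of the destination is not inside the source range; for a left shift the destination begins one element before the source, so every element is read before it is overwritten. A minimal illustration (buffer contents invented for the example):

// Left-shifting an int16_t buffer with std::copy; the destination (buf)
// starts before the source (buf + 1), which std::copy permits even though
// the ranges overlap. (std::copy_n does not give this guarantee.)
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  int16_t buf[5] = {10, 20, 30, 40, 50};
  std::copy(buf + 1, buf + 5, buf);  // buf becomes {20, 30, 40, 50, 50}
  for (int16_t v : buf) std::printf("%d ", v);
  std::printf("\n");
}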
@@ -133,8 +128,7 @@ inline void EvalIntegerSVDF(
   // Time.
   {
     for (int b = 0; b < n_batch; ++b) {
-      const int16_t* state_ptr_batch =
-          GetTensorData<int16_t>(state_tensor) + b * n_memory * n_filter;
+      const int16_t* state_ptr_batch = state_ptr + b * n_memory * n_filter;
       int32_t* scratch_ptr_batch =
           GetTensorData<int32_t>(scratch_tensor) + b * n_filter;
       tensor_utils::BatchVectorBatchVectorDotProduct(
@@ -199,15 +193,15 @@ inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node,
 
   float* output_ptr = GetTensorData<float>(output);
 
-  // Left shift the activation_state, and clear the latest activation (the
-  // rightmost column).
-  for (int b = 0; b < batch_size; ++b) {
-    float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
-    for (int f = 0; f < num_filters; ++f) {
-      tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size,
-                                    /*shift_value=*/0.0f);
-      state_ptr_batch += memory_size;
-    }
+  // Left shift the activation_state.
+  // std::copy is fine for overlapping ranges if the output is outside of the
+  // input range. (This is not true for copy_n.)
+  std::copy(state_ptr + 1, state_ptr + batch_size * memory_size * num_filters,
+            state_ptr);
+
+  // Clear the latest activation (the rightmost column).
+  for (int i = 0; i < batch_size * num_filters; ++i) {
+    state_ptr[i * memory_size + memory_size - 1] = 0.0f;
   }
 
   // Compute conv1d(inputs, weights_feature).
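The separate clearing pass touches exactly one element per (batch, filter) row: with the state laid out as [batch_size, num_filters, memory_size], the index i * memory_size + memory_size - 1 is the last slot of row i. A small check of that indexing, with arbitrary sizes not taken from the kernel:

// Verify that the clearing loop hits exactly the rightmost column of a
// [batch_size, num_filters, memory_size] state buffer. Sizes are arbitrary.
#include <cassert>
#include <vector>

int main() {
  const int batch_size = 2, num_filters = 3, memory_size = 4;
  std::vector<float> state(batch_size * num_filters * memory_size, 1.0f);

  // Same loop shape as the kernel: one iteration per (batch, filter) row.
  for (int i = 0; i < batch_size * num_filters; ++i) {
    state[i * memory_size + memory_size - 1] = 0.0f;
  }

  // Every row should now end in zero and keep its other entries.
  for (int b = 0; b < batch_size; ++b) {
    for (int f = 0; f < num_filters; ++f) {
      const float* row = state.data() + (b * num_filters + f) * memory_size;
      for (int m = 0; m < memory_size; ++m) {
        assert(row[m] == (m == memory_size - 1 ? 0.0f : 1.0f));
      }
    }
  }
  return 0;
}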
@@ -252,15 +246,15 @@ inline void EvalHybridSVDF(
   // Initialize the weights scale.
   const float weights_feature_scale = weights_feature->params.scale;
 
-  // Left shift the activation_state, and clear the latest activation (the
-  // rightmost column).
-  for (int b = 0; b < batch_size; ++b) {
-    float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
-    for (int f = 0; f < num_filters; ++f) {
-      tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size,
-                                    /*shift_value=*/0.0f);
-      state_ptr_batch += memory_size;
-    }
+  // Left shift the activation_state.
+  // std::copy is fine for overlapping ranges if the output is outside of the
+  // input range. (This is not true for copy_n.)
+  std::copy(state_ptr + 1, state_ptr + batch_size * memory_size * num_filters,
+            state_ptr);
+
+  // Clear the latest activation (the rightmost column).
+  for (int i = 0; i < batch_size * num_filters; ++i) {
+    state_ptr[i * memory_size + memory_size - 1] = 0.0f;
   }
 
   if (!tensor_utils::IsZeroVector(input_ptr, batch_size * input_size)) {
@@ -142,19 +142,16 @@ inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node,
   float* output_ptr = GetTensorData<float>(output);
 
   // Left shift the activation_state.
-  for (int b = 0; b < batch_size; ++b) {
-    float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
-    for (int f = 0; f < num_filters; ++f) {
-      // Shift the vector left:
-      float* batch_ptr = state_ptr_batch;
-      float* batch_start = state_ptr_batch + 1;
-      float* batch_end = state_ptr_batch + memory_size;
-      while (batch_start != batch_end) {
-        *batch_ptr++ = *batch_start++;
-      }
-      state_ptr_batch += memory_size;
+  {
+    float* new_state_start = state_ptr;
+    const float* old_state_start = state_ptr + 1;
+    const float* old_state_end =
+        state_ptr + batch_size * num_filters * memory_size;
+    while (old_state_start != old_state_end) {
+      *new_state_start++ = *old_state_start++;
     }
   }
 
   // Note: no need to clear the latest activation, matmul is not accumulative.
 
   // Compute conv1d(inputs, weights_feature).
@@ -206,25 +203,21 @@ void EvalIntegerSVDF(
   int32_t scratch_tensor[kScratchTensorMaxSize];
   int32_t scratch_output_tensor[kScratchTensorMaxSize];
 
-  // Shift states. No need to set last state, the matmul is not accumulative.
+  // Shift states.
+  int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
+
+  // Left shift the activation_state.
   {
-    for (int b = 0; b < n_batch; ++b) {
-      int16_t* state_ptr_batch =
-          GetTensorData<int16_t>(activation_state_tensor) +
-          b * n_memory * n_filter;
-      for (int f = 0; f < n_filter; ++f) {
-        // Shift the vector left:
-        int16_t* batch_ptr = state_ptr_batch;
-        int16_t* batch_start = state_ptr_batch + 1;
-        int16_t* batch_end = state_ptr_batch + n_memory;
-        while (batch_start != batch_end) {
-          *batch_ptr++ = *batch_start++;
-        }
-        state_ptr_batch += n_memory;
-      }
+    int16_t* new_state_start = state_ptr;
+    const int16_t* old_state_start = state_ptr + 1;
+    const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
+    while (old_state_start != old_state_end) {
+      *new_state_start++ = *old_state_start++;
     }
   }
 
+  // Note: no need to clear the latest activation, matmul is not accumulative.
+
   // Feature matmul.
   {
     int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
@@ -75,32 +75,27 @@ void EvalIntegerSVDF(
   int32_t scratch_tensor[kScratchTensorMaxSize];
   int32_t scratch_output_tensor[kScratchTensorMaxSize];
 
-  // Shift states. No need to set last state, the matmul is not accumulative.
+  // Shift states.
+  int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
+
+  // Left shift the activation_state.
   {
-    for (int b = 0; b < n_batch; ++b) {
-      int16_t* state_ptr_batch =
-          GetTensorData<int16_t>(activation_state_tensor) +
-          b * n_memory * n_filter;
-      for (int f = 0; f < n_filter; ++f) {
-        // Shift the vector left:
-        int16_t* batch_ptr = state_ptr_batch;
-        int16_t* batch_start = state_ptr_batch + 1;
-        int16_t* batch_end = state_ptr_batch + n_memory;
-        while (batch_start != batch_end) {
-          *batch_ptr++ = *batch_start++;
-        }
-        state_ptr_batch += n_memory;
-      }
+    int16_t* new_state_start = state_ptr;
+    const int16_t* old_state_start = state_ptr + 1;
+    const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
+    while (old_state_start != old_state_end) {
+      *new_state_start++ = *old_state_start++;
    }
   }
 
+  // Note: no need to clear the latest activation, matmul is not accumulative.
+
   // Feature matmul.
   {
-    int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
     const int8_t* input = GetTensorData<int8_t>(input_tensor);
     const int8_t* weight_feature =
         GetTensorData<int8_t>(weights_feature_tensor);
-    int16_t* result_in_batch = state + (n_memory - 1);
+    int16_t* result_in_batch = state_ptr + (n_memory - 1);
 
     ae_q56s output_int16_max_56 = AE_CVTQ48A32S(INT16_MAX);
     ae_q56s output_int16_min_56 = AE_CVTQ48A32S(INT16_MIN);
@@ -170,9 +165,7 @@ void EvalIntegerSVDF(
 
       // Perform batched vector dot product:
       const int16_t* vector1_ptr = GetTensorData<int16_t>(weights_time_tensor);
-      const int16_t* vector2_ptr =
-          GetTensorData<int16_t>(activation_state_tensor) +
-          b * n_memory * n_filter;
+      const int16_t* vector2_ptr = state_ptr + b * n_memory * n_filter;
 
       int num_iters = n_filter / 2;
       const ae_p16x2s* offset_vector1 = (const ae_p16x2s*)(vector1_ptr - 2);