SVDF: Do state shifting and clearing of last state separately, so state shifting can be done in a single loop.

PiperOrigin-RevId: 298417000
Change-Id: Ia8a6756ffcc1bb7f761755461c085c55511602bc
Robert David 2020-03-02 12:30:31 -08:00 committed by TensorFlower Gardener
parent 7c12dad5fd
commit ae2fe82448
3 changed files with 60 additions and 80 deletions
tensorflow/lite/
  kernels/internal/reference/
  micro/kernels/
    svdf.cc
    xtensa_hifimini/
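For readers skimming the diff, the heart of the change: the old kernels shifted the activation state one memory_size-long row at a time (VectorShiftLeft per batch and filter), while the new code shifts the entire flattened state buffer in one pass and handles clearing of the last column as a separate step. A minimal sketch of both variants, assuming a float state buffer and illustrative function names (ShiftStatePerRow, ShiftStateFlat) rather than the actual kernel code:

#include <algorithm>

// Old approach: shift each memory_size-long row of the activation state on
// its own, padding with a zero (one inner shift per batch and per filter).
void ShiftStatePerRow(float* state, int batch_size, int num_filters,
                      int memory_size) {
  for (int b = 0; b < batch_size; ++b) {
    for (int f = 0; f < num_filters; ++f) {
      float* row = state + (b * num_filters + f) * memory_size;
      for (int i = 0; i < memory_size - 1; ++i) row[i] = row[i + 1];
      row[memory_size - 1] = 0.0f;
    }
  }
}

// New approach: shift the whole flattened buffer left by one in a single
// pass, then clear the rightmost column separately. The integer kernels skip
// the clearing loop entirely, since the feature matmul that follows assigns
// (rather than accumulates into) that column.
void ShiftStateFlat(float* state, int batch_size, int num_filters,
                    int memory_size) {
  const int total = batch_size * num_filters * memory_size;
  std::copy(state + 1, state + total, state);  // overlap is fine: dst < src
  for (int r = 0; r < batch_size * num_filters; ++r) {
    state[r * memory_size + memory_size - 1] = 0.0f;
  }
}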


@@ -85,28 +85,23 @@ inline void EvalIntegerSVDF(
const int n_unit = n_filter / n_rank;
const int n_memory = weights_time_tensor->dims->data[1];
// Shift state.
{
int16_t zero = 0;
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(state_tensor) + b * n_memory * n_filter;
for (int f = 0; f < n_filter; ++f) {
tensor_utils::VectorShiftLeft(state_ptr_batch, n_memory, zero);
state_ptr_batch += n_memory;
}
}
}
int16_t* const state_ptr = GetTensorData<int16_t>(state_tensor);
// Left shift the activation_state.
// std::copy is fine for overlapping ranges if the output is outside of the
// input range. (This is not true for copy_n.)
std::copy(state_ptr + 1, state_ptr + n_batch * n_memory * n_filter,
state_ptr);
// Feature matmul.
// Note: no need to clear the latest activation, matmul is not accumulative.
{
int16_t* state = GetTensorData<int16_t>(state_tensor);
const int8_t* input = GetTensorData<int8_t>(input_tensor);
const int8_t* weight_feature =
GetTensorData<int8_t>(weights_feature_tensor);
const int32_t output_max = std::numeric_limits<int16_t>::max();
const int32_t output_min = std::numeric_limits<int16_t>::min();
int16_t* result_in_batch = state + (n_memory - 1);
int16_t* result_in_batch = state_ptr + (n_memory - 1);
for (int b = 0; b < n_batch; b++) {
const int8_t* matrix_ptr = weight_feature;
for (int r = 0; r < n_filter; r++) {
@@ -133,8 +128,7 @@ inline void EvalIntegerSVDF(
// Time.
{
for (int b = 0; b < n_batch; ++b) {
const int16_t* state_ptr_batch =
GetTensorData<int16_t>(state_tensor) + b * n_memory * n_filter;
const int16_t* state_ptr_batch = state_ptr + b * n_memory * n_filter;
int32_t* scratch_ptr_batch =
GetTensorData<int32_t>(scratch_tensor) + b * n_filter;
tensor_utils::BatchVectorBatchVectorDotProduct(
@@ -199,15 +193,15 @@ inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node,
float* output_ptr = GetTensorData<float>(output);
// Left shift the activation_state, and clear the latest activation (the
// rightmost column).
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
for (int f = 0; f < num_filters; ++f) {
tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size,
/*shift_value=*/0.0f);
state_ptr_batch += memory_size;
}
// Left shift the activation_state.
// std::copy is fine for overlapping ranges if the output is outside of the
// input range. (This is not true for copy_n.)
std::copy(state_ptr + 1, state_ptr + batch_size * memory_size * num_filters,
state_ptr);
// Clear the latest activation (the rightmost column).
for (int i = 0; i < batch_size * num_filters; ++i) {
state_ptr[i * memory_size + memory_size - 1] = 0.0f;
}
// Compute conv1d(inputs, weights_feature).
@@ -252,15 +246,15 @@ inline void EvalHybridSVDF(
// Initialize the weights scale.
const float weights_feature_scale = weights_feature->params.scale;
// Left shift the activation_state, and clear the latest activation (the
// rightmost column).
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
for (int f = 0; f < num_filters; ++f) {
tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size,
/*shift_value=*/0.0f);
state_ptr_batch += memory_size;
}
// Left shift the activation_state.
// std::copy is fine for overlapping ranges if the output is outside of the
// input range. (This is not true for copy_n.)
std::copy(state_ptr + 1, state_ptr + batch_size * memory_size * num_filters,
state_ptr);
// Clear the latest activation (the rightmost column).
for (int i = 0; i < batch_size * num_filters; ++i) {
state_ptr[i * memory_size + memory_size - 1] = 0.0f;
}
if (!tensor_utils::IsZeroVector(input_ptr, batch_size * input_size)) {
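The comment introduced above relies on a guarantee from the C++ standard: std::copy is allowed when the destination begins before the source range (as in a left shift over overlapping memory), whereas std::copy_n makes no such promise. A small standalone illustration of the pattern, not TF code:

#include <algorithm>
#include <cstdio>

int main() {
  // Overlapping left shift: the destination (v) starts before the source
  // range (v + 1, v + 6), so std::copy is well-defined here.
  int v[6] = {1, 2, 3, 4, 5, 6};
  std::copy(v + 1, v + 6, v);  // v is now {2, 3, 4, 5, 6, 6}
  v[5] = 0;                    // clear the stale last slot, as the float path does
  for (int x : v) std::printf("%d ", x);  // prints: 2 3 4 5 6 0
  std::printf("\n");
  return 0;
}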


@@ -142,19 +142,16 @@ inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node,
float* output_ptr = GetTensorData<float>(output);
// Left shift the activation_state.
for (int b = 0; b < batch_size; ++b) {
float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
for (int f = 0; f < num_filters; ++f) {
// Shift the vector left:
float* batch_ptr = state_ptr_batch;
float* batch_start = state_ptr_batch + 1;
float* batch_end = state_ptr_batch + memory_size;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch += memory_size;
{
float* new_state_start = state_ptr;
const float* old_state_start = state_ptr + 1;
const float* old_state_end =
state_ptr + batch_size * num_filters * memory_size;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Compute conv1d(inputs, weights_feature).
@@ -206,25 +203,21 @@ void EvalIntegerSVDF(
int32_t scratch_tensor[kScratchTensorMaxSize];
int32_t scratch_output_tensor[kScratchTensorMaxSize];
// Shift states. No need to set last state, the matmul is not accumulative.
// Shift states.
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
// Left shift the activation_state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int f = 0; f < n_filter; ++f) {
// Shift the vector left:
int16_t* batch_ptr = state_ptr_batch;
int16_t* batch_start = state_ptr_batch + 1;
int16_t* batch_end = state_ptr_batch + n_memory;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch += n_memory;
}
int16_t* new_state_start = state_ptr;
const int16_t* old_state_start = state_ptr + 1;
const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Feature matmul.
{
int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
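The micro kernel above (and the Xtensa one below) expresses the same single-pass shift as a raw pointer loop instead of std::copy, which keeps the code free of <algorithm>; the behavior is the same as an overlapping forward copy. A sketch of that loop in isolation, with illustrative naming:

#include <cstdint>

// Shift an int16_t activation-state buffer of n_batch * n_filter * n_memory
// elements left by one, copying front to back so the overlap is safe.
void ShiftStateLeft(int16_t* state, int n_batch, int n_filter, int n_memory) {
  int16_t* dst = state;
  const int16_t* src = state + 1;
  const int16_t* end = state + n_batch * n_filter * n_memory;
  while (src != end) {
    *dst++ = *src++;
  }
  // The last element is deliberately left stale: the feature matmul that
  // follows writes the newest activation into that slot, so clearing it
  // would be wasted work.
}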


@@ -75,32 +75,27 @@ void EvalIntegerSVDF(
int32_t scratch_tensor[kScratchTensorMaxSize];
int32_t scratch_output_tensor[kScratchTensorMaxSize];
// Shift states. No need to set last state, the matmul is not accumulative.
// Shift states.
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
// Left shift the activation_state.
{
for (int b = 0; b < n_batch; ++b) {
int16_t* state_ptr_batch =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
for (int f = 0; f < n_filter; ++f) {
// Shift the vector left:
int16_t* batch_ptr = state_ptr_batch;
int16_t* batch_start = state_ptr_batch + 1;
int16_t* batch_end = state_ptr_batch + n_memory;
while (batch_start != batch_end) {
*batch_ptr++ = *batch_start++;
}
state_ptr_batch += n_memory;
}
int16_t* new_state_start = state_ptr;
const int16_t* old_state_start = state_ptr + 1;
const int16_t* old_state_end = state_ptr + n_batch * n_filter * n_memory;
while (old_state_start != old_state_end) {
*new_state_start++ = *old_state_start++;
}
}
// Note: no need to clear the latest activation, matmul is not accumulative.
// Feature matmul.
{
int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
const int8_t* input = GetTensorData<int8_t>(input_tensor);
const int8_t* weight_feature =
GetTensorData<int8_t>(weights_feature_tensor);
int16_t* result_in_batch = state + (n_memory - 1);
int16_t* result_in_batch = state_ptr + (n_memory - 1);
ae_q56s output_int16_max_56 = AE_CVTQ48A32S(INT16_MAX);
ae_q56s output_int16_min_56 = AE_CVTQ48A32S(INT16_MIN);
@@ -170,9 +165,7 @@ void EvalIntegerSVDF(
// Perform batched vector dot product:
const int16_t* vector1_ptr = GetTensorData<int16_t>(weights_time_tensor);
const int16_t* vector2_ptr =
GetTensorData<int16_t>(activation_state_tensor) +
b * n_memory * n_filter;
const int16_t* vector2_ptr = state_ptr + b * n_memory * n_filter;
int num_iters = n_filter / 2;
const ae_p16x2s* offset_vector1 = (const ae_p16x2s*)(vector1_ptr - 2);
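One subtlety the single-pass shift introduces: it also drags the first element of each row into the last slot of the previous row, so the rightmost column briefly holds values from the neighboring row. That is harmless because the feature matmul (integer paths) or the explicit clear loop (float and hybrid paths) overwrites that column immediately afterwards. A tiny worked example with made-up values, n_batch = 1, n_filter = 2, n_memory = 3:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  // Two filter rows of three activations each, laid out back to back.
  int16_t state[6] = {10, 11, 12,   // filter 0: m0 m1 m2
                      20, 21, 22};  // filter 1: m0 m1 m2
  std::copy(state + 1, state + 6, state);
  // state is now {11, 12, 20, 21, 22, 22}: filter 0's last slot picked up
  // filter 1's old m0, and filter 1's last slot is stale. Both are rewritten
  // right away by the matmul or the clearing loop, so nothing leaks through.
  for (int16_t x : state) std::printf("%d ", x);
  std::printf("\n");
  return 0;
}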