/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/kernel_utils.h"
|
|
|
|
#include <algorithm>
|
|
|
|
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
|
|
|
|

namespace tflite {
namespace kernel_utils {
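
// Float-only convenience overload: performs one RNN step with no auxiliary
// input by forwarding to the full float implementation below.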
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int num_units, int batch_size,
                  int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr, recurrent_weights_ptr,
               bias_ptr, input_size, /*aux_input_size=*/0, num_units,
               batch_size, output_batch_leading_dim, activation,
               hidden_state_ptr_batch, output_ptr_batch);
}
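
// Full float path: for each batch row computes
//   hidden_state = output = activation(input_weights * input +
//       aux_input_weights * aux_input +
//       recurrent_weights * hidden_state + bias),
// with consecutive output rows spaced output_batch_leading_dim floats apart.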
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* aux_input_ptr_batch,
                  const float* aux_input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int aux_input_size, int num_units,
                  int batch_size, int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Since the output batch rows may not be contiguous
  // (output_batch_leading_dim != n_output), we unroll the batched operations
  // where this is the case.
  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Output += input * input_weights
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
        output_ptr_batch);

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size,
          aux_input_ptr_batch, batch_size, output_ptr_batch);
    }

    // Output += recurrent_weights * hidden_state
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
        batch_size, output_ptr_batch);

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(output_ptr_batch,
                                          num_units * batch_size, activation,
                                          output_ptr_batch);
    std::copy_n(output_ptr_batch, num_units * batch_size,
                hidden_state_ptr_batch);
  } else {
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      std::copy_n(bias_ptr, num_units,
                  output_ptr_batch + k * output_batch_leading_dim);
    }

    // Output += input * input_weights
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size,
          input_ptr_batch + k * input_size, /*n_batch=*/1,
          output_ptr_batch + k * output_batch_leading_dim);
    }

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_input_ptr_batch + k * aux_input_size,
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim);
      }
    }

    // Output += recurrent_weights * hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          hidden_state_ptr_batch + k * num_units,
          /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim);
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      std::copy_n(output_ptr_batch + k * output_batch_leading_dim, num_units,
                  hidden_state_ptr_batch + k * num_units);
    }
  }
}
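
// Hybrid (int8 weights, float activations) convenience overload with no
// auxiliary input; forwards to the full hybrid implementation below.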
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const int8_t* recurrent_weights_ptr,
    float recurrent_weights_scale, const float* bias_ptr, int input_size,
    int num_units, int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch,
    bool asymmetric_quantize_inputs, int32_t* zero_points,
    int32_t* accum_scratch, int32_t* row_sums, bool* compute_row_sums) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr, input_weights_scale,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr,
               /*aux_input_weights_scale=*/0.0f, recurrent_weights_ptr,
               recurrent_weights_scale, bias_ptr, input_size,
               /*aux_input_size=*/0, num_units, batch_size,
               output_batch_leading_dim, activation, quantized_input_ptr_batch,
               /*aux_quantized_input_ptr_batch=*/nullptr,
               quantized_hidden_state_ptr_batch, scaling_factors,
               hidden_state_ptr_batch, output_ptr_batch,
               asymmetric_quantize_inputs, zero_points, accum_scratch, row_sums,
               compute_row_sums);
}
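
// Precomputes the per-row sums of each weight matrix. Under asymmetric
// quantization the input zero point contributes zero_point * row_sum to every
// dot product, so caching these sums lets the matmuls correct for it cheaply.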
void ComputeMatrixSums(int32_t* input_row_sums, int32_t* aux_input_row_sums,
                       int32_t* recurrent_row_sums, int32_t* row_sums,
                       const float* aux_input_ptr_batch, int num_units,
                       int input_size, int aux_input_size,
                       const int8_t* input_weights_ptr,
                       const int8_t* aux_input_weights_ptr,
                       const int8_t* recurrent_weights_ptr) {
  memset(input_row_sums, 0, sizeof(int32_t) * num_units);
  tensor_utils::ReductionSumVector(input_weights_ptr, input_row_sums, num_units,
                                   input_size);
  if (aux_input_ptr_batch) {
    memset(aux_input_row_sums, 0, sizeof(int32_t) * num_units);
    tensor_utils::ReductionSumVector(aux_input_weights_ptr, aux_input_row_sums,
                                     num_units, aux_input_size);
  }
  memset(recurrent_row_sums, 0, sizeof(int32_t) * num_units);
  tensor_utils::ReductionSumVector(recurrent_weights_ptr, recurrent_row_sums,
                                   num_units, num_units);
}
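
// Full hybrid path: the float inputs and hidden state are quantized to int8
// on the fly, the matmuls run against the int8 weights, and the accumulators
// are scaled back to float with the per-batch scaling factors.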
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const float* aux_input_ptr_batch,
    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
    int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* aux_quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch,
    bool asymmetric_quantize_inputs, int32_t* zero_points,
    int32_t* accum_scratch, int32_t* row_sums, bool* compute_row_sums) {
  // Since the output batch rows may not be contiguous
  // (output_batch_leading_dim != n_output), we unroll the batched operations
  // where this is the case.

  int32_t* input_row_sums = nullptr;
  int32_t* aux_input_row_sums = nullptr;
  int32_t* recurrent_row_sums = nullptr;
  if (asymmetric_quantize_inputs) {
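    // row_sums is a single scratch buffer laid out as [input_row_sums |
    // aux_input_row_sums (only when there is an aux input) |
    // recurrent_row_sums], each segment num_units long.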
    input_row_sums = row_sums;
    aux_input_row_sums = row_sums;
    if (aux_input_ptr_batch) {
      aux_input_row_sums += num_units;
    }
    recurrent_row_sums = aux_input_row_sums + num_units;
    if (*compute_row_sums) {
      ComputeMatrixSums(input_row_sums, aux_input_row_sums, recurrent_row_sums,
                        row_sums, aux_input_ptr_batch, num_units, input_size,
                        aux_input_size, input_weights_ptr,
                        aux_input_weights_ptr, recurrent_weights_ptr);
      *compute_row_sums = false;
    }
  }

  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(input_ptr_batch,
                                    batch_size * input_size)) {
      // Quantize input from float to int8 + quantization params (scaling
      // factor).
      tensor_utils::BatchQuantizeFloats(
          input_ptr_batch, batch_size, input_size, quantized_input_ptr_batch,
          scaling_factors, zero_points, asymmetric_quantize_inputs);
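      // Fold each row's input scale into the weight scale so a single
      // multiply dequantizes the int8 accumulators back to float.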
      for (int b = 0; b < batch_size; ++b) {
        scaling_factors[b] *= input_weights_scale;
      }
      // Output += input * input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size, quantized_input_ptr_batch,
          scaling_factors, batch_size, output_ptr_batch,
          /*per_channel_scale=*/nullptr, zero_points, accum_scratch,
          input_row_sums, compute_row_sums, /*context=*/nullptr);
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      tensor_utils::BatchQuantizeFloats(
          aux_input_ptr_batch, batch_size, aux_input_size,
          aux_quantized_input_ptr_batch, scaling_factors, zero_points,
          asymmetric_quantize_inputs);
      for (int b = 0; b < batch_size; ++b) {
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size,
          aux_quantized_input_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*per_channel_scale=*/nullptr, zero_points,
          accum_scratch, aux_input_row_sums, compute_row_sums,
          /*context=*/nullptr);
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state
      tensor_utils::BatchQuantizeFloats(
          hidden_state_ptr_batch, batch_size, num_units,
          quantized_hidden_state_ptr_batch, scaling_factors, zero_points,
          asymmetric_quantize_inputs);
      for (int b = 0; b < batch_size; ++b) {
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          quantized_hidden_state_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*per_channel_scale=*/nullptr, zero_points,
          accum_scratch, recurrent_row_sums, compute_row_sums,
          /*context=*/nullptr);
    }

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(output_ptr_batch,
                                          num_units * batch_size, activation,
                                          output_ptr_batch);
    std::copy_n(output_ptr_batch, num_units * batch_size,
                hidden_state_ptr_batch);
  } else {
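    // Non-contiguous output rows: run the same quantize / matmul / activation
    // sequence one batch row at a time.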
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      std::copy_n(bias_ptr, num_units,
                  output_ptr_batch + k * output_batch_leading_dim);
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(input_ptr_batch,
                                    batch_size * input_size)) {
      // Quantize input from float to int8 + quantization params (scaling
      // factor).
      tensor_utils::BatchQuantizeFloats(
          input_ptr_batch, batch_size, input_size, quantized_input_ptr_batch,
          scaling_factors, zero_points, asymmetric_quantize_inputs);
      for (int b = 0; b < batch_size; ++b) {
        scaling_factors[b] *= input_weights_scale;
      }

      // Output += input * input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            input_weights_ptr, num_units, input_size,
            quantized_input_ptr_batch + k * input_size, &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*per_channel_scale=*/nullptr, zero_points + k, accum_scratch,
            input_row_sums, compute_row_sums, /*context=*/nullptr);
      }
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      tensor_utils::BatchQuantizeFloats(
          aux_input_ptr_batch, batch_size, aux_input_size,
          aux_quantized_input_ptr_batch, scaling_factors, zero_points,
          asymmetric_quantize_inputs);
      for (int b = 0; b < batch_size; ++b) {
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_quantized_input_ptr_batch + k * aux_input_size,
            &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*per_channel_scale=*/nullptr, zero_points + k, accum_scratch,
            aux_input_row_sums, compute_row_sums, /*context=*/nullptr);
      }
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state
      tensor_utils::BatchQuantizeFloats(
          hidden_state_ptr_batch, batch_size, num_units,
          quantized_hidden_state_ptr_batch, scaling_factors, zero_points,
          asymmetric_quantize_inputs);
      for (int b = 0; b < batch_size; ++b) {
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            recurrent_weights_ptr, num_units, num_units,
            quantized_hidden_state_ptr_batch + k * num_units,
            &scaling_factors[k], /*n_batch=*/1,
            output_ptr_batch + k * output_batch_leading_dim,
            /*per_channel_scale=*/nullptr, zero_points + k, accum_scratch,
            recurrent_row_sums, compute_row_sums, /*context=*/nullptr);
      }
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      std::copy_n(output_ptr_batch + k * output_batch_leading_dim, num_units,
                  hidden_state_ptr_batch + k * num_units);
    }
  }
}

}  // namespace kernel_utils
}  // namespace tflite