Remove TfLiteTensors from the FloatSVDF and IntegerSVDF methods in the shared SVDF code. Call into the shared SVDF code instead of the TFLM-specific SVDF code for the float reference kernel.
PiperOrigin-RevId: 336555588
Change-Id: If448e01cf0ea944229658997c5ea8ff8ec5eff2d
parent a7cabb95d6
commit f93d0e2c72
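For context, a minimal caller sketch (not part of this commit) of the tensor-free EvalFloatSVDF signature introduced in the diff below, invoked with RuntimeShapes and raw buffers instead of TfLiteTensors. The toy dimensions, buffer names, the RunFloatSvdfSketch wrapper, and the include paths are illustrative assumptions; only the EvalFloatSVDF signature and argument order are taken from the diff.

// Caller sketch; assumes the shared reference kernel lives in the
// reference_ops namespace under tflite, as the hunks below indicate.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/reference/svdf.h"  // assumed path
#include "tensorflow/lite/kernels/internal/types.h"

void RunFloatSvdfSketch() {
  // Toy dimensions, chosen only for illustration.
  constexpr int kBatch = 1, kInput = 3, kRank = 1, kNumFilters = 4, kMemory = 5;
  constexpr int kNumUnits = kNumFilters / kRank;

  TfLiteSVDFParams params{};
  params.rank = kRank;
  params.activation = kTfLiteActNone;

  // Raw buffers replace the TfLiteTensor objects the old signature required.
  float input[kBatch * kInput] = {};
  float weights_feature[kNumFilters * kInput] = {};
  float weights_time[kNumFilters * kMemory] = {};
  float bias[kNumUnits] = {};
  float state[kBatch * kMemory * kNumFilters] = {};
  float scratch[kBatch * kNumFilters] = {};
  float output[kBatch * kNumUnits] = {};

  // Argument order matches the new EvalFloatSVDF declaration in the diff:
  // params, then shape/data pairs, then scratch, state, output shape, output.
  tflite::reference_ops::EvalFloatSVDF(
      &params, tflite::RuntimeShape({kBatch, kInput}), input,
      tflite::RuntimeShape({kNumFilters, kInput}), weights_feature,
      tflite::RuntimeShape({kNumFilters, kMemory}), weights_time,
      tflite::RuntimeShape({kNumUnits}), bias, scratch, state,
      tflite::RuntimeShape({kBatch, kNumUnits}), output);
}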
@@ -36,7 +36,7 @@ namespace reference_ops {
 static inline void ApplyTimeWeightsBiasAndActivation(
     int batch_size, int memory_size, int num_filters, int num_units, int rank,
-    const float* const __restrict__ weights_time_ptr,
+    const float* const __restrict__ weights_time_data,
     const float* const __restrict__ bias_ptr, TfLiteFusedActivation activation,
     float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
     float* const __restrict__ output_ptr) {
@@ -45,7 +45,7 @@ static inline void ApplyTimeWeightsBiasAndActivation(
     float* state_ptr_batch = state_ptr + b * memory_size * num_filters;
     float* scratch_ptr_batch = scratch_ptr + b * num_filters;
     tensor_utils::BatchVectorBatchVectorDotProduct(
-        weights_time_ptr, state_ptr_batch, memory_size, num_filters,
+        weights_time_data, state_ptr_batch, memory_size, num_filters,
         scratch_ptr_batch);
   }
 
@@ -74,44 +74,41 @@ static inline void ApplyTimeWeightsBiasAndActivation(
 }
 
 inline void EvalIntegerSVDF(
-    TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input_tensor,
-    const TfLiteTensor* weights_feature_tensor,
-    const TfLiteTensor* weights_time_tensor, const TfLiteTensor* bias_tensor,
-    const TfLiteSVDFParams* params, TfLiteTensor* state_tensor,
-    TfLiteTensor* output_tensor, TfLiteTensor* scratch_tensor,
-    TfLiteTensor* output_temp_tensor, int32_t scale_1_a, int scale_1_b,
-    int32_t scale_2_a, int scale_2_b, int32_t input_zp, int32_t output_zp) {
+    const TfLiteSVDFParams* params, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& weights_feature_shape,
+    const int8_t* weights_feature_data, const RuntimeShape& weights_time_shape,
+    const int16_t* weights_time_data, const RuntimeShape& bias_shape,
+    const int32_t* bias_data, int16_t* state_data,
+    const RuntimeShape& output_shape, int8_t* output_data,
+    int32_t* scratch_data, int32_t* output_temp_data, int32_t scale_1_a,
+    int scale_1_b, int32_t scale_2_a, int scale_2_b, int32_t input_zp,
+    int32_t output_zp) {
   const int n_rank = params->rank;
-  const int n_batch = input_tensor->dims->data[0];
-  const int n_input = input_tensor->dims->data[1];
-  const int n_filter = weights_feature_tensor->dims->data[0];
+  const int n_batch = input_shape.Dims(0);
+  const int n_input = input_shape.Dims(1);
+  const int n_filter = weights_feature_shape.Dims(0);
   const int n_unit = n_filter / n_rank;
-  const int n_memory = weights_time_tensor->dims->data[1];
-
-  int16_t* const state_ptr = GetTensorData<int16_t>(state_tensor);
+  const int n_memory = weights_time_shape.Dims(1);
 
   // Left shift the activation_state.
   // std::copy is fine for overlapping ranges if the output is outside of the
   // input range. (This is not true for copy_n.)
-  std::copy(state_ptr + 1, state_ptr + n_batch * n_memory * n_filter,
-            state_ptr);
+  std::copy(state_data + 1, state_data + n_batch * n_memory * n_filter,
+            state_data);
 
   // Feature matmul.
   // Note: no need to clear the latest activation, matmul is not accumulative.
   {
-    const int8_t* input = GetTensorData<int8_t>(input_tensor);
-    const int8_t* weight_feature =
-        GetTensorData<int8_t>(weights_feature_tensor);
     const int32_t output_max = std::numeric_limits<int16_t>::max();
     const int32_t output_min = std::numeric_limits<int16_t>::min();
-    int16_t* result_in_batch = state_ptr + (n_memory - 1);
+    int16_t* result_in_batch = state_data + (n_memory - 1);
     for (int b = 0; b < n_batch; b++) {
-      const int8_t* matrix_ptr = weight_feature;
+      const int8_t* matrix_data = weights_feature_data;
       for (int r = 0; r < n_filter; r++) {
         int32_t dot_prod = 0;
-        const int8_t* vector_in_batch = input + b * n_input;
+        const int8_t* vector_in_batch = input_data + b * n_input;
         for (int c = 0; c < n_input; c++) {
-          dot_prod += *matrix_ptr++ * (*vector_in_batch++ - input_zp);
+          dot_prod += *matrix_data++ * (*vector_in_batch++ - input_zp);
         }
         dot_prod =
             MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
@@ -131,171 +128,134 @@ inline void EvalIntegerSVDF(
   // Time.
   {
     for (int b = 0; b < n_batch; ++b) {
-      const int16_t* state_ptr_batch = state_ptr + b * n_memory * n_filter;
-      int32_t* scratch_ptr_batch =
-          GetTensorData<int32_t>(scratch_tensor) + b * n_filter;
+      const int16_t* state_data_batch = state_data + b * n_memory * n_filter;
+      int32_t* scratch_data_batch = scratch_data + b * n_filter;
       tensor_utils::BatchVectorBatchVectorDotProduct(
-          GetTensorData<int16_t>(weights_time_tensor), state_ptr_batch,
-          n_memory, n_filter, scratch_ptr_batch);
+          weights_time_data, state_data_batch, n_memory, n_filter,
+          scratch_data_batch);
     }
   }
 
   // Reduce, add bias, rescale, activation.
   {
-    int32_t* output_temp = GetTensorData<int32_t>(output_temp_tensor);
     // Add bias.
-    if (bias_tensor) {
-      tensor_utils::VectorBatchVectorAssign(GetTensorData<int32_t>(bias_tensor),
-                                            n_unit, n_batch, output_temp);
+    if (bias_data) {
+      tensor_utils::VectorBatchVectorAssign(bias_data, n_unit, n_batch,
+                                            output_temp_data);
     } else {
-      std::fill_n(output_temp, n_batch * n_unit, 0);
+      std::fill_n(output_temp_data, n_batch * n_unit, 0);
     }
     // Reduce.
     for (int b = 0; b < n_batch; ++b) {
-      int32_t* output_temp_ptr = output_temp + b * n_unit;
-      int32_t* scratch_ptr_batch =
-          GetTensorData<int32_t>(scratch_tensor) + b * n_filter;
-      tensor_utils::ReductionSumVector(scratch_ptr_batch, output_temp_ptr,
+      int32_t* output_temp_ptr = output_temp_data + b * n_unit;
+      int32_t* scratch_data_batch = scratch_data + b * n_filter;
+      tensor_utils::ReductionSumVector(scratch_data_batch, output_temp_ptr,
                                        n_unit, n_rank);
     }
     // Rescale.
     const int32_t output_max = std::numeric_limits<int8_t>::max();
     const int32_t output_min = std::numeric_limits<int8_t>::min();
     for (int i = 0; i < n_batch * n_unit; ++i) {
-      int32_t x1 = output_temp[i];
+      int32_t x1 = output_temp_data[i];
       int32_t x2 = MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b);
       int32_t x3 = x2 + output_zp;
      int32_t x4 = std::min(std::max(output_min, x3), output_max);
-      GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
+      output_data[i] = static_cast<int8_t>(x4);
     }
   }
 }
 
-inline void EvalFloatSVDF(TfLiteContext* context, TfLiteNode* node,
-                          const TfLiteTensor* input,
-                          const TfLiteTensor* weights_feature,
-                          const TfLiteTensor* weights_time,
-                          const TfLiteTensor* bias,
-                          const TfLiteSVDFParams* params, TfLiteTensor* scratch,
-                          TfLiteTensor* state, TfLiteTensor* output) {
+inline void EvalFloatSVDF(
+    const TfLiteSVDFParams* params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& weights_feature_shape,
+    const float* weights_feature_data, const RuntimeShape& weights_time_shape,
+    const float* weights_time_data, const RuntimeShape& bias_shape,
+    const float* bias_data, float* scratch_data, float* state_data,
+    const RuntimeShape& output_shape, float* output_data) {
   const int rank = params->rank;
-  const int batch_size = input->dims->data[0];
-  const int input_size = input->dims->data[1];
-  const int num_filters = weights_feature->dims->data[0];
+  const int batch_size = input_shape.Dims(0);
+  const int input_size = input_shape.Dims(1);
+  const int num_filters = weights_feature_shape.Dims(0);
   const int num_units = num_filters / rank;
-  const int memory_size = weights_time->dims->data[1];
-
-  // Raw pointers to tensor data.
-  const float* input_ptr = GetTensorData<float>(input);
-  const float* weights_feature_ptr = GetTensorData<float>(weights_feature);
-  const float* weights_time_ptr = GetTensorData<float>(weights_time);
-  const float* bias_ptr = GetTensorData<float>(bias);
-
-  float* state_ptr = GetTensorData<float>(state);
-  float* scratch_ptr = GetTensorData<float>(scratch);
-
-  float* output_ptr = GetTensorData<float>(output);
+  const int memory_size = weights_time_shape.Dims(1);
 
   // Left shift the activation_state.
   // std::copy is fine for overlapping ranges if the output is outside of the
   // input range. (This is not true for copy_n.)
-  std::copy(state_ptr + 1, state_ptr + batch_size * memory_size * num_filters,
-            state_ptr);
+  std::copy(state_data + 1, state_data + batch_size * memory_size * num_filters,
+            state_data);
 
   // Clear scratch (the matmul is accumulative).
-  std::fill_n(scratch_ptr, batch_size * num_filters, 0.0f);
+  std::fill_n(scratch_data, batch_size * num_filters, 0.0f);
 
   // Compute conv1d(inputs, weights_feature).
   tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      weights_feature_ptr, num_filters, input_size, input_ptr, batch_size,
-      scratch_ptr);
+      weights_feature_data, num_filters, input_size, input_data, batch_size,
+      scratch_data);
 
   // Copy the latest activation from scratch into activation_state:
   // The last, i.e. (memory_size-1)th entry for each batch, and filter.
   for (int i = 0; i < batch_size * num_filters; ++i) {
-    state_ptr[i * memory_size + memory_size - 1] = scratch_ptr[i];
+    state_data[i * memory_size + memory_size - 1] = scratch_data[i];
   }
 
   ApplyTimeWeightsBiasAndActivation(
-      batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr,
-      bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr);
+      batch_size, memory_size, num_filters, num_units, rank, weights_time_data,
+      bias_data, params->activation, state_data, scratch_data, output_data);
 }
 
 inline void EvalHybridSVDF(
-    TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
-    const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
-    const TfLiteTensor* bias, const TfLiteSVDFParams* params,
-    TfLiteTensor* scratch, TfLiteTensor* scaling_factors,
-    TfLiteTensor* input_quantized, TfLiteTensor* state, TfLiteTensor* output,
-    TfLiteTensor* zero_points, TfLiteTensor* row_sums, bool* compute_row_sums) {
+    const TfLiteSVDFParams* params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& weights_feature_shape,
+    const int8_t* weights_feature_data, const float weights_feature_scale,
+    const RuntimeShape& weights_time_shape, const float* weights_time_data,
+    const RuntimeShape& bias_shape, const float* bias_data, float* scratch,
+    float* scaling_factors, int8_t* quantized_input, float* state,
+    const RuntimeShape& output_shape, float* output_data, int32_t* zero_points,
+    int32_t* row_sums, bool* compute_row_sums) {
   const int rank = params->rank;
-  const int batch_size = input->dims->data[0];
-  const int input_size = input->dims->data[1];
-  const int num_filters = weights_feature->dims->data[0];
+  const int batch_size = input_shape.Dims(0);
+  const int input_size = input_shape.Dims(1);
+  const int num_filters = weights_feature_shape.Dims(0);
   const int num_units = num_filters / rank;
-  const int memory_size = weights_time->dims->data[1];
-
-  // Raw pointers to tensor data.
-  const float* input_ptr = GetTensorData<float>(input);
-  const int8_t* weights_feature_ptr = GetTensorData<int8_t>(weights_feature);
-  const float* weights_time_ptr = GetTensorData<float>(weights_time);
-  const float* bias_ptr = GetTensorData<float>(bias);
-
-  int8_t* quantized_input_ptr = GetTensorData<int8_t>(input_quantized);
-  float* scaling_factors_ptr = GetTensorData<float>(scaling_factors);
-  float* state_ptr = GetTensorData<float>(state);
-  float* scratch_ptr = GetTensorData<float>(scratch);
-
-  float* output_ptr = GetTensorData<float>(output);
-
-  int32_t* zero_points_ptr = nullptr;
-  int32_t* row_sums_ptr = nullptr;
-  if (params->asymmetric_quantize_inputs && row_sums != nullptr) {
-    zero_points_ptr = GetTensorData<int32_t>(zero_points);
-    row_sums_ptr = GetTensorData<int32_t>(row_sums);
-  }
-
-  // Initialize the weights scale.
-  const float weights_feature_scale = weights_feature->params.scale;
+  const int memory_size = weights_time_shape.Dims(1);
 
   // Left shift the activation_state.
   // std::copy is fine for overlapping ranges if the output is outside of the
   // input range. (This is not true for copy_n.)
-  std::copy(state_ptr + 1, state_ptr + batch_size * memory_size * num_filters,
-            state_ptr);
+  std::copy(state + 1, state + batch_size * memory_size * num_filters, state);
 
   // Clear scratch (the matmul is accumulative).
-  std::fill_n(scratch_ptr, batch_size * num_filters, 0.0f);
+  std::fill_n(scratch, batch_size * num_filters, 0.0f);
 
-  if (!tensor_utils::IsZeroVector(input_ptr, batch_size * input_size)) {
+  if (!tensor_utils::IsZeroVector(input_data, batch_size * input_size)) {
     // Quantize input from float to int8_t.
-    tensor_utils::BatchQuantizeFloats(input_ptr, batch_size, input_size,
-                                      quantized_input_ptr, scaling_factors_ptr,
-                                      zero_points_ptr,
-                                      params->asymmetric_quantize_inputs);
+    tensor_utils::BatchQuantizeFloats(
+        input_data, batch_size, input_size, quantized_input, scaling_factors,
+        zero_points, params->asymmetric_quantize_inputs);
     for (int b = 0; b < batch_size; ++b) {
-      scaling_factors_ptr[b] *= weights_feature_scale;
+      scaling_factors[b] *= weights_feature_scale;
     }
 
     // Compute conv1d(inputs, weights_feature).
     tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        weights_feature_ptr, num_filters, input_size, quantized_input_ptr,
-        scaling_factors_ptr, batch_size, scratch_ptr,
-        /*per_channel_scale=*/nullptr, zero_points_ptr,
-        reinterpret_cast<int32_t*>(scratch_ptr), row_sums_ptr, compute_row_sums,
+        weights_feature_data, num_filters, input_size, quantized_input,
+        scaling_factors, batch_size, scratch,
+        /*per_channel_scale=*/nullptr, zero_points,
+        reinterpret_cast<int32_t*>(scratch), row_sums, compute_row_sums,
         /*context=*/nullptr);
   }
   // Copy the latest activation from scratch into activation_state:
   // The last, i.e. (memory_size-1)th entry for each batch, and filter.
   for (int i = 0; i < batch_size * num_filters; ++i) {
-    state_ptr[i * memory_size + memory_size - 1] = scratch_ptr[i];
+    state[i * memory_size + memory_size - 1] = scratch[i];
   }
 
   // TODO(alanchiao): can optimize hybrid case ~5% by unrolling loop in applying
   // time weights so that the inner loop multiplies eight elements at a time.
   ApplyTimeWeightsBiasAndActivation(
-      batch_size, memory_size, num_filters, num_units, rank, weights_time_ptr,
-      bias_ptr, params->activation, state_ptr, scratch_ptr, output_ptr);
+      batch_size, memory_size, num_filters, num_units, rank, weights_time_data,
+      bias_data, params->activation, state, scratch, output_data);
 }
 
 }  // namespace reference_ops
@@ -304,9 +304,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   switch (weights_feature->type) {
     case kTfLiteFloat32: {
-      reference_ops::EvalFloatSVDF(context, node, input, weights_feature,
-                                   weights_time, bias, params, scratch, state,
-                                   output);
+      reference_ops::EvalFloatSVDF(
+          params, GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(weights_feature),
+          GetTensorData<float>(weights_feature), GetTensorShape(weights_time),
+          GetTensorData<float>(weights_time), GetTensorShape(bias),
+          GetTensorData<float>(bias), GetTensorData<float>(scratch),
+          GetTensorData<float>(state), GetTensorShape(output),
+          GetTensorData<float>(output));
       return kTfLiteOk;
       break;
     }
@@ -346,10 +351,24 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         op_data->float_weights_time_initialized = true;
       }
 
+      int32_t* zero_points_ptr = nullptr;
+      int32_t* row_sums_ptr = nullptr;
+      if (params->asymmetric_quantize_inputs && row_sums != nullptr) {
+        zero_points_ptr = GetTensorData<int32_t>(zero_points);
+        row_sums_ptr = GetTensorData<int32_t>(row_sums);
+      }
+
       reference_ops::EvalHybridSVDF(
-          context, node, input, weights_feature, float_weights_time, bias,
-          params, scratch, scaling_factors, input_quantized, state, output,
-          zero_points, row_sums, &op_data->compute_row_sums);
+          params, GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(weights_feature),
+          GetTensorData<int8_t>(weights_feature),
+          weights_feature->params.scale, GetTensorShape(float_weights_time),
+          GetTensorData<float>(float_weights_time), GetTensorShape(bias),
+          GetTensorData<float>(bias), GetTensorData<float>(scratch),
+          GetTensorData<float>(scaling_factors),
+          GetTensorData<int8_t>(input_quantized), GetTensorData<float>(state),
+          GetTensorShape(output), GetTensorData<float>(output),
+          zero_points_ptr, row_sums_ptr, &op_data->compute_row_sums);
       return kTfLiteOk;
     } else {
      auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
@@ -363,9 +382,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       // Currently supports only ReLU.
       // TODO(jianlijianli): support other activations.
       TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActRelu);
+
       reference_ops::EvalIntegerSVDF(
-          context, node, input, weights_feature, weights_time, bias, params,
-          state, output, scratch, output_temp, op_data->effective_scale_1_a,
+          params, GetTensorShape(input), GetTensorData<int8_t>(input),
+          GetTensorShape(weights_feature),
+          GetTensorData<int8_t>(weights_feature),
+          GetTensorShape(weights_time), GetTensorData<int16_t>(weights_time),
+          GetTensorShape(bias), GetTensorData<int32_t>(bias),
+          GetTensorData<int16_t>(state), GetTensorShape(output),
+          GetTensorData<int8_t>(output), GetTensorData<int32_t>(scratch),
+          GetTensorData<int32_t>(output_temp), op_data->effective_scale_1_a,
           op_data->effective_scale_1_b, op_data->effective_scale_2_a,
           op_data->effective_scale_2_b, input_params->zero_point->data[0],
           output_params->zero_point->data[0]);