Extract reference for operator BATCH_MATMUL to standalone header
Move the reference implementation into its own header so that TFLite Micro can use it without pulling in the unrelated dependencies of reference_ops.h. PR step 2 for issue #46504
parent 0e30543810
commit 0827a24f0e
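
As an illustration of the intent, a TFLite Micro kernel can now include just the standalone header instead of reference_ops.h (a minimal usage sketch; the wrapper function and its name are hypothetical, only the header path and the BatchMatMul entry point come from this commit):

// Hypothetical caller: depends only on the batch-matmul reference,
// not on the full set of reference_ops.h dependencies.
#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h"

void EvalBatchMatMulFloat(const tflite::RuntimeShape& lhs_shape,
                          const float* lhs_data,
                          const tflite::RuntimeShape& rhs_shape,
                          const float* rhs_data,
                          const tflite::RuntimeShape& output_shape,
                          float* output_data) {
  tflite::reference_ops::BatchMatMul(lhs_shape, lhs_data, rhs_shape, rhs_data,
                                     output_shape, output_data);
}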

tensorflow/lite/kernels/internal/BUILD
@@ -570,6 +570,7 @@ cc_library(
         "reference/add.h",
         "reference/add_n.h",
         "reference/arg_min_max.h",
+        "reference/batch_matmul.h",
         "reference/batch_to_space_nd.h",
         "reference/binary_function.h",
         "reference/cast.h",
@@ -807,6 +808,7 @@ cc_library(
     ],
     hdrs = [
         "tensor_utils.h",
+        "tensor_utils_common.h",
     ],
     compatible_with = get_compatible_with_portable(),
     copts = tflite_copts() + NEON_FLAGS_IF_APPLICABLE,

tensorflow/lite/kernels/internal/reference/batch_matmul.h
@@ -15,16 +15,40 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_BATCH_MATMUL_H_
 
-#include <stdint.h>
-#include <string.h>
+#include <algorithm>
+#include <cstdint>
 
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
-#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils_common.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace reference_ops {
+namespace batch_matmul {
+
+// Determine which dimension is the broadcast dimension.
+inline int broadcast_dim(int lhs_dim, int rhs_dim) {
+  if (lhs_dim == rhs_dim) return lhs_dim;
+  if (lhs_dim == 1) return rhs_dim;
+  TFLITE_DCHECK_EQ(rhs_dim, 1);
+  return lhs_dim;
+}
+
+// Compute the "extent" for iterating on this dimension.
+// If we are broadcasting, then don't advance (i.e. return 0).
+inline int extent(const RuntimeShape& shape, int x) {
+  if (shape.Dims(x) == 1) {
+    return 0;
+  }
+  int prod = 1;
+  for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
+    prod *= shape.Dims(i);
+  }
+  return prod;
+}
+
+}  // namespace batch_matmul
+
 inline void BatchMatMul(const RuntimeShape& lhs_shape, const float* lhs_data,
                         const RuntimeShape& rhs_shape, const float* rhs_data,
@@ -34,40 +58,19 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const float* lhs_data,
   const RuntimeShape extended_rhs_shape =
       RuntimeShape::ExtendedShape(5, rhs_shape);
 
-  // Determine which dimension is the broadcast dimension.
-  auto broadcast_dim = [](int lhs_dim, int rhs_dim) {
-    if (lhs_dim == rhs_dim) return lhs_dim;
-    if (lhs_dim == 1) return rhs_dim;
-    TFLITE_DCHECK_EQ(rhs_dim, 1);
-    return lhs_dim;
-  };
+  const int batch_dim0 = batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
+  const int batch_dim1 = batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
+  const int batch_dim2 = batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
 
-  // Compute the "extent" for iterating on this dimension.
-  // If we are broadcasting, then don't advance (i.e. return 0).
-  auto extent = [](const RuntimeShape& shape, int x) {
-    if (shape.Dims(x) == 1) {
-      return 0;
-    }
-    int prod = 1;
-    for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
-      prod *= shape.Dims(i);
-    }
-    return prod;
-  };
-
-  const int batch_dim0 =
-      broadcast_dim(extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
-  const int batch_dim1 =
-      broadcast_dim(extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
-  const int batch_dim2 =
-      broadcast_dim(extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
-
-  const int lhs_ext0 = extent(extended_lhs_shape, 0);
-  const int lhs_ext1 = extent(extended_lhs_shape, 1);
-  const int lhs_ext2 = extent(extended_lhs_shape, 2);
-  const int rhs_ext0 = extent(extended_rhs_shape, 0);
-  const int rhs_ext1 = extent(extended_rhs_shape, 1);
-  const int rhs_ext2 = extent(extended_rhs_shape, 2);
+  const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
+  const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
+  const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
+  const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
+  const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
+  const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
 
   // Set params for each matrix multiply.
   const int lhs_rows = extended_lhs_shape.Dims(3);
@@ -113,40 +116,19 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
   const RuntimeShape extended_rhs_shape =
       RuntimeShape::ExtendedShape(5, rhs_shape);
 
-  // Determine which dimension is the broadcast dimension.
-  auto broadcast_dim = [](int lhs_dim, int rhs_dim) {
-    if (lhs_dim == rhs_dim) return lhs_dim;
-    if (lhs_dim == 1) return rhs_dim;
-    TFLITE_DCHECK_EQ(rhs_dim, 1);
-    return lhs_dim;
-  };
+  const int batch_dim0 = batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
+  const int batch_dim1 = batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
+  const int batch_dim2 = batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
 
-  // Compute the "extent" for iterating on this dimension.
-  // If we are broadcasting, then don't advance (i.e. return 0).
-  auto extent = [](const RuntimeShape& shape, int x) {
-    if (shape.Dims(x) == 1) {
-      return 0;
-    }
-    int prod = 1;
-    for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
-      prod *= shape.Dims(i);
-    }
-    return prod;
-  };
-
-  const int batch_dim0 =
-      broadcast_dim(extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
-  const int batch_dim1 =
-      broadcast_dim(extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
-  const int batch_dim2 =
-      broadcast_dim(extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
-
-  const int lhs_ext0 = extent(extended_lhs_shape, 0);
-  const int lhs_ext1 = extent(extended_lhs_shape, 1);
-  const int lhs_ext2 = extent(extended_lhs_shape, 2);
-  const int rhs_ext0 = extent(extended_rhs_shape, 0);
-  const int rhs_ext1 = extent(extended_rhs_shape, 1);
-  const int rhs_ext2 = extent(extended_rhs_shape, 2);
+  const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
+  const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
+  const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
+  const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
+  const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
+  const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
 
   // Set params for each matrix multiply.
   const int lhs_rows = extended_lhs_shape.Dims(3);
@@ -223,40 +205,19 @@ inline void BatchMatMul(const FullyConnectedParams& params,
   const RuntimeShape extended_rhs_shape =
       RuntimeShape::ExtendedShape(5, rhs_shape);
 
-  // Determine which dimension is the broadcast dimension.
-  auto broadcast_dim = [](int lhs_dim, int rhs_dim) {
-    if (lhs_dim == rhs_dim) return lhs_dim;
-    if (lhs_dim == 1) return rhs_dim;
-    TFLITE_DCHECK_EQ(rhs_dim, 1);
-    return lhs_dim;
-  };
+  const int batch_dim0 = batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
+  const int batch_dim1 = batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
+  const int batch_dim2 = batch_matmul::broadcast_dim(
+      extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
 
-  // Compute the "extent" for iterating on this dimension.
-  // If we are broadcasting, then don't advance (i.e. return 0).
-  auto extent = [](const RuntimeShape& shape, int x) {
-    if (shape.Dims(x) == 1) {
-      return 0;
-    }
-    int prod = 1;
-    for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
-      prod *= shape.Dims(i);
-    }
-    return prod;
-  };
-
-  const int batch_dim0 =
-      broadcast_dim(extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
-  const int batch_dim1 =
-      broadcast_dim(extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
-  const int batch_dim2 =
-      broadcast_dim(extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
-
-  const int lhs_ext0 = extent(extended_lhs_shape, 0);
-  const int lhs_ext1 = extent(extended_lhs_shape, 1);
-  const int lhs_ext2 = extent(extended_lhs_shape, 2);
-  const int rhs_ext0 = extent(extended_rhs_shape, 0);
-  const int rhs_ext1 = extent(extended_rhs_shape, 1);
-  const int rhs_ext2 = extent(extended_rhs_shape, 2);
+  const int lhs_ext0 = batch_matmul::extent(extended_lhs_shape, 0);
+  const int lhs_ext1 = batch_matmul::extent(extended_lhs_shape, 1);
+  const int lhs_ext2 = batch_matmul::extent(extended_lhs_shape, 2);
+  const int rhs_ext0 = batch_matmul::extent(extended_rhs_shape, 0);
+  const int rhs_ext1 = batch_matmul::extent(extended_rhs_shape, 1);
+  const int rhs_ext2 = batch_matmul::extent(extended_rhs_shape, 2);
 
   // Set params for each matrix multiply.
   const int lhs_rows = extended_lhs_shape.Dims(3);
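
To make the two extracted helpers concrete (an illustrative sketch, not part of the commit): broadcast_dim picks the non-unit size of a batch dimension, and extent returns the element stride used to advance along that dimension, or 0 where the operand is broadcast so its data pointer never moves.

#include <cassert>
#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h"

void BroadcastHelpersDemo() {
  using tflite::reference_ops::batch_matmul::broadcast_dim;
  using tflite::reference_ops::batch_matmul::extent;

  assert(broadcast_dim(2, 2) == 2);  // equal sizes pass through
  assert(broadcast_dim(1, 4) == 4);  // the size-1 side is broadcast

  // Extended shape {2, 1, 3, 4, 5}: batch dims {2, 1, 3}, matrix dims 4 x 5.
  const tflite::RuntimeShape shape({2, 1, 3, 4, 5});
  assert(extent(shape, 1) == 0);      // broadcast dim: don't advance
  assert(extent(shape, 2) == 4 * 5);  // stride = product of trailing dims
}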

tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/reference/add.h"
 #include "tensorflow/lite/kernels/internal/reference/add_n.h"
 #include "tensorflow/lite/kernels/internal/reference/arg_min_max.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h"
 #include "tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h"
 #include "tensorflow/lite/kernels/internal/reference/binary_function.h"
 #include "tensorflow/lite/kernels/internal/reference/cast.h"

tensorflow/lite/kernels/internal/tensor_utils.h
@@ -17,9 +17,11 @@ limitations under the License.
 
 #include <algorithm>
 #include <cmath>
+#include <cstdint>
 
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils_common.h"
 
 #if defined(_MSC_VER)
 #define __restrict__ __restrict
@@ -34,106 +36,6 @@ class CpuBackendContext;
 
 namespace tensor_utils {
 
-// Checks if all entries of vector are zero for float.
-bool IsZeroVector(const float* vector, int v_size);
-
-// Checks if all entries of vector are zero for int8.
-bool IsZeroVector(const int8_t* vector, int v_size);
-
-// Quantizes a buffer of floating point values using a symmetric quantization
-// (i.e. linear quantization without an offset) to 8-bit signed integers.
-// It also outputs the range (min, max) of the floating point buffer, and the
-// scaling factor used to quantize the values.
-void SymmetricQuantizeFloats(const float* values, const int size,
-                             int8_t* quantized_values, float* min_value,
-                             float* max_value, float* scaling_factor);
-
-// Quantizes a buffer of floating point values using a symmetric quantization
-// (i.e. linear quantization without an offset) to 8-bit signed integers.
-// It uses the range (min, max) provided to the function to calculate the
-// appropriate scaling factor to quantize the values.
-void SymmetricQuantizeFloats(const float* values, const int size,
-                             int8_t* quantized_values, float min_value,
-                             float max_value, float* scaling_factor);
-
-void AsymmetricQuantizeFloats(const float* values, const int size,
-                              int8_t* quantized_values, float* scaling_factor,
-                              int32_t* offset);
-
-// Helper function to quantize floats.
-// float_data_ptr     input float vectors
-// n_batch            number of input vectors
-// n_data             size of a single input vector
-// quantized_data_ptr (out) vector with quantized data
-// scaling_factors    (out) scaling factors (one per vector)
-// zero_points        (out) zero points (one per vector)
-// do_asymmetric      controls if the quantization should be asymmetric.
-inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch,
-                                int n_data, int8_t* quantized_data_ptr,
-                                float* scaling_factors, int32_t* zero_points,
-                                bool do_asymmetric) {
-  for (int b = 0; b < n_batch; ++b) {
-    const int offset = b * n_data;
-    if (do_asymmetric) {
-      tensor_utils::AsymmetricQuantizeFloats(
-          float_data_ptr + offset, n_data, quantized_data_ptr + offset,
-          &scaling_factors[b], &zero_points[b]);
-    } else {
-      float unused_min, unused_max;
-      tensor_utils::SymmetricQuantizeFloats(
-          float_data_ptr + offset, n_data, quantized_data_ptr + offset,
-          &unused_min, &unused_max, &scaling_factors[b]);
-    }
-  }
-}
-
-// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
-// dimension composed by input vectors independent from each other). The result
-// of the multiplication is accumulated to the passed result buffer.
-// More specifically, for a matrix M of shape [n, i] and a batched-vector
-// of shape [i, batch] it will first compute the product of shape [n, batch].
-// This product will be accumulated to the result buffer.
-void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
-                                         int m_cols, const float* vector,
-                                         int n_batch, float* result);
-
-// Same as the function above, but the matrix is a sparse tensor with block
-// pattern 1x4.
-// This function assumes that m_cols is a multiple of the block size (4 in this
-// case) so that there's no incomplete block.
-void SparseMatrixBatchVectorMultiplyAccumulate1x4(
-    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
-    const int32_t* __restrict__ indices, int m_rows, int m_cols,
-    const float* __restrict__ vector, int n_batch, float* __restrict__ result);
-
-// Same as the function above, but the matrix is stored in block compressed
-// sparse row format with block pattern 1x16 which consists of two arrays:
-//   1. A matrix array stores non-zero blocks of the matrix in row major.
-//   2. A ledger array stores nrows groups, one group per row. Each group starts
-//      with an integer representing the number of non-zero blocks for the
-//      corresponding row and follows with column indexes of the first element
-//      of each non-zero block.
-// This function assumes that
-//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
-//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
-void SparseMatrixBatchVectorMultiplyAccumulate(
-    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
-    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
-    float* __restrict__ result);
-
-// Same as the function above, but for values quantized using symmetric
-// quantization (e.g. by calling SymmetricQuantizeFloats).
-// The passed scaling factors is a buffer of the quantization scaling factors
-// that will be used to dequantize the products into the final result buffer.
-// These scaling factors are the multiplication of the matrix scaling factor
-// by the vector's scaling factor, one per batch (i.e. this allows quantizing
-// each batch in the batch-vector matrix independently).
-void MatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
-    const int8_t* __restrict__ vectors,
-    const float* __restrict__ scaling_factors, int n_batch,
-    float* __restrict__ result);
-
 // Same as the function above, but provide a scratch buffer for the
 // int8 x int8 -> int32 and a CpuBackendContext for the accumulator
 // computation.
@@ -144,16 +46,6 @@ void MatrixBatchVectorMultiplyAccumulate(
     int32_t* __restrict__ scratch, float* __restrict__ result,
     CpuBackendContext* __restrict__ context);
 
-// Same as the function above except that vector values
-// are quantized with asymmetric quantization per-batch and the matrix
-// is quantized per row.
-void MatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
-    const int8_t* __restrict__ vectors,
-    const float* __restrict__ scaling_factors, int n_batch,
-    float* __restrict__ result, const float* __restrict__ per_channel_scale,
-    const int32_t* __restrict__ input_offset);
-
 // Same as the function above except that it can make use of cached row sums.
 void MatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
@@ -183,22 +75,6 @@ inline void MatrixBatchVectorMultiplyAccumulate(
       row_sums, compute_row_sums, context);
 }
 
-// Same as the function above, but the matrix is stored in block compressed
-// sparse row format with block pattern 1x16 which consists of two arrays:
-//   1. A matrix array stores non-zero blocks of the matrix in row major.
-//   2. A ledger array stores nrows groups, one group per row. Each group starts
-//      with an integer representing the number of non-zero blocks for the
-//      corresponding row followed by column index of the first element of
-//      each non-zero block.
-// This function assumes that
-//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
-//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
-void SparseMatrixBatchVectorMultiplyAccumulate(
-    const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
-    const int m_rows, const int m_cols, const int8_t* __restrict__ vectors,
-    const float* __restrict__ scaling_factors, int n_batch,
-    float* __restrict__ result);
-
 // Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
 // dimension composed by input vectors independent from each other). The result
 // of the multiplication is accumulated to the passed result buffer.
@@ -223,8 +99,8 @@ void SparseMatrixBatchVectorMultiplyAccumulate(
 //  - multiplier and shift combined gives the scale.
 //  - assumes input zero point is 0.
 //  - scratch is created for optimization purpose only.
-// TODO(b/152066492): this can be removed if some future optimization
-// work makes it unnecessary.
+// TODO(b/152066492): this can be removed if some future optimization
+// work makes it unnecessary.
 void MatrixBatchVectorMultiplyAccumulate(
     const int8_t* input, const int32_t* bias,
     const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
@@ -254,280 +130,14 @@ void MatrixBatchVectorMultiplyAccumulate(
 //  - multiplier and shift combined gives the scale.
 //  - assumes input zero point is 0.
 //  - scratch is created for optimization purpose only.
-// TODO(b/152066492): this can be removed if some future optimization
-// work makes it unnecessary.
+// TODO(b/152066492): this can be removed if some future optimization
+// work makes it unnecessary.
 void MatrixBatchVectorMultiplyAccumulate(
     const int8_t* input, const int32_t* bias,
     const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
     int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
     int32_t* scratch, int8_t* output, CpuBackendContext* context);
 
-// Same as the above 8, 8, 8 integer matmul except for the presence of zero
-// point and non-accumulative.
-// TODO(b/148688698): remove this function by folding zero point calculation in
-// prepare() function.
-void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
-                               const int8_t* input_to_gate_weights,
-                               int32_t input_to_gate_effective_scale_a,
-                               int32_t input_to_gate_effective_scale_b,
-                               int32_t n_batch, int32_t n_input, int32_t n_cell,
-                               int8_t* gate_output, int8_t gate_output_zp);
-
-// Same as above but has 16 bit and 8 bit input and 8 bit output.
-// Used in projection when hidden is 16bit.
-void MatrixBatchVectorMultiply(const int16_t* hidden,
-                               const int8_t* hidden_to_output_weights,
-                               int32_t proj_effective_scale_a,
-                               int32_t proj_effective_scale_b,
-                               const int32_t* gate_bias, int32_t n_batch,
-                               int32_t n_hidden, int32_t n_output,
-                               int32_t output_zp, int8_t* proj_output);
-
-// Multiplies a matrix with a scalar and reduces the result on each row to a
-// scalar.
-// Parameters:
-//     - matrix: matrix of size n_row * n_col
-//     - scalar: the scalar that is multiplied to each element in the matrix
-//     - n_row: the row count of the matrix
-//     - n_col: the column count of the matrix
-//     - output: the 32bit output
-// Note: We do not need saturation because the int8 * int8 is safe from overflow
-// in (2^31-1) / (2^14) = 131072, which is bigger than the n_row. Non-zero
-// initial output value is not exceptionally large.
-void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
-                                    int32_t n_row, int32_t n_col,
-                                    int32_t* output);
-
-// Apply Layer Normalization (https://arxiv.org/abs/1607.06450) to a Quantized
-// vector.
-// Parameters:
-//     - input: batch vector of size n_batch * n_input; 16 bit.
-//     - layer_norm_weights: the quantized layer normalization weights.
-//     - bias: the bias for the layer normalization.
-//     - layer_norm_scale_a: multiplier for scale factor.
-//     - layer_norm_scale_b: shift for scale factor.
-//     - variance_limit: the guard to make sure the inverse does not overflow.
-//     - n_batch: the number of batches.
-//     - n_input: the size for input and output.
-//     - output: the 16 bit output
-void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
-                    const int32_t* bias, int32_t layer_norm_scale_a,
-                    int32_t layer_norm_scale_b, int32_t variance_limit,
-                    int n_batch, int n_input, int16_t* output);
-
-// Same as above but the internal calculation is done in float.
-void ApplyLayerNormFloat(const int16_t* input,
-                         const int16_t* layer_norm_weights,
-                         int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
-                         const int32_t* bias, int n_batch, int n_input,
-                         int16_t* output);
-
-// Apply Sigmoid to a quantized vector.
-// Parameters:
-//     - input: batch vector of size n_batch * n_input; 16 bit.
-//     - n_batch: the number of batches.
-//     - n_input: the size for input and output.
-//     - output: the 16 bit output
-// The input is in Q3.12 format and the output is in Q0.15 format.
-void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
-                  int16_t* output);
-
-// Same as above but the internal calculation is float.
-void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
-                       int16_t* output);
-
-// Apply Tanh to a quantized vector.
-// Parameters:
-//     - integer_bits: the integer bits of the input.
-//                     Currently supports 0, 1, 2, 3, 4, 5, 6.
-//     - input: batch vector of size n_batch * n_input; 16 bit.
-//     - n_batch: the number of batches.
-//     - n_input: the size for input and output.
-//     - output: the 16 bit output
-// The input is in Qm.15-m format and the output is in Q0.15 format.
-void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
-               int32_t n_input, int16_t* output);
-
-// Apply Tanh to a quantized vector. The internal calculation is in float.
-//    - Input has 2^(integer_bits) as scale.
-//    - Output has Q0.15 as scale.
-void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
-                    int32_t integer_bits, int16_t* output);
-
-// Element-wise multiplication of two quantized vectors.
-// Parameters:
-//     - input_1: batch vector of size n_batch * n_input; 16 bit.
-//     - input_2: batch vector of size n_batch * n_input; 16 bit.
-//     - n_batch: the number of batches.
-//     - n_input: the size for input and output.
-//     - shift: the shift needed to produce the output.
-//     - output: the 16 bit output of size n_batch * n_input.
-// Output does not need to be initialized.
-void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
-              int n_input, int shift, int16_t* output);
-
-// Element-wise multiplication of two quantized vectors.
-// Parameters:
-//     - input_1: batch vector of size n_batch * n_input; 16 bit.
-//     - input_2: batch vector of size n_batch * n_input; 16 bit.
-//     - n_batch: the number of batches.
-//     - n_input: the size for input and output.
-//     - shift: the shift needed to produce the output.
-//     - output: the 8 bit output of size n_batch * n_input.
-// Output does not need to be initialized.
-void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
-              int n_input, int shift, int8_t* output);
-
-// Element-wise multiplication of two quantized vectors with rescaling.
-// Parameters:
-//     - input_1: batch vector of size n_batch * n_input; 16 bit.
-//     - input_2: batch vector of size n_batch * n_input; 16 bit.
-//     - multiplier: the multiplier part of scale.
-//     - shift: the shift part of scale.
-//     - n_batch: the number of batches.
-//     - n_input: the size for input and output.
-//     - output: the 8 bit output of size n_batch * n_input.
-//     - output_zp: the zero point of output.
-// Output does not need to be initialized.
-// Multiplier ("m") and shift ("s") are connected to the scale with
-// scale = m * 2^(s - 31).
-void CwiseMul(const int16_t* input_1, const int16_t* input_2,
-              int32_t multiplier, int32_t shift, int32_t n_batch,
-              int32_t n_input, int32_t output_zp, int8_t* output);
-
-// Element-wise saturating addition of two quantized vectors without rescaling.
-// Parameters:
-//     - input_1: batch vector of size n_batch * n_input; 16 bit.
-//     - input_2: batch vector of size n_batch * n_input; 16 bit.
-//     - n_batch: the number of batches.
-//     - n_input: the size for input and output.
-//     - output: the 8 bit output of size n_batch * n_input.
-// Output does not need to be initialized.
-void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
-              int n_input, int16_t* output);
-
-// Element-wise in-place clipping of a vector. Overloaded for float, int16_t,
-// int8_t. Parameters:
-//     - vector: vector of size v_size.
-//     - v_size: the size of the vector.
-//     - clipping_value: the value used for clipping.
-void CwiseClipping(float* vector, const int v_size, const float clipping_value);
-void CwiseClipping(int16_t* vector, const int v_size,
-                   const int16_t clipping_value);
-void CwiseClipping(int8_t* vector, const int v_size,
-                   const int8_t clipping_value);
-
-// Cwise product of two vectors.
-template <typename T>
-inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
-                                     const T* __restrict__ vector2, int v_size,
-                                     T* __restrict__ result) {
-  for (int v = 0; v < v_size; v++) {
-    *result++ = *vector1++ * *vector2++;
-  }
-}
-
-// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
-// assumption here is that result array is initialized to valid values.
-template <typename T>
-inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
-                                               const T* __restrict__ vector2,
-                                               int v_size,
-                                               T* __restrict__ result) {
-  for (int v = 0; v < v_size; v++) {
-    *result++ += *vector1++ * *vector2++;
-  }
-}
-
-// Dot product of two vectors.
-float VectorVectorDotProduct(const float* vector1, const float* vector2,
-                             int v_size);
-
-// Dot product of two batch vectors of size n_batch * v_size:
-// vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
-//            x_2_1, x_2_2, ..., x_2_vsize,
-//            ...
-//            x_nbatch_1,..., x_nbatch_vsize]
-// vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
-//            y_2_1, y_2_2, ..., y_2_vsize,
-//            ...
-//            y_nbatch_1,..., y_nbatch_vsize]
-// Then result will be a vector of n_batch size starting from 'result':
-// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
-//  x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
-//  ...
-//  x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize]
-template <typename T>
-inline void BatchVectorBatchVectorDotProduct(const T* vector1, const T* vector2,
-                                             int v_size, int n_batch,
-                                             T* result) {
-  for (int b = 0; b < n_batch; b++) {
-    result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
-    vector1 += v_size;
-    vector2 += v_size;
-  }
-}
-
-// Same as above but input is 16bit and output is 32bit.
-void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
-                                      const int16_t* vector2, int v_size,
-                                      int n_batch, int32_t* result);
-
-// Cwise product of a vector and a batch-vector.
-template <typename T>
-inline void VectorBatchVectorCwiseProduct(const T* vector, int v_size,
-                                          const T* batch_vector, int n_batch,
-                                          T* result) {
-  for (int b = 0; b < n_batch; b++) {
-    VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
-    // Update the pointers.
-    result += v_size;
-    batch_vector += v_size;
-  }
-}
-
-// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
-// operation, the assumption here is that result array is initialized to valid
-// values.
-template <typename T>
-inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size,
-                                                    const T* batch_vector,
-                                                    int n_batch, T* result) {
-  for (int b = 0; b < n_batch; b++) {
-    VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
-    // Update the pointers.
-    result += v_size;
-    batch_vector += v_size;
-  }
-}
-
-// Same as above, but inputs are 16bit integer and output is 16bit integer.
-void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
-                                             const int16_t* batch_vector,
-                                             int n_batch, int32_t multiplier,
-                                             int shift, int16_t* result);
-
-// Add another vector for each batch in the batch vector.
-template <typename T>
-void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch,
-                          T* batch_vector) {
-  for (int b = 0; b < n_batch; b++) {
-    for (int i = 0; i < v_size; ++i) {
-      batch_vector[i] += vector[i];
-    }
-    batch_vector += v_size;
-  }
-}
-
-// Batch vector initialization with another vector.
-template <typename T>
-void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch,
-                             T* batch_vector) {
-  for (int b = 0; b < n_batch; b++) {
-    std::copy_n(vector, v_size, batch_vector + b * v_size);
-  }
-}
-
 // Apply Rectified Linear to elements of a vector.
 inline void ApplyReluToVector(const float* __restrict__ vector, int v_size,
                               float* __restrict__ result) {
@@ -601,48 +211,6 @@ inline void ApplyActivationToVector(const float* __restrict__ vector,
   }
 }
 
-// Compute "1.0f - elements of vector" (used in CIFG).
-void Sub1Vector(const float* vector, int v_size, float* result);
-
-// Compute "1.0f - elements of vector" (used in CIFG) for int16 input.
-// "vector" has range [0, 32767] because it is the output of sigmoid function.
-void Sub1Vector(const int16_t* vector, int v_size, int16_t* result);
-
-// Multiply all elements of vector with a scalar.
-void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
-                          float* result);
-
-// Reduce-sum on a float input vector:
-// input_vector: float pointer to input vector.
-// output_vector: float pointer to vector.
-// output_size: output vector size.
-// reduction_size: number of consecutive elements from input vector which are
-// added to get one element of output.
-void ReductionSumVector(const float* input_vector, float* output_vector,
-                        int output_size, int reduction_size);
-
-// Same as above but input/output is 32 bit integer.
-void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
-                        int output_size, int reduction_size);
-
-// Same as above but input is 8 bit integer.
-void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
-                        int output_size, int reduction_size);
-
-// Layer norm for each batch.
-void MeanStddevNormalization(const float* __restrict__ input_vector,
-                             float* __restrict__ output_vector, int v_size,
-                             int n_batch);
-
-// Saturate Add with rescale on both inputs.
-void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
-                          const int8_t* recurrent, int8_t recurrent_zp,
-                          int32_t input_effective_scale_a,
-                          int32_t input_effective_scale_b,
-                          int32_t recurrent_effective_scale_a,
-                          int32_t recurrent_effective_scale_b, int32_t n_batch,
-                          int32_t n_cell, int16_t* output);
-
 }  // namespace tensor_utils
 }  // namespace tflite
 
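A note on the multiplier/shift convention that recurs in the declarations above: a quantized multiplier m and shift s encode the real-valued scale as

    scale = m * 2^(s - 31)

For example (illustrative numbers only), m = 1073741824 (2^30) with s = 0 encodes scale = 2^30 * 2^(-31) = 0.5.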

tensorflow/lite/kernels/internal/tensor_utils_common.h (new file, 466 lines)
@@ -0,0 +1,466 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_COMMON_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_COMMON_H_
+
+#include <algorithm>
+#include <cstdint>
+
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
+namespace tflite {
+
+namespace tensor_utils {
+
+// Checks if all entries of vector are zero for float.
+bool IsZeroVector(const float* vector, int v_size);
+
+// Checks if all entries of vector are zero for int8.
+bool IsZeroVector(const int8_t* vector, int v_size);
+
+// Quantizes a buffer of floating point values using a symmetric quantization
+// (i.e. linear quantization without an offset) to 8-bit signed integers.
+// It also outputs the range (min, max) of the floating point buffer, and the
+// scaling factor used to quantize the values.
+void SymmetricQuantizeFloats(const float* values, const int size,
+                             int8_t* quantized_values, float* min_value,
+                             float* max_value, float* scaling_factor);
+
+// Quantizes a buffer of floating point values using a symmetric quantization
+// (i.e. linear quantization without an offset) to 8-bit signed integers.
+// It uses the range (min, max) provided to the function to calculate the
+// appropriate scaling factor to quantize the values.
+void SymmetricQuantizeFloats(const float* values, const int size,
+                             int8_t* quantized_values, float min_value,
+                             float max_value, float* scaling_factor);
+
+void AsymmetricQuantizeFloats(const float* values, const int size,
+                              int8_t* quantized_values, float* scaling_factor,
+                              int32_t* offset);
+
+// Helper function to quantize floats.
+// float_data_ptr     input float vectors
+// n_batch            number of input vectors
+// n_data             size of a single input vector
+// quantized_data_ptr (out) vector with quantized data
+// scaling_factors    (out) scaling factors (one per vector)
+// zero_points        (out) zero points (one per vector)
+// do_asymmetric      controls if the quantization should be asymmetric.
+inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch,
+                                int n_data, int8_t* quantized_data_ptr,
+                                float* scaling_factors, int32_t* zero_points,
+                                bool do_asymmetric) {
+  for (int b = 0; b < n_batch; ++b) {
+    const int offset = b * n_data;
+    if (do_asymmetric) {
+      tensor_utils::AsymmetricQuantizeFloats(
+          float_data_ptr + offset, n_data, quantized_data_ptr + offset,
+          &scaling_factors[b], &zero_points[b]);
+    } else {
+      float unused_min, unused_max;
+      tensor_utils::SymmetricQuantizeFloats(
+          float_data_ptr + offset, n_data, quantized_data_ptr + offset,
+          &unused_min, &unused_max, &scaling_factors[b]);
+    }
+  }
+}
+
+// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
+// dimension composed by input vectors independent from each other). The result
+// of the multiplication is accumulated to the passed result buffer.
+// More specifically, for a matrix M of shape [n, i] and a batched-vector
+// of shape [i, batch] it will first compute the product of shape [n, batch].
+// This product will be accumulated to the result buffer.
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                         int m_cols, const float* vector,
+                                         int n_batch, float* result);
+
+// Same as the function above, but the matrix is a sparse tensor with block
+// pattern 1x4.
+// This function assumes that m_cols is a multiple of the block size (4 in this
+// case) so that there's no incomplete block.
+void SparseMatrixBatchVectorMultiplyAccumulate1x4(
+    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
+    const int32_t* __restrict__ indices, int m_rows, int m_cols,
+    const float* __restrict__ vector, int n_batch, float* __restrict__ result);
+
+// Same as the function above, but the matrix is stored in block compressed
+// sparse row format with block pattern 1x16 which consists of two arrays:
+//   1. A matrix array stores non-zero blocks of the matrix in row major.
+//   2. A ledger array stores nrows groups, one group per row. Each group starts
+//      with an integer representing the number of non-zero blocks for the
+//      corresponding row and follows with column indexes of the first element
+//      of each non-zero block.
+// This function assumes that
+//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
+//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
+    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
+    float* __restrict__ result);
+
+// Same as the function above, but for values quantized using symmetric
+// quantization (e.g. by calling SymmetricQuantizeFloats).
+// The passed scaling factors is a buffer of the quantization scaling factors
+// that will be used to dequantize the products into the final result buffer.
+// These scaling factors are the multiplication of the matrix scaling factor
+// by the vector's scaling factor, one per batch (i.e. this allows quantizing
+// each batch in the batch-vector matrix independently).
+void MatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors,
+    const float* __restrict__ scaling_factors, int n_batch,
+    float* __restrict__ result);
+
+// Same as the function above except that vector values
+// are quantized with asymmetric quantization per-batch and the matrix
+// is quantized per row.
+void MatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
+    const int8_t* __restrict__ vectors,
+    const float* __restrict__ scaling_factors, int n_batch,
+    float* __restrict__ result, const float* __restrict__ per_channel_scale,
+    const int32_t* __restrict__ input_offset);
+
+// Same as the function above, but the matrix is stored in block compressed
+// sparse row format with block pattern 1x16 which consists of two arrays:
+//   1. A matrix array stores non-zero blocks of the matrix in row major.
+//   2. A ledger array stores nrows groups, one group per row. Each group starts
+//      with an integer representing the number of non-zero blocks for the
+//      corresponding row followed by column index of the first element of
+//      each non-zero block.
+// This function assumes that
+//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
+//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* __restrict__ ledger,
+    const int m_rows, const int m_cols, const int8_t* __restrict__ vectors,
+    const float* __restrict__ scaling_factors, int n_batch,
+    float* __restrict__ result);
+
+// Same as the above 8, 8, 8 integer matmul except for the presence of zero
+// point and non-accumulative.
+// TODO(b/148688698): remove this function by folding zero point calculation in
+// prepare() function.
+void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
+                               const int8_t* input_to_gate_weights,
+                               int32_t input_to_gate_effective_scale_a,
+                               int32_t input_to_gate_effective_scale_b,
+                               int32_t n_batch, int32_t n_input, int32_t n_cell,
+                               int8_t* gate_output, int8_t gate_output_zp);
+
+// Same as above but has 16 bit and 8 bit input and 8 bit output.
+// Used in projection when hidden is 16bit.
+void MatrixBatchVectorMultiply(const int16_t* hidden,
+                               const int8_t* hidden_to_output_weights,
+                               int32_t proj_effective_scale_a,
+                               int32_t proj_effective_scale_b,
+                               const int32_t* gate_bias, int32_t n_batch,
+                               int32_t n_hidden, int32_t n_output,
+                               int32_t output_zp, int8_t* proj_output);
+
+// Multiplies a matrix with a scalar and reduces the result on each row to a
+// scalar.
+// Parameters:
+//     - matrix: matrix of size n_row * n_col
+//     - scalar: the scalar that is multiplied to each element in the matrix
+//     - n_row: the row count of the matrix
+//     - n_col: the column count of the matrix
+//     - output: the 32bit output
+// Note: We do not need saturation because the int8 * int8 is safe from overflow
+// in (2^31-1) / (2^14) = 131072, which is bigger than the n_row. Non-zero
+// initial output value is not exceptionally large.
+void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
+                                    int32_t n_row, int32_t n_col,
+                                    int32_t* output);
+
+// Apply Layer Normalization (https://arxiv.org/abs/1607.06450) to a Quantized
+// vector.
+// Parameters:
+//     - input: batch vector of size n_batch * n_input; 16 bit.
+//     - layer_norm_weights: the quantized layer normalization weights.
+//     - bias: the bias for the layer normalization.
+//     - layer_norm_scale_a: multiplier for scale factor.
+//     - layer_norm_scale_b: shift for scale factor.
+//     - variance_limit: the guard to make sure the inverse does not overflow.
+//     - n_batch: the number of batches.
+//     - n_input: the size for input and output.
+//     - output: the 16 bit output
+void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
+                    const int32_t* bias, int32_t layer_norm_scale_a,
+                    int32_t layer_norm_scale_b, int32_t variance_limit,
+                    int n_batch, int n_input, int16_t* output);
+
+// Same as above but the internal calculation is done in float.
+void ApplyLayerNormFloat(const int16_t* input,
+                         const int16_t* layer_norm_weights,
+                         int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
+                         const int32_t* bias, int n_batch, int n_input,
+                         int16_t* output);
+
+// Apply Sigmoid to a quantized vector.
+// Parameters:
+//     - input: batch vector of size n_batch * n_input; 16 bit.
+//     - n_batch: the number of batches.
+//     - n_input: the size for input and output.
+//     - output: the 16 bit output
+// The input is in Q3.12 format and the output is in Q0.15 format.
+void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
+                  int16_t* output);
+
+// Same as above but the internal calculation is float.
+void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
+                       int16_t* output);
+
+// Apply Tanh to a quantized vector.
+// Parameters:
+//     - integer_bits: the integer bits of the input.
+//                     Currently supports 0, 1, 2, 3, 4, 5, 6.
+//     - input: batch vector of size n_batch * n_input; 16 bit.
+//     - n_batch: the number of batches.
+//     - n_input: the size for input and output.
+//     - output: the 16 bit output
+// The input is in Qm.15-m format and the output is in Q0.15 format.
+void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
+               int32_t n_input, int16_t* output);
+
+// Apply Tanh to a quantized vector. The internal calculation is in float.
+//    - Input has 2^(integer_bits) as scale.
+//    - Output has Q0.15 as scale.
+void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
+                    int32_t integer_bits, int16_t* output);
+
+// Element-wise multiplication of two quantized vectors.
+// Parameters:
+//     - input_1: batch vector of size n_batch * n_input; 16 bit.
+//     - input_2: batch vector of size n_batch * n_input; 16 bit.
+//     - n_batch: the number of batches.
+//     - n_input: the size for input and output.
+//     - shift: the shift needed to produce the output.
+//     - output: the 16 bit output of size n_batch * n_input.
+// Output does not need to be initialized.
+void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
+              int n_input, int shift, int16_t* output);
+
+// Element-wise multiplication of two quantized vectors.
+// Parameters:
+//     - input_1: batch vector of size n_batch * n_input; 16 bit.
+//     - input_2: batch vector of size n_batch * n_input; 16 bit.
+//     - n_batch: the number of batches.
+//     - n_input: the size for input and output.
+//     - shift: the shift needed to produce the output.
+//     - output: the 8 bit output of size n_batch * n_input.
+// Output does not need to be initialized.
+void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
+              int n_input, int shift, int8_t* output);
+
+// Element-wise multiplication of two quantized vectors with rescaling.
+// Parameters:
+//     - input_1: batch vector of size n_batch * n_input; 16 bit.
+//     - input_2: batch vector of size n_batch * n_input; 16 bit.
+//     - multiplier: the multiplier part of scale.
+//     - shift: the shift part of scale.
+//     - n_batch: the number of batches.
+//     - n_input: the size for input and output.
+//     - output: the 8 bit output of size n_batch * n_input.
+//     - output_zp: the zero point of output.
+// Output does not need to be initialized.
+// Multiplier ("m") and shift ("s") are connected to the scale with
+// scale = m * 2^(s - 31).
+void CwiseMul(const int16_t* input_1, const int16_t* input_2,
+              int32_t multiplier, int32_t shift, int32_t n_batch,
+              int32_t n_input, int32_t output_zp, int8_t* output);
+
+// Element-wise saturating addition of two quantized vectors without rescaling.
+// Parameters:
+//     - input_1: batch vector of size n_batch * n_input; 16 bit.
+//     - input_2: batch vector of size n_batch * n_input; 16 bit.
+//     - n_batch: the number of batches.
+//     - n_input: the size for input and output.
+//     - output: the 8 bit output of size n_batch * n_input.
+// Output does not need to be initialized.
+void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
+              int n_input, int16_t* output);
+
+// Element-wise in-place clipping of a vector. Overloaded for float, int16_t,
+// int8_t. Parameters:
+//     - vector: vector of size v_size.
+//     - v_size: the size of the vector.
+//     - clipping_value: the value used for clipping.
+void CwiseClipping(float* vector, const int v_size, const float clipping_value);
+void CwiseClipping(int16_t* vector, const int v_size,
+                   const int16_t clipping_value);
+void CwiseClipping(int8_t* vector, const int v_size,
+                   const int8_t clipping_value);
+
+// Cwise product of two vectors.
+template <typename T>
+inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
+                                     const T* __restrict__ vector2, int v_size,
+                                     T* __restrict__ result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = *vector1++ * *vector2++;
+  }
+}
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that result array is initialized to valid values.
+template <typename T>
+inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
+                                               const T* __restrict__ vector2,
+                                               int v_size,
+                                               T* __restrict__ result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ += *vector1++ * *vector2++;
+  }
+}
+
+// Dot product of two vectors.
+float VectorVectorDotProduct(const float* vector1, const float* vector2,
+                             int v_size);
+
+// Dot product of two batch vectors of size n_batch * v_size:
+// vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
+//            x_2_1, x_2_2, ..., x_2_vsize,
+//            ...
+//            x_nbatch_1,..., x_nbatch_vsize]
+// vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
+//            y_2_1, y_2_2, ..., y_2_vsize,
+//            ...
+//            y_nbatch_1,..., y_nbatch_vsize]
+// Then result will be a vector of n_batch size starting from 'result':
+// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
+//  x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
+//  ...
+//  x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize]
+template <typename T>
+inline void BatchVectorBatchVectorDotProduct(const T* vector1, const T* vector2,
+                                             int v_size, int n_batch,
+                                             T* result) {
+  for (int b = 0; b < n_batch; b++) {
+    result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
+    vector1 += v_size;
+    vector2 += v_size;
+  }
+}
+
+// Same as above but input is 16bit and output is 32bit.
+void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
+                                      const int16_t* vector2, int v_size,
+                                      int n_batch, int32_t* result);
+
+// Cwise product of a vector and a batch-vector.
+template <typename T>
+inline void VectorBatchVectorCwiseProduct(const T* vector, int v_size,
+                                          const T* batch_vector, int n_batch,
+                                          T* result) {
+  for (int b = 0; b < n_batch; b++) {
+    VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
+    // Update the pointers.
+    result += v_size;
+    batch_vector += v_size;
+  }
+}
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
+// operation, the assumption here is that result array is initialized to valid
+// values.
+template <typename T>
+inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size,
+                                                    const T* batch_vector,
+                                                    int n_batch, T* result) {
+  for (int b = 0; b < n_batch; b++) {
+    VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
+    // Update the pointers.
+    result += v_size;
+    batch_vector += v_size;
+  }
+}
+
+// Same as above, but inputs are 16bit integer and output is 16bit integer.
+void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
+                                             const int16_t* batch_vector,
+                                             int n_batch, int32_t multiplier,
+                                             int shift, int16_t* result);
+
+// Add another vector for each batch in the batch vector.
+template <typename T>
+void VectorBatchVectorAdd(const T* vector, int v_size, int n_batch,
+                          T* batch_vector) {
+  for (int b = 0; b < n_batch; b++) {
+    for (int i = 0; i < v_size; ++i) {
+      batch_vector[i] += vector[i];
+    }
+    batch_vector += v_size;
+  }
+}
+
+// Batch vector initialization with another vector.
+template <typename T>
+void VectorBatchVectorAssign(const T* vector, int v_size, int n_batch,
+                             T* batch_vector) {
+  for (int b = 0; b < n_batch; b++) {
+    std::copy_n(vector, v_size, batch_vector + b * v_size);
+  }
+}
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void Sub1Vector(const float* vector, int v_size, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG) for int16 input.
+// "vector" has range [0, 32767] because it is the output of sigmoid function.
+void Sub1Vector(const int16_t* vector, int v_size, int16_t* result);
+
+// Multiply all elements of vector with a scalar.
+void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
+                          float* result);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void ReductionSumVector(const float* input_vector, float* output_vector,
+                        int output_size, int reduction_size);
+
+// Same as above but input/output is 32 bit integer.
+void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
+                        int output_size, int reduction_size);
+
+// Same as above but input is 8 bit integer.
+void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
+                        int output_size, int reduction_size);
+
+// Layer norm for each batch.
+void MeanStddevNormalization(const float* __restrict__ input_vector,
+                             float* __restrict__ output_vector, int v_size,
+                             int n_batch);
+
+// Saturate Add with rescale on both inputs.
+void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
+                          const int8_t* recurrent, int8_t recurrent_zp,
+                          int32_t input_effective_scale_a,
+                          int32_t input_effective_scale_b,
+                          int32_t recurrent_effective_scale_a,
+                          int32_t recurrent_effective_scale_b, int32_t n_batch,
+                          int32_t n_cell, int16_t* output);
+
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_COMMON_H_
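
For reference, this is how a client might call the relocated helpers once it depends only on the new header (a hedged sketch; the function QuantizeTwoBatches and its data are hypothetical, and the declared functions are defined in the tensor_utils implementation library, so this compiles against the header but must link against that library):

#include <cstdint>
#include <vector>

#include "tensorflow/lite/kernels/internal/tensor_utils_common.h"

void QuantizeTwoBatches() {
  // Two batches of three floats each.
  const std::vector<float> input = {0.1f, -0.5f, 0.25f,   // batch 0
                                    1.0f, -1.0f, 0.5f};   // batch 1
  const int n_batch = 2;
  const int n_data = 3;
  std::vector<int8_t> quantized(input.size());
  std::vector<float> scaling_factors(n_batch);
  std::vector<int32_t> zero_points(n_batch);
  // Asymmetric per-batch quantization: one (scale, zero_point) pair per batch.
  tflite::tensor_utils::BatchQuantizeFloats(
      input.data(), n_batch, n_data, quantized.data(), scaling_factors.data(),
      zero_points.data(), /*do_asymmetric=*/true);
}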