/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_

#include <assert.h>
#include <stdint.h>
#include <sys/types.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <memory>
#include <tuple>
#include <type_traits>

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"

#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
#include <Accelerate/Accelerate.h>
#endif

#include "third_party/eigen3/Eigen/Core"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "fixedpoint/fixedpoint.h"
#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/transpose_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"

#if __aarch64__ && __clang__
#define TFLITE_SOFTMAX_USE_UINT16_LUT
#endif

namespace tflite {
namespace optimized_ops {

// Unoptimized reference ops:
using reference_ops::ArgMax;
using reference_ops::ArgMinMax;
using reference_ops::Broadcast4DSlowGreater;
using reference_ops::Broadcast4DSlowGreaterEqual;
using reference_ops::Broadcast4DSlowGreaterEqualWithScaling;
using reference_ops::Broadcast4DSlowGreaterWithScaling;
using reference_ops::Broadcast4DSlowLess;
using reference_ops::Broadcast4DSlowLessEqual;
using reference_ops::Broadcast4DSlowLessEqualWithScaling;
using reference_ops::Broadcast4DSlowLessWithScaling;
using reference_ops::BroadcastAdd4DSlow;
using reference_ops::BroadcastMul4DSlow;
using reference_ops::BroadcastSubSlow;
using reference_ops::Concatenation;
using reference_ops::ConcatenationWithScaling;
using reference_ops::DepthConcatenation;
using reference_ops::Div;
using reference_ops::Elu;
using reference_ops::FakeQuant;
using reference_ops::Fill;
using reference_ops::Gather;
using reference_ops::Greater;
using reference_ops::GreaterEqual;
using reference_ops::GreaterEqualWithScaling;
using reference_ops::GreaterWithScaling;
using reference_ops::LeakyRelu;
using reference_ops::Less;
using reference_ops::LessEqual;
using reference_ops::LessEqualWithScaling;
using reference_ops::LessWithScaling;
using reference_ops::Mean;
using reference_ops::ProcessBroadcastShapes;
using reference_ops::RankOneSelect;
using reference_ops::Relu1;
using reference_ops::Relu6;
using reference_ops::ReluX;
using reference_ops::Round;
using reference_ops::Select;
using reference_ops::SpaceToBatchND;
using reference_ops::Split;
using reference_ops::StridedSlice;
using reference_ops::Sub16;

// TODO(b/80247582) Remove this constant.
// This will be phased out as the shifts are revised with more thought. Use of a
// constant enables us to track progress on this work.
//
// Used to convert from old-style shifts (right) to new-style (left).
static constexpr int kReverseShift = -1;
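// (For example, an old-style right-shift amount of 3 would be expressed in the
// new convention as kReverseShift * 3 = -3, i.e. a left shift by -3.)
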
// Make a local VectorMap typedef allowing us to map a float array
// as an Eigen vector expression. The std::conditional here is to
// construct the suitable Eigen type for the constness of the
// data. Indeed, for const data, we need to produce
// Eigen::Map<const Eigen::Matrix<float, ...>>
// and not the more straightforward
// Eigen::Map<Eigen::Matrix<const float, ...>>
template <typename Scalar>
using VectorMap = typename std::conditional<
    std::is_const<Scalar>::value,
    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
                                   Eigen::Dynamic, 1>>,
    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type;

template <typename Scalar>
VectorMap<Scalar> MapAsVector(Scalar* data, const RuntimeShape& shape) {
  const int size = shape.FlatSize();
  return VectorMap<Scalar>(data, size, 1);
}

// Make a local MatrixMap typedef allowing us to map a float array
// as an Eigen matrix expression. The same explanation as for VectorMap
// above also applies here.
template <typename Scalar>
using MatrixMap = typename std::conditional<
    std::is_const<Scalar>::value,
    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
                                   Eigen::Dynamic, Eigen::Dynamic>>,
    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;

template <typename Scalar>
MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar* data,
                                               const RuntimeShape& shape) {
  const int dims_count = shape.DimensionsCount();
  const int rows = shape.Dims(dims_count - 1);
  const int cols = FlatSizeSkipDim(shape, dims_count - 1);
  return MatrixMap<Scalar>(data, rows, cols);
}

template <typename Scalar>
MatrixMap<Scalar> MapAsMatrixWithFirstDimAsCols(Scalar* data,
                                                const RuntimeShape& shape) {
  const int cols = shape.Dims(0);
  const int rows = FlatSizeSkipDim(shape, 0);
  return MatrixMap<Scalar>(data, rows, cols);
}

template <typename Scalar>
using ArrayMap = typename std::conditional<
    std::is_const<Scalar>::value,
    Eigen::Map<const Eigen::Array<typename std::remove_const<Scalar>::type,
                                  Eigen::Dynamic, Eigen::Dynamic>>,
    Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;

template <typename Scalar>
ArrayMap<Scalar> MapAsArrayWithLastDimAsRows(Scalar* data,
                                             const RuntimeShape& shape) {
  const int dims_count = shape.DimensionsCount();
  const int rows = shape.Dims(dims_count - 1);
  const int cols = FlatSizeSkipDim(shape, dims_count - 1);
  return ArrayMap<Scalar>(data, rows, cols);
}

// Copied from tensorflow/core/framework/tensor_types.h
template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
struct TTypes {
  // Rank-1 tensor (vector) of scalar type T.
  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
                           Eigen::Aligned>
      Flat;
  typedef Eigen::TensorMap<
      Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>>
      UnalignedConstMatrix;
};

// TODO(b/62193649): this function is only needed as long
// as we have the --variable_batch hack.
template <typename Scalar>
MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
                                                   const RuntimeShape& shape,
                                                   int rows) {
  const int flatsize = shape.FlatSize();
  TFLITE_DCHECK_EQ(flatsize % rows, 0);
  const int cols = flatsize / rows;
  return MatrixMap<Scalar>(data, rows, cols);
}

// TODO(renjieliu): Refactor this to merge with the other
// MultiplyByQuantizedMultiplier.
#ifdef USE_NEON
|
|
inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(
|
|
int32x4x4_t input_val, int32 quantized_multiplier, int32 shift) {
|
|
const int left_shift = std::max(shift, 0);
|
|
const int right_shift = std::min(shift, 0);
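  // For example, shift = -3 yields left_shift = 0 and right_shift = -3;
  // vrshlq_s32 with a negative shift amount performs a rounding right shift.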
|
|
int32x4x4_t result;
|
|
|
|
int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
|
|
int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
|
|
int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
|
|
|
|
result.val[0] =
|
|
vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup),
|
|
multiplier_dup),
|
|
right_shift_dup);
|
|
|
|
result.val[1] =
|
|
vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup),
|
|
multiplier_dup),
|
|
right_shift_dup);
|
|
|
|
result.val[2] =
|
|
vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup),
|
|
multiplier_dup),
|
|
right_shift_dup);
|
|
|
|
result.val[3] =
|
|
vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup),
|
|
multiplier_dup),
|
|
right_shift_dup);
|
|
|
|
return result;
|
|
}
|
|
#endif
|
|
|
|
template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
|
|
inline void BinaryBroadcastFiveFold(const ArithmeticParams& unswitched_params,
|
|
const RuntimeShape& unswitched_input1_shape,
|
|
const T* unswitched_input1_data,
|
|
const RuntimeShape& unswitched_input2_shape,
|
|
const T* unswitched_input2_data,
|
|
const RuntimeShape& output_shape,
|
|
T* output_data, ElementwiseF elementwise_f,
|
|
ScalarBroadcastF scalar_broadcast_f) {
|
|
ArithmeticParams switched_params = unswitched_params;
|
|
switched_params.input1_offset = unswitched_params.input2_offset;
|
|
switched_params.input1_multiplier = unswitched_params.input2_multiplier;
|
|
switched_params.input1_shift = unswitched_params.input2_shift;
|
|
switched_params.input2_offset = unswitched_params.input1_offset;
|
|
switched_params.input2_multiplier = unswitched_params.input1_multiplier;
|
|
switched_params.input2_shift = unswitched_params.input1_shift;
|
|
|
|
const bool use_unswitched =
|
|
unswitched_params.broadcast_category ==
|
|
tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
|
|
|
|
const ArithmeticParams& params =
|
|
use_unswitched ? unswitched_params : switched_params;
|
|
const T* input1_data =
|
|
use_unswitched ? unswitched_input1_data : unswitched_input2_data;
|
|
const T* input2_data =
|
|
use_unswitched ? unswitched_input2_data : unswitched_input1_data;
|
|
|
|
// Fivefold nested loops. The second input resets its position for each
|
|
// iteration of the second loop. The first input resets its position at the
|
|
// beginning of the fourth loop. The innermost loop is an elementwise add of
|
|
// sections of the arrays.
|
|
T* output_data_ptr = output_data;
|
|
const T* input1_data_ptr = input1_data;
|
|
const T* input2_data_reset = input2_data;
|
|
// In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
|
|
// between input shapes. y3 for input 1 is always broadcast, and so the
|
|
// dimension there is 1, whereas optionally y1 might be broadcast for
|
|
// input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4,
|
|
// input2.shape.FlatSize = y0 * y2 * y3 * y4.
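  // For example, with y0 = 2, y1 = 3, y2 = 4, y3 = 5, y4 = 8, input1 holds
  // 2*3*4*8 = 192 elements, input2 holds 2*4*5*8 = 320 elements, and the
  // output holds y0*y1*y2*y3*y4 = 960 elements.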
|
|
int y0 = params.broadcast_shape[0];
|
|
int y1 = params.broadcast_shape[1];
|
|
int y2 = params.broadcast_shape[2];
|
|
int y3 = params.broadcast_shape[3];
|
|
int y4 = params.broadcast_shape[4];
|
|
if (y4 > 1) {
|
|
// General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
|
|
// dimension.
|
|
for (int i0 = 0; i0 < y0; ++i0) {
|
|
const T* input2_data_ptr = nullptr;
|
|
for (int i1 = 0; i1 < y1; ++i1) {
|
|
input2_data_ptr = input2_data_reset;
|
|
for (int i2 = 0; i2 < y2; ++i2) {
|
|
for (int i3 = 0; i3 < y3; ++i3) {
|
|
elementwise_f(y4, params, input1_data_ptr, input2_data_ptr,
|
|
output_data_ptr);
|
|
input2_data_ptr += y4;
|
|
output_data_ptr += y4;
|
|
}
|
|
// We have broadcast y4 of input1 data y3 times, and now move on.
|
|
input1_data_ptr += y4;
|
|
}
|
|
}
|
|
// We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
|
|
input2_data_reset = input2_data_ptr;
|
|
}
|
|
} else {
|
|
// Special case of y4 == 1, in which the innermost loop is a single
|
|
// element and can be combined with the next (y3) as an inner broadcast.
|
|
//
|
|
// Note that this handles the case of pure scalar broadcast when
|
|
// y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
|
|
// broadcast with batch (as y2 > 1).
|
|
//
|
|
// NOTE The process is the same as the above general case except
|
|
// simplified for y4 == 1 and the loop over y3 is contained within the
|
|
// AddScalarBroadcast function.
|
|
for (int i0 = 0; i0 < y0; ++i0) {
|
|
const T* input2_data_ptr = nullptr;
|
|
for (int i1 = 0; i1 < y1; ++i1) {
|
|
input2_data_ptr = input2_data_reset;
|
|
for (int i2 = 0; i2 < y2; ++i2) {
|
|
scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr,
|
|
output_data_ptr);
|
|
input2_data_ptr += y3;
|
|
output_data_ptr += y3;
|
|
input1_data_ptr += 1;
|
|
}
|
|
}
|
|
input2_data_reset = input2_data_ptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
|
|
|
|
// Looks up each element of <indices> in <table>, returns them in a vector.
|
|
inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4],
|
|
uint8x16_t indices) {
|
|
// Look up in 1st quarter of the table: top 2 bits of indices == 00
|
|
uint8x16_t output1 = vqtbl4q_u8(table[0], indices);
|
|
// Look up in 2nd quarter of the table: top 2 bits of indices == 01
|
|
uint8x16_t output2 =
|
|
vqtbl4q_u8(table[1], veorq_u8(indices, vdupq_n_u8(0x40)));
|
|
// Look up in 3rd quarter of the table: top 2 bits of indices == 10
|
|
uint8x16_t output3 =
|
|
vqtbl4q_u8(table[2], veorq_u8(indices, vdupq_n_u8(0x80)));
|
|
// Look up in 4th quarter of the table: top 2 bits of indices == 11
|
|
uint8x16_t output4 =
|
|
vqtbl4q_u8(table[3], veorq_u8(indices, vdupq_n_u8(0xc0)));
|
|
|
|
// Combine result of the 4 lookups.
|
|
return vorrq_u8(vorrq_u8(output1, output2), vorrq_u8(output3, output4));
|
|
}
|
|
|
|
#endif
|
|
|
|
inline void AddBiasAndEvalActivationFunction(float output_activation_min,
|
|
float output_activation_max,
|
|
const RuntimeShape& bias_shape,
|
|
const float* bias_data,
|
|
const RuntimeShape& array_shape,
|
|
float* array_data) {
|
|
BiasAndClamp(output_activation_min, output_activation_max,
|
|
bias_shape.FlatSize(), bias_data, array_shape.FlatSize(),
|
|
array_data);
|
|
}
|
|
|
|
inline void FullyConnected(
|
|
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
|
const float* input_data, const RuntimeShape& weights_shape,
|
|
const float* weights_data, const RuntimeShape& bias_shape,
|
|
const float* optional_bias_data, const RuntimeShape& output_shape,
|
|
float* output_data, CpuBackendContext* cpu_backend_context) {
|
|
ruy::profiler::ScopeLabel label("FullyConnected");
|
|
const int dims_count = weights_shape.DimensionsCount();
|
|
const int input_rows = weights_shape.Dims(dims_count - 1);
|
|
cpu_backend_gemm::MatrixParams<float> rhs_params;
|
|
rhs_params.order = cpu_backend_gemm::Order::kColMajor;
|
|
rhs_params.rows = input_rows;
|
|
rhs_params.cols = input_shape.FlatSize() / input_rows;
|
|
rhs_params.cache_policy =
|
|
cpu_backend_gemm::DefaultCachePolicy(params.rhs_cacheable);
|
|
TFLITE_DCHECK_EQ(input_shape.FlatSize(), rhs_params.rows * rhs_params.cols);
|
|
cpu_backend_gemm::MatrixParams<float> lhs_params;
|
|
lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
|
|
lhs_params.cols = weights_shape.Dims(dims_count - 1);
|
|
lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1);
|
|
lhs_params.cache_policy =
|
|
cpu_backend_gemm::DefaultCachePolicy(params.lhs_cacheable);
|
|
cpu_backend_gemm::MatrixParams<float> dst_params;
|
|
dst_params.order = cpu_backend_gemm::Order::kColMajor;
|
|
dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1);
|
|
dst_params.cols =
|
|
FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1);
|
|
cpu_backend_gemm::GemmParams<float, float> gemm_params;
|
|
gemm_params.bias = optional_bias_data;
|
|
gemm_params.clamp_min = params.float_activation_min;
|
|
gemm_params.clamp_max = params.float_activation_max;
|
|
cpu_backend_gemm::Gemm(lhs_params, weights_data, rhs_params, input_data,
|
|
dst_params, output_data, gemm_params,
|
|
cpu_backend_context);
|
|
}
|
|
|
|
inline void FullyConnected(
|
|
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
|
const uint8* input_data, const RuntimeShape& filter_shape,
|
|
const uint8* filter_data, const RuntimeShape& bias_shape,
|
|
const int32* bias_data, const RuntimeShape& output_shape,
|
|
uint8* output_data, CpuBackendContext* cpu_backend_context) {
|
|
ruy::profiler::ScopeLabel label("FullyConnected/8bit");
|
|
const int32 input_offset = params.input_offset;
|
|
const int32 filter_offset = params.weights_offset;
|
|
const int32 output_offset = params.output_offset;
|
|
const int32 output_multiplier = params.output_multiplier;
|
|
const int output_shift = params.output_shift;
|
|
const int32 output_activation_min = params.quantized_activation_min;
|
|
const int32 output_activation_max = params.quantized_activation_max;
|
|
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
|
|
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
|
// TODO(benoitjacob): This really should be:
|
|
// const int batches = ArraySize(output_dims, 1);
|
|
// but the current --variable_batch hack consists in overwriting the 3rd
|
|
// dimension with the runtime batch size, as we don't keep track for each
|
|
// array of which dimension is the batch dimension in it.
|
|
const int output_dim_count = output_shape.DimensionsCount();
|
|
const int filter_dim_count = filter_shape.DimensionsCount();
|
|
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
|
|
const int filter_rows = filter_shape.Dims(filter_dim_count - 2);
|
|
const int filter_cols = filter_shape.Dims(filter_dim_count - 1);
|
|
TFLITE_DCHECK_EQ(filter_shape.FlatSize(), filter_rows * filter_cols);
|
|
const int output_rows = output_shape.Dims(output_dim_count - 1);
|
|
TFLITE_DCHECK_EQ(output_rows, filter_rows);
|
|
if (bias_data) {
|
|
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
|
|
}
|
|
|
|
cpu_backend_gemm::MatrixParams<uint8> lhs_params;
|
|
lhs_params.rows = filter_rows;
|
|
lhs_params.cols = filter_cols;
|
|
lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
|
|
lhs_params.zero_point = -filter_offset;
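  // Note: the offsets stored in params are the negated zero points, so
  // negating them here recovers the zero points that cpu_backend_gemm expects.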
|
|
lhs_params.cache_policy =
|
|
cpu_backend_gemm::DefaultCachePolicy(params.lhs_cacheable);
|
|
cpu_backend_gemm::MatrixParams<uint8> rhs_params;
|
|
rhs_params.rows = filter_cols;
|
|
rhs_params.cols = batches;
|
|
rhs_params.order = cpu_backend_gemm::Order::kColMajor;
|
|
rhs_params.zero_point = -input_offset;
|
|
rhs_params.cache_policy =
|
|
cpu_backend_gemm::DefaultCachePolicy(params.rhs_cacheable);
|
|
cpu_backend_gemm::MatrixParams<uint8> dst_params;
|
|
dst_params.rows = filter_rows;
|
|
dst_params.cols = batches;
|
|
dst_params.order = cpu_backend_gemm::Order::kColMajor;
|
|
dst_params.zero_point = output_offset;
|
|
cpu_backend_gemm::GemmParams<int32, uint8> gemm_params;
|
|
gemm_params.bias = bias_data;
|
|
gemm_params.clamp_min = output_activation_min;
|
|
gemm_params.clamp_max = output_activation_max;
|
|
gemm_params.multiplier_fixedpoint = output_multiplier;
|
|
gemm_params.multiplier_exponent = output_shift;
|
|
cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, input_data,
|
|
dst_params, output_data, gemm_params,
|
|
cpu_backend_context);
|
|
}
|
|
|
|
inline void FullyConnected(
|
|
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
|
const uint8* input_data, const RuntimeShape& filter_shape,
|
|
const uint8* filter_data, const RuntimeShape& bias_shape,
|
|
const int32* bias_data_int32, const RuntimeShape& output_shape,
|
|
int16* output_data, CpuBackendContext* cpu_backend_context) {
|
|
ruy::profiler::ScopeLabel label("FullyConnected/Uint8Int16");
|
|
const int32 input_offset = params.input_offset;
|
|
const int32 filter_offset = params.weights_offset;
|
|
const int32 output_offset = params.output_offset;
|
|
const int32 output_multiplier = params.output_multiplier;
|
|
const int output_shift = params.output_shift;
|
|
const int32 output_activation_min = params.quantized_activation_min;
|
|
const int32 output_activation_max = params.quantized_activation_max;
|
|
TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
|
|
TFLITE_DCHECK_EQ(output_offset, 0);
|
|
TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
|
|
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
|
|
|
// TODO(benoitjacob): This really should be:
|
|
// const int batches = ArraySize(output_dims, 1);
|
|
// but the current --variable_batch hack consists in overwriting the 3rd
|
|
// dimension with the runtime batch size, as we don't keep track for each
|
|
// array of which dimension is the batch dimension in it.
|
|
const int output_dim_count = output_shape.DimensionsCount();
|
|
const int filter_dim_count = filter_shape.DimensionsCount();
|
|
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
|
|
const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
|
|
output_shape, output_dim_count - 1);
|
|
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
|
|
|
|
cpu_backend_gemm::MatrixParams<uint8> lhs_params;
|
|
lhs_params.rows = output_depth;
|
|
lhs_params.cols = accum_depth;
|
|
lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
|
|
lhs_params.zero_point = -filter_offset;
|
|
lhs_params.cache_policy =
|
|
cpu_backend_gemm::DefaultCachePolicy(params.lhs_cacheable);
|
|
cpu_backend_gemm::MatrixParams<uint8> rhs_params;
|
|
rhs_params.rows = accum_depth;
|
|
rhs_params.cols = batches;
|
|
rhs_params.order = cpu_backend_gemm::Order::kColMajor;
|
|
rhs_params.zero_point = -input_offset;
|
|
rhs_params.cache_policy =
|
|
cpu_backend_gemm::DefaultCachePolicy(params.rhs_cacheable);
|
|
cpu_backend_gemm::MatrixParams<int16> dst_params;
|
|
dst_params.rows = output_depth;
|
|
dst_params.cols = batches;
|
|
dst_params.order = cpu_backend_gemm::Order::kColMajor;
|
|
dst_params.zero_point = 0;
|
|
cpu_backend_gemm::GemmParams<int32, int16> gemm_params;
|
|
gemm_params.bias = bias_data_int32;
|
|
gemm_params.clamp_min = output_activation_min;
|
|
gemm_params.clamp_max = output_activation_max;
|
|
gemm_params.multiplier_fixedpoint = output_multiplier;
|
|
gemm_params.multiplier_exponent = output_shift;
|
|
cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, input_data,
|
|
dst_params, output_data, gemm_params,
|
|
cpu_backend_context);
|
|
}
|
|
|
|
// Internal function doing the actual arithmetic work for
|
|
// ShuffledFullyConnected.
|
|
// May be called either directly by it (single-threaded case) or may be used
|
|
// as the 'task' for worker threads to run (multi-threaded case, see
|
|
// ShuffledFullyConnectedWorkerTask below).
|
|
inline void ShuffledFullyConnectedWorkerImpl(
|
|
const uint8* shuffled_input_workspace_data,
|
|
const int8* shuffled_weights_data, int batches, int output_depth,
|
|
int output_stride, int accum_depth, const int32* bias_data,
|
|
int32 output_multiplier, int output_shift, int16* output_data) {
|
|
#if defined USE_NEON
|
|
const int8* shuffled_weights_ptr = shuffled_weights_data;
|
|
if (batches == 1) {
|
|
const int right_shift = output_shift > 0 ? 0 : -output_shift;
|
|
const int left_shift = output_shift > 0 ? output_shift : 0;
|
|
for (int c = 0; c < output_depth; c += 4) {
|
|
// Accumulation loop.
|
|
int32x4_t row_accum0 = vdupq_n_s32(0);
|
|
int32x4_t row_accum1 = vdupq_n_s32(0);
|
|
int32x4_t row_accum2 = vdupq_n_s32(0);
|
|
int32x4_t row_accum3 = vdupq_n_s32(0);
|
|
for (int d = 0; d < accum_depth; d += 16) {
|
|
int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0);
|
|
int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16);
|
|
int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32);
|
|
int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48);
|
|
shuffled_weights_ptr += 64;
|
|
int8x16_t input =
|
|
vreinterpretq_s8_u8(vld1q_u8(shuffled_input_workspace_data + d));
|
|
int16x8_t local_accum0 =
|
|
vmull_s8(vget_low_s8(weights0), vget_low_s8(input));
|
|
int16x8_t local_accum1 =
|
|
vmull_s8(vget_low_s8(weights1), vget_low_s8(input));
|
|
int16x8_t local_accum2 =
|
|
vmull_s8(vget_low_s8(weights2), vget_low_s8(input));
|
|
int16x8_t local_accum3 =
|
|
vmull_s8(vget_low_s8(weights3), vget_low_s8(input));
|
|
local_accum0 =
|
|
vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input));
|
|
local_accum1 =
|
|
vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input));
|
|
local_accum2 =
|
|
vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input));
|
|
local_accum3 =
|
|
vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input));
|
|
row_accum0 = vpadalq_s16(row_accum0, local_accum0);
|
|
row_accum1 = vpadalq_s16(row_accum1, local_accum1);
|
|
row_accum2 = vpadalq_s16(row_accum2, local_accum2);
|
|
row_accum3 = vpadalq_s16(row_accum3, local_accum3);
|
|
}
|
|
// Horizontally reduce accumulators
|
|
int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
|
|
pairwise_reduced_acc_2, pairwise_reduced_acc_3;
|
|
pairwise_reduced_acc_0 =
|
|
vpadd_s32(vget_low_s32(row_accum0), vget_high_s32(row_accum0));
|
|
pairwise_reduced_acc_1 =
|
|
vpadd_s32(vget_low_s32(row_accum1), vget_high_s32(row_accum1));
|
|
pairwise_reduced_acc_2 =
|
|
vpadd_s32(vget_low_s32(row_accum2), vget_high_s32(row_accum2));
|
|
pairwise_reduced_acc_3 =
|
|
vpadd_s32(vget_low_s32(row_accum3), vget_high_s32(row_accum3));
|
|
const int32x2_t reduced_lo =
|
|
vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
|
|
const int32x2_t reduced_hi =
|
|
vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
|
|
int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
|
|
// Add bias values.
|
|
int32x4_t bias_vec = vld1q_s32(bias_data + c);
|
|
reduced = vaddq_s32(reduced, bias_vec);
|
|
reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
|
|
// Multiply by the fixed-point multiplier.
|
|
reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
|
|
// Rounding-shift-right.
|
|
using gemmlowp::RoundingDivideByPOT;
|
|
reduced = RoundingDivideByPOT(reduced, right_shift);
|
|
// Narrow values down to 16 bit signed.
|
|
const int16x4_t res16 = vqmovn_s32(reduced);
|
|
vst1_s16(output_data + c, res16);
|
|
}
|
|
} else if (batches == 4) {
|
|
const int right_shift = output_shift > 0 ? 0 : -output_shift;
|
|
const int left_shift = output_shift > 0 ? output_shift : 0;
|
|
for (int c = 0; c < output_depth; c += 4) {
|
|
const int8* shuffled_input_ptr =
|
|
reinterpret_cast<const int8*>(shuffled_input_workspace_data);
|
|
// Accumulation loop.
|
|
int32x4_t row_accum00 = vdupq_n_s32(0);
|
|
int32x4_t row_accum10 = vdupq_n_s32(0);
|
|
int32x4_t row_accum20 = vdupq_n_s32(0);
|
|
int32x4_t row_accum30 = vdupq_n_s32(0);
|
|
int32x4_t row_accum01 = vdupq_n_s32(0);
|
|
int32x4_t row_accum11 = vdupq_n_s32(0);
|
|
int32x4_t row_accum21 = vdupq_n_s32(0);
|
|
int32x4_t row_accum31 = vdupq_n_s32(0);
|
|
int32x4_t row_accum02 = vdupq_n_s32(0);
|
|
int32x4_t row_accum12 = vdupq_n_s32(0);
|
|
int32x4_t row_accum22 = vdupq_n_s32(0);
|
|
int32x4_t row_accum32 = vdupq_n_s32(0);
|
|
int32x4_t row_accum03 = vdupq_n_s32(0);
|
|
int32x4_t row_accum13 = vdupq_n_s32(0);
|
|
int32x4_t row_accum23 = vdupq_n_s32(0);
|
|
int32x4_t row_accum33 = vdupq_n_s32(0);
|
|
for (int d = 0; d < accum_depth; d += 16) {
|
|
int8x16_t weights0 = vld1q_s8(shuffled_weights_ptr + 0);
|
|
int8x16_t weights1 = vld1q_s8(shuffled_weights_ptr + 16);
|
|
int8x16_t weights2 = vld1q_s8(shuffled_weights_ptr + 32);
|
|
int8x16_t weights3 = vld1q_s8(shuffled_weights_ptr + 48);
|
|
shuffled_weights_ptr += 64;
|
|
int8x16_t input0 = vld1q_s8(shuffled_input_ptr + 0);
|
|
int8x16_t input1 = vld1q_s8(shuffled_input_ptr + 16);
|
|
int8x16_t input2 = vld1q_s8(shuffled_input_ptr + 32);
|
|
int8x16_t input3 = vld1q_s8(shuffled_input_ptr + 48);
|
|
shuffled_input_ptr += 64;
|
|
int16x8_t local_accum0, local_accum1, local_accum2, local_accum3;
|
|
#define TFLITE_SHUFFLED_FC_ACCUM(B) \
|
|
local_accum0 = vmull_s8(vget_low_s8(weights0), vget_low_s8(input##B)); \
|
|
local_accum1 = vmull_s8(vget_low_s8(weights1), vget_low_s8(input##B)); \
|
|
local_accum2 = vmull_s8(vget_low_s8(weights2), vget_low_s8(input##B)); \
|
|
local_accum3 = vmull_s8(vget_low_s8(weights3), vget_low_s8(input##B)); \
|
|
local_accum0 = \
|
|
vmlal_s8(local_accum0, vget_high_s8(weights0), vget_high_s8(input##B)); \
|
|
local_accum1 = \
|
|
vmlal_s8(local_accum1, vget_high_s8(weights1), vget_high_s8(input##B)); \
|
|
local_accum2 = \
|
|
vmlal_s8(local_accum2, vget_high_s8(weights2), vget_high_s8(input##B)); \
|
|
local_accum3 = \
|
|
vmlal_s8(local_accum3, vget_high_s8(weights3), vget_high_s8(input##B)); \
|
|
row_accum0##B = vpadalq_s16(row_accum0##B, local_accum0); \
|
|
row_accum1##B = vpadalq_s16(row_accum1##B, local_accum1); \
|
|
row_accum2##B = vpadalq_s16(row_accum2##B, local_accum2); \
|
|
row_accum3##B = vpadalq_s16(row_accum3##B, local_accum3);
|
|
|
|
TFLITE_SHUFFLED_FC_ACCUM(0)
|
|
TFLITE_SHUFFLED_FC_ACCUM(1)
|
|
TFLITE_SHUFFLED_FC_ACCUM(2)
|
|
TFLITE_SHUFFLED_FC_ACCUM(3)
|
|
|
|
#undef TFLITE_SHUFFLED_FC_ACCUM
|
|
}
|
|
// Horizontally reduce accumulators
|
|
|
|
#define TFLITE_SHUFFLED_FC_STORE(B) \
|
|
{ \
|
|
int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1, \
|
|
pairwise_reduced_acc_2, pairwise_reduced_acc_3; \
|
|
pairwise_reduced_acc_0 = \
|
|
vpadd_s32(vget_low_s32(row_accum0##B), vget_high_s32(row_accum0##B)); \
|
|
pairwise_reduced_acc_1 = \
|
|
vpadd_s32(vget_low_s32(row_accum1##B), vget_high_s32(row_accum1##B)); \
|
|
pairwise_reduced_acc_2 = \
|
|
vpadd_s32(vget_low_s32(row_accum2##B), vget_high_s32(row_accum2##B)); \
|
|
pairwise_reduced_acc_3 = \
|
|
vpadd_s32(vget_low_s32(row_accum3##B), vget_high_s32(row_accum3##B)); \
|
|
const int32x2_t reduced_lo = \
|
|
vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1); \
|
|
const int32x2_t reduced_hi = \
|
|
vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3); \
|
|
int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); \
|
|
int32x4_t bias_vec = vld1q_s32(bias_data + c); \
|
|
reduced = vaddq_s32(reduced, bias_vec); \
|
|
reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift)); \
|
|
reduced = vqrdmulhq_n_s32(reduced, output_multiplier); \
|
|
using gemmlowp::RoundingDivideByPOT; \
|
|
reduced = RoundingDivideByPOT(reduced, right_shift); \
|
|
const int16x4_t res16 = vqmovn_s32(reduced); \
|
|
vst1_s16(output_data + c + B * output_stride, res16); \
|
|
}
|
|
|
|
TFLITE_SHUFFLED_FC_STORE(0);
|
|
TFLITE_SHUFFLED_FC_STORE(1);
|
|
TFLITE_SHUFFLED_FC_STORE(2);
|
|
TFLITE_SHUFFLED_FC_STORE(3);
|
|
|
|
#undef TFLITE_SHUFFLED_FC_STORE
|
|
}
|
|
} else {
|
|
TFLITE_DCHECK(false);
|
|
return;
|
|
}
|
|
#else
|
|
if (batches == 1) {
|
|
int16* output_ptr = output_data;
|
|
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
|
|
// so that just reinterpreting them as int8 values is equivalent to
|
|
// subtracting 128 from them, thus implementing for free the subtraction of
|
|
// the zero_point value 128.
|
|
const int8* shuffled_weights_ptr =
|
|
reinterpret_cast<const int8*>(shuffled_weights_data);
|
|
// Likewise, we preshuffled and pre-xored the input data above.
|
|
const int8* shuffled_input_data =
|
|
reinterpret_cast<const int8*>(shuffled_input_workspace_data);
|
|
for (int c = 0; c < output_depth; c += 4) {
|
|
// Internal accumulation.
|
|
      // Accumulators start at zero; the bias is added after the loop below.
|
|
int32 accum[4] = {0};
|
|
// Accumulation loop.
|
|
for (int d = 0; d < accum_depth; d += 16) {
|
|
for (int i = 0; i < 4; i++) {
|
|
for (int j = 0; j < 16; j++) {
|
|
int8 input_val = shuffled_input_data[d + j];
|
|
int8 weights_val = *shuffled_weights_ptr++;
|
|
accum[i] += weights_val * input_val;
|
|
}
|
|
}
|
|
}
|
|
for (int i = 0; i < 4; i++) {
|
|
// Add bias value
|
|
int acc = accum[i] + bias_data[c + i];
|
|
// Down-scale the final int32 accumulator to the scale used by our
|
|
// (16-bit, typically 3 integer bits) fixed-point format. The quantized
|
|
// multiplier and shift here have been pre-computed offline
|
|
// (e.g. by toco).
|
|
acc =
|
|
MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
|
|
// Saturate, cast to int16, and store to output array.
|
|
acc = std::max(acc, -32768);
|
|
acc = std::min(acc, 32767);
|
|
output_ptr[c + i] = acc;
|
|
}
|
|
}
|
|
} else if (batches == 4) {
|
|
int16* output_ptr = output_data;
|
|
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
|
|
// so that just reinterpreting them as int8 values is equivalent to
|
|
// subtracting 128 from them, thus implementing for free the subtraction of
|
|
// the zero_point value 128.
|
|
const int8* shuffled_weights_ptr =
|
|
reinterpret_cast<const int8*>(shuffled_weights_data);
|
|
// Likewise, we preshuffled and pre-xored the input data above.
|
|
const int8* shuffled_input_data =
|
|
reinterpret_cast<const int8*>(shuffled_input_workspace_data);
|
|
for (int c = 0; c < output_depth; c += 4) {
|
|
const int8* shuffled_input_ptr = shuffled_input_data;
|
|
// Accumulation loop.
|
|
// Internal accumulation.
|
|
      // Accumulators start at zero; the bias is added after the loop below.
|
|
int32 accum[4][4];
|
|
for (int i = 0; i < 4; i++) {
|
|
for (int b = 0; b < 4; b++) {
|
|
accum[i][b] = 0;
|
|
}
|
|
}
|
|
for (int d = 0; d < accum_depth; d += 16) {
|
|
for (int i = 0; i < 4; i++) {
|
|
for (int b = 0; b < 4; b++) {
|
|
for (int j = 0; j < 16; j++) {
|
|
int8 input_val = shuffled_input_ptr[16 * b + j];
|
|
int8 weights_val = shuffled_weights_ptr[16 * i + j];
|
|
accum[i][b] += weights_val * input_val;
|
|
}
|
|
}
|
|
}
|
|
shuffled_input_ptr += 64;
|
|
shuffled_weights_ptr += 64;
|
|
}
|
|
for (int i = 0; i < 4; i++) {
|
|
for (int b = 0; b < 4; b++) {
|
|
// Add bias value
|
|
int acc = accum[i][b] + bias_data[c + i];
|
|
// Down-scale the final int32 accumulator to the scale used by our
|
|
// (16-bit, typically 3 integer bits) fixed-point format. The
|
|
// quantized multiplier and shift here have been pre-computed offline
|
|
// (e.g. by toco).
|
|
acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
|
|
output_shift);
|
|
// Saturate, cast to int16, and store to output array.
|
|
acc = std::max(acc, -32768);
|
|
acc = std::min(acc, 32767);
|
|
output_ptr[b * output_stride + c + i] = acc;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
TFLITE_DCHECK(false);
|
|
return;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Wraps ShuffledFullyConnectedWorkerImpl into a Task class
|
|
// to allow using gemmlowp's threadpool.
|
|
struct ShuffledFullyConnectedWorkerTask : cpu_backend_threadpool::Task {
|
|
ShuffledFullyConnectedWorkerTask(const uint8* input_data,
|
|
const int8* shuffled_weights_data,
|
|
int batches, int output_depth,
|
|
int output_stride, int accum_depth,
|
|
const int32* bias_data,
|
|
int32 output_multiplier, int output_shift,
|
|
int16* output_data)
|
|
: input_data_(input_data),
|
|
shuffled_weights_data_(shuffled_weights_data),
|
|
batches_(batches),
|
|
output_depth_(output_depth),
|
|
output_stride_(output_stride),
|
|
accum_depth_(accum_depth),
|
|
bias_data_(bias_data),
|
|
output_multiplier_(output_multiplier),
|
|
output_shift_(output_shift),
|
|
output_data_(output_data) {}
|
|
|
|
void Run() override {
|
|
ShuffledFullyConnectedWorkerImpl(
|
|
input_data_, shuffled_weights_data_, batches_, output_depth_,
|
|
output_stride_, accum_depth_, bias_data_, output_multiplier_,
|
|
output_shift_, output_data_);
|
|
}
|
|
|
|
const uint8* input_data_;
|
|
const int8* shuffled_weights_data_;
|
|
int batches_;
|
|
int output_depth_;
|
|
int output_stride_;
|
|
int accum_depth_;
|
|
const int32* bias_data_;
|
|
int32 output_multiplier_;
|
|
int output_shift_;
|
|
int16* output_data_;
|
|
};
|
|
|
|
inline void ShuffledFullyConnected(
|
|
const FullyConnectedParams& params, const RuntimeShape& input_shape,
|
|
const uint8* input_data, const RuntimeShape& weights_shape,
|
|
const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
|
|
const int32* bias_data, const RuntimeShape& output_shape,
|
|
int16* output_data, uint8* shuffled_input_workspace_data,
|
|
CpuBackendContext* cpu_backend_context) {
|
|
ruy::profiler::ScopeLabel label("ShuffledFullyConnected/8bit");
|
|
const int32 output_multiplier = params.output_multiplier;
|
|
const int output_shift = params.output_shift;
|
|
const int32 output_activation_min = params.quantized_activation_min;
|
|
const int32 output_activation_max = params.quantized_activation_max;
|
|
TFLITE_DCHECK_EQ(output_activation_min, -32768);
|
|
TFLITE_DCHECK_EQ(output_activation_max, 32767);
|
|
TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
|
|
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
|
|
TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
|
|
// TODO(benoitjacob): This really should be:
|
|
// const int batches = ArraySize(output_dims, 1);
|
|
// but the current --variable_batch hack consists in overwriting the 3rd
|
|
// dimension with the runtime batch size, as we don't keep track for each
|
|
// array of which dimension is the batch dimension in it.
|
|
const int output_dim_count = output_shape.DimensionsCount();
|
|
const int weights_dim_count = weights_shape.DimensionsCount();
|
|
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
|
|
const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
|
|
output_shape, output_dim_count - 1);
|
|
const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
|
|
TFLITE_DCHECK((accum_depth % 16) == 0);
|
|
TFLITE_DCHECK((output_depth % 4) == 0);
|
|
// Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
|
|
// so that just reinterpreting them as int8 values is equivalent to
|
|
// subtracting 128 from them, thus implementing for free the subtraction of
|
|
// the zero_point value 128.
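  // For example, the uint8 value 200 (0xC8) becomes 0x48 after the xor, which
  // reads as +72 when reinterpreted as int8 -- exactly 200 - 128.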
|
|
const int8* int8_shuffled_weights_data =
|
|
reinterpret_cast<const int8*>(shuffled_weights_data);
|
|
|
|
// Shuffling and xoring of input activations into the workspace buffer
|
|
if (batches == 1) {
|
|
#ifdef USE_NEON
|
|
const uint8x16_t signbit = vdupq_n_u8(0x80);
|
|
for (int i = 0; i < accum_depth; i += 16) {
|
|
uint8x16_t val = vld1q_u8(input_data + i);
|
|
val = veorq_u8(val, signbit);
|
|
vst1q_u8(shuffled_input_workspace_data + i, val);
|
|
}
|
|
#else
|
|
for (int i = 0; i < accum_depth; i++) {
|
|
shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
|
|
}
|
|
#endif
|
|
} else if (batches == 4) {
|
|
uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
|
|
int c = 0;
|
|
#ifdef USE_NEON
|
|
const uint8x16_t signbit = vdupq_n_u8(0x80);
|
|
for (c = 0; c < accum_depth; c += 16) {
|
|
const uint8* src_data_ptr = input_data + c;
|
|
uint8x16_t val0 = vld1q_u8(src_data_ptr + 0 * accum_depth);
|
|
uint8x16_t val1 = vld1q_u8(src_data_ptr + 1 * accum_depth);
|
|
uint8x16_t val2 = vld1q_u8(src_data_ptr + 2 * accum_depth);
|
|
uint8x16_t val3 = vld1q_u8(src_data_ptr + 3 * accum_depth);
|
|
val0 = veorq_u8(val0, signbit);
|
|
val1 = veorq_u8(val1, signbit);
|
|
val2 = veorq_u8(val2, signbit);
|
|
val3 = veorq_u8(val3, signbit);
|
|
vst1q_u8(shuffled_input_workspace_ptr + 0, val0);
|
|
vst1q_u8(shuffled_input_workspace_ptr + 16, val1);
|
|
vst1q_u8(shuffled_input_workspace_ptr + 32, val2);
|
|
vst1q_u8(shuffled_input_workspace_ptr + 48, val3);
|
|
shuffled_input_workspace_ptr += 64;
|
|
}
|
|
#else
|
|
for (c = 0; c < accum_depth; c += 16) {
|
|
for (int b = 0; b < 4; b++) {
|
|
const uint8* src_data_ptr = input_data + b * accum_depth + c;
|
|
for (int j = 0; j < 16; j++) {
|
|
uint8 src_val = *src_data_ptr++;
|
|
// Flip the sign bit, so that the kernel will only need to
|
|
// reinterpret these uint8 values as int8, getting for free the
|
|
// subtraction of the zero_point value 128.
|
|
uint8 dst_val = src_val ^ 0x80;
|
|
*shuffled_input_workspace_ptr++ = dst_val;
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
} else {
|
|
TFLITE_DCHECK(false);
|
|
return;
|
|
}
|
|
|
|
static constexpr int kKernelRows = 4;
|
|
const int thread_count =
|
|
LegacyHowManyThreads<kKernelRows>(cpu_backend_context->max_num_threads(),
|
|
output_depth, batches, accum_depth);
|
|
if (thread_count == 1) {
|
|
// Single-thread case: do the computation on the current thread, don't
|
|
// use a threadpool
|
|
ShuffledFullyConnectedWorkerImpl(
|
|
shuffled_input_workspace_data, int8_shuffled_weights_data, batches,
|
|
output_depth, output_depth, accum_depth, bias_data, output_multiplier,
|
|
output_shift, output_data);
|
|
return;
|
|
}
|
|
|
|
// Multi-threaded case: use the gemmlowp context's threadpool.
|
|
TFLITE_DCHECK_GT(thread_count, 1);
|
|
std::vector<ShuffledFullyConnectedWorkerTask> tasks;
|
|
// TODO(b/131746020) don't create new heap allocations every time.
|
|
// At least we make it a single heap allocation by using reserve().
|
|
tasks.reserve(thread_count);
|
|
const int kRowsPerWorker =
|
|
RoundUp<kKernelRows>(CeilQuotient(output_depth, thread_count));
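  // For example, output_depth = 100 with thread_count = 3 gives
  // CeilQuotient(100, 3) = 34 and RoundUp<4>(34) = 36, so the workers cover
  // rows [0, 36), [36, 72) and [72, 100).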
|
|
int row_start = 0;
|
|
for (int i = 0; i < thread_count; i++) {
|
|
int row_end = std::min(output_depth, row_start + kRowsPerWorker);
|
|
tasks.emplace_back(shuffled_input_workspace_data,
|
|
int8_shuffled_weights_data + row_start * accum_depth,
|
|
batches, row_end - row_start, output_depth, accum_depth,
|
|
bias_data + row_start, output_multiplier, output_shift,
|
|
output_data + row_start);
|
|
row_start = row_end;
|
|
}
|
|
TFLITE_DCHECK_EQ(row_start, output_depth);
|
|
cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
|
|
cpu_backend_context);
|
|
}
|
|
|
|
#ifdef USE_NEON
|
|
|
|
inline int32x4_t RoundToNearest(const float32x4_t input) {
|
|
#if defined(__aarch64__) || defined(__SSSE3__)
|
|
// Note: vcvtnq_s32_f32 is not available in ARMv7
|
|
return vcvtnq_s32_f32(input);
|
|
#else
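  // ARMv7 fallback: add +0.5 or -0.5 depending on the sign of the input and
  // truncate toward zero. This rounds halfway cases away from zero, whereas
  // vcvtnq_s32_f32 rounds them to nearest-even.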
|
|
static const float32x4_t zero_val_dup = vdupq_n_f32(0.0f);
|
|
static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f);
|
|
static const float32x4_t minus_point5_val_dup = vdupq_n_f32(-0.5f);
|
|
|
|
const uint32x4_t mask = vcltq_f32(input, zero_val_dup);
|
|
const float32x4_t round =
|
|
vbslq_f32(mask, minus_point5_val_dup, point5_val_dup);
|
|
return vcvtq_s32_f32(vaddq_f32(input, round));
|
|
#endif // defined(__aarch64__) || defined(__SSSE3__)
|
|
}
|
|
|
|
inline uint32x4_t RoundToNearestUnsigned(const float32x4_t input) {
|
|
#if defined(__aarch64__)
|
|
// Note that vcvtnq_u32_f32 is not available in ARMv7 or in arm_neon_sse.h.
|
|
return vcvtnq_u32_f32(input);
|
|
#else
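  // Fallback: add 0.5 and truncate toward zero, which rounds non-negative
  // inputs half up.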
|
|
static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f);
|
|
|
|
return vcvtq_u32_f32(vaddq_f32(input, point5_val_dup));
|
|
#endif // defined(__aarch64__)
|
|
}
|
|
|
|
#endif // USE_NEON
|
|
|
|
inline void MeanImpl(const tflite::MeanParams& op_params,
|
|
const RuntimeShape& input_shape, const uint8_t* input_data,
|
|
int32 multiplier, int32 shift, int32 bias,
|
|
const RuntimeShape& output_shape, uint8_t* output_data,
|
|
int start_depth, int end_depth) {
|
|
ruy::profiler::ScopeLabel label("Mean4D/Uint8/MeanImpl");
|
|
|
|
  // The current implementation only supports 4-D tensors and simultaneous
  // reduction over width and height.
  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
|
|
const int input_height = input_shape.Dims(1);
|
|
const int input_width = input_shape.Dims(2);
|
|
|
|
TFLITE_CHECK_EQ(op_params.axis_count, 2);
|
|
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
|
|
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
|
|
TFLITE_CHECK_EQ(output_height, 1);
|
|
TFLITE_CHECK_EQ(output_width, 1);
|
|
|
|
constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
|
|
constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();
|
|
|
|
#ifdef USE_NEON
|
|
const int32x4_t bias_dup = vdupq_n_s32(bias);
|
|
const int32x4_t min_dup = vdupq_n_s32(kMinValue);
|
|
const int32x4_t max_dup = vdupq_n_s32(kMaxValue);
|
|
#endif // USE_NEON
|
|
|
|
for (int out_b = 0; out_b < output_batch; ++out_b) {
|
|
int out_d = start_depth;
|
|
#ifdef USE_NEON
|
|
|
|
for (; out_d <= end_depth - 16; out_d += 16) {
|
|
int32x4x4_t temp_sum;
|
|
temp_sum.val[0] = vdupq_n_s32(0);
|
|
temp_sum.val[1] = vdupq_n_s32(0);
|
|
temp_sum.val[2] = vdupq_n_s32(0);
|
|
temp_sum.val[3] = vdupq_n_s32(0);
|
|
for (int in_h = 0; in_h < input_height; ++in_h) {
|
|
for (int in_w = 0; in_w < input_width; ++in_w) {
|
|
const uint8_t* input_data_ptr =
|
|
input_data + Offset(input_shape, out_b, in_h, in_w, out_d);
|
|
uint8x16_t input_data_val = vld1q_u8(input_data_ptr);
|
|
|
|
int16x8_t input_data_low_shift =
|
|
vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input_data_val)));
|
|
int16x8_t input_data_high_shift =
|
|
vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input_data_val)));
|
|
|
|
int32x4_t input_low_low =
|
|
vmovl_s16(vget_low_s16(input_data_low_shift));
|
|
int32x4_t input_high_low =
|
|
vmovl_s16(vget_high_s16(input_data_low_shift));
|
|
int32x4_t input_low_high =
|
|
vmovl_s16(vget_low_s16(input_data_high_shift));
|
|
int32x4_t input_high_high =
|
|
vmovl_s16(vget_high_s16(input_data_high_shift));
|
|
|
|
temp_sum.val[0] = vaddq_s32(temp_sum.val[0], input_low_low);
|
|
temp_sum.val[1] = vaddq_s32(temp_sum.val[1], input_high_low);
|
|
temp_sum.val[2] = vaddq_s32(temp_sum.val[2], input_low_high);
|
|
temp_sum.val[3] = vaddq_s32(temp_sum.val[3], input_high_high);
|
|
}
|
|
}
|
|
|
|
temp_sum =
|
|
MultiplyByQuantizedMultiplier4Rows(temp_sum, multiplier, shift);
|
|
|
|
temp_sum.val[0] = vaddq_s32(temp_sum.val[0], bias_dup);
|
|
temp_sum.val[1] = vaddq_s32(temp_sum.val[1], bias_dup);
|
|
temp_sum.val[2] = vaddq_s32(temp_sum.val[2], bias_dup);
|
|
temp_sum.val[3] = vaddq_s32(temp_sum.val[3], bias_dup);
|
|
|
|
temp_sum.val[0] = vminq_s32(vmaxq_s32(temp_sum.val[0], min_dup), max_dup);
|
|
temp_sum.val[1] = vminq_s32(vmaxq_s32(temp_sum.val[1], min_dup), max_dup);
|
|
temp_sum.val[2] = vminq_s32(vmaxq_s32(temp_sum.val[2], min_dup), max_dup);
|
|
temp_sum.val[3] = vminq_s32(vmaxq_s32(temp_sum.val[3], min_dup), max_dup);
|
|
|
|
uint16x4_t narrowed_low_low =
|
|
vmovn_u32(vreinterpretq_u32_s32(temp_sum.val[0]));
|
|
uint16x4_t narrowed_high_low =
|
|
vmovn_u32(vreinterpretq_u32_s32(temp_sum.val[1]));
|
|
uint16x4_t narrowed_low_high =
|
|
vmovn_u32(vreinterpretq_u32_s32(temp_sum.val[2]));
|
|
uint16x4_t narrowed_high_high =
|
|
vmovn_u32(vreinterpretq_u32_s32(temp_sum.val[3]));
|
|
|
|
uint16x8_t combined_low =
|
|
vcombine_u16(narrowed_low_low, narrowed_high_low);
|
|
uint16x8_t combined_high =
|
|
vcombine_u16(narrowed_low_high, narrowed_high_high);
|
|
|
|
uint8x8_t narrowed_low = vmovn_u16(combined_low);
|
|
uint8x8_t narrowed_high = vmovn_u16(combined_high);
|
|
|
|
uint8x16_t combined_output = vcombine_u8(narrowed_low, narrowed_high);
|
|
|
|
uint8_t* output_data_ptr =
|
|
output_data + Offset(output_shape, out_b, 0, 0, out_d);
|
|
vst1q_u8(output_data_ptr, combined_output);
|
|
}
|
|
#endif // USE_NEON
|
|
|
|
for (; out_d < end_depth; ++out_d) {
|
|
int acc = 0;
|
|
for (int in_h = 0; in_h < input_height; ++in_h) {
|
|
for (int in_w = 0; in_w < input_width; ++in_w) {
|
|
acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
|
|
}
|
|
}
|
|
|
|
acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
|
|
acc += bias;
|
|
acc = std::min(std::max(acc, kMinValue), kMaxValue);
|
|
output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
|
|
static_cast<uint8_t>(acc);
|
|
}
|
|
}
|
|
}
|
|
|
|
struct MeanWorkerTask : cpu_backend_threadpool::Task {
|
|
MeanWorkerTask(const tflite::MeanParams& op_params,
|
|
const RuntimeShape& input_shape, const uint8_t* input_data,
|
|
int32 multiplier, int32 shift, int32 bias,
|
|
const RuntimeShape& output_shape, uint8_t* output_data,
|
|
int start_height, int end_height)
|
|
: op_params(op_params),
|
|
input_shape(input_shape),
|
|
input_data(input_data),
|
|
multiplier(multiplier),
|
|
shift(shift),
|
|
bias(bias),
|
|
output_shape(output_shape),
|
|
output_data(output_data),
|
|
start_height(start_height),
|
|
end_height(end_height) {}
|
|
|
|
void Run() override {
|
|
MeanImpl(op_params, input_shape, input_data, multiplier, shift, bias,
|
|
output_shape, output_data, start_height, end_height);
|
|
}
|
|
|
|
private:
|
|
const tflite::MeanParams& op_params;
|
|
const RuntimeShape& input_shape;
|
|
const uint8_t* input_data;
|
|
int32 multiplier;
|
|
int32 shift;
|
|
int32 bias;
|
|
const RuntimeShape& output_shape;
|
|
uint8_t* output_data;
|
|
int start_height;
|
|
int end_height;
|
|
};
|
|
|
|
inline void Mean(const tflite::MeanParams& op_params,
|
|
const RuntimeShape& unextended_input_shape,
|
|
const uint8_t* input_data, int32 input_zero_point,
|
|
float input_scale, const RuntimeShape& unextended_output_shape,
|
|
uint8_t* output_data, int32 output_zero_point,
|
|
float output_scale, CpuBackendContext* cpu_backend_context) {
|
|
ruy::profiler::ScopeLabel label("Mean4D/Uint8");
|
|
  // The current implementation only supports 4-D tensors and simultaneous
  // reduction over width and height.
|
|
TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
|
|
TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
|
const RuntimeShape input_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
|
const RuntimeShape output_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
|
const int output_height = output_shape.Dims(1);
|
|
const int output_width = output_shape.Dims(2);
|
|
const int output_depth = output_shape.Dims(3);
|
|
|
|
TFLITE_CHECK_EQ(op_params.axis_count, 2);
|
|
TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
|
|
(op_params.axis[0] == 2 && op_params.axis[1] == 1));
|
|
TFLITE_CHECK_EQ(output_height, 1);
|
|
TFLITE_CHECK_EQ(output_width, 1);
|
|
|
|
const int input_height = input_shape.Dims(1);
|
|
const int input_width = input_shape.Dims(2);
|
|
const float num_elements_in_axis = input_width * input_height;
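  // With real_in = input_scale * (q_in - input_zero_point) and
  // real_out = output_scale * (q_out - output_zero_point), the mean over
  // N = input_width * input_height elements reduces to
  //   q_out = (input_scale / (N * output_scale)) * sum(q_in)
  //           + output_zero_point - input_zero_point * input_scale / output_scale,
  // which is what the multiplier/shift (for real_scale) and bias below implement.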
|
|
|
|
int32 bias =
|
|
output_zero_point -
|
|
static_cast<int32>(input_zero_point * input_scale / output_scale);
|
|
float real_scale = input_scale / (num_elements_in_axis * output_scale);
|
|
|
|
int32 multiplier, shift;
|
|
QuantizeMultiplier(real_scale, &multiplier, &shift);
|
|
|
|
constexpr int kMinDepthPerThread = 8;
|
|
int thread_count = output_depth / kMinDepthPerThread;
|
|
thread_count = thread_count > 0 ? thread_count : 1;
|
|
const int capped_thread_count =
|
|
std::min(thread_count, cpu_backend_context->max_num_threads());
|
|
|
|
if (capped_thread_count == 1) {
|
|
MeanImpl(op_params, input_shape, input_data, multiplier, shift, bias,
|
|
output_shape, output_data, 0, output_depth);
|
|
} else {
|
|
    // Instead of parallelizing over the batch, we loop over output_depth,
    // since the batch size is typically 1.
|
|
std::vector<MeanWorkerTask> tasks;
|
|
// TODO(b/131746020) don't create new heap allocations every time.
|
|
// At least we make it a single heap allocation by using reserve().
|
|
tasks.reserve(capped_thread_count);
|
|
int depth_start = 0;
|
|
for (int i = 0; i < capped_thread_count; ++i) {
|
|
      // Try to distribute the tasks as evenly as possible.
|
|
int depth_end = depth_start +
|
|
(output_depth - depth_start) / (capped_thread_count - i);
|
|
tasks.emplace_back(op_params, input_shape, input_data, multiplier, shift,
|
|
bias, output_shape, output_data, depth_start,
|
|
depth_end);
|
|
depth_start = depth_end;
|
|
}
|
|
cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
|
|
cpu_backend_context);
|
|
}
|
|
}
|
|
|
|
template <typename T, typename U>
|
|
inline bool MeanGeneral(const T* input_data, const int* input_dims,
|
|
const int input_num_dims, T* output_data,
|
|
const int* output_dims, const int output_num_dims,
|
|
const int* axis, const int num_axis_dimensions,
|
|
bool keep_dims, int* temp_index, int* resolved_axis,
|
|
U* temp_sum) {
|
|
return reference_ops::Mean(input_data, input_dims, input_num_dims,
|
|
output_data, output_dims, output_num_dims, axis,
|
|
num_axis_dimensions, keep_dims, temp_index,
|
|
resolved_axis, temp_sum);
|
|
}
|
|
|
|
template <>
|
|
inline bool MeanGeneral<float, float>(
|
|
const float* input_data, const int* input_dims, const int input_num_dims,
|
|
float* output_data, const int* output_dims, const int output_num_dims,
|
|
const int* axis, const int num_axis_dimensions, bool keep_dims,
|
|
int* temp_index, int* resolved_axis, float* temp_sum) {
|
|
  // Handle reduce_mean for the last dimension.
|
|
if (num_axis_dimensions == 1 && axis[0] == (input_num_dims - 1)) {
|
|
ruy::profiler::ScopeLabel label("MeanLastDim/Float");
|
|
int output_size = 1;
|
|
for (int i = 0; i < input_num_dims - 1; ++i) {
|
|
output_size *= input_dims[i];
|
|
}
|
|
const int last_input_dim = input_dims[axis[0]];
|
|
|
|
// TODO(b/152563685): Consider use eigen to cover more general cases.
|
|
const MatrixMap<const float> in_mat(input_data, last_input_dim,
|
|
output_size);
|
|
VectorMap<float> out(output_data, output_size, 1);
|
|
out = (in_mat.array().colwise().sum()) / static_cast<float>(last_input_dim);
|
|
return true;
|
|
}
|
|
|
|
return reference_ops::Mean(input_data, input_dims, input_num_dims,
|
|
output_data, output_dims, output_num_dims, axis,
|
|
num_axis_dimensions, keep_dims, temp_index,
|
|
resolved_axis, temp_sum);
|
|
}
|
|
|
|
inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
|
|
const float* input_data, const RuntimeShape& filter_shape,
|
|
const float* filter_data, const RuntimeShape& bias_shape,
|
|
const float* bias_data, const RuntimeShape& output_shape,
|
|
float* output_data, const RuntimeShape& im2col_shape,
|
|
float* im2col_data, CpuBackendContext* cpu_backend_context) {
|
|
const int stride_width = params.stride_width;
|
|
const int stride_height = params.stride_height;
|
|
const int dilation_width_factor = params.dilation_width_factor;
|
|
const int dilation_height_factor = params.dilation_height_factor;
|
|
const float output_activation_min = params.float_activation_min;
|
|
const float output_activation_max = params.float_activation_max;
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
|
|
|
ruy::profiler::ScopeLabel label("Conv");
|
|
|
|
// NB: the float 0.0f value is represented by all zero bytes.
|
|
const uint8 float_zero_byte = 0x00;
|
|
const float* gemm_input_data = nullptr;
|
|
const RuntimeShape* gemm_input_shape = nullptr;
|
|
const int filter_width = filter_shape.Dims(2);
|
|
const int filter_height = filter_shape.Dims(1);
|
|
const bool need_dilated_im2col =
|
|
dilation_width_factor != 1 || dilation_height_factor != 1;
|
|
const bool need_im2col = stride_width != 1 || stride_height != 1 ||
|
|
filter_width != 1 || filter_height != 1;
|
|
if (need_dilated_im2col) {
|
|
DilatedIm2col(params, float_zero_byte, input_shape, input_data,
|
|
filter_shape, output_shape, im2col_data);
|
|
gemm_input_data = im2col_data;
|
|
gemm_input_shape = &im2col_shape;
|
|
} else if (need_im2col) {
|
|
TFLITE_DCHECK(im2col_data);
|
|
Im2col(params, filter_height, filter_width, float_zero_byte, input_shape,
|
|
input_data, im2col_shape, im2col_data);
|
|
gemm_input_data = im2col_data;
|
|
gemm_input_shape = &im2col_shape;
|
|
} else {
|
|
// TODO(aselle): We need to make sure to not send im2col if it is not
|
|
// needed.
|
|
TFLITE_DCHECK(!im2col_data);
|
|
gemm_input_data = input_data;
|
|
gemm_input_shape = &input_shape;
|
|
}
|
|
|
|
const int gemm_input_dims = gemm_input_shape->DimensionsCount();
|
|
int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
|
|
int n = output_shape.Dims(3);
|
|
int k = gemm_input_shape->Dims(gemm_input_dims - 1);
|
|
|
|
#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
|
|
  // The following code computes the matrix multiplication c = a * transpose(b)
  // with CBLAS, where:
  // * `a` is a matrix with dimensions (m, k).
  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
  // * `c` is a matrix with dimensions (m, n).
  // The naming of variables is aligned with the CBLAS specification here.
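  // Here m is the number of im2col patches (batch * output height * output
  // width), k is the patch size (filter height * filter width * input depth),
  // and n is the number of output channels.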
|
|
const float* a = gemm_input_data;
|
|
const float* b = filter_data;
|
|
float* c = output_data;
|
|
// The stride of matrix a, b and c respectively.
|
|
int stride_a = k;
|
|
int stride_b = k;
|
|
int stride_c = n;
|
|
|
|
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
|
|
stride_a, b, stride_b, 0.0f, c, stride_c);
|
|
optimized_ops::AddBiasAndEvalActivationFunction(
|
|
output_activation_min, output_activation_max, bias_shape, bias_data,
|
|
output_shape, output_data);
|
|
#else
|
|
// When an optimized CBLAS implementation is not available, fall back
|
|
// to using cpu_backend_gemm.
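  // The filter matrix is the (n x k) row-major LHS and the im2col buffer is
  // viewed as a column-major (k x m) RHS, so the column-major (n x m)
  // destination aliases the row-major (m x n) output buffer. Bias addition
  // and activation clamping are fused into the GEMM via gemm_params.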
  cpu_backend_gemm::MatrixParams<float> lhs_params;
  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
  lhs_params.rows = n;
  lhs_params.cols = k;
  cpu_backend_gemm::MatrixParams<float> rhs_params;
  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
  rhs_params.rows = k;
  rhs_params.cols = m;
  cpu_backend_gemm::MatrixParams<float> dst_params;
  dst_params.order = cpu_backend_gemm::Order::kColMajor;
  dst_params.rows = n;
  dst_params.cols = m;
  cpu_backend_gemm::GemmParams<float, float> gemm_params;
  gemm_params.bias = bias_data;
  gemm_params.clamp_min = output_activation_min;
  gemm_params.clamp_max = output_activation_max;
  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, gemm_input_data,
                         dst_params, output_data, gemm_params,
                         cpu_backend_context);
#endif  // defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
}

inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr,
                       const RuntimeShape& input_shape,
                       const int8_t* input_data,
                       const RuntimeShape& filter_shape,
                       const int8_t* filter_data,
                       const RuntimeShape& bias_shape, const float* bias_data,
                       const RuntimeShape& accum_scratch_shape,
                       int32_t* accum_scratch, const RuntimeShape& output_shape,
                       float* output_data, const RuntimeShape& im2col_shape,
                       int8_t* im2col_data, CpuBackendContext* context) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  const int batch_size = input_shape.Dims(0);
  const int filter_width = filter_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);

  const int8_t* gemm_input_data = nullptr;
  int num_input;
  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
                           filter_width != 1 || filter_height != 1;

  if (need_im2col) {
    TFLITE_DCHECK(im2col_data);
    // Symmetric quantization assumes a zero point of 0.
    const int input_zero_point = 0;

    Im2col(params, filter_height, filter_width, input_zero_point, input_shape,
           input_data, im2col_shape, im2col_data);
    gemm_input_data = im2col_data;
    num_input = im2col_shape.FlatSize();
  } else {
    TFLITE_DCHECK(!im2col_data);
    gemm_input_data = input_data;
    num_input = input_shape.FlatSize();
  }

  // Flatten 4D matrices into 2D matrices for matrix multiplication.

  // Flatten so that each filter has its own row.
  const int filter_rows = filter_shape.Dims(0);
  const int filter_cols = FlatSizeSkipDim(filter_shape, 0);

  // In MatrixBatchVectorMultiplyAccumulate, each output value is the
  // dot product of one row of the first matrix with one row of the second
  // matrix. Therefore, the number of cols in each matrix must match.
  //
  // After Im2Col, each input patch becomes a row.
  const int gemm_input_cols = filter_cols;
  const int gemm_input_rows = num_input / gemm_input_cols;

  const int output_cols = output_shape.Dims(3);
  const int output_rows = FlatSizeSkipDim(output_shape, 3);
  TFLITE_DCHECK_EQ(output_cols, filter_rows);
  TFLITE_DCHECK_EQ(output_rows, gemm_input_rows);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_cols);

  // MatrixBatchVectorMultiplyAccumulate assumes that each row of the second
  // input matrix has its own scale factor. This code duplicates the scale
  // factors for each row in the same batch.
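  // The expansion is done in place: iterating from the last row backwards
  // guarantees the original per-batch scale (at index i / rows_per_batch) is
  // read before it is overwritten.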
  const int rows_per_batch = gemm_input_rows / batch_size;
  for (int i = gemm_input_rows - 1; i >= 0; --i) {
    scaling_factors_ptr[i] = scaling_factors_ptr[i / rows_per_batch];
  }

  std::fill_n(output_data, output_rows * output_cols, 0.0f);

  // The scratch buffer must have the same size as the output.
  TFLITE_DCHECK_EQ(accum_scratch_shape.FlatSize(), output_shape.FlatSize());
  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
      filter_data, filter_rows, filter_cols, gemm_input_data,
      scaling_factors_ptr, /*n_batch=*/gemm_input_rows, accum_scratch,
      output_data, context);
  AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
                                   bias_shape, bias_data, output_shape,
                                   output_data);
}

inline void HybridConvPerChannel(
    const ConvParams& params, float* scaling_factors_ptr,
    const RuntimeShape& input_shape, const int8_t* input_data,
    const RuntimeShape& filter_shape, const int8_t* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const RuntimeShape& im2col_shape, int8_t* im2col_data,
    const float* per_channel_scale, int32_t* input_offset,
    const RuntimeShape& scratch_shape, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("ConvHybridPerChannel");
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  const int8* gemm_input_data = nullptr;
  const RuntimeShape* gemm_input_shape = nullptr;
  const int filter_width = filter_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const bool need_dilated_im2col =
      dilation_width_factor != 1 || dilation_height_factor != 1;
  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
                           filter_width != 1 || filter_height != 1;

  const int batch_size = input_shape.Dims(0);

  if (need_dilated_im2col) {
    TFLITE_DCHECK(im2col_data);
    optimized_ops::DilatedIm2col(params, input_shape, input_data, filter_shape,
                                 output_shape, im2col_data, input_offset,
                                 batch_size);
    gemm_input_data = im2col_data;
    gemm_input_shape = &im2col_shape;
  } else if (need_im2col) {
    Im2col(params, filter_height, filter_width, input_offset, batch_size,
           input_shape, input_data, im2col_shape, im2col_data);
    gemm_input_data = im2col_data;
    gemm_input_shape = &im2col_shape;
  } else {
    TFLITE_DCHECK(!im2col_data);
    gemm_input_data = input_data;
    gemm_input_shape = &input_shape;
  }

  const int filter_rows = filter_shape.Dims(0);
  const int filter_cols = FlatSizeSkipDim(filter_shape, 0);

  const int gemm_input_rows = gemm_input_shape->Dims(3);
  const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
  const int output_rows = output_shape.Dims(3);
  const int output_cols =
      output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2);

  TFLITE_DCHECK_EQ(output_rows, filter_rows);
  TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
  TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
  TFLITE_DCHECK_EQ(scratch_shape.FlatSize(), output_shape.FlatSize());
  if (!compute_row_sums || *compute_row_sums) {
    memset(row_sums, 0, sizeof(int32_t) * filter_rows);
    tensor_utils::ReductionSumVector(filter_data, row_sums, filter_rows,
                                     filter_cols);
    if (compute_row_sums) {
      *compute_row_sums = false;
    }
  }

  cpu_backend_gemm::MatrixParams<int8> lhs_params;
  lhs_params.rows = filter_rows;
  lhs_params.cols = filter_cols;
  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;

  cpu_backend_gemm::MatrixParams<int8> rhs_params;
  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
  rhs_params.rows = gemm_input_rows;
  rhs_params.cols = gemm_input_cols;

  cpu_backend_gemm::MatrixParams<int32> dst_params;
  dst_params.order = cpu_backend_gemm::Order::kColMajor;
  dst_params.rows = output_rows;
  dst_params.cols = output_cols;

  // TODO(b/149003801): Use hybrid gemm once supported in Ruy.
  cpu_backend_gemm::GemmParams<int32_t, int32_t> gemm_params;
  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, gemm_input_data,
                         dst_params, scratch, gemm_params, cpu_backend_context);

  MatrixMap<float> out_mat(output_data, filter_rows, output_cols);
  MatrixMap<int32_t> in_mat(scratch, filter_rows, output_cols);
  VectorMap<const float> bias_data_vec(bias_data, filter_rows, 1);
  VectorMap<int32_t> row_sums_vec(row_sums, filter_rows, 1);
  VectorMap<const float> per_channel_scale_vec(per_channel_scale, filter_rows,
                                               1);
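  // For each output column: subtract the input zero point's contribution
  // (zero_point * row_sums), convert to float with the per-channel filter
  // scale times that batch's input scale, add the bias, and clamp to the
  // activation bounds.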
  const int cols_per_batch = output_cols / batch_size;
  for (int c = 0; c < output_cols; c++) {
    const int b = c / cols_per_batch;
    const float input_scale = scaling_factors_ptr[b];
    const int32_t zero_point = input_offset[b];
    out_mat.col(c) =
        (((in_mat.col(c) - (row_sums_vec * zero_point))
              .cast<float>()
              .cwiseProduct((per_channel_scale_vec * input_scale))) +
         bias_data_vec)
            .cwiseMin(params.float_activation_max)
            .cwiseMax(params.float_activation_min);
  }
}

inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                 const uint8* input_data, const RuntimeShape& filter_shape,
                 const uint8* filter_data, const RuntimeShape& bias_shape,
                 const int32* bias_data, const RuntimeShape& output_shape,
                 uint8* output_data, const RuntimeShape& im2col_shape,
                 uint8* im2col_data, CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("Conv/8bit");

  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int32 input_offset = params.input_offset;
  const int32 filter_offset = params.weights_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  const uint8* gemm_input_data = nullptr;
  const RuntimeShape* gemm_input_shape = nullptr;
  const int filter_width = filter_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const bool need_dilated_im2col =
      dilation_width_factor != 1 || dilation_height_factor != 1;
  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
                           filter_width != 1 || filter_height != 1;
  if (need_dilated_im2col) {
    TFLITE_DCHECK(im2col_data);
    const int input_zero_point = -input_offset;
    TFLITE_DCHECK_GE(input_zero_point, 0);
    TFLITE_DCHECK_LE(input_zero_point, 255);
    DilatedIm2col(params, input_zero_point, input_shape, input_data,
                  filter_shape, output_shape, im2col_data);
    gemm_input_data = im2col_data;
    gemm_input_shape = &im2col_shape;
  } else if (need_im2col) {
    TFLITE_DCHECK(im2col_data);
    const int input_zero_point = -input_offset;
    TFLITE_DCHECK_GE(input_zero_point, 0);
    TFLITE_DCHECK_LE(input_zero_point, 255);
    Im2col(params, filter_height, filter_width, input_zero_point, input_shape,
           input_data, im2col_shape, im2col_data);
    gemm_input_data = im2col_data;
    gemm_input_shape = &im2col_shape;
  } else {
    TFLITE_DCHECK(!im2col_data);
    gemm_input_data = input_data;
    gemm_input_shape = &input_shape;
  }

  const int gemm_input_rows = gemm_input_shape->Dims(3);
  // Using FlatSizeSkipDim causes segfault in some contexts (see b/79927784).
  // The root cause has not yet been identified though. Same applies below for
  // the other calls commented out. This is a partial rollback of cl/196819423.
  // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
  const int gemm_input_cols = gemm_input_shape->Dims(0) *
                              gemm_input_shape->Dims(1) *
                              gemm_input_shape->Dims(2);
  const int filter_rows = filter_shape.Dims(0);
  // See b/79927784.
  // const int filter_cols = FlatSizeSkipDim(filter_shape, 0);
  const int filter_cols =
      filter_shape.Dims(1) * filter_shape.Dims(2) * filter_shape.Dims(3);
  const int output_rows = output_shape.Dims(3);
  // See b/79927784.
  // const int output_cols = FlatSizeSkipDim(output_shape, 3);
  const int output_cols =
      output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2);
  TFLITE_DCHECK_EQ(output_rows, filter_rows);
  TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
  TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
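
  // Note: params stores the input and filter offsets as negated zero points
  // (e.g. input_offset == -input_zero_point), so the lhs/rhs zero_point
  // fields below negate them again; the output offset is the output zero
  // point itself.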
  cpu_backend_gemm::MatrixParams<uint8> lhs_params;
  lhs_params.rows = filter_rows;
  lhs_params.cols = filter_cols;
  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
  lhs_params.zero_point = -filter_offset;
  cpu_backend_gemm::MatrixParams<uint8> rhs_params;
  rhs_params.rows = gemm_input_rows;
  rhs_params.cols = gemm_input_cols;
  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
  rhs_params.zero_point = -input_offset;
  cpu_backend_gemm::MatrixParams<uint8> dst_params;
  dst_params.rows = output_rows;
  dst_params.cols = output_cols;
  dst_params.order = cpu_backend_gemm::Order::kColMajor;
  dst_params.zero_point = output_offset;
  cpu_backend_gemm::GemmParams<int32, uint8> gemm_params;
  gemm_params.bias = bias_data;
  gemm_params.clamp_min = output_activation_min;
  gemm_params.clamp_max = output_activation_max;
  gemm_params.multiplier_fixedpoint = output_multiplier;
  gemm_params.multiplier_exponent = output_shift;
  cpu_backend_gemm::Gemm(lhs_params, filter_data, rhs_params, gemm_input_data,
                         dst_params, output_data, gemm_params,
                         cpu_backend_context);
}

template <typename T>
inline void DepthToSpace(const tflite::DepthToSpaceParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {
  ruy::profiler::ScopeLabel label("DepthToSpace");

  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int input_depth = input_shape.Dims(3);
  const int input_width = input_shape.Dims(2);
  const int input_height = input_shape.Dims(1);

  const int output_depth = output_shape.Dims(3);
  const int batch_size = output_shape.Dims(0);

  // Number of contiguous values that we can copy in one iteration.
  const int stride = op_params.block_size * output_depth;
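
  // Each input row expands into block_size output rows; every input pixel
  // contributes `stride` contiguous values to each expanded row, so the
  // innermost loop is a single memcpy per pixel.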
  for (int batch = 0; batch < batch_size; ++batch) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      const T* input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0);
      for (int offset_h = 0; offset_h < op_params.block_size; ++offset_h) {
        const T* src = input_ptr;
        for (int in_w = 0; in_w < input_width; ++in_w) {
          memcpy(output_data, src, stride * sizeof(T));
          output_data += stride;
          src += input_depth;
        }
        input_ptr += stride;
      }
    }
  }
}

template <typename T>
inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
                         const RuntimeShape& unextended_input_shape,
                         const T* input_data,
                         const RuntimeShape& unextended_output_shape,
                         T* output_data) {
  ruy::profiler::ScopeLabel label("SpaceToDepth");

  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int output_depth = output_shape.Dims(3);
  const int output_width = output_shape.Dims(2);
  const int output_height = output_shape.Dims(1);

  const int input_depth = input_shape.Dims(3);
  const int batch_size = input_shape.Dims(0);

  // Number of contiguous values that we can copy in one iteration.
  const int stride = op_params.block_size * input_depth;
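
  // Mirror image of DepthToSpace: each memcpy moves `stride` contiguous input
  // values (block_size input pixels) into a slice of one output pixel's depth.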
  for (int batch = 0; batch < batch_size; ++batch) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      T* output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0);
      for (int offset_h = 0; offset_h < op_params.block_size; ++offset_h) {
        T* dst = output_ptr;
        for (int out_w = 0; out_w < output_width; ++out_w) {
          memcpy(dst, input_data, stride * sizeof(T));
          input_data += stride;
          dst += output_depth;
        }
        output_ptr += stride;
      }
    }
  }
}

inline void Relu(const RuntimeShape& input_shape, const float* input_data,
                 const RuntimeShape& output_shape, float* output_data) {
  ruy::profiler::ScopeLabel label("Relu (not fused)");

  const auto input = MapAsVector(input_data, input_shape);
  auto output = MapAsVector(output_data, output_shape);
  output = input.cwiseMax(0.0f);
}

inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                            const RuntimeShape& input_shape,
                            const float* input_data,
                            const RuntimeShape& output_shape,
                            float* output_data, float epsilon = 1e-6) {
  ruy::profiler::ScopeLabel label("L2Normalization");
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
  for (int i = 0; i < outer_size; ++i) {
    float squared_l2_norm = 0;
    for (int c = 0; c < depth; ++c) {
      const float val = input_data[c];
      squared_l2_norm += val * val;
    }
    float l2_norm = std::sqrt(squared_l2_norm);
    l2_norm = std::max(l2_norm, epsilon);
    for (int c = 0; c < depth; ++c) {
      *output_data = *input_data / l2_norm;
      ++output_data;
      ++input_data;
    }
  }
}

inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                            const RuntimeShape& input_shape,
                            const uint8* input_data,
                            const RuntimeShape& output_shape,
                            uint8* output_data) {
  ruy::profiler::ScopeLabel label("L2Normalization/8bit");
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int32 input_zero_point = op_params.input_zero_point;
  for (int i = 0; i < outer_size; ++i) {
    int32 square_l2_norm = 0;
    for (int c = 0; c < depth; c++) {
      // Note that input_data advances by depth in the second pass below.
      int32 diff = input_data[c] - input_zero_point;
      square_l2_norm += diff * diff;
    }
    // TODO(b/29395854): add clamping to TOCO and TF Lite kernel
    // for all zero tensors in the input_data
    int32 inv_l2norm_multiplier;
    int inv_l2norm_shift;
    GetInvSqrtQuantizedMultiplierExp(square_l2_norm, kReverseShift,
                                     &inv_l2norm_multiplier, &inv_l2norm_shift);
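
    // Second pass: each centered value is scaled by roughly 128 / l2_norm and
    // offset by 128, i.e. the output is quantized with zero point 128 and
    // scale 1/128.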
    for (int c = 0; c < depth; c++) {
      int32 diff = *input_data - input_zero_point;
      int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
          128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
      int32 unclamped_output_val = 128 + rescaled_diff;
      int32 output_val = std::min(255, std::max(0, unclamped_output_val));
      *output_data = static_cast<uint8>(output_val);
      ++input_data;
      ++output_data;
    }
  }
}

inline void AddElementwise(int size, const ArithmeticParams& params,
                           const float* input1_data, const float* input2_data,
                           float* output_data) {
  int i = 0;

#ifdef USE_NEON
  const auto activation_min = vdupq_n_f32(params.float_activation_min);
  const auto activation_max = vdupq_n_f32(params.float_activation_max);
  for (; i <= size - 16; i += 16) {
    auto a10 = vld1q_f32(input1_data + i);
    auto a11 = vld1q_f32(input1_data + i + 4);
    auto a12 = vld1q_f32(input1_data + i + 8);
    auto a13 = vld1q_f32(input1_data + i + 12);
    auto a20 = vld1q_f32(input2_data + i);
    auto a21 = vld1q_f32(input2_data + i + 4);
    auto a22 = vld1q_f32(input2_data + i + 8);
    auto a23 = vld1q_f32(input2_data + i + 12);
    auto x0 = vaddq_f32(a10, a20);
    auto x1 = vaddq_f32(a11, a21);
    auto x2 = vaddq_f32(a12, a22);
    auto x3 = vaddq_f32(a13, a23);
    x0 = vmaxq_f32(activation_min, x0);
    x1 = vmaxq_f32(activation_min, x1);
    x2 = vmaxq_f32(activation_min, x2);
    x3 = vmaxq_f32(activation_min, x3);
    x0 = vminq_f32(activation_max, x0);
    x1 = vminq_f32(activation_max, x1);
    x2 = vminq_f32(activation_max, x2);
    x3 = vminq_f32(activation_max, x3);
    vst1q_f32(output_data + i, x0);
    vst1q_f32(output_data + i + 4, x1);
    vst1q_f32(output_data + i + 8, x2);
    vst1q_f32(output_data + i + 12, x3);
  }
  for (; i <= size - 4; i += 4) {
    auto a1 = vld1q_f32(input1_data + i);
    auto a2 = vld1q_f32(input2_data + i);
    auto x = vaddq_f32(a1, a2);
    x = vmaxq_f32(activation_min, x);
    x = vminq_f32(activation_max, x);
    vst1q_f32(output_data + i, x);
  }
#endif  // NEON

  for (; i < size; i++) {
    auto x = input1_data[i] + input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(
        x, params.float_activation_min, params.float_activation_max);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const float* input1_data,
                const RuntimeShape& input2_shape, const float* input2_data,
                const RuntimeShape& output_shape, float* output_data) {
  ruy::profiler::ScopeLabel label("Add");
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

// Element-wise add that can often be used for inner loop of broadcast add as
// well as the non-broadcast add.
inline void AddElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  ruy::profiler::ScopeLabel label("AddElementwise/8bit");
  int i = 0;
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
#ifdef USE_NEON
  const uint8x8_t output_activation_min_vector =
      vdup_n_u8(params.quantized_activation_min);
  const uint8x8_t output_activation_max_vector =
      vdup_n_u8(params.quantized_activation_max);
  for (; i <= size - 8; i += 8) {
    const uint8x8_t input1_val_original = vld1_u8(input1_data + i);
    const uint8x8_t input2_val_original = vld1_u8(input2_data + i);
    const int16x8_t input1_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
    const int16x8_t input2_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
    const int16x8_t input1_val =
        vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
    const int16x8_t input2_val =
        vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
    const int16x4_t input1_val_high = vget_high_s16(input1_val);
    const int16x4_t input1_val_low = vget_low_s16(input1_val);
    const int16x4_t input2_val_high = vget_high_s16(input2_val);
    const int16x4_t input2_val_low = vget_low_s16(input2_val);
    int32x4_t x11 = vmovl_s16(input1_val_low);
    int32x4_t x12 = vmovl_s16(input1_val_high);
    int32x4_t x21 = vmovl_s16(input2_val_low);
    int32x4_t x22 = vmovl_s16(input2_val_high);
    const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
    x11 = vshlq_s32(x11, left_shift_dup);
    x12 = vshlq_s32(x12, left_shift_dup);
    x21 = vshlq_s32(x21, left_shift_dup);
    x22 = vshlq_s32(x22, left_shift_dup);
    x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
    x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
    x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
    x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
    const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
    const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
    x11 = vshlq_s32(x11, input1_shift_dup);
    x12 = vshlq_s32(x12, input1_shift_dup);
    x21 = vshlq_s32(x21, input2_shift_dup);
    x22 = vshlq_s32(x22, input2_shift_dup);
    int32x4_t s1 = vaddq_s32(x11, x21);
    int32x4_t s2 = vaddq_s32(x12, x22);
    s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
    s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
    using gemmlowp::RoundingDivideByPOT;
    s1 = RoundingDivideByPOT(s1, -params.output_shift);
    s2 = RoundingDivideByPOT(s2, -params.output_shift);
    const int16x4_t s1_narrowed = vmovn_s32(s1);
    const int16x4_t s2_narrowed = vmovn_s32(s2);
    const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
                                  vdupq_n_s16(params.output_offset));
    const uint8x8_t clamped =
        vmax_u8(output_activation_min_vector,
                vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
    vst1_u8(output_data + i, clamped);
  }
#endif  // NEON
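
  // Scalar path (and tail of the NEON loop): offset each input to remove its
  // zero point, left-shift for headroom, rescale to a common scale with the
  // per-input multipliers, add, rescale to the output scale, then add the
  // output offset and clamp.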
  for (; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
    const int32 scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);
    const int32 scaled_input2_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input2_val, params.input2_multiplier, params.input2_shift);
    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
    const int32 raw_output =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            raw_sum, params.output_multiplier, params.output_shift) +
        params.output_offset;
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, raw_output));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
                               uint8 input1_data, const uint8* input2_data,
                               uint8* output_data) {
  using gemmlowp::RoundingDivideByPOT;

  ruy::profiler::ScopeLabel label("AddScalarBroadcast/8bit");
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);

  int i = 0;

#ifdef USE_NEON
  const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
  const uint8x8_t output_activation_min_vector =
      vdup_n_u8(params.quantized_activation_min);
  const uint8x8_t output_activation_max_vector =
      vdup_n_u8(params.quantized_activation_max);

  // Process broadcast scalar.
  const uint8x8_t input1_val_original = vdup_n_u8(input1_data);
  const int16x8_t input1_val_s16 =
      vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
  const int16x8_t input1_val =
      vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
  const int16x4_t input1_val_high = vget_high_s16(input1_val);
  const int16x4_t input1_val_low = vget_low_s16(input1_val);
  int32x4_t x11 = vmovl_s16(input1_val_low);
  int32x4_t x12 = vmovl_s16(input1_val_high);
  x11 = vshlq_s32(x11, left_shift_dup);
  x12 = vshlq_s32(x12, left_shift_dup);
  x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
  x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
  const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
  x11 = vshlq_s32(x11, input1_shift_dup);
  x12 = vshlq_s32(x12, input1_shift_dup);

  for (; i <= size - 8; i += 8) {
    const uint8x8_t input2_val_original = vld1_u8(input2_data + i);
    const int16x8_t input2_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
    const int16x8_t input2_val =
        vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
    const int16x4_t input2_val_high = vget_high_s16(input2_val);
    const int16x4_t input2_val_low = vget_low_s16(input2_val);
    int32x4_t x21 = vmovl_s16(input2_val_low);
    int32x4_t x22 = vmovl_s16(input2_val_high);
    x21 = vshlq_s32(x21, left_shift_dup);
    x22 = vshlq_s32(x22, left_shift_dup);
    x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
    x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
    const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
    x21 = vshlq_s32(x21, input2_shift_dup);
    x22 = vshlq_s32(x22, input2_shift_dup);
    int32x4_t s1 = vaddq_s32(x11, x21);
    int32x4_t s2 = vaddq_s32(x12, x22);
    s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
    s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
    s1 = RoundingDivideByPOT(s1, -params.output_shift);
    s2 = RoundingDivideByPOT(s2, -params.output_shift);
    const int16x4_t s1_narrowed = vmovn_s32(s1);
    const int16x4_t s2_narrowed = vmovn_s32(s2);
    const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
                                  vdupq_n_s16(params.output_offset));
    const uint8x8_t clamped =
        vmax_u8(output_activation_min_vector,
                vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
    vst1_u8(output_data + i, clamped);
  }
#endif  // NEON

  if (i < size) {
    // Process broadcast scalar.
    const int32 input1_val = params.input1_offset + input1_data;
    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
    const int32 scaled_input1_val =
        MultiplyByQuantizedMultiplierSmallerThanOneExp(
            shifted_input1_val, params.input1_multiplier, params.input1_shift);

    for (; i < size; ++i) {
      const int32 input2_val = params.input2_offset + input2_data[i];
      const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
      const int32 scaled_input2_val =
          MultiplyByQuantizedMultiplierSmallerThanOneExp(
              shifted_input2_val, params.input2_multiplier,
              params.input2_shift);
      const int32 raw_sum = scaled_input1_val + scaled_input2_val;
      const int32 raw_output =
          MultiplyByQuantizedMultiplierSmallerThanOneExp(
              raw_sum, params.output_multiplier, params.output_shift) +
          params.output_offset;
      const int32 clamped_output =
          std::min(params.quantized_activation_max,
                   std::max(params.quantized_activation_min, raw_output));
      output_data[i] = static_cast<uint8>(clamped_output);
    }
  }
}

// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
                               float broadcast_value, const float* input2_data,
                               float* output_data) {
  int i = 0;
#ifdef USE_NEON
  const float32x4_t output_activation_min_vector =
      vdupq_n_f32(params.float_activation_min);
  const float32x4_t output_activation_max_vector =
      vdupq_n_f32(params.float_activation_max);
  const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value);
  for (; i <= size - 4; i += 4) {
    const float32x4_t input2_val_original = vld1q_f32(input2_data + i);

    const float32x4_t output =
        vaddq_f32(input2_val_original, broadcast_value_dup);

    const float32x4_t clamped =
        vmaxq_f32(output_activation_min_vector,
                  vminq_f32(output_activation_max_vector, output));
    vst1q_f32(output_data + i, clamped);
  }
#endif  // NEON

  for (; i < size; ++i) {
    auto x = broadcast_value + input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(
        x, params.float_activation_min, params.float_activation_max);
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const uint8* input1_data,
                const RuntimeShape& input2_shape, const uint8* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);
  ruy::profiler::ScopeLabel label("Add/8bit");
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16* input1_data,
                const RuntimeShape& input2_shape, const int16* input2_data,
                const RuntimeShape& output_shape, int16* output_data) {
  ruy::profiler::ScopeLabel label("Add/Int16");
  TFLITE_DCHECK_LE(params.quantized_activation_min,
                   params.quantized_activation_max);

  const int input1_shift = params.input1_shift;
  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  const int16 output_activation_min = params.quantized_activation_min;
  const int16 output_activation_max = params.quantized_activation_max;

  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
  TFLITE_DCHECK_LE(input1_shift, 0);
  TFLITE_DCHECK_LE(params.input2_shift, 0);
  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
  const int input_right_shift =
      input1_shift == 0 ? -params.input2_shift : -input1_shift;
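
  // Exactly one input carries a nonzero (negative) shift; right-shift that
  // input so both operands share the same Q0.15 scale before the saturating
  // add.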
  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
    F0 scaled_input = F0::FromRaw(
        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
    F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
    const int16 raw_output = result.raw();
    const int16 clamped_output = std::min(
        output_activation_max, std::max(output_activation_min, raw_output));
    output_data[i] = clamped_output;
  }
}

inline void Add(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int32* input1_data,
                const RuntimeShape& input2_shape, const int32* input2_data,
                const RuntimeShape& output_shape, int32* output_data) {
  ruy::profiler::ScopeLabel label("Add/int32");

  auto input1_map = MapAsVector(input1_data, input1_shape);
  auto input2_map = MapAsVector(input2_data, input2_shape);
  auto output_map = MapAsVector(output_data, output_shape);
  if (input1_shape == input2_shape) {
    output_map.array() = input1_map.array() + input2_map.array();
  } else if (input2_shape.FlatSize() == 1) {
    auto scalar = input2_data[0];
    output_map.array() = input1_map.array() + scalar;
  } else if (input1_shape.FlatSize() == 1) {
    auto scalar = input1_data[0];
    output_map.array() = scalar + input2_map.array();
  } else {
    reference_ops::BroadcastAdd4DSlow(params, input1_shape, input1_data,
                                      input2_shape, input2_data, output_shape,
                                      output_data);
    return;
  }
  output_map = output_map.cwiseMax(params.quantized_activation_min);
  output_map = output_map.cwiseMin(params.quantized_activation_max);
}

template <typename T>
inline void BroadcastAddDispatch(
    const ArithmeticParams& params, const RuntimeShape& input1_shape,
    const T* input1_data, const RuntimeShape& input2_shape,
    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
  if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) {
    return BroadcastAdd4DSlow(params, input1_shape, input1_data, input2_shape,
                              input2_data, output_shape, output_data);
  }
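
  // Non-generic broadcast shapes are handled by BinaryBroadcastFiveFold,
  // which walks the five-fold broadcast structure and calls the elementwise
  // kernel or the scalar-broadcast kernel for the innermost runs.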
  BinaryBroadcastFiveFold(
      params, input1_shape, input1_data, input2_shape, input2_data,
      output_shape, output_data,
      static_cast<void (*)(int, const ArithmeticParams&, const T*, const T*,
                           T*)>(AddElementwise),
      static_cast<void (*)(int, const ArithmeticParams&, T, const T*, T*)>(
          AddScalarBroadcast));
}

inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const uint8* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const uint8* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 uint8* output_data) {
  BroadcastAddDispatch(unswitched_params, unswitched_input1_shape,
                       unswitched_input1_data, unswitched_input2_shape,
                       unswitched_input2_data, output_shape, output_data);
}

inline void BroadcastAddFivefold(const ArithmeticParams& params,
                                 const RuntimeShape& unswitched_input1_shape,
                                 const float* unswitched_input1_data,
                                 const RuntimeShape& unswitched_input2_shape,
                                 const float* unswitched_input2_data,
                                 const RuntimeShape& output_shape,
                                 float* output_data) {
  BroadcastAddDispatch(params, unswitched_input1_shape, unswitched_input1_data,
                       unswitched_input2_shape, unswitched_input2_data,
                       output_shape, output_data);
}

inline void MulElementwise(int size, const ArithmeticParams& params,
                           const float* input1_data, const float* input2_data,
                           float* output_data) {
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;

  int i = 0;
#ifdef USE_NEON
  const auto activation_min = vdupq_n_f32(output_activation_min);
  const auto activation_max = vdupq_n_f32(output_activation_max);
  for (; i <= size - 16; i += 16) {
    auto a10 = vld1q_f32(input1_data + i);
    auto a11 = vld1q_f32(input1_data + i + 4);
    auto a12 = vld1q_f32(input1_data + i + 8);
    auto a13 = vld1q_f32(input1_data + i + 12);
    auto a20 = vld1q_f32(input2_data + i);
    auto a21 = vld1q_f32(input2_data + i + 4);
    auto a22 = vld1q_f32(input2_data + i + 8);
    auto a23 = vld1q_f32(input2_data + i + 12);
    auto x0 = vmulq_f32(a10, a20);
    auto x1 = vmulq_f32(a11, a21);
    auto x2 = vmulq_f32(a12, a22);
    auto x3 = vmulq_f32(a13, a23);

    x0 = vmaxq_f32(activation_min, x0);
    x1 = vmaxq_f32(activation_min, x1);
    x2 = vmaxq_f32(activation_min, x2);
    x3 = vmaxq_f32(activation_min, x3);
    x0 = vminq_f32(activation_max, x0);
    x1 = vminq_f32(activation_max, x1);
    x2 = vminq_f32(activation_max, x2);
    x3 = vminq_f32(activation_max, x3);

    vst1q_f32(output_data + i, x0);
    vst1q_f32(output_data + i + 4, x1);
    vst1q_f32(output_data + i + 8, x2);
    vst1q_f32(output_data + i + 12, x3);
  }
  for (; i <= size - 4; i += 4) {
    auto a1 = vld1q_f32(input1_data + i);
    auto a2 = vld1q_f32(input2_data + i);
    auto x = vmulq_f32(a1, a2);

    x = vmaxq_f32(activation_min, x);
    x = vminq_f32(activation_max, x);

    vst1q_f32(output_data + i, x);
  }
#endif  // NEON

  for (; i < size; i++) {
    auto x = input1_data[i] * input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
                                                  output_activation_max);
  }
}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const float* input1_data,
                const RuntimeShape& input2_shape, const float* input2_data,
                const RuntimeShape& output_shape, float* output_data) {
  ruy::profiler::ScopeLabel label("Mul");

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int32* input1_data,
                const RuntimeShape& input2_shape, const int32* input2_data,
                const RuntimeShape& output_shape, int32* output_data) {
  ruy::profiler::ScopeLabel label("Mul/int32/activation");

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  for (int i = 0; i < flat_size; ++i) {
    output_data[i] = ActivationFunctionWithMinMax(
        input1_data[i] * input2_data[i], output_activation_min,
        output_activation_max);
  }
}

inline void MulNoActivation(const ArithmeticParams& params,
                            const RuntimeShape& input1_shape,
                            const int32* input1_data,
                            const RuntimeShape& input2_shape,
                            const int32* input2_data,
                            const RuntimeShape& output_shape,
                            int32* output_data) {
  ruy::profiler::ScopeLabel label("Mul/int32");

  auto input1_map = MapAsVector(input1_data, input1_shape);
  auto input2_map = MapAsVector(input2_data, input2_shape);
  auto output_map = MapAsVector(output_data, output_shape);
  if (input1_shape == input2_shape) {
    output_map.array() = input1_map.array() * input2_map.array();
  } else if (input2_shape.FlatSize() == 1) {
    auto scalar = input2_data[0];
    output_map.array() = input1_map.array() * scalar;
  } else if (input1_shape.FlatSize() == 1) {
    auto scalar = input1_data[0];
    output_map.array() = scalar * input2_map.array();
  } else {
    reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data,
                                      input2_shape, input2_data, output_shape,
                                      output_data);
  }
}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16* input1_data,
                const RuntimeShape& input2_shape, const int16* input2_data,
                const RuntimeShape& output_shape, int16* output_data) {
  ruy::profiler::ScopeLabel label("Mul/Int16/NoActivation");
  // This is a copy of the reference implementation. We do not currently have a
  // properly optimized version.

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 unclamped_result =
        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
    output_data[i] = unclamped_result.raw();
  }
}

inline void Mul(const ArithmeticParams& params,
                const RuntimeShape& input1_shape, const int16* input1_data,
                const RuntimeShape& input2_shape, const int16* input2_data,
                const RuntimeShape& output_shape, uint8* output_data) {
  ruy::profiler::ScopeLabel label("Mul/Int16Uint8");
  // This is a copy of the reference implementation. We do not currently have a
  // properly optimized version.
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  const int32 output_offset = params.output_offset;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

  const int flat_size =
      MatchingElementsSize(input1_shape, input2_shape, output_shape);

  for (int i = 0; i < flat_size; i++) {
    // F0 uses 0 integer bits, range [-1, 1].
    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;

    F0 unclamped_result =
        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
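    // Divide the Q0.15 product by 2^8 so that +/-1.0 maps to roughly +/-128
    // before the output offset is applied.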
    int16 rescaled_result =
        gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
    int16 clamped_result =
        std::min<int16>(output_activation_max - output_offset, rescaled_result);
    clamped_result =
        std::max<int16>(output_activation_min - output_offset, clamped_result);
    output_data[i] = output_offset + clamped_result;
  }
}

// Element-wise mul that can often be used for inner loop of broadcast Mul as
// well as the non-broadcast Mul.
inline void MulElementwise(int size, const ArithmeticParams& params,
                           const uint8* input1_data, const uint8* input2_data,
                           uint8* output_data) {
  int i = 0;
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  TFLITE_DCHECK_GT(params.output_offset, -256);
  TFLITE_DCHECK_LT(params.output_offset, 256);
#ifdef USE_NEON
  const auto input1_offset_vector = vdupq_n_s16(params.input1_offset);
  const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
  const auto output_offset_vector = vdupq_n_s16(params.output_offset);
  const auto output_activation_min_vector =
      vdup_n_u8(params.quantized_activation_min);
  const auto output_activation_max_vector =
      vdup_n_u8(params.quantized_activation_max);
  const int left_shift = std::max(0, params.output_shift);
  const int right_shift = std::max(0, -params.output_shift);
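  // A positive output_shift is applied as a left shift before the fixed-point
  // multiply; a negative one becomes a rounding right shift afterwards.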
  const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
  for (; i <= size - 8; i += 8) {
    // We load / store 8 at a time, multiplying as two sets of 4 int32s.
    const auto input1_val_original = vld1_u8(input1_data + i);
    const auto input2_val_original = vld1_u8(input2_data + i);
    const auto input1_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
    const auto input2_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
    const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector);
    const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);

    const auto input1_val_low = vget_low_s16(input1_val);
    const auto input1_val_high = vget_high_s16(input1_val);
    const auto input2_val_low = vget_low_s16(input2_val);
    const auto input2_val_high = vget_high_s16(input2_val);

    auto p1 = vmull_s16(input2_val_low, input1_val_low);
    auto p2 = vmull_s16(input2_val_high, input1_val_high);

    p1 = vshlq_s32(p1, left_shift_vec);
    p2 = vshlq_s32(p2, left_shift_vec);
    p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
    p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
    using gemmlowp::RoundingDivideByPOT;
    p1 = RoundingDivideByPOT(p1, right_shift);
    p2 = RoundingDivideByPOT(p2, right_shift);

    const auto p1_narrowed = vqmovn_s32(p1);
    const auto p2_narrowed = vqmovn_s32(p2);
    const auto p =
        vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
    const auto clamped =
        vmax_u8(output_activation_min_vector,
                vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
    vst1_u8(output_data + i, clamped);
  }
#endif  // NEON

  for (; i < size; ++i) {
    const int32 input1_val = params.input1_offset + input1_data[i];
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplier(input1_val * input2_val,
                                      params.output_multiplier,
                                      params.output_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

// Broadcast mul that can often be used for inner loop of broadcast Mul.
inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
                               const uint8 broadcast_value,
                               const uint8* input2_data, uint8* output_data) {
  const int16 input1_val = params.input1_offset + broadcast_value;

  int i = 0;
  TFLITE_DCHECK_GT(params.input1_offset, -256);
  TFLITE_DCHECK_LT(params.input1_offset, 256);
  TFLITE_DCHECK_GT(params.input2_offset, -256);
  TFLITE_DCHECK_LT(params.input2_offset, 256);
  TFLITE_DCHECK_GT(params.output_offset, -256);
  TFLITE_DCHECK_LT(params.output_offset, 256);
#ifdef USE_NEON
  const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
  const auto output_offset_vector = vdupq_n_s16(params.output_offset);
  const auto output_activation_min_vector =
      vdup_n_u8(params.quantized_activation_min);
  const auto output_activation_max_vector =
      vdup_n_u8(params.quantized_activation_max);
  const int left_shift = std::max(0, params.output_shift);
  const int right_shift = std::max(0, -params.output_shift);
  const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
  for (; i <= size - 8; i += 8) {
    // We load / store 8 at a time, multiplying as two sets of 4 int32s.
    const auto input2_val_original = vld1_u8(input2_data + i);
    const auto input2_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
    const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);

    const auto input2_val_low = vget_low_s16(input2_val);
    const auto input2_val_high = vget_high_s16(input2_val);

    auto p1 = vmull_n_s16(input2_val_low, input1_val);
    auto p2 = vmull_n_s16(input2_val_high, input1_val);

    p1 = vshlq_s32(p1, left_shift_vec);
    p2 = vshlq_s32(p2, left_shift_vec);
    p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
    p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
    using gemmlowp::RoundingDivideByPOT;
    p1 = RoundingDivideByPOT(p1, right_shift);
    p2 = RoundingDivideByPOT(p2, right_shift);

    const auto p1_narrowed = vmovn_s32(p1);
    const auto p2_narrowed = vmovn_s32(p2);
    const auto p =
        vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
    const auto clamped =
        vmax_u8(output_activation_min_vector,
                vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
    vst1_u8(output_data + i, clamped);
  }
#endif  // NEON

  for (; i < size; ++i) {
    const int32 input2_val = params.input2_offset + input2_data[i];
    const int32 unclamped_result =
        params.output_offset +
        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
                                                       params.output_multiplier,
                                                       params.output_shift);
    const int32 clamped_output =
        std::min(params.quantized_activation_max,
                 std::max(params.quantized_activation_min, unclamped_result));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

// Broadcast mul that can often be used for inner loop of broadcast Mul.
// This function will handle scalar_value (LHS) * vector_values (RHS).
// Since it's a float function, the input params do not matter here.
inline void MulSimpleBroadcast(int size, const ArithmeticParams& params,
                               const float broadcast_value,
                               const float* input2_data, float* output_data) {
  int i = 0;
#ifdef USE_NEON
  const float32x4_t output_activation_min_vector =
      vdupq_n_f32(params.float_activation_min);
  const float32x4_t output_activation_max_vector =
      vdupq_n_f32(params.float_activation_max);
  const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value);
  for (; i <= size - 4; i += 4) {
    const float32x4_t input2_val_original = vld1q_f32(input2_data + i);

    const float32x4_t output =
        vmulq_f32(input2_val_original, broadcast_value_dup);

    const float32x4_t clamped =
        vmaxq_f32(output_activation_min_vector,
                  vminq_f32(output_activation_max_vector, output));
    vst1q_f32(output_data + i, clamped);
  }
#endif  // NEON

  for (; i < size; ++i) {
    float x = broadcast_value * input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(
        x, params.float_activation_min, params.float_activation_max);
  }
}
|
|
|
|
inline void Mul(const ArithmeticParams& params,
|
|
const RuntimeShape& input1_shape, const uint8* input1_data,
|
|
const RuntimeShape& input2_shape, const uint8* input2_data,
|
|
const RuntimeShape& output_shape, uint8* output_data) {
|
|
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
|
params.quantized_activation_max);
|
|
ruy::profiler::ScopeLabel label("Mul/8bit");
|
|
const int flat_size =
|
|
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
|
|
|
MulElementwise(flat_size, params, input1_data, input2_data, output_data);
|
|
}
|
|
|
|
template <typename T>
|
|
inline void BroadcastMulDispatch(
|
|
const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
|
const T* input1_data, const RuntimeShape& input2_shape,
|
|
const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
|
|
if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) {
|
|
return BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
|
|
input2_data, output_shape, output_data);
|
|
}
|
|
|
|
BinaryBroadcastFiveFold(
|
|
params, input1_shape, input1_data, input2_shape, input2_data,
|
|
output_shape, output_data,
|
|
static_cast<void (*)(int, const ArithmeticParams&, const T*, const T*,
|
|
T*)>(MulElementwise),
|
|
static_cast<void (*)(int, const ArithmeticParams&, T, const T*, T*)>(
|
|
MulSimpleBroadcast));
|
|
}
|
|
|
|
inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
|
|
const RuntimeShape& unswitched_input1_shape,
|
|
const uint8* unswitched_input1_data,
|
|
const RuntimeShape& unswitched_input2_shape,
|
|
const uint8* unswitched_input2_data,
|
|
const RuntimeShape& output_shape,
|
|
uint8* output_data) {
|
|
BroadcastMulDispatch(unswitched_params, unswitched_input1_shape,
|
|
unswitched_input1_data, unswitched_input2_shape,
|
|
unswitched_input2_data, output_shape, output_data);
|
|
}
|
|
|
|
inline void BroadcastMulFivefold(const ArithmeticParams& params,
|
|
const RuntimeShape& unswitched_input1_shape,
|
|
const float* unswitched_input1_data,
|
|
const RuntimeShape& unswitched_input2_shape,
|
|
const float* unswitched_input2_data,
|
|
const RuntimeShape& output_shape,
|
|
float* output_data) {
|
|
BroadcastMulDispatch(params, unswitched_input1_shape, unswitched_input1_data,
|
|
unswitched_input2_shape, unswitched_input2_data,
|
|
output_shape, output_data);
|
|
}
|
|
|
|
|
|
|
|
// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
|
|
// dimensionality if the runtime code does a single loop over one dimension
|
|
// that handles broadcasting as the base case. The code generator would then
|
|
// generate max(D1, D2) nested for loops.
|
|
// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from
|
|
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
|
|
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
|
|
// reference_ops.h.
|
|
template <typename T, int N = 5>
|
|
void BroadcastDivSlow(const ArithmeticParams& params,
|
|
const RuntimeShape& unextended_input1_shape,
|
|
const T* input1_data,
|
|
const RuntimeShape& unextended_input2_shape,
|
|
const T* input2_data,
|
|
const RuntimeShape& unextended_output_shape,
|
|
T* output_data) {
|
|
ruy::profiler::ScopeLabel label("BroadcastDivSlow");
|
|
T output_activation_min;
|
|
T output_activation_max;
|
|
GetActivationParams(params, &output_activation_min, &output_activation_max);
|
|
|
|
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
|
|
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
|
|
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
|
|
|
|
NdArrayDesc<N> desc1;
|
|
NdArrayDesc<N> desc2;
|
|
NdArrayDesc<N> output_desc;
|
|
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
|
|
unextended_input2_shape, &desc1, &desc2);
|
|
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
|
|
&output_desc);
|
|
|
|
// In Tensorflow, the dimensions are canonically named (batch_number, row,
|
|
// col, channel), with extents (batches, height, width, depth), with the
|
|
// trailing dimension changing most rapidly (channels has the smallest stride,
|
|
// typically 1 element).
|
|
//
|
|
// In generated C code, we store arrays with the dimensions reversed. The
|
|
// first dimension has smallest stride.
|
|
//
|
|
// We name our variables by their Tensorflow convention, but generate C code
|
|
// nesting loops such that the innermost loop has the smallest stride for the
|
|
// best cache behavior.
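// For example, with extents (batches=2, height=3, width=4, depth=8), the
// channel index varies fastest in memory, so element (b, y, x, c) lives at
// flat offset ((b * 3 + y) * 4 + x) * 8 + c.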
|
|
auto div_func = [&](int indexes[N]) {
|
|
output_data[SubscriptToIndex(output_desc, indexes)] =
|
|
ActivationFunctionWithMinMax(
|
|
input1_data[SubscriptToIndex(desc1, indexes)] /
|
|
input2_data[SubscriptToIndex(desc2, indexes)],
|
|
output_activation_min, output_activation_max);
|
|
};
|
|
NDOpsHelper<N>(output_desc, div_func);
|
|
}
|
|
|
|
// TODO: BroadcastDiv is intentionally duplicated from reference_ops.h.
|
|
// For more details see the comment above the generic version of
|
|
// BroadcastDivSlow.
|
|
template <int N = 5>
|
|
inline void BroadcastDivSlow(const ArithmeticParams& params,
|
|
const RuntimeShape& unextended_input1_shape,
|
|
const uint8* input1_data,
|
|
const RuntimeShape& unextended_input2_shape,
|
|
const uint8* input2_data,
|
|
const RuntimeShape& unextended_output_shape,
|
|
uint8* output_data) {
|
|
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), N);
|
|
TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), N);
|
|
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), N);
|
|
|
|
NdArrayDesc<N> desc1;
|
|
NdArrayDesc<N> desc2;
|
|
NdArrayDesc<N> output_desc;
|
|
NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
|
|
unextended_input2_shape, &desc1, &desc2);
|
|
CopyDimsToDesc(RuntimeShape::ExtendedShape(N, unextended_output_shape),
|
|
&output_desc);
|
|
|
|
TFLITE_DCHECK_GT(params.input1_offset, -256);
|
|
TFLITE_DCHECK_LT(params.input1_offset, 256);
|
|
TFLITE_DCHECK_GT(params.input2_offset, -256);
|
|
TFLITE_DCHECK_LT(params.input2_offset, 256);
|
|
TFLITE_DCHECK_GT(params.output_offset, -256);
|
|
TFLITE_DCHECK_LT(params.output_offset, 256);
|
|
|
|
auto div_func = [&](int indexes[N]) {
|
|
const int32 input1_val =
|
|
params.input1_offset + input1_data[SubscriptToIndex(desc1, indexes)];
|
|
const int32 input2_val =
|
|
params.input2_offset + input2_data[SubscriptToIndex(desc2, indexes)];
|
|
TFLITE_DCHECK_NE(input2_val, 0);
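// The division below is done in fixed-point: input2_val is replaced by a
// fixed-point approximation of its reciprocal, with recip_shift recording the
// scaling applied. That scaling, together with the headroom taken out of
// input1_val, is compensated for in total_shift further down.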
|
|
int recip_shift;
|
|
const int32 input2_inv =
|
|
(input2_val > 0) ? GetReciprocal(input2_val, 31, &recip_shift)
|
|
: -GetReciprocal(-input2_val, 31, &recip_shift);
|
|
const int headroom = CountLeadingSignBits(input1_val);
|
|
const int32 unscaled_quotient = MultiplyByQuantizedMultiplierGreaterThanOne(
|
|
input1_val, input2_inv, headroom);
|
|
const int total_shift = params.output_shift - recip_shift - headroom;
|
|
const int32 unclamped_result =
|
|
params.output_offset +
|
|
MultiplyByQuantizedMultiplierSmallerThanOneExp(
|
|
unscaled_quotient, params.output_multiplier, total_shift);
|
|
const int32 clamped_output =
|
|
std::min(params.quantized_activation_max,
|
|
std::max(params.quantized_activation_min, unclamped_result));
|
|
output_data[SubscriptToIndex(output_desc, indexes)] =
|
|
static_cast<uint8>(clamped_output);
|
|
};
|
|
NDOpsHelper<N>(output_desc, div_func);
|
|
}
|
|
|
|
// TODO(aselle): This is not actually optimized yet.
|
|
inline void SubNonBroadcast(const ArithmeticParams& params,
|
|
const RuntimeShape& input1_shape,
|
|
const float* input1_data,
|
|
const RuntimeShape& input2_shape,
|
|
const float* input2_data,
|
|
const RuntimeShape& output_shape,
|
|
float* output_data) {
|
|
ruy::profiler::ScopeLabel label("SubNonBroadcast");
|
|
const int flat_size =
|
|
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
|
for (int i = 0; i < flat_size; ++i) {
|
|
output_data[i] = ActivationFunctionWithMinMax(
|
|
input1_data[i] - input2_data[i], params.float_activation_min,
|
|
params.float_activation_max);
|
|
}
|
|
}
|
|
|
|
inline void SubWithActivation(const ArithmeticParams& params,
|
|
const RuntimeShape& input1_shape,
|
|
const int32* input1_data,
|
|
const RuntimeShape& input2_shape,
|
|
const int32* input2_data,
|
|
const RuntimeShape& output_shape,
|
|
int32* output_data) {
|
|
ruy::profiler::ScopeLabel label("SubWithActivation/int32");
|
|
const int flat_size =
|
|
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
|
for (int i = 0; i < flat_size; ++i) {
|
|
output_data[i] = ActivationFunctionWithMinMax(
|
|
input1_data[i] - input2_data[i], params.quantized_activation_min,
|
|
params.quantized_activation_max);
|
|
}
|
|
}
|
|
|
|
inline void SubWithActivation(const ArithmeticParams& params,
|
|
const RuntimeShape& input1_shape,
|
|
const float* input1_data,
|
|
const RuntimeShape& input2_shape,
|
|
const float* input2_data,
|
|
const RuntimeShape& output_shape,
|
|
float* output_data) {
|
|
ruy::profiler::ScopeLabel label("SubWithActivation/float");
|
|
const int flat_size =
|
|
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
|
for (int i = 0; i < flat_size; ++i) {
|
|
output_data[i] = ActivationFunctionWithMinMax(
|
|
input1_data[i] - input2_data[i], params.float_activation_min,
|
|
params.float_activation_max);
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
|
|
const T* input1_data, const RuntimeShape& input2_shape,
|
|
const T* input2_data, const RuntimeShape& output_shape,
|
|
T* output_data) {
|
|
ruy::profiler::ScopeLabel label("Sub");
|
|
|
|
auto input1_map = MapAsVector(input1_data, input1_shape);
|
|
auto input2_map = MapAsVector(input2_data, input2_shape);
|
|
auto output_map = MapAsVector(output_data, output_shape);
|
|
if (input1_shape == input2_shape) {
|
|
output_map.array() = input1_map.array() - input2_map.array();
|
|
} else if (input1_shape.FlatSize() == 1) {
|
|
auto scalar = input1_data[0];
|
|
output_map.array() = scalar - input2_map.array();
|
|
} else if (input2_shape.FlatSize() == 1) {
|
|
auto scalar = input2_data[0];
|
|
output_map.array() = input1_map.array() - scalar;
|
|
} else {
|
|
BroadcastSubSlow(params, input1_shape, input1_data, input2_shape,
|
|
input2_data, output_shape, output_data);
|
|
}
|
|
}
|
|
|
|
inline void LstmCell(
|
|
const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
|
|
const float* input_data, const RuntimeShape& unextended_prev_activ_shape,
|
|
const float* prev_activ_data, const RuntimeShape& weights_shape,
|
|
const float* weights_data, const RuntimeShape& unextended_bias_shape,
|
|
const float* bias_data, const RuntimeShape& unextended_prev_state_shape,
|
|
const float* prev_state_data,
|
|
const RuntimeShape& unextended_output_state_shape, float* output_state_data,
|
|
const RuntimeShape& unextended_output_activ_shape, float* output_activ_data,
|
|
const RuntimeShape& unextended_concat_temp_shape, float* concat_temp_data,
|
|
const RuntimeShape& unextended_activ_temp_shape, float* activ_temp_data,
|
|
CpuBackendContext* cpu_backend_context) {
|
|
ruy::profiler::ScopeLabel label("LstmCell");
|
|
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
|
|
const RuntimeShape input_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
|
const RuntimeShape prev_activ_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
|
|
const RuntimeShape bias_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_bias_shape);
|
|
const RuntimeShape prev_state_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
|
|
const RuntimeShape output_state_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
|
|
const RuntimeShape output_activ_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
|
|
const RuntimeShape concat_temp_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
|
|
const RuntimeShape activ_temp_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
|
|
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
|
|
|
|
const int weights_dim_count = weights_shape.DimensionsCount();
|
|
MatchingDim( // batches
|
|
input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
|
|
output_state_shape, 0, output_activ_shape, 0);
|
|
MatchingDim( // height
|
|
input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
|
|
output_state_shape, 1, output_activ_shape, 1);
|
|
MatchingDim( // width
|
|
input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
|
|
output_state_shape, 2, output_activ_shape, 2);
|
|
const int input_depth = input_shape.Dims(3);
|
|
const int prev_activ_depth = prev_activ_shape.Dims(3);
|
|
const int total_input_depth = prev_activ_depth + input_depth;
|
|
TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
|
|
total_input_depth);
|
|
TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
|
|
const int intern_activ_depth =
|
|
MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
|
|
TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
|
|
intern_activ_depth * total_input_depth);
|
|
TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
|
|
const int output_depth =
|
|
MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
|
|
3, output_activ_shape, 3);
|
|
TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
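// The fully-connected output computed below holds the pre-activations of the
// four LSTM gates (input, input modulation, forget, output) stacked along the
// depth dimension, hence intern_activ_depth == 4 * output_depth.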
|
|
|
|
// Concatenate prev_activ and input data together
|
|
std::vector<float const*> concat_input_arrays_data;
|
|
std::vector<RuntimeShape const*> concat_input_arrays_shapes;
|
|
concat_input_arrays_data.push_back(input_data);
|
|
concat_input_arrays_data.push_back(prev_activ_data);
|
|
concat_input_arrays_shapes.push_back(&input_shape);
|
|
concat_input_arrays_shapes.push_back(&prev_activ_shape);
|
|
tflite::ConcatenationParams concat_params;
|
|
concat_params.axis = 3;
|
|
concat_params.inputs_count = concat_input_arrays_data.size();
|
|
Concatenation(concat_params, &(concat_input_arrays_shapes[0]),
|
|
&(concat_input_arrays_data[0]), concat_temp_shape,
|
|
concat_temp_data);
|
|
|
|
// Fully connected
|
|
tflite::FullyConnectedParams fc_params;
|
|
fc_params.float_activation_min = std::numeric_limits<float>::lowest();
|
|
fc_params.float_activation_max = std::numeric_limits<float>::max();
|
|
fc_params.lhs_cacheable = false;
|
|
fc_params.rhs_cacheable = false;
|
|
FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape,
|
|
weights_data, bias_shape, bias_data, activ_temp_shape,
|
|
activ_temp_data, cpu_backend_context);
|
|
|
|
// Map raw arrays to Eigen arrays so we can use Eigen's optimized array
|
|
// operations.
|
|
ArrayMap<float> activ_temp_map =
|
|
MapAsArrayWithLastDimAsRows(activ_temp_data, activ_temp_shape);
|
|
auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth,
|
|
activ_temp_map.cols());
|
|
auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth,
|
|
activ_temp_map.cols());
|
|
auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth,
|
|
activ_temp_map.cols());
|
|
auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth,
|
|
activ_temp_map.cols());
|
|
ArrayMap<const float> prev_state_map =
|
|
MapAsArrayWithLastDimAsRows(prev_state_data, prev_state_shape);
|
|
ArrayMap<float> output_state_map =
|
|
MapAsArrayWithLastDimAsRows(output_state_data, output_state_shape);
|
|
ArrayMap<float> output_activ_map =
|
|
MapAsArrayWithLastDimAsRows(output_activ_data, output_activ_shape);
|
|
|
|
// Combined memory state and final output calculation
|
|
ruy::profiler::ScopeLabel label2("MemoryStateAndFinalOutput");
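// In equation form, with sigma denoting the logistic function:
//   new_state = sigma(input_gate) * tanh(new_input) +
//               sigma(forget_gate) * prev_state
//   new_activ = sigma(output_gate) * tanh(new_state)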
|
|
output_state_map =
|
|
input_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
|
|
new_input_sm.tanh() +
|
|
forget_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
|
|
prev_state_map;
|
|
output_activ_map =
|
|
output_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
|
|
output_state_map.tanh();
|
|
}
|
|
|
|
template <int StateIntegerBits>
|
|
inline void LstmCell(
|
|
const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
|
|
const uint8* input_data_uint8,
|
|
const RuntimeShape& unextended_prev_activ_shape,
|
|
const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape,
|
|
const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape,
|
|
const int32* bias_data_int32,
|
|
const RuntimeShape& unextended_prev_state_shape,
|
|
const int16* prev_state_data_int16,
|
|
const RuntimeShape& unextended_output_state_shape,
|
|
int16* output_state_data_int16,
|
|
const RuntimeShape& unextended_output_activ_shape,
|
|
uint8* output_activ_data_uint8,
|
|
const RuntimeShape& unextended_concat_temp_shape,
|
|
uint8* concat_temp_data_uint8,
|
|
const RuntimeShape& unextended_activ_temp_shape,
|
|
int16* activ_temp_data_int16, CpuBackendContext* cpu_backend_context) {
|
|
ruy::profiler::ScopeLabel label(
|
|
"LstmCell/quantized (8bit external, 16bit internal)");
|
|
int32 weights_zero_point = params.weights_zero_point;
|
|
int32 accum_multiplier = params.accum_multiplier;
|
|
int accum_shift = params.accum_shift;
|
|
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
|
|
const RuntimeShape input_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
|
const RuntimeShape prev_activ_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
|
|
const RuntimeShape bias_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_bias_shape);
|
|
const RuntimeShape prev_state_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
|
|
const RuntimeShape output_state_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
|
|
const RuntimeShape output_activ_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
|
|
const RuntimeShape concat_temp_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
|
|
const RuntimeShape activ_temp_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
|
|
TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
|
|
|
|
// Gather dimensions information, and perform consistency checks.
|
|
const int weights_dim_count = weights_shape.DimensionsCount();
|
|
const int outer_size = MatchingFlatSizeSkipDim(
|
|
input_shape, 3, prev_activ_shape, prev_state_shape, output_state_shape,
|
|
output_activ_shape);
|
|
const int input_depth = input_shape.Dims(3);
|
|
const int prev_activ_depth = prev_activ_shape.Dims(3);
|
|
const int total_input_depth = prev_activ_depth + input_depth;
|
|
TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1),
|
|
total_input_depth);
|
|
const int intern_activ_depth =
|
|
MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
|
|
TFLITE_DCHECK_EQ(weights_shape.FlatSize(),
|
|
intern_activ_depth * total_input_depth);
|
|
TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
|
|
TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
|
|
const int output_depth =
|
|
MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
|
|
3, output_activ_shape, 3);
|
|
TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
|
|
const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
|
|
const int fc_output_depth =
|
|
MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
|
|
const int fc_accum_depth = total_input_depth;
|
|
TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
|
|
|
|
// Depth-concatenate prev_activ and input data together.
|
|
uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
|
|
prev_activ_data_uint8};
|
|
const RuntimeShape* concat_input_arrays_shapes[2] = {&input_shape,
|
|
&prev_activ_shape};
|
|
tflite::ConcatenationParams concat_params;
|
|
concat_params.axis = 3;
|
|
concat_params.inputs_count = 2;
|
|
Concatenation(concat_params, concat_input_arrays_shapes,
|
|
concat_input_arrays_data, concat_temp_shape,
|
|
concat_temp_data_uint8);
|
|
|
|
// Implementation of the fully connected node inside the LSTM cell.
|
|
// The operands are 8-bit integers, the accumulators are internally 32bit
|
|
// integers, and the output is 16-bit fixed-point with 3 integer bits so
|
|
// the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
|
|
// is explained in the function comment above.
|
|
cpu_backend_gemm::MatrixParams<uint8> lhs_params;
|
|
lhs_params.rows = fc_output_depth;
|
|
lhs_params.cols = fc_accum_depth;
|
|
lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
|
|
lhs_params.zero_point = weights_zero_point;
|
|
cpu_backend_gemm::MatrixParams<uint8> rhs_params;
|
|
rhs_params.rows = fc_accum_depth;
|
|
rhs_params.cols = fc_batches;
|
|
rhs_params.order = cpu_backend_gemm::Order::kColMajor;
|
|
rhs_params.zero_point = 128;
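// The concatenated uint8 activations use a fixed zero point of 128: the output
// activations are written back below as 128 + clamped value, and the same
// convention is assumed here for the gemm right-hand side.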
|
|
cpu_backend_gemm::MatrixParams<int16> dst_params;
|
|
dst_params.rows = fc_output_depth;
|
|
dst_params.cols = fc_batches;
|
|
dst_params.order = cpu_backend_gemm::Order::kColMajor;
|
|
dst_params.zero_point = 0;
|
|
cpu_backend_gemm::GemmParams<int32, int16> gemm_params;
|
|
gemm_params.bias = bias_data_int32;
|
|
gemm_params.multiplier_fixedpoint = accum_multiplier;
|
|
gemm_params.multiplier_exponent = accum_shift;
|
|
cpu_backend_gemm::Gemm(
|
|
lhs_params, weights_data_uint8, rhs_params, concat_temp_data_uint8,
|
|
dst_params, activ_temp_data_int16, gemm_params, cpu_backend_context);
|
|
|
|
// Rest of the LSTM cell: tanh and logistic math functions, and some adds
|
|
// and muls, all done in 16-bit fixed-point.
|
|
const int16* input_gate_input_ptr = activ_temp_data_int16;
|
|
const int16* input_modulation_gate_input_ptr =
|
|
activ_temp_data_int16 + output_depth;
|
|
const int16* forget_gate_input_ptr = activ_temp_data_int16 + 2 * output_depth;
|
|
const int16* output_gate_input_ptr = activ_temp_data_int16 + 3 * output_depth;
|
|
const int16* prev_state_ptr = prev_state_data_int16;
|
|
int16* output_state_data_ptr = output_state_data_int16;
|
|
uint8* output_activ_data_ptr = output_activ_data_uint8;
|
|
|
|
for (int b = 0; b < outer_size; ++b) {
|
|
int c = 0;
|
|
#ifdef GEMMLOWP_NEON
|
|
for (; c <= output_depth - 8; c += 8) {
|
|
// Define the fixed-point data types that we will use here. All use
|
|
// int16 as the underlying integer type i.e. all are 16-bit fixed-point.
|
|
// They only differ by the number of integral vs. fractional bits,
|
|
// determining the range of values that they can represent.
|
|
//
|
|
// F0 uses 0 integer bits, range [-1, 1].
|
|
// This is the return type of math functions such as tanh, logistic,
|
|
// whose range is in [-1, 1].
|
|
using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
|
|
// F3 uses 3 integer bits, range [-8, 8].
|
|
// This is the range of the previous fully-connected node's output,
|
|
// which is our input here.
|
|
using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
|
|
// FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
|
|
// 2^StateIntegerBits]. It's used to represent the internal state, whose
|
|
// number of integer bits is currently dictated by the model. See comment
|
|
// on the StateIntegerBits template parameter above.
|
|
using FS = gemmlowp::FixedPoint<int16x8_t, StateIntegerBits>;
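// For example, F3 keeps 12 fractional bits, so the real value 1.0 is stored as
// the raw int16 value 1 << 12 == 4096, while F0 keeps 15 fractional bits and
// tops out just below 1.0 at 32767 / 32768.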
|
|
// Implementation of input gate, using fixed-point logistic function.
|
|
F3 input_gate_input = F3::FromRaw(vld1q_s16(input_gate_input_ptr));
|
|
input_gate_input_ptr += 8;
|
|
F0 input_gate_output = gemmlowp::logistic(input_gate_input);
|
|
// Implementation of input modulation gate, using fixed-point tanh
|
|
// function.
|
|
F3 input_modulation_gate_input =
|
|
F3::FromRaw(vld1q_s16(input_modulation_gate_input_ptr));
|
|
input_modulation_gate_input_ptr += 8;
|
|
F0 input_modulation_gate_output =
|
|
gemmlowp::tanh(input_modulation_gate_input);
|
|
// Implementation of forget gate, using fixed-point logistic function.
|
|
F3 forget_gate_input = F3::FromRaw(vld1q_s16(forget_gate_input_ptr));
|
|
forget_gate_input_ptr += 8;
|
|
F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
|
|
// Implementation of output gate, using fixed-point logistic function.
|
|
F3 output_gate_input = F3::FromRaw(vld1q_s16(output_gate_input_ptr));
|
|
output_gate_input_ptr += 8;
|
|
F0 output_gate_output = gemmlowp::logistic(output_gate_input);
|
|
// Implementation of internal multiplication nodes, still in fixed-point.
|
|
F0 input_times_input_modulation =
|
|
input_gate_output * input_modulation_gate_output;
|
|
FS prev_state = FS::FromRaw(vld1q_s16(prev_state_ptr));
|
|
prev_state_ptr += 8;
|
|
FS prev_state_times_forget_state = forget_gate_output * prev_state;
|
|
// Implementation of internal addition node, saturating.
|
|
FS new_state = gemmlowp::SaturatingAdd(
|
|
gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
|
|
prev_state_times_forget_state);
|
|
// Implementation of last internal Tanh node, still in fixed-point.
|
|
// Since a Tanh fixed-point implementation is specialized for a given
|
|
// number of integer bits, and each specialization can have a substantial
|
|
// code size, and we already used above a Tanh on an input with 3 integer
|
|
// bits, and per the table in the above function comment there is no
|
|
// significant accuracy to be lost by clamping to [-8, +8] for a
|
|
// 3-integer-bits representation, let us just do that. This helps people
|
|
// porting this to targets where code footprint must be minimized.
|
|
F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
|
|
F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
|
|
// Store the new internal state back to memory, as 16-bit integers.
|
|
// Note: here we store the original value with StateIntegerBits, not
|
|
// the rescaled 3-integer-bits value fed to tanh.
|
|
vst1q_s16(output_state_data_ptr, new_state.raw());
|
|
output_state_data_ptr += 8;
|
|
// Down-scale the output activations to 8-bit integers, saturating,
|
|
// and store back to memory.
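// output_activ_int16 is an F0 value in [-1, 1] whose raw int16 range is
// [-32768, 32767]; dividing the raw value by 2^8 maps it to [-128, 128],
// vqmovn_s16 saturates to int8, and adding 128 recenters it to the uint8
// output range [0, 255].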
|
|
int16x8_t rescaled_output_activ =
|
|
gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
|
|
int8x8_t int8_output_activ = vqmovn_s16(rescaled_output_activ);
|
|
uint8x8_t uint8_output_activ =
|
|
vadd_u8(vdup_n_u8(128), vreinterpret_u8_s8(int8_output_activ));
|
|
vst1_u8(output_activ_data_ptr, uint8_output_activ);
|
|
output_activ_data_ptr += 8;
|
|
}
|
|
#endif
|
|
for (; c < output_depth; ++c) {
|
|
// Define the fixed-point data types that we will use here. All use
|
|
// int16 as the underlying integer type i.e. all are 16-bit fixed-point.
|
|
// They only differ by the number of integral vs. fractional bits,
|
|
// determining the range of values that they can represent.
|
|
//
|
|
// F0 uses 0 integer bits, range [-1, 1].
|
|
// This is the return type of math functions such as tanh, logistic,
|
|
// whose range is in [-1, 1].
|
|
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
|
// F3 uses 3 integer bits, range [-8, 8].
|
|
// This is the range of the previous fully-connected node's output,
|
|
// which is our input here.
|
|
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
|
|
// FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
|
|
// 2^StateIntegerBits]. It's used to represent the internal state, whose
|
|
// number of integer bits is currently dictated by the model. See comment
|
|
// on the StateIntegerBits template parameter above.
|
|
using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
|
|
// Implementation of input gate, using fixed-point logistic function.
|
|
F3 input_gate_input = F3::FromRaw(*input_gate_input_ptr++);
|
|
F0 input_gate_output = gemmlowp::logistic(input_gate_input);
|
|
// Implementation of input modulation gate, using fixed-point tanh
|
|
// function.
|
|
F3 input_modulation_gate_input =
|
|
F3::FromRaw(*input_modulation_gate_input_ptr++);
|
|
F0 input_modulation_gate_output =
|
|
gemmlowp::tanh(input_modulation_gate_input);
|
|
// Implementation of forget gate, using fixed-point logistic function.
|
|
F3 forget_gate_input = F3::FromRaw(*forget_gate_input_ptr++);
|
|
F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
|
|
// Implementation of output gate, using fixed-point logistic function.
|
|
F3 output_gate_input = F3::FromRaw(*output_gate_input_ptr++);
|
|
F0 output_gate_output = gemmlowp::logistic(output_gate_input);
|
|
// Implementation of internal multiplication nodes, still in fixed-point.
|
|
F0 input_times_input_modulation =
|
|
input_gate_output * input_modulation_gate_output;
|
|
FS prev_state = FS::FromRaw(*prev_state_ptr++);
|
|
FS prev_state_times_forget_state = forget_gate_output * prev_state;
|
|
// Implementation of internal addition node, saturating.
|
|
FS new_state = gemmlowp::SaturatingAdd(
|
|
gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
|
|
prev_state_times_forget_state);
|
|
// Implementation of last internal Tanh node, still in fixed-point.
|
|
// Since a Tanh fixed-point implementation is specialized for a given
|
|
// number of integer bits, and each specialization can have a substantial
|
|
// code size, and we already used above a Tanh on an input with 3 integer
|
|
// bits, and per the table in the above function comment there is no
|
|
// significant accuracy to be lost by clamping to [-8, +8] for a
|
|
// 3-integer-bits representation, let us just do that. This helps people
|
|
// porting this to targets where code footprint must be minimized.
|
|
F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
|
|
F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
|
|
// Store the new internal state back to memory, as 16-bit integers.
|
|
// Note: here we store the original value with StateIntegerBits, not
|
|
// the rescaled 3-integer-bits value fed to tanh.
|
|
*output_state_data_ptr++ = new_state.raw();
|
|
// Down-scale the output activations to 8-bit integers, saturating,
|
|
// and store back to memory.
|
|
int16 rescaled_output_activ =
|
|
gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
|
|
int16 clamped_output_activ =
|
|
std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
|
|
*output_activ_data_ptr++ = 128 + clamped_output_activ;
|
|
}
|
|
input_gate_input_ptr += 3 * output_depth;
|
|
input_modulation_gate_input_ptr += 3 * output_depth;
|
|
forget_gate_input_ptr += 3 * output_depth;
|
|
output_gate_input_ptr += 3 * output_depth;
|
|
}
|
|
}
|
|
|
|
inline int NodeOffset(int b, int h, int w, int height, int width) {
  return (b * height + h) * width + w;
}
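// NodeOffset(b, h, w, height, width), used by the pooling ops below, is the
// column index of pixel (b, h, w) in the matrices produced by
// MapAsMatrixWithLastDimAsRows, where each column holds one depth vector.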
|
|
|
|
inline void AveragePool(const PoolParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("AveragePool");
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
|
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
|
const int input_height = input_shape.Dims(1);
|
|
const int input_width = input_shape.Dims(2);
|
|
const int output_height = output_shape.Dims(1);
|
|
const int output_width = output_shape.Dims(2);
|
|
const int stride_height = params.stride_height;
|
|
const int stride_width = params.stride_width;
|
|
|
|
// TODO(benoitjacob) make this a proper reference impl without Eigen!
|
|
const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
|
|
auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
|
|
// TODO(benoitjacob) get rid of the dynamic memory allocation here!
|
|
Eigen::VectorXf out_count(out_mat.cols());
|
|
out_count.setZero();
|
|
// Prefill the output to 0.
|
|
out_mat.setZero();
|
|
for (int b = 0; b < batches; ++b) {
|
|
for (int h = 0; h < input_height; ++h) {
|
|
for (int w = 0; w < input_width; ++w) {
|
|
// (h_start, h_end) * (w_start, w_end) is the range that the input
|
|
// vector projects to.
|
|
int hpad = h + params.padding_values.height;
|
|
int wpad = w + params.padding_values.width;
|
|
int h_start = (hpad < params.filter_height)
|
|
? 0
|
|
: (hpad - params.filter_height) / stride_height + 1;
|
|
int h_end = std::min(hpad / stride_height + 1, output_height);
|
|
int w_start = (wpad < params.filter_width)
|
|
? 0
|
|
: (wpad - params.filter_width) / stride_width + 1;
|
|
int w_end = std::min(wpad / stride_width + 1, output_width);
|
|
// compute elementwise sum
|
|
for (int ph = h_start; ph < h_end; ++ph) {
|
|
for (int pw = w_start; pw < w_end; ++pw) {
|
|
int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
|
|
out_mat.col(out_offset) +=
|
|
in_mat.col(NodeOffset(b, h, w, input_height, input_width));
|
|
out_count(out_offset)++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Divide the output by the actual number of elements being averaged over
|
|
TFLITE_DCHECK_GT(out_count.minCoeff(), 0);
|
|
out_mat.array().rowwise() /= out_count.transpose().array();
|
|
|
|
const int flat_size = output_shape.FlatSize();
|
|
for (int i = 0; i < flat_size; ++i) {
|
|
output_data[i] = ActivationFunctionWithMinMax(output_data[i],
|
|
params.float_activation_min,
|
|
params.float_activation_max);
|
|
}
|
|
}
|
|
|
|
inline void AveragePool16(const PoolParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const uint8* input_data,
|
|
const RuntimeShape& output_shape,
|
|
uint8* output_data) {
|
|
ruy::profiler::ScopeLabel label("AveragePool/8bit");
|
|
|
|
// Here, and in other pooling ops, in order to maintain locality of reference,
|
|
// to minimize some recalculations, and to load into NEON vector registers, we
|
|
// use an inner loop down the depth. Since depths can be large and hence we
|
|
// would need arbitrarily large temporary storage, we divide the work up into
|
|
// depth tranches just within the batch loop.
|
|
static constexpr int kPoolingAccTrancheSize = 256;
|
|
|
|
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
|
params.quantized_activation_max);
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
|
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
|
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
|
const int input_height = input_shape.Dims(1);
|
|
const int input_width = input_shape.Dims(2);
|
|
const int output_height = output_shape.Dims(1);
|
|
const int output_width = output_shape.Dims(2);
|
|
const int stride_height = params.stride_height;
|
|
const int stride_width = params.stride_width;
|
|
|
|
uint16 acc[kPoolingAccTrancheSize];
|
|
for (int batch = 0; batch < batches; ++batch) {
|
|
// We proceed through the depth in tranches (see comment above). The
|
|
// depth_base is the depth at the beginning of the tranche. The
|
|
// tranche_depth is the depth dimension of the tranche.
|
|
for (int depth_base = 0; depth_base < depth;
|
|
depth_base += kPoolingAccTrancheSize) {
|
|
const int tranche_depth =
|
|
std::min(depth - depth_base, kPoolingAccTrancheSize);
|
|
for (int out_y = 0; out_y < output_height; ++out_y) {
|
|
for (int out_x = 0; out_x < output_width; ++out_x) {
|
|
const int in_x_origin =
|
|
(out_x * stride_width) - params.padding_values.width;
|
|
const int in_y_origin =
|
|
(out_y * stride_height) - params.padding_values.height;
|
|
const int filter_x_start = std::max(0, -in_x_origin);
|
|
const int filter_x_end =
|
|
std::min(params.filter_width, input_width - in_x_origin);
|
|
const int filter_y_start = std::max(0, -in_y_origin);
|
|
const int filter_y_end =
|
|
std::min(params.filter_height, input_height - in_y_origin);
|
|
const int filter_count =
|
|
(filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
|
|
memset(acc, 0, tranche_depth * sizeof(acc[0]));
|
|
const uint8* input_ptr =
|
|
input_data + depth_base +
|
|
depth * (in_x_origin +
|
|
input_width * (in_y_origin + input_height * batch));
|
|
for (int fy = filter_y_start; fy < filter_y_end; fy++) {
|
|
const uint8* input_row_ptr =
|
|
input_ptr + depth * (fy * input_width + filter_x_start);
|
|
for (int fx = filter_x_start; fx < filter_x_end; fx++) {
|
|
const uint8* input_channel_ptr = input_row_ptr;
|
|
int channel = 0;
|
|
#ifdef USE_NEON
|
|
for (; channel <= tranche_depth - 16; channel += 16) {
|
|
uint16x8_t acc_reg[2];
|
|
for (int i = 0; i < 2; i++) {
|
|
acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
|
|
}
|
|
uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
|
|
input_channel_ptr += 16;
|
|
acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
|
|
acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
|
|
for (int i = 0; i < 2; i++) {
|
|
vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
|
|
}
|
|
}
|
|
for (; channel <= tranche_depth - 8; channel += 8) {
|
|
uint16x8_t acc_reg = vld1q_u16(acc + channel);
|
|
uint8x8_t input_reg = vld1_u8(input_channel_ptr);
|
|
input_channel_ptr += 8;
|
|
acc_reg = vaddw_u8(acc_reg, input_reg);
|
|
vst1q_u16(acc + channel, acc_reg);
|
|
}
|
|
#endif
|
|
for (; channel < tranche_depth; ++channel) {
|
|
acc[channel] += *input_channel_ptr++;
|
|
}
|
|
input_row_ptr += depth;
|
|
}
|
|
}
|
|
uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
|
|
out_x, depth_base);
|
|
int channel = 0;
|
|
#ifdef USE_NEON
|
|
#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
|
|
if (filter_count == FILTER_COUNT) { \
|
|
for (; channel <= tranche_depth - 8; channel += 8) { \
|
|
uint16 buf[8]; \
|
|
for (int i = 0; i < 8; i++) { \
|
|
buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
|
|
} \
|
|
uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
|
|
buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
|
|
buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
|
|
vst1_u8(output_ptr + channel, buf8); \
|
|
} \
|
|
}
|
|
AVGPOOL_DIVIDING_BY(9)
|
|
AVGPOOL_DIVIDING_BY(15)
|
|
#undef AVGPOOL_DIVIDING_BY
|
|
for (; channel <= tranche_depth - 8; channel += 8) {
|
|
uint16 buf[8];
|
|
for (int i = 0; i < 8; i++) {
|
|
buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
|
|
}
|
|
uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
|
|
buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
|
|
buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
|
|
vst1_u8(output_ptr + channel, buf8);
|
|
}
|
|
#endif
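// Adding filter_count / 2 before dividing rounds the average to the nearest
// integer instead of truncating towards zero.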
|
|
for (; channel < tranche_depth; ++channel) {
|
|
uint16 a = (acc[channel] + filter_count / 2) / filter_count;
|
|
a = std::max<uint16>(a, params.quantized_activation_min);
|
|
a = std::min<uint16>(a, params.quantized_activation_max);
|
|
output_ptr[channel] = static_cast<uint8>(a);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
inline void AveragePool32(const PoolParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const uint8* input_data,
|
|
const RuntimeShape& output_shape,
|
|
uint8* output_data) {
|
|
ruy::profiler::ScopeLabel label("AveragePool/8bit");
|
|
|
|
// Here, and in other pooling ops, in order to maintain locality of reference,
|
|
// to minimize some recalculations, and to load into NEON vector registers, we
|
|
// use an inner loop down the depth. Since depths can be large and hence we
|
|
// would need arbitrarily large temporary storage, we divide the work up into
|
|
// depth tranches just within the batch loop.
|
|
static constexpr int kPoolingAccTrancheSize = 256;
|
|
|
|
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
|
params.quantized_activation_max);
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
|
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
|
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
|
const int input_height = input_shape.Dims(1);
|
|
const int input_width = input_shape.Dims(2);
|
|
const int output_height = output_shape.Dims(1);
|
|
const int output_width = output_shape.Dims(2);
|
|
const int stride_height = params.stride_height;
|
|
const int stride_width = params.stride_width;
|
|
|
|
uint32 acc[kPoolingAccTrancheSize];
|
|
for (int batch = 0; batch < batches; ++batch) {
|
|
// We proceed through the depth in tranches (see comment above). The
|
|
// depth_base is the depth at the beginning of the tranche. The
|
|
// tranche_depth is the depth dimension of the tranche.
|
|
for (int depth_base = 0; depth_base < depth;
|
|
depth_base += kPoolingAccTrancheSize) {
|
|
const int tranche_depth =
|
|
std::min(depth - depth_base, kPoolingAccTrancheSize);
|
|
for (int out_y = 0; out_y < output_height; ++out_y) {
|
|
for (int out_x = 0; out_x < output_width; ++out_x) {
|
|
const int in_x_origin =
|
|
(out_x * stride_width) - params.padding_values.width;
|
|
const int in_y_origin =
|
|
(out_y * stride_height) - params.padding_values.height;
|
|
const int filter_x_start = std::max(0, -in_x_origin);
|
|
const int filter_x_end =
|
|
std::min(params.filter_width, input_width - in_x_origin);
|
|
const int filter_y_start = std::max(0, -in_y_origin);
|
|
const int filter_y_end =
|
|
std::min(params.filter_height, input_height - in_y_origin);
|
|
const int filter_count =
|
|
(filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
|
|
memset(acc, 0, tranche_depth * sizeof(acc[0]));
|
|
const uint8* input_ptr =
|
|
input_data + depth_base +
|
|
depth * (in_x_origin +
|
|
input_width * (in_y_origin + input_height * batch));
|
|
for (int fy = filter_y_start; fy < filter_y_end; fy++) {
|
|
const uint8* input_row_ptr =
|
|
input_ptr + depth * (fy * input_width + filter_x_start);
|
|
for (int fx = filter_x_start; fx < filter_x_end; fx++) {
|
|
const uint8* input_channel_ptr = input_row_ptr;
|
|
int channel = 0;
|
|
#ifdef USE_NEON
|
|
for (; channel <= tranche_depth - 16; channel += 16) {
|
|
uint16x4_t acc_reg[4];
|
|
uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
|
|
input_channel_ptr += 16;
|
|
acc_reg[0] = vget_low_u16(vmovl_u8(vget_low_u8(input_reg)));
|
|
acc_reg[1] = vget_high_u16(vmovl_u8(vget_low_u8(input_reg)));
|
|
acc_reg[2] = vget_low_u16(vmovl_u8(vget_high_u8(input_reg)));
|
|
acc_reg[3] = vget_high_u16(vmovl_u8(vget_high_u8(input_reg)));
|
|
for (int i = 0; i < 4; i++) {
|
|
vst1q_u32(
|
|
acc + channel + 4 * i,
|
|
vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
|
|
}
|
|
}
|
|
for (; channel <= tranche_depth - 8; channel += 8) {
|
|
uint16x4_t acc_reg[2];
|
|
uint16x8_t input_reg = vmovl_u8(vld1_u8(input_channel_ptr));
|
|
input_channel_ptr += 8;
|
|
acc_reg[0] = vget_low_u16(input_reg);
|
|
acc_reg[1] = vget_high_u16(input_reg);
|
|
for (int i = 0; i < 2; i++) {
|
|
vst1q_u32(
|
|
acc + channel + 4 * i,
|
|
vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
|
|
}
|
|
}
|
|
#endif
|
|
for (; channel < tranche_depth; ++channel) {
|
|
acc[channel] += *input_channel_ptr++;
|
|
}
|
|
input_row_ptr += depth;
|
|
}
|
|
}
|
|
uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
|
|
out_x, depth_base);
|
|
int channel = 0;
|
|
#ifdef USE_NEON
|
|
#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
|
|
if (filter_count == FILTER_COUNT) { \
|
|
for (; channel <= tranche_depth - 8; channel += 8) { \
|
|
uint16 buf[8]; \
|
|
for (int i = 0; i < 8; i++) { \
|
|
buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
|
|
} \
|
|
uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
|
|
buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
|
|
buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
|
|
vst1_u8(output_ptr + channel, buf8); \
|
|
} \
|
|
}
|
|
AVGPOOL_DIVIDING_BY(9)
|
|
AVGPOOL_DIVIDING_BY(15)
|
|
#undef AVGPOOL_DIVIDING_BY
|
|
for (; channel <= tranche_depth - 8; channel += 8) {
|
|
uint16 buf[8];
|
|
for (int i = 0; i < 8; i++) {
|
|
buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
|
|
}
|
|
uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
|
|
buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
|
|
buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
|
|
vst1_u8(output_ptr + channel, buf8);
|
|
}
|
|
#endif
|
|
for (; channel < tranche_depth; ++channel) {
|
|
uint16 a = (acc[channel] + filter_count / 2) / filter_count;
|
|
a = std::max<uint16>(a, params.quantized_activation_min);
|
|
a = std::min<uint16>(a, params.quantized_activation_max);
|
|
output_ptr[channel] = static_cast<uint8>(a);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
inline void AveragePool(const PoolParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const uint8* input_data,
|
|
const RuntimeShape& output_shape, uint8* output_data) {
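// A pooling window of at most 16 * 16 = 256 elements sums to at most
// 256 * 255 = 65280 per channel, which still fits in the uint16 accumulators
// of AveragePool16; larger windows need the 32-bit accumulators of
// AveragePool32.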
|
|
if (params.filter_height * params.filter_width > 16 * 16) {
|
|
AveragePool32(params, input_shape, input_data, output_shape, output_data);
|
|
} else {
|
|
AveragePool16(params, input_shape, input_data, output_shape, output_data);
|
|
}
|
|
}
|
|
|
|
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
|
const float* input_data, const RuntimeShape& output_shape,
|
|
float* output_data) {
|
|
ruy::profiler::ScopeLabel label("MaxPool");
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
|
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
|
const int input_height = input_shape.Dims(1);
|
|
const int input_width = input_shape.Dims(2);
|
|
const int output_height = output_shape.Dims(1);
|
|
const int output_width = output_shape.Dims(2);
|
|
const int stride_height = params.stride_height;
|
|
const int stride_width = params.stride_width;
|
|
|
|
const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
|
|
auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
|
|
// Prefill the output to minimum representable float value
|
|
out_mat.setConstant(std::numeric_limits<float>::lowest());
|
|
for (int b = 0; b < batches; ++b) {
|
|
for (int h = 0; h < input_height; ++h) {
|
|
for (int w = 0; w < input_width; ++w) {
|
|
// (h_start, h_end) * (w_start, w_end) is the range that the input
|
|
// vector projects to.
|
|
int hpad = h + params.padding_values.height;
|
|
int wpad = w + params.padding_values.width;
|
|
int h_start = (hpad < params.filter_height)
|
|
? 0
|
|
: (hpad - params.filter_height) / stride_height + 1;
|
|
int h_end = std::min(hpad / stride_height + 1, output_height);
|
|
int w_start = (wpad < params.filter_width)
|
|
? 0
|
|
: (wpad - params.filter_width) / stride_width + 1;
|
|
int w_end = std::min(wpad / stride_width + 1, output_width);
|
|
// compute elementwise max
|
|
for (int ph = h_start; ph < h_end; ++ph) {
|
|
for (int pw = w_start; pw < w_end; ++pw) {
|
|
int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
|
|
out_mat.col(out_offset) =
|
|
out_mat.col(out_offset)
|
|
.cwiseMax(in_mat.col(
|
|
NodeOffset(b, h, w, input_height, input_width)));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
const int flat_size = output_shape.FlatSize();
|
|
for (int i = 0; i < flat_size; ++i) {
|
|
output_data[i] = ActivationFunctionWithMinMax(output_data[i],
|
|
params.float_activation_min,
|
|
params.float_activation_max);
|
|
}
|
|
}
|
|
|
|
inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
|
|
const uint8* input_data, const RuntimeShape& output_shape,
|
|
uint8* output_data) {
|
|
ruy::profiler::ScopeLabel label("MaxPool/8bit");
|
|
|
|
// Here, and in other pooling ops, in order to maintain locality of reference,
|
|
// to minimize some recalculations, and to load into NEON vector registers, we
|
|
// use an inner loop down the depth. Since depths can be large and hence we
|
|
// would need arbitrarily large temporary storage, we divide the work up into
|
|
// depth tranches just within the batch loop.
|
|
static constexpr int kPoolingAccTrancheSize = 256;
|
|
|
|
TFLITE_DCHECK_LE(params.quantized_activation_min,
|
|
params.quantized_activation_max);
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
|
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
|
const int depth = MatchingDim(input_shape, 3, output_shape, 3);
|
|
const int input_height = input_shape.Dims(1);
|
|
const int input_width = input_shape.Dims(2);
|
|
const int output_height = output_shape.Dims(1);
|
|
const int output_width = output_shape.Dims(2);
|
|
const int stride_height = params.stride_height;
|
|
const int stride_width = params.stride_width;
|
|
|
|
uint8 acc[kPoolingAccTrancheSize];
|
|
for (int batch = 0; batch < batches; ++batch) {
|
|
// We proceed through the depth in tranches (see comment above). The
|
|
// depth_base is the depth at the beginning of the tranche. The
|
|
// tranche_depth is the depth dimension of the tranche.
|
|
for (int depth_base = 0; depth_base < depth;
|
|
depth_base += kPoolingAccTrancheSize) {
|
|
const int tranche_depth =
|
|
std::min(depth - depth_base, kPoolingAccTrancheSize);
|
|
for (int out_y = 0; out_y < output_height; ++out_y) {
|
|
for (int out_x = 0; out_x < output_width; ++out_x) {
|
|
const int in_x_origin =
|
|
(out_x * stride_width) - params.padding_values.width;
|
|
const int in_y_origin =
|
|
(out_y * stride_height) - params.padding_values.height;
|
|
const int filter_x_start = std::max(0, -in_x_origin);
|
|
const int filter_x_end =
|
|
std::min(params.filter_width, input_width - in_x_origin);
|
|
const int filter_y_start = std::max(0, -in_y_origin);
|
|
const int filter_y_end =
|
|
std::min(params.filter_height, input_height - in_y_origin);
|
|
memset(acc, 0, tranche_depth * sizeof(acc[0]));
|
|
const uint8* input_ptr =
|
|
input_data + depth_base +
|
|
depth * (in_x_origin +
|
|
input_width * (in_y_origin + input_height * batch));
|
|
for (int fy = filter_y_start; fy < filter_y_end; fy++) {
|
|
const uint8* input_row_ptr =
|
|
input_ptr + depth * (fy * input_width + filter_x_start);
|
|
for (int fx = filter_x_start; fx < filter_x_end; fx++) {
|
|
const uint8* input_channel_ptr = input_row_ptr;
|
|
int channel = 0;
|
|
#ifdef USE_NEON
|
|
for (; channel <= tranche_depth - 16; channel += 16) {
|
|
uint8x16_t acc_reg = vld1q_u8(acc + channel);
|
|
uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
|
|
input_channel_ptr += 16;
|
|
acc_reg = vmaxq_u8(acc_reg, input_reg);
|
|
vst1q_u8(acc + channel, acc_reg);
|
|
}
|
|
|
|
for (; channel <= tranche_depth - 8; channel += 8) {
|
|
uint8x8_t acc_reg = vld1_u8(acc + channel);
|
|
uint8x8_t input_reg = vld1_u8(input_channel_ptr);
|
|
input_channel_ptr += 8;
|
|
acc_reg = vmax_u8(acc_reg, input_reg);
|
|
vst1_u8(acc + channel, acc_reg);
|
|
}
|
|
#endif
|
|
for (; channel < tranche_depth; ++channel) {
|
|
acc[channel] = std::max(acc[channel], *input_channel_ptr++);
|
|
}
|
|
input_row_ptr += depth;
|
|
}
|
|
}
|
|
uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
|
|
out_x, depth_base);
|
|
int channel = 0;
|
|
#ifdef USE_NEON
|
|
for (; channel <= tranche_depth - 16; channel += 16) {
|
|
uint8x16_t a = vld1q_u8(acc + channel);
|
|
a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
|
|
a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
|
|
vst1q_u8(output_ptr + channel, a);
|
|
}
|
|
for (; channel <= tranche_depth - 8; channel += 8) {
|
|
uint8x8_t a = vld1_u8(acc + channel);
|
|
a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
|
|
a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
|
|
vst1_u8(output_ptr + channel, a);
|
|
}
|
|
#endif
|
|
for (; channel < tranche_depth; ++channel) {
|
|
uint8 a = acc[channel];
|
|
a = std::max<uint8>(a, params.quantized_activation_min);
|
|
a = std::min<uint8>(a, params.quantized_activation_max);
|
|
output_ptr[channel] = static_cast<uint8>(a);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
inline void L2Pool(const PoolParams& params, const RuntimeShape& input_shape,
|
|
const float* input_data, const RuntimeShape& output_shape,
|
|
float* output_data) {
|
|
ruy::profiler::ScopeLabel label("L2Pool");
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
|
const int batches = MatchingDim(input_shape, 0, output_shape, 0);
|
|
const int input_height = input_shape.Dims(1);
|
|
const int input_width = input_shape.Dims(2);
|
|
const int output_height = output_shape.Dims(1);
|
|
const int output_width = output_shape.Dims(2);
|
|
const int stride_height = params.stride_height;
|
|
const int stride_width = params.stride_width;
|
|
// Actually carry out L2 Pool. The code is written in forward mode: we go
// through each input value once and write to all the pooled regions it maps
// to.
|
|
const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
|
|
auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
|
|
Eigen::VectorXf in_square(in_mat.rows());
|
|
Eigen::VectorXf out_count(out_mat.cols());
|
|
out_count.setZero();
|
|
// Prefill the output to 0.
|
|
out_mat.setZero();
|
|
for (int b = 0; b < batches; ++b) {
|
|
for (int h = 0; h < input_height; ++h) {
|
|
for (int w = 0; w < input_width; ++w) {
|
|
// (h_start, h_end) * (w_start, w_end) is the range that the input
|
|
// vector projects to.
|
|
const int hpad = h + params.padding_values.height;
|
|
const int wpad = w + params.padding_values.width;
|
|
const int h_start =
|
|
(hpad < params.filter_height)
|
|
? 0
|
|
: (hpad - params.filter_height) / stride_height + 1;
|
|
const int h_end = std::min(hpad / stride_height + 1, output_height);
|
|
const int w_start =
|
|
(wpad < params.filter_width)
|
|
? 0
|
|
: (wpad - params.filter_width) / stride_width + 1;
|
|
const int w_end = std::min(wpad / stride_width + 1, output_width);
|
|
// pre-compute square
|
|
const int in_offset = w + input_width * (h + input_height * b);
|
|
in_square =
|
|
in_mat.col(in_offset).array() * in_mat.col(in_offset).array();
|
|
// compute elementwise sum of squares
|
|
for (int ph = h_start; ph < h_end; ++ph) {
|
|
for (int pw = w_start; pw < w_end; ++pw) {
|
|
const int out_offset = pw + output_width * (ph + output_height * b);
|
|
out_mat.col(out_offset) += in_square;
|
|
out_count(out_offset)++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
out_count = out_count.array().inverse();
|
|
out_mat =
|
|
(out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt();
|
|
|
|
const int flat_size = output_shape.FlatSize();
|
|
for (int i = 0; i < flat_size; ++i) {
|
|
output_data[i] = ActivationFunctionWithMinMax(output_data[i],
|
|
params.float_activation_min,
|
|
params.float_activation_max);
|
|
}
|
|
}
|
|
|
|
inline void LocalResponseNormalization(
|
|
const tflite::LocalResponseNormalizationParams& op_params,
|
|
const RuntimeShape& input_shape, const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("LocalResponseNormalization");
|
|
MatchingFlatSize(input_shape, output_shape);
|
|
|
|
const auto data_in = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
|
|
auto data_out = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
|
|
|
|
// Carry out local response normalization, vector by vector.
|
|
// Since the data are stored column major, row-wise operations would probably
// not be memory efficient anyway, so we do an explicit for loop over the
// columns.
|
|
const int double_range = op_params.range * 2;
|
|
Eigen::VectorXf padded_square(data_in.rows() + double_range);
|
|
padded_square.setZero();
|
|
for (int r = 0; r < data_in.cols(); ++r) {
|
|
// Do local response normalization for data_in(:, r)
|
|
// First, compute the squares and store them in a buffer for repeated use.
|
|
padded_square.block(op_params.range, 0, data_in.rows(), 1) =
|
|
data_in.col(r).cwiseProduct(data_in.col(r)) * op_params.alpha;
|
|
// Then, compute the scale and write it to data_out.
|
|
float accumulated_scale = 0;
|
|
for (int i = 0; i < double_range; ++i) {
|
|
accumulated_scale += padded_square(i);
|
|
}
|
|
for (int i = 0; i < data_in.rows(); ++i) {
|
|
accumulated_scale += padded_square(i + double_range);
|
|
data_out(i, r) = op_params.bias + accumulated_scale;
|
|
accumulated_scale -= padded_square(i);
|
|
}
|
|
}
|
|
|
|
// In a few cases, the pow computation could benefit from speedups.
|
|
if (op_params.beta == 1) {
|
|
data_out.array() = data_in.array() * data_out.array().inverse();
|
|
} else if (op_params.beta == 0.5f) {
|
|
data_out.array() = data_in.array() * data_out.array().sqrt().inverse();
|
|
} else {
|
|
data_out.array() = data_in.array() * data_out.array().pow(-op_params.beta);
|
|
}
|
|
}
|
|
|
|
inline void SoftmaxImpl(const SoftmaxParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data,
|
|
int start_batch, int end_batch) {
|
|
ruy::profiler::ScopeLabel label("Softmax/Impl");
|
|
MatchingFlatSize(input_shape, output_shape);
|
|
|
|
const int logit_size = input_shape.Dims(input_shape.DimensionsCount() - 1);
|
|
const MatrixMap<const float> in_mat(input_data + logit_size * start_batch,
|
|
logit_size, end_batch - start_batch);
|
|
MatrixMap<float> out_mat(output_data + logit_size * start_batch, logit_size,
|
|
end_batch - start_batch);
|
|
// Compute the exponential first, removing the max coefficient for numerical
|
|
// stability.
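// Subtracting the per-column maximum does not change the result (the factor
// cancels in the normalization below) but keeps every exponent argument <= 0,
// so exp() cannot overflow.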
|
|
out_mat =
|
|
(in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta;
|
|
// We are separating out the exp function so that exp can be vectorized.
|
|
out_mat = out_mat.array().exp();
|
|
// Normalize to get the activations.
|
|
Eigen::Array<float, 1, Eigen::Dynamic> scale =
|
|
out_mat.array().colwise().sum().inverse();
|
|
out_mat.array().rowwise() *= scale;
|
|
}
|
|
|
|
struct SoftmaxWorkerTask : cpu_backend_threadpool::Task {
|
|
SoftmaxWorkerTask(const SoftmaxParams& params,
|
|
const RuntimeShape& input_shape, const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data,
|
|
int start_batch, int end_batch)
|
|
: params(params),
|
|
input_shape(input_shape),
|
|
input_data(input_data),
|
|
output_shape(output_shape),
|
|
output_data(output_data),
|
|
start_batch(start_batch),
|
|
end_batch(end_batch) {}
|
|
void Run() override {
|
|
SoftmaxImpl(params, input_shape, input_data, output_shape, output_data,
|
|
start_batch, end_batch);
|
|
}
|
|
|
|
private:
|
|
const tflite::SoftmaxParams& params;
|
|
const RuntimeShape& input_shape;
|
|
const float* input_data;
|
|
const RuntimeShape& output_shape;
|
|
float* output_data;
|
|
int start_batch;
|
|
int end_batch;
|
|
};
|
|
|
|
inline void Softmax(const SoftmaxParams& params,
|
|
const RuntimeShape& input_shape, const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data,
|
|
CpuBackendContext* cpu_backend_context = nullptr) {
|
|
ruy::profiler::ScopeLabel label("Softmax");
|
|
|
|
// We picture the softmax input as a 2-D matrix whose last dimension is the
// logit dimension; all remaining dimensions are folded into the batch
// dimension of that matrix.
|
|
const int batch_size =
|
|
FlatSizeSkipDim(input_shape, input_shape.DimensionsCount() - 1);
|
|
constexpr int kMinBatchPerThread = 8;
|
|
int thread_count = batch_size / kMinBatchPerThread;
|
|
thread_count = thread_count > 0 ? thread_count : 1;
|
|
const int capped_thread_count =
|
|
cpu_backend_context == nullptr
|
|
? 1
|
|
: std::min(thread_count, cpu_backend_context->max_num_threads());
|
|
if (capped_thread_count == 1) {
|
|
SoftmaxImpl(params, input_shape, input_data, output_shape, output_data, 0,
|
|
batch_size);
|
|
} else {
|
|
std::vector<SoftmaxWorkerTask> tasks;
|
|
// TODO(b/131746020) don't create new heap allocations every time.
|
|
// At least we make it a single heap allocation by using reserve().
|
|
tasks.reserve(capped_thread_count);
|
|
int batch_start = 0;
|
|
for (int i = 0; i < capped_thread_count; ++i) {
|
|
// Try to distribute the tasks as evenly as possible.
|
|
int batch_end =
|
|
batch_start + (batch_size - batch_start) / (capped_thread_count - i);
|
|
tasks.emplace_back(params, input_shape, input_data, output_shape,
|
|
output_data, batch_start, batch_end);
|
|
batch_start = batch_end;
|
|
}
|
|
cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
|
|
cpu_backend_context);
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
inline int32_t QuantizeSoftmaxOutput(float prob_rescaled, int32_t zero_point) {
|
|
const int32_t prob_rnd = static_cast<int32_t>(std::round(prob_rescaled));
|
|
return prob_rnd + zero_point;
|
|
}
|
|
|
|
#if !__aarch64__
|
|
// On ARM64, rounding is faster than add + truncation, so this specialization
// is only used on other architectures.
|
|
template <>
|
|
inline int32_t QuantizeSoftmaxOutput<uint8_t>(float prob_rescaled,
|
|
int32_t zero_point) {
|
|
return static_cast<int32_t>(prob_rescaled + 0.5f);
|
|
}
|
|
#endif
|
|
|
|
inline void PopulateSoftmaxLookupTable(SoftmaxParams* data, float input_scale,
|
|
float beta) {
|
|
const float scale = -input_scale * beta;
|
|
const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
|
|
for (int32_t val = 0; val <= max_uint8; ++val) {
|
|
data->table[max_uint8 - val] = expf(scale * val);
|
|
}
|
|
}
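
// A worked identity for the table above (derived from the loop, not extra
// behavior): table[max_uint8 - val] = exp(-input_scale * beta * val), so for
// a per-row maximum `max_val`, indexing through
// `table_offset = &table[max_uint8 - max_val]` yields
// table_offset[q] = exp(input_scale * beta * (q - max_val)),
// which is exactly the max-shifted exponential used by the LUT softmax below.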
template <typename In, typename Out>
|
|
inline void Softmax(const SoftmaxParams& params,
|
|
const RuntimeShape& input_shape, const In* input_data,
|
|
const RuntimeShape& output_shape, Out* output_data) {
|
|
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
|
const int excluding_last_dim =
|
|
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
|
const int last_dim =
|
|
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
|
|
|
const int32_t clamp_max = std::numeric_limits<Out>::max();
|
|
const int32_t clamp_min = std::numeric_limits<Out>::min();
|
|
for (int i = 0; i < excluding_last_dim; ++i) {
|
|
int32_t max_val = std::numeric_limits<In>::min();
|
|
// Find max quantized value.
|
|
for (int j = 0; j < last_dim; ++j) {
|
|
max_val = std::max(max_val, static_cast<int32_t>(input_data[j]));
|
|
}
|
|
|
|
float sum_exp = 0.0f;
|
|
const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
|
|
const float* table_offset = &params.table[max_uint8 - max_val];
|
|
// Calculate normalizer sum(exp(x)).
|
|
for (int j = 0; j < last_dim; ++j) {
|
|
sum_exp += table_offset[input_data[j]];
|
|
}
|
|
|
|
const float inv_sum_exp = 1.0f / (sum_exp * params.scale);
|
|
// Normalize and quantize probabilities.
|
|
for (int j = 0; j < last_dim; ++j) {
|
|
const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp;
|
|
const int32_t prob_quantized =
|
|
QuantizeSoftmaxOutput<Out>(prob_rescaled, params.zero_point);
|
|
output_data[j] = static_cast<Out>(
|
|
std::max(std::min(clamp_max, prob_quantized), clamp_min));
|
|
}
|
|
input_data += last_dim;
|
|
output_data += last_dim;
|
|
}
|
|
}
|
|
|
|
// Here's the softmax LUT optimization strategy:
|
|
// For softmax, we can apply a mathematically equivalent transformation:
//
// softmax(x) = e^x / sum(e^x, 0...n), which equals
|
|
// softmax(x) = e^(x - CONST) / sum(e^(x - CONST), 0...n)
|
|
//
|
|
// For quantization, `x` in our case is (input_q - input_zp) * input_s
|
|
// For uint8 case (int8 can be handled similarly), the range is [0, 255]
|
|
//
|
|
// so if we let
|
|
// CONST = (255 - input_zp) * input_s
|
|
// then we will have:
|
|
// softmax(x) = e^((input_q - 255) * input_s)                  --------- (1)
//              /
//              sum(e^((input_q - 255) * input_s), 0...n)      --------- (2)
|
|
//
|
|
// the good thing about (1) is it's within the range of (0, 1), so we can
|
|
// approximate its result with uint16.
|
|
// (1) = uint8_out * 1 / 2^16.
|
|
//
|
|
// so (1) is lookup_uint8_table(input_q) * 1 / 2^16.
// then (2) is essentially the following:
// sum(lookup_uint8_table(input_q), 0...n) / 2^16.
//
// since (output_q - output_zp) * output_s = softmax(x)
// output_q = lookup_uint8_table(input_q)
//            /
//            (sum(lookup_uint8_table(input_q), 0...n) * output_s)
//            +
//            output_zp
|
|
//
|
|
// We could further improve performance by using uint8 instead of uint16, but
// that may cost some accuracy, so we need to pay attention to that.
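// Worked example for the table construction below (illustrative values):
// if expf(input_to_exp) == 0.5 for some `val`, then
//   temp  = 0.5 * 65535 + 0.5 = 32768,
//   part1 = temp >> 8   = 128  -> uint8_table1[val],
//   part2 = temp & 0xff = 0    -> uint8_table2[val],
// and the kernels recombine it as (part1 << 8) + part2 == 32768, i.e. 0.5 in
// the 1/2^16 fixed-point encoding described above.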
inline void PopulateSoftmaxUInt8LookupTable(SoftmaxParams* data,
|
|
float input_scale, float beta) {
|
|
const float scale = input_scale * beta;
|
|
const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
|
|
const int32_t max_uint16 = std::numeric_limits<uint16_t>::max();
|
|
|
|
for (int32_t val = 0; val <= max_uint8; ++val) {
|
|
float input_to_exp = scale * (val - max_uint8);
|
|
int32_t temp = static_cast<int>(expf(input_to_exp) * max_uint16 + 0.5);
|
|
temp = std::min(max_uint16, temp);
|
|
uint8_t part1 = temp >> 8;
|
|
uint8_t part2 = temp & 0xff;
|
|
data->uint8_table1[val] = static_cast<uint8_t>(part1);
|
|
data->uint8_table2[val] = static_cast<uint8_t>(part2);
|
|
}
|
|
}
|
|
|
|
inline int FindMaxValue(int size, const uint8_t* input_data, uint8_t offset) {
|
|
int32_t max_val = std::numeric_limits<uint8_t>::min();
|
|
int j = 0;
|
|
#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
|
|
uint8x16_t max_val_dup = vdupq_n_u8(max_val);
|
|
uint8x16_t offset_dup = vdupq_n_u8(offset);
|
|
for (; j <= size - 16; j += 16) {
|
|
uint8x16_t input_value = vld1q_u8(input_data + j);
|
|
input_value = veorq_u8(input_value, offset_dup);
|
|
max_val_dup = vmaxq_u8(input_value, max_val_dup);
|
|
}
|
|
max_val = std::max(max_val, static_cast<int32>(vmaxvq_u8(max_val_dup)));
|
|
#endif
|
|
|
|
for (; j < size; ++j) {
|
|
max_val = std::max(max_val, static_cast<int32_t>(input_data[j] ^ offset));
|
|
}
|
|
return max_val;
|
|
}
|
|
|
|
#ifdef USE_NEON
|
|
// Value_to_store layout:
|
|
// [high_high, high_low, low_high, low_low].
|
|
inline void StoreValue(int32x4x4_t value_to_store, int8_t* output) {
|
|
const int16x8_t result_1 = vcombine_s16(vqmovn_s32(value_to_store.val[1]),
|
|
vqmovn_s32(value_to_store.val[0]));
|
|
const int16x8_t result_2 = vcombine_s16(vqmovn_s32(value_to_store.val[3]),
|
|
vqmovn_s32(value_to_store.val[2]));
|
|
const int8x16_t result =
|
|
vcombine_s8(vqmovn_s16(result_2), vqmovn_s16(result_1));
|
|
vst1q_s8(output, result);
|
|
}
|
|
|
|
// Value_to_store layout:
|
|
// [high_high, high_low, low_high, low_low].
|
|
inline void StoreValue(int32x4x4_t value_to_store, uint8_t* output) {
|
|
const uint16x8_t result_1 =
|
|
vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[1])),
|
|
vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[0])));
|
|
const uint16x8_t result_2 =
|
|
vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[3])),
|
|
vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[2])));
|
|
const uint8x16_t result =
|
|
vcombine_u8(vqmovn_u16(result_2), vqmovn_u16(result_1));
|
|
vst1q_u8(output, result);
|
|
}
|
|
|
|
#endif
|
|
|
|
template <typename In, typename Out>
|
|
inline void SoftmaxInt8LUT(const SoftmaxParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const In* input_data,
|
|
const RuntimeShape& output_shape, Out* output_data) {
|
|
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
|
const int excluding_last_dim =
|
|
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
|
const int last_dim =
|
|
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
|
|
|
const int32_t clamp_max = std::numeric_limits<Out>::max();
|
|
const int32_t clamp_min = std::numeric_limits<Out>::min();
|
|
|
|
// The offset is used to interpret the input data "correctly".
// If the input is uint8, the data is unchanged.
// If the input is int8, it is reinterpreted as uint8, so the offset must be
// applied; e.g. int8 127 becomes 255 in uint8.
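// Concretely (illustrative): the offset is applied with an XOR, `q ^ 0x80`,
// which flips the sign bit of the two's-complement int8 value: -128 (bits
// 0x80) maps to uint8 0, 0 (bits 0x00) maps to uint8 128, and 127 (bits 0x7f)
// maps to uint8 255, so the ordering of values is preserved.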
uint8_t offset = 0;
|
|
if (std::is_same<In, int8>::value) {
|
|
offset = 0x80;
|
|
}
|
|
|
|
const uint8_t* input_data_uint = reinterpret_cast<const uint8_t*>(input_data);
|
|
|
|
#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
|
|
// This code uses ARM64-only instructions.
|
|
// TODO(b/143709993): Port to ARMv7
|
|
|
|
// Load the tables into registers. (4*4 128-bit registers)
|
|
uint8x16x4_t table1[4];
|
|
table1[0] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 0);
|
|
table1[1] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 1);
|
|
table1[2] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 2);
|
|
table1[3] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 3);
|
|
|
|
uint8x16x4_t table2[4];
|
|
table2[0] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 0);
|
|
table2[1] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 1);
|
|
table2[2] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 2);
|
|
table2[3] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 3);
|
|
#endif
|
|
|
|
for (int i = 0; i < excluding_last_dim; ++i) {
|
|
// Find max quantized value.
|
|
int32_t max_val = FindMaxValue(last_dim, input_data_uint, offset);
|
|
|
|
int32 sum_exp = 0;
|
|
const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
|
|
const uint8_t table_offset = max_uint8 - max_val;
|
|
|
|
// Calculate normalizer sum(exp(x)).
|
|
int sum_j = 0;
|
|
#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
|
|
uint8x16_t table_offset_dup = vdupq_n_u8(table_offset);
|
|
uint8x16_t offset_dup = vdupq_n_u8(offset);
|
|
uint32x4_t sum_4 = vdupq_n_u32(0);
|
|
const int multiplier_shift = 8;
|
|
for (; sum_j <= last_dim - 16; sum_j += 16) {
|
|
uint8x16_t input_value = vld1q_u8(input_data_uint + sum_j);
|
|
input_value = veorq_u8(input_value, offset_dup);
|
|
input_value = vaddq_u8(input_value, table_offset_dup);
|
|
|
|
const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value);
|
|
const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value);
|
|
|
|
uint16x8_t exp_value1 =
|
|
vshll_n_u8(vget_high_u8(output1), multiplier_shift);
|
|
uint16x8_t exp_value2 =
|
|
vshll_n_u8(vget_low_u8(output1), multiplier_shift);
|
|
|
|
exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2));
|
|
exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2));
|
|
|
|
sum_4 = vpadalq_u16(sum_4, exp_value1);
|
|
sum_4 = vpadalq_u16(sum_4, exp_value2);
|
|
}
|
|
int temp = vgetq_lane_u32(sum_4, 0) + vgetq_lane_u32(sum_4, 1) +
|
|
vgetq_lane_u32(sum_4, 2) + vgetq_lane_u32(sum_4, 3);
|
|
sum_exp += temp;
|
|
|
|
#endif
|
|
for (; sum_j < last_dim; ++sum_j) {
|
|
const uint8_t index = (input_data_uint[sum_j] ^ offset) + table_offset;
|
|
|
|
uint8_t part1 = params.uint8_table1[index];
|
|
uint8_t part2 = params.uint8_table2[index];
|
|
sum_exp += ((part1 << 8) + part2);
|
|
}
|
|
|
|
const float inv_sum_exp = 1.0f / (sum_exp * params.scale);
|
|
|
|
int32 multiplier, shift;
|
|
QuantizeMultiplier(inv_sum_exp, &multiplier, &shift);
|
|
|
|
// Normalize and quantize probabilities.
|
|
int j = 0;
|
|
#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
|
|
const int32x4_t output_zp_dup = vdupq_n_s32(params.zero_point);
|
|
const int32x4_t max_val_dup = vdupq_n_s32(clamp_max);
|
|
const int32x4_t min_val_dup = vdupq_n_s32(clamp_min);
|
|
|
|
for (; j <= last_dim - 16; j += 16) {
|
|
uint8x16_t input_value = vld1q_u8(input_data_uint + j);
|
|
input_value = veorq_u8(input_value, offset_dup);
|
|
input_value = vaddq_u8(input_value, table_offset_dup);
|
|
|
|
const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value);
|
|
const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value);
|
|
|
|
uint16x8_t exp_value1 =
|
|
vshll_n_u8(vget_high_u8(output1), multiplier_shift);
|
|
uint16x8_t exp_value2 =
|
|
vshll_n_u8(vget_low_u8(output1), multiplier_shift);
|
|
|
|
exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2));
|
|
exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2));
|
|
|
|
int32x4x4_t output_value;
|
|
output_value.val[0] =
|
|
vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value1)));
|
|
output_value.val[1] =
|
|
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value1)));
|
|
output_value.val[2] =
|
|
vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value2)));
|
|
output_value.val[3] =
|
|
vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value2)));
|
|
|
|
int32x4x4_t temp_val =
|
|
MultiplyByQuantizedMultiplier4Rows(output_value, multiplier, shift);
|
|
|
|
temp_val.val[0] = vaddq_s32(temp_val.val[0], output_zp_dup);
|
|
temp_val.val[1] = vaddq_s32(temp_val.val[1], output_zp_dup);
|
|
temp_val.val[2] = vaddq_s32(temp_val.val[2], output_zp_dup);
|
|
temp_val.val[3] = vaddq_s32(temp_val.val[3], output_zp_dup);
|
|
|
|
temp_val.val[0] =
|
|
vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup);
|
|
temp_val.val[1] =
|
|
vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup);
|
|
temp_val.val[2] =
|
|
vmaxq_s32(vminq_s32(temp_val.val[2], max_val_dup), min_val_dup);
|
|
temp_val.val[3] =
|
|
vmaxq_s32(vminq_s32(temp_val.val[3], max_val_dup), min_val_dup);
|
|
|
|
StoreValue(temp_val, output_data + j);
|
|
}
|
|
#endif
|
|
for (; j < last_dim; ++j) {
|
|
const uint8_t index = (input_data_uint[j] ^ offset) + table_offset;
|
|
const uint8_t part1 = params.uint8_table1[index];
|
|
const uint8_t part2 = params.uint8_table2[index];
|
|
const int32_t exp_value = (part1 << 8) + part2;
|
|
const int32_t output_value =
|
|
MultiplyByQuantizedMultiplier(exp_value, multiplier, shift);
|
|
|
|
output_data[j] = static_cast<Out>(std::max(
|
|
std::min(clamp_max, output_value + params.zero_point), clamp_min));
|
|
}
|
|
input_data_uint += last_dim;
|
|
output_data += last_dim;
|
|
}
|
|
}
|
|
|
|
// TODO(myenik): This is the same as the reference implementation, not actually
|
|
// optimized yet.
|
|
inline void LogSoftmax(const SoftmaxParams& params,
|
|
const RuntimeShape& input_shape, const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("LogSoftmax");
|
|
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
|
const int outer_size =
|
|
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
|
const int depth =
|
|
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
|
|
|
for (int i = 0; i < outer_size; ++i) {
|
|
const float* block_input_data = input_data + i * depth;
|
|
float* block_output_data = output_data + i * depth;
|
|
// Find max element value which we'll use to ensure numerical stability
|
|
// taking advantage of the following equality:
|
|
// log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
|
|
float max = std::numeric_limits<float>::lowest();
|
|
for (int c = 0; c < depth; ++c) {
|
|
max = std::max(max, block_input_data[c]);
|
|
}
|
|
|
|
// Compute sum.
|
|
float sum = 0.f;
|
|
for (int c = 0; c < depth; ++c) {
|
|
sum += std::exp(block_input_data[c] - max);
|
|
}
|
|
|
|
// Compute result.
|
|
const float log_sum = std::log(sum);
|
|
for (int c = 0; c < depth; ++c) {
|
|
block_output_data[c] = block_input_data[c] - max - log_sum;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Backwards compatibility. Less optimized than the version below.
|
|
inline void LogSoftmax(const SoftmaxParams& params,
|
|
const RuntimeShape& input_shape, const uint8* input_data,
|
|
const RuntimeShape& output_shape, uint8* output_data) {
|
|
reference_ops::LogSoftmax(params, input_shape, input_data, output_shape,
|
|
output_data);
|
|
}
|
|
|
|
// Compute LogSoftmax as (x - x_max) - ln(sum(e^(x_i - x_max), ...))
|
|
// as done in tf.nn.log_softmax to prevent underflow and overflow.
|
|
// This is in contrast to just log(softmax(x))
|
|
//
|
|
// To handle quantization, first dequantize the inputs (by computing
// e^(input_scale * val); the zero point can be ignored since it cancels
// out during the subtraction thanks to the ln) and rescale back to the
// quantized output type at the end.
|
|
//
|
|
// Notably this makes use of float and is intended as the optimized
|
|
// form for quantized execution on CPU. For a fully integer version,
|
|
// see the reference op.
|
|
//
|
|
// TODO(tflite): notes for optimization:
// 1) See if e^ is also a bottleneck in the reference fully-integer
//    version, apply a lookup there, and compare.
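// Sketch of the rescaling used in the loop below (a derivation, not extra
// behavior; q_j denotes the quantized input value input_data[j]):
//   log_softmax(x_j) = input_scale * (q_j - max_val) - log_sum_exp
//   output_q_j       = log_softmax(x_j) / output_scale + zero_point
//                    = (input_scale / output_scale) * q_j
//                      - (input_scale * max_val + log_sum_exp) / output_scale
//                      + zero_point,
// which is exactly `scale * input_data[j] - precomputed + params.zero_point`.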
inline void LogSoftmax(const SoftmaxParams& params, float input_scale,
|
|
const RuntimeShape& input_shape, const uint8* input_data,
|
|
const RuntimeShape& output_shape, uint8* output_data) {
|
|
ruy::profiler::ScopeLabel label("LogSoftmax/Uint8");
|
|
const int trailing_dim = input_shape.DimensionsCount() - 1;
|
|
const int excluding_last_dim =
|
|
MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
|
|
const int last_dim =
|
|
MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
|
|
|
|
const int32_t clamp_max = std::numeric_limits<uint8>::max();
|
|
const int32_t clamp_min = std::numeric_limits<uint8>::min();
|
|
for (int i = 0; i < excluding_last_dim; ++i) {
|
|
uint8_t max_val = std::numeric_limits<uint8>::min();
|
|
// Find max quantized value.
|
|
for (int j = 0; j < last_dim; ++j) {
|
|
max_val = std::max(max_val, input_data[j]);
|
|
}
|
|
|
|
float sum_exp = 0.0f;
|
|
const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
|
|
// Offset into table to compute exp(scale*(x - xmax)) instead of
|
|
// exp(scale*(x)) to prevent overflow.
|
|
const float* table_offset = &params.table[max_uint8 - max_val];
|
|
// Calculate sum(exp(scale*(x - x_max))).
|
|
for (int j = 0; j < last_dim; ++j) {
|
|
sum_exp += table_offset[input_data[j]];
|
|
}
|
|
const float log_sum_exp = std::log(sum_exp);
|
|
|
|
// params.scale is the output scale.
|
|
const float scale = input_scale / params.scale;
|
|
const float precomputed =
|
|
(input_scale * max_val + log_sum_exp) / params.scale;
|
|
for (int j = 0; j < last_dim; ++j) {
|
|
// Equivalent to (input_scale * (input_data[j] - max_val) - log_sum_exp) /
|
|
// output_scale.
|
|
const float log_prob = scale * input_data[j] - precomputed;
|
|
|
|
// TODO(tflite): look into better solution.
|
|
// Use std::rint over std::round (which is used in
|
|
// FakeQuant) since it's multiple times faster on tested arm32.
|
|
const int32_t prob_quantized = std::rint(log_prob) + params.zero_point;
|
|
output_data[j] = static_cast<uint8_t>(
|
|
std::max(std::min(clamp_max, prob_quantized), clamp_min));
|
|
}
|
|
input_data += last_dim;
|
|
output_data += last_dim;
|
|
}
|
|
}
|
|
|
|
inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("Logistic");
|
|
auto input_map = MapAsVector(input_data, input_shape);
|
|
auto output_map = MapAsVector(output_data, output_shape);
|
|
output_map.array() =
|
|
input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
|
|
}
|
|
|
|
// Convenience version that allows, for example, generated-code calls to be
|
|
// uniform between data types.
|
|
inline void Logistic(const LogisticParams&, const RuntimeShape& input_shape,
|
|
const float* input_data, const RuntimeShape& output_shape,
|
|
float* output_data) {
|
|
// Drop params: not needed.
|
|
Logistic(input_shape, input_data, output_shape, output_data);
|
|
}
|
|
|
|
inline void Logistic(const LogisticParams& params,
|
|
const RuntimeShape& input_shape, const int16* input_data,
|
|
const RuntimeShape& output_shape, int16* output_data) {
|
|
ruy::profiler::ScopeLabel label("Logistic/Int16");
|
|
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
|
|
|
int c = 0;
|
|
const int16* input_data_ptr = input_data;
|
|
int16* output_data_ptr = output_data;
|
|
#ifdef GEMMLOWP_NEON
|
|
{
|
|
// F0 uses 0 integer bits, range [-1, 1].
|
|
// This is the return type of math functions such as tanh, logistic,
|
|
// whose range is in [-1, 1].
|
|
using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
|
|
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
|
|
using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
|
|
|
|
for (; c <= flat_size - 16; c += 16) {
|
|
F3 input0 = F3::FromRaw(vld1q_s16(input_data_ptr));
|
|
F3 input1 = F3::FromRaw(vld1q_s16(input_data_ptr + 8));
|
|
F0 output0 = gemmlowp::logistic(input0);
|
|
F0 output1 = gemmlowp::logistic(input1);
|
|
vst1q_s16(output_data_ptr, output0.raw());
|
|
vst1q_s16(output_data_ptr + 8, output1.raw());
|
|
|
|
input_data_ptr += 16;
|
|
output_data_ptr += 16;
|
|
}
|
|
for (; c <= flat_size - 8; c += 8) {
|
|
F3 input = F3::FromRaw(vld1q_s16(input_data_ptr));
|
|
F0 output = gemmlowp::logistic(input);
|
|
vst1q_s16(output_data_ptr, output.raw());
|
|
|
|
input_data_ptr += 8;
|
|
output_data_ptr += 8;
|
|
}
|
|
}
|
|
#endif
|
|
#ifdef GEMMLOWP_SSE4
|
|
{
|
|
// F0 uses 0 integer bits, range [-1, 1].
|
|
// This is the return type of math functions such as tanh, logistic,
|
|
// whose range is in [-1, 1].
|
|
using F0 = gemmlowp::FixedPoint<gemmlowp::int16x8_m128i, 0>;
|
|
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
|
|
using F3 = gemmlowp::FixedPoint<gemmlowp::int16x8_m128i, 3>;
|
|
|
|
for (; c <= flat_size - 16; c += 16) {
|
|
F3 input0 = F3::FromRaw(gemmlowp::to_int16x8_m128i(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i*>(input_data_ptr))));
|
|
F3 input1 = F3::FromRaw(gemmlowp::to_int16x8_m128i(_mm_loadu_si128(
|
|
reinterpret_cast<const __m128i*>(input_data_ptr + 8))));
|
|
F0 output0 = gemmlowp::logistic(input0);
|
|
F0 output1 = gemmlowp::logistic(input1);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr),
|
|
output0.raw().v);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr + 8),
|
|
output1.raw().v);
|
|
input_data_ptr += 16;
|
|
output_data_ptr += 16;
|
|
}
|
|
for (; c <= flat_size - 8; c += 8) {
|
|
F3 input = F3::FromRaw(gemmlowp::to_int16x8_m128i(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i*>(input_data_ptr))));
|
|
F0 output = gemmlowp::logistic(input);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr),
|
|
output.raw().v);
|
|
input_data_ptr += 8;
|
|
output_data_ptr += 8;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
{
|
|
// F0 uses 0 integer bits, range [-1, 1].
|
|
// This is the return type of math functions such as tanh, logistic,
|
|
// whose range is in [-1, 1].
|
|
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
|
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
|
|
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
|
|
|
|
for (; c < flat_size; ++c) {
|
|
F3 input = F3::FromRaw(*input_data_ptr);
|
|
F0 output = gemmlowp::logistic(input);
|
|
*output_data_ptr = output.raw();
|
|
|
|
++input_data_ptr;
|
|
++output_data_ptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
inline void Tanh(const RuntimeShape& input_shape, const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("Tanh");
|
|
auto input_map = MapAsVector(input_data, input_shape);
|
|
auto output_map = MapAsVector(output_data, output_shape);
|
|
output_map.array() = input_map.array().tanh();
|
|
}
|
|
|
|
// Convenience version that allows, for example, generated-code calls to be
|
|
// uniform between data types.
|
|
inline void Tanh(const TanhParams&, const RuntimeShape& input_shape,
|
|
const float* input_data, const RuntimeShape& output_shape,
|
|
float* output_data) {
|
|
// Drop params: not needed.
|
|
Tanh(input_shape, input_data, output_shape, output_data);
|
|
}
|
|
|
|
inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
|
|
const int16* input_data, const RuntimeShape& output_shape,
|
|
int16* output_data) {
|
|
ruy::profiler::ScopeLabel label("Tanh/Int16");
|
|
const int input_left_shift = params.input_left_shift;
|
|
// Support for shifts is limited until we have a parameterized version of
|
|
// SaturatingRoundingMultiplyByPOT().
|
|
TFLITE_DCHECK_GE(input_left_shift, 0);
|
|
TFLITE_DCHECK_LE(input_left_shift, 1);
|
|
|
|
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
|
|
|
int c = 0;
|
|
const int16* input_data_ptr = input_data;
|
|
int16* output_data_ptr = output_data;
|
|
#ifdef GEMMLOWP_NEON
|
|
{
|
|
// F0 uses 0 integer bits, range [-1, 1].
|
|
// This is the return type of math functions such as tanh, logistic,
|
|
// whose range is in [-1, 1].
|
|
using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
|
|
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
|
|
using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
|
|
|
|
if (input_left_shift == 0) {
|
|
for (; c <= flat_size - 16; c += 16) {
|
|
F3 input0 = F3::FromRaw(vld1q_s16(input_data_ptr));
|
|
F3 input1 = F3::FromRaw(vld1q_s16(input_data_ptr + 8));
|
|
F0 output0 = gemmlowp::tanh(input0);
|
|
F0 output1 = gemmlowp::tanh(input1);
|
|
vst1q_s16(output_data_ptr, output0.raw());
|
|
vst1q_s16(output_data_ptr + 8, output1.raw());
|
|
|
|
input_data_ptr += 16;
|
|
output_data_ptr += 16;
|
|
}
|
|
for (; c <= flat_size - 8; c += 8) {
|
|
F3 input = F3::FromRaw(vld1q_s16(input_data_ptr));
|
|
F0 output = gemmlowp::tanh(input);
|
|
vst1q_s16(output_data_ptr, output.raw());
|
|
|
|
input_data_ptr += 8;
|
|
output_data_ptr += 8;
|
|
}
|
|
} else {
|
|
for (; c <= flat_size - 16; c += 16) {
|
|
F3 input0 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
|
|
vld1q_s16(input_data_ptr)));
|
|
F3 input1 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
|
|
vld1q_s16(input_data_ptr + 8)));
|
|
F0 output0 = gemmlowp::tanh(input0);
|
|
F0 output1 = gemmlowp::tanh(input1);
|
|
vst1q_s16(output_data_ptr, output0.raw());
|
|
vst1q_s16(output_data_ptr + 8, output1.raw());
|
|
|
|
input_data_ptr += 16;
|
|
output_data_ptr += 16;
|
|
}
|
|
for (; c <= flat_size - 8; c += 8) {
|
|
F3 input = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
|
|
vld1q_s16(input_data_ptr)));
|
|
F0 output = gemmlowp::tanh(input);
|
|
vst1q_s16(output_data_ptr, output.raw());
|
|
|
|
input_data_ptr += 8;
|
|
output_data_ptr += 8;
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
#ifdef GEMMLOWP_SSE4
|
|
{
|
|
// F0 uses 0 integer bits, range [-1, 1].
|
|
// This is the return type of math functions such as tanh, logistic,
|
|
// whose range is in [-1, 1].
|
|
using F0 = gemmlowp::FixedPoint<gemmlowp::int16x8_m128i, 0>;
|
|
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
|
|
using F3 = gemmlowp::FixedPoint<gemmlowp::int16x8_m128i, 3>;
|
|
|
|
if (input_left_shift == 0) {
|
|
for (; c <= flat_size - 16; c += 16) {
|
|
F3 input0 = F3::FromRaw(gemmlowp::to_int16x8_m128i(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i*>(input_data_ptr))));
|
|
F3 input1 = F3::FromRaw(gemmlowp::to_int16x8_m128i(_mm_loadu_si128(
|
|
reinterpret_cast<const __m128i*>(input_data_ptr + 8))));
|
|
F0 output0 = gemmlowp::tanh(input0);
|
|
F0 output1 = gemmlowp::tanh(input1);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr),
|
|
output0.raw().v);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr + 8),
|
|
output1.raw().v);
|
|
|
|
input_data_ptr += 16;
|
|
output_data_ptr += 16;
|
|
}
|
|
for (; c <= flat_size - 8; c += 8) {
|
|
F3 input = F3::FromRaw(gemmlowp::to_int16x8_m128i(
|
|
_mm_loadu_si128(reinterpret_cast<const __m128i*>(input_data_ptr))));
|
|
F0 output = gemmlowp::tanh(input);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr),
|
|
output.raw().v);
|
|
input_data_ptr += 8;
|
|
output_data_ptr += 8;
|
|
}
|
|
} else {
|
|
for (; c <= flat_size - 16; c += 16) {
|
|
F3 input0 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
|
|
gemmlowp::to_int16x8_m128i(_mm_loadu_si128(
|
|
reinterpret_cast<const __m128i*>(input_data_ptr)))));
|
|
F3 input1 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
|
|
gemmlowp::to_int16x8_m128i(_mm_loadu_si128(
|
|
reinterpret_cast<const __m128i*>(input_data_ptr + 8)))));
|
|
F0 output0 = gemmlowp::tanh(input0);
|
|
F0 output1 = gemmlowp::tanh(input1);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr),
|
|
output0.raw().v);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr + 8),
|
|
output1.raw().v);
|
|
|
|
input_data_ptr += 16;
|
|
output_data_ptr += 16;
|
|
}
|
|
for (; c <= flat_size - 8; c += 8) {
|
|
F3 input = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
|
|
gemmlowp::to_int16x8_m128i(_mm_loadu_si128(
|
|
reinterpret_cast<const __m128i*>(input_data_ptr)))));
|
|
F0 output = gemmlowp::tanh(input);
|
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr),
|
|
output.raw().v);
|
|
input_data_ptr += 8;
|
|
output_data_ptr += 8;
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
{
|
|
// F0 uses 0 integer bits, range [-1, 1].
|
|
// This is the return type of math functions such as tanh, logistic,
|
|
// whose range is in [-1, 1].
|
|
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
|
|
// F3 uses 3 integer bits, range [-8, 8], the input range expected here.
|
|
using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
|
|
|
|
if (input_left_shift == 0) {
|
|
for (; c < flat_size; ++c) {
|
|
F3 input = F3::FromRaw(*input_data_ptr);
|
|
F0 output = gemmlowp::tanh(input);
|
|
*output_data_ptr = output.raw();
|
|
|
|
++input_data_ptr;
|
|
++output_data_ptr;
|
|
}
|
|
} else {
|
|
for (; c < flat_size; ++c) {
|
|
F3 input = F3::FromRaw(
|
|
gemmlowp::SaturatingRoundingMultiplyByPOT<1>(*input_data_ptr));
|
|
F0 output = gemmlowp::tanh(input);
|
|
*output_data_ptr = output.raw();
|
|
|
|
++input_data_ptr;
|
|
++output_data_ptr;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename SrcT, typename DstT>
|
|
inline void Cast(const RuntimeShape& input_shape, const SrcT* input_data,
|
|
const RuntimeShape& output_shape, DstT* output_data) {
|
|
ruy::profiler::ScopeLabel label("Cast");
|
|
auto input_map = MapAsVector(input_data, input_shape);
|
|
auto output_map = MapAsVector(output_data, output_shape);
|
|
output_map.array() = input_map.array().template cast<DstT>();
|
|
}
|
|
|
|
inline void Floor(const RuntimeShape& input_shape, const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("Floor");
|
|
auto input_map = MapAsVector(input_data, input_shape);
|
|
auto output_map = MapAsVector(output_data, output_shape);
|
|
output_map.array() = Eigen::floor(input_map.array());
|
|
}
|
|
|
|
inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("Ceil");
|
|
auto input_map = MapAsVector(input_data, input_shape);
|
|
auto output_map = MapAsVector(output_data, output_shape);
|
|
output_map.array() = Eigen::ceil(input_map.array());
|
|
}
|
|
|
|
#ifdef USE_NEON
|
|
inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
|
|
float scale, float* output_ptr) {
|
|
int ic = 0;
|
|
// Handle 32 input channels at a time.
|
|
for (; ic <= depth - 32; ic += 32) {
|
|
float32x4x2_t input[4];
|
|
for (int i = 0; i < 4; i++) {
|
|
input[i].val[0] = vld1q_f32(input_ptr + 8 * i);
|
|
input[i].val[1] = vld1q_f32(input_ptr + 8 * i + 4);
|
|
}
|
|
float32x4x2_t acc[4];
|
|
for (int i = 0; i < 4; i++) {
|
|
acc[i].val[0] = vld1q_f32(output_ptr + 8 * i);
|
|
acc[i].val[1] = vld1q_f32(output_ptr + 8 * i + 4);
|
|
}
|
|
for (int i = 0; i < 4; i++) {
|
|
acc[i].val[0] = vmlaq_n_f32(acc[i].val[0], input[i].val[0], scale);
|
|
acc[i].val[1] = vmlaq_n_f32(acc[i].val[1], input[i].val[1], scale);
|
|
}
|
|
for (int i = 0; i < 4; i++) {
|
|
vst1q_f32(output_ptr, acc[i].val[0]);
|
|
vst1q_f32(output_ptr + 4, acc[i].val[1]);
|
|
output_ptr += 8;
|
|
}
|
|
input_ptr += 32;
|
|
}
|
|
// Handle 16 input channels at a time.
|
|
for (; ic <= depth - 16; ic += 16) {
|
|
float32x4x2_t input[2];
|
|
for (int i = 0; i < 2; i++) {
|
|
input[i].val[0] = vld1q_f32(input_ptr + 8 * i);
|
|
input[i].val[1] = vld1q_f32(input_ptr + 8 * i + 4);
|
|
}
|
|
float32x4x2_t acc[2];
|
|
for (int i = 0; i < 2; i++) {
|
|
acc[i].val[0] = vld1q_f32(output_ptr + 8 * i);
|
|
acc[i].val[1] = vld1q_f32(output_ptr + 8 * i + 4);
|
|
}
|
|
for (int i = 0; i < 2; i++) {
|
|
acc[i].val[0] = vmlaq_n_f32(acc[i].val[0], input[i].val[0], scale);
|
|
acc[i].val[1] = vmlaq_n_f32(acc[i].val[1], input[i].val[1], scale);
|
|
}
|
|
for (int i = 0; i < 2; i++) {
|
|
vst1q_f32(output_ptr, acc[i].val[0]);
|
|
vst1q_f32(output_ptr + 4, acc[i].val[1]);
|
|
output_ptr += 8;
|
|
}
|
|
input_ptr += 16;
|
|
}
|
|
// Handle 8 input channels at a time.
|
|
for (; ic <= depth - 8; ic += 8) {
|
|
float32x4x2_t input;
|
|
input.val[0] = vld1q_f32(input_ptr);
|
|
input.val[1] = vld1q_f32(input_ptr + 4);
|
|
|
|
float32x4x2_t acc;
|
|
acc.val[0] = vld1q_f32(output_ptr);
|
|
acc.val[1] = vld1q_f32(output_ptr + 4);
|
|
acc.val[0] = vmlaq_n_f32(acc.val[0], input.val[0], scale);
|
|
acc.val[1] = vmlaq_n_f32(acc.val[1], input.val[1], scale);
|
|
|
|
vst1q_f32(output_ptr, acc.val[0]);
|
|
vst1q_f32(output_ptr + 4, acc.val[1]);
|
|
|
|
input_ptr += 8;
|
|
output_ptr += 8;
|
|
}
|
|
// Handle 4 input channels at a time.
|
|
for (; ic <= depth - 4; ic += 4) {
|
|
float32x4_t input = vld1q_f32(input_ptr);
|
|
float32x4_t acc = vld1q_f32(output_ptr);
|
|
|
|
acc = vmlaq_n_f32(acc, input, scale);
|
|
vst1q_f32(output_ptr, acc);
|
|
|
|
input_ptr += 4;
|
|
output_ptr += 4;
|
|
}
|
|
// Handle 1 input channel at a time.
|
|
for (; ic < depth; ic++) {
|
|
*output_ptr += *input_ptr * scale;
|
|
output_ptr++;
|
|
input_ptr++;
|
|
}
|
|
}
|
|
#else
|
|
inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
|
|
float scale, float* output_ptr) {
|
|
for (int32 i = 0; i < depth; i++) {
|
|
*output_ptr += *input_ptr * scale;
|
|
output_ptr++;
|
|
input_ptr++;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
|
|
int32 x, int32 y, int32 depth, int32 batch,
|
|
const RuntimeShape& input_shape,
|
|
const float* input_data,
|
|
const RuntimeShape& output_shape,
|
|
float* output_data) {
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
|
|
const int32 input_width = input_shape.Dims(2);
|
|
const int32 output_width = output_shape.Dims(2);
|
|
|
|
const int32 input_x_offset = (x1 - x0) * depth;
|
|
const int32 input_y_offset = (y1 - y0) * depth * input_width;
|
|
const int32 output_x_offset = depth;
|
|
const int32 output_y_offset = depth * output_width;
|
|
|
|
#ifdef USE_NEON
|
|
TFLITE_DCHECK(x1 >= x0);
|
|
TFLITE_DCHECK(y1 >= y0);
|
|
|
|
int ic = 0;
|
|
// Handle 8 input channels at a time.
|
|
for (; ic <= depth - 8; ic += 8) {
|
|
const float* input_ptr = nullptr;
|
|
|
|
float32x4x2_t x0y0;
|
|
input_ptr = &input_data[Offset(input_shape, batch, y0, x0, ic)];
|
|
x0y0.val[0] = vld1q_f32(input_ptr);
|
|
x0y0.val[1] = vld1q_f32(input_ptr + 4);
|
|
|
|
float32x4x2_t x1y0;
|
|
input_ptr += input_x_offset;
|
|
x1y0.val[0] = vld1q_f32(input_ptr);
|
|
x1y0.val[1] = vld1q_f32(input_ptr + 4);
|
|
|
|
float32x4x2_t x0y1;
|
|
input_ptr += -input_x_offset + input_y_offset;
|
|
x0y1.val[0] = vld1q_f32(input_ptr);
|
|
x0y1.val[1] = vld1q_f32(input_ptr + 4);
|
|
|
|
float32x4x2_t x1y1;
|
|
input_ptr += input_x_offset;
|
|
x1y1.val[0] = vld1q_f32(input_ptr);
|
|
x1y1.val[1] = vld1q_f32(input_ptr + 4);
|
|
|
|
// Top left corner.
|
|
float* output_ptr = &output_data[Offset(output_shape, batch, y, x, ic)];
|
|
vst1q_f32(output_ptr, x0y0.val[0]);
|
|
vst1q_f32(output_ptr + 4, x0y0.val[1]);
|
|
|
|
// Top right corner.
|
|
output_ptr += output_x_offset;
|
|
float32x4x2_t tr;
|
|
tr.val[0] = vaddq_f32(x0y0.val[0], x1y0.val[0]);
|
|
tr.val[1] = vaddq_f32(x0y0.val[1], x1y0.val[1]);
|
|
tr.val[0] = vmulq_n_f32(tr.val[0], 0.5f);
|
|
tr.val[1] = vmulq_n_f32(tr.val[1], 0.5f);
|
|
|
|
vst1q_f32(output_ptr, tr.val[0]);
|
|
vst1q_f32(output_ptr + 4, tr.val[1]);
|
|
|
|
// Bottom left corner.
|
|
output_ptr += -output_x_offset + output_y_offset;
|
|
float32x4x2_t bl;
|
|
bl.val[0] = vaddq_f32(x0y0.val[0], x0y1.val[0]);
|
|
bl.val[1] = vaddq_f32(x0y0.val[1], x0y1.val[1]);
|
|
bl.val[0] = vmulq_n_f32(bl.val[0], 0.5f);
|
|
bl.val[1] = vmulq_n_f32(bl.val[1], 0.5f);
|
|
vst1q_f32(output_ptr, bl.val[0]);
|
|
vst1q_f32(output_ptr + 4, bl.val[1]);
|
|
|
|
// Bottom right corner.
|
|
output_ptr += output_x_offset;
|
|
float32x4x2_t br;
|
|
br.val[0] = vaddq_f32(x1y0.val[0], x1y1.val[0]);
|
|
br.val[1] = vaddq_f32(x1y0.val[1], x1y1.val[1]);
|
|
br.val[0] = vmlaq_n_f32(bl.val[0], br.val[0], 0.5f);
|
|
br.val[1] = vmlaq_n_f32(bl.val[1], br.val[1], 0.5f);
|
|
br.val[0] = vmulq_n_f32(br.val[0], 0.5f);
|
|
br.val[1] = vmulq_n_f32(br.val[1], 0.5f);
|
|
vst1q_f32(output_ptr, br.val[0]);
|
|
vst1q_f32(output_ptr + 4, br.val[1]);
|
|
}
|
|
// Handle 4 input channels at a time.
|
|
for (; ic <= depth - 4; ic += 4) {
|
|
const float* input_ptr =
|
|
&input_data[Offset(input_shape, batch, y0, x0, ic)];
|
|
float32x4_t x0y0 = vld1q_f32(input_ptr);
|
|
float32x4_t x1y0 = vld1q_f32(input_ptr + input_x_offset);
|
|
float32x4_t x0y1 = vld1q_f32(input_ptr + input_y_offset);
|
|
float32x4_t x1y1 = vld1q_f32(input_ptr + input_x_offset + input_y_offset);
|
|
|
|
// Top left corner.
|
|
float* output_ptr = &output_data[Offset(output_shape, batch, y, x, ic)];
|
|
vst1q_f32(output_ptr, x0y0);
|
|
|
|
// Top right corner.
|
|
output_ptr += output_x_offset;
|
|
float32x4_t tr = vaddq_f32(x0y0, x1y0);
|
|
tr = vmulq_n_f32(tr, 0.5f);
|
|
vst1q_f32(output_ptr, tr);
|
|
|
|
// Bottom left corner.
|
|
output_ptr += -output_x_offset + output_y_offset;
|
|
float32x4_t bl = vaddq_f32(x0y0, x0y1);
|
|
bl = vmulq_n_f32(bl, 0.5f);
|
|
vst1q_f32(output_ptr, bl);
|
|
|
|
// Bottom right corner.
|
|
output_ptr += output_x_offset;
|
|
float32x4_t br = vaddq_f32(x1y0, x1y1);
|
|
br = vmlaq_n_f32(bl, br, 0.5f);
|
|
br = vmulq_n_f32(br, 0.5f);
|
|
vst1q_f32(output_ptr, br);
|
|
}
|
|
// Handle one input channel at a time.
|
|
for (; ic < depth; ic++) {
|
|
const int32 input_offset = Offset(input_shape, batch, y0, x0, ic);
|
|
|
|
float x0y0 = input_data[input_offset];
|
|
float x1y0 = input_data[input_offset + input_x_offset];
|
|
float x0y1 = input_data[input_offset + input_y_offset];
|
|
float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
|
|
|
|
// Top left corner.
|
|
const int32 output_offset = Offset(output_shape, batch, y, x, ic);
|
|
output_data[output_offset] = x0y0;
|
|
|
|
// Top right corner.
|
|
output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
|
|
|
|
// Bottom left corner.
|
|
float output = (x0y0 + x0y1) / 2;
|
|
output_data[output_offset + output_y_offset] = output;
|
|
|
|
// Bottom right corner.
|
|
output_data[output_offset + output_x_offset + output_y_offset] =
|
|
(output + ((x1y0 + x1y1) / 2)) / 2;
|
|
}
|
|
#else
|
|
for (int ch = 0; ch < depth; ch++) {
|
|
const int32 input_offset = Offset(input_shape, batch, y0, x0, ch);
|
|
|
|
float x0y0 = input_data[input_offset];
|
|
float x1y0 = input_data[input_offset + input_x_offset];
|
|
float x0y1 = input_data[input_offset + input_y_offset];
|
|
float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
|
|
|
|
// Top left corner.
|
|
const int32 output_offset = Offset(output_shape, batch, y, x, ch);
|
|
output_data[output_offset] = x0y0;
|
|
|
|
// Top right corner.
|
|
output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
|
|
|
|
// Bottom left corner.
|
|
float output = (x0y0 + x0y1) / 2;
|
|
output_data[output_offset + output_y_offset] = output;
|
|
|
|
// Bottom right corner.
|
|
output_data[output_offset + output_x_offset + output_y_offset] =
|
|
(output + ((x1y0 + x1y1) / 2)) / 2;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
inline void ResizeBilinear2x2(int32 batches, int32 input_height,
|
|
int32 input_width, int32 depth,
|
|
int32 output_height, int32 output_width,
|
|
const RuntimeShape& input_shape,
|
|
const float* input_data,
|
|
const RuntimeShape& output_shape,
|
|
float* output_data) {
|
|
for (int b = 0; b < batches; b++) {
|
|
for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++) {
|
|
for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++) {
|
|
int32 x1 = std::min(x0 + 1, input_width - 1);
|
|
int32 y1 = std::min(y0 + 1, input_height - 1);
|
|
ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape,
|
|
input_data, output_shape, output_data);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
inline void ResizeBilinearGeneric(
|
|
int32 batches, int32 input_height, int32 input_width, int32 depth,
|
|
int32 output_height, int32 output_width, float height_scale,
|
|
float width_scale, const RuntimeShape& input_shape, const float* input_data,
|
|
const RuntimeShape& output_shape, float* output_data,
|
|
const bool half_pixel_centers) {
|
|
memset(output_data, 0,
|
|
batches * output_height * output_width * depth * sizeof(float));
|
|
|
|
int32 output_offset = 0;
|
|
for (int b = 0; b < batches; ++b) {
|
|
for (int y = 0; y < output_height; ++y) {
|
|
float input_y;
|
|
int32 y0, y1;
|
|
reference_ops::ComputeInterpolationValues(
|
|
y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
|
|
&y1);
|
|
for (int x = 0; x < output_width; ++x) {
|
|
float input_x;
|
|
int32 x0, x1;
|
|
reference_ops::ComputeInterpolationValues(
|
|
x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
|
|
&x1);
|
|
float* output_ptr = &output_data[output_offset];
|
|
|
|
// Run kernel on the 4 corners of the bilinear resize algorithm.
|
|
int32 input_offset = Offset(input_shape, b, y0, x0, 0);
|
|
float scale = (1 - (input_y - y0)) * (1 - (input_x - x0));
|
|
const float* input_ptr = &input_data[input_offset];
|
|
ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
|
|
|
|
input_offset = Offset(input_shape, b, y0, x1, 0);
|
|
scale = (1 - (input_y - y0)) * (input_x - x0);
|
|
input_ptr = &input_data[input_offset];
|
|
ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
|
|
|
|
input_offset = Offset(input_shape, b, y1, x0, 0);
|
|
scale = (input_y - y0) * (1 - (input_x - x0));
|
|
input_ptr = &input_data[input_offset];
|
|
ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
|
|
|
|
input_offset = Offset(input_shape, b, y1, x1, 0);
|
|
scale = (input_y - y0) * (input_x - x0);
|
|
input_ptr = &input_data[input_offset];
|
|
ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
|
|
|
|
output_offset += depth;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
inline void ResizeBilinearGenericSmallChannel(
|
|
int32 batches, int32 input_height, int32 input_width, int32 depth,
|
|
int32 output_height, int32 output_width, float height_scale,
|
|
float width_scale, const RuntimeShape& input_shape, const T* input_data,
|
|
const RuntimeShape& output_shape, T* output_data,
|
|
const bool half_pixel_centers) {
|
|
T* output_ptr = &output_data[0];
|
|
for (int b = 0; b < batches; ++b) {
|
|
for (int y = 0; y < output_height; ++y) {
|
|
float input_y;
|
|
int32 y0, y1;
|
|
reference_ops::ComputeInterpolationValues(
|
|
y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
|
|
&y1);
|
|
for (int x = 0; x < output_width; ++x) {
|
|
float input_x;
|
|
int32 x0, x1;
|
|
reference_ops::ComputeInterpolationValues(
|
|
x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
|
|
&x1);
|
|
|
|
int32 input_offset[4] = {Offset(input_shape, b, y0, x0, 0),
|
|
Offset(input_shape, b, y0, x1, 0),
|
|
Offset(input_shape, b, y1, x0, 0),
|
|
Offset(input_shape, b, y1, x1, 0)};
|
|
float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)),
|
|
(1 - (input_y - y0)) * (input_x - x0),
|
|
(input_y - y0) * (1 - (input_x - x0)),
|
|
(input_y - y0) * (input_x - x0)};
|
|
|
|
for (int d = 0; d < depth; d++) {
|
|
const T* input_ptr = &input_data[d];
|
|
*output_ptr++ = static_cast<T>(input_ptr[input_offset[0]] * scale[0] +
|
|
input_ptr[input_offset[1]] * scale[1] +
|
|
input_ptr[input_offset[2]] * scale[2] +
|
|
input_ptr[input_offset[3]] * scale[3]);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
|
|
const RuntimeShape& unextended_input_shape,
|
|
const float* input_data,
|
|
const RuntimeShape& output_size_shape,
|
|
const int32* output_size_data,
|
|
const RuntimeShape& unextended_output_shape,
|
|
float* output_data) {
|
|
ruy::profiler::ScopeLabel label("ResizeBilinear");
|
|
// If half_pixel_centers is True, align_corners must be False.
|
|
TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
|
|
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
|
const RuntimeShape input_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
|
const RuntimeShape output_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
|
|
|
int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
|
|
int32 input_height = input_shape.Dims(1);
|
|
int32 input_width = input_shape.Dims(2);
|
|
int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
|
|
|
|
TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
|
|
int32 output_height = output_size_data[0];
|
|
int32 output_width = output_size_data[1];
|
|
|
|
// Specialize for 2x2 upsample.
|
|
if (!op_params.align_corners && !op_params.half_pixel_centers &&
|
|
output_height == 2 * input_height && output_width == 2 * input_width) {
|
|
ResizeBilinear2x2(batches, input_height, input_width, depth, output_height,
|
|
output_width, input_shape, input_data, output_shape,
|
|
output_data);
|
|
} else {
|
|
float height_scale = static_cast<float>(input_height) / output_height;
|
|
float width_scale = static_cast<float>(input_width) / output_width;
|
|
if (op_params.align_corners && output_height > 1) {
|
|
height_scale = static_cast<float>(input_height - 1) / (output_height - 1);
|
|
}
|
|
if (op_params.align_corners && output_width > 1) {
|
|
width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
|
|
}
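// Illustrative example: with align_corners, input_height = 4 and
// output_height = 7 give height_scale = 3 / 6 = 0.5, so the last output row
// (index 6) maps back exactly onto the last input row (index 3).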
ResizeBilinearGeneric(batches, input_height, input_width, depth,
|
|
output_height, output_width, height_scale,
|
|
width_scale, input_shape, input_data, output_shape,
|
|
output_data, op_params.half_pixel_centers);
|
|
}
|
|
}
|
|
|
|
// TODO(prabhumk): This is not a real quantized bilinear. It does not use int8
|
|
// or int16 arithmetic.
|
|
inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
|
|
const RuntimeShape& unextended_input_shape,
|
|
const uint8* input_data,
|
|
const RuntimeShape& output_size_shape,
|
|
const int32* output_size_data,
|
|
const RuntimeShape& unextended_output_shape,
|
|
uint8* output_data) {
|
|
ruy::profiler::ScopeLabel label("ResizeBilinear");
|
|
// If half_pixel_centers is True, align_corners must be False.
|
|
TFLITE_DCHECK(!op_params.half_pixel_centers || !op_params.align_corners);
|
|
TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
|
|
const RuntimeShape input_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_input_shape);
|
|
const RuntimeShape output_shape =
|
|
RuntimeShape::ExtendedShape(4, unextended_output_shape);
|
|
|
|
int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
|
|
int32 input_height = input_shape.Dims(1);
|
|
int32 input_width = input_shape.Dims(2);
|
|
int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
|
|
|
|
TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
|
|
int32 output_height = output_size_data[0];
|
|
int32 output_width = output_size_data[1];
|
|
|
|
float height_scale =
|
|
(op_params.align_corners && output_height > 1)
|
|
? (static_cast<float>(input_height - 1) / (output_height - 1))
|
|
: (static_cast<float>(input_height) / output_height);
|
|
|
|
float width_scale =
|
|
(op_params.align_corners && output_width > 1)
|
|
? (static_cast<float>(input_width - 1) / (output_width - 1))
|
|
: (static_cast<float>(input_width) / output_width);
|
|
|
|
ResizeBilinearGenericSmallChannel<uint8>(
|
|
batches, input_height, input_width, depth, output_height, output_width,
|
|
height_scale, width_scale, input_shape, input_data, output_shape,
|
|
output_data, op_params.half_pixel_centers);
|
|
}
|
|
|
|
// Helper methods for BatchToSpaceND.
|
|
// `spatial_index_dim` specifies the post-crop offset index in this spatial
// dimension, i.e. the spatial offset introduced by flattening batch into this
// spatial dimension, minus the crop size at the beginning. `block_shape_dim`
// is the block size in the current dimension. `input_dim` and `output_dim`
// are the input and output sizes of the BatchToSpaceND operation in the
// current dimension.
// The computed start index is inclusive and the end index is exclusive.
|
|
inline void GetIndexRange(int spatial_index_dim, int block_shape_dim,
|
|
int input_dim, int output_dim, int* start_index,
|
|
int* end_index) {
|
|
// (*start_index) * block_shape_dim is effectively rounded up to the next
|
|
// multiple of block_shape_dim by the integer division.
|
|
*start_index =
|
|
std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
|
|
// Similarly, (*end_index) * block_shape_dim is rounded up too (note that
|
|
// end_index is exclusive).
|
|
*end_index = std::min(
|
|
input_dim,
|
|
(output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
|
|
}
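
// Worked example (illustrative numbers): with spatial_index_dim = -2,
// block_shape_dim = 3, input_dim = 8 and output_dim = 5:
//   *start_index = max(0, (2 + 3 - 1) / 3)     = 1
//   *end_index   = min(8, (5 + 2 + 3 - 1) / 3) = 3
// so only input indices 1 and 2 contribute; they map to output indices
// 1 * 3 - 2 = 1 and 2 * 3 - 2 = 4, both inside [0, output_dim).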
template <typename T>
|
|
inline void BatchToSpaceND(
|
|
const RuntimeShape& unextended_input1_shape, const T* input1_data,
|
|
const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
|
|
const RuntimeShape& unextended_input3_shape, const int32* crops_data,
|
|
const RuntimeShape& unextended_output_shape, T* output_data) {
|
|
ruy::profiler::ScopeLabel label("BatchToSpaceND");
|
|
|
|
TFLITE_DCHECK_GE(unextended_input1_shape.DimensionsCount(), 3);
|
|
TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
|
|
TFLITE_DCHECK_EQ(unextended_input1_shape.DimensionsCount(),
|
|
unextended_output_shape.DimensionsCount());
|
|
|
|
// Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
|
|
auto extend_shape = [](const RuntimeShape& shape) {
|
|
if (shape.DimensionsCount() == 4) {
|
|
return shape;
|
|
}
|
|
RuntimeShape new_shape(4, 1);
|
|
new_shape.SetDim(0, shape.Dims(0));
|
|
new_shape.SetDim(1, shape.Dims(1));
|
|
new_shape.SetDim(3, shape.Dims(2));
|
|
return new_shape;
|
|
};
|
|
const RuntimeShape input1_shape = extend_shape(unextended_input1_shape);
|
|
const RuntimeShape output_shape = extend_shape(unextended_output_shape);
|
|
|
|
const int output_width = output_shape.Dims(2);
|
|
const int output_height = output_shape.Dims(1);
|
|
const int output_batch_size = output_shape.Dims(0);
|
|
|
|
const int depth = input1_shape.Dims(3);
|
|
const int input_width = input1_shape.Dims(2);
|
|
const int input_height = input1_shape.Dims(1);
|
|
const int input_batch_size = input1_shape.Dims(0);
|
|
|
|
const int block_shape_height = block_shape_data[0];
|
|
const int block_shape_width =
|
|
unextended_input1_shape.DimensionsCount() == 4 ? block_shape_data[1] : 1;
|
|
const int crops_top = crops_data[0];
|
|
const int crops_left =
|
|
unextended_input1_shape.DimensionsCount() == 4 ? crops_data[2] : 0;
|
|
|
|
for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
|
|
const int out_batch = in_batch % output_batch_size;
|
|
const int spatial_offset = in_batch / output_batch_size;
|
|
|
|
int in_h_start = 0;
|
|
int in_h_end = 0;
|
|
// GetIndexRange ensures start and end indices are in [0, output_height).
|
|
GetIndexRange(spatial_offset / block_shape_width - crops_top,
|
|
block_shape_height, input_height, output_height, &in_h_start,
|
|
&in_h_end);
|
|
|
|
for (int in_h = in_h_start; in_h < in_h_end; ++in_h) {
|
|
const int out_h = in_h * block_shape_height +
|
|
spatial_offset / block_shape_width - crops_top;
|
|
TFLITE_DCHECK_GE(out_h, 0);
|
|
TFLITE_DCHECK_LT(out_h, output_height);
|
|
|
|
int in_w_start = 0;
|
|
int in_w_end = 0;
|
|
// GetIndexRange ensures start and end indices are in [0, output_width).
|
|
GetIndexRange(spatial_offset % block_shape_width - crops_left,
|
|
block_shape_width, input_width, output_width, &in_w_start,
|
|
&in_w_end);
|
|
|
|
for (int in_w = in_w_start; in_w < in_w_end; ++in_w) {
|
|
const int out_w = in_w * block_shape_width +
|
|
spatial_offset % block_shape_width - crops_left;
|
|
TFLITE_DCHECK_GE(out_w, 0);
|
|
TFLITE_DCHECK_LT(out_w, output_width);
|
|
T* out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
|
|
const T* in =
|
|
input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
|
|
memcpy(out, in, depth * sizeof(T));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
void TypedMemset(void* ptr, T value, size_t num) {
|
|
// Optimization for common cases where memset() will suffice.
|
|
if (value == 0 || std::is_same<T, uint8_t>::value) {
|
|
memset(ptr, value, num * sizeof(T));
|
|
} else {
|
|
// Default implementation for cases where memset() will not preserve the
|
|
// bytes, e.g., typically when sizeof(T) > sizeof(uint8_t).
|
|
char* pos = static_cast<char*>(ptr);
|
|
for (size_t i = 0; i < num; ++i) {
|
|
memcpy(pos, &value, sizeof(T));
|
|
pos = pos + sizeof(T);
|
|
}
|
|
}
|
|
}
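
// For example (illustrative): TypedMemset<float>(ptr, 0.0f, n) can take the
// memset() path because a zero float is all-zero bytes, whereas a non-zero
// float value has to go through the element-by-element memcpy loop above.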
// This makes heavy use of Offset, along with conditional branches. There may be
|
|
// opportunities for improvement.
|
|
//
|
|
// There are two versions of pad: Pad and PadV2. In PadV2 there is a second
|
|
// scalar input that provides the padding value. Therefore pad_value_ptr can be
|
|
// equivalent to a simple input1_data. For Pad, it should point to a zero
|
|
// value.
|
|
//
|
|
// Note that two typenames are required, so that T=P=int32 is considered a
|
|
// specialization distinct from P=int32.
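// For example (illustrative): a 2-D pad with left_padding = {1, 2} is
// extended below to left_padding_copy = {0, 0, 1, 2}, i.e. the two leading
// dimensions of the 4-D extended shape receive no padding.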
template <typename T, typename P>
|
|
inline void PadImpl(const tflite::PadParams& op_params,
|
|
const RuntimeShape& input_shape, const T* input_data,
|
|
                    const P* pad_value_ptr, const RuntimeShape& output_shape,
                    T* output_data) {
  ruy::profiler::ScopeLabel label("Pad4DSlowImpl");
  const RuntimeShape ext_input_shape =
      RuntimeShape::ExtendedShape(4, input_shape);
  const RuntimeShape ext_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);
  TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
  TFLITE_DCHECK_LE(op_params.right_padding_count, 4);

  // Pad kernels are limited to max 4 dimensions. Copy inputs so we can pad
  // them to 4 dims (yes, we are "padding the padding").
  std::vector<int> left_padding_copy(4, 0);
  const int left_padding_extend = 4 - op_params.left_padding_count;
  for (int i = 0; i < op_params.left_padding_count; ++i) {
    left_padding_copy[left_padding_extend + i] = op_params.left_padding[i];
  }
  std::vector<int> right_padding_copy(4, 0);
  const int right_padding_extend = 4 - op_params.right_padding_count;
  for (int i = 0; i < op_params.right_padding_count; ++i) {
    right_padding_copy[right_padding_extend + i] = op_params.right_padding[i];
  }

  const int output_batch = ext_output_shape.Dims(0);
  const int output_height = ext_output_shape.Dims(1);
  const int output_width = ext_output_shape.Dims(2);
  const int output_depth = ext_output_shape.Dims(3);

  const int left_b_padding = left_padding_copy[0];
  const int left_h_padding = left_padding_copy[1];
  const int left_w_padding = left_padding_copy[2];
  const int left_d_padding = left_padding_copy[3];

  const int right_b_padding = right_padding_copy[0];
  const int right_h_padding = right_padding_copy[1];
  const int right_w_padding = right_padding_copy[2];
  const int right_d_padding = right_padding_copy[3];

  const int input_depth = ext_input_shape.Dims(3);
  const T pad_value = *pad_value_ptr;

  if (left_b_padding != 0) {
    TypedMemset<T>(
        output_data, pad_value,
        left_b_padding * output_height * output_width * output_depth);
  }
  for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
       ++out_b) {
    if (left_h_padding != 0) {
      TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, 0, 0, 0),
                     pad_value, left_h_padding * output_width * output_depth);
    }
    for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
         ++out_h) {
      if (left_w_padding != 0) {
        TypedMemset<T>(
            output_data + Offset(ext_output_shape, out_b, out_h, 0, 0),
            pad_value, left_w_padding * output_depth);
      }
      for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
           ++out_w) {
        if (left_d_padding != 0) {
          TypedMemset<T>(
              output_data + Offset(ext_output_shape, out_b, out_h, out_w, 0),
              pad_value, left_d_padding);
        }

        T* out = output_data +
                 Offset(ext_output_shape, out_b, out_h, out_w, left_d_padding);
        const T* in = input_data +
                      Offset(ext_input_shape, out_b - left_b_padding,
                             out_h - left_h_padding, out_w - left_w_padding, 0);
        memcpy(out, in, input_depth * sizeof(T));

        if (right_d_padding != 0) {
          TypedMemset<T>(
              output_data + Offset(ext_output_shape, out_b, out_h, out_w,
                                   output_depth - right_d_padding),
              pad_value, right_d_padding);
        }
      }
      if (right_w_padding != 0) {
        TypedMemset<T>(output_data + Offset(ext_output_shape, out_b, out_h,
                                            output_width - right_w_padding, 0),
                       pad_value, right_w_padding * output_depth);
      }
    }
    if (right_h_padding != 0) {
      TypedMemset<T>(
          output_data + Offset(ext_output_shape, out_b,
                               output_height - right_h_padding, 0, 0),
          pad_value, right_h_padding * output_width * output_depth);
    }
  }
  if (right_b_padding != 0) {
    TypedMemset<T>(
        output_data +
            Offset(ext_output_shape, output_batch - right_b_padding, 0, 0, 0),
        pad_value,
        right_b_padding * output_height * output_width * output_depth);
  }
}

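// Illustrative note (a sketch for exposition, not an additional entry point):
// because the padding vectors are front-aligned to 4 dims above, a call that
// pads only the two spatial dims of an NHWC tensor passes 4-element vectors of
// the form {0, h_pad, w_pad, 0}. Assuming float buffers input_data and
// output_data of the shapes shown, padding a {1, 2, 2, 1} input by one pixel
// on each spatial side via the Pad wrapper defined just below looks like:
//
//   tflite::PadParams op_params;
//   op_params.left_padding_count = 4;
//   op_params.right_padding_count = 4;
//   const int32 paddings[4] = {0, 1, 1, 0};
//   for (int i = 0; i < 4; ++i) {
//     op_params.left_padding[i] = paddings[i];
//     op_params.right_padding[i] = paddings[i];
//   }
//   const float pad_value = 0.0f;
//   Pad(op_params, RuntimeShape({1, 2, 2, 1}), input_data, &pad_value,
//       RuntimeShape({1, 4, 4, 1}), output_data);
//
// Every element outside the original 2x2 interior of the 4x4 output is set to
// pad_value; the interior rows are copied with memcpy.
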
template <typename T, typename P>
inline void Pad(const tflite::PadParams& op_params,
                const RuntimeShape& input_shape, const T* input_data,
                const P* pad_value_ptr, const RuntimeShape& output_shape,
                T* output_data) {
  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
          output_data);
}

// The second (pad-value) input can be int32 when, say, the first is uint8.
template <typename T>
inline void Pad(const tflite::PadParams& op_params,
                const RuntimeShape& input_shape, const T* input_data,
                const int32* pad_value_ptr, const RuntimeShape& output_shape,
                T* output_data) {
  const T converted_pad_value = static_cast<T>(*pad_value_ptr);
  PadImpl(op_params, input_shape, input_data, &converted_pad_value,
          output_shape, output_data);
}

// This version avoids conflicting template matching.
template <>
inline void Pad(const tflite::PadParams& op_params,
                const RuntimeShape& input_shape, const int32* input_data,
                const int32* pad_value_ptr, const RuntimeShape& output_shape,
                int32* output_data) {
  PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
          output_data);
}

// TODO(b/117643175): Optimize. (This is an introductory copy of standard Pad.)
//
// This pad requires that (a) left and right paddings are in the 4D patterns
// {0, h_pad, w_pad, 0}, and (b) memset can be used: *pad_value_ptr == 0 and/or
// T is uint8.
//
// There are two versions of pad: Pad and PadV2. In PadV2 there is a second
// scalar input that provides the padding value. Therefore pad_value_ptr can be
// equivalent to a simple input1_data. For Pad, it should point to a zero
// value.
//
// Note that two typenames are required, so that T=P=int32 is considered a
// specialization distinct from P=int32.
template <typename T, typename P>
inline void PadImageStyleMemset(const tflite::PadParams& op_params,
                                const RuntimeShape& input_shape,
                                const T* input_data, const P* pad_value_ptr,
                                const RuntimeShape& output_shape,
                                T* output_data) {
  ruy::profiler::ScopeLabel label("PadImageStyle");
  const RuntimeShape ext_input_shape =
      RuntimeShape::ExtendedShape(4, input_shape);
  const RuntimeShape ext_output_shape =
      RuntimeShape::ExtendedShape(4, output_shape);
  TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
  TFLITE_DCHECK_LE(op_params.right_padding_count, 4);

  // Pad kernels are limited to max 4 dimensions. Copy inputs so we can pad
  // them to 4 dims (yes, we are "padding the padding").
  std::vector<int> left_padding_copy(4, 0);
  const int left_padding_extend = 4 - op_params.left_padding_count;
  for (int i = 0; i < op_params.left_padding_count; ++i) {
    left_padding_copy[left_padding_extend + i] = op_params.left_padding[i];
  }
  std::vector<int> right_padding_copy(4, 0);
  const int right_padding_extend = 4 - op_params.right_padding_count;
  for (int i = 0; i < op_params.right_padding_count; ++i) {
    right_padding_copy[right_padding_extend + i] = op_params.right_padding[i];
  }
  // The following padding restrictions are contractual requirements, and
  // embody what it means for a padding op to be "image-style".
  TFLITE_DCHECK_EQ(left_padding_copy[0], 0);
  TFLITE_DCHECK_EQ(left_padding_copy[3], 0);
  TFLITE_DCHECK_EQ(right_padding_copy[0], 0);
  TFLITE_DCHECK_EQ(right_padding_copy[3], 0);

  const int batch = MatchingDim(ext_input_shape, 0, ext_output_shape, 0);
  const int output_height = ext_output_shape.Dims(1);
  const int output_width = ext_output_shape.Dims(2);
  const int input_height = ext_input_shape.Dims(1);
  const int input_width = ext_input_shape.Dims(2);
  const int depth = MatchingDim(ext_input_shape, 3, ext_output_shape, 3);

  const int left_h_padding = left_padding_copy[1];
  const int left_w_padding = left_padding_copy[2];
  const int right_h_padding = right_padding_copy[1];
  const int right_w_padding = right_padding_copy[2];

  TFLITE_DCHECK_EQ(output_height,
                   input_height + left_h_padding + right_h_padding);
  TFLITE_DCHECK_EQ(output_width,
                   input_width + left_w_padding + right_w_padding);

  const T pad_value = *pad_value_ptr;
  const int top_block_size = left_h_padding * output_width * depth;
  const size_t num_top_block_bytes = top_block_size * sizeof(T);
  const int bottom_block_size = right_h_padding * output_width * depth;
  const size_t num_bottom_block_bytes = bottom_block_size * sizeof(T);
  const int left_blocks_size = left_w_padding * depth;
  const size_t num_left_block_bytes = left_blocks_size * sizeof(T);
  const int right_blocks_size = right_w_padding * depth;
  const size_t num_right_block_bytes = right_blocks_size * sizeof(T);
  const int inner_line_size = input_width * depth;
  const size_t num_inner_line_bytes = inner_line_size * sizeof(T);

  if (input_height == 0) {
    memset(output_data, pad_value,
           num_top_block_bytes + num_bottom_block_bytes);
  } else {
    for (int i = 0; i < batch; ++i) {
      // For each image in the batch, apply the top padding, then iterate
      // through rows, then apply the bottom padding.
      //
      // By unwinding one iteration, we can combine the first left-margin
      // padding with the top padding, and the last right-margin padding with
      // the bottom padding.
      memset(output_data, pad_value,
             num_top_block_bytes + num_left_block_bytes);
      output_data += top_block_size + left_blocks_size;
      memcpy(output_data, input_data, num_inner_line_bytes);
      input_data += inner_line_size;
      output_data += inner_line_size;
      // One iteration unwound.
      // Unwinding this loop affords the opportunity to reorder the loop work
      // and hence combine memset() calls.
      //
      // Before unwinding:
      // for (int j = 0; j < input_height; ++j) {
      //   // Pad on left, copy central data, pad on right.
      //   memset(output_data, pad_value, num_left_block_bytes);
      //   output_data += left_blocks_size;
      //   memcpy(output_data, input_data, num_inner_line_bytes);
      //   input_data += inner_line_size;
      //   output_data += inner_line_size;
      //   memset(output_data, pad_value, num_right_block_bytes);
      //   output_data += right_blocks_size;
      // }
      for (int j = 1; j < input_height; ++j) {
        memset(output_data, pad_value,
               num_right_block_bytes + num_left_block_bytes);
        output_data += right_blocks_size + left_blocks_size;
        memcpy(output_data, input_data, num_inner_line_bytes);
        input_data += inner_line_size;
        output_data += inner_line_size;
      }
      memset(output_data, pad_value,
             num_right_block_bytes + num_bottom_block_bytes);
      output_data += right_blocks_size + bottom_block_size;
    }
  }
}

template <typename T, typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
                          const RuntimeShape& input_shape, const T* input_data,
                          const P* pad_value_ptr,
                          const RuntimeShape& output_shape, T* output_data) {
  TFLITE_ASSERT_FALSE;
}

template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
                          const RuntimeShape& input_shape,
                          const uint8* input_data, const P* pad_value_ptr,
                          const RuntimeShape& output_shape,
                          uint8* output_data) {
  PadImageStyleMemset(op_params, input_shape, input_data, pad_value_ptr,
                      output_shape, output_data);
}

template <typename P>
inline void PadImageStyle(const tflite::PadParams& op_params,
                          const RuntimeShape& input_shape,
                          const float* input_data, const P* pad_value_ptr,
                          const RuntimeShape& output_shape,
                          float* output_data) {
  const float converted_pad_value = static_cast<float>(*pad_value_ptr);
  if (converted_pad_value == 0.0f) {
    PadImageStyleMemset(op_params, input_shape, input_data, pad_value_ptr,
                        output_shape, output_data);
  } else {
    PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
            output_data);
  }
}

template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
                  const RuntimeShape& input_shape,
                  const RuntimeShape& output_shape,
                  SequentialTensorWriter<T>* writer) {
  ruy::profiler::ScopeLabel label("Slice");
  const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(4, input_shape);
  // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
  TFLITE_DCHECK_LE(op_params.begin_count, 4);
  TFLITE_DCHECK_LE(op_params.size_count, 4);
  const int begin_count = op_params.begin_count;
  const int size_count = op_params.size_count;
  // We front-pad the begin and size vectors.
  const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
  const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1)
                         ? ext_shape.Dims(0)
                         : start_b + op_params.size[0];
  const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
  const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
                         ? ext_shape.Dims(1)
                         : start_h + op_params.size[size_count - 3];
  const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
  const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
                         ? ext_shape.Dims(2)
                         : start_w + op_params.size[size_count - 2];
  const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
  const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
                         ? ext_shape.Dims(3)
                         : start_d + op_params.size[size_count - 1];

  for (int in_b = start_b; in_b < stop_b; ++in_b) {
    for (int in_h = start_h; in_h < stop_h; ++in_h) {
      for (int in_w = start_w; in_w < stop_w; ++in_w) {
        const int len = stop_d - start_d;
        if (len > 0)
          writer->WriteN(Offset(ext_shape, in_b, in_h, in_w, start_d), len);
      }
    }
  }
}

template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
                  const RuntimeShape& input_shape, const T* input_data,
                  const RuntimeShape& output_shape, T* output_data) {
  SequentialTensorWriter<T> writer(input_data, output_data);
  return Slice(op_params, input_shape, output_shape, &writer);
}

template <typename T>
inline void Slice(const tflite::SliceParams& op_params,
                  const RuntimeShape& input_shape, const TfLiteTensor* input,
                  const RuntimeShape& output_shape, TfLiteTensor* output) {
  SequentialTensorWriter<T> writer(input, output);
  return Slice(op_params, input_shape, output_shape, &writer);
}

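// Illustrative note on the front-padding above (a sketch for exposition, not
// an additional entry point): begin/size vectors shorter than 4 apply to the
// innermost dimensions, and a size of -1 means "to the end of that dimension".
// Assuming float buffers input_data and output_data of the shapes shown:
//
//   tflite::SliceParams op_params;
//   op_params.begin_count = 4;
//   op_params.size_count = 4;
//   const int32 begin[4] = {0, 1, 0, 0};
//   const int32 size[4] = {1, 2, -1, 2};
//   for (int i = 0; i < 4; ++i) {
//     op_params.begin[i] = begin[i];
//     op_params.size[i] = size[i];
//   }
//   Slice(op_params, RuntimeShape({1, 3, 4, 2}), input_data,
//         RuntimeShape({1, 2, 4, 2}), output_data);
//
// Here the width size of -1 resolves to ext_shape.Dims(2) == 4, so rows 1..2
// of the input are copied in full width and depth.
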
template <typename T>
void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
             const T* input2_data, const RuntimeShape& output_shape,
             T* output_data) {
  ruy::profiler::ScopeLabel label("TensorFlowMinimum");
  auto input1_map = MapAsVector(input1_data, input1_shape);
  auto output_map = MapAsVector(output_data, output_shape);
  auto min_value = input2_data[0];
  output_map.array() = input1_map.array().min(min_value);
}

// Convenience version that allows, for example, generated-code calls to be
// the same as other binary ops.
template <typename T>
inline void Minimum(const RuntimeShape& input1_shape, const T* input1_data,
                    const RuntimeShape&, const T* input2_data,
                    const RuntimeShape& output_shape, T* output_data) {
  // Drop shape of second input: not needed.
  Minimum(input1_shape, input1_data, input2_data, output_shape, output_data);
}

template <typename T>
void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
             const T* input2_data, const RuntimeShape& output_shape,
             T* output_data) {
  ruy::profiler::ScopeLabel label("TensorFlowMaximum");
  auto input1_map = MapAsVector(input1_data, input1_shape);
  auto output_map = MapAsVector(output_data, output_shape);
  auto max_value = input2_data[0];
  output_map.array() = input1_map.array().max(max_value);
}

// Convenience version that allows, for example, generated-code calls to be
// the same as other binary ops.
template <typename T>
inline void Maximum(const RuntimeShape& input1_shape, const T* input1_data,
                    const RuntimeShape&, const T* input2_data,
                    const RuntimeShape& output_shape, T* output_data) {
  // Drop shape of second input: not needed.
  Maximum(input1_shape, input1_data, input2_data, output_shape, output_data);
}

template <typename T>
void TransposeIm2col(const ConvParams& params, uint8 zero_byte,
                     const RuntimeShape& input_shape, const T* input_data,
                     const RuntimeShape& filter_shape,
                     const RuntimeShape& output_shape, T* im2col_data) {
  ruy::profiler::ScopeLabel label("TransposeIm2col");
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  TFLITE_DCHECK(im2col_data);

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  MatchingDim(output_shape, 3, filter_shape, 0);  // output_depth

  // Construct the MxN sized im2col matrix.
  // The rows, M, are sub-ordered B x H x W.
  const RuntimeShape row_shape({1, batches, output_height, output_width});
  // The columns, N, are sub-ordered Kh x Kw x Din.
  const RuntimeShape col_shape({1, filter_height, filter_width, input_depth});
  // Use dimensions M and N to construct dims for indexing directly into
  // im2col.
  const RuntimeShape im2col_shape(
      {1, 1, row_shape.FlatSize(), col_shape.FlatSize()});

  // Build the im2col matrix by looping through all the input pixels,
  // computing their influence on the output, rather than looping through all
  // the output pixels. We therefore must initialize the im2col array to zero.
  // This is potentially inefficient because we subsequently overwrite bytes
  // set here. However, in practice memset is very fast and its cost is
  // negligible.
  memset(im2col_data, zero_byte, im2col_shape.FlatSize() * sizeof(T));

  // Loop through the output batches.
  for (int batch = 0; batch < batches; ++batch) {
    // Loop through input pixels one at a time.
    for (int in_y = 0; in_y < input_height; ++in_y) {
      for (int in_x = 0; in_x < input_width; ++in_x) {
        // Loop through the output pixels it will influence.
        const int out_x_origin = (in_x * stride_width) - pad_width;
        const int out_y_origin = (in_y * stride_height) - pad_height;
        for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
          const int out_y = out_y_origin + filter_y;
          // Is the output pixel within height bounds?
          if ((out_y >= 0) && (out_y < output_height)) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              const int out_x = out_x_origin + filter_x;
              // Is the output pixel within width bounds?
              if ((out_x >= 0) && (out_x < output_width)) {
                // Copy the input elements of this pixel.
                T const* src =
                    input_data + Offset(input_shape, batch, in_y, in_x, 0);
                int row_offset = Offset(row_shape, 0, batch, out_y, out_x);
                int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0);
                T* dst = im2col_data +
                         Offset(im2col_shape, 0, 0, row_offset, col_offset);
                memcpy(dst, src, input_depth * sizeof(T));
              }
            }
          }
        }
      }
    }
  }
}

// Returns in 'im_data' (assumed to be zero-initialized) an image patch in
// storage order (height, width, depth), constructed from patches in
// 'col_data', which is required to be in storage order (out_height *
// out_width, filter_height, filter_width, in_depth).
// Implementation by Yangqing Jia (jiayq).
// Copied from //tensorflow/core/kernels/conv_grad_input_ops.cc
template <typename T>
void Col2im(const T* col_data, const int depth, const int height,
            const int width, const int filter_h, const int filter_w,
            const int pad_t, const int pad_l, const int pad_b, const int pad_r,
            const int stride_h, const int stride_w, T* im_data) {
  ruy::profiler::ScopeLabel label("Col2im");
  int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
  int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
  int h_pad = -pad_t;
  for (int h = 0; h < height_col; ++h) {
    int w_pad = -pad_l;
    for (int w = 0; w < width_col; ++w) {
      T* im_patch_data = im_data + (h_pad * width + w_pad) * depth;
      for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
        for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
          if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
            // TODO(andydavis) Vectorize this loop (if compiler does not).
            for (int i = 0; i < depth; ++i) {
              im_patch_data[i] += col_data[i];
            }
          }
          im_patch_data += depth;
          col_data += depth;
        }
        // Jump over the remaining depth of this image row.
        im_patch_data += depth * (width - filter_w);
      }
      w_pad += stride_w;
    }
    h_pad += stride_h;
  }
}

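// Worked example of the output-geometry computation above (illustrative
// only): with height = 5, pad_t = pad_b = 1, filter_h = 3 and stride_h = 2,
// height_col = (5 + 1 + 1 - 3) / 2 + 1 = 3, i.e. the number of patch rows the
// corresponding forward convolution would produce. Each of the
// height_col * width_col patches in 'col_data' is accumulated (+=) back into
// the overlapping region of 'im_data', which is why the caller must
// zero-initialize 'im_data' first.
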
template <typename T>
void BiasAdd(T* im_data, const T* bias_data, const int batch_size,
             const int height, const int width, const int depth) {
  if (bias_data) {
    for (int n = 0; n < batch_size; ++n) {
      for (int h = 0; h < height; ++h) {
        for (int w = 0; w < width; ++w) {
          for (int d = 0; d < depth; ++d) {
            im_data[d] += bias_data[d];
          }
          im_data += depth;
        }
      }
    }
  }
}

// TransposeConvV2 expects the weights in HWOI order.
inline void TransposeConvV2(
    const ConvParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& hwoi_ordered_filter_shape,
    const float* hwoi_ordered_filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* const output_data, const RuntimeShape& col2im_shape,
    float* col2im_data, CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("TransposeConvV2/float");
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(hwoi_ordered_filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK(col2im_data);
  TFLITE_DCHECK(hwoi_ordered_filter_data);

  const int batch_size = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_image_size = input_shape.Dims(1) * input_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_image_size = output_height * output_width;
  const int input_depth =
      MatchingDim(input_shape, 3, hwoi_ordered_filter_shape, 3);
  const int output_depth =
      MatchingDim(output_shape, 3, hwoi_ordered_filter_shape, 2);
  const int input_offset = input_image_size * input_depth;
  const int output_offset = output_image_size * output_depth;

  const int filter_height = hwoi_ordered_filter_shape.Dims(0);
  const int filter_width = hwoi_ordered_filter_shape.Dims(1);
  const int padding_top = params.padding_values.height;
  const int padding_bottom =
      params.padding_values.height + params.padding_values.height_offset;
  const int padding_left = params.padding_values.width;
  const int padding_right =
      params.padding_values.width + params.padding_values.width_offset;
  const int stride_height = params.stride_height;
  const int stride_width = params.stride_width;

  const int hwoi_ordered_filter_total_size =
      filter_height * filter_width * output_depth;

  cpu_backend_gemm::MatrixParams<float> lhs_params;
  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
  lhs_params.rows = hwoi_ordered_filter_total_size;
  lhs_params.cols = input_depth;
  float* output_data_p = output_data;
  std::fill_n(output_data, output_offset * batch_size, 0.0f);
  for (int i = 0; i < batch_size; ++i) {
    cpu_backend_gemm::MatrixParams<float> rhs_params;
    rhs_params.order = cpu_backend_gemm::Order::kColMajor;
    rhs_params.rows = input_depth;
    rhs_params.cols = input_image_size;
    cpu_backend_gemm::MatrixParams<float> dst_params;
    dst_params.order = cpu_backend_gemm::Order::kColMajor;
    dst_params.rows = hwoi_ordered_filter_total_size;
    dst_params.cols = input_image_size;
    cpu_backend_gemm::GemmParams<float, float> gemm_params;
    cpu_backend_gemm::Gemm(lhs_params, hwoi_ordered_filter_data, rhs_params,
                           input_data + input_offset * i, dst_params,
                           col2im_data, gemm_params, cpu_backend_context);

    Col2im(col2im_data, output_depth, output_height, output_width,
           filter_height, filter_width, padding_top, padding_left,
           padding_bottom, padding_right, stride_height, stride_width,
           output_data_p);
    output_data_p += output_offset;
  }
  output_data_p = output_data;
  BiasAdd(output_data_p, bias_data, batch_size, output_height, output_width,
          output_depth);
}

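// Summary of the float path above (descriptive only): for each image in the
// batch, a single GEMM multiplies the HWOI-ordered filter, viewed as an
// (H * W * O) x I row-major matrix, against the input activations laid out as
// an I x (input_h * input_w) matrix, producing per-pixel filter patches in
// col2im_data; Col2im then scatter-accumulates those overlapping patches into
// the output image, and BiasAdd applies the per-channel bias once at the end.
// The quantized variant further below follows the same structure but
// accumulates into an int32 scratch buffer and requantizes it to uint8 with
// the Quantize helpers that follow.
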
inline void Quantize(int32_t multiplier, int32_t shift, int32_t total_size,
                     int32_t output_zp, int32_t* scratch, uint8_t* output) {
  ruy::profiler::ScopeLabel label("Quantize/uint8");
  int i = 0;
  const int32_t output_min = std::numeric_limits<uint8_t>::min();
  const int32_t output_max = std::numeric_limits<uint8_t>::max();

#ifdef USE_NEON
  const int32x4_t output_zp_dup = vdupq_n_s32(output_zp);
  const int32x4_t max_val_dup = vdupq_n_s32(output_max);
  const int32x4_t min_val_dup = vdupq_n_s32(output_min);

  using gemmlowp::RoundingDivideByPOT;
  using gemmlowp::SaturatingRoundingDoublingHighMul;

  for (; i <= total_size - 16; i += 16) {
    int32x4x4_t scratch_val;
    scratch_val.val[0] = vld1q_s32(scratch + i);
    scratch_val.val[1] = vld1q_s32(scratch + i + 4);
    scratch_val.val[2] = vld1q_s32(scratch + i + 8);
    scratch_val.val[3] = vld1q_s32(scratch + i + 12);

    int32x4x4_t temp_val =
        MultiplyByQuantizedMultiplier4Rows(scratch_val, multiplier, shift);

    temp_val.val[0] = vaddq_s32(temp_val.val[0], output_zp_dup);
    temp_val.val[1] = vaddq_s32(temp_val.val[1], output_zp_dup);
    temp_val.val[2] = vaddq_s32(temp_val.val[2], output_zp_dup);
    temp_val.val[3] = vaddq_s32(temp_val.val[3], output_zp_dup);

    temp_val.val[0] =
        vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup);
    temp_val.val[1] =
        vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup);
    temp_val.val[2] =
        vmaxq_s32(vminq_s32(temp_val.val[2], max_val_dup), min_val_dup);
    temp_val.val[3] =
        vmaxq_s32(vminq_s32(temp_val.val[3], max_val_dup), min_val_dup);

    const uint16x8_t result_1 =
        vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(temp_val.val[0])),
                     vqmovn_u32(vreinterpretq_u32_s32(temp_val.val[1])));
    const uint16x8_t result_2 =
        vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(temp_val.val[2])),
                     vqmovn_u32(vreinterpretq_u32_s32(temp_val.val[3])));
    const uint8x16_t result =
        vcombine_u8(vqmovn_u16(result_1), vqmovn_u16(result_2));
    vst1q_u8(output + i, result);
  }
#endif
  for (; i < total_size; ++i) {
    int32_t temp = MultiplyByQuantizedMultiplier(scratch[i], multiplier, shift);
    temp += output_zp;
    if (temp > output_max) {
      temp = output_max;
    }
    if (temp < output_min) {
      temp = output_min;
    }
    output[i] = static_cast<uint8_t>(temp);
  }
}

inline void Quantize(const int32_t* multiplier, const int32_t* shift,
                     int32_t channel_size, int32_t total_size,
                     int32_t output_zp, int32_t output_min, int32_t output_max,
                     int32_t* scratch, int8_t* output) {
  ruy::profiler::ScopeLabel label("Quantize/int8");

  // Here we're trying to quantize the raw accumulators:
  //         output_channels
  //        data data data data data
  //  rows  data data data data data
  //        data data data data data
  //           ....
  //
  // To minimize reloading of the multipliers & shifts, once we load them we
  // quantize the raw accumulators for every row before moving on to the next
  // group of channels.
#ifdef USE_NEON
  const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
  const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
  const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
  const int32x4_t zeros = vdupq_n_s32(0);
#endif

  TFLITE_DCHECK_EQ(total_size % channel_size, 0);
  const int32_t rows = total_size / channel_size;

  int c = 0;

#ifdef USE_NEON
  using gemmlowp::RoundingDivideByPOT;
  for (; c <= channel_size - 8; c += 8) {
    int32x4_t out_shift_1 = vld1q_s32(shift + c);
    int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);
    int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros);
    int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros);

    // Right shift will be performed as left shift with negative values.
    int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros);
    int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros);

    int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
    int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
    for (int n = 0; n < rows; ++n) {
      int loc = n * channel_size + c;
      int32x4_t acc_1 = vld1q_s32(scratch + loc);
      int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);

      // Saturating Rounding Doubling High Mul.
      acc_1 = vshlq_s32(acc_1, left_shift_1);
      acc_1 = vqrdmulhq_s32(acc_1, out_mul_1);
      acc_2 = vshlq_s32(acc_2, left_shift_2);
      acc_2 = vqrdmulhq_s32(acc_2, out_mul_2);

      // Rounding Dividing By POT.
      acc_1 = vrshlq_s32(acc_1, right_shift_1);
      acc_2 = vrshlq_s32(acc_2, right_shift_2);

      // Add the output offset.
      acc_1 = vaddq_s32(acc_1, output_offset_vec);
      acc_2 = vaddq_s32(acc_2, output_offset_vec);

      // Apply the activation function.
      acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
      acc_1 = vminq_s32(acc_1, output_activation_max_vec);
      acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
      acc_2 = vminq_s32(acc_2, output_activation_max_vec);

      // Saturating cast to int8 and store to destination.
      const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
      const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
      const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2);
      const int8x8_t res_s8 = vqmovn_s16(res_s16);
      vst1_s8(output + loc, res_s8);
    }
  }

#endif  // USE_NEON
  // Handle leftover values, one by one. This is very slow.
  for (; c < channel_size; c++) {
    for (int n = 0; n < rows; ++n) {
      int loc = n * channel_size + c;
      int32 acc = scratch[loc];
      acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]);
      acc += output_zp;
      acc = std::max(acc, output_min);
      acc = std::min(acc, output_max);
      output[loc] = static_cast<int8>(acc);
    }
  }
}

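// Note on the NEON sequence above (an observation, not a contract): each
// accumulator is rescaled with the usual TFLite fixed-point recipe of a Q31
// multiplier plus a power-of-two shift. A positive shift[c] is applied as the
// left shift (vshlq_s32) before vqrdmulhq_s32, and a negative shift[c] as the
// rounding right shift (vrshlq_s32) afterwards, which mirrors the scalar
// MultiplyByQuantizedMultiplier used in the leftover loop.
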
// TransposeConvV2 expects the weights in HWOI order.
inline void TransposeConvV2(
    const ConvParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& hwoi_ordered_filter_shape,
    const uint8_t* hwoi_ordered_filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data, const RuntimeShape& col2im_shape,
    int32_t* col2im_data, int32_t* scratch_data,
    CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("TransposeConvV2/uint8");
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(hwoi_ordered_filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK(col2im_data);
  TFLITE_DCHECK(hwoi_ordered_filter_data);

  const int batch_size = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_image_size = input_shape.Dims(1) * input_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_image_size = output_height * output_width;
  const int input_depth =
      MatchingDim(input_shape, 3, hwoi_ordered_filter_shape, 3);
  const int output_depth =
      MatchingDim(output_shape, 3, hwoi_ordered_filter_shape, 2);
  const int input_offset = input_image_size * input_depth;
  const int output_offset = output_image_size * output_depth;

  const int filter_height = hwoi_ordered_filter_shape.Dims(0);
  const int filter_width = hwoi_ordered_filter_shape.Dims(1);
  const int padding_top = params.padding_values.height;
  const int padding_bottom =
      params.padding_values.height + params.padding_values.height_offset;
  const int padding_left = params.padding_values.width;
  const int padding_right =
      params.padding_values.width + params.padding_values.width_offset;
  const int stride_height = params.stride_height;
  const int stride_width = params.stride_width;

  const int hwoi_ordered_filter_total_size =
      filter_height * filter_width * output_depth;

  cpu_backend_gemm::MatrixParams<uint8_t> lhs_params;
  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
  lhs_params.rows = hwoi_ordered_filter_total_size;
  lhs_params.cols = input_depth;
  lhs_params.zero_point = -params.weights_offset;

  int32_t* scratch_data_p = scratch_data;
  std::fill_n(scratch_data, output_offset * batch_size, static_cast<int32>(0));
  for (int i = 0; i < batch_size; ++i) {
    cpu_backend_gemm::MatrixParams<uint8_t> rhs_params;
    rhs_params.order = cpu_backend_gemm::Order::kColMajor;
    rhs_params.rows = input_depth;
    rhs_params.cols = input_image_size;
    rhs_params.zero_point = -params.input_offset;

    cpu_backend_gemm::MatrixParams<int32_t> dst_params;
    dst_params.order = cpu_backend_gemm::Order::kColMajor;
    dst_params.rows = hwoi_ordered_filter_total_size;
    dst_params.cols = input_image_size;

    cpu_backend_gemm::GemmParams<int32_t, int32_t> gemm_params;
    cpu_backend_gemm::Gemm(lhs_params, hwoi_ordered_filter_data, rhs_params,
                           input_data + input_offset * i, dst_params,
                           col2im_data, gemm_params, cpu_backend_context);

    Col2im(col2im_data, output_depth, output_height, output_width,
           filter_height, filter_width, padding_top, padding_left,
           padding_bottom, padding_right, stride_height, stride_width,
           scratch_data_p);

    scratch_data_p += output_offset;
  }
  scratch_data_p = scratch_data;
  BiasAdd(scratch_data_p, bias_data, batch_size, output_height, output_width,
          output_depth);

  Quantize(params.output_multiplier, params.output_shift,
           output_shape.FlatSize(), params.output_offset, scratch_data,
           output_data);
}

// Integer-only version of ResizeNearestNeighbor. Since scales are represented
// in fixed-point and thus approximated, |in_x| or |in_y| may differ from the
// reference version. Debug checks are in place to test if this occurs.
// NOTE: If align_corners or half_pixel_centers is true, we use the reference
// version.
inline void ResizeNearestNeighbor(
    const tflite::ResizeNearestNeighborParams& op_params,
    const RuntimeShape& unextended_input_shape, const uint8* input_data,
    const RuntimeShape& output_size_shape, const int32* output_size_data,
    const RuntimeShape& unextended_output_shape, uint8* output_data) {
  if (op_params.align_corners || op_params.half_pixel_centers) {
    // TODO(b/149823713): Add support for align_corners & half_pixel_centers in
    // this kernel.
    reference_ops::ResizeNearestNeighbor(
        op_params, unextended_input_shape, input_data, output_size_shape,
        output_size_data, unextended_output_shape, output_data);
    return;
  }
  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);

  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
  int32 input_height = input_shape.Dims(1);
  int32 input_width = input_shape.Dims(2);
  int32 depth = MatchingDim(input_shape, 3, output_shape, 3);

  // The TensorFlow version of this op allows resize on the width and height
  // axis only.
  TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
  int32 output_height = output_size_data[0];
  int32 output_width = output_size_data[1];

  // Convert scales to fixed-point with 16 fractional bits. We add 1 as an
  // error factor and to avoid zero scales. For example, with input_height = 1,
  // output_height = 3, the float scaling factor would be non-zero at 1/3.
  // With fixed-point, this is zero.
  int32 height_scale = (input_height << 16) / output_height + 1;
  int32 width_scale = (input_width << 16) / output_width + 1;

  const int col_offset = input_shape.Dims(3);
  const int row_offset = input_shape.Dims(2) * col_offset;
  const int batch_offset = input_shape.Dims(1) * row_offset;

  const uint8* input_ptr = input_data;
  uint8* output_ptr = output_data;
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      int32 in_y = std::min((y * height_scale) >> 16, input_height - 1);
      // Check that the offset calculation is the same as the reference
      // version. See the function comment for details. We check using a
      // non-float version of:
      // TFLITE_DCHECK_EQ(in_y,
      //                  std::floor(y * (static_cast<float>(input_height)
      //                                  / output_height)));
      TFLITE_DCHECK_LT(y * input_height, output_height + in_y * output_height);
      TFLITE_DCHECK_GE(y * input_height, in_y * output_height);
      const uint8* y_input_ptr = input_ptr + in_y * row_offset;
      for (int x = 0; x < output_width; ++x) {
        int32 in_x = std::min((x * width_scale) >> 16, input_width - 1);
        // Check that the offset calculation is the same as the reference
        // version. See the function comment for details. We check using a
        // non-float version of:
        // TFLITE_DCHECK_EQ(in_x,
        //                  std::floor(x * (static_cast<float>(input_width)
        //                                  / output_width)));
        TFLITE_DCHECK_LT(x * input_width, output_width + in_x * output_width);
        TFLITE_DCHECK_GE(x * input_width, in_x * output_width);
        const uint8* x_input_ptr = y_input_ptr + in_x * col_offset;
        memcpy(output_ptr, x_input_ptr, depth);
        output_ptr += depth;
      }
    }
    input_ptr += batch_offset;
  }
}

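// Worked example of the fixed-point scale above (illustrative only): with
// input_height = 2 and output_height = 5,
// height_scale = (2 << 16) / 5 + 1 = 26215. For y = 0..4, (y * 26215) >> 16
// yields 0, 0, 0, 1, 1, which matches floor(y * 2 / 5) from the float
// reference; the "+ 1" keeps the truncated fixed-point scale from
// underestimating the reference mapping.
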
template <typename input_type, typename output_type>
inline void Requantize(const input_type* input_data, int32_t size,
                       int32_t effective_scale_multiplier,
                       int32_t effective_scale_shift, int32_t input_zeropoint,
                       int32_t output_zeropoint, output_type* output_data) {
  reference_ops::Requantize(input_data, size, effective_scale_multiplier,
                            effective_scale_shift, input_zeropoint,
                            output_zeropoint, output_data);
}

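// Rough sketch of what the specializations below compute per element
// (illustrative numbers, not taken from any model): with
// effective_scale_multiplier = 1 << 30 and effective_scale_shift = 0, the
// effective scale is about 0.5, so an input value 12 steps above
// input_zeropoint lands about 6 steps above output_zeropoint before clamping
// to the output type's range.
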
template <>
inline void Requantize<int8_t, uint8_t>(const int8_t* input_data, int32_t size,
                                        int32_t effective_scale_multiplier,
                                        int32_t effective_scale_shift,
                                        int32_t input_zeropoint,
                                        int32_t output_zeropoint,
                                        uint8_t* output_data) {
  ruy::profiler::ScopeLabel label("Requantize/Int8ToUint8");

  static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
  static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();

  int i = 0;
#ifdef USE_NEON
  // Constants.
  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);

  for (; i <= size - 16; i += 16) {
    const int8x16_t input_vec = vld1q_s8(input_data + i);
    const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
    const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
    int32x4x4_t input;
    input.val[0] = vmovl_s16(vget_low_s16(first_half));
    input.val[1] = vmovl_s16(vget_high_s16(first_half));
    input.val[2] = vmovl_s16(vget_low_s16(second_half));
    input.val[3] = vmovl_s16(vget_high_s16(second_half));
    input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
    input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
    input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
    input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);

    int32x4x4_t result = MultiplyByQuantizedMultiplier4Rows(
        input, effective_scale_multiplier, effective_scale_shift);

    result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
    result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
    result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
    result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
    result.val[0] =
        vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
    result.val[1] =
        vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
    result.val[2] =
        vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
    result.val[3] =
        vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);

    const uint32x4_t result_val_1_unsigned =
        vreinterpretq_u32_s32(result.val[0]);
    const uint32x4_t result_val_2_unsigned =
        vreinterpretq_u32_s32(result.val[1]);
    const uint32x4_t result_val_3_unsigned =
        vreinterpretq_u32_s32(result.val[2]);
    const uint32x4_t result_val_4_unsigned =
        vreinterpretq_u32_s32(result.val[3]);

    const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
    const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
    const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
    const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
    const uint16x8_t output_first_half =
        vcombine_u16(narrowed_val_1, narrowed_val_2);
    const uint16x8_t output_second_half =
        vcombine_u16(narrowed_val_3, narrowed_val_4);
    const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
    const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
    const uint8x16_t narrowed_result =
        vcombine_u8(narrowed_first_half, narrowed_second_half);
    vst1q_u8(output_data + i, narrowed_result);
  }

#endif
  for (; i < size; ++i) {
    const int32_t input = input_data[i] - input_zeropoint;
    const int32_t output =
        MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
                                      effective_scale_shift) +
        output_zeropoint;
    const int32_t clamped_output =
        std::max(std::min(output, kMaxOutput), kMinOutput);
    output_data[i] = static_cast<uint8_t>(clamped_output);
  }
}

template <>
inline void Requantize<uint8_t, int8_t>(const uint8_t* input_data, int32_t size,
                                        int32_t effective_scale_multiplier,
                                        int32_t effective_scale_shift,
                                        int32_t input_zeropoint,
                                        int32_t output_zeropoint,
                                        int8_t* output_data) {
  ruy::profiler::ScopeLabel label("Requantize/Uint8ToInt8");

  static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
  static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();

  int i = 0;
#ifdef USE_NEON
  // Constants.
  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);

  for (; i <= size - 16; i += 16) {
    const uint8x16_t input_vec = vld1q_u8(input_data + i);
    const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
    const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
    int32x4x4_t input;
    input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
    input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half)));
    input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half)));
    input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half)));
    input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
    input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
    input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
    input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);

    int32x4x4_t result = MultiplyByQuantizedMultiplier4Rows(
        input, effective_scale_multiplier, effective_scale_shift);

    result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
    result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
    result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
    result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
    result.val[0] =
        vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
    result.val[1] =
        vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
    result.val[2] =
        vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
    result.val[3] =
        vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);

    const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]);
    const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]);
    const int16x4_t narrowed_val_3 = vqmovn_s32(result.val[2]);
    const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]);
    const int16x8_t output_first_half =
        vcombine_s16(narrowed_val_1, narrowed_val_2);
    const int16x8_t output_second_half =
        vcombine_s16(narrowed_val_3, narrowed_val_4);
    const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
    const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
    const int8x16_t narrowed_result =
        vcombine_s8(narrowed_first_half, narrowed_second_half);
    vst1q_s8(output_data + i, narrowed_result);
  }

#endif
  for (; i < size; ++i) {
    const int32_t input = input_data[i] - input_zeropoint;
    const int32_t output =
        MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
                                      effective_scale_shift) +
        output_zeropoint;
    const int32_t clamped_output =
        std::max(std::min(output, kMaxOutput), kMinOutput);
    output_data[i] = static_cast<int8_t>(clamped_output);
  }
}

template <>
inline void Requantize<int8_t, int8_t>(const int8_t* input_data, int32_t size,
                                       int32_t effective_scale_multiplier,
                                       int32_t effective_scale_shift,
                                       int32_t input_zeropoint,
                                       int32_t output_zeropoint,
                                       int8_t* output_data) {
  ruy::profiler::ScopeLabel label("Requantize/Int8ToInt8");

  static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
  static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();

  int i = 0;
#ifdef USE_NEON
  // Constants.
  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);

  for (; i <= size - 16; i += 16) {
    const int8x16_t input_vec = vld1q_s8(input_data + i);
    const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
    const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
    int32x4x4_t input;
    input.val[0] = vmovl_s16(vget_low_s16(first_half));
    input.val[1] = vmovl_s16(vget_high_s16(first_half));
    input.val[2] = vmovl_s16(vget_low_s16(second_half));
    input.val[3] = vmovl_s16(vget_high_s16(second_half));

    input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
    input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
    input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
    input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);

    int32x4x4_t result = MultiplyByQuantizedMultiplier4Rows(
        input, effective_scale_multiplier, effective_scale_shift);

    result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
    result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
    result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
    result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
    result.val[0] =
        vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
    result.val[1] =
        vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
    result.val[2] =
        vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
    result.val[3] =
        vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);

    const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]);
    const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]);
    const int16x4_t narrowed_val_3 = vqmovn_s32(result.val[2]);
    const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]);
    const int16x8_t output_first_half =
        vcombine_s16(narrowed_val_1, narrowed_val_2);
    const int16x8_t output_second_half =
        vcombine_s16(narrowed_val_3, narrowed_val_4);
    const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
    const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
    const int8x16_t narrowed_result =
        vcombine_s8(narrowed_first_half, narrowed_second_half);
    vst1q_s8(output_data + i, narrowed_result);
  }

#endif
  for (; i < size; ++i) {
    const int32_t input = input_data[i] - input_zeropoint;
    const int32_t output =
        MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
                                      effective_scale_shift) +
        output_zeropoint;
    const int32_t clamped_output =
        std::max(std::min(output, kMaxOutput), kMinOutput);
    output_data[i] = static_cast<int8_t>(clamped_output);
  }
}

template <>
inline void Requantize<uint8_t, uint8_t>(
    const uint8_t* input_data, int32_t size, int32_t effective_scale_multiplier,
    int32_t effective_scale_shift, int32_t input_zeropoint,
    int32_t output_zeropoint, uint8_t* output_data) {
  ruy::profiler::ScopeLabel label("Requantize/Uint8ToUint8");

  static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
  static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();

  int i = 0;
#ifdef USE_NEON
  // Constants.
  const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
  const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
  const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
  const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);

  for (; i <= size - 16; i += 16) {
    const uint8x16_t input_vec = vld1q_u8(input_data + i);
    const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
    const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
    int32x4x4_t input;
    input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
    input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half)));
    input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half)));
    input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half)));
    input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
    input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
    input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
    input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);

    int32x4x4_t result = MultiplyByQuantizedMultiplier4Rows(
        input, effective_scale_multiplier, effective_scale_shift);

    result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
    result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
    result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
    result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
    result.val[0] =
        vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
    result.val[1] =
        vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
    result.val[2] =
        vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
    result.val[3] =
        vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);

    const uint32x4_t result_val_1_unsigned =
        vreinterpretq_u32_s32(result.val[0]);
    const uint32x4_t result_val_2_unsigned =
        vreinterpretq_u32_s32(result.val[1]);
    const uint32x4_t result_val_3_unsigned =
        vreinterpretq_u32_s32(result.val[2]);
    const uint32x4_t result_val_4_unsigned =
        vreinterpretq_u32_s32(result.val[3]);

    const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
    const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
    const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
    const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
    const uint16x8_t output_first_half =
        vcombine_u16(narrowed_val_1, narrowed_val_2);
    const uint16x8_t output_second_half =
        vcombine_u16(narrowed_val_3, narrowed_val_4);
    const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
    const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
    const uint8x16_t narrowed_result =
        vcombine_u8(narrowed_first_half, narrowed_second_half);
    vst1q_u8(output_data + i, narrowed_result);
  }

#endif
  for (; i < size; ++i) {
    const int32_t input = input_data[i] - input_zeropoint;
    const int32_t output =
        MultiplyByQuantizedMultiplier(input, effective_scale_multiplier,
                                      effective_scale_shift) +
        output_zeropoint;
    const int32_t clamped_output =
        std::max(std::min(output, kMaxOutput), kMinOutput);
    output_data[i] = static_cast<uint8_t>(clamped_output);
  }
}

inline void HardSwish(const RuntimeShape& input_shape, const float* input_data,
                      const RuntimeShape& output_shape, float* output_data) {
  ruy::profiler::ScopeLabel label("HardSwish/Float");
  auto size = MatchingFlatSize(input_shape, output_shape);
  int i = 0;
#ifdef USE_NEON
  const float32x4_t zero = vdupq_n_f32(0.0f);
  const float32x4_t three = vdupq_n_f32(3.0f);
  const float32x4_t six = vdupq_n_f32(6.0f);
  const float32x4_t one_sixth = vdupq_n_f32(1.0f / 6.0f);

  for (; i <= size - 16; i += 16) {
    // 4x partially unrolled version of the loop below. Refer to its comments.
    const float32x4_t in_0 = vld1q_f32(input_data + i + 0);
    const float32x4_t in_1 = vld1q_f32(input_data + i + 4);
    const float32x4_t in_2 = vld1q_f32(input_data + i + 8);
    const float32x4_t in_3 = vld1q_f32(input_data + i + 12);
    const float32x4_t in_scaled_0 = vmulq_f32(in_0, one_sixth);
    const float32x4_t in_scaled_1 = vmulq_f32(in_1, one_sixth);
    const float32x4_t in_scaled_2 = vmulq_f32(in_2, one_sixth);
    const float32x4_t in_scaled_3 = vmulq_f32(in_3, one_sixth);
    const float32x4_t in_reluish_0 =
        vminq_f32(six, vmaxq_f32(zero, vaddq_f32(in_0, three)));
    const float32x4_t in_reluish_1 =
        vminq_f32(six, vmaxq_f32(zero, vaddq_f32(in_1, three)));
    const float32x4_t in_reluish_2 =
        vminq_f32(six, vmaxq_f32(zero, vaddq_f32(in_2, three)));
    const float32x4_t in_reluish_3 =
        vminq_f32(six, vmaxq_f32(zero, vaddq_f32(in_3, three)));
    const float32x4_t product_0 = vmulq_f32(in_scaled_0, in_reluish_0);
    const float32x4_t product_1 = vmulq_f32(in_scaled_1, in_reluish_1);
    const float32x4_t product_2 = vmulq_f32(in_scaled_2, in_reluish_2);
    const float32x4_t product_3 = vmulq_f32(in_scaled_3, in_reluish_3);
    vst1q_f32(output_data + i + 0, product_0);
    vst1q_f32(output_data + i + 4, product_1);
    vst1q_f32(output_data + i + 8, product_2);
    vst1q_f32(output_data + i + 12, product_3);
  }
  for (; i <= size - 4; i += 4) {
    // The expression to be computed is:
    //   out = one_sixth * in * min(six, max(zero, (in + three)))
    // We structure the AST to have two roughly balanced, independent branches:
    //  - Multiplication: in_scaled = one_sixth * in.
    //  - Addition and clamping: in_reluish = min(six, max(zero, (in + three))).
    // Then the remaining multiplication at the root of the tree.
    const float32x4_t in = vld1q_f32(input_data + i);
    const float32x4_t in_scaled = vmulq_f32(in, one_sixth);
    const float32x4_t in_reluish =
        vminq_f32(six, vmaxq_f32(zero, vaddq_f32(in, three)));
    const float32x4_t product = vmulq_f32(in_scaled, in_reluish);
    vst1q_f32(output_data + i, product);
  }
#endif
  for (; i < size; i++) {
    const float in = input_data[i];
    output_data[i] =
        in * std::min(6.0f, std::max(0.0f, in + 3.0f)) * (1.0f / 6.0f);
  }
}

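// For reference, the float kernel above computes
//   hard_swish(x) = x * min(6, max(0, x + 3)) / 6,
// so, for example, x = -4 gives 0, x = 1 gives 1 * 4 / 6 ~= 0.667, and any
// x >= 3 comes back unchanged. The quantized kernel further below evaluates
// the same function in 16-bit fixed point.
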
#ifdef USE_NEON
inline void SaturateAndStore(int16x8_t src, std::uint8_t* dst) {
  // Narrow values down to 8 bit unsigned, saturating.
  uint8x8_t res8 = vqmovun_s16(src);
  // Store results to destination.
  vst1_u8(dst, res8);
}

inline void SaturateAndStore(int16x8_t src, std::int8_t* dst) {
  // Narrow values down to 8 bit signed, saturating.
  int8x8_t res8 = vqmovn_s16(src);
  // Store results to destination.
  vst1_s8(dst, res8);
}
#endif

template <typename T>
inline void HardSwish(const HardSwishParams& params,
                      const RuntimeShape& input_shape, const T* input_data,
                      const RuntimeShape& output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("HardSwish/Quantized");

  const int flat_size = MatchingFlatSize(input_shape, output_shape);

  int i = 0;
  // This code heavily uses NEON saturating left shifts (vqshl*) with shift
  // amounts that can be zero, in which case we rely on the correct behavior
  // of a left shift by zero returning just its first operand unmodified.
  // Unfortunately, the Intel arm_neon_sse.h implementation of vqshl* is
  // buggy in the case of zero shift amounts, see b/137199585. That is why
  // this NEON code path is restricted to true ARM NEON, excluding
  // arm_neon_sse.h. Anyway, the arm_neon_sse.h implementation of saturating
  // left shifts is slow scalar code, so there may not be much benefit in
  // running that over just plain reference code.
  //
  // TODO(b/137199585): revisit when this is fixed.
#ifdef __ARM_NEON
  const int16x8_t positive_reluish_multiplier_exponent_minus_one =
      vdupq_n_s16(std::max(0, params.reluish_multiplier_exponent - 1));
  const int16x8_t positive_reluish_multiplier_exponent_last_bit =
      vdupq_n_s16(params.reluish_multiplier_exponent > 0 ? 1 : 0);
  const int16x8_t negative_reluish_multiplier_exponent =
      vdupq_n_s16(std::min(0, params.reluish_multiplier_exponent));
  const int16x8_t constant_32767 = vdupq_n_s16(32767);
  const int16x8_t output_multiplier_exponent =
      vdupq_n_s16(params.output_multiplier_exponent);
  const int16x8_t output_zero_point = vdupq_n_s16(params.output_zero_point);
  // 4x unrolled version of the below NEON loop. Read that first.
  for (; i <= flat_size - 32; i += 32) {
    using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint;
    const int16x8x2_t input_value_0_1 =
        Load16AndSubtractZeroPoint(input_data + i, params.input_zero_point);
    const int16x8x2_t input_value_2_3 = Load16AndSubtractZeroPoint(
        input_data + i + 16, params.input_zero_point);
    const int16x8_t input_value_on_hires_input_scale_0 =
        vshlq_n_s16(input_value_0_1.val[0], 7);
    const int16x8_t input_value_on_hires_input_scale_1 =
        vshlq_n_s16(input_value_0_1.val[1], 7);
    const int16x8_t input_value_on_hires_input_scale_2 =
        vshlq_n_s16(input_value_2_3.val[0], 7);
    const int16x8_t input_value_on_hires_input_scale_3 =
        vshlq_n_s16(input_value_2_3.val[1], 7);
    const int16x8_t input_value_on_preshift_output_scale_0 =
        vqrdmulhq_n_s16(input_value_on_hires_input_scale_0,
                        params.output_multiplier_fixedpoint_int16);
    const int16x8_t input_value_on_preshift_output_scale_1 =
        vqrdmulhq_n_s16(input_value_on_hires_input_scale_1,
                        params.output_multiplier_fixedpoint_int16);
    const int16x8_t input_value_on_preshift_output_scale_2 =
        vqrdmulhq_n_s16(input_value_on_hires_input_scale_2,
                        params.output_multiplier_fixedpoint_int16);
    const int16x8_t input_value_on_preshift_output_scale_3 =
        vqrdmulhq_n_s16(input_value_on_hires_input_scale_3,
                        params.output_multiplier_fixedpoint_int16);
    int16x8_t reluish_value_0 = input_value_on_hires_input_scale_0;
    int16x8_t reluish_value_1 = input_value_on_hires_input_scale_1;
    int16x8_t reluish_value_2 = input_value_on_hires_input_scale_2;
    int16x8_t reluish_value_3 = input_value_on_hires_input_scale_3;
    reluish_value_0 = vqshlq_s16(
        reluish_value_0, positive_reluish_multiplier_exponent_minus_one);
    reluish_value_1 = vqshlq_s16(
        reluish_value_1, positive_reluish_multiplier_exponent_minus_one);
    reluish_value_2 = vqshlq_s16(
        reluish_value_2, positive_reluish_multiplier_exponent_minus_one);
    reluish_value_3 = vqshlq_s16(
        reluish_value_3, positive_reluish_multiplier_exponent_minus_one);
    reluish_value_0 = vqrdmulhq_n_s16(
        reluish_value_0, params.reluish_multiplier_fixedpoint_int16);
    reluish_value_1 = vqrdmulhq_n_s16(
        reluish_value_1, params.reluish_multiplier_fixedpoint_int16);
    reluish_value_2 = vqrdmulhq_n_s16(
        reluish_value_2, params.reluish_multiplier_fixedpoint_int16);
    reluish_value_3 = vqrdmulhq_n_s16(
        reluish_value_3, params.reluish_multiplier_fixedpoint_int16);
    reluish_value_0 = vqshlq_s16(reluish_value_0,
                                 positive_reluish_multiplier_exponent_last_bit);
    reluish_value_1 = vqshlq_s16(reluish_value_1,
                                 positive_reluish_multiplier_exponent_last_bit);
    reluish_value_2 = vqshlq_s16(reluish_value_2,
                                 positive_reluish_multiplier_exponent_last_bit);
    reluish_value_3 = vqshlq_s16(reluish_value_3,
                                 positive_reluish_multiplier_exponent_last_bit);
    reluish_value_0 =
        vrshlq_s16(reluish_value_0, negative_reluish_multiplier_exponent);
    reluish_value_1 =
        vrshlq_s16(reluish_value_1, negative_reluish_multiplier_exponent);
    reluish_value_2 =
        vrshlq_s16(reluish_value_2, negative_reluish_multiplier_exponent);
    reluish_value_3 =
        vrshlq_s16(reluish_value_3, negative_reluish_multiplier_exponent);
    reluish_value_0 = vrhaddq_s16(reluish_value_0, constant_32767);
    reluish_value_1 = vrhaddq_s16(reluish_value_1, constant_32767);
    reluish_value_2 = vrhaddq_s16(reluish_value_2, constant_32767);
    reluish_value_3 = vrhaddq_s16(reluish_value_3, constant_32767);
    const int16x8_t preshift_output_value_0 =
        vqdmulhq_s16(reluish_value_0, input_value_on_preshift_output_scale_0);
    const int16x8_t preshift_output_value_1 =
        vqdmulhq_s16(reluish_value_1, input_value_on_preshift_output_scale_1);
    const int16x8_t preshift_output_value_2 =
        vqdmulhq_s16(reluish_value_2, input_value_on_preshift_output_scale_2);
    const int16x8_t preshift_output_value_3 =
        vqdmulhq_s16(reluish_value_3, input_value_on_preshift_output_scale_3);
    int16x8_t output_value_0 =
        vrshlq_s16(preshift_output_value_0, output_multiplier_exponent);
    int16x8_t output_value_1 =
        vrshlq_s16(preshift_output_value_1, output_multiplier_exponent);
    int16x8_t output_value_2 =
        vrshlq_s16(preshift_output_value_2, output_multiplier_exponent);
    int16x8_t output_value_3 =
        vrshlq_s16(preshift_output_value_3, output_multiplier_exponent);
    output_value_0 = vaddq_s16(output_value_0, output_zero_point);
    output_value_1 = vaddq_s16(output_value_1, output_zero_point);
    output_value_2 = vaddq_s16(output_value_2, output_zero_point);
    output_value_3 = vaddq_s16(output_value_3, output_zero_point);
    SaturateAndStore(output_value_0, output_data + i);
|
|
SaturateAndStore(output_value_1, output_data + i + 8);
|
|
SaturateAndStore(output_value_2, output_data + i + 16);
|
|
SaturateAndStore(output_value_3, output_data + i + 24);
|
|
}
|
|
// NEON version of reference_ops::HardSwish. Read that first.
|
|
for (; i <= flat_size - 8; i += 8) {
|
|
using cpu_backend_gemm::detail::Load8AndSubtractZeroPoint;
|
|
const int16x8_t input_value =
|
|
Load8AndSubtractZeroPoint(input_data + i, params.input_zero_point);
|
|
const int16x8_t input_value_on_hires_input_scale =
|
|
vshlq_n_s16(input_value, 7);
|
|
const int16x8_t input_value_on_preshift_output_scale =
|
|
vqrdmulhq_n_s16(input_value_on_hires_input_scale,
|
|
params.output_multiplier_fixedpoint_int16);
|
|
int16x8_t reluish_value = input_value_on_hires_input_scale;
|
|
reluish_value = vqshlq_s16(reluish_value,
|
|
positive_reluish_multiplier_exponent_minus_one);
|
|
reluish_value = vqrdmulhq_n_s16(reluish_value,
|
|
params.reluish_multiplier_fixedpoint_int16);
|
|
reluish_value = vqshlq_s16(reluish_value,
|
|
positive_reluish_multiplier_exponent_last_bit);
|
|
reluish_value =
|
|
vrshlq_s16(reluish_value, negative_reluish_multiplier_exponent);
|
|
reluish_value = vrhaddq_s16(reluish_value, constant_32767);
|
|
const int16x8_t preshift_output_value =
|
|
vqdmulhq_s16(reluish_value, input_value_on_preshift_output_scale);
|
|
int16x8_t output_value =
|
|
vrshlq_s16(preshift_output_value, output_multiplier_exponent);
|
|
output_value = vaddq_s16(output_value, output_zero_point);
|
|
SaturateAndStore(output_value, output_data + i);
|
|
}
|
|
#endif
|
|
// TODO(b/137208495): revisit when unit tests cover reference code.
|
|
// Fall back to reference_ops::HardSwish. In general we have preferred
|
|
// to duplicate such scalar code rather than call reference code to handle
|
|
// leftovers, thinking that code duplication was not a big concern.
|
|
// However, most of our unit tests happen to test only optimized code,
|
|
// and the quantized HardSwish implementation is nontrivial enough that
|
|
// I really want test coverage for the reference code.
|
|
if (i < flat_size) {
|
|
const RuntimeShape leftover_shape{flat_size - i};
|
|
reference_ops::HardSwish(params, leftover_shape, input_data + i,
|
|
leftover_shape, output_data + i);
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
inline void IntegerExponentPow(const ArithmeticParams& params,
|
|
const RuntimeShape& unextended_base_shape,
|
|
const T* base_data, const int exponent,
|
|
const RuntimeShape& unextended_output_shape,
|
|
T* output_data) {
|
|
TFLITE_DCHECK_GE(exponent, 1);
|
|
if (exponent == 1) {
|
|
// copy data over.
|
|
std::memcpy(output_data, base_data,
|
|
unextended_base_shape.FlatSize() * sizeof(T));
|
|
} else {
|
|
IntegerExponentPow(params, unextended_base_shape, base_data, exponent / 2,
|
|
unextended_output_shape, output_data);
|
|
Mul(params, unextended_base_shape, output_data, unextended_base_shape,
|
|
output_data, unextended_output_shape, output_data);
|
|
if (exponent % 2 == 1) {
|
|
Mul(params, unextended_base_shape, base_data, unextended_base_shape,
|
|
output_data, unextended_output_shape, output_data);
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
inline void BroadcastPow4D(const RuntimeShape& unextended_input1_shape,
|
|
const T* input1_data,
|
|
const RuntimeShape& unextended_input2_shape,
|
|
const T* input2_data,
|
|
const RuntimeShape& unextended_output_shape,
|
|
T* output_data) {
|
|
ruy::profiler::ScopeLabel label("PowBroadcast");
|
|
|
|
if (unextended_input2_shape.FlatSize() == 1) {
|
|
static const float epsilon = 1e-5;
|
|
const T exponent = input2_data[0];
|
|
const int int_exponent = static_cast<int>(std::round(exponent));
|
|
if ((std::abs(input2_data[0] - int_exponent) < epsilon) &&
|
|
(int_exponent >= 1)) {
|
|
ArithmeticParams params;
|
|
if (std::is_same<T, float>::value) {
|
|
params.float_activation_max = std::numeric_limits<float>::max();
|
|
params.float_activation_min = std::numeric_limits<float>::lowest();
|
|
} else if (std::is_same<T, int>::value) {
|
|
params.quantized_activation_max = std::numeric_limits<int>::max();
|
|
params.quantized_activation_min = std::numeric_limits<int>::lowest();
|
|
}
|
|
IntegerExponentPow(params, unextended_input1_shape, input1_data,
|
|
int_exponent, unextended_output_shape, output_data);
|
|
return;
|
|
}
|
|
}
|
|
reference_ops::BroadcastPow4DSlow(unextended_input1_shape, input1_data,
|
|
unextended_input2_shape, input2_data,
|
|
unextended_output_shape, output_data);
|
|
}
|
|
|
|
#ifdef USE_NEON
|
|
|
|
inline void ScaleWithNewZeroPoint(const int32x4_t input,
|
|
const float32x4_t scale_dup,
|
|
const float32x4_t zero_times_scale_dup,
|
|
float32x4_t* output) {
|
|
#ifdef __ARM_FEATURE_FMA
|
|
*output = vfmaq_f32(zero_times_scale_dup, vcvtq_f32_s32(input), scale_dup);
|
|
#else
|
|
*output = vaddq_f32(vmulq_f32(vcvtq_f32_s32(input), scale_dup),
|
|
zero_times_scale_dup);
|
|
#endif
|
|
}
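// Callers below pass zero_times_scale = -zero_point * scale, so this computes
// (input - zero_point) * scale in a single fused step. For example
// (illustrative values), with scale = 0.5f and zero_point = 10, an input lane
// of 12 becomes 12 * 0.5f + (-5.0f) = 1.0f.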
|
|
|
|
#endif // USE_NEON
|
|
|
|
inline void Dequantize(const tflite::DequantizationParams& op_params,
|
|
const RuntimeShape& input_shape,
|
|
const uint8_t* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("Dequantize/Uint8");
|
|
const int32 zero_point = op_params.zero_point;
|
|
const double scale = op_params.scale;
|
|
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
|
|
|
int i = 0;
|
|
#ifdef USE_NEON
|
|
const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
|
|
const float32x4_t zero_times_scale_dup =
|
|
vdupq_n_f32(static_cast<float>(-zero_point * scale));
|
|
for (; i <= flat_size - 8; i += 8) {
|
|
const uint8x8_t input_u8 = vld1_u8(input_data + i);
|
|
const uint16x8_t input_u16 = vmovl_u8(input_u8);
|
|
const int16x8_t input_s16 = vreinterpretq_s16_u16(input_u16);
|
|
const int16x4_t input_s16_low = vget_low_s16(input_s16);
|
|
const int16x4_t input_s16_high = vget_high_s16(input_s16);
|
|
const int32x4_t val_low = vmovl_s16(input_s16_low);
|
|
const int32x4_t val_high = vmovl_s16(input_s16_high);
|
|
|
|
float32x4_t result_low, result_high;
|
|
ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup,
|
|
&result_low);
|
|
ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup,
|
|
&result_high);
|
|
|
|
vst1q_f32(output_data + i, result_low);
|
|
vst1q_f32(output_data + i + 4, result_high);
|
|
}
|
|
#endif // NEON
|
|
for (; i < flat_size; ++i) {
|
|
const int32 val = input_data[i];
|
|
const float result = static_cast<float>(scale * (val - zero_point));
|
|
output_data[i] = result;
|
|
}
|
|
}
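// Worked example (illustrative values): with zero_point = 128 and scale = 0.5,
// a uint8 input of 200 dequantizes to (200 - 128) * 0.5 = 36.0f.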
|
|
|
|
inline void Dequantize(const tflite::DequantizationParams& op_params,
|
|
const RuntimeShape& input_shape,
|
|
const int8_t* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("Dequantize/Int8");
|
|
const int32 zero_point = op_params.zero_point;
|
|
const double scale = op_params.scale;
|
|
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
|
|
|
int i = 0;
|
|
#ifdef USE_NEON
|
|
const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
|
|
const float32x4_t zero_times_scale_dup =
|
|
vdupq_n_f32(static_cast<float>(-zero_point * scale));
|
|
for (; i <= flat_size - 8; i += 8) {
|
|
const int8x8_t input_s8 = vld1_s8(input_data + i);
|
|
const int16x8_t input_s16 = vmovl_s8(input_s8);
|
|
const int16x4_t input_s16_low = vget_low_s16(input_s16);
|
|
const int16x4_t input_s16_high = vget_high_s16(input_s16);
|
|
const int32x4_t val_low = vmovl_s16(input_s16_low);
|
|
const int32x4_t val_high = vmovl_s16(input_s16_high);
|
|
|
|
float32x4_t result_low, result_high;
|
|
ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup,
|
|
&result_low);
|
|
ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup,
|
|
&result_high);
|
|
|
|
vst1q_f32(output_data + i, result_low);
|
|
vst1q_f32(output_data + i + 4, result_high);
|
|
}
|
|
#endif // NEON
|
|
for (; i < flat_size; ++i) {
|
|
const int32 val = input_data[i];
|
|
const float result = static_cast<float>(scale * (val - zero_point));
|
|
output_data[i] = result;
|
|
}
|
|
}
|
|
|
|
inline void Dequantize(const tflite::DequantizationParams& op_params,
|
|
const RuntimeShape& input_shape,
|
|
const int16_t* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
ruy::profiler::ScopeLabel label("Dequantize/Int16");
|
|
const int32 zero_point = op_params.zero_point;
|
|
const double scale = op_params.scale;
|
|
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
|
|
|
int i = 0;
|
|
#ifdef USE_NEON
|
|
const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
|
|
const float32x4_t zero_times_scale_dup =
|
|
vdupq_n_f32(static_cast<float>(-zero_point * scale));
|
|
for (; i <= flat_size - 8; i += 8) {
|
|
const int16x4_t input_s16_low = vld1_s16(input_data + i);
|
|
const int16x4_t input_s16_high = vld1_s16(input_data + i + 4);
|
|
const int32x4_t val_low = vmovl_s16(input_s16_low);
|
|
const int32x4_t val_high = vmovl_s16(input_s16_high);
|
|
|
|
float32x4_t result_low, result_high;
|
|
ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup,
|
|
&result_low);
|
|
ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup,
|
|
&result_high);
|
|
|
|
vst1q_f32(output_data + i, result_low);
|
|
vst1q_f32(output_data + i + 4, result_high);
|
|
}
|
|
#endif // NEON
|
|
for (; i < flat_size; ++i) {
|
|
const int32 val = input_data[i];
|
|
const float result = static_cast<float>(scale * (val - zero_point));
|
|
output_data[i] = result;
|
|
}
|
|
}
|
|
|
|
inline void Dequantize(const RuntimeShape& input_shape,
|
|
const Eigen::half* input_data,
|
|
const RuntimeShape& output_shape, float* output_data) {
|
|
reference_ops::Dequantize(input_shape, input_data, output_shape, output_data);
|
|
}
|
|
|
|
template <typename T>
|
|
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
|
|
const RuntimeShape& input_shape,
|
|
const float* input_data,
|
|
const RuntimeShape& output_shape, T* output_data) {
|
|
reference_ops::AffineQuantize(op_params, input_shape, input_data,
|
|
output_shape, output_data);
|
|
}
|
|
|
|
template <>
|
|
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
|
|
const RuntimeShape& input_shape,
|
|
const float* input_data,
|
|
const RuntimeShape& output_shape,
|
|
int8_t* output_data) {
|
|
ruy::profiler::ScopeLabel label("Quantize/Int8");
|
|
const int32 zero_point = op_params.zero_point;
|
|
const double scale = static_cast<double>(op_params.scale);
|
|
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
|
static constexpr int32 min_val = std::numeric_limits<int8_t>::min();
|
|
static constexpr int32 max_val = std::numeric_limits<int8_t>::max();
|
|
|
|
int i = 0;
|
|
#ifdef USE_NEON
|
|
const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
|
|
const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
|
|
const int32x4_t min_val_dup = vdupq_n_s32(min_val);
|
|
const int32x4_t max_val_dup = vdupq_n_s32(max_val);
|
|
|
|
for (; i <= flat_size - 8; i += 8) {
|
|
const float* src_data_ptr = input_data + i;
|
|
float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
|
|
float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
|
|
|
|
input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
|
|
input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
|
|
|
|
int32x4_t casted_val_0 = RoundToNearest(input_val_0);
|
|
int32x4_t casted_val_1 = RoundToNearest(input_val_1);
|
|
|
|
casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
|
|
casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
|
|
|
|
// Clamp the values to fit the target type's range.
|
|
casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
|
|
casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
|
|
casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
|
|
casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
|
|
|
|
const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
|
|
const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
|
|
const int16x8_t combined_val = vcombine_s16(narrowed_val_0, narrowed_val_1);
|
|
const int8x8_t combined_val_narrowed = vmovn_s16(combined_val);
|
|
vst1_s8(output_data + i, combined_val_narrowed);
|
|
}
|
|
#endif // NEON
|
|
|
|
for (; i < flat_size; ++i) {
|
|
const float val = input_data[i];
|
|
const int32 unclamped =
|
|
static_cast<int32>(TfLiteRound(val / scale)) + zero_point;
|
|
const int32 clamped = std::min(std::max(unclamped, min_val), max_val);
|
|
output_data[i] = clamped;
|
|
}
|
|
}
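// Worked example (illustrative values): with scale = 0.01 and zero_point = -1,
// an input of 0.75f quantizes to round(0.75 / 0.01) + (-1) = 74, which already
// lies inside [-128, 127] so no clamping occurs.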
|
|
|
|
template <>
|
|
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
|
|
const RuntimeShape& input_shape,
|
|
const float* input_data,
|
|
const RuntimeShape& output_shape,
|
|
uint8_t* output_data) {
|
|
ruy::profiler::ScopeLabel label("Quantize/Uint8");
|
|
const int32 zero_point = op_params.zero_point;
|
|
const double scale = static_cast<double>(op_params.scale);
|
|
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
|
static constexpr int32 min_val = std::numeric_limits<uint8_t>::min();
|
|
static constexpr int32 max_val = std::numeric_limits<uint8_t>::max();
|
|
|
|
int i = 0;
|
|
#ifdef USE_NEON
|
|
const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
|
|
const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
|
|
const int32x4_t min_val_dup = vdupq_n_s32(min_val);
|
|
const int32x4_t max_val_dup = vdupq_n_s32(max_val);
|
|
|
|
for (; i <= flat_size - 8; i += 8) {
|
|
const float* src_data_ptr = input_data + i;
|
|
float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
|
|
float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
|
|
|
|
input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
|
|
input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
|
|
|
|
int32x4_t casted_val_0 = RoundToNearest(input_val_0);
|
|
int32x4_t casted_val_1 = RoundToNearest(input_val_1);
|
|
|
|
casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
|
|
casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
|
|
|
|
// Clamp the values to fit the target type's range.
|
|
casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
|
|
casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
|
|
casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
|
|
casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
|
|
|
|
const uint16x4_t narrowed_val_0 = vqmovun_s32(casted_val_0);
|
|
const uint16x4_t narrowed_val_1 = vqmovun_s32(casted_val_1);
|
|
const uint16x8_t combined_val =
|
|
vcombine_u16(narrowed_val_0, narrowed_val_1);
|
|
const uint8x8_t combined_val_narrowed = vmovn_u16(combined_val);
|
|
vst1_u8(output_data + i, combined_val_narrowed);
|
|
}
|
|
#endif // NEON
|
|
|
|
for (; i < flat_size; ++i) {
|
|
const float val = input_data[i];
|
|
const int32 unclamped =
|
|
static_cast<int32>(TfLiteRound(val / scale)) + zero_point;
|
|
const int32 clamped = std::min(std::max(unclamped, min_val), max_val);
|
|
output_data[i] = clamped;
|
|
}
|
|
}
|
|
|
|
template <>
|
|
inline void AffineQuantize(const tflite::QuantizationParams& op_params,
|
|
const RuntimeShape& input_shape,
|
|
const float* input_data,
|
|
const RuntimeShape& output_shape,
|
|
int16_t* output_data) {
|
|
ruy::profiler::ScopeLabel label("Quantize/Int16");
|
|
const int32 zero_point = op_params.zero_point;
|
|
const double scale = static_cast<double>(op_params.scale);
|
|
const int flat_size = MatchingFlatSize(input_shape, output_shape);
|
|
static constexpr int32 min_val = std::numeric_limits<int16_t>::min();
|
|
static constexpr int32 max_val = std::numeric_limits<int16_t>::max();
|
|
|
|
int i = 0;
|
|
#ifdef USE_NEON
|
|
const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
|
|
const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
|
|
const int32x4_t min_val_dup = vdupq_n_s32(min_val);
|
|
const int32x4_t max_val_dup = vdupq_n_s32(max_val);
|
|
|
|
for (; i <= flat_size - 8; i += 8) {
|
|
const float* src_data_ptr = input_data + i;
|
|
float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
|
|
float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
|
|
|
|
input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
|
|
input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
|
|
|
|
int32x4_t casted_val_0 = RoundToNearest(input_val_0);
|
|
int32x4_t casted_val_1 = RoundToNearest(input_val_1);
|
|
|
|
casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
|
|
casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
|
|
|
|
// Clamp the values to fit the target type's range.
|
|
casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
|
|
casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
|
|
casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
|
|
casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
|
|
|
|
const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
|
|
const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
|
|
vst1_s16(output_data + i, narrowed_val_0);
|
|
vst1_s16(output_data + i + 4, narrowed_val_1);
|
|
}
|
|
#endif // NEON
|
|
|
|
for (; i < flat_size; ++i) {
|
|
const float val = input_data[i];
|
|
const int32 unclamped =
|
|
static_cast<int32>(TfLiteRound(val / scale)) + zero_point;
|
|
const int32 clamped = std::min(std::max(unclamped, min_val), max_val);
|
|
output_data[i] = clamped;
|
|
}
|
|
}
|
|
|
|
// TODO(b/139252020): Replace GEMMLOWP_NEON with USE_NEON when the bug is fixed.
|
|
// The converted versions of gemmlowp::tanh and gemmlowp::logistic, done by
|
|
// arm_sse_2_neon.h, produce incorrect results with int16x8_t data types.
|
|
#ifdef GEMMLOWP_NEON
|
|
|
|
inline int16x8x4_t SaturatingRounding(
|
|
int16x8_t input_val_0, int16x8_t input_val_1, int16x8_t input_val_2,
|
|
int16x8_t input_val_3, int input_left_shift, int input_multiplier) {
|
|
// This performs what is expressed in the scalar code as
|
|
// const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul(
|
|
// static_cast<int16>(input_val_centered * (1 << input_left_shift)),
|
|
// static_cast<int16>(input_multiplier));
|
|
const int16x8_t left_shift_dup = vdupq_n_s16(input_left_shift);
|
|
const int16x8_t input_val_shifted_0 = vshlq_s16(input_val_0, left_shift_dup);
|
|
const int16x8_t input_val_shifted_1 = vshlq_s16(input_val_1, left_shift_dup);
|
|
const int16x8_t input_val_shifted_2 = vshlq_s16(input_val_2, left_shift_dup);
|
|
const int16x8_t input_val_shifted_3 = vshlq_s16(input_val_3, left_shift_dup);
|
|
int16x8x4_t result;
|
|
result.val[0] = vqrdmulhq_n_s16(input_val_shifted_0, input_multiplier);
|
|
result.val[1] = vqrdmulhq_n_s16(input_val_shifted_1, input_multiplier);
|
|
result.val[2] = vqrdmulhq_n_s16(input_val_shifted_2, input_multiplier);
|
|
result.val[3] = vqrdmulhq_n_s16(input_val_shifted_3, input_multiplier);
|
|
return result;
|
|
}
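// For reference, vqrdmulhq_n_s16 above multiplies two Q15 values per lane and
// returns the rounded, saturated high half of the doubled product. E.g.
// 16384 (0.5 in Q15) times 16384 yields 8192 (0.25 in Q15).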
|
|
|
|
// 4-bit fixed point is enough here since logistic(16) is already equal to one
// to about 7 decimal places.
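// In gemmlowp::FixedPoint<int16x8_t, 4>, a raw int16 lane r represents
// r / 2^11 (4 integer bits, 11 fractional bits), covering about [-16, 16).
// With 0 integer bits the raw value represents r / 2^15, i.e. [-1, 1).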
|
|
inline int16x8x4_t FixedPoint4Logistic(int16x8x4_t input_val) {
|
|
// Invoke gemmlowp::logistic on FixedPoint wrapping int16x8_t
|
|
using FixedPoint4 = gemmlowp::FixedPoint<int16x8_t, 4>;
|
|
using FixedPoint0 = gemmlowp::FixedPoint<int16x8_t, 0>;
|
|
const FixedPoint4 input_val_f4_0 = FixedPoint4::FromRaw(input_val.val[0]);
|
|
const FixedPoint4 input_val_f4_1 = FixedPoint4::FromRaw(input_val.val[1]);
|
|
const FixedPoint4 input_val_f4_2 = FixedPoint4::FromRaw(input_val.val[2]);
|
|
const FixedPoint4 input_val_f4_3 = FixedPoint4::FromRaw(input_val.val[3]);
|
|
|
|
// TODO(b/134622898) Implement a low accuracy version of logistic. In this
// method, gemmlowp::tanh accounts for about 80% of the execution time. The
// current implementation is roughly 12-bit accurate in the 16-bit
// fixed-point case, so there is still room for improvement before the error
// bounds are reached.
|
|
const FixedPoint0 output_val_f0_0 = gemmlowp::logistic(input_val_f4_0);
|
|
const FixedPoint0 output_val_f0_1 = gemmlowp::logistic(input_val_f4_1);
|
|
const FixedPoint0 output_val_f0_2 = gemmlowp::logistic(input_val_f4_2);
|
|
const FixedPoint0 output_val_f0_3 = gemmlowp::logistic(input_val_f4_3);
|
|
|
|
// Divide by 2^7 as in the scalar code
|
|
int16x8x4_t result;
|
|
result.val[0] = vrshrq_n_s16(output_val_f0_0.raw(), 7);
|
|
result.val[1] = vrshrq_n_s16(output_val_f0_1.raw(), 7);
|
|
result.val[2] = vrshrq_n_s16(output_val_f0_2.raw(), 7);
|
|
result.val[3] = vrshrq_n_s16(output_val_f0_3.raw(), 7);
|
|
return result;
|
|
}
|
|
|
|
// 4-bit fixed point is enough for tanh since tanh(16) is almost the same as
// one, to at least 11 decimal places.
|
|
inline int16x8x4_t FixedPoint4Tanh(int16x8x4_t input_val) {
|
|
// Invoke gemmlowp::tanh on FixedPoint wrapping int16x8_t
|
|
using FixedPoint4 = gemmlowp::FixedPoint<int16x8_t, 4>;
|
|
using FixedPoint0 = gemmlowp::FixedPoint<int16x8_t, 0>;
|
|
const FixedPoint4 input_val_f4_0 = FixedPoint4::FromRaw(input_val.val[0]);
|
|
const FixedPoint4 input_val_f4_1 = FixedPoint4::FromRaw(input_val.val[1]);
|
|
const FixedPoint4 input_val_f4_2 = FixedPoint4::FromRaw(input_val.val[2]);
|
|
const FixedPoint4 input_val_f4_3 = FixedPoint4::FromRaw(input_val.val[3]);
|
|
|
|
// TODO(b/134622898) Implement a low accuracy version of logistic. In this
// method, gemmlowp::tanh accounts for about 80% of the execution time. The
// current implementation is roughly 12-bit accurate in the 16-bit
// fixed-point case, so there is still room for improvement before the error
// bounds are reached.
|
|
const FixedPoint0 output_val_f0_0 = gemmlowp::tanh(input_val_f4_0);
|
|
const FixedPoint0 output_val_f0_1 = gemmlowp::tanh(input_val_f4_1);
|
|
const FixedPoint0 output_val_f0_2 = gemmlowp::tanh(input_val_f4_2);
|
|
const FixedPoint0 output_val_f0_3 = gemmlowp::tanh(input_val_f4_3);
|
|
|
|
// Divide by 2^8 as in the scalar code
|
|
int16x8x4_t result;
|
|
result.val[0] = vrshrq_n_s16(output_val_f0_0.raw(), 8);
|
|
result.val[1] = vrshrq_n_s16(output_val_f0_1.raw(), 8);
|
|
result.val[2] = vrshrq_n_s16(output_val_f0_2.raw(), 8);
|
|
result.val[3] = vrshrq_n_s16(output_val_f0_3.raw(), 8);
|
|
return result;
|
|
}
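// The tanh output above is a 0-integer-bit fixed point value in [-1, 1), i.e.
// a raw Q15 value in [-32768, 32767]. Shifting right by 8 maps it to
// [-128, 128); the uint8 callers below then add an output zero point of 128
// (clamping 256 down to 255), while the int8 callers use it directly.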
|
|
|
|
inline uint8x16x2_t CalculateUnsignedClampingWithRangeBitMasks(
|
|
int16x8x2_t input_val, int16x8_t range_radius_dup,
|
|
int16x8_t neg_range_radius_dup) {
|
|
const uint16x8_t mask_rightclamp_0 =
|
|
vcgtq_s16(input_val.val[0], range_radius_dup);
|
|
const uint16x8_t mask_rightclamp_1 =
|
|
vcgtq_s16(input_val.val[1], range_radius_dup);
|
|
|
|
const uint16x8_t mask_leftclamp_0 =
|
|
vcgeq_s16(input_val.val[0], neg_range_radius_dup);
|
|
const uint16x8_t mask_leftclamp_1 =
|
|
vcgeq_s16(input_val.val[1], neg_range_radius_dup);
|
|
|
|
uint8x16x2_t result;
|
|
result.val[0] = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
|
|
vshrn_n_u16(mask_leftclamp_1, 8));
|
|
result.val[1] = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
|
|
vshrn_n_u16(mask_rightclamp_1, 8));
|
|
return result;
|
|
}
|
|
|
|
inline uint8x16x2_t CalculateSignedClampingWithRangeBitMasks(
|
|
int16x8x2_t input_val, int16x8_t range_radius_dup,
|
|
int16x8_t neg_range_radius_dup) {
|
|
const uint16x8_t mask_rightclamp_0 =
|
|
vcgtq_s16(input_val.val[0], range_radius_dup);
|
|
const uint16x8_t mask_rightclamp_1 =
|
|
vcgtq_s16(input_val.val[1], range_radius_dup);
|
|
|
|
const uint16x8_t mask_leftclamp_0 =
|
|
vcltq_s16(input_val.val[0], neg_range_radius_dup);
|
|
const uint16x8_t mask_leftclamp_1 =
|
|
vcltq_s16(input_val.val[1], neg_range_radius_dup);
|
|
|
|
uint8x16x2_t result;
|
|
result.val[0] = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
|
|
vshrn_n_u16(mask_leftclamp_1, 8));
|
|
result.val[1] = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
|
|
vshrn_n_u16(mask_rightclamp_1, 8));
|
|
return result;
|
|
}
|
|
|
|
inline void ClampWithRangeAndStore(uint8_t* output_dst, uint8x16_t input_val,
|
|
uint8x16x2_t masks_clamp) {
|
|
// Store back to memory
|
|
vst1q_u8(output_dst, vandq_u8(vorrq_u8(input_val, masks_clamp.val[1]),
|
|
masks_clamp.val[0]));
|
|
}
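// The masks hold per-byte 0x00/0xFF values, produced from the 16-bit
// comparison results by vshrn_n_u16(..., 8). ORing with the right-clamp mask
// forces lanes above +input_range_radius to 255, ANDing with the left-clamp
// mask forces lanes below -input_range_radius to 0, and in-range lanes pass
// through unchanged.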
|
|
|
|
inline void ClampWithRangeAndStore(int8_t* output_dst, int8x16_t input_val,
|
|
uint8x16x2_t masks_clamp) {
|
|
static const int8x16_t max_dup = vdupq_n_s8(127);
|
|
static const int8x16_t min_dup = vdupq_n_s8(-128);
|
|
// Store back to memory
|
|
vst1q_s8(output_dst,
|
|
vbslq_s8(masks_clamp.val[1], max_dup,
|
|
vbslq_s8(masks_clamp.val[0], min_dup, input_val)));
|
|
}
|
|
|
|
#endif // GEMMLOWP_NEON
|
|
|
|
inline void Tanh16bitPrecision(const TanhParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const uint8* input_data,
|
|
const RuntimeShape& output_shape,
|
|
uint8* output_data) {
|
|
// Note that this is almost the exact same code as in Logistic().
|
|
ruy::profiler::ScopeLabel label("Tanh/Uint8");
|
|
const int32 input_zero_point = params.input_zero_point;
|
|
const int32 input_range_radius = params.input_range_radius;
|
|
const int16 input_multiplier = static_cast<int16>(params.input_multiplier);
|
|
const int16 input_left_shift = static_cast<int16>(params.input_left_shift);
|
|
const int size = MatchingFlatSize(input_shape, output_shape);
|
|
|
|
int c = 0;
|
|
int16_t output_zero_point = 128;
|
|
|
|
// TODO(b/139252020): Replace GEMMLOWP_NEON with USE_NEON when the bug is fixed.
|
|
// The converted versions of gemmlowp::tanh and gemmlowp::logistic, done by
|
|
// arm_sse_2_neon.h, produce incorrect results with int16x8_t data types.
|
|
#ifdef GEMMLOWP_NEON
|
|
const int16x8_t range_radius_dup = vdupq_n_s16(input_range_radius);
|
|
const int16x8_t neg_range_radius_dup = vdupq_n_s16(-input_range_radius);
|
|
const int16x8_t output_zero_point_s16 = vdupq_n_s16(output_zero_point);
|
|
|
|
// Handle 32 values at a time
|
|
for (; c <= size - 32; c += 32) {
|
|
// Read input uint8 values, cast to int16 and subtract input_zero_point
|
|
using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint;
|
|
const int16x8x2_t input_val_centered_0_1 =
|
|
Load16AndSubtractZeroPoint(input_data + c, input_zero_point);
|
|
const int16x8x2_t input_val_centered_2_3 =
|
|
Load16AndSubtractZeroPoint(input_data + c + 16, input_zero_point);
|
|
|
|
// Prepare the bit masks that we will use at the end to implement the logic
|
|
// that was expressed in the scalar code with branching:
|
|
// if (input_val_centered < -input_range_radius) {
|
|
// output_val = 0;
|
|
// } else if (input_val_centered > input_range_radius) {
|
|
// output_val = 255;
|
|
// } else {
|
|
// ...
|
|
uint8x16x2_t masks_clamp_0_1 = CalculateUnsignedClampingWithRangeBitMasks(
|
|
input_val_centered_0_1, range_radius_dup, neg_range_radius_dup);
|
|
uint8x16x2_t masks_clamp_2_3 = CalculateUnsignedClampingWithRangeBitMasks(
|
|
input_val_centered_2_3, range_radius_dup, neg_range_radius_dup);
|
|
|
|
int16x8x4_t input_val_rescaled = SaturatingRounding(
|
|
input_val_centered_0_1.val[0], input_val_centered_0_1.val[1],
|
|
input_val_centered_2_3.val[0], input_val_centered_2_3.val[1],
|
|
input_left_shift, input_multiplier);
|
|
|
|
int16x8x4_t output_val_s16 = FixedPoint4Tanh(input_val_rescaled);
|
|
|
|
// Add the output zero point
|
|
output_val_s16.val[0] =
|
|
vaddq_s16(output_val_s16.val[0], output_zero_point_s16);
|
|
output_val_s16.val[1] =
|
|
vaddq_s16(output_val_s16.val[1], output_zero_point_s16);
|
|
output_val_s16.val[2] =
|
|
vaddq_s16(output_val_s16.val[2], output_zero_point_s16);
|
|
output_val_s16.val[3] =
|
|
vaddq_s16(output_val_s16.val[3], output_zero_point_s16);
|
|
|
|
// Cast output values to uint8, saturating
|
|
uint8x16_t output_val_u8_0_1 = vcombine_u8(
|
|
vqmovun_s16(output_val_s16.val[0]), vqmovun_s16(output_val_s16.val[1]));
|
|
uint8x16_t output_val_u8_2_3 = vcombine_u8(
|
|
vqmovun_s16(output_val_s16.val[2]), vqmovun_s16(output_val_s16.val[3]));
|
|
|
|
ClampWithRangeAndStore(output_data + c, output_val_u8_0_1, masks_clamp_0_1);
|
|
ClampWithRangeAndStore(output_data + c + 16, output_val_u8_2_3,
|
|
masks_clamp_2_3);
|
|
}
|
|
#endif // GEMMLOWP_NEON
|
|
// Leftover loop: handle one value at a time with scalar code.
|
|
for (; c < size; ++c) {
|
|
const uint8 input_val_u8 = input_data[c];
|
|
const int16 input_val_centered =
|
|
static_cast<int16>(input_val_u8) - input_zero_point;
|
|
uint8 output_val;
|
|
if (input_val_centered < -input_range_radius) {
|
|
output_val = 0;
|
|
} else if (input_val_centered > input_range_radius) {
|
|
output_val = 255;
|
|
} else {
|
|
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
|
const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul(
|
|
static_cast<int16>(input_val_centered * (1 << input_left_shift)),
|
|
static_cast<int16>(input_multiplier));
|
|
using FixedPoint4 = gemmlowp::FixedPoint<int16, 4>;
|
|
using FixedPoint0 = gemmlowp::FixedPoint<int16, 0>;
|
|
const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
|
|
const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
|
|
using gemmlowp::RoundingDivideByPOT;
|
|
int16 output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 8);
|
|
output_val_s16 += output_zero_point;
|
|
if (output_val_s16 == 256) {
|
|
output_val_s16 = 255;
|
|
}
|
|
TFLITE_DCHECK_GE(output_val_s16, 0);
|
|
TFLITE_DCHECK_LE(output_val_s16, 255);
|
|
output_val = static_cast<uint8>(output_val_s16);
|
|
}
|
|
output_data[c] = output_val;
|
|
}
|
|
}
|
|
|
|
inline void Tanh16bitPrecision(const TanhParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const int8* input_data,
|
|
const RuntimeShape& output_shape,
|
|
int8* output_data) {
|
|
// Note that this is almost the exact same code as in Logistic().
|
|
ruy::profiler::ScopeLabel label("Tanh/Int8");
|
|
const int32 input_zero_point = params.input_zero_point;
|
|
const int32 input_range_radius = params.input_range_radius;
|
|
const int16 input_multiplier = static_cast<int16>(params.input_multiplier);
|
|
const int16 input_left_shift = static_cast<int16>(params.input_left_shift);
|
|
const int size = MatchingFlatSize(input_shape, output_shape);
|
|
|
|
int c = 0;
|
|
// TODO(b/139252020): Replace GEMMLOWP_NEON with USE_NEON when the bug is fixed.
|
|
// The converted versions of gemmlowp::tanh and gemmlowp::logistic, done by
|
|
// arm_sse_2_neon.h, produce incorrect results with int16x8_t data types.
|
|
#ifdef GEMMLOWP_NEON
|
|
const int16x8_t range_radius_dup = vdupq_n_s16(input_range_radius);
|
|
const int16x8_t neg_range_radius_dup = vdupq_n_s16(-input_range_radius);
|
|
|
|
// Handle 32 values at a time
|
|
for (; c <= size - 32; c += 32) {
|
|
// Read input int8 values, cast to int16 and subtract input_zero_point
|
|
using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint;
|
|
const int16x8x2_t input_val_centered_0_1 =
|
|
Load16AndSubtractZeroPoint(input_data + c, input_zero_point);
|
|
const int16x8x2_t input_val_centered_2_3 =
|
|
Load16AndSubtractZeroPoint(input_data + c + 16, input_zero_point);
|
|
|
|
// Prepare the bit masks that we will use at the end to implement the logic
|
|
// that was expressed in the scalar code with branching:
|
|
// if (input_val_centered < -input_range_radius) {
|
|
// output_val = -128;
|
|
// } else if (input_val_centered > input_range_radius) {
|
|
// output_val = 127;
|
|
// } else {
|
|
// ...
|
|
uint8x16x2_t masks_clamp_0_1 = CalculateSignedClampingWithRangeBitMasks(
|
|
input_val_centered_0_1, range_radius_dup, neg_range_radius_dup);
|
|
uint8x16x2_t masks_clamp_2_3 = CalculateSignedClampingWithRangeBitMasks(
|
|
input_val_centered_2_3, range_radius_dup, neg_range_radius_dup);
|
|
|
|
int16x8x4_t input_val_rescaled = SaturatingRounding(
|
|
input_val_centered_0_1.val[0], input_val_centered_0_1.val[1],
|
|
input_val_centered_2_3.val[0], input_val_centered_2_3.val[1],
|
|
input_left_shift, input_multiplier);
|
|
|
|
int16x8x4_t output_val_s16 = FixedPoint4Tanh(input_val_rescaled);
|
|
|
|
// Cast output values to int8, saturating
|
|
int8x16_t output_val_s8_0_1 = vcombine_s8(
|
|
vqmovn_s16(output_val_s16.val[0]), vqmovn_s16(output_val_s16.val[1]));
|
|
int8x16_t output_val_s8_2_3 = vcombine_s8(
|
|
vqmovn_s16(output_val_s16.val[2]), vqmovn_s16(output_val_s16.val[3]));
|
|
|
|
ClampWithRangeAndStore(output_data + c, output_val_s8_0_1, masks_clamp_0_1);
|
|
ClampWithRangeAndStore(output_data + c + 16, output_val_s8_2_3,
|
|
masks_clamp_2_3);
|
|
}
|
|
#endif // GEMMLOWP_NEON
|
|
// Leftover loop: handle one value at a time with scalar code.
|
|
for (; c < size; ++c) {
|
|
const int8 input_val_s8 = input_data[c];
|
|
const int16 input_val_centered =
|
|
static_cast<int16>(input_val_s8) - input_zero_point;
|
|
int8 output_val;
|
|
if (input_val_centered <= -input_range_radius) {
|
|
output_val = -128;
|
|
} else if (input_val_centered >= input_range_radius) {
|
|
output_val = 127;
|
|
} else {
|
|
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
|
const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul(
|
|
static_cast<int16>(input_val_centered * (1 << input_left_shift)),
|
|
static_cast<int16>(input_multiplier));
|
|
using FixedPoint4 = gemmlowp::FixedPoint<int16, 4>;
|
|
using FixedPoint0 = gemmlowp::FixedPoint<int16, 0>;
|
|
const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
|
|
const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
|
|
using gemmlowp::RoundingDivideByPOT;
|
|
int16 output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 8);
|
|
if (output_val_s16 == 128) {
|
|
output_val_s16 = 127;
|
|
}
|
|
TFLITE_DCHECK_GE(output_val_s16, -128);
|
|
TFLITE_DCHECK_LE(output_val_s16, 127);
|
|
output_val = static_cast<int8>(output_val_s16);
|
|
}
|
|
output_data[c] = output_val;
|
|
}
|
|
}
|
|
|
|
inline void Logistic16bitPrecision(const LogisticParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const uint8* input_data,
|
|
const RuntimeShape& output_shape,
|
|
uint8* output_data) {
|
|
ruy::profiler::ScopeLabel label("Logistic/Uint8");
|
|
const int32 input_zero_point = params.input_zero_point;
|
|
const int32 input_range_radius = params.input_range_radius;
|
|
const int32 input_multiplier = params.input_multiplier;
|
|
const int16 input_left_shift = static_cast<int16>(params.input_left_shift);
|
|
const int size = MatchingFlatSize(input_shape, output_shape);
|
|
|
|
int c = 0;
|
|
// TODO(b/139252020): Replace GEMMLOWP_NEON with USE_NEON when the bug is fixed.
|
|
// The converted versions of gemmlowp::tanh and gemmlowp::logistic, done by
|
|
// arm_sse_2_neon.h, produce incorrect results with int16x8_t data types.
|
|
#ifdef GEMMLOWP_NEON
|
|
const int16x8_t range_radius_dup = vdupq_n_s16(input_range_radius);
|
|
const int16x8_t neg_range_radius_dup = vdupq_n_s16(-input_range_radius);
|
|
|
|
// Handle 32 values at a time
|
|
for (; c <= size - 32; c += 32) {
|
|
// Read input uint8 values, cast to int16 and subtract input_zero_point
|
|
using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint;
|
|
const int16x8x2_t input_val_centered_0_1 =
|
|
Load16AndSubtractZeroPoint(input_data + c, input_zero_point);
|
|
const int16x8x2_t input_val_centered_2_3 =
|
|
Load16AndSubtractZeroPoint(input_data + c + 16, input_zero_point);
|
|
|
|
// Prepare the bit masks that we will use at the end to implement the logic
|
|
// that was expressed in the scalar code with branching:
|
|
// if (input_val_centered < -input_range_radius) {
|
|
// output_val = 0;
|
|
// } else if (input_val_centered > input_range_radius) {
|
|
// output_val = 255;
|
|
// } else {
|
|
// ...
|
|
uint8x16x2_t masks_clamp_0_1 = CalculateUnsignedClampingWithRangeBitMasks(
|
|
input_val_centered_0_1, range_radius_dup, neg_range_radius_dup);
|
|
uint8x16x2_t masks_clamp_2_3 = CalculateUnsignedClampingWithRangeBitMasks(
|
|
input_val_centered_2_3, range_radius_dup, neg_range_radius_dup);
|
|
|
|
int16x8x4_t input_val_rescaled = SaturatingRounding(
|
|
input_val_centered_0_1.val[0], input_val_centered_0_1.val[1],
|
|
input_val_centered_2_3.val[0], input_val_centered_2_3.val[1],
|
|
input_left_shift, input_multiplier);
|
|
|
|
int16x8x4_t output_val_s16 = FixedPoint4Logistic(input_val_rescaled);
|
|
|
|
// Cast output values to uint8, saturating
|
|
uint8x16_t output_val_u8_0_1 = vcombine_u8(
|
|
vqmovun_s16(output_val_s16.val[0]), vqmovun_s16(output_val_s16.val[1]));
|
|
uint8x16_t output_val_u8_2_3 = vcombine_u8(
|
|
vqmovun_s16(output_val_s16.val[2]), vqmovun_s16(output_val_s16.val[3]));
|
|
|
|
ClampWithRangeAndStore(output_data + c, output_val_u8_0_1, masks_clamp_0_1);
|
|
ClampWithRangeAndStore(output_data + c + 16, output_val_u8_2_3,
|
|
masks_clamp_2_3);
|
|
}
|
|
#endif // GEMMLOWP_NEON
|
|
// Leftover loop: handle one value at a time with scalar code.
|
|
for (; c < size; ++c) {
|
|
const uint8 input_val_u8 = input_data[c];
|
|
const int16 input_val_centered =
|
|
static_cast<int16>(input_val_u8) - input_zero_point;
|
|
uint8 output_val;
|
|
if (input_val_centered < -input_range_radius) {
|
|
output_val = 0;
|
|
} else if (input_val_centered > input_range_radius) {
|
|
output_val = 255;
|
|
} else {
|
|
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
|
const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul(
|
|
static_cast<int16>(input_val_centered * (1 << input_left_shift)),
|
|
static_cast<int16>(input_multiplier));
|
|
using FixedPoint4 = gemmlowp::FixedPoint<int16, 4>;
|
|
using FixedPoint0 = gemmlowp::FixedPoint<int16, 0>;
|
|
const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
|
|
const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
|
|
using gemmlowp::RoundingDivideByPOT;
|
|
int16 output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 7);
|
|
if (output_val_s16 == 256) {
|
|
output_val_s16 = 255;
|
|
}
|
|
TFLITE_DCHECK_GE(output_val_s16, 0);
|
|
TFLITE_DCHECK_LE(output_val_s16, 255);
|
|
output_val = static_cast<uint8>(output_val_s16);
|
|
}
|
|
output_data[c] = output_val;
|
|
}
|
|
}
|
|
|
|
inline void Logistic16bitPrecision(const LogisticParams& params,
|
|
const RuntimeShape& input_shape,
|
|
const int8* input_data,
|
|
const RuntimeShape& output_shape,
|
|
int8* output_data) {
|
|
ruy::profiler::ScopeLabel label("Logistic/Int8");
|
|
const int32 input_zero_point = params.input_zero_point;
|
|
const int32 input_range_radius = params.input_range_radius;
|
|
const int32 input_multiplier = params.input_multiplier;
|
|
const int16 input_left_shift = static_cast<int16>(params.input_left_shift);
|
|
const int size = MatchingFlatSize(input_shape, output_shape);
|
|
|
|
int c = 0;
|
|
const int16 output_zero_point = 128;
|
|
// TODO(b/139252020): Replace GEMMLOWP_NEON with USE_NEON when the bug is fixed.
|
|
// The converted versions of gemmlowp::tanh and gemmlowp::logistic, done by
|
|
// arm_sse_2_neon.h, produce incorrect results with int16x8_t data types.
|
|
#ifdef GEMMLOWP_NEON
|
|
const int16x8_t range_radius_dup = vdupq_n_s16(input_range_radius);
|
|
const int16x8_t neg_range_radius_dup = vdupq_n_s16(-input_range_radius);
|
|
const int16x8_t output_zero_point_dup = vdupq_n_s16(output_zero_point);
|
|
|
|
// Handle 32 values at a time
|
|
for (; c <= size - 32; c += 32) {
|
|
// Read input int8 values, cast to int16 and subtract input_zero_point
|
|
using cpu_backend_gemm::detail::Load16AndSubtractZeroPoint;
|
|
const int16x8x2_t input_val_centered_0_1 =
|
|
Load16AndSubtractZeroPoint(input_data + c, input_zero_point);
|
|
const int16x8x2_t input_val_centered_2_3 =
|
|
Load16AndSubtractZeroPoint(input_data + c + 16, input_zero_point);
|
|
|
|
// Prepare the bit masks that we will use at the end to implement the logic
|
|
// that was expressed in the scalar code with branching:
|
|
// if (input_val_centered < -input_range_radius) {
|
|
// output_val = -128;
|
|
// } else if (input_val_centered > input_range_radius) {
|
|
// output_val = 127;
|
|
// } else {
|
|
// ...
|
|
uint8x16x2_t masks_clamp_0_1 = CalculateSignedClampingWithRangeBitMasks(
|
|
input_val_centered_0_1, range_radius_dup, neg_range_radius_dup);
|
|
uint8x16x2_t masks_clamp_2_3 = CalculateSignedClampingWithRangeBitMasks(
|
|
input_val_centered_2_3, range_radius_dup, neg_range_radius_dup);
|
|
|
|
int16x8x4_t input_val_rescaled = SaturatingRounding(
|
|
input_val_centered_0_1.val[0], input_val_centered_0_1.val[1],
|
|
input_val_centered_2_3.val[0], input_val_centered_2_3.val[1],
|
|
input_left_shift, input_multiplier);
|
|
|
|
int16x8x4_t output_val_s16 = FixedPoint4Logistic(input_val_rescaled);
|
|
|
|
// Subtract the output zero point.
|
|
output_val_s16.val[0] =
|
|
vsubq_s16(output_val_s16.val[0], output_zero_point_dup);
|
|
output_val_s16.val[1] =
|
|
vsubq_s16(output_val_s16.val[1], output_zero_point_dup);
|
|
output_val_s16.val[2] =
|
|
vsubq_s16(output_val_s16.val[2], output_zero_point_dup);
|
|
output_val_s16.val[3] =
|
|
vsubq_s16(output_val_s16.val[3], output_zero_point_dup);
|
|
|
|
// Cast output values to int8, saturating
|
|
int8x16_t output_val_s8_0_1 = vcombine_s8(
|
|
vqmovn_s16(output_val_s16.val[0]), vqmovn_s16(output_val_s16.val[1]));
|
|
int8x16_t output_val_s8_2_3 = vcombine_s8(
|
|
vqmovn_s16(output_val_s16.val[2]), vqmovn_s16(output_val_s16.val[3]));
|
|
|
|
ClampWithRangeAndStore(output_data + c, output_val_s8_0_1, masks_clamp_0_1);
|
|
ClampWithRangeAndStore(output_data + c + 16, output_val_s8_2_3,
|
|
masks_clamp_2_3);
|
|
}
|
|
#endif // GEMMLOWP_NEON
|
|
// Leftover loop: handle one value at a time with scalar code.
|
|
for (; c < size; ++c) {
|
|
const int8 input_val_s8 = input_data[c];
|
|
const int16 input_val_centered =
|
|
static_cast<int16>(input_val_s8) - input_zero_point;
|
|
int8 output_val;
|
|
if (input_val_centered < -input_range_radius) {
|
|
output_val = -128;
|
|
} else if (input_val_centered > input_range_radius) {
|
|
output_val = 127;
|
|
} else {
|
|
using gemmlowp::SaturatingRoundingDoublingHighMul;
|
|
const int16 input_val_rescaled = SaturatingRoundingDoublingHighMul(
|
|
static_cast<int16>(input_val_centered * (1 << input_left_shift)),
|
|
static_cast<int16>(input_multiplier));
|
|
using FixedPoint4 = gemmlowp::FixedPoint<int16, 4>;
|
|
using FixedPoint0 = gemmlowp::FixedPoint<int16, 0>;
|
|
const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
|
|
const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
|
|
using gemmlowp::RoundingDivideByPOT;
|
|
int16 output_val_s16 = RoundingDivideByPOT(output_val_f0.raw(), 7);
|
|
output_val_s16 -= output_zero_point;
|
|
if (output_val_s16 == 128) {
|
|
output_val_s16 = 127;
|
|
}
|
|
TFLITE_DCHECK_GE(output_val_s16, -128);
|
|
TFLITE_DCHECK_LE(output_val_s16, 127);
|
|
output_val = static_cast<int8>(output_val_s16);
|
|
}
|
|
output_data[c] = output_val;
|
|
}
|
|
}
|
|
|
|
// Transpose2D only deals with typical 2D matrix transpose ops.
|
|
// Perform transpose by transposing 4x4 blocks of the input, proceeding from
|
|
// left to right (down the rows) of the input, and then from top to bottom.
|
|
template <typename T>
|
|
inline void Transpose2D(const RuntimeShape& input_shape, const T* input_data,
|
|
const RuntimeShape& output_shape, T* output_data) {
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 2);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
|
|
|
|
const int d0 = input_shape.DimsData()[0];
|
|
const int d1 = input_shape.DimsData()[1];
|
|
const int kLines = 4;
|
|
const int kSkipSize = (kLines - 1) * d1;
|
|
|
|
const T* input = input_data;
|
|
|
|
int i = 0;
|
|
for (; i <= d0 - kLines; i += kLines) {
|
|
T* output = output_data + i;
|
|
|
|
const T* input_ptr = input;
|
|
optimized_ops_preload_l1_keep(input_ptr);
|
|
input_ptr += d1;
|
|
optimized_ops_preload_l1_keep(input_ptr);
|
|
input_ptr += d1;
|
|
optimized_ops_preload_l1_keep(input_ptr);
|
|
input_ptr += d1;
|
|
optimized_ops_preload_l1_keep(input_ptr);
|
|
|
|
int j = 0;
|
|
for (; j <= d1 - kLines; j += kLines) {
|
|
input_ptr = input;
|
|
const T a00 = input_ptr[0];
|
|
const T a01 = input_ptr[1];
|
|
const T a02 = input_ptr[2];
|
|
const T a03 = input_ptr[3];
|
|
input_ptr += d1;
|
|
const T a10 = input_ptr[0];
|
|
const T a11 = input_ptr[1];
|
|
const T a12 = input_ptr[2];
|
|
const T a13 = input_ptr[3];
|
|
input_ptr += d1;
|
|
const T a20 = input_ptr[0];
|
|
const T a21 = input_ptr[1];
|
|
const T a22 = input_ptr[2];
|
|
const T a23 = input_ptr[3];
|
|
input_ptr += d1;
|
|
const T a30 = input_ptr[0];
|
|
const T a31 = input_ptr[1];
|
|
const T a32 = input_ptr[2];
|
|
const T a33 = input_ptr[3];
|
|
|
|
output[0] = a00;
|
|
output[1] = a10;
|
|
output[2] = a20;
|
|
output[3] = a30;
|
|
output += d0;
|
|
|
|
output[0] = a01;
|
|
output[1] = a11;
|
|
output[2] = a21;
|
|
output[3] = a31;
|
|
output += d0;
|
|
|
|
output[0] = a02;
|
|
output[1] = a12;
|
|
output[2] = a22;
|
|
output[3] = a32;
|
|
output += d0;
|
|
|
|
output[0] = a03;
|
|
output[1] = a13;
|
|
output[2] = a23;
|
|
output[3] = a33;
|
|
output += d0;
|
|
|
|
input += kLines;
|
|
}
|
|
if (j == d1) {
|
|
input += kSkipSize;
|
|
} else {
|
|
for (int p = 0; p < kLines; ++p) {
|
|
for (int q = 0; q < d1 - j; ++q) {
|
|
*(output + q * d0 + p) = *(input + p * d1 + q);
|
|
}
|
|
}
|
|
input += (d1 - j) + kSkipSize;
|
|
}
|
|
}
|
|
for (; i < d0; ++i) {
|
|
T* output = output_data + i;
|
|
for (int j = 0; j < d1; ++j) {
|
|
*output = *input;
|
|
output += d0;
|
|
++input;
|
|
}
|
|
}
|
|
}
|
|
|
|
template <>
|
|
inline void Transpose2D(const RuntimeShape& input_shape,
|
|
const int32_t* input_data,
|
|
const RuntimeShape& output_shape,
|
|
int32_t* output_data) {
|
|
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 2);
|
|
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
|
|
|
|
const int d0 = input_shape.DimsData()[0];
|
|
const int d1 = input_shape.DimsData()[1];
|
|
#ifdef USE_NEON
|
|
const int kLines = 4;
|
|
const int kSkipSize = (kLines - 1) * d1;
|
|
#endif
|
|
|
|
const int32_t* input = input_data;
|
|
|
|
int i = 0;
|
|
#ifdef USE_NEON
|
|
for (; i <= d0 - kLines; i += kLines) {
|
|
int32_t* output = output_data + i;
|
|
|
|
const int32_t* input_ptr = input;
|
|
optimized_ops_preload_l1_keep(input_ptr);
|
|
input_ptr += d1;
|
|
optimized_ops_preload_l1_keep(input_ptr);
|
|
input_ptr += d1;
|
|
optimized_ops_preload_l1_keep(input_ptr);
|
|
input_ptr += d1;
|
|
optimized_ops_preload_l1_keep(input_ptr);
|
|
|
|
int j = 0;
|
|
for (; j <= d1 - kLines; j += kLines) {
|
|
input_ptr = input;
|
|
int32x4_t a0 = vld1q_s32(input);
|
|
input_ptr += d1;
|
|
int32x4_t a1 = vld1q_s32(input_ptr);
|
|
input_ptr += d1;
|
|
int32x4_t a2 = vld1q_s32(input_ptr);
|
|
input_ptr += d1;
|
|
int32x4_t a3 = vld1q_s32(input_ptr);
|
|
|
|
int32x4x2_t tmp1 = vuzpq_s32(a0, a2);
|
|
int32x4x2_t tmp2 = vuzpq_s32(a1, a3);
|
|
int32x4x2_t tmp3 = vtrnq_s32(tmp1.val[0], tmp2.val[0]);
|
|
int32x4x2_t tmp4 = vtrnq_s32(tmp1.val[1], tmp2.val[1]);
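// After the uzp/trn shuffles, the four vectors hold the columns of the loaded
// 4x4 block: tmp3.val[0] = {a0[0], a1[0], a2[0], a3[0]}, tmp4.val[0] = lane 1
// of each row, tmp3.val[1] = lane 2, and tmp4.val[1] = lane 3. They are
// stored below as consecutive rows of the output.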
|
|
|
|
vst1q_s32(output, tmp3.val[0]);
|
|
output += d0;
|
|
vst1q_s32(output, tmp4.val[0]);
|
|
output += d0;
|
|
vst1q_s32(output, tmp3.val[1]);
|
|
output += d0;
|
|
vst1q_s32(output, tmp4.val[1]);
|
|
output += d0;
|
|
input += kLines;
|
|
}
|
|
if (j == d1) {
|
|
input += kSkipSize;
|
|
} else {
|
|
for (int p = 0; p < kLines; ++p) {
|
|
for (int q = 0; q < d1 - j; ++q) {
|
|
*(output + q * d0 + p) = *(input + p * d1 + q);
|
|
}
|
|
}
|
|
input += (d1 - j) + kSkipSize;
|
|
}
|
|
}
|
|
#endif
|
|
for (; i < d0; ++i) {
|
|
int32_t* output = output_data + i;
|
|
for (int j = 0; j < d1; ++j) {
|
|
*output = *input;
|
|
output += d0;
|
|
++input;
|
|
}
|
|
}
|
|
}
|
|
|
|
// TODO(alanchiao): see if we can reduce the number
|
|
// of lines of code in branching without affecting latency.
|
|
template <typename T>
|
|
inline void Transpose3D(const TransposeParams& params,
|
|
const RuntimeShape& input_shape, const T* input_data,
|
|
const RuntimeShape& output_shape, T* output_data) {
|
|
int s1, s2, s3;
|
|
s1 = input_shape.Dims(0);
|
|
s2 = input_shape.Dims(1);
|
|
s3 = input_shape.Dims(2);
|
|
|
|
int p1, p2, p3;
|
|
if (params.perm[0] == 2) {
|
|
p1 = 1;
|
|
} else if (params.perm[1] == 2) {
|
|
p2 = 1;
|
|
} else {
|
|
p3 = 1;
|
|
}
|
|
|
|
if (params.perm[0] == 1) {
|
|
p1 = s3;
|
|
} else if (params.perm[1] == 1) {
|
|
p2 = s3;
|
|
} else {
|
|
p3 = s3;
|
|
}
|
|
|
|
if (params.perm[0] == 0) {
|
|
p1 = s2 * s3;
|
|
} else if (params.perm[1] == 0) {
|
|
p2 = s2 * s3;
|
|
} else {
|
|
p3 = s2 * s3;
|
|
}
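// At this point p1, p2 and p3 are the input strides of the dimensions selected
// by params.perm[0], params.perm[1] and params.perm[2]. For example
// (illustrative values), an input of shape {2, 3, 4} with perm = {2, 0, 1}
// gives p1 = 1, p2 = 12, p3 = 4, so the flat input index below is
// i1 + 12 * i2 + 4 * i3.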
|
|
|
|
int o_s[3];
|
|
o_s[0] = input_shape.Dims(params.perm[0]);
|
|
o_s[1] = input_shape.Dims(params.perm[1]);
|
|
o_s[2] = input_shape.Dims(params.perm[2]);
|
|
|
|
for (int i1 = 0; i1 < o_s[0]; ++i1) {
|
|
for (int i2 = 0; i2 < o_s[1]; ++i2) {
|
|
for (int i3 = 0; i3 < o_s[2]; ++i3) {
|
|
const int i = i1 * p1 + i2 * p2 + i3 * p3;
|
|
const int o = i1 * o_s[1] * o_s[2] + i2 * o_s[2] + i3;
|
|
output_data[o] = input_data[i];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename T, int N>
|
|
void TransposeImpl(const TransposeParams& params,
|
|
const RuntimeShape& input_shape, const T* input_data,
|
|
const RuntimeShape& output_shape, T* output_data) {
|
|
const int dims_cnt = input_shape.DimensionsCount();
|
|
|
|
int dim0, dim1;
|
|
if (transpose_utils::IsTranspose2DApplicable(params, input_shape, &dim0,
|
|
&dim1)) {
|
|
Transpose2D(RuntimeShape({dim0, dim1}), input_data,
|
|
RuntimeShape({dim1, dim0}), output_data);
|
|
return;
|
|
}
|
|
|
|
// TODO(b/141217325): notably, Eigen is better suited for larger inputs
// whereas Transpose3D is generally better for smaller ones.
//
// E.g. on Nexus 5, Eigen is better for size 96^3 and up and Transpose3D is
// better for 72^3 and down.
//
// 96^3 is not mobile-friendly for certain use cases (e.g. a model used in
// beam search for seq2seq) but is in others. Consider the tradeoffs.
|
|
if (dims_cnt == 3) {
|
|
Transpose3D(params, input_shape, input_data, output_shape, output_data);
|
|
return;
|
|
}
|
|
|
|
// Reroute to the reference version if an optimized method for the given data
|
|
// is not available.
|
|
reference_ops::Transpose<T, N>(params, input_shape, input_data, output_shape,
|
|
output_data);
|
|
}
|
|
|
|
template <typename T, int N = 5>
|
|
void Transpose(const TransposeParams& unshrinked_params,
|
|
const RuntimeShape& unshrinked_input_shape, const T* input_data,
|
|
const RuntimeShape& unshrinked_output_shape, T* output_data) {
|
|
ruy::profiler::ScopeLabel label("Transpose");
|
|
|
|
const int output_size = unshrinked_output_shape.DimensionsCount();
|
|
TFLITE_DCHECK_LE(unshrinked_input_shape.DimensionsCount(), N);
|
|
TFLITE_DCHECK_LE(output_size, N);
|
|
TFLITE_DCHECK_EQ(output_size, unshrinked_params.perm_count);
|
|
|
|
RuntimeShape shrinked_input_shape = RuntimeShape(unshrinked_input_shape);
|
|
RuntimeShape shrinked_output_shape = RuntimeShape(unshrinked_output_shape);
|
|
TransposeParams shrinked_params = unshrinked_params;
|
|
|
|
// Remove any dimensions of size one. A lower-rank transpose usually performs
// better since the memory access patterns improve.
|
|
transpose_utils::RemoveOneSizeDimensions(
|
|
&shrinked_input_shape, &shrinked_output_shape, &shrinked_params);
|
|
|
|
// Handle identity cases.
|
|
// TODO(b/140779653): Add an optimization pass in the conversion process to
|
|
// remove transpose op nodes where they do nothing like the below one.
|
|
bool identical = true;
|
|
for (int i = 0; i < shrinked_params.perm_count; ++i) {
|
|
if (shrinked_params.perm[i] != i) {
|
|
identical = false;
|
|
break;
|
|
}
|
|
}
|
|
if (identical) {
|
|
memcpy(output_data, input_data,
|
|
unshrinked_input_shape.FlatSize() * sizeof(T));
|
|
return;
|
|
}
|
|
|
|
// Reduce dimensions by flattening.
|
|
if (shrinked_params.perm[0] == 0 && output_size >= 3) {
|
|
RuntimeShape non_flatten_input_shape;
|
|
RuntimeShape non_flatten_output_shape;
|
|
TransposeParams non_flatten_params;
|
|
const int total_size = shrinked_input_shape.FlatSize();
|
|
const int non_flatten_size = transpose_utils::Flatten(
|
|
shrinked_input_shape, shrinked_output_shape, shrinked_params,
|
|
&non_flatten_input_shape, &non_flatten_output_shape,
|
|
&non_flatten_params);
|
|
TFLITE_DCHECK_NE(non_flatten_params.perm[0], 0);
|
|
|
|
for (int i = 0; i < total_size; i += non_flatten_size) {
|
|
TransposeImpl<T, N>(non_flatten_params, non_flatten_input_shape,
|
|
input_data + i, non_flatten_output_shape,
|
|
output_data + i);
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Call non-flattened case.
|
|
TransposeImpl<T, N>(shrinked_params, shrinked_input_shape, input_data,
|
|
shrinked_output_shape, output_data);
|
|
}
|
|
|
|
// Assume input1 & input2 have the same scale & zero point.
|
|
inline void MaximumElementwise(int size, const ArithmeticParams& params,
|
|
const int8* input1_data, const int8* input2_data,
|
|
int8* output_data) {
|
|
ruy::profiler::ScopeLabel label("MaximumElementwiseInt8/8bit");
|
|
int i = 0;
|
|
#ifdef USE_NEON
|
|
for (; i <= size - 16; i += 16) {
|
|
const int8x16_t input1_val_original = vld1q_s8(input1_data + i);
|
|
const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
|
|
const int8x16_t max_data =
|
|
vmaxq_s8(input1_val_original, input2_val_original);
|
|
vst1q_s8(output_data + i, max_data);
|
|
}
|
|
#endif // USE_NEON
|
|
for (; i < size; ++i) {
|
|
const int8 input1_val = input1_data[i];
|
|
const int8 input2_val = input2_data[i];
|
|
output_data[i] = std::max(input1_val, input2_val);
|
|
}
|
|
}

inline void MaximumScalarBroadcast(int size, const ArithmeticParams& params,
                                   int8 input1_data, const int8* input2_data,
                                   int8* output_data) {
  ruy::profiler::ScopeLabel label("MaximumScalarBroadcastInt8/8bit");
  int i = 0;

#ifdef USE_NEON
  const int8x16_t input1_val_original = vdupq_n_s8(input1_data);
  for (; i <= size - 16; i += 16) {
    const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
    const int8x16_t max_data =
        vmaxq_s8(input1_val_original, input2_val_original);
    vst1q_s8(output_data + i, max_data);
  }
#endif  // USE_NEON
  for (; i < size; ++i) {
    const int8 input2_val = input2_data[i];
    output_data[i] = std::max(input1_data, input2_val);
  }
}
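
// Note on the *ScalarBroadcast variants: the first operand is a single int8
// value rather than an array. Under USE_NEON it is splatted once into a
// vector register with vdupq_n_s8 and reused for every 16-lane iteration;
// the scalar tail then compares it against the remaining elements.
// BinaryBroadcastFiveFold (used by the dispatch helpers below) pairs each
// elementwise kernel with its matching scalar-broadcast kernel.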

// Assume input1 & input2 have the same scale & zero point.
inline void MinimumElementwise(int size, const ArithmeticParams& params,
                               const int8* input1_data, const int8* input2_data,
                               int8* output_data) {
  ruy::profiler::ScopeLabel label("MinimumElementwiseInt8/8bit");
  int i = 0;
#ifdef USE_NEON
  for (; i <= size - 16; i += 16) {
    const int8x16_t input1_val_original = vld1q_s8(input1_data + i);
    const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
    const int8x16_t min_data =
        vminq_s8(input1_val_original, input2_val_original);
    vst1q_s8(output_data + i, min_data);
  }
#endif  // USE_NEON
  for (; i < size; ++i) {
    const int8 input1_val = input1_data[i];
    const int8 input2_val = input2_data[i];
    output_data[i] = std::min(input1_val, input2_val);
  }
}

inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params,
                                   int8 input1_data, const int8* input2_data,
                                   int8* output_data) {
  ruy::profiler::ScopeLabel label("MinimumScalarBroadcastInt8/8bit");
  int i = 0;

#ifdef USE_NEON
  const int8x16_t input1_val_original = vdupq_n_s8(input1_data);
  for (; i <= size - 16; i += 16) {
    const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
    const int8x16_t min_data =
        vminq_s8(input1_val_original, input2_val_original);
    vst1q_s8(output_data + i, min_data);
  }
#endif  // USE_NEON
  for (; i < size; ++i) {
    const int8 input2_val = input2_data[i];
    output_data[i] = std::min(input1_data, input2_val);
  }
}

template <typename Op>
inline void BroadcastMaximumDispatch(const ArithmeticParams& params,
                                     const RuntimeShape& input1_shape,
                                     const int8* input1_data,
                                     const RuntimeShape& input2_shape,
                                     const int8* input2_data,
                                     const RuntimeShape& output_shape,
                                     int8* output_data, Op op) {
  if (params.broadcast_category ==
      BroadcastableOpCategory::kGenericBroadcast) {
    return reference_ops::MaximumMinimumBroadcastSlow(
        input1_shape, input1_data, input2_shape, input2_data, output_shape,
        output_data, op);
  }

  BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape,
                          input2_data, output_shape, output_data,
                          MaximumElementwise, MaximumScalarBroadcast);
}
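
// Dispatch sketch (illustrative only; `max_op` below is a hypothetical
// callable, not a symbol defined in this header): for the generic broadcast
// category the call falls back to the reference slow path, where `op`
// supplies the per-element operation; otherwise the five-fold fast path runs
// the int8 kernels above and `op` is not used.
//
//   auto max_op = [](int8 a, int8 b) { return std::max(a, b); };
//   BroadcastMaximumDispatch(params, input1_shape, input1_data, input2_shape,
//                            input2_data, output_shape, output_data, max_op);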

template <typename Op>
inline void BroadcastMinimumDispatch(const ArithmeticParams& params,
                                     const RuntimeShape& input1_shape,
                                     const int8* input1_data,
                                     const RuntimeShape& input2_shape,
                                     const int8* input2_data,
                                     const RuntimeShape& output_shape,
                                     int8* output_data, Op op) {
  if (params.broadcast_category ==
      BroadcastableOpCategory::kGenericBroadcast) {
    return reference_ops::MaximumMinimumBroadcastSlow(
        input1_shape, input1_data, input2_shape, input2_data, output_shape,
        output_data, op);
  }

  BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape,
                          input2_data, output_shape, output_data,
                          MinimumElementwise, MinimumScalarBroadcast);
}

}  // namespace optimized_ops
}  // namespace tflite

#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
#pragma GCC diagnostic pop
#endif

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_