minor spelling tweaks
parent 56944a8148
commit caa68bf2d0
@@ -110,7 +110,7 @@ TEST_F(ReadAccelerationConfigTest, IgnoresCommentedLines) {
 EXPECT_TRUE(blacklist_.empty());
 }

-TEST_F(ReadAccelerationConfigTest, CommentCanHaveTralingBlanks) {
+TEST_F(ReadAccelerationConfigTest, CommentCanHaveTrailingBlanks) {
 ReadAccelerationConfig(" #key,value", consumer_);

 EXPECT_TRUE(whitelist_.empty());
@@ -809,7 +809,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
 params.input_range_radius = data->input_range_radius;
 params.input_multiplier = data->input_multiplier;
 params.input_left_shift = data->input_left_shift;
-optimized_ops::Tanh16bitPercision(
+optimized_ops::Tanh16bitPrecision(
 params, GetTensorShape(input), GetTensorData<uint8_t>(input),
 GetTensorShape(output), GetTensorData<uint8_t>(output));
 } else {
@@ -824,7 +824,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
 params.input_range_radius = data->input_range_radius;
 params.input_multiplier = data->input_multiplier;
 params.input_left_shift = data->input_left_shift;
-optimized_ops::Tanh16bitPercision(
+optimized_ops::Tanh16bitPrecision(
 params, GetTensorShape(input), GetTensorData<int8_t>(input),
 GetTensorShape(output), GetTensorData<int8_t>(output));
 } else {
@@ -881,7 +881,7 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
 params.input_range_radius = data->input_range_radius;
 params.input_multiplier = data->input_multiplier;
 params.input_left_shift = data->input_left_shift;
-optimized_ops::Logistic16bitPercision(
+optimized_ops::Logistic16bitPrecision(
 params, GetTensorShape(input), GetTensorData<uint8_t>(input),
 GetTensorShape(output), GetTensorData<uint8_t>(output));
 } else {
@@ -896,7 +896,7 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
 params.input_range_radius = data->input_range_radius;
 params.input_multiplier = data->input_multiplier;
 params.input_left_shift = data->input_left_shift;
-optimized_ops::Logistic16bitPercision(
+optimized_ops::Logistic16bitPrecision(
 params, GetTensorShape(input), GetTensorData<int8_t>(input),
 GetTensorShape(output), GetTensorData<int8_t>(output));
 } else {
@@ -2766,11 +2766,11 @@ TEST_P(LSTMOpTest, BlackBoxTestWithAuxInputZeroAuxWeight) {
 // Aux input and input are the same, so we should observe the same outputs
 // as there's no aux input.
 lstm.SetAuxInput(0, batch0_start, batch0_end);
-std::vector<float> dummpy_weights(n_cell * n_input, 0.0f);
-lstm.SetAuxInputToInputWeights(dummpy_weights);
-lstm.SetAuxInputToForgetWeights(dummpy_weights);
-lstm.SetAuxInputToCellWeights(dummpy_weights);
-lstm.SetAuxInputToOutputWeights(dummpy_weights);
+std::vector<float> dummy_weights(n_cell * n_input, 0.0f);
+lstm.SetAuxInputToInputWeights(dummy_weights);
+lstm.SetAuxInputToForgetWeights(dummy_weights);
+lstm.SetAuxInputToCellWeights(dummy_weights);
+lstm.SetAuxInputToOutputWeights(dummy_weights);

 lstm.Invoke();

@@ -1346,7 +1346,7 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestCrossLinkingAuxInputOnlyTimeMajor) {
 }

 // Same as BlackBox test, but the input tensor and weights tensor are split
-// along the last dimension and passed to both regular and auxiliry inputs and
+// along the last dimension and passed to both regular and auxiliary inputs and
 // weights. The output in this case is the same. To understand this, let's
 // define W and V as regular input weights matrix and auxiliary input weights
 // matrix correspondingly. It's easy to see that this is equivalent to a regular
@@ -55,7 +55,7 @@ class CpuBackendContext final : public TfLiteInternalBackendContext {
 const std::unique_ptr<ruy::Context> ruy_context_;
 const std::unique_ptr<gemmlowp::GemmContext> gemmlowp_context_;

-// The maxinum of threads used for parallelizing TfLite ops. However,
+// The maximum of threads used for parallelizing TfLite ops. However,
 // cpu_backend_threadpool::Execute creates as many threads as it's
 // asked to, regardless of this. Typically a call site would query
 // cpu_backend_context->max_num_threads() and used that to determine
@@ -593,10 +593,10 @@ struct CustomGemvImpl<LhsScalar, RhsScalar, std::int32_t, DstScalar,

 // We want to use fused multiply-add when it's available (that is, on A64
 // unconditionally and on A32 with VFPv4) because it's often faster, and
-// because non-fused seems not to be available in A64 so a conscentious compiler
-// might emit slow code (separate mul and add instructions) in order to
+// because non-fused seems not to be available in A64 so a conscientious
+// compiler might emit slow code (separate mul and add instructions) in order to
 // implement the vmlaq_f32 intrinsic with strict bit-for-bit exactness on A64.
-// (Compilers seems to be generating a fused fmla instruction at the moment,
+// (Compilers seem to be generating a fused fmla instruction at the moment,
 // but that could change).
 //
 // We still want to support building for A32 without VFPv4.
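The comment in the hunk above contrasts fused and non-fused multiply-add. As a point of reference only (not part of this commit), here is a minimal sketch of the two forms, assuming an ARM toolchain where arm_neon.h and the vmlaq_f32 / vmulq_f32 / vaddq_f32 intrinsics are available:

#include <arm_neon.h>

// Fused form: vmlaq_f32(acc, a, b) computes acc + a * b; on A64 compilers
// typically emit a single fmla instruction, which rounds once.
inline float32x4_t MulAddFused(float32x4_t acc, float32x4_t a, float32x4_t b) {
  return vmlaq_f32(acc, a, b);
}

// Non-fused form: separate multiply and add, two instructions and two
// roundings - the slower code the comment warns a strict compiler might emit.
inline float32x4_t MulAddSeparate(float32x4_t acc, float32x4_t a, float32x4_t b) {
  return vaddq_f32(acc, vmulq_f32(a, b));
}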
@@ -19,7 +19,7 @@ limitations under the License.

 // See b/131835803: in TFLite code, because eigen_spatial_convolutions.h does
 // #define Eigen EigenForTFLite, it is difficult to have any #include of Eigen
-// headers in a header file, as that results in name clases (compilation
+// headers in a header file, as that results in name classes (compilation
 // errors) depending on the order in which these headers are #included.
 // So we have moved the #include of Eigen here, in a .cc file, where we have
 // control over the header #include sequence.
@@ -737,7 +737,7 @@ TEST(DetectionPostprocessOpTest,
 ElementsAreArray(ArrayFloatNear({3.0}, 1e-1)));
 }

-TEST(DetectionPostprocessOpTest, FloatTestwithNoBackgroudClassAndKeypoints) {
+TEST(DetectionPostprocessOpTest, FloatTestwithNoBackgroundClassAndKeypoints) {
 DetectionPostprocessOpModelwithRegularNMS m(
 {TensorType_FLOAT32, {1, 6, 5}}, {TensorType_FLOAT32, {1, 6, 2}},
 {TensorType_FLOAT32, {6, 4}}, {TensorType_FLOAT32, {}},
@@ -251,7 +251,7 @@ TfLiteStatus PrepareImpl(TfLiteContext* context, TfLiteNode* node) {
 TfLiteIntArray* output_size_array = nullptr;
 if (params->keep_num_dims) {
 // When number of dimensions are kept the filter operates along the last
-// dimenions. In other words, for an input tensor with shape
+// dimentions. In other words, for an input tensor with shape
 // [batch_size, ..., n_inputs] and a filter of shape [n_inputs, n_units]
 // this Op produces an output of shape [batch_size, ..., n_units].
 TF_LITE_ENSURE_EQ(context, input->dims->data[input->dims->size - 1],
@@ -790,7 +790,7 @@ TEST_P(QuantizedFullyConnectedOpTest,
 SimpleTestQuantizedInt16OutputShuffled4x16Int8Weights) {
 // The shuffled weights block shape is 4x16. The shape of the weights matrix
 // is: rows = output_depth, cols = input_depth. It must be a multiple of 4x16.
-// This means that output_depth must be a multiple of 4, and input_deth must
+// This means that output_depth must be a multiple of 4, and input_depth must
 // be a multiple of 16.
 for (int input_depth_numblocks : {1, 3}) {
 for (int output_depth_numblocks : {1, 3}) {
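The 4x16 block constraint described in that comment reduces to a simple divisibility check. A minimal sketch under the assumption that the weights matrix is laid out as rows = output_depth, cols = input_depth (the helper name is illustrative, not from the test):

// The shuffled-weights path tiles the weights matrix into 4x16 blocks, so the
// row count (output_depth) must divide by 4 and the column count (input_depth)
// must divide by 16. The test above loops over {1, 3} block counts, presumably
// scaling the two depths by 4 and 16 respectively so the check always holds.
inline bool IsShufflable4x16(int output_depth, int input_depth) {
  return (output_depth % 4 == 0) && (input_depth % 16 == 0);
}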
@@ -290,7 +290,7 @@ void TryTestOneDepthwiseConv3x3Filter() {
 // It's hard to come up with a right multiplier, random guess basically makes
 // all the results saturated and becomes meaningfulless, so we first use
 // reference impl to poke the min/max value of the accumulation, then use that
-// value as a guided suggestion for us to populate meaningful mulitplier &
+// value as a guided suggestion for us to populate meaningful multiplier &
 // shift.
 PickReasonableMultiplier(
 params, output_activation_min, output_activation_max, output_depth,
@@ -305,7 +305,7 @@ void TryTestOneDepthwiseConv3x3Filter() {
 dilation_width_factor, dilation_height_factor, pad_width, pad_height,
 depth_multiplier, output_shape_inference, 0, output_shift.data()));

-// The following tests compare referene impl and Neon general impl agrees,
+// The following tests compare reference impl and Neon general impl agrees,
 // and reference impl loosely agrees with fast kernel since they use different
 // rounding strategy.
 reference_integer_ops::DepthwiseConvPerChannel(
@@ -787,37 +787,37 @@ void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
 // For the current (filter_x, filter_y) point in the filter,
 // compute the boundaries of the corresponding output row segment.
-int out_x_loop_start_unclampled = 0;
-int out_x_loop_end_unclampled = 0;
+int out_x_loop_start_unclamped = 0;
+int out_x_loop_end_unclamped = 0;
 if (kAllowStrided) {
 if (stride == 2) {
-out_x_loop_start_unclampled =
+out_x_loop_start_unclamped =
 (pad_width - dilation_factor * filter_x + 1) / 2;
-out_x_loop_end_unclampled =
+out_x_loop_end_unclamped =
 (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
 } else if (stride == 4) {
-out_x_loop_start_unclampled =
+out_x_loop_start_unclamped =
 (pad_width - dilation_factor * filter_x + 3) / 4;
-out_x_loop_end_unclampled =
+out_x_loop_end_unclamped =
 (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
 } else {
-out_x_loop_start_unclampled =
+out_x_loop_start_unclamped =
 (pad_width - dilation_factor * filter_x + stride - 1) / stride;
-out_x_loop_end_unclampled = (pad_width + input_width -
-dilation_factor * filter_x + stride - 1) /
-stride;
+out_x_loop_end_unclamped = (pad_width + input_width -
+dilation_factor * filter_x + stride - 1) /
+stride;
 }
 } else {
-out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
-out_x_loop_end_unclampled =
+out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
+out_x_loop_end_unclamped =
 pad_width + input_width - dilation_factor * filter_x;
 }
 // The kernel will have to iterate on the segment of the
 // output row that starts at out_x_loop_start and out_x_loop_end.
 const int out_x_loop_start =
-std::max(out_x_buffer_start, out_x_loop_start_unclampled);
+std::max(out_x_buffer_start, out_x_loop_start_unclamped);
 const int out_x_loop_end =
-std::min(out_x_buffer_end, out_x_loop_end_unclampled);
+std::min(out_x_buffer_end, out_x_loop_end_unclamped);

 float* acc_buffer_ptr =
 acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
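The start/end expressions in that hunk follow the integer ceiling-division pattern ceil(x / stride) == (x + stride - 1) / stride for non-negative x, specialized to stride 2 and 4 in the first two branches; the result is then clamped against the output buffer bounds with std::max/std::min, which is why plain integer division suffices there. A small standalone check of the identity (illustrative only, not part of the kernel):

#include <cassert>
#include <cmath>

// (x + stride - 1) / stride equals ceil(x / stride) for x >= 0, stride > 0.
int CeilDiv(int x, int stride) { return (x + stride - 1) / stride; }

int main() {
  for (int x = 0; x <= 64; ++x) {
    for (int stride : {1, 2, 3, 4}) {
      assert(CeilDiv(x, stride) ==
             static_cast<int>(std::ceil(static_cast<double>(x) / stride)));
    }
  }
  return 0;
}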
@@ -1496,37 +1496,37 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor,
 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
 // For the current (filter_x, filter_y) point in the filter,
 // compute the boundaries of the corresponding output row segment.
-int out_x_loop_start_unclampled = 0;
-int out_x_loop_end_unclampled = 0;
+int out_x_loop_start_unclamped = 0;
+int out_x_loop_end_unclamped = 0;
 if (kAllowStrided) {
 if (stride == 2) {
-out_x_loop_start_unclampled =
+out_x_loop_start_unclamped =
 (pad_width - dilation_factor * filter_x + 1) / 2;
-out_x_loop_end_unclampled =
+out_x_loop_end_unclamped =
 (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
 } else if (stride == 4) {
-out_x_loop_start_unclampled =
+out_x_loop_start_unclamped =
 (pad_width - dilation_factor * filter_x + 3) / 4;
-out_x_loop_end_unclampled =
+out_x_loop_end_unclamped =
 (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
 } else {
-out_x_loop_start_unclampled =
+out_x_loop_start_unclamped =
 (pad_width - dilation_factor * filter_x + stride - 1) / stride;
-out_x_loop_end_unclampled = (pad_width + input_width -
-dilation_factor * filter_x + stride - 1) /
-stride;
+out_x_loop_end_unclamped = (pad_width + input_width -
+dilation_factor * filter_x + stride - 1) /
+stride;
 }
 } else {
-out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
-out_x_loop_end_unclampled =
+out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
+out_x_loop_end_unclamped =
 pad_width + input_width - dilation_factor * filter_x;
 }
 // The kernel will have to iterate on the segment of the
 // output row that starts at out_x_loop_start and out_x_loop_end.
 const int out_x_loop_start =
-std::max(out_x_buffer_start, out_x_loop_start_unclampled);
+std::max(out_x_buffer_start, out_x_loop_start_unclamped);
 const int out_x_loop_end =
-std::min(out_x_buffer_end, out_x_loop_end_unclampled);
+std::min(out_x_buffer_end, out_x_loop_end_unclamped);

 int32* acc_buffer_ptr =
 acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
@@ -13128,7 +13128,7 @@ inline void DepthwiseConvDotProduct3x3Impl(
 // "next" data, of at least 16 bytes, even when at the end of the workspace.
 // It is relatively expensive to detect the end micro block. It is also very
 // difficult to test for (to trigger) erroneous reads (past end of array) in
-// the depth multplication case.
+// the depth multiplication case.
 int workspace_width_micro_repeats =
 (has_depth_multiplication
 ? kDepthwiseConvScratchWorkspaceSize - kWorkspaceExtension
@@ -1441,37 +1441,37 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor,
 for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
 // For the current (filter_x, filter_y) point in the filter,
 // compute the boundaries of the corresponding output row segment.
-int out_x_loop_start_unclampled = 0;
-int out_x_loop_end_unclampled = 0;
+int out_x_loop_start_unclamped = 0;
+int out_x_loop_end_unclamped = 0;
 if (kAllowStrided) {
 if (stride == 2) {
-out_x_loop_start_unclampled =
+out_x_loop_start_unclamped =
 (pad_width - dilation_factor * filter_x + 1) / 2;
-out_x_loop_end_unclampled =
+out_x_loop_end_unclamped =
 (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
 } else if (stride == 4) {
-out_x_loop_start_unclampled =
+out_x_loop_start_unclamped =
 (pad_width - dilation_factor * filter_x + 3) / 4;
-out_x_loop_end_unclampled =
+out_x_loop_end_unclamped =
 (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
 } else {
-out_x_loop_start_unclampled =
+out_x_loop_start_unclamped =
 (pad_width - dilation_factor * filter_x + stride - 1) / stride;
-out_x_loop_end_unclampled = (pad_width + input_width -
-dilation_factor * filter_x + stride - 1) /
-stride;
+out_x_loop_end_unclamped = (pad_width + input_width -
+dilation_factor * filter_x + stride - 1) /
+stride;
 }
 } else {
-out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
-out_x_loop_end_unclampled =
+out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
+out_x_loop_end_unclamped =
 pad_width + input_width - dilation_factor * filter_x;
 }
 // The kernel will have to iterate on the segment of the
 // output row that starts at out_x_loop_start and out_x_loop_end.
 const int out_x_loop_start =
-std::max(out_x_buffer_start, out_x_loop_start_unclampled);
+std::max(out_x_buffer_start, out_x_loop_start_unclamped);
 const int out_x_loop_end =
-std::min(out_x_buffer_end, out_x_loop_end_unclampled);
+std::min(out_x_buffer_end, out_x_loop_end_unclamped);

 int32* acc_buffer_ptr =
 acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
@@ -179,10 +179,10 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
 // the first 4 values of the output_multiplier_ptr (we have 8 in total);
 // v30 (which held duplicated output right shift previously) will hold
 // the first 4 values of the output_shift_ptr (we have 8 in total);
-// lastly, v28 will hold the last 4 values of output_mulitplier and v31
+// lastly, v28 will hold the last 4 values of output_multiplier and v31
 // (previously occupied by activations) will hold the last 4 values of
 // output_shift. Then v25 will be used for output activation min while
-// output activation max will just reuse oother registers, like v24.
+// output activation max will just reuse other registers, like v24.
 //
 // Set "constant" registers. These registers may be replaced with temp
 // values from time to time when there are not enough NEON registers.
@@ -1024,7 +1024,7 @@ struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 2,
 // part.
 // The register planning here is really tricky:
 // v0-v29 are all used at least once for either filter/input/output,
-// some of them are used for output shift and output mulitplier, or
+// some of them are used for output shift and output multiplier, or
 // input/output offset.
 // Only v30 & v31 are only used for output activation min/max.
 // For per-channel case, we need 4 registers to hold output shift &
@@ -222,7 +222,7 @@ inline void Mean(const tflite::MeanParams& op_params,
 MeanImpl(op_params, input_shape, input_data, multiplier, shift, bias,
 output_shape, output_data, 0, output_depth);
 } else {
-// Instead parrallel for batch, we loop for the output_depth since batch
+// Instead parallel for batch, we loop for the output_depth since batch
 // is typical 1.
 std::vector<MeanWorkerTask> tasks;
 // TODO(b/131746020) don't create new heap allocations every time.
@@ -2339,7 +2339,7 @@ void NeonSymmetricQuantizeFloats(const float* values, const int size,
 const int32x4_t f2i0_i32x4 = RoundToNearest(mul0_f32x4);
 const int32x4_t f2i1_i32x4 = RoundToNearest(mul1_f32x4);

-// Implements the vectorized version of the folowing block:
+// Implements the vectorized version of the following block:
 // quantized_values[i] = std::min(kScale, std::max(-kScale,
 // quantized_value));
 int32x4_t max0_i32x4 = vmaxq_s32(f2i0_i32x4, neg_scale_i32x4);
@@ -1123,7 +1123,7 @@ inline void Mean(const tflite::MeanParams& op_params,
 MeanImpl(op_params, input_shape, input_data, multiplier, shift, bias,
 output_shape, output_data, 0, output_depth);
 } else {
-// Instead parrallel for batch, we loop for the output_depth since batch
+// Instead parallel for batch, we loop for the output_depth since batch
 // is typical 1.
 std::vector<MeanWorkerTask> tasks;
 // TODO(b/131746020) don't create new heap allocations every time.
@@ -5714,7 +5714,7 @@ inline void Quantize(const int32_t* multiplier, const int32_t* shift,
 // ....
 //
 // In order to minimize the reload of the multipliers & shifts, once we load
-// the multipliers & shifts, we load & quantize the raw accumualtrs for every
+// the multipliers & shifts, we load & quantize the raw accumulators for every
 // row.
 #ifdef USE_NEON
 const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
@@ -6369,7 +6369,7 @@ inline void HardSwish(const HardSwishParams& params,
 // Unfortunately, the Intel arm_neon_sse.h implementation of vqshl* is
 // buggy in the case of zero shift amounts, see b/137199585. That is why
 // this NEON code path is restricted to true ARM NEON, excluding
-// arm_neon_sse.h. Anyway, the arm_neon_sse.h implemenation of saturating
+// arm_neon_sse.h. Anyway, the arm_neon_sse.h implementation of saturating
 // left shifts is slow scalar code, so there may not be much benefit in
 // running that over just plain reference code.
 //
@@ -7039,7 +7039,7 @@ inline void ClampWithRangeAndStore(int8_t* output_dst, int8x16_t input_val,

 #endif  // GEMMLOWP_NEON

-inline void Tanh16bitPercision(const TanhParams& params,
+inline void Tanh16bitPrecision(const TanhParams& params,
 const RuntimeShape& input_shape,
 const uint8* input_data,
 const RuntimeShape& output_shape,
@@ -7146,7 +7146,7 @@ inline void Tanh16bitPercision(const TanhParams& params,
 }
 }

-inline void Tanh16bitPercision(const TanhParams& params,
+inline void Tanh16bitPrecision(const TanhParams& params,
 const RuntimeShape& input_shape,
 const int8* input_data,
 const RuntimeShape& output_shape,
@@ -7239,7 +7239,7 @@ inline void Tanh16bitPercision(const TanhParams& params,
 }
 }

-inline void Logistic16bitPercision(const LogisticParams& params,
+inline void Logistic16bitPrecision(const LogisticParams& params,
 const RuntimeShape& input_shape,
 const uint8* input_data,
 const RuntimeShape& output_shape,
@@ -7331,7 +7331,7 @@ inline void Logistic16bitPercision(const LogisticParams& params,
 }
 }

-inline void Logistic16bitPercision(const LogisticParams& params,
+inline void Logistic16bitPrecision(const LogisticParams& params,
 const RuntimeShape& input_shape,
 const int8* input_data,
 const RuntimeShape& output_shape,
@@ -372,7 +372,7 @@ void FakeQuantizeArray(const float nudged_scale, const float nudged_min,

 bool CheckedLog2(const float x, int* log2_result) {
 // Using TfLiteRound instead of std::round and std::log instead of
-// std::log2 to work around these fuctions being missing in a toolchain
+// std::log2 to work around these functions being missing in a toolchain
 // used in some TensorFlow tests as of May 2018.
 const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
 const float x_log2_rounded = TfLiteRound(x_log2);
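CheckedLog2 in the hunk above relies on the identity log2(x) = ln(x) / ln(2) because std::log2 was unavailable in one toolchain. A standalone sketch of the same pattern using only <cmath> (std::round stands in for TfLiteRound; the tolerance value is an assumption, not taken from this file):

#include <cmath>

// Returns true when x is (numerically) an exact power of two and stores the
// exponent: compute log2 via natural logs, round to the nearest integer, and
// accept if the fractional remainder is negligible.
bool CheckedLog2Sketch(float x, int* log2_result) {
  const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
  const float x_log2_rounded = std::round(x_log2);
  *log2_result = static_cast<int>(x_log2_rounded);
  return std::abs(x_log2 - x_log2_rounded) < 1e-3f;
}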
@@ -26,7 +26,7 @@ namespace reference_ops {
 // TODO(ycling): Refactoring. Remove BroadcastLogical and use the more
 // generalized and efficient BroadcastBinaryFunction.
 //
-// Also appears to duplicte MinimumMaximum.
+// Also appears to duplicate MinimumMaximum.
 //
 // R: Result type. T1: Input 1 type. T2: Input 2 type.
 template <typename R, typename T1, typename T2>
@@ -175,7 +175,7 @@ bool Spectrogram::ComputeSquaredMagnitudeSpectrogram(
 for (int i = 0; i < output_frequency_channels_; ++i) {
 // Similar to the Complex case, except storing the norm.
 // But the norm function is known to be a performance killer,
-// so do it this way with explicit real and imagninary temps.
+// so do it this way with explicit real and imaginary temps.
 const double re = fft_input_output_[2 * i];
 const double im = fft_input_output_[2 * i + 1];
 // Which finally converts double to float if it needs to.
@@ -161,8 +161,8 @@ void SparseMatrixBatchVectorMultiplyAccumulate(
 // - multiplier and shift combined gives the scale.
 // - assumes input zero point is 0.
 // - scratch is created for optimization purpose only.
-// TODO(jianlijianli): this can be removed if some furture optimization
-// work makes it unnecesssary.
+// TODO(jianlijianli): this can be removed if some future optimization
+// work makes it unnecessary.
 void MatrixBatchVectorMultiplyAccumulate(
 const int8_t* input, const int32_t* bias,
 const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
@@ -192,8 +192,8 @@ void MatrixBatchVectorMultiplyAccumulate(
 // - multiplier and shift combined gives the scale.
 // - assumes input zero point is 0.
 // - scratch is created for optimization purpose only.
-// TODO(jianlijianli): this can be removed if some furture optimization
-// work makes it unnecesssary.
+// TODO(jianlijianli): this can be removed if some future optimization
+// work makes it unnecessary.
 void MatrixBatchVectorMultiplyAccumulate(
 const int8_t* input, const int32_t* bias,
 const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
@@ -231,7 +231,7 @@ void MatrixBatchVectorMultiply(const int16_t* hidden,
 // - output: the 32bit output
 // Note: We do not need saturation because the int8 * int8 is safe from overflow
 // in (2^31-1) / (2^14) = 131072, which is bigger than the n_row. Non-zero
-// initial output value is not exceiptionally large.
+// initial output value is not exceptionally large.
 void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
 int32_t n_row, int32_t n_col,
 int32_t* output);
@@ -372,7 +372,7 @@ inline void VectorVectorCwiseProduct(const T* __restrict__ vector1,
 }
 }

-// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
 // assumption here is that result array is initialized to valid values.
 template <typename T>
 inline void VectorVectorCwiseProductAccumulate(const T* __restrict__ vector1,
@@ -371,14 +371,14 @@ TEST(uKernels, QuantMatrixBatchVectorMultiplyAccumulate8x8_16Test) {
 const int32_t multiplier = 2080364544;
 const int32_t shift = -2;

-std::vector<int32_t> scrach(2 * 9, 0);
+std::vector<int32_t> scratch(2 * 9, 0);
 std::vector<int16_t> output = {10, 2, 33, 4, 5, 6, 65, 4, 3,
 52, 1, 2, 8, -1, -2, 11, 17, -18};
 MatrixBatchVectorMultiplyAccumulate(
 input.data(), input_zeropoint_times_weights.data(),
 input_to_gate_weights.data(), multiplier, shift,
 /*n_batch=*/2, /*n_input=*/30, /*n_output=*/9, /*output_zp=*/0,
-scrach.data(), output.data(), &context);
+scratch.data(), output.data(), &context);
 const std::vector<int16_t> expected_output = {
 -210, 331, 153, 139, -570, -657, 258, 515, -495,
 91, -243, -73, 603, -744, -269, 169, -748, -174,
@@ -497,11 +497,11 @@ TEST(uKernels, QuantMatrixBatchVectorMultiplyAccumulate8x8_8Test) {

 std::vector<int8_t> output = {1, 2, 3, 4, 5, 6, 5, 4, 3,
 2, 1, 2, 8, -1, -2, 11, 17, 18};
-std::vector<int32_t> scrach(2 * 9, 0);
+std::vector<int32_t> scratch(2 * 9, 0);
 MatrixBatchVectorMultiplyAccumulate(
 input.data(), input_zeropoint_times_weights.data(),
 input_to_gate_weights.data(), multiplier, shift,
-/*n_batch=*/2, /*n_input=*/30, /*n_output=*/9, output_zp, scrach.data(),
+/*n_batch=*/2, /*n_input=*/30, /*n_output=*/9, output_zp, scratch.data(),
 output.data(), &context);
 const std::vector<int8_t> expected_output = {
 5, -9, -2, -30, -5, -11, -22, -18, 18,
@@ -100,7 +100,7 @@ TfLiteStatus PopulateConvolutionQuantizationParams(
 context, input, filter, bias, output, &real_multiplier));
 int exponent;

-// Populate quantization parameteters with multiplier and shift.
+// Populate quantization parameters with multiplier and shift.
 QuantizeMultiplier(real_multiplier, multiplier, &exponent);
 *shift = -exponent;
 }
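QuantizeMultiplier in the hunk above splits the real-valued scale into a 32-bit fixed-point multiplier and a power-of-two exponent, so that real_multiplier is approximately (quantized_multiplier / 2^31) * 2^exponent; the caller then stores shift = -exponent. A minimal sketch of that decomposition built on std::frexp (an illustration of the idea, not the library's implementation; it assumes real_multiplier > 0):

#include <cmath>
#include <cstdint>

// Decomposes real_multiplier into q31 * 2^(exponent - 31), with q31 in
// [2^30, 2^31). std::frexp yields the significand in [0.5, 1), which is then
// scaled to Q31 and renormalized if rounding carries it up to 2^31.
void QuantizeMultiplierSketch(double real_multiplier, int32_t* q31, int* exponent) {
  const double significand = std::frexp(real_multiplier, exponent);
  int64_t q = static_cast<int64_t>(std::round(significand * (1ll << 31)));
  if (q == (1ll << 31)) {
    q /= 2;
    ++*exponent;
  }
  *q31 = static_cast<int32_t>(q);
}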
@@ -1248,7 +1248,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 }

 // Create a scratch buffer tensor for float case and hybrid case.
-// TODO(jianlijianli): Create a is_float boolean and reorginze the temporary
+// TODO(jianlijianli): Create a is_float boolean and reorganize the temporary
 // buffer allocation logic.
 if (!is_integer) {
 node->temporaries->data[0] = op_data->scratch_tensor_index;
@@ -861,7 +861,7 @@ inline void LstmStepHybrid(
 //
 // Layer norm coefficients of size 'n_cell', representing diagonal matrices.
 // layer_norm_input_weight_ptr - optional
-// layer_norm_forput_weight_ptr - optional
+// layer_norm_forget_weight_ptr - optional
 // layer_norm_cell_weight_ptr - optional
 // layer_norm_output_weight_ptr - optional
 //
@@ -1187,7 +1187,7 @@ inline void LstmStepInteger(
 //
 // Layer norm coefficients of size 'n_cell', representing diagonal matrices.
 // layer_norm_input_weight_ptr - optional
-// layer_norm_forput_weight_ptr - optional
+// layer_norm_forget_weight_ptr - optional
 // layer_norm_cell_weight_ptr - optional
 // layer_norm_output_weight_ptr - optional
 //
@@ -91,7 +91,7 @@ TEST(MatrixDiagTest, Int32TestTwoDimDiag) {
 EXPECT_THAT(model.GetOutputType(), TfLiteType::kTfLiteInt32);
 }

-TEST(MatrixDiagTest, DegenenerateCase) {
+TEST(MatrixDiagTest, DegenerateCase) {
 MatrixDiagOpModel<uint8_t> model({TensorType_UINT8, {1}});
 model.PopulateTensor<uint8_t>(model.input(), {1});
 model.Invoke();
@@ -25,11 +25,11 @@ namespace {
 using ::testing::ElementsAreArray;
 using ::testing::Matcher;

-template <typename RegularInputOuput>
+template <typename RegularInputOutput>
 class PadOpModel : public SingleOpModel {
 public:
-void SetInput(std::initializer_list<RegularInputOuput> data) {
-PopulateTensor<RegularInputOuput>(input_, data);
+void SetInput(std::initializer_list<RegularInputOutput> data) {
+PopulateTensor<RegularInputOutput>(input_, data);
 }

 template <typename QuantizedInputOutput>
@@ -46,8 +46,8 @@ class PadOpModel : public SingleOpModel {
 PopulateTensor<int>(paddings_, paddings);
 }

-std::vector<RegularInputOuput> GetOutput() {
-return ExtractVector<RegularInputOuput>(output_);
+std::vector<RegularInputOutput> GetOutput() {
+return ExtractVector<RegularInputOutput>(output_);
 }
 std::vector<int> GetOutputShape() { return GetTensorShape(output_); }

@@ -128,17 +128,17 @@ class PadOpConstModel : public PadOpModel<float> {
 };

 // Test case where paddings is a non-const tensor.
-template <typename RegularInputOuput>
-class PadV2OpDynamicModel : public PadOpModel<RegularInputOuput> {
+template <typename RegularInputOutput>
+class PadV2OpDynamicModel : public PadOpModel<RegularInputOutput> {
 public:
 PadV2OpDynamicModel(const TensorData& input,
 std::initializer_list<int> paddings_shape,
-RegularInputOuput constant_values,
+RegularInputOutput constant_values,
 const TensorData& output) {
 this->input_ = this->AddInput(input);
 this->paddings_ = this->AddInput(TensorType_INT32);
 this->constant_values_ = this->AddConstInput(
-GetTensorType<RegularInputOuput>(), {constant_values}, {1});
+GetTensorType<RegularInputOutput>(), {constant_values}, {1});
 this->output_ = this->AddOutput(output);

 this->SetBuiltinOp(BuiltinOperator_PADV2, BuiltinOptions_PadV2Options,
@@ -360,7 +360,7 @@ TfLiteStatus Rfft2dHelper(TfLiteContext* context, TfLiteNode* node) {
 double* fft_double_working_area_data = reinterpret_cast<double*>(
 GetTensorData<int64_t>(fft_double_working_area));

-// Process evert slice in the input buffer
+// Process every slice in the input buffer
 for (int i = 0; i < num_slices; ++i) {
 PrepareInputBuffer(input_data, input_height, input_width, fft_height,
 fft_width, fft_input_output);
@@ -87,7 +87,7 @@ TYPED_TEST(StridedSliceOpTest, UnsupportedInputSize) {
 "StridedSlice op only supports 1D-5D input arrays.");
 }

-TYPED_TEST(StridedSliceOpTest, UnssupportedArgs) {
+TYPED_TEST(StridedSliceOpTest, UnsupportedArgs) {
 EXPECT_DEATH(
 StridedSliceOpModel<TypeParam>({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
 "ellipsis_mask is not implemented yet.");
@@ -63,7 +63,7 @@ class SubgraphBuilder {
 void BuildLessEqualCondSubgraph(Subgraph* subgraph, int rhs);

 // An accumulate loop body subgraph. Used to produce triangle number
-// seqeuence. 2 inputs and 2 outpus
+// sequence. 2 inputs and 2 outputs
 // Equivalent to (counter, value) -> (counter + 1, counter + 1 + value)
 void BuildAccumulateLoopBodySubgraph(Subgraph* subgraph);

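The mapping (counter, value) -> (counter + 1, counter + 1 + value) described in that comment generates the triangle numbers 1, 3, 6, 10, ... when started from (1, 1); the arguments of the TestAccumulateLoopBody calls in the next file follow exactly this recurrence. A small standalone illustration (the names are illustrative, not part of the TFLite API):

#include <cassert>
#include <utility>

// One iteration of the accumulate loop body:
// (counter, value) -> (counter + 1, counter + 1 + value).
std::pair<int, int> AccumulateLoopBodyStep(int counter, int value) {
  return {counter + 1, counter + 1 + value};
}

int main() {
  // Starting from (1, 1), the value component walks the triangle numbers
  // 1 -> 3 -> 6 -> 10, matching TestAccumulateLoopBody(1, 1, 2, 3),
  // (2, 3, 3, 6) and (3, 6, 4, 10) in the test below.
  auto state = std::make_pair(1, 1);
  state = AccumulateLoopBodyStep(state.first, state.second);
  assert(state.first == 2 && state.second == 3);
  state = AccumulateLoopBodyStep(state.first, state.second);
  assert(state.first == 3 && state.second == 6);
  state = AccumulateLoopBodyStep(state.first, state.second);
  assert(state.first == 4 && state.second == 10);
  return 0;
}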
@@ -36,7 +36,7 @@ class SubgraphBuilderTest : public ::testing::Test {
 }

 protected:
-void TestAccumelateLoopBody(int input1, int input2, int output1,
+void TestAccumulateLoopBody(int input1, int input2, int output1,
 int output2) {
 interpreter_.reset(new Interpreter);
 builder_->BuildAccumulateLoopBodySubgraph(
@@ -140,9 +140,9 @@ TEST_F(SubgraphBuilderTest, TestBuildLessEqualCondSubgraph) {
 }

 TEST_F(SubgraphBuilderTest, TestBuildAccumulateLoopBodySubgraph) {
-TestAccumelateLoopBody(1, 1, 2, 3);
-TestAccumelateLoopBody(2, 3, 3, 6);
-TestAccumelateLoopBody(3, 6, 4, 10);
+TestAccumulateLoopBody(1, 1, 2, 3);
+TestAccumulateLoopBody(2, 3, 3, 6);
+TestAccumulateLoopBody(3, 6, 4, 10);
 }

 TEST_F(SubgraphBuilderTest, TestBuildPadLoopBodySubgraph) {
@@ -547,7 +547,7 @@ TEST_F(SVDFOpTest, BlackBoxTestInteger) {

 svdf.SetBias({-0.0976817, 0.15294972, 0.39635518, -0.02702999});

-const std::vector<std::vector<float>> input_sequnces = {
+const std::vector<std::vector<float>> input_sequences = {
 {0.49837467, 0.19278903, 0.26584083, 0.17660543, 0.52949083, -0.77931279},
 {0.12609188, -0.46347019, -0.89598465, 0.35867718, 0.36897406,
 0.73463392},
@@ -585,7 +585,7 @@ TEST_F(SVDFOpTest, BlackBoxTestInteger) {
 };

 for (int sequence_index = 0; sequence_index < 12; ++sequence_index) {
-svdf.SetInput(input_sequnces[sequence_index]);
+svdf.SetInput(input_sequences[sequence_index]);
 svdf.Invoke();
 const std::vector<int8_t> res = svdf.GetOutput();
 EXPECT_THAT(res, ElementsAreArray(expected_output[sequence_index]));
@@ -22,7 +22,7 @@ limitations under the License.

 namespace tflite {

-// Forward declaraction for op kernels.
+// Forward declaration for op kernels.
 namespace ops {
 namespace custom {

@@ -104,7 +104,7 @@ TEST_F(VariableOpsTest, TestReadVariableBeforeAssign) {
 ASSERT_EQ(interpreter_.Invoke(), kTfLiteError);
 }

-TEST_F(VariableOpsTest, TestReeasignToDifferentSize) {
+TEST_F(VariableOpsTest, TestReassignToDifferentSize) {
 // 1st invocation. The variable is assigned as a scalar.
 {
 ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk);
@@ -79,7 +79,7 @@ TEST_F(WhileTest, TestPadLoop) {
 TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
 CheckIntTensor(output2, {11}, {0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0});

-// The extra invocation serves as a regiression test: There was a bug that
+// The extra invocation serves as a regression test: There was a bug that
 // invoking a while loop with dynamic shaped body makes the interpreter
 // state uninvokable.
 ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);