a bit more

Dmitry Volodin 2020-10-29 17:31:40 +03:00
parent 04ff97cbb6
commit 620a090a63
14 changed files with 29 additions and 29 deletions

View File

@@ -170,7 +170,7 @@ std::string GenerateConvolutionConstantCode(const OperationDef& op_def,
}
for (int d = 0; d < out_z; ++d) {
c += " " + s_conv + "(r[" + std::to_string(d) +
"], src, args.weigths.GetPtr(),";
"], src, args.weights.GetPtr(),";
c += " " + std::to_string(filters_counter) + ");\n";
filters_counter += ch_count;
}
@@ -201,7 +201,7 @@ bool IsConvConstantsSupported(const DeviceInfo& device_info,
if (device_info.IsAMD() &&
definition.precision != CalculationsPrecision::F32 &&
definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) {
- // BUG, some AMD gpus crashe without it
+ // BUG, some AMD GPUs crash without it
return false;
}

View File

@@ -104,7 +104,7 @@ void UploadWeightsForConvConstants(const tflite::gpu::Tensor<OHWI, T>& weights,
absl::MakeSpan(ptr, float_count / 4));
}
op->args_.AddObject("weigths",
op->args_.AddObject("weights",
absl::make_unique<BufferDescriptor>(std::move(desc)));
}

View File

@@ -85,7 +85,7 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteConvParams* params) {
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
- // MLI optimized version only supports int8_t dataype, dilation factor of 1
+ // MLI optimized version only supports int8_t datatype, dilation factor of 1
// and per-axis quantization of weights (no broadcasting/per-tensor)
bool ret_val = (filter->type == kTfLiteInt8) &&
(input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
@@ -159,7 +159,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
- // Dynimically allocate per-channel quantization parameters.
+ // Dynamically allocate per-channel quantization parameters.
const int num_channels = filter->dims->data[kConvQuantizedDimension];
data->per_channel_output_multiplier =
reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
@@ -241,7 +241,7 @@ TfLiteStatus EvalMliQuantizedPerChannel(
const OpData& data, const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
// Run Conv MLI kernel
- // MLI optimized version only supports int8_t dataype and dilation factor of 1
+ // MLI optimized version only supports int8_t datatype and dilation factor of 1
if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
mli_tensor mli_in = {};
@@ -299,7 +299,7 @@ TfLiteStatus EvalMliQuantizedPerChannel(
const int overlap = kernel_height - cfg.stride_height;
// for weight slicing (on output channels)
- // NHWC layout for weigths, output channel dimension is the first dimension.
+ // NHWC layout for weights, output channel dimension is the first dimension.
const int weight_out_ch_dimension = 0;
int slice_channels =
static_cast<int>(mli_weights.shape[weight_out_ch_dimension]);
@@ -362,9 +362,9 @@ TfLiteStatus EvalMliQuantizedPerChannel(
in_slice_height, cfg.padding_top,
cfg.padding_bottom, overlap);
- /* output tensor is alreade sliced in the output channel dimension.
+ /* output tensor is already sliced in the output channel dimension.
out_ch_slice.Sub() is the tensor for the amount of output channels of this
- itteration of the weight slice loop. This tensor needs to be further
+ iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimension. */
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
out_slice_height);

View File

@@ -72,7 +72,7 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const int in_ch = SizeOfDimension(input, 3);
const int filters_num = SizeOfDimension(filter, 3);
- // MLI optimized version only supports int8_t dataype, dilation factor of 1
+ // MLI optimized version only supports int8_t datatype, dilation factor of 1
// and per-axis quantization of weights (no broadcasting/per-tensor) (in_ch ==
// filters_num) || (in_ch == 1)) is a forbidding of channel multiplier logic
// for multichannel input.
@@ -150,7 +150,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Per channel quantization is only needed for int8 inference. For other
// quantized types, only a single scale and zero point is needed.
const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
- // Dynimically allocate per-channel quantization parameters.
+ // Dynamically allocate per-channel quantization parameters.
data->per_channel_output_multiplier =
reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t)));
@@ -280,7 +280,7 @@ TfLiteStatus EvalMliQuantizedPerChannel(
const int overlap = kernelHeight - cfg.stride_height;
// for weight slicing (on output channels)
- // HWCN layout for weigths, output channel dimension is the first dimension.
+ // HWCN layout for weights, output channel dimension is the first dimension.
const int weight_out_ch_dimension = 3;
// bias has only 1 dimension
const int bias_out_ch_dimension = 0;
@@ -345,9 +345,9 @@ TfLiteStatus EvalMliQuantizedPerChannel(
mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
- /* input tensor is alreade sliced in the channel dimension.
+ /* input tensor is already sliced in the channel dimension.
out_ch_slice.Sub() is the tensor for the amount of channels of this
- itteration of the weight slice loop. This tensor needs to be further
+ iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimension. in_ch_slice.Sub() tensor
contains batches of HWC tensors. so it is a 4 dimensional tensor. because
the mli kernel will process one HWC tensor at a time, the 4 dimensional
@@ -360,9 +360,9 @@ TfLiteStatus EvalMliQuantizedPerChannel(
inSliceHeight, padding_top,
padding_bottom, overlap);
- /* output tensor is alreade sliced in the output channel dimension.
+ /* output tensor is already sliced in the output channel dimension.
out_ch_slice.Sub() is the tensor for the amount of output channels of this
- itteration of the weight slice loop. This tensor needs to be further
+ iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimension. */
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension,
outSliceHeight);

View File

@@ -52,7 +52,7 @@ constexpr int kOutputTensor = 0;
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
const TfLiteFullyConnectedParams* params) {
- // MLI optimized version only supports int8_t dataype and no fused Relu and
+ // MLI optimized version only supports int8_t datatype and no fused Relu and
// symmetric per-tensor quantization of weights (not per-axis)
bool ret_val = (filter->type == kTfLiteInt8) &&
(input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
@@ -190,9 +190,9 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
ops::micro::TensorSlicer in_slice(&mli_in, input_size_dimension,
mli_in.shape[input_size_dimension]);
- /* output tensor is alreade sliced in the output size dimension.
+ /* output tensor is already sliced in the output size dimension.
out_ch_slice.Sub() is the tensor for the amount of output size of this
- itteration of the weight slice loop. This tensor needs to be further
+ iteration of the weight slice loop. This tensor needs to be further
sliced over the batch */
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension,
slice_size);

View File

@@ -43,7 +43,7 @@ enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 };
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLitePoolParams* params) {
- // MLI optimized version only supports int8_t dataype and no fused Relu
+ // MLI optimized version only supports int8_t datatype and no fused Relu
return (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone);
}

View File

@@ -163,7 +163,7 @@ TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
init_arc_scratch_buffers();
/* strategy for FC kernels:
first allocate input, because this cannot be sliced. (in case of batch
- processing, only a single input needs to be allocated) then weigths & bias
+ processing, only a single input needs to be allocated) then weights & bias
because if fully loaded, they can be reused over batches. then output.
The number of output channels (for weights slicing) depends on size of
output and size of weights&bias */
@@ -275,7 +275,7 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
max_out_lines_for_input =
(max_lines_in - kernel_height + 1) / stride_height;
}
- // Ten compute how many ouput lines fit into the output tensor.
+ // Then compute how many output lines fit into the output tensor.
max_lines_out =
std::min(out_height, static_cast<int>(out->capacity) / line_size_out);
// the smallest of the two determines the slice height for the output, and

View File

@@ -141,7 +141,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
- // Dynimically allocate per-channel quantization parameters.
+ // Dynamically allocate per-channel quantization parameters.
const int num_channels = filter->dims->data[kConvQuantizedDimension];
data->per_channel_output_multiplier =
static_cast<int32_t*>(context->AllocatePersistentBuffer(

View File

@@ -127,7 +127,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Per channel quantization is only needed for int8_t inference. For other
// quantized types, only a single scale and zero point is needed.
const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
- // Dynimically allocate per-channel quantization parameters.
+ // Dynamically allocate per-channel quantization parameters.
data->per_channel_output_multiplier =
reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t)));

View File

@@ -362,7 +362,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Per channel quantization is only needed for int8_t inference. For other
// quantized types, only a single scale and zero point is needed.
const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
- // Dynimically allocate per-channel quantization parameters.
+ // Dynamically allocate per-channel quantization parameters.
data->per_channel_output_multiplier =
reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t)));

View File

@@ -157,7 +157,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
- // Dynimically allocate per-channel quantization parameters.
+ // Dynamically allocate per-channel quantization parameters.
const int num_channels = filter->dims->data[kConvQuantizedDimension];
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),

View File

@@ -145,7 +145,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Per channel quantization is only needed for int8_t inference. For other
// quantized types, only a single scale and zero point is needed.
const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
- // Dynimically allocate per-channel quantization parameters.
+ // Dynamically allocate per-channel quantization parameters.
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),
reinterpret_cast<void**>(&data->per_channel_output_multiplier)));

View File

@@ -325,7 +325,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Per channel quantization is only needed for int8_t inference. For other
// quantized types, only a single scale and zero point is needed.
const int num_channels = filter->dims->data[kConvQuantizedDimension];
- // Dynimically allocate per-channel quantization parameters.
+ // Dynamically allocate per-channel quantization parameters.
op_data->per_channel_output_multiplier =
reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t)));

View File

@@ -368,7 +368,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// Per channel quantization is only needed for int8_t inference. For other
// quantized types, only a single scale and zero point is needed.
const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
- // Dynimically allocate per-channel quantization parameters.
+ // Dynamically allocate per-channel quantization parameters.
op_data->per_channel_output_multiplier =
reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t)));