a bit more

This commit is contained in:
parent 04ff97cbb6
commit 620a090a63
@@ -170,7 +170,7 @@ std::string GenerateConvolutionConstantCode(const OperationDef& op_def,
   }
   for (int d = 0; d < out_z; ++d) {
     c += " " + s_conv + "(r[" + std::to_string(d) +
-         "], src, args.weigths.GetPtr(),";
+         "], src, args.weights.GetPtr(),";
     c += " " + std::to_string(filters_counter) + ");\n";
     filters_counter += ch_count;
   }
@@ -201,7 +201,7 @@ bool IsConvConstantsSupported(const DeviceInfo& device_info,
   if (device_info.IsAMD() &&
       definition.precision != CalculationsPrecision::F32 &&
       definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) {
-    // BUG, some AMD gpus crashe without it
+    // BUG, some AMD GPUs crash without it
     return false;
   }
 
@@ -104,7 +104,7 @@ void UploadWeightsForConvConstants(const tflite::gpu::Tensor<OHWI, T>& weights,
                              absl::MakeSpan(ptr, float_count / 4));
   }
 
-  op->args_.AddObject("weigths",
+  op->args_.AddObject("weights",
                       absl::make_unique<BufferDescriptor>(std::move(desc)));
 }
 
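The two renames above have to match: the string registered via op->args_.AddObject is the name that the generated kernel source later references as args.weights.GetPtr() (first hunk). A minimal standalone sketch of that contract, using a hypothetical Args registry rather than the real GPU delegate types:

#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for the delegate's argument registry.
struct Args {
  std::map<std::string, std::string> objects;  // name -> buffer description
  void AddObject(const std::string& name, std::string desc) {
    objects[name] = std::move(desc);
  }
  bool Has(const std::string& name) const { return objects.count(name) != 0; }
};

int main() {
  Args args;
  args.AddObject("weights", "constant filter buffer");

  // The code generator emits a reference by name; a typo such as "weigths"
  // would produce a kernel that refers to an object that was never bound.
  const std::string generated = "conv(r[0], src, args.weights.GetPtr(), 0);";
  std::cout << generated << "\n"
            << "bound: " << (args.Has("weights") ? "yes" : "no") << "\n";
  return 0;
}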
@@ -85,7 +85,7 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
                      const TfLiteConvParams* params) {
   const auto* affine_quantization =
       reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
-  // MLI optimized version only supports int8_t dataype, dilation factor of 1
+  // MLI optimized version only supports int8_t datatype, dilation factor of 1
   // and per-axis quantization of weights (no broadcasting/per-tensor)
   bool ret_val = (filter->type == kTfLiteInt8) &&
                  (input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
@@ -159,7 +159,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   int output_width = output->dims->data[2];
   int output_height = output->dims->data[1];
 
-  // Dynimically allocate per-channel quantization parameters.
+  // Dynamically allocate per-channel quantization parameters.
   const int num_channels = filter->dims->data[kConvQuantizedDimension];
   data->per_channel_output_multiplier =
       reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
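These buffers hold one fixed-point multiplier and one shift per output channel. As a rough illustration (plain standard C++, not the TFLite quantization helpers, with made-up scale values), values of this kind can be derived from the effective scale input_scale * filter_scale / output_scale like so:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Convert a real-valued scale into a Q31 multiplier plus a power-of-two shift,
// so that x * scale is approximately (x * multiplier) >> (31 - shift).
void QuantizeScale(double scale, int32_t* multiplier, int* shift) {
  if (scale == 0.0) { *multiplier = 0; *shift = 0; return; }
  const double q = std::frexp(scale, shift);  // scale = q * 2^shift, q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) { q_fixed /= 2; ++*shift; }
  *multiplier = static_cast<int32_t>(q_fixed);
}

int main() {
  const double input_scale = 0.5, output_scale = 0.25;
  const std::vector<double> filter_scales = {0.01, 0.02, 0.04};  // one per channel

  std::vector<int32_t> per_channel_multiplier(filter_scales.size());
  std::vector<int> per_channel_shift(filter_scales.size());
  for (size_t c = 0; c < filter_scales.size(); ++c) {
    const double effective = input_scale * filter_scales[c] / output_scale;
    QuantizeScale(effective, &per_channel_multiplier[c], &per_channel_shift[c]);
    std::printf("ch %zu: multiplier=%d shift=%d\n", c,
                static_cast<int>(per_channel_multiplier[c]), per_channel_shift[c]);
  }
  return 0;
}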
@@ -241,7 +241,7 @@ TfLiteStatus EvalMliQuantizedPerChannel(
     const OpData& data, const TfLiteTensor* input, const TfLiteTensor* filter,
     const TfLiteTensor* bias, TfLiteTensor* output) {
   // Run Conv MLI kernel
-  // MLI optimized version only supports int8_t dataype and dilation factor of 1
+  // MLI optimized version only supports int8_t datatype and dilation factor of 1
   if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) &&
       (params->dilation_height_factor == 1)) {
     mli_tensor mli_in = {};
@@ -299,7 +299,7 @@ TfLiteStatus EvalMliQuantizedPerChannel(
     const int overlap = kernel_height - cfg.stride_height;
 
     // for weight slicing (on output channels)
-    // NHWC layout for weigths, output channel dimension is the first dimension.
+    // NHWC layout for weights, output channel dimension is the first dimension.
     const int weight_out_ch_dimension = 0;
     int slice_channels =
         static_cast<int>(mli_weights.shape[weight_out_ch_dimension]);
@@ -362,9 +362,9 @@ TfLiteStatus EvalMliQuantizedPerChannel(
                                        in_slice_height, cfg.padding_top,
                                        cfg.padding_bottom, overlap);
 
-    /* output tensor is alreade sliced in the output channel dimension.
+    /* output tensor is already sliced in the output channel dimension.
     out_ch_slice.Sub() is the tensor for the amount of output channels of this
-    itteration of the weight slice loop. This tensor needs to be further
+    iteration of the weight slice loop. This tensor needs to be further
     sliced over the batch and height dimension. */
     ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
                                        out_slice_height);
@@ -72,7 +72,7 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
   const int in_ch = SizeOfDimension(input, 3);
   const int filters_num = SizeOfDimension(filter, 3);
 
-  // MLI optimized version only supports int8_t dataype, dilation factor of 1
+  // MLI optimized version only supports int8_t datatype, dilation factor of 1
   // and per-axis quantization of weights (no broadcasting/per-tensor) (in_ch ==
   // filters_num) || (in_ch == 1)) is a forbidding of channel multiplier logic
   // for multichannel input.
@@ -150,7 +150,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Per channel quantization is only needed for int8 inference. For other
   // quantized types, only a single scale and zero point is needed.
   const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
-  // Dynimically allocate per-channel quantization parameters.
+  // Dynamically allocate per-channel quantization parameters.
   data->per_channel_output_multiplier =
       reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
           context, num_channels * sizeof(int32_t)));
@@ -280,7 +280,7 @@ TfLiteStatus EvalMliQuantizedPerChannel(
   const int overlap = kernelHeight - cfg.stride_height;
 
   // for weight slicing (on output channels)
-  // HWCN layout for weigths, output channel dimension is the first dimension.
+  // HWCN layout for weights, output channel dimension is the first dimension.
   const int weight_out_ch_dimension = 3;
   // bias has only 1 dimension
   const int bias_out_ch_dimension = 0;
@@ -345,9 +345,9 @@ TfLiteStatus EvalMliQuantizedPerChannel(
       mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
       mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
 
-      /* input tensor is alreade sliced in the channel dimension.
+      /* input tensor is already sliced in the channel dimension.
       out_ch_slice.Sub() is the tensor for the amount of channels of this
-      itteration of the weight slice loop. This tensor needs to be further
+      iteration of the weight slice loop. This tensor needs to be further
       sliced over the batch and height dimension. in_ch_slice.Sub() tensor
       contains batches of HWC tensors. so it is a 4 dimensional tensor. because
       the mli kernel will process one HWC tensor at a time, the 4 dimensional
@@ -360,9 +360,9 @@ TfLiteStatus EvalMliQuantizedPerChannel(
                                          inSliceHeight, padding_top,
                                          padding_bottom, overlap);
 
-      /* output tensor is alreade sliced in the output channel dimension.
+      /* output tensor is already sliced in the output channel dimension.
       out_ch_slice.Sub() is the tensor for the amount of output channels of this
-      itteration of the weight slice loop. This tensor needs to be further
+      iteration of the weight slice loop. This tensor needs to be further
       sliced over the batch and height dimension. */
       ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension,
                                          outSliceHeight);
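The comment describes two nested levels of slicing: an outer pass over groups of output channels (the weight slices) and, inside each group, slices over the height dimension. A toy sketch of that loop structure with a hypothetical Slicer type (the real ops::micro::TensorSlicer interface is richer):

#include <algorithm>
#include <cstdio>

// Hypothetical 1-D slicer: walks a dimension of a given total size in chunks.
struct Slicer {
  int total, chunk, pos = 0;
  bool Done() const { return pos >= total; }
  int Size() const { return std::min(chunk, total - pos); }
  void Next() { pos += Size(); }
};

int main() {
  const int out_channels = 32, out_height = 16;
  const int slice_channels = 8;  // output channels whose weights fit in local memory
  const int slice_height = 5;    // output lines that fit in local memory

  // Outer loop: weight slices over the output-channel dimension.
  for (Slicer ch{out_channels, slice_channels}; !ch.Done(); ch.Next()) {
    // Inner loop: the already channel-sliced output is sliced again over height.
    for (Slicer h{out_height, slice_height}; !h.Done(); h.Next()) {
      std::printf("run kernel on channels [%d, %d) x lines [%d, %d)\n",
                  ch.pos, ch.pos + ch.Size(), h.pos, h.pos + h.Size());
    }
  }
  return 0;
}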
@@ -52,7 +52,7 @@ constexpr int kOutputTensor = 0;
 bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
                      const TfLiteTensor* filter, const TfLiteTensor* bias,
                      const TfLiteFullyConnectedParams* params) {
-  // MLI optimized version only supports int8_t dataype and no fused Relu and
+  // MLI optimized version only supports int8_t datatype and no fused Relu and
   // symmetric per-tensor quantization of weights (not per-axis)
   bool ret_val = (filter->type == kTfLiteInt8) &&
                  (input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
@@ -190,9 +190,9 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
   ops::micro::TensorSlicer in_slice(&mli_in, input_size_dimension,
                                     mli_in.shape[input_size_dimension]);
 
-  /* output tensor is alreade sliced in the output size dimension.
+  /* output tensor is already sliced in the output size dimension.
   out_ch_slice.Sub() is the tensor for the amount of output size of this
-  itteration of the weight slice loop. This tensor needs to be further
+  iteration of the weight slice loop. This tensor needs to be further
   sliced over the batch */
   ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension,
                                      slice_size);
@@ -43,7 +43,7 @@ enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 };
 
 bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
                      const TfLitePoolParams* params) {
-  // MLI optimized version only supports int8_t dataype and no fused Relu
+  // MLI optimized version only supports int8_t datatype and no fused Relu
   return (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone);
 }
 
@@ -163,7 +163,7 @@ TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
   init_arc_scratch_buffers();
   /* strategy for FC kernels:
      first allocate input, because this cannot be sliced. (in case of batch
-     processing, only a single input needs to be allocated) then weigths & bias
+     processing, only a single input needs to be allocated) then weights & bias
      because if fully loaded, they can be reused over batches. then output.
      The number of output channels (for weights slicing) depends on size of
      output and size of weights&bias */
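As a rough, hypothetical illustration of the allocation order in that comment (made-up sizes, not the real arc scratch-buffer API): input first because it cannot be sliced, then weights and bias so they stay resident across batches, and the leftover space bounds the output slice.

#include <cstdio>

int main() {
  // Hypothetical scratch arena and tensor sizes, in bytes.
  int remaining = 64 * 1024;
  const int input_bytes = 16 * 1024;      // one input vector; not sliceable
  const int weights_bytes = 32 * 1024;    // whole weight matrix, reused per batch
  const int bias_bytes = 1 * 1024;
  const int bytes_per_output_channel = 4;

  remaining -= input_bytes;                 // 1) input
  remaining -= weights_bytes + bias_bytes;  // 2) weights & bias
  // 3) output: the leftover space determines how many output channels
  //    can be produced per weight slice.
  const int out_channels_per_slice = remaining / bytes_per_output_channel;
  std::printf("output channels per slice: %d\n", out_channels_per_slice);
  return 0;
}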
@@ -275,7 +275,7 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
     max_out_lines_for_input =
         (max_lines_in - kernel_height + 1) / stride_height;
   }
-  // Ten compute how many ouput lines fit into the output tensor.
+  // Then compute how many output lines fit into the output tensor.
   max_lines_out =
       std::min(out_height, static_cast<int>(out->capacity) / line_size_out);
   // the smallest of the two determines the slice height for the output, and
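The two formulas in this hunk can be sanity-checked with made-up numbers: how many output lines the buffered input can produce, versus how many fit into the output buffer; the smaller value becomes the output slice height.

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical values for the quantities used above.
  const int max_lines_in = 11;   // input lines that fit in the scratch buffer
  const int kernel_height = 3;
  const int stride_height = 2;
  const int out_height = 8;
  const int out_capacity_bytes = 6 * 128;
  const int line_size_out = 128;  // bytes per output line

  // Output lines the buffered input region can produce.
  const int max_out_lines_for_input =
      (max_lines_in - kernel_height + 1) / stride_height;        // (11-3+1)/2 = 4

  // Then how many output lines fit into the output tensor buffer.
  const int max_lines_out =
      std::min(out_height, out_capacity_bytes / line_size_out);  // min(8, 6) = 6

  // The smaller of the two determines the slice height for the output.
  std::printf("slice height = %d\n",
              std::min(max_out_lines_for_input, max_lines_out));
  return 0;
}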
@@ -141,7 +141,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   int output_width = output->dims->data[2];
   int output_height = output->dims->data[1];
 
-  // Dynimically allocate per-channel quantization parameters.
+  // Dynamically allocate per-channel quantization parameters.
   const int num_channels = filter->dims->data[kConvQuantizedDimension];
   data->per_channel_output_multiplier =
       static_cast<int32_t*>(context->AllocatePersistentBuffer(
@@ -127,7 +127,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Per channel quantization is only needed for int8_t inference. For other
   // quantized types, only a single scale and zero point is needed.
   const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
-  // Dynimically allocate per-channel quantization parameters.
+  // Dynamically allocate per-channel quantization parameters.
   data->per_channel_output_multiplier =
       reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
           context, num_channels * sizeof(int32_t)));
@@ -362,7 +362,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Per channel quantization is only needed for int8_t inference. For other
   // quantized types, only a single scale and zero point is needed.
   const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
-  // Dynimically allocate per-channel quantization parameters.
+  // Dynamically allocate per-channel quantization parameters.
   data->per_channel_output_multiplier =
       reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
           context, num_channels * sizeof(int32_t)));
@@ -157,7 +157,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   int output_width = output->dims->data[2];
   int output_height = output->dims->data[1];
 
-  // Dynimically allocate per-channel quantization parameters.
+  // Dynamically allocate per-channel quantization parameters.
   const int num_channels = filter->dims->data[kConvQuantizedDimension];
   TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
       context, num_channels * sizeof(int32_t),
@@ -145,7 +145,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Per channel quantization is only needed for int8_t inference. For other
   // quantized types, only a single scale and zero point is needed.
   const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
-  // Dynimically allocate per-channel quantization parameters.
+  // Dynamically allocate per-channel quantization parameters.
   TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
       context, num_channels * sizeof(int32_t),
       reinterpret_cast<void**>(&data->per_channel_output_multiplier)));
@@ -325,7 +325,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Per channel quantization is only needed for int8_t inference. For other
   // quantized types, only a single scale and zero point is needed.
   const int num_channels = filter->dims->data[kConvQuantizedDimension];
-  // Dynimically allocate per-channel quantization parameters.
+  // Dynamically allocate per-channel quantization parameters.
   op_data->per_channel_output_multiplier =
       reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
           context, num_channels * sizeof(int32_t)));
@@ -368,7 +368,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Per channel quantization is only needed for int8_t inference. For other
   // quantized types, only a single scale and zero point is needed.
   const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
-  // Dynimically allocate per-channel quantization parameters.
+  // Dynamically allocate per-channel quantization parameters.
   op_data->per_channel_output_multiplier =
       reinterpret_cast<int32_t*>(context->AllocatePersistentBuffer(
           context, num_channels * sizeof(int32_t)));