Merge pull request #44183 from foss-for-synopsys-dwc-arc-processors:arc_mli_build_fix

PiperOrigin-RevId: 338287124
Change-Id: I9d856ed6271df1f4eefd1dd83abcbdf2bf7cc7e2
Commit: 2fef5cc015
Author: TensorFlower Gardener
Date: 2020-10-21 11:06:05 -07:00
6 changed files with 76 additions and 63 deletions


@@ -66,6 +66,7 @@ struct OpData {
int32_t output_activation_max;
};
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
inline PaddingType RuntimePaddingType(TfLitePadding padding) {
switch (padding) {
case TfLitePadding::kTfLitePaddingSame:
@@ -77,6 +78,7 @@ inline PaddingType RuntimePaddingType(TfLitePadding padding) {
return PaddingType::kNone;
}
}
#endif
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
@@ -194,7 +196,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
data->output_zero_point = output->params.zero_point;
return kTfLiteOk;
} // namespace conv
}
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, const OpData& data,
@@ -259,10 +261,10 @@ TfLiteStatus EvalMliQuantizedPerChannel(
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
ops::micro::ConvertToMliTensor<int8_t>(input, &mli_in);
ops::micro::ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ops::micro::ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ops::micro::ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
@@ -313,14 +315,16 @@ TfLiteStatus EvalMliQuantizedPerChannel(
mli_tensor out_local = mli_out;
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(
TF_LITE_ENSURE_STATUS(ops::micro::get_arc_scratch_buffer_for_conv_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, kernel_height, cfg.stride_height,
cfg.padding_top, cfg.padding_bottom, &in_slice_height,
&out_slice_height));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels));
TF_LITE_ENSURE_STATUS(
ops::micro::arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension,
&slice_channels));
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
@@ -330,10 +334,12 @@ TfLiteStatus EvalMliQuantizedPerChannel(
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels);
TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, slice_channels);
TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels,
0, 0, 0, true);
ops::micro::TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension,
slice_channels);
ops::micro::TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension,
slice_channels);
ops::micro::TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension,
slice_channels, 0, 0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
@@ -352,15 +358,16 @@ TfLiteStatus EvalMliQuantizedPerChannel(
dimension. For that the sliceHeight has been calculated. The tensor slicer
is configured so that it will completely slice the nBatch dimension (0) and
slice the height dimension (1) in chunks of 'sliceHeight' */
TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height,
cfg.padding_top, cfg.padding_bottom, overlap);
ops::micro::TensorSlicer in_slice(&mli_in, height_dimension,
in_slice_height, cfg.padding_top,
cfg.padding_bottom, overlap);
/* output tensor is already sliced in the output channel dimension.
out_ch_slice.Sub() is the tensor for the number of output channels of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimension. */
TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
out_slice_height);
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
out_slice_height);
/* setup the pointers to the local or remote tensor to make the code
* inside the loop easier. */
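
Note on the first two hunks in the conv kernel above: RuntimePaddingType() is only used by the reference (non-MLI) code path, so the change fences it with TF_LITE_STRIP_REFERENCE_IMPL, presumably so that MLI-only builds, which strip the reference implementation, do not carry an unused helper. A minimal, self-contained sketch of that guard pattern follows; the enum and function names are simplified stand-ins, not the actual TFLM declarations.

```cpp
// Minimal sketch of the TF_LITE_STRIP_REFERENCE_IMPL guard pattern used above.
// PaddingType/RuntimePaddingType here are simplified stand-ins, not the real
// TFLM declarations.
#include <cstdio>

enum TfLitePaddingLike { kPaddingSame, kPaddingValid, kPaddingUnknown };
enum class PaddingType { kNone, kSame, kValid };

#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
// Only the reference kernel path needs this conversion; in an MLI-only build
// the guard removes it so the translation unit carries no unused helper.
inline PaddingType RuntimePaddingType(TfLitePaddingLike padding) {
  switch (padding) {
    case kPaddingSame:
      return PaddingType::kSame;
    case kPaddingValid:
      return PaddingType::kValid;
    default:
      return PaddingType::kNone;
  }
}
#endif

int main() {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
  std::printf("reference path, padding type = %d\n",
              static_cast<int>(RuntimePaddingType(kPaddingSame)));
#else
  std::printf("MLI-only build: reference-only helpers are compiled out\n");
#endif
  return 0;
}
```

Compiling the sketch with -DTF_LITE_STRIP_REFERENCE_IMPL drops the helper entirely and takes the second branch in main(), which mirrors what the guarded block above does for the ARC MLI build.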


@@ -242,10 +242,10 @@ TfLiteStatus EvalMliQuantizedPerChannel(
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
ops::micro::ConvertToMliTensor<int8_t>(input, &mli_in);
ops::micro::ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ops::micro::ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ops::micro::ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
@@ -301,7 +301,7 @@ TfLiteStatus EvalMliQuantizedPerChannel(
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(
TF_LITE_ENSURE_STATUS(ops::micro::get_arc_scratch_buffer_for_conv_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
@@ -311,10 +311,10 @@ TfLiteStatus EvalMliQuantizedPerChannel(
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top,
cfg.padding_bottom, &inSliceHeight, &outSliceHeight));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels));
/* if input channels is not equal to output channels, a channel multiplier
@@ -324,13 +324,14 @@ TfLiteStatus EvalMliQuantizedPerChannel(
slice_channels = (slice_channels / in_channels) * in_channels;
}
TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels);
TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0,
0, 0, true);
TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels,
0, 0, 0, true);
TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0,
0, 0, true);
ops::micro::TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension,
slice_channels);
ops::micro::TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension,
slice_channels, 0, 0, 0, true);
ops::micro::TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension,
slice_channels, 0, 0, 0, true);
ops::micro::TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension,
slice_channels, 0, 0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
@@ -355,14 +356,16 @@ TfLiteStatus EvalMliQuantizedPerChannel(
the sliceHeight has been calculated. The tensor slicer is configured so that
it will completely slice the nBatch dimension (0) and slice the height
dimension (1) in chunks of 'sliceHeight' */
TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight,
padding_top, padding_bottom, overlap);
ops::micro::TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension,
inSliceHeight, padding_top,
padding_bottom, overlap);
/* output tensor is already sliced in the output channel dimension.
out_ch_slice.Sub() is the tensor for the number of output channels of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimension. */
TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight);
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension,
outSliceHeight);
/* setup the pointers to the local or remote tensor to make the code
* inside the loop easier. */
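
One line in the depthwise hunks above is worth unpacking: when input and output channel counts differ (a channel multiplier is in use), the candidate slice size is rounded down with `slice_channels = (slice_channels / in_channels) * in_channels;` so that each weight slice spans a whole multiple of the input channel count. A small standalone illustration of that integer arithmetic, with made-up numbers:

```cpp
// Illustration of the slice_channels rounding in the depthwise hunks above.
// The numbers are invented for the example; only the arithmetic mirrors the
// `slice_channels = (slice_channels / in_channels) * in_channels;` line.
#include <cstdio>

int RoundSliceToInputChannels(int slice_channels, int in_channels) {
  // Integer division truncates, so the result is the largest multiple of
  // in_channels that does not exceed the candidate slice size.
  return (slice_channels / in_channels) * in_channels;
}

int main() {
  const int in_channels = 8;  // hypothetical input channel count
  int slice_channels = 20;    // hypothetical size that fits the scratch buffer

  slice_channels = RoundSliceToInputChannels(slice_channels, in_channels);
  // Prints 16: the slice shrinks to a whole multiple of the input channels.
  std::printf("rounded slice_channels = %d\n", slice_channels);
  return 0;
}
```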


@@ -29,9 +29,6 @@ limitations under the License.
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace fully_connected {
namespace {
struct OpData {
@@ -127,10 +124,10 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
mli_tensor mli_bias = {};
mli_tensor mli_out = {};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(filter, &mli_weights);
ConvertToMliTensor<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
ops::micro::ConvertToMliTensor<int8_t>(input, &mli_in);
ops::micro::ConvertToMliTensor<int8_t>(filter, &mli_weights);
ops::micro::ConvertToMliTensor<int32_t>(bias, &mli_bias);
ops::micro::ConvertToMliTensor<int8_t>(output, &mli_out);
/* The input tensor can have more than 2 dimensions. For the compute this
doesn't make any difference because all the inputs of a batch entry will
@@ -156,9 +153,10 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
int slice_size = mli_weights.shape[weight_out_dimension];
/* allocate the local buffers, and compute the slice size */
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
TF_LITE_ENSURE_STATUS(
ops::micro::get_arc_scratch_buffer_for_fully_connect_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_dimension, &slice_size));
int max_out_slice_size =
out_local.capacity / mli_hlp_tensor_element_size(&out_local);
@@ -172,10 +170,11 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size);
TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size);
TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0,
true);
ops::micro::TensorSlicer w_slice(&mli_weights, weight_out_dimension,
slice_size);
ops::micro::TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size);
ops::micro::TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension,
slice_size, 0, 0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
@@ -188,15 +187,15 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
// Slice the input over the batches (one at a time with the size of a
// complete input)
TensorSlicer in_slice(&mli_in, input_size_dimension,
mli_in.shape[input_size_dimension]);
ops::micro::TensorSlicer in_slice(&mli_in, input_size_dimension,
mli_in.shape[input_size_dimension]);
/* output tensor is already sliced in the output size dimension.
out_ch_slice.Sub() is the tensor for the portion of the output size of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch */
TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension,
slice_size);
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension,
slice_size);
/* setup the pointers to the local or remote tensor to make the code
* inside the loop easier. */
@@ -359,19 +358,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteOk;
}
} // namespace fully_connected
TfLiteRegistration Register_FULLY_CONNECTED() {
return {/*init=*/fully_connected::Init,
return {/*init=*/Init,
/*free=*/nullptr,
/*prepare=*/fully_connected::Prepare,
/*invoke=*/fully_connected::Eval,
/*prepare=*/Prepare,
/*invoke=*/Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace micro
} // namespace ops
} // namespace tflite
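
The fully-connected hunks above (like the conv and depthwise ones earlier) drop the enclosing `namespace ops { namespace micro { ... } }` wrappers and instead reach the shared ARC MLI helpers (ConvertToMliTensor, TensorSlicer, the scratch-buffer routines) through explicit ops::micro:: qualification. Once the kernel code no longer lives inside tflite::ops::micro, unqualified lookup cannot find names that are still declared there, hence the qualified calls. A compilable toy sketch of the same situation; every name below is invented for illustration:

```cpp
// Toy reconstruction of the namespace situation; all names are invented.
#include <cstdio>

namespace tflite {
namespace ops {
namespace micro {
// Stand-in for shared ARC helpers that still live in tflite::ops::micro
// (ConvertToMliTensor, TensorSlicer, scratch-buffer functions, ...).
inline void ConvertToMliTensorLike(int value) {
  std::printf("converted %d\n", value);
}
}  // namespace micro
}  // namespace ops

namespace {
// The kernel code now sits in tflite:: (anonymous namespace) rather than
// tflite::ops::micro::fully_connected, so an unqualified call to
// ConvertToMliTensorLike would not be found; it must be qualified explicitly.
void EvalLike(int value) { ops::micro::ConvertToMliTensorLike(value); }
}  // namespace

void RunExampleKernel() { EvalLike(42); }
}  // namespace tflite

int main() {
  tflite::RunExampleKernel();
  return 0;
}
```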


@@ -25,13 +25,13 @@ TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim,
int slice_size, int padding_pre, int padding_post,
int overlap, bool interleave_mode)
: full_tensor_(full_tensor),
sub_tensor_{},
sub_cfg_{},
done_(false),
sliceDim_(slice_dim),
pad_pre_(padding_pre),
pad_post_(padding_post),
overlap_(overlap),
sub_cfg_{},
sub_tensor_{},
done_(false) {
overlap_(overlap) {
/* In the interleave mode, the slicing happens from the deepest dimension up
to the slice_dim. For example, in an HWC layout this mode can be used to
slice in the C dimension. In this mode the data is not contiguous in memory
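
The TensorSlicer hunk above only reorders the constructor's member-initializer list; presumably the new order matches the order in which the members are declared in the class, which silences GCC/Clang's -Wreorder warning (fatal under -Werror): members are always initialized in declaration order, regardless of how the initializer list is written. A minimal reproduction with an illustrative class, not the real TensorSlicer:

```cpp
// Compile with: g++ -Wall -Werror -c reorder_example.cc
// The commented-out constructor lists done_ before the other members even
// though it is declared last; -Wreorder flags that, because members are
// initialized in declaration order no matter how the list is written.
// Class and member names are illustrative, not the real TensorSlicer.
struct SlicerLike {
  int slice_dim_;
  int pad_pre_;
  int pad_post_;
  int overlap_;
  bool done_;

  // Would warn under -Wreorder (and fail under -Werror):
  //   SlicerLike(int dim, int pre, int post, int overlap)
  //       : done_(false), slice_dim_(dim), pad_pre_(pre),
  //         pad_post_(post), overlap_(overlap) {}

  // Fixed: the initializer list follows the declaration order above.
  SlicerLike(int dim, int pre, int post, int overlap)
      : slice_dim_(dim),
        pad_pre_(pre),
        pad_post_(post),
        overlap_(overlap),
        done_(false) {}
};

int main() {
  SlicerLike s(1, 0, 0, 2);
  return s.done_ ? 1 : 0;
}
```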


@@ -123,6 +123,10 @@ endif
CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS))
CCFLAGS := $(filter-out -std=c11,$(CCFLAGS))
ldflags_to_remove = -Wl,--fatal-warnings -Wl,--gc-sections
LDFLAGS := $(filter-out $(ldflags_to_remove),$(LDFLAGS))
MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS))
CXXFLAGS += $(PLATFORM_FLAGS)


@@ -87,6 +87,10 @@ ifeq ($(TARGET), himax_we1_evb)
CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS))
CCFLAGS := $(filter-out -std=c11,$(CCFLAGS))
ldflags_to_remove = -Wl,--fatal-warnings -Wl,--gc-sections
LDFLAGS := $(filter-out $(ldflags_to_remove),$(LDFLAGS))
MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS))
endif