Merge pull request #44183 from foss-for-synopsys-dwc-arc-processors:arc_mli_build_fix

PiperOrigin-RevId: 338287124
Change-Id: I9d856ed6271df1f4eefd1dd83abcbdf2bf7cc7e2
Commit: 2fef5cc015
Author: TensorFlower Gardener
Date: 2020-10-21 11:06:05 -07:00
6 changed files with 76 additions and 63 deletions


@@ -66,6 +66,7 @@ struct OpData {
int32_t output_activation_max;
};
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
inline PaddingType RuntimePaddingType(TfLitePadding padding) {
switch (padding) {
case TfLitePadding::kTfLitePaddingSame:
@@ -77,6 +78,7 @@ inline PaddingType RuntimePaddingType(TfLitePadding padding) {
return PaddingType::kNone;
}
}
#endif
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
@@ -194,7 +196,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
data->output_zero_point = output->params.zero_point;
return kTfLiteOk;
} // namespace conv
}
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, const OpData& data,
@@ -259,10 +261,10 @@ TfLiteStatus EvalMliQuantizedPerChannel(
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
ops::micro::ConvertToMliTensor<int8_t>(input, &mli_in);
ops::micro::ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ops::micro::ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ops::micro::ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
@@ -313,14 +315,16 @@ TfLiteStatus EvalMliQuantizedPerChannel(
mli_tensor out_local = mli_out;
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(
TF_LITE_ENSURE_STATUS(ops::micro::get_arc_scratch_buffer_for_conv_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, kernel_height, cfg.stride_height,
cfg.padding_top, cfg.padding_bottom, &in_slice_height,
&out_slice_height));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels));
TF_LITE_ENSURE_STATUS(
ops::micro::arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension,
&slice_channels));
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
@@ -330,10 +334,12 @@ TfLiteStatus EvalMliQuantizedPerChannel(
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels);
TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, slice_channels);
TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels,
0, 0, 0, true);
ops::micro::TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension,
slice_channels);
ops::micro::TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension,
slice_channels);
ops::micro::TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension,
slice_channels, 0, 0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
@@ -352,15 +358,16 @@ TfLiteStatus EvalMliQuantizedPerChannel(
dimension. For that the sliceHeight has been calculated. The tensor slicer
is configured so that it will completely slice the nBatch dimension (0) and
slice the height dimension (1) in chunks of 'sliceHeight' */
TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height,
cfg.padding_top, cfg.padding_bottom, overlap);
ops::micro::TensorSlicer in_slice(&mli_in, height_dimension,
in_slice_height, cfg.padding_top,
cfg.padding_bottom, overlap);
/* output tensor is already sliced in the output channel dimension.
out_ch_slice.Sub() is the tensor for the number of output channels of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimension. */
TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
out_slice_height);
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
out_slice_height);
/* setup the pointers to the local or remote tensor to make the code
* inside the loop easier. */
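
Note on the first two hunks in the conv kernel above: RuntimePaddingType() is only used by the reference (non-MLI) code path, so the change fences it with TF_LITE_STRIP_REFERENCE_IMPL, presumably so that MLI-only builds, which strip the reference implementation, do not carry an unused helper. A minimal, self-contained sketch of that guard pattern follows; the enum and function names are simplified stand-ins, not the actual TFLM declarations.

```cpp
// Minimal sketch of the TF_LITE_STRIP_REFERENCE_IMPL guard pattern used above.
// PaddingType/RuntimePaddingType here are simplified stand-ins, not the real
// TFLM declarations.
#include <cstdio>

enum TfLitePaddingLike { kPaddingSame, kPaddingValid, kPaddingUnknown };
enum class PaddingType { kNone, kSame, kValid };

#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
// Only the reference kernel path needs this conversion; in an MLI-only build
// the guard removes it so the translation unit carries no unused helper.
inline PaddingType RuntimePaddingType(TfLitePaddingLike padding) {
  switch (padding) {
    case kPaddingSame:
      return PaddingType::kSame;
    case kPaddingValid:
      return PaddingType::kValid;
    default:
      return PaddingType::kNone;
  }
}
#endif

int main() {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
  std::printf("reference path, padding type = %d\n",
              static_cast<int>(RuntimePaddingType(kPaddingSame)));
#else
  std::printf("MLI-only build: reference-only helpers are compiled out\n");
#endif
  return 0;
}
```

Compiling the sketch with -DTF_LITE_STRIP_REFERENCE_IMPL drops the helper entirely and takes the second branch in main(), which mirrors what the guarded block above does for the ARC MLI build.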


@@ -242,10 +242,10 @@ TfLiteStatus EvalMliQuantizedPerChannel(
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
ops::micro::ConvertToMliTensor<int8_t>(input, &mli_in);
ops::micro::ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ops::micro::ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ops::micro::ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
@@ -301,7 +301,7 @@ TfLiteStatus EvalMliQuantizedPerChannel(
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(
TF_LITE_ENSURE_STATUS(ops::micro::get_arc_scratch_buffer_for_conv_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
@@ -311,10 +311,10 @@ TfLiteStatus EvalMliQuantizedPerChannel(
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top,
cfg.padding_bottom, &inSliceHeight, &outSliceHeight));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels));
/* if input channels is not equal to output channels, a channel multiplier
@@ -324,13 +324,14 @@ TfLiteStatus EvalMliQuantizedPerChannel(
slice_channels = (slice_channels / in_channels) * in_channels;
}
TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels);
TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0,
0, 0, true);
TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels,
0, 0, 0, true);
TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0,
0, 0, true);
ops::micro::TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension,
slice_channels);
ops::micro::TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension,
slice_channels, 0, 0, 0, true);
ops::micro::TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension,
slice_channels, 0, 0, 0, true);
ops::micro::TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension,
slice_channels, 0, 0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
@@ -355,14 +356,16 @@ TfLiteStatus EvalMliQuantizedPerChannel(
the sliceHeight has been calculated. The tensor slicer is configured so that
it will completely slice the nBatch dimension (0) and slice the height
dimension (1) in chunks of 'sliceHeight' */
TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight,
padding_top, padding_bottom, overlap);
ops::micro::TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension,
inSliceHeight, padding_top,
padding_bottom, overlap);
/* output tensor is already sliced in the output channel dimension.
out_ch_slice.Sub() is the tensor for the number of output channels of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimension. */
TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight);
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension,
outSliceHeight);
/* setup the pointers to the local or remote tensor to make the code
* inside the loop easier. */
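
One line in the depthwise hunks above is worth unpacking: when input and output channel counts differ (a channel multiplier is in use), the candidate slice size is rounded down with `slice_channels = (slice_channels / in_channels) * in_channels;` so that each weight slice spans a whole multiple of the input channel count. A small standalone illustration of that integer arithmetic, with made-up numbers:

```cpp
// Illustration of the slice_channels rounding in the depthwise hunks above.
// The numbers are invented for the example; only the arithmetic mirrors the
// `slice_channels = (slice_channels / in_channels) * in_channels;` line.
#include <cstdio>

int RoundSliceToInputChannels(int slice_channels, int in_channels) {
  // Integer division truncates, so the result is the largest multiple of
  // in_channels that does not exceed the candidate slice size.
  return (slice_channels / in_channels) * in_channels;
}

int main() {
  const int in_channels = 8;  // hypothetical input channel count
  int slice_channels = 20;    // hypothetical size that fits the scratch buffer

  slice_channels = RoundSliceToInputChannels(slice_channels, in_channels);
  // Prints 16: the slice shrinks to a whole multiple of the input channels.
  std::printf("rounded slice_channels = %d\n", slice_channels);
  return 0;
}
```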


@@ -29,9 +29,6 @@ limitations under the License.
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace fully_connected {
namespace {
struct OpData {
@@ -127,10 +124,10 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
mli_tensor mli_bias = {};
mli_tensor mli_out = {};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(filter, &mli_weights);
ConvertToMliTensor<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
ops::micro::ConvertToMliTensor<int8_t>(input, &mli_in);
ops::micro::ConvertToMliTensor<int8_t>(filter, &mli_weights);
ops::micro::ConvertToMliTensor<int32_t>(bias, &mli_bias);
ops::micro::ConvertToMliTensor<int8_t>(output, &mli_out);
/* The input tensor can have more than 2 dimensions. For the compute this
doesn't make any difference because all the inputs of a batch entry will
@@ -156,9 +153,10 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
int slice_size = mli_weights.shape[weight_out_dimension];
/* allocate the local buffers, and compute the slice size */
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
TF_LITE_ENSURE_STATUS(
ops::micro::get_arc_scratch_buffer_for_fully_connect_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_dimension, &slice_size));
int max_out_slice_size =
out_local.capacity / mli_hlp_tensor_element_size(&out_local);
@@ -172,10 +170,11 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size);
TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size);
TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0,
true);
ops::micro::TensorSlicer w_slice(&mli_weights, weight_out_dimension,
slice_size);
ops::micro::TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size);
ops::micro::TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension,
slice_size, 0, 0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
@@ -188,15 +187,15 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
// Slice the input over the batches (one at a time with the size of a
// complete input)
TensorSlicer in_slice(&mli_in, input_size_dimension,
mli_in.shape[input_size_dimension]);
ops::micro::TensorSlicer in_slice(&mli_in, input_size_dimension,
mli_in.shape[input_size_dimension]);
/* output tensor is already sliced in the output size dimension.
out_ch_slice.Sub() is the tensor for the portion of the output size of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch */
TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension,
slice_size);
ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension,
slice_size);
/* setup the pointers to the local or remote tensor to make the code
* inside the loop easier. */
@@ -359,19 +358,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteOk;
}
} // namespace fully_connected
TfLiteRegistration Register_FULLY_CONNECTED() {
return {/*init=*/fully_connected::Init,
return {/*init=*/Init,
/*free=*/nullptr,
/*prepare=*/fully_connected::Prepare,
/*invoke=*/fully_connected::Eval,
/*prepare=*/Prepare,
/*invoke=*/Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace micro
} // namespace ops
} // namespace tflite
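
The fully-connected hunks above (like the conv and depthwise ones earlier) drop the enclosing `namespace ops { namespace micro { ... } }` wrappers and instead reach the shared ARC MLI helpers (ConvertToMliTensor, TensorSlicer, the scratch-buffer routines) through explicit ops::micro:: qualification. Once the kernel code no longer lives inside tflite::ops::micro, unqualified lookup cannot find names that are still declared there, hence the qualified calls. A compilable toy sketch of the same situation; every name below is invented for illustration:

```cpp
// Toy reconstruction of the namespace situation; all names are invented.
#include <cstdio>

namespace tflite {
namespace ops {
namespace micro {
// Stand-in for shared ARC helpers that still live in tflite::ops::micro
// (ConvertToMliTensor, TensorSlicer, scratch-buffer functions, ...).
inline void ConvertToMliTensorLike(int value) {
  std::printf("converted %d\n", value);
}
}  // namespace micro
}  // namespace ops

namespace {
// The kernel code now sits in tflite:: (anonymous namespace) rather than
// tflite::ops::micro::fully_connected, so an unqualified call to
// ConvertToMliTensorLike would not be found; it must be qualified explicitly.
void EvalLike(int value) { ops::micro::ConvertToMliTensorLike(value); }
}  // namespace

void RunExampleKernel() { EvalLike(42); }
}  // namespace tflite

int main() {
  tflite::RunExampleKernel();
  return 0;
}
```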


@@ -25,13 +25,13 @@ TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim,
int slice_size, int padding_pre, int padding_post,
int overlap, bool interleave_mode)
: full_tensor_(full_tensor),
sub_tensor_{},
sub_cfg_{},
done_(false),
sliceDim_(slice_dim),
pad_pre_(padding_pre),
pad_post_(padding_post),
overlap_(overlap),
sub_cfg_{},
sub_tensor_{},
done_(false) {
overlap_(overlap) {
/* In the interleave mode, the slicing happens from the deepest dimension up
to the slice_dim. For example, in an HWC layout this mode can be used to
slice in the C dimension. In this mode the data is not contiguous in memory
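
The TensorSlicer hunk above only reorders the constructor's member-initializer list; presumably the new order matches the order in which the members are declared in the class, which silences GCC/Clang's -Wreorder warning (fatal under -Werror): members are always initialized in declaration order, regardless of how the initializer list is written. A minimal reproduction with an illustrative class, not the real TensorSlicer:

```cpp
// Compile with: g++ -Wall -Werror -c reorder_example.cc
// The commented-out constructor lists done_ before the other members even
// though it is declared last; -Wreorder flags that, because members are
// initialized in declaration order no matter how the list is written.
// Class and member names are illustrative, not the real TensorSlicer.
struct SlicerLike {
  int slice_dim_;
  int pad_pre_;
  int pad_post_;
  int overlap_;
  bool done_;

  // Would warn under -Wreorder (and fail under -Werror):
  //   SlicerLike(int dim, int pre, int post, int overlap)
  //       : done_(false), slice_dim_(dim), pad_pre_(pre),
  //         pad_post_(post), overlap_(overlap) {}

  // Fixed: the initializer list follows the declaration order above.
  SlicerLike(int dim, int pre, int post, int overlap)
      : slice_dim_(dim),
        pad_pre_(pre),
        pad_post_(post),
        overlap_(overlap),
        done_(false) {}
};

int main() {
  SlicerLike s(1, 0, 0, 2);
  return s.done_ ? 1 : 0;
}
```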


@@ -123,6 +123,10 @@ endif
CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS))
CCFLAGS := $(filter-out -std=c11,$(CCFLAGS))
ldflags_to_remove = -Wl,--fatal-warnings -Wl,--gc-sections
LDFLAGS := $(filter-out $(ldflags_to_remove),$(LDFLAGS))
MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS))
CXXFLAGS += $(PLATFORM_FLAGS)


@@ -87,6 +87,10 @@ ifeq ($(TARGET), himax_we1_evb)
CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS))
CCFLAGS := $(filter-out -std=c11,$(CCFLAGS))
ldflags_to_remove = -Wl,--fatal-warnings -Wl,--gc-sections
LDFLAGS := $(filter-out $(ldflags_to_remove),$(LDFLAGS))
MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS))
endif