From a1612c0da602c2b5016f56ad581f918e06ce707c Mon Sep 17 00:00:00 2001 From: gerbauz Date: Tue, 20 Oct 2020 19:32:47 +0300 Subject: [PATCH] Switch conv, depthwise_conv and fully_connected kernels to flat namespace. Fixed code according to warning options (-Wreorder, -Wunused-function). Fixed filters for new default linker options. This is incremental progress towards a flat namespace for TFLM. See https://abseil.io/tips/130 for more context. --- tensorflow/lite/micro/kernels/arc_mli/conv.cc | 41 ++++++++++------- .../micro/kernels/arc_mli/depthwise_conv.cc | 37 ++++++++------- .../micro/kernels/arc_mli/fully_connected.cc | 45 +++++++++---------- .../lite/micro/kernels/arc_mli/mli_slicers.cc | 8 ++-- .../tools/make/targets/arc/arc_common.inc | 4 ++ .../make/targets/himax_we1_evb_makefile.inc | 4 ++ 6 files changed, 76 insertions(+), 63 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv.cc b/tensorflow/lite/micro/kernels/arc_mli/conv.cc index 55ef2650bef..4522421fa56 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv.cc @@ -66,6 +66,7 @@ struct OpData { int32_t output_activation_max; }; +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) inline PaddingType RuntimePaddingType(TfLitePadding padding) { switch (padding) { case TfLitePadding::kTfLitePaddingSame: @@ -77,6 +78,7 @@ inline PaddingType RuntimePaddingType(TfLitePadding padding) { return PaddingType::kNone; } } +#endif bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, @@ -194,7 +196,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data->output_zero_point = output->params.zero_point; return kTfLiteOk; -} // namespace conv +} void EvalQuantized(TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params, const OpData& data, @@ -259,10 +261,10 @@ TfLiteStatus EvalMliQuantizedPerChannel( mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point; mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point; - ConvertToMliTensor(input, &mli_in); - ConvertToMliTensorPerChannel(filter, &mli_weights); - ConvertToMliTensorPerChannel(bias, &mli_bias); - ConvertToMliTensor(output, &mli_out); + ops::micro::ConvertToMliTensor(input, &mli_in); + ops::micro::ConvertToMliTensorPerChannel(filter, &mli_weights); + ops::micro::ConvertToMliTensorPerChannel(bias, &mli_bias); + ops::micro::ConvertToMliTensor(output, &mli_out); if (params->activation == kTfLiteActRelu) { cfg.relu.type = MLI_RELU_GEN; @@ -313,14 +315,16 @@ TfLiteStatus EvalMliQuantizedPerChannel( mli_tensor out_local = mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(&copy_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors( + TF_LITE_ENSURE_STATUS(ops::micro::get_arc_scratch_buffer_for_conv_tensors( context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( + TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_io( &in_local, &out_local, kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, &out_slice_height)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights( - &weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); + TF_LITE_ENSURE_STATUS( + ops::micro::arc_scratch_buffer_calc_slice_size_weights( + &weights_local, &bias_local, weight_out_ch_dimension, + &slice_channels)); /* is_local indicates that the tensor is already in local memory, so in that case
the original tensor can be used, @@ -330,10 +334,12 @@ TfLiteStatus EvalMliQuantizedPerChannel( const bool w_is_local = weights_local.data == mli_weights.data; const bool b_is_local = bias_local.data == mli_bias.data; - TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels); - TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, slice_channels); - TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, - 0, 0, 0, true); + ops::micro::TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, + slice_channels); + ops::micro::TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, + slice_channels); + ops::micro::TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, + slice_channels, 0, 0, 0, true); mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local; mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local; @@ -352,15 +358,16 @@ TfLiteStatus EvalMliQuantizedPerChannel( dimension. for that the sliceHeight has been calculated. The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, - cfg.padding_top, cfg.padding_bottom, overlap); + ops::micro::TensorSlicer in_slice(&mli_in, height_dimension, + in_slice_height, cfg.padding_top, + cfg.padding_bottom, overlap); /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and height dimension. */ - TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension, - out_slice_height); + ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension, + out_slice_height); /* setup the pointers to the local or remote tensor to make the code * inside the loop easier. 
*/ diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index d30a5308708..8fe5d307cdd 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -242,10 +242,10 @@ TfLiteStatus EvalMliQuantizedPerChannel( mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point; mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point; - ConvertToMliTensor(input, &mli_in); - ConvertToMliTensorPerChannel(filter, &mli_weights); - ConvertToMliTensorPerChannel(bias, &mli_bias); - ConvertToMliTensor(output, &mli_out); + ops::micro::ConvertToMliTensor(input, &mli_in); + ops::micro::ConvertToMliTensorPerChannel(filter, &mli_weights); + ops::micro::ConvertToMliTensorPerChannel(bias, &mli_bias); + ops::micro::ConvertToMliTensor(output, &mli_out); if (params->activation == kTfLiteActRelu) { cfg.relu.type = MLI_RELU_GEN; @@ -301,7 +301,7 @@ TfLiteStatus EvalMliQuantizedPerChannel( mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(&copy_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors( + TF_LITE_ENSURE_STATUS(ops::micro::get_arc_scratch_buffer_for_conv_tensors( context, &in_local, &weights_local, &bias_local, &out_local)); /* is_local indicates that the tensor is already in local memory, so in that case the original tensor can be used, @@ -311,10 +311,10 @@ TfLiteStatus EvalMliQuantizedPerChannel( const bool w_is_local = weights_local.data == mli_weights.data; const bool b_is_local = bias_local.data == mli_bias.data; - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( + TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_io( &in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights( + TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_weights( &weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); /* if input channels is not equal to output channels, a channel multiplier @@ -324,13 +324,14 @@ TfLiteStatus EvalMliQuantizedPerChannel( slice_channels = (slice_channels / in_channels) * in_channels; } - TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels); - TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0, - 0, 0, true); - TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, - 0, 0, 0, true); - TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0, - 0, 0, true); + ops::micro::TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, + slice_channels); + ops::micro::TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, + slice_channels, 0, 0, 0, true); + ops::micro::TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, + slice_channels, 0, 0, 0, true); + ops::micro::TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, + slice_channels, 0, 0, 0, true); mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local; mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local; @@ -355,14 +356,16 @@ TfLiteStatus EvalMliQuantizedPerChannel( the sliceHeight has been calculated.
The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) in chunks of 'sliceHeight' */ - TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, - padding_top, padding_bottom, overlap); + ops::micro::TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, + inSliceHeight, padding_top, + padding_bottom, overlap); /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and height dimension. */ - TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); + ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, + outSliceHeight); /* setup the pointers to the local or remote tensor to make the code * inside the loop easier. */ diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 2d201653efc..ea5c6c6eaf3 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -29,9 +29,6 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" namespace tflite { -namespace ops { -namespace micro { -namespace fully_connected { namespace { struct OpData { @@ -127,10 +124,10 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node, mli_tensor mli_bias = {}; mli_tensor mli_out = {}; - ConvertToMliTensor(input, &mli_in); - ConvertToMliTensor(filter, &mli_weights); - ConvertToMliTensor(bias, &mli_bias); - ConvertToMliTensor(output, &mli_out); + ops::micro::ConvertToMliTensor(input, &mli_in); + ops::micro::ConvertToMliTensor(filter, &mli_weights); + ops::micro::ConvertToMliTensor(bias, &mli_bias); + ops::micro::ConvertToMliTensor(output, &mli_out); /* The input tensor can have more than 2 dimensions. 
for the compute this doesn't make any difference because all the inputs or a batch entry will @@ -156,9 +153,10 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node, int slice_size = mli_weights.shape[weight_out_dimension]; /* allocate the local buffers, and compute the slice size */ - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors( - context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights( + TF_LITE_ENSURE_STATUS( + ops::micro::get_arc_scratch_buffer_for_fully_connect_tensors( + context, &in_local, &weights_local, &bias_local, &out_local)); + TF_LITE_ENSURE_STATUS(ops::micro::arc_scratch_buffer_calc_slice_size_weights( &weights_local, &bias_local, weight_out_dimension, &slice_size)); int max_out_slice_size = out_local.capacity / mli_hlp_tensor_element_size(&out_local); @@ -172,10 +170,11 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node, const bool w_is_local = weights_local.data == mli_weights.data; const bool b_is_local = bias_local.data == mli_bias.data; - TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size); - TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size); - TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0, - true); + ops::micro::TensorSlicer w_slice(&mli_weights, weight_out_dimension, + slice_size); + ops::micro::TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size); + ops::micro::TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, + slice_size, 0, 0, 0, true); mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local; mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local; @@ -188,15 +187,15 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node, // Slice the input over the batches (one at a time with the size of a // complete input) - TensorSlicer in_slice(&mli_in, input_size_dimension, - mli_in.shape[input_size_dimension]); + ops::micro::TensorSlicer in_slice(&mli_in, input_size_dimension, + mli_in.shape[input_size_dimension]); /* output tensor is alreade sliced in the output size dimension. out_ch_slice.Sub() is the tensor for the amount of output size of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch */ - TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension, - slice_size); + ops::micro::TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension, + slice_size); /* setup the pointers to the local or remote tensor to make the code * inside the loop easier. 
*/ @@ -359,19 +358,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -} // namespace fully_connected - TfLiteRegistration Register_FULLY_CONNECTED() { - return {/*init=*/fully_connected::Init, + return {/*init=*/Init, /*free=*/nullptr, - /*prepare=*/fully_connected::Prepare, - /*invoke=*/fully_connected::Eval, + /*prepare=*/Prepare, + /*invoke=*/Eval, /*profiling_string=*/nullptr, /*builtin_code=*/0, /*custom_name=*/nullptr, /*version=*/0}; } -} // namespace micro -} // namespace ops } // namespace tflite diff --git a/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc index e20eea22a03..905c6fedf9d 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc @@ -25,13 +25,13 @@ TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre, int padding_post, int overlap, bool interleave_mode) : full_tensor_(full_tensor), + sub_tensor_{}, + sub_cfg_{}, + done_(false), sliceDim_(slice_dim), pad_pre_(padding_pre), pad_post_(padding_post), - overlap_(overlap), - sub_cfg_{}, - sub_tensor_{}, - done_(false) { + overlap_(overlap) { /* In the interleave mode, the slicing happens from the deepest dimension up to the slice_dim for example in an HWC layout this can mode can be used to slice in the C dimenstion. in this mode the data is not contiguous in memory diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index 28c0fcd8571..c396c1076f3 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -123,6 +123,10 @@ endif CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS)) CCFLAGS := $(filter-out -std=c11,$(CCFLAGS)) + + ldflags_to_remove = -Wl,--fatal-warnings -Wl,--gc-sections + LDFLAGS := $(filter-out $(ldflags_to_remove),$(LDFLAGS)) + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) CXXFLAGS += $(PLATFORM_FLAGS) diff --git a/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc b/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc index d19ce680b41..11c39867e31 100644 --- a/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/himax_we1_evb_makefile.inc @@ -87,6 +87,10 @@ ifeq ($(TARGET), himax_we1_evb) CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS)) CCFLAGS := $(filter-out -std=c11,$(CCFLAGS)) + + ldflags_to_remove = -Wl,--fatal-warnings -Wl,--gc-sections + LDFLAGS := $(filter-out $(ldflags_to_remove),$(LDFLAGS)) + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) endif
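
Note on the warning fixes bundled into this patch: C++ initializes non-static data members in declaration order, not in the order they appear in a constructor's member initializer list, so the mli_slicers.cc hunk simply rewrites TensorSlicer's initializer list to follow the order in which the members are declared; that is all -Wreorder checks for. Below is a minimal sketch of the rule, using a made-up Slicer class rather than the real TensorSlicer declaration:

#include <iostream>

// Non-static data members are initialized in declaration order (size_
// first, then remaining_), regardless of how the initializer list is
// written. Listing them in that same order keeps the code honest and
// silences -Wreorder.
class Slicer {
 public:
  explicit Slicer(int size) : size_(size), remaining_(size_) {}
  int remaining() const { return remaining_; }

  // Writing ": remaining_(size_), size_(size)" would still initialize
  // size_ first and would only draw a -Wreorder warning, but if the two
  // declarations below were swapped, remaining_ would be initialized
  // before size_ and would read an indeterminate value.

 private:
  int size_;
  int remaining_;
};

int main() {
  std::cout << Slicer(8).remaining() << "\n";  // prints 8
  return 0;
}

The -Wunused-function part is handled in conv.cc by compiling RuntimePaddingType() only when TF_LITE_STRIP_REFERENCE_IMPL is not defined, since the helper is only needed when the reference implementation is compiled in. The two makefile hunks extend the filter-out pattern already used for -std=c++11/-std=c11 so that the new default -Wl,--fatal-warnings and -Wl,--gc-sections linker flags are also stripped for the ARC targets.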