Cadence: HiFi4 Neural Network (NN) source download and fix for Softmax compilation issue

HiFi4 Neural Network (NN) source download. Fixed compilation issue with softmax.cc. Signed-off-by: Prasad Nikam pnikam@cadence.com; Signed-off-by: Niranjan Yadla nyadla@cadence.com
2020-03-27 14:18:28 -07:00 · 2020-03-27 14:18:28 -07:00 · 2722af9700
commit 2722af9700
parent 22325a5c7d
4 changed files with 92 additions and 176 deletions
--- a/tensorflow/lite/micro/kernels/xtensa_hifi/softmax.cc
+++ b/tensorflow/lite/micro/kernels/xtensa_hifi/softmax.cc
@@ -19,7 +19,7 @@
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ******************************************************************************/

-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -40,39 +40,34 @@ limitations under the License.
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
-#include "xtensa_tf_micro_common.h"

+#include "xtensa_tf_micro_common.h"
 namespace tflite {
 namespace ops {
 namespace micro {
 namespace activations {
 namespace {

-struct OpData {
-  int32_t input_multiplier = 0;
-  int input_left_shift = 0;
-  int32_t input_range_radius = 0;
-  int diff_min = 0;
-};
-
-TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context,
+TfLiteStatus CalculateSoftmaxParams(TfLiteContext* context,
                                    const TfLiteTensor* input,
                                    TfLiteTensor* output,
                                    const TfLiteSoftmaxParams* params,
-                                    OpData* data) {
+                                    SoftmaxParams* op_data) {
  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
    if (input->type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteUInt8);
      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
    } else {
+      TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteInt8);
      if (output->type == kTfLiteInt16) {
        TF_LITE_ENSURE_EQ(context, output->params.zero_point, -32768);
        // NOTE: Current int16 softmax output does not require symmetric scaling
        // - so no need to verify scale here.
      } else {
+        TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
        TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128);
        TF_LITE_ENSURE(context, output->params.scale == 1.f / 256);
      }
@@ -80,12 +75,19 @@ TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context,

    static const int kScaledDiffIntegerBits = 5;

+    int input_left_shift;
    tflite::PreprocessSoftmaxScaling(
        static_cast<double>(params->beta),
        static_cast<double>(input->params.scale), kScaledDiffIntegerBits,
-        &data->input_multiplier, &data->input_left_shift);
-    data->diff_min = -1.0 * tflite::CalculateInputRadius(
-                                kScaledDiffIntegerBits, data->input_left_shift);
+        &op_data->input_multiplier, &input_left_shift);
+    op_data->input_left_shift = input_left_shift;
+    op_data->diff_min =
+        -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits,
+                                            op_data->input_left_shift);
+  } else {
+    TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
+    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
+    op_data->beta = static_cast<double>(params->beta);
  }
  return kTfLiteOk;
 }
@@ -99,207 +101,118 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 void Free(TfLiteContext* context, void* buffer) {}

 TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
+
  return kTfLiteOk;
 }

-// Takes a 1D tensor and performs softmax along it.
-void Softmax1DFloat(const TfLiteTensor* input, TfLiteTensor* output,
-                    TfLiteSoftmaxParams* params) {
-  const int input_size = input->dims->data[0];
-  tflite::reference_ops::Softmax(input->data.f, input_size, 1, params->beta,
-                                 output->data.f);
-}
-
-// Takes a 2D tensor and perform softmax along the last dimension.
-TfLiteStatus Softmax2DFloat(TfLiteContext* context, const TfLiteTensor* input,
-                            TfLiteTensor* output, TfLiteSoftmaxParams* params) {
-  const int batch_size = input->dims->data[0];
-  const int input_size = input->dims->data[1];
+// Takes a tensor and performs softmax along the last dimension.
+TfLiteStatus SoftmaxFloat(TfLiteContext *context, const TfLiteTensor* input, TfLiteTensor* output,
+                  const SoftmaxParams& op_data) {
+  const RuntimeShape& input_shape = GetTensorShape(input);
+  const float *input_data = GetTensorData<float>(input);
+  const RuntimeShape& output_shape = GetTensorShape(output);
+  float *output_data = GetTensorData<float>(output);
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

  ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
-  float* p_scratch = (float*)xtensa_nnlib_scratch_buf;
+  float *p_scratch = (float *)xtensa_nnlib_scratch_buf;

-  if (input->dims->data[1] * sizeof(float) > XTENSA_NNLIB_MAX_SCRATCH_SIZE) {
+  if(depth * sizeof(float) > XTENSA_NNLIB_MAX_SCRATCH_SIZE)
+  {
    TF_LITE_KERNEL_LOG(context, "Softmax: insufficient scratch memory");
    return kTfLiteError;
  }

-  for (int i = 0; i < batch_size * input_size; ++i) {
-    p_scratch[i] = input->data.f[i] * params->beta;
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      p_scratch[c] = input_data[i * depth + c] * static_cast<float>(op_data.beta);
    }

-  for (int i = 0; i < batch_size; ++i) {
-    int err = xa_nn_vec_softmax_f32_f32(&output->data.f[i * input_size],
-                                        &p_scratch[i * input_size], input_size);
-    CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_softmax_f32_f32 failed");
+    int err = xa_nn_vec_softmax_f32_f32(&output_data[i * depth],
+        p_scratch,
+        depth);
+    CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_softmax_f32_f32 failed");                                                                \
  }
  return kTfLiteOk;
 }

-void Softmax1DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
-  // (ahentz): this is arguably a dirty trick. Since the implementation
-  // always traverses the last dimension of a 4D tensor, we will pretend our 1D
-  // tensor is 4D in a special way. We will convert a (Y) shape into a (1,
-  // 1, 1, Y) shape.
-  const int input_size = input->dims->data[0];
-  const int32_t shape_data[4] = {1, 1, 1, input_size};
-  RuntimeShape shape(4, shape_data);
-  SoftmaxParams op_params;
-  op_params.input_multiplier = data->input_multiplier;
-  op_params.input_left_shift = data->input_left_shift;
-  op_params.diff_min = data->diff_min;
+TfLiteStatus SoftmaxQuantized(TfLiteContext* context, const TfLiteTensor* input, TfLiteTensor* output,
+                      const SoftmaxParams& op_data) {
  if (input->type == kTfLiteUInt8) {
-    tflite::reference_ops::Softmax(op_params, shape,
-                                   GetTensorData<uint8_t>(input), shape,
-                                   GetTensorData<uint8_t>(output));
-  } else {
-    if (output->type == kTfLiteInt16) {
-      tflite::reference_integer_ops::Softmax(
-          op_params, shape, GetTensorData<int8_t>(input), shape,
-          GetTensorData<int16_t>(output));
-    } else {
-      tflite::reference_integer_ops::Softmax(
-          op_params, shape, GetTensorData<int8_t>(input), shape,
-          GetTensorData<int8_t>(output));
-    }
-  }
-}
+    const RuntimeShape& input_shape = GetTensorShape(input);
+    const uint8_t *input_data = GetTensorData<uint8_t>(input);
+    const RuntimeShape& output_shape = GetTensorShape(output);
+    uint8_t *output_data = GetTensorData<uint8_t>(output);
+    const int trailing_dim = input_shape.DimensionsCount() - 1;
+    const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+    const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

-TfLiteStatus Softmax2DQuantized(TfLiteContext* context,
-                                const TfLiteTensor* input, TfLiteTensor* output,
-                                TfLiteSoftmaxParams* params, OpData* data) {
-  // (ahentz): this is arguably a dirty trick. Since the implementation
-  // always traverses the last dimension of a 4D tensor, we will pretend our 2D
-  // tensor is 4D in a special way. We will convert a (X, Y) shape into a (X,
-  // 1, 1, Y) shape.
-  const int batch_size = input->dims->data[0];
-  const int input_size = input->dims->data[1];
-  const int32_t shape_data[4] = {batch_size, 1, 1, input_size};
-  RuntimeShape shape(4, shape_data);
-  SoftmaxParams op_params;
-  op_params.input_multiplier = data->input_multiplier;
-  op_params.input_left_shift = data->input_left_shift;
-  op_params.diff_min = data->diff_min;
-
-  if (input->type == kTfLiteUInt8) {
    ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
-    void* p_scratch = (void*)xtensa_nnlib_scratch_buf;
+    void *p_scratch = (void *)xtensa_nnlib_scratch_buf;

-    if (get_softmax_scratch_size(PREC_ASYM8, PREC_ASYM8, input_size) >
-        XTENSA_NNLIB_MAX_SCRATCH_SIZE) {
+    if(get_softmax_scratch_size(PREC_ASYM8, PREC_ASYM8, depth) > XTENSA_NNLIB_MAX_SCRATCH_SIZE)
+    {
      TF_LITE_KERNEL_LOG(context, "Softmax: insufficient scratch memory");
      return kTfLiteError;
    }

-    for (int i = 0; i < batch_size; ++i) {
-      int err = xa_nn_vec_softmax_asym8_asym8(
-          &output->data.uint8[i * input_size],
-          &input->data.uint8[i * input_size], op_params.diff_min,
-          op_params.input_left_shift, op_params.input_multiplier, input_size,
-          p_scratch);
-      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_softmax_asym8_asym8 failed");
+    for (int i = 0; i < outer_size; ++i) {
+      int err = xa_nn_vec_softmax_asym8_asym8(&output_data[i * depth],
+          &input_data[i * depth],
+          op_data.diff_min,
+          op_data.input_left_shift,
+          op_data.input_multiplier,
+          depth,
+          p_scratch
+          );
+      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_softmax_asym8_asym8 failed");                                                                \
    }
  } else {
    if (output->type == kTfLiteInt16) {
-      tflite::reference_integer_ops::Softmax(
-          op_params, shape, GetTensorData<int8_t>(input), shape,
-          GetTensorData<int16_t>(output));
-    } else {
-      tflite::reference_integer_ops::Softmax(
-          op_params, shape, GetTensorData<int8_t>(input), shape,
-          GetTensorData<int8_t>(output));
-    }
-  }
-  return kTfLiteOk;
-}
-
-// Takes a 4D tensor and perform softmax along the forth dimension.
-void Softmax4DFloat(const TfLiteTensor* input, TfLiteTensor* output,
-                    TfLiteSoftmaxParams* params) {
-  SoftmaxParams op_params;
-  op_params.beta = static_cast<double>(params->beta);
      tflite::reference_ops::Softmax(
-      op_params, GetTensorShape(input), GetTensorData<float>(input),
-      GetTensorShape(output), GetTensorData<float>(output));
-}
-
-void Softmax4DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
-  SoftmaxParams op_params;
-  op_params.input_multiplier = data->input_multiplier;
-  op_params.input_left_shift = data->input_left_shift;
-  op_params.diff_min = data->diff_min;
-  if (input->type == kTfLiteUInt8) {
-    tflite::reference_ops::Softmax(
-        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
-        GetTensorShape(output), GetTensorData<uint8_t>(output));
-  } else {
-    if (output->type == kTfLiteInt16) {
-      tflite::reference_integer_ops::Softmax(
-          op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+          op_data, GetTensorShape(input), GetTensorData<int8_t>(input),
          GetTensorShape(output), GetTensorData<int16_t>(output));
    } else {
-      tflite::reference_integer_ops::Softmax(
-          op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+      tflite::reference_ops::Softmax(
+          op_data, GetTensorShape(input), GetTensorData<int8_t>(input),
          GetTensorShape(output), GetTensorData<int8_t>(output));
    }
  }
+  return kTfLiteOk;
 }

 TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);

  const TfLiteTensor* input = GetInput(context, node, 0);
  TfLiteTensor* output = GetOutput(context, node, 0);

-  OpData local_data_object;
-  OpData* data = &local_data_object;
+  SoftmaxParams op_data;
  TF_LITE_ENSURE_STATUS(
-      CalculateSoftmaxOpData(context, input, output, params, data));
+      CalculateSoftmaxParams(context, input, output, params, &op_data));

-  // (ahentz): consider an implementation that works for many (all?)
-  // dimensions.
  switch (input->type) {
    case kTfLiteFloat32: {
-      if (NumDimensions(input) == 1) {
-        Softmax1DFloat(input, output, params);
-        return kTfLiteOk;
-      }
-      if (NumDimensions(input) == 2) {
-        return Softmax2DFloat(context, input, output, params);
-      }
-      if (NumDimensions(input) == 4) {
-        Softmax4DFloat(input, output, params);
-        return kTfLiteOk;
-      }
-      TF_LITE_KERNEL_LOG(
-          context, "Only 1D, 2D and 4D tensors supported currently, got %dD.",
-          NumDimensions(input));
-      return kTfLiteError;
+      return SoftmaxFloat(context, input, output, op_data);
    }
    case kTfLiteInt8:
    case kTfLiteUInt8: {
-      if (NumDimensions(input) == 1) {
-        Softmax1DQuantized(input, output, params, data);
-        return kTfLiteOk;
-      }
-      if (NumDimensions(input) == 2) {
-        return Softmax2DQuantized(context, input, output, params, data);
-      }
-      if (NumDimensions(input) == 4) {
-        Softmax4DQuantized(input, output, params, data);
-        return kTfLiteOk;
-      }
-      TF_LITE_KERNEL_LOG(context,
-                         "Only 2D and 4D tensors supported currently, got %dD.",
-                         NumDimensions(input));
-      return kTfLiteError;
+      return SoftmaxQuantized(context, input, output, op_data);
    }
    default:
      TF_LITE_KERNEL_LOG(
          context,
-          "Only float32, uint8_t and int8_t supported currently, got %d.",
+          "Only float32, uint8_t and int8_t input supported currently, got %d.",
          input->type);
      return kTfLiteError;
  }
@@ -307,11 +220,14 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace activations

 TfLiteRegistration* Register_SOFTMAX() {
-  static TfLiteRegistration r = {};
-  r.init = activations::Init;
-  r.free = activations::Free;
-  r.prepare = activations::SoftmaxPrepare;
-  r.invoke = activations::SoftmaxEval;
+  static TfLiteRegistration r = {activations::Init,
+                                 activations::Free,
+                                 activations::SoftmaxPrepare,
+                                 activations::SoftmaxEval,
+                                 nullptr,
+                                 0,
+                                 nullptr,
+                                 0};
  return &r;
 }

--- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa_hifi_nn_library.inc
+++ b/tensorflow/lite/micro/tools/make/ext_libs/xtensa_hifi_nn_library.inc
@@ -1,6 +1,6 @@
 ifneq ($(filter xtensa_hifi, $(ALL_TAGS)),)

-    XTENSA_PATH = $(MAKEFILE_DIR)/../../kernels/xtensa_hifi
+    XTENSA_PATH = $(MAKEFILE_DIR)/downloads

    ifneq (,$(filter hifi4%, $(TARGET_ARCH)))

--- a/tensorflow/lite/micro/tools/make/targets/xtensa_hifi_makefile.inc
+++ b/tensorflow/lite/micro/tools/make/targets/xtensa_hifi_makefile.inc
@@ -5,6 +5,8 @@
 ifeq ($(TARGET), xtensa_hifi)
  TARGET_ARCH := hifi3_bd5

+$(eval $(call add_third_party_download,$(XTENSA_HIFI4_URL),$(XTENSA_HIFI4_MD5),xa_nnlib,))
+
  PLATFORM_ARGS = \
    -mno-mul16 \
    -mno-mul32 \
--- a/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc
+++ b/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc
@@ -7,8 +7,6 @@
 ifeq ($(TARGET), xtensa-xpg)
  TARGET_ARCH := xtensa-xpg

-$(eval $(call add_third_party_download,$(XTENSA_HIFI4_URL),$(XTENSA_HIFI4_MD5),xa_nnlib,))
-
  PLATFORM_ARGS = \
    -DTF_LITE_MCU_DEBUG_LOG \
    --xtensa-core=$(XTENSA_CORE) \