Merge pull request #40943 from pnikam-cad:hifi4_nnlib_v2_2_0_update

PiperOrigin-RevId: 322616499
Change-Id: I6a08cb4f11abe33c38c91a72cc45c635d0f78797
TensorFlower Gardener 2020-07-22 11:30:39 -07:00
commit bf3b14ffcb
14 changed files with 1353 additions and 647 deletions

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -41,8 +41,8 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/micro_utils.h"
#include "xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -109,6 +109,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
switch (input->type) {
case kTfLiteFloat32: {
#if HIFI_VFPU
int err;
const float* inp_data_ptr;
float* out_data_ptr;
@ -119,11 +120,13 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
inp_data_ptr = GetTensorData<float>(input);
out_data_ptr = GetTensorData<float>(output);
const float f32_pos_inf = 0x7F800000;
err = xa_nn_vec_relu_f32_f32(out_data_ptr, inp_data_ptr, f32_pos_inf,
flat_size);
err = xa_nn_vec_relu_std_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu1_f32_f32 failed");
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu_std_f32_f32 failed");
#else
ReluFloat(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
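One note on the constant removed above: 0x7F800000 is the IEEE-754 bit pattern of +infinity, but the expression const float f32_pos_inf = 0x7F800000; converts the integer 2139095040 to float instead of reinterpreting its bits, so the old upper bound was a large finite value rather than infinity. A minimal, illustrative way to build such a bound correctly (not part of this change) would be:

#include <cstring>
#include <limits>

// Illustrative only: form +inf from its bit pattern with a bit-cast,
// or simply use std::numeric_limits<float>::infinity().
static float PositiveInfinityFromBits() {
  const unsigned int bits = 0x7F800000u;  // IEEE-754 single-precision +inf
  float value;
  std::memcpy(&value, &bits, sizeof(value));
  return value;
}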
case kTfLiteInt8: {
@ -140,14 +143,17 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
const RuntimeShape& input_shape = GetTensorShape(input);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size = MatchingFlatSize(input_shape, output_shape);
const uint8_t zero = input->params.zero_point;
inp_data_ptr = GetTensorData<uint8_t>(input);
out_data_ptr = GetTensorData<uint8_t>(output);
err = xa_nn_vec_activation_min_max_asym8_asym8(
out_data_ptr, inp_data_ptr, 0, 255, flat_size); // Is 255 right?
out_data_ptr, inp_data_ptr, zero, std::numeric_limits<uint8_t>::max(),
flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_activation_min_max_8_8 failed");
CHECK_ERR_HIFI_NNLIB_KER(
err, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
return kTfLiteOk;
}
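Assuming the NNLib routine above is a plain element-wise clamp, the asym8 ReLU reduces to bounding each value below by the input zero point (the quantized encoding of 0.0f); the 255 upper bound is a no-op for uint8 data. A scalar sketch of that behaviour:

#include <algorithm>
#include <cstdint>

// Scalar equivalent assumed for the quantized ReLU path above:
// clamp every element to [zero_point, 255].
static void ReluAsym8Sketch(uint8_t* out, const uint8_t* in, int size,
                            uint8_t zero_point) {
  for (int i = 0; i < size; ++i) {
    out[i] = std::max(in[i], zero_point);  // upper bound 255 is implicit
  }
}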
default: {
@ -168,6 +174,7 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
switch (input->type) {
case kTfLiteFloat32: {
#if HIFI_VFPU
int err;
const float* inp_data_ptr;
float* out_data_ptr;
@ -180,7 +187,11 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
err = xa_nn_vec_relu6_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu1_f32_f32 failed");
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu6_f32_f32 failed");
#else
Relu6Float(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
case kTfLiteInt8: {
@ -209,7 +220,8 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
err = xa_nn_vec_activation_min_max_asym8_asym8(out_data_ptr, inp_data_ptr,
zero, six, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_activation_min_max_8_8 failed");
CHECK_ERR_HIFI_NNLIB_KER(
err, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
return kTfLiteOk;
}
default: {

View File

@ -0,0 +1,273 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/memory_helpers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace add {
constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;
struct OpData {
bool requires_broadcast;
// These fields are used in both the general 8-bit -> 8bit quantized path,
// and the special 16-bit -> 16bit quantized path
int input1_shift;
int input2_shift;
int32 output_activation_min;
int32 output_activation_max;
// These fields are used only in the general 8-bit -> 8bit quantized path
int32 input1_multiplier;
int32 input2_multiplier;
int32 output_multiplier;
int output_shift;
int left_shift;
int32 input1_offset;
int32 input2_offset;
int32 output_offset;
};
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteAddParams* params,
const TfLiteTensor* input1,
const TfLiteTensor* input2, TfLiteTensor* output,
OpData* data) {
data->requires_broadcast = !HaveSameShapes(input1, input2);
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
// 8bit -> 8bit general quantized path, with general rescalings
data->input1_offset = -input1->params.zero_point;
data->input2_offset = -input2->params.zero_point;
data->output_offset = output->params.zero_point;
data->left_shift = 20;
const double twice_max_input_scale =
2 * static_cast<double>(
std::max(input1->params.scale, input2->params.scale));
const double real_input1_multiplier =
static_cast<double>(input1->params.scale) / twice_max_input_scale;
const double real_input2_multiplier =
static_cast<double>(input2->params.scale) / twice_max_input_scale;
const double real_output_multiplier =
twice_max_input_scale /
((1 << data->left_shift) * static_cast<double>(output->params.scale));
QuantizeMultiplierSmallerThanOneExp(
real_input1_multiplier, &data->input1_multiplier, &data->input1_shift);
QuantizeMultiplierSmallerThanOneExp(
real_input2_multiplier, &data->input2_multiplier, &data->input2_shift);
QuantizeMultiplierSmallerThanOneExp(
real_output_multiplier, &data->output_multiplier, &data->output_shift);
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
}
return kTfLiteOk;
}
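The offsets, multipliers and shifts computed above feed the usual TFLite quantized-add recipe: both inputs are re-centred, given left_shift (20) bits of headroom, rescaled to a common scale, summed, then rescaled to the output scale and clamped. A scalar sketch of that recipe, mirroring the reference kernels (broadcasting omitted):

#include <algorithm>
#include "tensorflow/lite/kernels/internal/common.h"

// One element of the quantized add parameterized by the OpData above.
static int32_t QuantizedAddOneElement(int32_t in1, int32_t in2,
                                      const OpData& d) {
  const int32_t shifted1 = (in1 + d.input1_offset) * (1 << d.left_shift);
  const int32_t shifted2 = (in2 + d.input2_offset) * (1 << d.left_shift);
  const int32_t scaled1 = MultiplyByQuantizedMultiplierSmallerThanOneExp(
      shifted1, d.input1_multiplier, d.input1_shift);
  const int32_t scaled2 = MultiplyByQuantizedMultiplierSmallerThanOneExp(
      shifted2, d.input2_multiplier, d.input2_shift);
  const int32_t raw = MultiplyByQuantizedMultiplierSmallerThanOneExp(
                          scaled1 + scaled2, d.output_multiplier,
                          d.output_shift) +
                      d.output_offset;
  return std::min(d.output_activation_max,
                  std::max(d.output_activation_min, raw));
}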
TfLiteStatus EvalAdd(TfLiteContext* context, TfLiteNode* node,
TfLiteAddParams* params, const OpData* data,
const TfLiteTensor* input1, const TfLiteTensor* input2,
TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::ArithmeticParams op_params;
SetActivationParams(output_activation_min, output_activation_max, &op_params);
#define TF_LITE_ADD(opname) \
reference_ops::opname(op_params, GetTensorShape(input1), \
GetTensorData<float>(input1), GetTensorShape(input2), \
GetTensorData<float>(input2), GetTensorShape(output), \
GetTensorData<float>(output))
if (data->requires_broadcast) {
TF_LITE_ADD(BroadcastAdd4DSlow);
} else {
#if HIFI_VFPU
int err;
const RuntimeShape& input1_shape = GetTensorShape(input1);
const RuntimeShape& input2_shape = GetTensorShape(input2);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
err = xa_nn_elm_add_f32xf32_f32(GetTensorData<float>(output),
GetTensorData<float>(input1),
GetTensorData<float>(input2), flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_add_f32xf32_f32 failed");
err = xa_nn_vec_activation_min_max_f32_f32(
GetTensorData<float>(output), GetTensorData<float>(output),
output_activation_min, output_activation_max, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err,
"xa_nn_vec_activation_min_max_f32_f32 failed");
#else
TF_LITE_ADD(Add);
#endif /* HIFI_VFPU */
}
#undef TF_LITE_ADD
return kTfLiteOk;
}
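When HIFI_VFPU is defined, the float path above performs the element-wise add and the activation clamp as two NNLib passes over the output buffer; the reference TF_LITE_ADD(Add) branch fuses both. A scalar equivalent of the non-broadcast case:

#include <algorithm>

// Fused scalar form of the float add with activation clamp.
static void AddFloatSketch(const float* in1, const float* in2, float* out,
                           int size, float act_min, float act_max) {
  for (int i = 0; i < size; ++i) {
    out[i] = std::min(act_max, std::max(act_min, in1[i] + in2[i]));
  }
}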
TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteAddParams* params, const OpData* data,
const TfLiteTensor* input1,
const TfLiteTensor* input2,
TfLiteTensor* output) {
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
tflite::ArithmeticParams op_params;
op_params.left_shift = data->left_shift;
op_params.input1_offset = data->input1_offset;
op_params.input1_multiplier = data->input1_multiplier;
op_params.input1_shift = data->input1_shift;
op_params.input2_offset = data->input2_offset;
op_params.input2_multiplier = data->input2_multiplier;
op_params.input2_shift = data->input2_shift;
op_params.output_offset = data->output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = data->output_shift;
SetActivationParams(data->output_activation_min,
data->output_activation_max, &op_params);
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_ADD(type, opname, dtype) \
type::opname(op_params, GetTensorShape(input1), \
GetTensorData<dtype>(input1), GetTensorShape(input2), \
GetTensorData<dtype>(input2), GetTensorShape(output), \
GetTensorData<dtype>(output));
if (output->type == kTfLiteInt8) {
if (need_broadcast) {
TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
} else {
TF_LITE_ADD(reference_integer_ops, Add, int8_t);
}
} else {
if (need_broadcast) {
TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
} else {
int err;
const RuntimeShape& input1_shape = GetTensorShape(input1);
const RuntimeShape& input2_shape = GetTensorShape(input2);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
err = xa_nn_elm_add_asym8xasym8_asym8(
GetTensorData<uint8_t>(output), op_params.output_offset,
op_params.output_shift, op_params.output_multiplier,
op_params.quantized_activation_min,
op_params.quantized_activation_max, GetTensorData<uint8_t>(input1),
op_params.input1_offset, op_params.input1_shift,
op_params.input1_multiplier, GetTensorData<uint8_t>(input2),
op_params.input2_offset, op_params.input2_shift,
op_params.input2_multiplier, op_params.left_shift, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_add_asym8xasym8_asym8 failed");
}
}
#undef TF_LITE_ADD
}
return kTfLiteOk;
}
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
OpData* data = static_cast<OpData*>(node->user_data);
auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
TF_LITE_ENSURE_STATUS(
CalculateOpData(context, params, input1, input2, output, data));
return kTfLiteOk;
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
TFLITE_DCHECK(node->user_data != nullptr);
const OpData* data = static_cast<const OpData*>(node->user_data);
const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
if (output->type == kTfLiteFloat32) {
TF_LITE_ENSURE_OK(
context, EvalAdd(context, node, params, data, input1, input2, output));
} else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data,
input1, input2, output));
} else {
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(output->type), output->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace add
TfLiteRegistration Register_ADD() {
return {/*init=*/add::Init,
/*free=*/nullptr,
/*prepare=*/add::Prepare,
/*invoke=*/add::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -44,7 +44,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -55,7 +55,6 @@ constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
// Conv is quantized along dimension 0:
// https://www.tensorflow.org/lite/performance/quantization_spec
@ -71,9 +70,8 @@ struct OpData {
int output_shift;
// Per channel output multiplier and shift.
// (b/141139247): Allocate these dynamically when possible.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
int32_t* per_channel_output_multiplier;
int32_t* per_channel_output_shift;
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
@ -94,10 +92,10 @@ inline PaddingType RuntimePaddingType(TfLitePadding padding) {
}
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, int width, int height,
int filter_width, int filter_height, int out_width,
int out_height, const TfLiteType data_type,
OpData* data) {
const TfLiteConvParams* params, int width,
int height, int filter_width, int filter_height,
int out_width, int out_height,
const TfLiteType data_type, OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
@ -131,8 +129,69 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
return kTfLiteOk;
}
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
const auto params = static_cast<const TfLiteConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
int input_width = input->dims->data[2];
int input_height = input->dims->data[1];
int filter_width = filter->dims->data[2];
int filter_height = filter->dims->data[1];
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
// Dynamically allocate per-channel quantization parameters.
const int num_channels = filter->dims->data[kConvQuantizedDimension];
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),
reinterpret_cast<void**>(&data->per_channel_output_multiplier)));
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),
reinterpret_cast<void**>(&data->per_channel_output_shift)));
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(context,
affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
return CalculateOpData(context, node, params, input_width, input_height,
filter_width, filter_height, output_width,
output_height, input->type, data);
} // namespace conv
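The two per-channel buffers allocated in Prepare are filled from one effective rescale factor per output channel (in the reference kernels this is done by PopulateConvolutionQuantizationParams, as the depthwise kernel below shows explicitly). Roughly, and ignoring the legacy single-scale case:

#include <cstdint>
#include "tensorflow/lite/kernels/internal/quantization_util.h"

// Sketch of how per-channel multipliers/shifts are derived: for each output
// channel c, effective_scale = input_scale * filter_scale[c] / output_scale.
static void FillPerChannelParamsSketch(float input_scale, float output_scale,
                                       const float* filter_scales,
                                       int num_channels, int32_t* multipliers,
                                       int32_t* shifts) {
  for (int c = 0; c < num_channels; ++c) {
    const double effective_scale =
        static_cast<double>(input_scale) * filter_scales[c] / output_scale;
    int exponent;
    tflite::QuantizeMultiplier(effective_scale, &multipliers[c], &exponent);
    shifts[c] = exponent;
  }
}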
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
TfLiteConvParams* params, const OpData& data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
@ -143,9 +202,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
if ((params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
const uint8 *input_data, *filter_data;
const uint8_t *input_data, *filter_data;
const int32_t* bias_data;
uint8* output_data;
uint8_t* output_data;
const RuntimeShape& input_shape = GetTensorShape(input);
const RuntimeShape& filter_shape = GetTensorShape(filter);
const RuntimeShape& output_shape = GetTensorShape(output);
@ -158,14 +217,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;
const int dilation_width_factor = 1;
const int dilation_height_factor = 1;
const int pad_width = data->padding.width;
const int pad_height = data->padding.height;
const int32 output_activation_min = data->output_activation_min;
const int32 output_activation_max = data->output_activation_max;
const int32 output_multiplier = data->output_multiplier;
const int output_shift = -data->output_shift;
const int pad_width = data.padding.width;
const int pad_height = data.padding.height;
const int32 output_activation_min = data.output_activation_min;
const int32 output_activation_max = data.output_activation_max;
const int32 output_multiplier = data.output_multiplier;
const int output_shift = -data.output_shift;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
@ -186,13 +243,14 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
const int filter_depth = filter_shape.Dims(3);
int err, output_data_format = 0;
void* p_scratch;
uint8 *p_filter, *p_out_scratch;
uint8_t* p_scratch;
uint8_t* p_filter;
// Calculate filter_depth_padded as the next multiple of 4
int filter_depth_padded = (filter_depth + 3) & (~3);
int out_length = output_height * output_width * output_depth;
int filter_size_padded = filter_height * filter_width * filter_depth_padded;
int required_scratch, input_precision = PREC_ASYM8;
int h, w, c;
int h, c;
required_scratch = xa_nn_conv2d_std_getsize(
input_height, input_depth, filter_height, filter_width, stride_height,
@ -207,19 +265,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
p_scratch = xtensa_nnlib_scratch_buf;
p_filter = (uint8*)p_scratch;
p_out_scratch =
(p_filter +
ALIGNED_SIZE((sizeof(uint8_t) * filter_height * filter_width *
filter_depth_padded * output_depth),
8));
p_filter = p_scratch;
required_scratch +=
ALIGNED_SIZE((sizeof(uint8_t) * filter_height * filter_width *
filter_depth_padded * output_depth),
8);
p_scratch =
(uint8*)(p_out_scratch + ALIGNED_SIZE(sizeof(uint8_t) * out_length, 8));
required_scratch += ALIGNED_SIZE(sizeof(uint8_t) * out_length, 8);
ALIGNED_SIZE((sizeof(uint8_t) * filter_size_padded * output_depth), 8);
p_scratch +=
ALIGNED_SIZE(sizeof(uint8_t) * filter_size_padded * output_depth, 8);
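ALIGNED_SIZE above, and the (filter_depth + 3) & ~3 padding earlier in this function, are both the standard round-up-to-a-power-of-two-multiple trick. The real macros live in xtensa_tf_micro_common.h; their assumed shape is:

#include <cstddef>
#include <cstdint>

// Assumed equivalents of the NNLib helper macros (illustrative only): for a
// power-of-two align, adding (align - 1) and masking the low bits rounds up
// to the next multiple of align.
inline size_t AlignedSizeSketch(size_t size, size_t align) {
  return (size + align - 1) & ~(align - 1);
}
inline uint8_t* AlignPtrSketch(uint8_t* p, uintptr_t align) {
  return reinterpret_cast<uint8_t*>(
      (reinterpret_cast<uintptr_t>(p) + align - 1) & ~(align - 1));
}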
if (required_scratch > (int)XTENSA_NNLIB_MAX_SCRATCH_SIZE) {
TF_LITE_KERNEL_LOG(context,
@ -240,9 +290,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
for (int batch = 0; batch < batches; ++batch) {
uint8* p_out_temp;
p_out_temp = (uint8*)&p_out_scratch[0];
p_out_temp = (uint8*)ALIGN_PTR(p_out_temp, 8);
uint8_t* p_out_temp;
p_out_temp = &output_data[batch * out_length];
err = xa_nn_conv2d_std_asym8xasym8(
p_out_temp,
@ -252,24 +301,24 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
filter_width, output_depth, stride_width, stride_height, pad_width,
pad_height, output_height, output_width, input_offset, filter_offset,
output_multiplier, output_shift, output_offset, output_data_format,
p_scratch);
static_cast<void*>(p_scratch));
CHECK_ERR_HIFI_NNLIB_KER(
err, "conv2d_std_asym8: xa_nn_conv2d_std_asym8xasym8 failed");
for (int i = 0; i < out_length; i++) {
uint8* p_temp;
p_temp = &output_data[batch * out_length];
err = xa_nn_vec_activation_min_max_asym8_asym8(
p_out_temp, p_out_temp, output_activation_min, output_activation_max,
out_length);
ACTIVATION_MIN_MAX_ASYM8(p_temp[i], p_out_temp[i],
output_activation_min, output_activation_max)
}
CHECK_ERR_HIFI_NNLIB_KER(
err, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
}
} else {
// TODO(b/154032858): Investigate removing extra copies.
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data.padding.width;
op_params.padding_values.height = data.padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
@ -277,10 +326,10 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.output_multiplier = data.output_multiplier;
op_params.output_shift = -data.output_shift;
op_params.quantized_activation_min = data.output_activation_min;
op_params.quantized_activation_max = data.output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(filter),
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
@ -292,11 +341,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
TfLiteConvParams* params, const OpData& data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
TfLiteTensor* im2col) {
// TODO(b/154032858): Investigate removing extra copies.
ConvParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.output_offset = output->params.zero_point;
@ -304,14 +354,14 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
op_params.stride_width = params->stride_width;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.padding_values.height = data.padding.height;
op_params.padding_values.width = data.padding.width;
op_params.quantized_activation_min = data.output_activation_min;
op_params.quantized_activation_max = data.output_activation_max;
reference_integer_ops::ConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
op_params, data.per_channel_output_multiplier,
data.per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
@ -319,7 +369,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
TfLiteConvParams* params, const OpData& data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* im2col,
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
@ -327,6 +377,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
#if HIFI_VFPU
if ((params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
const float *input_data, *filter_data;
@ -344,10 +395,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;
const int dilation_width_factor = 1;
const int dilation_height_factor = 1;
const int pad_width = data->padding.width;
const int pad_height = data->padding.height;
const int pad_width = data.padding.width;
const int pad_height = data.padding.height;
TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
@ -366,13 +415,14 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
const int output_width = output_shape.Dims(2);
const int filter_depth = filter_shape.Dims(3);
int err, output_data_format = 0;
void* p_scratch;
float *p_filter, *p_out_scratch;
uint8_t* p_scratch;
float* p_filter;
// Calculate filter_depth_padded as the next multiple of 2
int filter_depth_padded = (filter_depth + 1) & (~1);
int out_length = output_height * output_width * output_depth;
int filter_size_padded = filter_height * filter_width * filter_depth_padded;
int required_scratch, input_precision = PREC_F32;
int h, w, c;
int h, c;
required_scratch = xa_nn_conv2d_std_getsize(
input_height, input_depth, filter_height, filter_width, stride_height,
@ -387,19 +437,11 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
p_scratch = xtensa_nnlib_scratch_buf;
p_filter = (float*)p_scratch;
p_out_scratch =
(float*)((uint8_t*)p_filter +
ALIGNED_SIZE((sizeof(float) * filter_height * filter_width *
filter_depth_padded * output_depth),
8));
p_filter = reinterpret_cast<float*>(p_scratch);
p_scratch +=
ALIGNED_SIZE((sizeof(float) * filter_size_padded * output_depth), 8);
required_scratch +=
ALIGNED_SIZE((sizeof(float) * filter_height * filter_width *
filter_depth_padded * output_depth),
8);
p_scratch = (float*)((uint8_t*)p_out_scratch +
ALIGNED_SIZE(sizeof(float) * out_length, 8));
required_scratch += ALIGNED_SIZE(sizeof(float) * out_length, 8);
ALIGNED_SIZE((sizeof(float) * filter_size_padded * output_depth), 8);
if (required_scratch > (int)XTENSA_NNLIB_MAX_SCRATCH_SIZE) {
TF_LITE_KERNEL_LOG(context,
@ -420,8 +462,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
for (int batch = 0; batch < batches; ++batch) {
float* p_out_temp;
p_out_temp = (float*)&p_out_scratch[0];
p_out_temp = (float*)ALIGN_PTR(p_out_temp, 8);
p_out_temp = &output_data[batch * out_length];
err = xa_nn_conv2d_std_f32(
p_out_temp,
@ -429,23 +470,26 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
p_filter, bias_data, input_height, input_width, input_depth,
filter_height, filter_width, output_depth, stride_width,
stride_height, pad_width, pad_height, output_height, output_width,
output_data_format, p_scratch);
output_data_format, static_cast<void*>(p_scratch));
CHECK_ERR_HIFI_NNLIB_KER(
err, "conv2d_std_f32: xa_nn_conv2d_std_f32xf32 failed");
for (int i = 0; i < out_length; i++) {
float* p_temp;
p_temp = &output_data[batch * out_length];
ACTIVATION_MIN_MAX(float, p_temp[i], p_out_temp[i],
output_activation_min, output_activation_max)
err = xa_nn_vec_activation_min_max_f32_f32(
p_out_temp, p_out_temp, output_activation_min, output_activation_max,
out_length);
CHECK_ERR_HIFI_NNLIB_KER(err,
"xa_nn_vec_activation_min_max_f32_f32 failed");
}
}
} else {
} else
#endif /* HIFI_VFPU */
{
// TODO(b/154032858): Investigate removing extra copies.
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data.padding.width;
op_params.padding_values.height = data.padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
@ -471,50 +515,20 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
int input_width = input->dims->data[2];
int input_height = input->dims->data[1];
int filter_width = filter->dims->data[2];
int filter_height = filter->dims->data[1];
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(context,
affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpData(
context, node, params, input_width, input_height, filter_width,
filter_height, output_width, output_height, input->type, &data));
TFLITE_DCHECK(node->user_data != nullptr);
const OpData& data = *(static_cast<const OpData*>(node->user_data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
EvalFloat(context, node, params, &data, input, filter, bias, nullptr,
EvalFloat(context, node, params, data, input, filter, bias, nullptr,
nullptr, output);
break;
case kTfLiteInt8:
EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
output, nullptr);
break;
case kTfLiteUInt8:
EvalQuantized(context, node, params, &data, input, filter, bias, nullptr,
EvalQuantized(context, node, params, data, input, filter, bias, nullptr,
nullptr, output);
break;
default:
@ -528,9 +542,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} // namespace conv
TfLiteRegistration Register_CONV_2D() {
return {/*init=*/nullptr,
return {/*init=*/conv::Init,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*prepare=*/conv::Prepare,
/*invoke=*/conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -45,7 +45,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -57,8 +57,6 @@ constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
// Per channel quantization is not needed for any model on xtensa.
constexpr int kMaxChannels = 256;
// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
@ -72,10 +70,8 @@ struct OpData {
int output_shift;
// Per channel output multiplier and shift.
// (b/141139247): Allocate these dynamically when possible.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
int32_t* per_channel_output_multiplier;
int32_t* per_channel_output_shift;
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
@ -107,26 +103,88 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
return tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift), num_channels));
reinterpret_cast<int*>(data->per_channel_output_shift), num_channels);
}
return kTfLiteOk;
}
} // namespace
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* params =
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
OpData* data = static_cast<OpData*>(node->user_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteType data_type = input->type;
int width = SizeOfDimension(input, 2);
int height = SizeOfDimension(input, 1);
int filter_width = SizeOfDimension(filter, 2);
int filter_height = SizeOfDimension(filter, 1);
// Per channel quantization is only needed for int8 inference. For other
// quantized types, only a single scale and zero point is needed.
const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
// Dynamically allocate per-channel quantization parameters.
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),
reinterpret_cast<void**>(&data->per_channel_output_multiplier)));
TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
context, num_channels * sizeof(int32_t),
reinterpret_cast<void**>(&data->per_channel_output_shift)));
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(
context, affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kDepthwiseConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
return CalculateOpData(context, node, params, width, height, filter_width,
filter_height, data_type, data);
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
TfLiteDepthwiseConvParams* params, const OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
#if HIFI_VFPU
if ((params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
const float *input_data, *filter_data, *bias_data;
@ -143,10 +201,6 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;
const int dilation_width_factor = 1;
const int dilation_height_factor = 1;
// const int dilation_width_factor = params->dilation_width_factor;;
// const int dilation_height_factor = params->dilation_height_factor;
const int pad_width = data->padding.width;
const int pad_height = data->padding.height;
const int depth_multiplier = params->depth_multiplier;
@ -168,7 +222,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
int32_t err, input_data_format = 0, output_data_format = 0;
void* p_scratch;
uint8_t* p_scratch;
float* p_filter;
int filter_depth_padded, filter_size_padded, required_scratch;
int input_precision = PREC_F32;
@ -198,9 +252,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
return kTfLiteError;
}
p_filter = (float*)p_scratch;
p_scratch = (void*)((uint8_t*)p_filter +
ALIGNED_SIZE(sizeof(float) * filter_size_padded, 8));
p_filter = reinterpret_cast<float*>(p_scratch);
p_scratch += ALIGNED_SIZE(sizeof(float) * filter_size_padded, 8);
for (h = 0; h < filter_height * filter_width; h++) {
for (c = 0; c < filter_depth; c++) {
@ -220,37 +273,22 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
input_height, input_width, input_depth, filter_height, filter_width,
depth_multiplier, stride_width, stride_height, pad_width, pad_height,
output_height, output_width, input_data_format, output_data_format,
p_scratch);
static_cast<void*>(p_scratch));
CHECK_ERR_HIFI_NNLIB_KER(
err, "DepthwiseConvFloat: xa_nn_conv2d_depthwise_f32 failed");
}
// pre loop for activation_min_max to handle alignment
int out_length = batches * output_height * output_width * output_depth;
uint32 p_unalign_val = (uint32)output_data, p_align_val;
p_align_val = (p_unalign_val + 7) & (~7);
int pre_loop_count = p_align_val - p_unalign_val;
pre_loop_count = MIN(pre_loop_count, out_length);
for (i = 0; i < pre_loop_count; i++) {
ACTIVATION_MIN_MAX(float, output_data[i], output_data[i],
output_activation_min, output_activation_max)
}
out_length = out_length - pre_loop_count;
if (out_length) {
err = xa_nn_vec_activation_min_max_f32_f32(
&output_data[i], &output_data[i], output_activation_min,
output_activation_max, out_length);
output_data, output_data, output_activation_min, output_activation_max,
out_length);
CHECK_ERR_HIFI_NNLIB_KER(
err,
"DepthwiseConvFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
}
} else {
err, "DepthwiseConvFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
} else
#endif /* HIFI_VFPU */
{
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
@ -274,8 +312,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
}
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input,
TfLiteDepthwiseConvParams* params,
const OpData* data, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
DepthwiseParams op_params;
@ -290,7 +328,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = 0;
op_params.output_offset = output->params.zero_point;
// (b/130439627): Use calculated value for clamping.
// TODO(b/130439627): Use calculated value for clamping.
op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
@ -304,8 +342,8 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input,
TfLiteDepthwiseConvParams* params,
const OpData* data, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
@ -314,9 +352,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
if ((params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
const uint8 *input_data, *filter_data;
const uint8_t *input_data, *filter_data;
const int32_t* bias_data;
uint8* output_data;
uint8_t* output_data;
const RuntimeShape& input_shape = GetTensorShape(input);
const RuntimeShape& filter_shape = GetTensorShape(filter);
const RuntimeShape& output_shape = GetTensorShape(output);
@ -329,10 +367,6 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
const int stride_width = params->stride_width;
const int stride_height = params->stride_height;
const int dilation_width_factor = 1;
const int dilation_height_factor = 1;
// const int dilation_width_factor = params->dilation_width_factor;
// const int dilation_height_factor = params->dilation_height_factor;
const int pad_width = data->padding.width;
const int pad_height = data->padding.height;
const int depth_multiplier = params->depth_multiplier;
@ -360,11 +394,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
int32_t err, i, input_data_format = 0, output_data_format = 0;
void* p_scratch;
uint8* p_filter;
uint8_t* p_scratch;
uint8_t* p_filter;
int filter_depth_padded, filter_size_padded, required_scratch;
int input_precision = PREC_ASYM8;
int h, c;
int h;
ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
p_scratch = xtensa_nnlib_scratch_buf;
@ -390,18 +424,15 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
return kTfLiteError;
}
p_filter = (uint8*)p_scratch;
p_scratch = (void*)(p_filter +
ALIGNED_SIZE(sizeof(uint8_t) * filter_size_padded, 8));
p_filter = p_scratch;
p_scratch += ALIGNED_SIZE(sizeof(uint8_t) * filter_size_padded, 8);
int pad_value = filter_depth_padded - filter_depth;
for (h = 0; h < filter_height * filter_width; h++) {
for (c = 0; c < filter_depth; c++) {
p_filter[h * filter_depth_padded + c] =
filter_data[h * filter_depth + c];
}
for (c = filter_depth; c < filter_depth_padded; c++) {
p_filter[h * filter_depth_padded + c] = -filter_offset;
}
memcpy(&p_filter[h * filter_depth_padded], &filter_data[h * filter_depth],
filter_depth);
memset(&p_filter[h * filter_depth_padded + filter_depth], -filter_offset,
pad_value);
}
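Because filter_offset is the negated filter zero point, the memset value -filter_offset is the zero point itself, so the padded lanes encode a quantized 0.0 and, assuming the NNLib kernel subtracts the filter offset before multiplying, contribute nothing to the accumulation. A scalar equivalent of the repacking above:

#include <cstdint>
#include <cstring>

// Copy one filter row and pad it to filter_depth_padded with the filter
// zero point so padded taps multiply as exact zeros inside the MAC loop.
static void PadFilterRowSketch(uint8_t* dst, const uint8_t* src,
                               int filter_depth, int filter_depth_padded,
                               int32_t filter_offset) {
  const uint8_t zero_point = static_cast<uint8_t>(-filter_offset);
  std::memcpy(dst, src, filter_depth);
  std::memset(dst + filter_depth, zero_point,
              filter_depth_padded - filter_depth);
}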
for (i = 0; i < batches; i++) {
@ -413,37 +444,22 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
depth_multiplier, stride_width, stride_height, pad_width, pad_height,
output_height, output_width, input_offset, filter_offset,
output_multiplier, output_shift, output_offset, input_data_format,
output_data_format, p_scratch);
output_data_format, static_cast<void*>(p_scratch));
CHECK_ERR_HIFI_NNLIB_KER(
err, "DepthwiseConvAsym8: xa_nn_conv2d_depthwise_asym8xasym8 failed");
}
// pre loop for activation_min_max to handle alignment
int out_length = batches * output_height * output_width * output_depth;
uint32 p_unalign_val = (uint32)output_data, p_align_val;
p_align_val = (p_unalign_val + 7) & (~7);
int pre_loop_count = p_align_val - p_unalign_val;
pre_loop_count = MIN(pre_loop_count, out_length);
for (i = 0; i < pre_loop_count; i++) {
ACTIVATION_MIN_MAX_ASYM8(output_data[i], output_data[i],
output_activation_min, output_activation_max)
}
out_length = out_length - pre_loop_count;
if (out_length > 0) {
err = xa_nn_vec_activation_min_max_asym8_asym8(
&output_data[i], &output_data[i], output_activation_min,
output_activation_max, out_length);
output_data, output_data, output_activation_min, output_activation_max,
out_length);
CHECK_ERR_HIFI_NNLIB_KER(
err,
"DepthwiseConvAsym8: xa_nn_vec_activation_min_max_asym8_asym8 "
"failed");
}
} else {
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
@ -474,8 +490,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
auto* params =
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
const OpData& data = *(static_cast<const OpData*>(node->user_data));
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
@ -483,38 +503,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* bias =
(NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
const TfLiteType data_type = input->type;
int width = SizeOfDimension(input, 2);
int height = SizeOfDimension(input, 1);
int filter_width = SizeOfDimension(filter, 2);
int filter_height = SizeOfDimension(filter, 1);
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(
context, affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kDepthwiseConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
filter_width, filter_height, data_type,
&data));
// (aselle): Consider whether float conv and quantized conv should be
// TODO(aselle): Consider whether float conv and quantized conv should be
// separate ops to avoid dispatch overhead here.
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
@ -538,9 +527,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} // namespace depthwise_conv
TfLiteRegistration Register_DEPTHWISE_CONV_2D() {
return {/*init=*/nullptr,
return {/*init=*/depthwise_conv::Init,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*prepare=*/depthwise_conv::Prepare,
/*invoke=*/depthwise_conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -39,7 +39,7 @@ limitations under the License.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -53,6 +53,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
#if HIFI_VFPU
int err;
const float* inp_data_ptr;
float* out_data_ptr;
@ -66,6 +67,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
err = xa_nn_elm_floor_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_floor_f32_f32 failed");
#else
reference_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
} // namespace floor

View File

@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -43,7 +43,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -70,7 +70,7 @@ constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
TfLiteStatus CalculateOpData(TfLiteContext* context,
TfLiteFullyConnectedParams* params,
TfLiteFusedActivation activation,
TfLiteType data_type, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
@ -84,7 +84,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context,
QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
data->output_shift = -exponent;
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
context, activation, output, &data->output_activation_min,
&data->output_activation_max));
}
return status;
@ -92,20 +92,50 @@ TfLiteStatus CalculateOpData(TfLiteContext* context,
} // namespace
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TFLITE_DCHECK(node->user_data != nullptr);
TFLITE_DCHECK(node->builtin_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
const auto params =
static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
TF_LITE_ENSURE_MSG(context, input->type == filter->type,
"Hybrid models are not supported on TFLite Micro.");
return CalculateOpData(context, params->activation, input->type, input,
filter, bias, output, data);
}
TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const OpData& data, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
FullyConnectedParams op_params;
tflite::FullyConnectedParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = -filter->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.output_multiplier = data->output_multiplier;
// (b/138810107): Figure out whether output shift should be inverted
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.output_multiplier = data.output_multiplier;
// TODO(b/138810107): Figure out whether output shift should be inverted
op_params.output_shift = -data.output_shift;
op_params.quantized_activation_min = data.output_activation_min;
op_params.quantized_activation_max = data.output_activation_max;
reference_integer_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
@ -116,8 +146,7 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const OpData& data, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
@ -128,11 +157,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_multiplier = data.output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.output_shift = -data.output_shift;
op_params.quantized_activation_min = data.output_activation_min;
op_params.quantized_activation_max = data.output_activation_max;
#define TF_LITE_FULLY_CONNECTED(output_data_type) \
reference_ops::FullyConnected( \
@ -162,11 +191,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
CHECK_ERR_HIFI_NNLIB_KER(
ret, "xa_nn_fully_connected_asym8xasym8_asym8 failed");
}
for (int i = 0; i < batches * out_depth; i++) {
ACTIVATION_MIN_MAX_ASYM8(p_out[i], p_out[i],
data->output_activation_min,
data->output_activation_max)
}
ret = xa_nn_vec_activation_min_max_asym8_asym8(
p_out, p_out, data.output_activation_min, data.output_activation_max,
batches * out_depth);
CHECK_ERR_HIFI_NNLIB_KER(
ret, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
break;
}
case kTfLiteInt16:
@ -182,15 +212,16 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
TfLiteFusedActivation activation,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
CalculateActivationRange(activation, &output_activation_min,
&output_activation_max);
tflite::FullyConnectedParams op_params;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
#if HIFI_VFPU
int ret, b, weight_depth, out_depth, batches;
weight_depth =
GetTensorShape(filter).Dims(GetTensorShape(filter).DimensionsCount() - 1);
@ -208,43 +239,48 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
CHECK_ERR_HIFI_NNLIB_KER(ret, "xa_nn_fully_connected_f32 failed.");
}
float* p_out = GetTensorData<float>(output);
for (int i = 0; i < batches * out_depth; i++) {
ACTIVATION_MIN_MAX(float, p_out[i], p_out[i], output_activation_min,
output_activation_max)
}
ret = xa_nn_vec_activation_min_max_f32_f32(
p_out, p_out, output_activation_min, output_activation_max,
batches * out_depth);
CHECK_ERR_HIFI_NNLIB_KER(ret, "xa_nn_vec_activation_min_max_f32_f32 failed");
#else
tflite::reference_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
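Note: both branches of EvalFloat above compute the same thing: for every batch, a dot product of the input row with each filter row, plus the optional bias, clamped afterwards to the fused-activation range. A minimal reference sketch of that computation, illustrative only (dimension names follow the weight_depth/out_depth/batches variables above; no broadcasting or padding is handled):

/* Sketch of the float fully-connected computation performed by
 * xa_nn_fully_connected_f32 / reference_ops::FullyConnected above. */
#include <algorithm>

static void FullyConnectedFloatSketch(
    const float* input,   // [batches, weight_depth]
    const float* filter,  // [out_depth, weight_depth]
    const float* bias,    // [out_depth] or nullptr
    float* output,        // [batches, out_depth]
    int batches, int weight_depth, int out_depth,
    float activation_min, float activation_max) {
  for (int b = 0; b < batches; ++b) {
    for (int o = 0; o < out_depth; ++o) {
      float acc = bias ? bias[o] : 0.0f;
      for (int d = 0; d < weight_depth; ++d) {
        acc += input[b * weight_depth + d] * filter[o * weight_depth + d];
      }
      // Fused activation applied as a min/max clamp afterwards
      // (the role of xa_nn_vec_activation_min_max_f32_f32 above).
      output[b * out_depth + o] =
          std::min(std::max(acc, activation_min), activation_max);
    }
  }
}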
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
TFLITE_DCHECK(node->builtin_data != nullptr);
const auto* params =
static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteType data_type = input->type;
OpData local_data_object;
OpData* data = &local_data_object;
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
filter, bias, output, data));
TFLITE_DCHECK(node->user_data != nullptr);
const OpData& data = *(static_cast<const OpData*>(node->user_data));
switch (filter->type) { // Already know in/out types are same.
// Checks in Prepare ensure input, output and filter types are all the same.
switch (input->type) {
case kTfLiteFloat32:
return EvalFloat(context, node, params, data, input, filter, bias,
return EvalFloat(context, node, params->activation, input, filter, bias,
output);
case kTfLiteInt8:
return EvalQuantizedInt8(context, node, params, data, input, filter, bias,
return EvalQuantizedInt8(context, node, data, input, filter, bias,
output);
case kTfLiteUInt8:
return EvalQuantized(context, node, params, data, input, filter, bias,
output);
return EvalQuantized(context, node, data, input, filter, bias, output);
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(filter->type), filter->type);
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
@ -253,9 +289,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} // namespace fully_connected
TfLiteRegistration Register_FULLY_CONNECTED() {
return {/*init=*/nullptr,
return {/*init=*/fully_connected::Init,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*prepare=*/fully_connected::Prepare,
/*invoke=*/fully_connected::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,


@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -34,32 +34,68 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/logistic.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/logistic.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
namespace micro {
namespace activations {
namespace {
constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
struct OpData {
int32_t input_zero_point;
int32_t input_range_radius;
int32_t input_multiplier;
int input_left_shift;
};
TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node,
OpData* data) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, output->params.zero_point,
std::numeric_limits<int8_t>::min());
static constexpr int kInputIntegerBits = 4;
const double input_real_multiplier =
static_cast<double>(input->params.scale) *
static_cast<double>(1 << (31 - kInputIntegerBits));
const double q = std::frexp(input_real_multiplier, &data->input_left_shift);
data->input_multiplier = static_cast<int32_t>(TfLiteRound(q * (1ll << 31)));
data->input_range_radius =
CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31);
}
return kTfLiteOk;
}
} // namespace
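Note: CalculateArithmeticOpData above precomputes the fixed-point parameters that reference_integer_ops::Logistic consumes in the int8 branch below: the zero-point-shifted input is rescaled via input_multiplier / input_left_shift, and anything whose magnitude exceeds input_range_radius saturates straight to the output limits. A rough per-element sketch of how those fields are used; the fixed-point sigmoid core is stood in for by a float computation purely for illustration, and the 1/256 output scale is the usual int8 logistic convention rather than something stated in this diff:

/* Illustrative per-element skeleton of the int8 logistic dispatch below. */
#include <algorithm>
#include <cmath>
#include <cstdint>

static int8_t LogisticInt8Sketch(int8_t input_value, int32_t input_zero_point,
                                 int32_t input_range_radius, float input_scale) {
  const int32_t centered = static_cast<int32_t>(input_value) - input_zero_point;
  // Saturation short-cut driven by input_range_radius.
  if (centered <= -input_range_radius) return -128;  // output zero point is -128
  if (centered >= input_range_radius) return 127;
  // Stand-in for the fixed-point core (which uses input_multiplier /
  // input_left_shift instead of the float scale).
  const float x = centered * input_scale;
  const float y = 1.0f / (1.0f + std::exp(-x));
  // Output is conventionally quantized with scale 1/256 and zero point -128.
  const int32_t q = static_cast<int32_t>(std::lround(y * 256.0f)) - 128;
  return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, q)));
}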
TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
OpData data;
CalculateArithmeticOpData(context, node, &data);
if (input->type == kTfLiteFloat32) {
switch (output->type) {
case kTfLiteFloat32: {
#if HIFI_VFPU
int err;
const float* inp_data_ptr;
float* out_data_ptr;
@ -73,6 +109,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
err = xa_nn_vec_sigmoid_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_sigmoid_f32_f32 failed");
#else
reference_ops::Logistic(
GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
default:
@ -84,11 +125,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} else if (input->type == kTfLiteInt8) {
switch (output->type) {
case kTfLiteInt8: {
reference_ops::Logistic(
GetTensorShape(input), GetTensorData<int8_t>(input),
input->params.scale, input->params.zero_point,
GetTensorShape(output), GetTensorData<int8_t>(output),
output->params.scale, output->params.zero_point);
reference_integer_ops::Logistic(
input->params.zero_point, data.input_range_radius,
data.input_multiplier, data.input_left_shift,
NumElements(input->dims), GetTensorData<int8_t>(input),
GetTensorData<int8_t>(output));
return kTfLiteOk;
}
default:
@ -98,7 +139,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteError;
}
} else {
// (b/141211002): Also support other data types once we have supported
// TODO(b/141211002): Also support other data types once we have supported
// temporary tensors in TFLM.
TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
TfLiteTypeGetName(input->type),
@ -114,7 +155,7 @@ TfLiteRegistration Register_LOGISTIC() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/activations::Eval,
/*invoke=*/activations::LogisticEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,


@ -0,0 +1,229 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/mul.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/memory_helpers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace mul {
constexpr int kInput1Tensor = 0;
constexpr int kInput2Tensor = 1;
constexpr int kOutputTensor = 0;
struct OpData {
int32_t output_activation_min;
int32_t output_activation_max;
int32_t output_multiplier;
int output_shift;
};
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpData* data) {
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
double real_multiplier = static_cast<double>(input1->params.scale) *
static_cast<double>(input2->params.scale) /
static_cast<double>(output->params.scale);
QuantizeMultiplier(real_multiplier, &data->output_multiplier,
&data->output_shift);
}
return kTfLiteOk;
}
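Note: CalculateOpData above follows the usual quantized-multiply recipe: the product of the two input scales divided by the output scale becomes a single output_multiplier/output_shift pair, and the activation range is precomputed as integer limits. A per-element sketch of the requantization that both the reference kernels and xa_nn_elm_mul_asym8xasym8_asym8 perform; double arithmetic stands in for the fixed-point multiplier, and all scales and zero points are made-up examples:

/* Sketch: one element of a quantized (asymmetric uint8) multiply. */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical quantization parameters.
  const float s1 = 0.5f, s2 = 0.25f, s_out = 1.0f;
  const int32_t z1 = 128, z2 = 128, z_out = 128;
  const double real_multiplier = static_cast<double>(s1) * s2 / s_out;  // 0.125

  const uint8_t q1 = 140, q2 = 100;  // example inputs
  const int32_t raw =
      (static_cast<int32_t>(q1) - z1) * (static_cast<int32_t>(q2) - z2);
  int32_t q = static_cast<int32_t>(std::lround(raw * real_multiplier)) + z_out;
  q = std::min<int32_t>(255, std::max<int32_t>(0, q));  // uint8 activation range
  std::printf("q_out=%d\n", q);  // (140-128)*(100-128) = -336 -> -42 + 128 = 86
  return 0;
}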
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
if (output->dims->size == 0) {
return AllocateOutputDimensionsFromInput(context, input1, input2, output);
}
return kTfLiteOk;
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpData* data,
const TfLiteTensor* input1,
const TfLiteTensor* input2, TfLiteTensor* output) {
if (output->type == kTfLiteInt8 || output->type == kTfLiteUInt8) {
tflite::ArithmeticParams op_params;
SetActivationParams(data->output_activation_min,
data->output_activation_max, &op_params);
op_params.input1_offset = -input1->params.zero_point;
op_params.input2_offset = -input2->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = data->output_shift;
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_MUL(type, opname, dtype) \
type::opname(op_params, GetTensorShape(input1), \
GetTensorData<dtype>(input1), GetTensorShape(input2), \
GetTensorData<dtype>(input2), GetTensorShape(output), \
GetTensorData<dtype>(output));
if (output->type == kTfLiteInt8) {
if (need_broadcast) {
TF_LITE_MUL(reference_integer_ops, BroadcastMul4DSlow, int8_t);
} else {
TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
}
} else if (output->type == kTfLiteUInt8) {
if (need_broadcast) {
TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, uint8_t);
} else {
int err;
const RuntimeShape& input1_shape = GetTensorShape(input1);
const RuntimeShape& input2_shape = GetTensorShape(input2);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
err = xa_nn_elm_mul_asym8xasym8_asym8(
GetTensorData<uint8_t>(output), op_params.output_offset,
op_params.output_shift, op_params.output_multiplier,
op_params.quantized_activation_min,
op_params.quantized_activation_max, GetTensorData<uint8_t>(input1),
op_params.input1_offset, GetTensorData<uint8_t>(input2),
op_params.input2_offset, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_mul_asym8xasym8_asym8 failed");
}
}
#undef TF_LITE_MUL
}
return kTfLiteOk;
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteMulParams* params, OpData* data,
const TfLiteTensor* input1, const TfLiteTensor* input2,
TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::ArithmeticParams op_params;
SetActivationParams(output_activation_min, output_activation_max, &op_params);
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
GetTensorShape(input1), GetTensorShape(input2), &op_params);
#define TF_LITE_MUL(opname) \
reference_ops::opname(op_params, GetTensorShape(input1), \
GetTensorData<float>(input1), GetTensorShape(input2), \
GetTensorData<float>(input2), GetTensorShape(output), \
GetTensorData<float>(output));
if (need_broadcast) {
TF_LITE_MUL(BroadcastMul4DSlow);
} else {
#if HIFI_VFPU
int err;
const RuntimeShape& input1_shape = GetTensorShape(input1);
const RuntimeShape& input2_shape = GetTensorShape(input2);
const RuntimeShape& output_shape = GetTensorShape(output);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
err = xa_nn_elm_mul_f32xf32_f32(GetTensorData<float>(output),
GetTensorData<float>(input1),
GetTensorData<float>(input2), flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_mul_f32xf32_f32 failed");
err = xa_nn_vec_activation_min_max_f32_f32(
GetTensorData<float>(output), GetTensorData<float>(output),
output_activation_min, output_activation_max, flat_size);
CHECK_ERR_HIFI_NNLIB_KER(err,
"xa_nn_vec_activation_min_max_f32_f32 failed");
#else
TF_LITE_MUL(Mul);
#endif /* HIFI_VFPU */
}
#undef TF_LITE_MUL
return kTfLiteOk;
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, &data));
switch (input1->type) {
case kTfLiteUInt8:
case kTfLiteInt8:
TF_LITE_ENSURE_OK(context, EvalQuantized(context, node, params, &data,
input1, input2, output));
break;
case kTfLiteFloat32:
TF_LITE_ENSURE_OK(context, EvalFloat(context, node, params, &data, input1,
input2, output));
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input1->type), input1->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace mul
TfLiteRegistration Register_MUL() {
return {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/mul::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
}
} // namespace micro
} // namespace ops
} // namespace tflite


@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -40,7 +40,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -83,6 +83,7 @@ TfLiteStatus AverageEvalFloat(TfLiteContext* context, const TfLiteNode* node,
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
#if HIFI_VFPU
const int stride_height = params->stride_height;
const int stride_width = params->stride_width;
const int pad_width = data->padding.width;
@ -168,6 +169,20 @@ TfLiteStatus AverageEvalFloat(TfLiteContext* context, const TfLiteNode* node,
CHECK_ERR_HIFI_NNLIB_KER(
err, "AveragepoolFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
}
#else
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
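Note: the #else branch added above routes the float case to reference_ops::AveragePool when the HiFi vector FPU path is compiled out. For one output position, both paths average the in-bounds pixels of the filter window and clamp the result to the activation range. A minimal single-window sketch, illustrative only (one channel and one batch, with the same stride/filter/padding fields as the PoolParams filled in above):

/* Sketch: one output element of float average pooling. */
#include <algorithm>

static float AveragePoolOneOutput(
    const float* input, int input_height, int input_width,
    int out_y, int out_x, int stride_height, int stride_width,
    int filter_height, int filter_width, int pad_height, int pad_width,
    float activation_min, float activation_max) {
  const int in_y_origin = out_y * stride_height - pad_height;
  const int in_x_origin = out_x * stride_width - pad_width;
  float sum = 0.0f;
  int count = 0;  // only in-bounds pixels contribute to the average
  for (int fy = 0; fy < filter_height; ++fy) {
    for (int fx = 0; fx < filter_width; ++fx) {
      const int in_y = in_y_origin + fy;
      const int in_x = in_x_origin + fx;
      if (in_y < 0 || in_y >= input_height || in_x < 0 || in_x >= input_width) {
        continue;
      }
      sum += input[in_y * input_width + in_x];
      ++count;
    }
  }
  const float avg = count > 0 ? sum / count : 0.0f;
  return std::min(std::max(avg, activation_min), activation_max);
}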
@ -177,7 +192,6 @@ TfLiteStatus AverageEvalQuantized(TfLiteContext* context,
const OpData* data, const TfLiteTensor* input,
TfLiteTensor* output) {
TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8);
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
@ -295,6 +309,7 @@ TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
#if HIFI_VFPU
const int stride_height = params->stride_height;
const int stride_width = params->stride_width;
const int pad_width = data->padding.width;
@ -378,6 +393,20 @@ TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
CHECK_ERR_HIFI_NNLIB_KER(
err, "MaxpoolFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
}
#else
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::MaxPool(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(output),
GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
@ -491,7 +520,6 @@ TfLiteStatus MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
}
return kTfLiteOk;
}
} // namespace
@ -504,7 +532,7 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
// Inputs and outputs share the same type, guarenteed by the converter.
// Inputs and outputs share the same type, guaranteed by the converter.
switch (input->type) {
case kTfLiteFloat32:
AverageEvalFloat(context, node, params, &data, input, output);


@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -17,8 +17,8 @@
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -43,7 +43,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
namespace micro {
@ -105,6 +105,7 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
// Takes a tensor and performs softmax along the last dimension.
TfLiteStatus SoftmaxFloat(TfLiteContext* context, const TfLiteTensor* input,
TfLiteTensor* output, const SoftmaxParams& op_data) {
#if HIFI_VFPU
const RuntimeShape& input_shape = GetTensorShape(input);
const float* input_data = GetTensorData<float>(input);
const RuntimeShape& output_shape = GetTensorShape(output);
@ -133,6 +134,11 @@ TfLiteStatus SoftmaxFloat(TfLiteContext* context, const TfLiteTensor* input,
xa_nn_vec_softmax_f32_f32(&output_data[i * depth], p_scratch, depth);
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_softmax_f32_f32 failed");
}
#else
tflite::reference_ops::Softmax(
op_data, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
#endif /* HIFI_VFPU */
return kTfLiteOk;
}
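Note: SoftmaxFloat above either runs the NNLIB routine per row (a scratch pass followed by xa_nn_vec_softmax_f32_f32) or falls back to reference_ops::Softmax; both compute a numerically stable softmax along the last dimension. A minimal per-row sketch, illustrative only (the beta scaling carried in SoftmaxParams is left out):

/* Sketch: numerically stable softmax over one row of `depth` floats. */
#include <algorithm>
#include <cmath>

static void SoftmaxRowSketch(const float* input, float* output, int depth) {
  // Subtract the row max first so exp() cannot overflow.
  const float max_val = *std::max_element(input, input + depth);
  float sum = 0.0f;
  for (int i = 0; i < depth; ++i) {
    output[i] = std::exp(input[i] - max_val);
    sum += output[i];
  }
  const float inv_sum = 1.0f / sum;
  for (int i = 0; i < depth; ++i) output[i] *= inv_sum;
}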


@ -1,5 +1,5 @@
/******************************************************************************
* Copyright (C) 2019 Cadence Design Systems, Inc.
/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
@ -18,7 +18,6 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
@ -44,8 +43,8 @@ limitations under the License.
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"
#include "tensorflow/lite/micro/kernels/activation_utils.h"
#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
#include "tensorflow/lite/micro/micro_utils.h"
#include "xtensa_tf_micro_common.h"
namespace tflite {
namespace ops {
@ -53,10 +52,6 @@ namespace micro {
namespace svdf {
namespace {
// These constants represent constants specific to the hotword "OK G" model.
// They exist until (b/132070898) is fixed.
constexpr int kScratchTensorMaxSize = 64;
struct OpData {
int32 effective_scale_1_a;
int32 effective_scale_2_a;
@ -64,6 +59,8 @@ struct OpData {
// shift value - typically between [-32, 32].
int effective_scale_1_b;
int effective_scale_2_b;
int scratch_tensor_index;
int scratch_output_tensor_index;
};
/**
@ -84,6 +81,7 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
float* const __restrict__ output_ptr) {
// Compute matmul(activation_state, weights_time).
#if HIFI_VFPU
float* scratch_bias = scratch_ptr;
if (bias_ptr) {
const float* bias_data = bias_ptr;
@ -111,6 +109,51 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
weights_time_vec += memory_size * rank;
}
}
#else
for (int b = 0; b < batch_size; ++b) {
// Perform batched vector dot product:
float* scratch_ptr_batch = scratch_ptr + b * num_filters;
const float* vector1_ptr = weights_time_ptr;
const float* vector2_ptr = state_ptr + b * memory_size * num_filters;
for (int i = 0; i < num_filters; ++i) {
*scratch_ptr_batch = 0.f;
for (int j = 0; j < memory_size; ++j) {
*scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++;
}
scratch_ptr_batch++;
}
}
// Initialize output with bias if provided.
if (bias_ptr) {
// VectorBatchVectorAssign
for (int i = 0; i < batch_size; ++i) {
float* output_data = output_ptr + i * num_units;
const float* bias_data = bias_ptr;
for (int j = 0; j < num_units; ++j) {
*output_data++ = *bias_data++;
}
}
} else {
float* output_data = output_ptr;
for (int i = 0; i < batch_size * num_units; ++i) {
*output_data++ = 0.0f;
}
}
// Reduction sum.
for (int b = 0; b < batch_size; ++b) {
float* output_ptr_batch = output_ptr + b * num_units;
float* scratch_ptr_batch = scratch_ptr + b * num_filters;
// Reduction sum vector
for (int i = 0; i < num_units; ++i) {
for (int j = 0; j < rank; j++) {
output_ptr_batch[i] += *scratch_ptr_batch++;
}
}
}
#endif /* HIFI_VFPU */
// Apply activation.
for (int b = 0; b < batch_size; ++b) {
@ -127,7 +170,8 @@ inline TfLiteStatus EvalFloatSVDF(
TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
const TfLiteTensor* bias, const TfLiteSVDFParams* params,
TfLiteTensor* activation_state, TfLiteTensor* output) {
int scratch_tensor_index, TfLiteTensor* activation_state,
TfLiteTensor* output) {
const int rank = params->rank;
const int batch_size = input->dims->data[0];
const int input_size = input->dims->data[1];
@ -142,10 +186,11 @@ inline TfLiteStatus EvalFloatSVDF(
float* state_ptr = GetTensorData<float>(activation_state);
// TODO(b/132070898): Move this temp variable to the new scratch buffer API
// when ready.
float scratch_tensor[kScratchTensorMaxSize];
float* scratch_ptr = scratch_tensor;
TFLITE_DCHECK(context != nullptr);
TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
float* scratch_ptr = static_cast<float*>(
context->GetScratchBuffer(context, scratch_tensor_index));
float* output_ptr = GetTensorData<float>(output);
@ -174,6 +219,7 @@ inline TfLiteStatus EvalFloatSVDF(
float* result = &state_ptr[memory_size - 1];
float* result_in_batch = result;
#if HIFI_VFPU
float* out_scratch = scratch_ptr;
float* bias_scratch = output_ptr;
for (int i = 0; i < num_units; i++) bias_scratch[i] = 0.0f;
@ -195,6 +241,20 @@ inline TfLiteStatus EvalFloatSVDF(
result_in_batch += memory_size;
}
}
#else
for (int i = 0; i < batch_size; ++i) {
const float* matrix_ptr = matrix;
for (int j = 0; j < num_filters; ++j) {
float dot_prod = 0.0f;
const float* vector_in_batch = vector + i * input_size;
for (int k = 0; k < input_size; ++k) {
dot_prod += *matrix_ptr++ * *vector_in_batch++;
}
*result_in_batch = dot_prod;
result_in_batch += memory_size;
}
}
#endif /* HIFI_VFPU */
}
return ApplyTimeWeightsBiasAndActivation(
@ -203,13 +263,15 @@ inline TfLiteStatus EvalFloatSVDF(
output_ptr);
}
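Note: the SVDF changes in this file replace the fixed kScratchTensorMaxSize stack arrays with the TFLM scratch-buffer API: Prepare() requests arena space through RequestScratchBufferInArena() and stores the returned index in OpData, and Eval-time code resolves that index to a pointer with GetScratchBuffer(). A condensed sketch of that pattern as it appears in this kernel; the function and struct names are illustrative, the API calls and sizes follow the diff:

/* Sketch of the TFLM scratch-buffer pattern used by this kernel. */
#include "tensorflow/lite/c/common.h"

struct SketchOpData {
  int scratch_tensor_index;
};

// In Prepare(): request arena space and remember the returned index.
TfLiteStatus PrepareScratchSketch(TfLiteContext* context, SketchOpData* data,
                                  int batch_size, int num_filters) {
  return context->RequestScratchBufferInArena(
      context, batch_size * num_filters * sizeof(float),
      &data->scratch_tensor_index);
}

// In Eval(): resolve the index to a pointer valid for this invocation only.
float* GetScratchSketch(TfLiteContext* context, const SketchOpData& data) {
  return static_cast<float*>(
      context->GetScratchBuffer(context, data.scratch_tensor_index));
}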
void EvalIntegerSVDF(
TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input_tensor,
void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
const TfLiteTensor* input_tensor,
const TfLiteTensor* weights_feature_tensor,
const TfLiteTensor* weights_time_tensor, const TfLiteTensor* bias_tensor,
const TfLiteSVDFParams* params, TfLiteTensor* activation_state_tensor,
TfLiteTensor* output_tensor, int32_t scale_1_a, int scale_1_b,
int32_t scale_2_a, int scale_2_b, int32_t input_zp, int32_t output_zp) {
const TfLiteTensor* weights_time_tensor,
const TfLiteTensor* bias_tensor,
const TfLiteSVDFParams* params,
TfLiteTensor* activation_state_tensor,
TfLiteTensor* output_tensor, const OpData& data,
int32_t input_zp, int32_t output_zp) {
const int n_rank = params->rank;
const int n_batch = input_tensor->dims->data[0];
const int n_input = input_tensor->dims->data[1];
@ -217,10 +279,13 @@ void EvalIntegerSVDF(
const int n_unit = n_filter / n_rank;
const int n_memory = weights_time_tensor->dims->data[1];
// TODO(b/132070898): Move these temp variables to the new scratch buffer API
// when ready.
int32_t scratch_tensor[kScratchTensorMaxSize];
int32_t scratch_output_tensor[kScratchTensorMaxSize];
TFLITE_DCHECK(context != nullptr);
TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
int32_t* scratch_tensor = static_cast<int32_t*>(
context->GetScratchBuffer(context, data.scratch_tensor_index));
int32_t* scratch_output_tensor = static_cast<int32_t*>(
context->GetScratchBuffer(context, data.scratch_output_tensor_index));
// Shift states.
int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
@ -254,8 +319,8 @@ void EvalIntegerSVDF(
for (int c = 0; c < n_input; c++) {
dot_prod += *matrix_ptr++ * (*vector_in_batch++ - input_zp);
}
dot_prod =
MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
dot_prod = MultiplyByQuantizedMultiplier(
dot_prod, data.effective_scale_1_a, data.effective_scale_1_b);
dot_prod = std::min(std::max(output_min, dot_prod), output_max);
// This assumes state is symmetrically quantized. Otherwise last bit of
// state should be initialized to its zero point and accumulate the
@ -328,7 +393,8 @@ void EvalIntegerSVDF(
const int32_t output_min = std::numeric_limits<int8_t>::min();
for (int i = 0; i < n_batch * n_unit; ++i) {
int32_t x1 = scratch_output_tensor[i];
int32_t x2 = MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b);
int32_t x2 = MultiplyByQuantizedMultiplier(x1, data.effective_scale_2_a,
data.effective_scale_2_b);
int32_t x3 = x2 + output_zp;
int32_t x4 = std::min(std::max(output_min, x3), output_max);
GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
@ -349,8 +415,20 @@ constexpr int kInputActivationStateTensor = 4;
// Output tensor.
constexpr int kOutputTensor = 0;
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
void* data = nullptr;
if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
kTfLiteError) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
TFLITE_DCHECK(node->builtin_data != nullptr);
const auto* params = static_cast<const TfLiteSVDFParams*>(node->builtin_data);
// Validate Tensor Inputs (dtype depends on quantization):
// [0] = Input, {2, batch_size, input_size}
@ -359,7 +437,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// [3] = Bias (optional), {1, num_units}
// [4] = Activation State (variable),
// {2, batch_size, memory_size * num_filters}
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* weights_feature =
GetInput(context, node, kWeightsFeatureTensor);
@ -378,8 +455,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
const int num_units = num_filters / rank;
const int memory_size = weights_time->dims->data[1];
const bool is_full_integer = input->type == kTfLiteInt8;
// Validate Input Tensor:
TF_LITE_ENSURE(context,
input->type == kTfLiteFloat32 || input->type == kTfLiteInt8);
@ -403,7 +478,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, weights_time->dims->data[1], memory_size);
// Validate Optional Bias Input Tensor:
if (bias) {
if (bias != nullptr) {
TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units);
}
@ -413,51 +488,75 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1],
memory_size * num_filters);
if (is_full_integer) {
TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8);
TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16);
if (bias) {
TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
if (bias != nullptr) {
TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
}
TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
// Validate Scratch Tensors:
// [0] = (shared - see float block below for usage)
// [1] = Output Temp, int8_t, {2, num_units, batch_size}
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().
// Validate output tensor:
TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
} else {
TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
// Validate Input Tensor dtypes:
const auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
input->quantization.params);
const auto* weights_feature_params =
static_cast<const TfLiteAffineQuantization*>(
weights_feature->quantization.params);
const auto* state_params = static_cast<const TfLiteAffineQuantization*>(
activation_state->quantization.params);
const auto* weight_time_params =
static_cast<const TfLiteAffineQuantization*>(
weights_time->quantization.params);
const auto* output_params = static_cast<const TfLiteAffineQuantization*>(
output->quantization.params);
const double effective_scale_1 =
static_cast<double>(input_params->scale->data[0] *
weights_feature_params->scale->data[0] /
state_params->scale->data[0]);
const double effective_scale_2 = static_cast<double>(
state_params->scale->data[0] * weight_time_params->scale->data[0] /
output_params->scale->data[0]);
TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
QuantizeMultiplier(effective_scale_1, &(data->effective_scale_1_a),
&(data->effective_scale_1_b));
QuantizeMultiplier(effective_scale_2, &(data->effective_scale_2_a),
&(data->effective_scale_2_b));
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
context, batch_size * num_filters * sizeof(int32_t),
&(data->scratch_tensor_index));
TF_LITE_ENSURE_OK(context, scratch_status);
const TfLiteStatus scratch_output_status =
context->RequestScratchBufferInArena(
context, batch_size * num_units * sizeof(int32_t),
&(data->scratch_output_tensor_index));
TF_LITE_ENSURE_OK(context, scratch_output_status);
} else {
TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteFloat32);
TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteFloat32);
if (bias) {
if (bias != nullptr) {
TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
}
// Validate shared Scratch Tensor:
// [0] = Holds dot-product of time-forward calculations in
// ApplyTimeWeightsBiasAndActivation():
// float/int32, {2, batch_size, num_filters}
// TODO(b/132070898): Scratch values are used as stack variables in
// EvalIntegerSVDF().
// Full-float SVDF only uses the one shared scratch tensor (see above for
// usage).
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1);
TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
TFLITE_DCHECK(node->user_data != nullptr);
OpData* data = static_cast<OpData*>(node->user_data);
TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
context, batch_size * num_filters * sizeof(float),
&(data->scratch_tensor_index));
TF_LITE_ENSURE_OK(context, scratch_status);
}
return kTfLiteOk;
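Note: for the integer path, Prepare() above now derives both effective scales once: effective_scale_1 = input_scale * weights_feature_scale / state_scale feeds the feature matmul, and effective_scale_2 = state_scale * weights_time_scale / output_scale feeds the final output requantization; each is turned into a multiplier/shift pair with QuantizeMultiplier and cached in OpData. A small numeric sketch of that derivation with made-up scales, double arithmetic only, for illustration:

/* Sketch: the two effective scales the integer SVDF Prepare() computes. */
#include <cstdio>

int main() {
  // Hypothetical tensor scales.
  const double input_scale = 0.024;
  const double weights_feature_scale = 0.007;
  const double state_scale = 0.008;        // activation_state
  const double weights_time_scale = 0.005;
  const double output_scale = 0.023;

  const double effective_scale_1 =
      input_scale * weights_feature_scale / state_scale;  // feature matmul
  const double effective_scale_2 =
      state_scale * weights_time_scale / output_scale;    // time matmul + output
  // Each value is then passed to QuantizeMultiplier() to obtain the
  // effective_scale_*_a (Q31 multiplier) / effective_scale_*_b (shift)
  // pairs stored in OpData above.
  std::printf("scale_1=%f scale_2=%f\n", effective_scale_1, effective_scale_2);
  return 0;
}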
@ -476,56 +575,24 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
GetVariableInput(context, node, kInputActivationStateTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const bool is_full_integer = input->type == kTfLiteInt8;
TFLITE_DCHECK(node->user_data != nullptr);
const OpData& data = *(static_cast<const OpData*>(node->user_data));
switch (weights_feature->type) {
case kTfLiteFloat32: {
// TODO(b/132070898): Use input tensor as variable until scratch tensor
// allocation has been implemented.
// TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
return EvalFloatSVDF(context, node, input, weights_feature, weights_time,
bias, params, activation_state, output);
bias, params, data.scratch_tensor_index,
activation_state, output);
break;
}
case kTfLiteInt8: {
if (is_full_integer) {
// TODO(b/132070898): Store these values in ::Prepare() instead of
// ::Eval():
// Calculate effective scales.
OpData op_data;
auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
input->quantization.params);
auto* weights_feature_params =
reinterpret_cast<TfLiteAffineQuantization*>(
weights_feature->quantization.params);
auto* state_params = reinterpret_cast<TfLiteAffineQuantization*>(
activation_state->quantization.params);
auto* weight_time_params = reinterpret_cast<TfLiteAffineQuantization*>(
weights_time->quantization.params);
auto* output_params = reinterpret_cast<TfLiteAffineQuantization*>(
output->quantization.params);
const double effective_scale_1 =
static_cast<double>(input_params->scale->data[0] *
weights_feature_params->scale->data[0] /
state_params->scale->data[0]);
const double effective_scale_2 = static_cast<double>(
state_params->scale->data[0] * weight_time_params->scale->data[0] /
output_params->scale->data[0]);
QuantizeMultiplier(effective_scale_1, &op_data.effective_scale_1_a,
&op_data.effective_scale_1_b);
QuantizeMultiplier(effective_scale_2, &op_data.effective_scale_2_a,
&op_data.effective_scale_2_b);
TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActRelu);
EvalIntegerSVDF(
context, node, input, weights_feature, weights_time, bias, params,
activation_state, output, op_data.effective_scale_1_a,
op_data.effective_scale_1_b, op_data.effective_scale_2_a,
op_data.effective_scale_2_b, input->params.zero_point,
output->params.zero_point);
EvalIntegerSVDF(context, node, input, weights_feature, weights_time, bias,
params, activation_state, output, data,
input->params.zero_point, output->params.zero_point);
return kTfLiteOk;
}
break;
}
@ -540,7 +607,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
} // namespace svdf
TfLiteRegistration Register_SVDF() {
return {/*init=*/nullptr,
return {/*init=*/svdf::Init,
/*free=*/nullptr,
/*prepare=*/svdf::Prepare,
/*invoke=*/svdf::Eval,


@ -4,6 +4,8 @@ ifneq ($(filter xtensa_hifi, $(ALL_TAGS)),)
ifneq (,$(filter hifi4%, $(TARGET_ARCH)))
NNLIB = xa_nnlib_hifi4
CCFLAGS += -DNNLIB_V2 \
-DXTENSA_NNLIB_MAX_SCRATCH_SIZE=70*1024
@ -11,56 +13,60 @@ ifneq ($(filter xtensa_hifi, $(ALL_TAGS)),)
-DXTENSA_NNLIB_MAX_SCRATCH_SIZE=70*1024
MICROLITE_CC_SRCS += \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_f32_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_asym8_asym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_32_16.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_32_8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_softmax_asym8_asym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_floor_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_circ_buf.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_asym8xasym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_matXvec_asym8xasym8_asym8_circ.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_matXvec_f32_circ.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_asym8xasym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_circ_buf.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/fc/hifi4/xa_nn_fully_connected.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_16x16.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x16.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_asym8xasym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_f32.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_f32_nhwc.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8_nhwc.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_f32_nhwc.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8_nhwc.c \
$(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_inv_256_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_sigmoidf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_tanhf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_reluf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_softmaxf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_alognf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/scl_sigmoidf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/scl_tanhf_hifi4.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/expf_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/pow2f_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/inff_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/tanhf_tbl.c \
$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/nanf_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_f32_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_asym8_asym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_32_16.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_32_8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_softmax_asym8_asym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_floor_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_add_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_add_quant8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_mul_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_mul_quant8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_circ_buf.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_asym8xasym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_matXvec_asym8xasym8_asym8_circ.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_matXvec_f32_circ.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_asym8xasym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_circ_buf.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/fc/hifi4/xa_nn_fully_connected.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_16x16.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x16.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_asym8xasym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_f32.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_f32_nhwc.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8_nhwc.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_f32_nhwc.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8_nhwc.c \
$(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_inv_256_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_sigmoidf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_tanhf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_reluf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_softmaxf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_alognf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/scl_sigmoidf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/scl_tanhf_hifi4.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/expf_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/pow2f_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/inff_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/tanhf_tbl.c \
$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/nanf_tbl.c \
INCLUDES += -I$(XTENSA_PATH)/xa_nnlib/algo/kernels/ \
-I$(XTENSA_PATH)/xa_nnlib/include/nnlib/ \
-I$(XTENSA_PATH)/xa_nnlib/include/ \
-I$(XTENSA_PATH)/xa_nnlib/algo/common/include/ \
-I$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/include/ \
INCLUDES += -I$(XTENSA_PATH)/$(NNLIB)/algo/kernels/ \
-I$(XTENSA_PATH)/$(NNLIB)/include/nnlib/ \
-I$(XTENSA_PATH)/$(NNLIB)/include/ \
-I$(XTENSA_PATH)/$(NNLIB)/algo/common/include/ \
-I$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/include/ \
endif


@ -5,7 +5,7 @@
ifeq ($(TARGET), xtensa_hifi)
TARGET_ARCH := hifi3_bd5
$(eval $(call add_third_party_download,$(XTENSA_HIFI4_URL),$(XTENSA_HIFI4_MD5),xa_nnlib,))
$(eval $(call add_third_party_download,$(XTENSA_HIFI4_URL),$(XTENSA_HIFI4_MD5),xa_nnlib_hifi4,))
PLATFORM_ARGS = \
-mno-mul16 \


@ -80,8 +80,8 @@ EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6"
ZEPHYR_URL := "https://github.com/antmicro/zephyr/archive/55e36b9.zip"
ZEPHYR_MD5 := "755622eb4812fde918a6382b65d50c3b"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_06_27.zip"
XTENSA_HIFI4_MD5 :="45fdc1209a8da62ab568aa6040f7eabf"
ETHOSU_URL := "https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git/snapshot/ethos-u-core-driver-bcb5aaa99756f1b5c1295b079ebdd60996bc75a5.tar.gz"
ETHOSU_MD5 := "d2073c8d88fc167fd5c46b5dcda58ea1"