Merge pull request #40943 from pnikam-cad:hifi4_nnlib_v2_2_0_update

PiperOrigin-RevId: 322616499
Change-Id: I6a08cb4f11abe33c38c91a72cc45c635d0f78797
Commit: bf3b14ffcb
tensorflow/lite/micro/kernels/xtensa_hifi/activations.cc
@@ -1,24 +1,24 @@
-/******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
+/*******************************************************************************
+ * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to use this Software with Cadence processor cores only and
  * not with any other processors and platforms, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-******************************************************************************/

+******************************************************************************/
 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
@@ -41,8 +41,8 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
 #include "tensorflow/lite/micro/micro_utils.h"
-#include "xtensa_tf_micro_common.h"

 namespace tflite {
 namespace ops {
@@ -109,6 +109,7 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {

   switch (input->type) {
     case kTfLiteFloat32: {
+#if HIFI_VFPU
       int err;
       const float* inp_data_ptr;
       float* out_data_ptr;
@@ -119,11 +120,13 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
       inp_data_ptr = GetTensorData<float>(input);
       out_data_ptr = GetTensorData<float>(output);

-      const float f32_pos_inf = 0x7F800000;
-      err = xa_nn_vec_relu_f32_f32(out_data_ptr, inp_data_ptr, f32_pos_inf,
-                                   flat_size);
+      err = xa_nn_vec_relu_std_f32_f32(out_data_ptr, inp_data_ptr, flat_size);

-      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu1_f32_f32 failed");
+      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu_std_f32_f32 failed");
+#else
+      ReluFloat(GetTensorShape(input), GetTensorData<float>(input),
+                GetTensorShape(output), GetTensorData<float>(output));
+#endif /* HIFI_VFPU */
       return kTfLiteOk;
     }
     case kTfLiteInt8: {
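An aside on the lines removed above: `const float f32_pos_inf = 0x7F800000;` never actually held +infinity. The integer literal is value-converted, so the variable held 2139095040.0f; 0x7F800000 is the bit pattern of +inf, not its value. Switching to the `_std` ReLU variant, which takes no upper bound, sidesteps the problem. A minimal standalone illustration of the distinction (standard C++ only, nothing here comes from the patch):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

int main() {
  // What the removed line computed: an integer value-converted to float.
  const float converted = 0x7F800000;  // 2139095040.0f, large but finite

  // Reinterpreting the same bits as a float does give +inf.
  const uint32_t bits = 0x7F800000u;
  float reinterpreted;
  std::memcpy(&reinterpreted, &bits, sizeof(reinterpreted));

  const float inf = std::numeric_limits<float>::infinity();

  std::printf("converted:     %f\n", converted);      // 2139095040.000000
  std::printf("reinterpreted: %f\n", reinterpreted);  // inf
  std::printf("limits:        %f\n", inf);            // inf
  return 0;
}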
@@ -140,14 +143,17 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
       const RuntimeShape& input_shape = GetTensorShape(input);
       const RuntimeShape& output_shape = GetTensorShape(output);
       const int flat_size = MatchingFlatSize(input_shape, output_shape);
+      const uint8_t zero = input->params.zero_point;

       inp_data_ptr = GetTensorData<uint8_t>(input);
       out_data_ptr = GetTensorData<uint8_t>(output);

       err = xa_nn_vec_activation_min_max_asym8_asym8(
-          out_data_ptr, inp_data_ptr, 0, 255, flat_size);  // Is 255 right?
+          out_data_ptr, inp_data_ptr, zero, std::numeric_limits<uint8_t>::max(),
+          flat_size);

-      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_activation_min_max_8_8 failed");
+      CHECK_ERR_HIFI_NNLIB_KER(
+          err, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
       return kTfLiteOk;
     }
     default: {
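Why `zero` and `std::numeric_limits<uint8_t>::max()` are the right bounds here: with asymmetric quantization, real 0.0 maps to the zero point, so clamping quantized values to [zero_point, 255] computes max(x, 0.0) in the real domain. That also answers the removed `// Is 255 right?` comment: 255 was right, but the lower bound 0 was not, unless the zero point happened to be 0. A scalar model of what the vectorized call is being asked to do here (this sketch mirrors the semantics, not the NNLib implementation):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Quantized ReLU: clamp to [zero_point, 255]. The upper bound is implicit
// because uint8_t cannot exceed 255.
void ReluAsym8(uint8_t* out, const uint8_t* in, uint8_t zero_point, int n) {
  for (int i = 0; i < n; ++i) {
    out[i] = std::max(in[i], zero_point);
  }
}

int main() {
  // Example: scale 0.1 and zero_point 128, so 118 encodes -1.0 and 138
  // encodes +1.0.
  const uint8_t in[4] = {118, 128, 138, 255};
  uint8_t out[4];
  ReluAsym8(out, in, /*zero_point=*/128, 4);
  for (uint8_t v : out) std::printf("%u ", v);  // 128 128 138 255
  std::printf("\n");
  return 0;
}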
@@ -168,6 +174,7 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {

   switch (input->type) {
     case kTfLiteFloat32: {
+#if HIFI_VFPU
       int err;
       const float* inp_data_ptr;
       float* out_data_ptr;
@@ -180,7 +187,11 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {

       err = xa_nn_vec_relu6_f32_f32(out_data_ptr, inp_data_ptr, flat_size);

-      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu1_f32_f32 failed");
+      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_relu6_f32_f32 failed");
+#else
+      Relu6Float(GetTensorShape(input), GetTensorData<float>(input),
+                 GetTensorShape(output), GetTensorData<float>(output));
+#endif /* HIFI_VFPU */
       return kTfLiteOk;
     }
     case kTfLiteInt8: {
@@ -209,7 +220,8 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       err = xa_nn_vec_activation_min_max_asym8_asym8(out_data_ptr, inp_data_ptr,
                                                      zero, six, flat_size);

-      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_activation_min_max_8_8 failed");
+      CHECK_ERR_HIFI_NNLIB_KER(
+          err, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
       return kTfLiteOk;
     }
     default: {
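The `zero` and `six` bounds in the Relu6 hunk are the quantized images of the real interval [0, 6]; their computation presumably sits just above the context shown, so it does not appear in this diff. A hedged sketch of the usual formula, q = zero_point + round(real / scale), saturated to the uint8 range:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Quantize a real value with the given scale/zero_point, saturating to uint8.
uint8_t QuantizeUint8(float real, float scale, int32_t zero_point) {
  const int32_t q =
      zero_point + static_cast<int32_t>(std::round(real / scale));
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

int main() {
  // Example parameters: scale 0.05, zero_point 3.
  const float scale = 0.05f;
  const int32_t zero_point = 3;
  const uint8_t zero = QuantizeUint8(0.0f, scale, zero_point);  // 3
  const uint8_t six = QuantizeUint8(6.0f, scale, zero_point);   // 123
  std::printf("ReLU6 clamps quantized values to [%u, %u]\n", zero, six);
  return 0;
}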
tensorflow/lite/micro/kernels/xtensa_hifi/add.cc (new file, 273 lines)
@@ -0,0 +1,273 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/add.h"
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
+#include "tensorflow/lite/micro/memory_helpers.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace add {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+struct OpData {
+  bool requires_broadcast;
+
+  // These fields are used in both the general 8-bit -> 8bit quantized path,
+  // and the special 16-bit -> 16bit quantized path
+  int input1_shift;
+  int input2_shift;
+  int32 output_activation_min;
+  int32 output_activation_max;
+
+  // These fields are used only in the general 8-bit -> 8bit quantized path
+  int32 input1_multiplier;
+  int32 input2_multiplier;
+  int32 output_multiplier;
+  int output_shift;
+  int left_shift;
+  int32 input1_offset;
+  int32 input2_offset;
+  int32 output_offset;
+};
+
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteAddParams* params,
+                             const TfLiteTensor* input1,
+                             const TfLiteTensor* input2, TfLiteTensor* output,
+                             OpData* data) {
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+    // 8bit -> 8bit general quantized path, with general rescalings
+    data->input1_offset = -input1->params.zero_point;
+    data->input2_offset = -input2->params.zero_point;
+    data->output_offset = output->params.zero_point;
+    data->left_shift = 20;
+    const double twice_max_input_scale =
+        2 * static_cast<double>(
+                std::max(input1->params.scale, input2->params.scale));
+    const double real_input1_multiplier =
+        static_cast<double>(input1->params.scale) / twice_max_input_scale;
+    const double real_input2_multiplier =
+        static_cast<double>(input2->params.scale) / twice_max_input_scale;
+    const double real_output_multiplier =
+        twice_max_input_scale /
+        ((1 << data->left_shift) * static_cast<double>(output->params.scale));
+
+    QuantizeMultiplierSmallerThanOneExp(
+        real_input1_multiplier, &data->input1_multiplier, &data->input1_shift);
+
+    QuantizeMultiplierSmallerThanOneExp(
+        real_input2_multiplier, &data->input2_multiplier, &data->input2_shift);
+
+    QuantizeMultiplierSmallerThanOneExp(
+        real_output_multiplier, &data->output_multiplier, &data->output_shift);
+
+    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
+        context, params->activation, output, &data->output_activation_min,
+        &data->output_activation_max));
+  }
+
+  return kTfLiteOk;
+}
+
TfLiteStatus EvalAdd(TfLiteContext* context, TfLiteNode* node,
|
||||||
|
TfLiteAddParams* params, const OpData* data,
|
||||||
|
const TfLiteTensor* input1, const TfLiteTensor* input2,
|
||||||
|
TfLiteTensor* output) {
|
||||||
|
float output_activation_min, output_activation_max;
|
||||||
|
CalculateActivationRange(params->activation, &output_activation_min,
|
||||||
|
&output_activation_max);
|
||||||
|
tflite::ArithmeticParams op_params;
|
||||||
|
SetActivationParams(output_activation_min, output_activation_max, &op_params);
|
||||||
|
#define TF_LITE_ADD(opname) \
|
||||||
|
reference_ops::opname(op_params, GetTensorShape(input1), \
|
||||||
|
GetTensorData<float>(input1), GetTensorShape(input2), \
|
||||||
|
GetTensorData<float>(input2), GetTensorShape(output), \
|
||||||
|
GetTensorData<float>(output))
|
||||||
|
if (data->requires_broadcast) {
|
||||||
|
TF_LITE_ADD(BroadcastAdd4DSlow);
|
||||||
|
} else {
|
||||||
|
#if HIFI_VFPU
|
||||||
|
int err;
|
||||||
|
const RuntimeShape& input1_shape = GetTensorShape(input1);
|
||||||
|
const RuntimeShape& input2_shape = GetTensorShape(input2);
|
||||||
|
const RuntimeShape& output_shape = GetTensorShape(output);
|
||||||
|
const int flat_size =
|
||||||
|
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||||
|
|
||||||
|
err = xa_nn_elm_add_f32xf32_f32(GetTensorData<float>(output),
|
||||||
|
GetTensorData<float>(input1),
|
||||||
|
GetTensorData<float>(input2), flat_size);
|
||||||
|
|
||||||
|
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_add_f32xf32_f32 failed");
|
||||||
|
|
||||||
|
err = xa_nn_vec_activation_min_max_f32_f32(
|
||||||
|
GetTensorData<float>(output), GetTensorData<float>(output),
|
||||||
|
output_activation_min, output_activation_max, flat_size);
|
||||||
|
|
||||||
|
CHECK_ERR_HIFI_NNLIB_KER(err,
|
||||||
|
"xa_nn_vec_activation_min_max_f32_f32 failed");
|
||||||
|
#else
|
||||||
|
TF_LITE_ADD(Add);
|
||||||
|
#endif /* HIFI_VFPU */
|
||||||
|
}
|
||||||
|
#undef TF_LITE_ADD
|
||||||
|
return kTfLiteOk;
|
||||||
|
}
|
||||||
|
|
||||||
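Note the shape of the HiFi float path in EvalAdd: one NNLib call adds the flattened buffers, and a second clamps the result in place to the fused-activation range. A scalar model of that two-pass structure (the xa_nn_* names above are the NNLib API; this sketch only mirrors their semantics):

#include <algorithm>
#include <cstdio>

void ElementwiseAdd(float* out, const float* a, const float* b, int n) {
  for (int i = 0; i < n; ++i) out[i] = a[i] + b[i];
}

// In-place clamp, standing in for the activation_min_max second pass.
void ActivationMinMax(float* buf, float lo, float hi, int n) {
  for (int i = 0; i < n; ++i) buf[i] = std::min(hi, std::max(lo, buf[i]));
}

int main() {
  const float a[4] = {-4.f, -1.f, 2.f, 9.f};
  const float b[4] = {1.f, 1.f, 1.f, 1.f};
  float out[4];
  ElementwiseAdd(out, a, b, 4);
  ActivationMinMax(out, 0.f, 6.f, 4);  // e.g. a fused ReLU6
  for (float v : out) std::printf("%g ", v);  // 0 0 3 6
  std::printf("\n");
  return 0;
}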
+TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
+                              TfLiteAddParams* params, const OpData* data,
+                              const TfLiteTensor* input1,
+                              const TfLiteTensor* input2,
+                              TfLiteTensor* output) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+    tflite::ArithmeticParams op_params;
+    op_params.left_shift = data->left_shift;
+    op_params.input1_offset = data->input1_offset;
+    op_params.input1_multiplier = data->input1_multiplier;
+    op_params.input1_shift = data->input1_shift;
+    op_params.input2_offset = data->input2_offset;
+    op_params.input2_multiplier = data->input2_multiplier;
+    op_params.input2_shift = data->input2_shift;
+    op_params.output_offset = data->output_offset;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
+    bool need_broadcast = reference_ops::ProcessBroadcastShapes(
+        GetTensorShape(input1), GetTensorShape(input2), &op_params);
+#define TF_LITE_ADD(type, opname, dtype)                             \
+  type::opname(op_params, GetTensorShape(input1),                    \
+               GetTensorData<dtype>(input1), GetTensorShape(input2), \
+               GetTensorData<dtype>(input2), GetTensorShape(output), \
+               GetTensorData<dtype>(output));
+    if (output->type == kTfLiteInt8) {
+      if (need_broadcast) {
+        TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
+      } else {
+        TF_LITE_ADD(reference_integer_ops, Add, int8_t);
+      }
+    } else {
+      if (need_broadcast) {
+        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
+      } else {
+        int err;
+        const RuntimeShape& input1_shape = GetTensorShape(input1);
+        const RuntimeShape& input2_shape = GetTensorShape(input2);
+        const RuntimeShape& output_shape = GetTensorShape(output);
+        const int flat_size =
+            MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+        err = xa_nn_elm_add_asym8xasym8_asym8(
+            GetTensorData<uint8_t>(output), op_params.output_offset,
+            op_params.output_shift, op_params.output_multiplier,
+            op_params.quantized_activation_min,
+            op_params.quantized_activation_max, GetTensorData<uint8_t>(input1),
+            op_params.input1_offset, op_params.input1_shift,
+            op_params.input1_multiplier, GetTensorData<uint8_t>(input2),
+            op_params.input2_offset, op_params.input2_shift,
+            op_params.input2_multiplier, op_params.left_shift, flat_size);
+
+        CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_add_asym8xasym8_asym8 failed");
+      }
+    }
+#undef TF_LITE_ADD
+  }
+
+  return kTfLiteOk;
+}
+
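For reference, the arithmetic that both the reference kernels and xa_nn_elm_add_asym8xasym8_asym8 perform with the parameters marshalled above: offset each input, pre-shift left, scale by the per-input multiplier, add, then rescale and offset into the output grid. A hedged scalar model, with plain doubles standing in for the fixed-point multiplies:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Scalar model of TFLite's quantized add. Real kernels use Q31 fixed-point
// multipliers; doubles keep the sketch short.
uint8_t QuantizedAdd(uint8_t x1, uint8_t x2, int32_t input1_offset,
                     int32_t input2_offset, double input1_multiplier,
                     double input2_multiplier, double output_multiplier,
                     int left_shift, int32_t output_offset) {
  const int32_t shifted1 = (x1 + input1_offset) * (1 << left_shift);
  const int32_t shifted2 = (x2 + input2_offset) * (1 << left_shift);
  const int32_t sum =
      static_cast<int32_t>(std::round(shifted1 * input1_multiplier)) +
      static_cast<int32_t>(std::round(shifted2 * input2_multiplier));
  const int32_t out =
      static_cast<int32_t>(std::round(sum * output_multiplier)) +
      output_offset;
  return static_cast<uint8_t>(std::min(255, std::max(0, out)));
}

int main() {
  // Inputs and output all use scale 0.01 and zero_point 128, so the input
  // multipliers are 0.5 each and the output multiplier is 2 * 2^-left_shift.
  const int left_shift = 20;
  const uint8_t q =
      QuantizedAdd(/*x1=*/178, /*x2=*/148, -128, -128, 0.5, 0.5,
                   2.0 / (1 << left_shift), left_shift, 128);
  std::printf("%u\n", q);  // 198: real 0.5 + 0.2 = 0.7 -> 0.7/0.01 + 128
  return 0;
}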
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* data = nullptr;
+  if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
+      kTfLiteError) {
+    return nullptr;
+  }
+  return data;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  OpData* data = static_cast<OpData*>(node->user_data);
+  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
+
+  TF_LITE_ENSURE_STATUS(
+      CalculateOpData(context, params, input1, input2, output, data));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
+
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData* data = static_cast<const OpData*>(node->user_data);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+    TF_LITE_ENSURE_OK(
+        context, EvalAdd(context, node, params, data, input1, input2, output));
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_OK(context, EvalAddQuantized(context, node, params, data,
+                                                input1, input2, output));
+  } else {
+    TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
+                       TfLiteTypeGetName(output->type), output->type);
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace add
+
+TfLiteRegistration Register_ADD() {
+  return {/*init=*/add::Init,
+          /*free=*/nullptr,
+          /*prepare=*/add::Prepare,
+          /*invoke=*/add::Eval,
+          /*profiling_string=*/nullptr,
+          /*builtin_code=*/0,
+          /*custom_name=*/nullptr,
+          /*version=*/0};
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
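To make the registration contract concrete, a hypothetical smoke check of the struct returned above; the interpreter, not user code, normally consumes these function pointers, calling init once per node, prepare once per allocation, and invoke once per inference:

#include <cassert>

#include "tensorflow/lite/c/common.h"

namespace tflite {
namespace ops {
namespace micro {
TfLiteRegistration Register_ADD();  // defined in add.cc above
}  // namespace micro
}  // namespace ops
}  // namespace tflite

int main() {
  const TfLiteRegistration reg = tflite::ops::micro::Register_ADD();
  // This kernel opted into the full lifecycle: persistent state in init,
  // one-time quantization math in prepare, arithmetic only in invoke.
  assert(reg.init != nullptr);
  assert(reg.prepare != nullptr);
  assert(reg.invoke != nullptr);
  assert(reg.free == nullptr);  // nothing to free; the arena owns the memory
  return 0;
}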
tensorflow/lite/micro/kernels/xtensa_hifi/conv.cc
@@ -1,24 +1,24 @@
-/******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
+/*******************************************************************************
+ * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to use this Software with Cadence processor cores only and
  * not with any other processors and platforms, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-******************************************************************************/

+******************************************************************************/
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
@@ -44,7 +44,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
-#include "xtensa_tf_micro_common.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"

 namespace tflite {
 namespace ops {
@@ -55,7 +55,6 @@ constexpr int kInputTensor = 0;
 constexpr int kFilterTensor = 1;
 constexpr int kBiasTensor = 2;
 constexpr int kOutputTensor = 0;
-constexpr int kMaxChannels = 256;

 // Conv is quantized along dimension 0:
 // https://www.tensorflow.org/lite/performance/quantization_spec
@@ -71,9 +70,8 @@ struct OpData {
   int output_shift;

   // Per channel output multiplier and shift.
-  // (b/141139247): Allocate these dynamically when possible.
-  int32_t per_channel_output_multiplier[kMaxChannels];
-  int32_t per_channel_output_shift[kMaxChannels];
+  int32_t* per_channel_output_multiplier;
+  int32_t* per_channel_output_shift;

   // The range of the fused activation layer. For example for kNone and
   // uint8_t these would be 0 and 255.
@@ -94,10 +92,10 @@ inline PaddingType RuntimePaddingType(TfLitePadding padding) {
 }

 TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteConvParams* params, int width, int height,
-                             int filter_width, int filter_height, int out_width,
-                             int out_height, const TfLiteType data_type,
-                             OpData* data) {
+                             const TfLiteConvParams* params, int width,
+                             int height, int filter_width, int filter_height,
+                             int out_width, int out_height,
+                             const TfLiteType data_type, OpData* data) {
   bool has_bias = node->inputs->size == 3;
   // Check number of inputs/outputs
   TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
@@ -131,8 +129,69 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
   return kTfLiteOk;
 }

+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* data = nullptr;
+  if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
+      kTfLiteError) {
+    return nullptr;
+  }
+  return data;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  OpData* data = static_cast<OpData*>(node->user_data);
+  const auto params = static_cast<const TfLiteConvParams*>(node->builtin_data);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+
+  int input_width = input->dims->data[2];
+  int input_height = input->dims->data[1];
+  int filter_width = filter->dims->data[2];
+  int filter_height = filter->dims->data[1];
+  int output_width = output->dims->data[2];
+  int output_height = output->dims->data[1];
+
+  // Dynamically allocate per-channel quantization parameters.
+  const int num_channels = filter->dims->data[kConvQuantizedDimension];
+  TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
+      context, num_channels * sizeof(int32_t),
+      reinterpret_cast<void**>(&data->per_channel_output_multiplier)));
+  TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
+      context, num_channels * sizeof(int32_t),
+      reinterpret_cast<void**>(&data->per_channel_output_shift)));
+
+  // All per-channel quantized tensors need valid zero point and scale arrays.
+  if (input->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+
+    const auto* affine_quantization =
+        static_cast<TfLiteAffineQuantization*>(filter->quantization.params);
+    TF_LITE_ENSURE(context, affine_quantization);
+    TF_LITE_ENSURE(context, affine_quantization->scale);
+    TF_LITE_ENSURE(context, affine_quantization->zero_point);
+
+    TF_LITE_ENSURE(context,
+                   affine_quantization->scale->size == 1 ||
+                       affine_quantization->scale->size ==
+                           filter->dims->data[kConvQuantizedDimension]);
+    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
+                      affine_quantization->zero_point->size);
+  }
+
+  return CalculateOpData(context, node, params, input_width, input_height,
+                         filter_width, filter_height, output_width,
+                         output_height, input->type, data);
+}  // namespace conv
+
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                           TfLiteConvParams* params, OpData* data,
+                           TfLiteConvParams* params, const OpData& data,
                            const TfLiteTensor* input,
                            const TfLiteTensor* filter, const TfLiteTensor* bias,
                            TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
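The new Init/Prepare pair is the TFLite Micro memory idiom this patch moves conv onto: every size is known by Prepare time, so the per-node OpData and the per-channel arrays come out of the interpreter's persistent arena instead of fixed kMaxChannels members, and Eval allocates nothing. A stripped-down sketch of the same pattern for an invented kernel, using the AllocatePersistentBuffer call shapes exactly as they appear in the diff:

#include <cstdint>

#include "tensorflow/lite/c/common.h"

namespace {

struct MyOpData {
  int32_t* per_channel_multiplier;  // filled in Prepare, read in Eval
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // One OpData per node instance, alive as long as the interpreter.
  void* data = nullptr;
  if (context->AllocatePersistentBuffer(context, sizeof(MyOpData), &data) ==
      kTfLiteError) {
    return nullptr;
  }
  return data;
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* data = static_cast<MyOpData*>(node->user_data);
  const int num_channels = 8;  // normally read from the filter tensor dims
  // Arena-backed array replaces the old fixed-size kMaxChannels arrays.
  TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
      context, num_channels * sizeof(int32_t),
      reinterpret_cast<void**>(&data->per_channel_multiplier)));
  return kTfLiteOk;
}

}  // namespace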
@@ -143,9 +202,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,

   if ((params->dilation_width_factor == 1) &&
       (params->dilation_height_factor == 1)) {
-    const uint8 *input_data, *filter_data;
+    const uint8_t *input_data, *filter_data;
     const int32_t* bias_data;
-    uint8* output_data;
+    uint8_t* output_data;
     const RuntimeShape& input_shape = GetTensorShape(input);
     const RuntimeShape& filter_shape = GetTensorShape(filter);
     const RuntimeShape& output_shape = GetTensorShape(output);
@@ -158,14 +217,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,

     const int stride_width = params->stride_width;
     const int stride_height = params->stride_height;
-    const int dilation_width_factor = 1;
-    const int dilation_height_factor = 1;
-    const int pad_width = data->padding.width;
-    const int pad_height = data->padding.height;
-    const int32 output_activation_min = data->output_activation_min;
-    const int32 output_activation_max = data->output_activation_max;
-    const int32 output_multiplier = data->output_multiplier;
-    const int output_shift = -data->output_shift;
+    const int pad_width = data.padding.width;
+    const int pad_height = data.padding.height;
+    const int32 output_activation_min = data.output_activation_min;
+    const int32 output_activation_max = data.output_activation_max;
+    const int32 output_multiplier = data.output_multiplier;
+    const int output_shift = -data.output_shift;
     TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
     TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
     TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
@@ -186,13 +243,14 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     const int filter_depth = filter_shape.Dims(3);

     int err, output_data_format = 0;
-    void* p_scratch;
-    uint8 *p_filter, *p_out_scratch;
+    uint8_t* p_scratch;
+    uint8_t* p_filter;
     // Calculate filter_depth_padded as next near multiple of 4
     int filter_depth_padded = (filter_depth + 3) & (~3);
     int out_length = output_height * output_width * output_depth;
+    int filter_size_padded = filter_height * filter_width * filter_depth_padded;
     int required_scratch, input_precision = PREC_ASYM8;
-    int h, w, c;
+    int h, c;

     required_scratch = xa_nn_conv2d_std_getsize(
         input_height, input_depth, filter_height, filter_width, stride_height,
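Two small things in the hunk above: `(filter_depth + 3) & (~3)` rounds up to the next multiple of 4 (the float path later uses `(filter_depth + 1) & (~1)` for 2), and the new filter_size_padded simply hoists the repeated filter_height * filter_width * filter_depth_padded product into one variable. The bit trick, generalized to any power-of-two alignment:

#include <cstdio>

// Round x up to the next multiple of a power-of-two alignment.
constexpr int RoundUp(int x, int alignment) {
  return (x + alignment - 1) & ~(alignment - 1);
}

int main() {
  std::printf("%d %d %d\n", RoundUp(5, 4), RoundUp(8, 4), RoundUp(3, 2));
  // Prints "8 8 4": already-aligned values pass through unchanged.
  return 0;
}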
@@ -207,19 +265,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
     p_scratch = xtensa_nnlib_scratch_buf;

-    p_filter = (uint8*)p_scratch;
-    p_out_scratch =
-        (p_filter +
-         ALIGNED_SIZE((sizeof(uint8_t) * filter_height * filter_width *
-                       filter_depth_padded * output_depth),
-                      8));
+    p_filter = p_scratch;
     required_scratch +=
-        ALIGNED_SIZE((sizeof(uint8_t) * filter_height * filter_width *
-                      filter_depth_padded * output_depth),
-                     8);
-    p_scratch =
-        (uint8*)(p_out_scratch + ALIGNED_SIZE(sizeof(uint8_t) * out_length, 8));
-    required_scratch += ALIGNED_SIZE(sizeof(uint8_t) * out_length, 8);
+        ALIGNED_SIZE((sizeof(uint8_t) * filter_size_padded * output_depth), 8);
+    p_scratch +=
+        ALIGNED_SIZE(sizeof(uint8_t) * filter_size_padded * output_depth, 8);

     if (required_scratch > (int)XTENSA_NNLIB_MAX_SCRATCH_SIZE) {
       TF_LITE_KERNEL_LOG(context,
@@ -240,9 +290,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     }

     for (int batch = 0; batch < batches; ++batch) {
-      uint8* p_out_temp;
-      p_out_temp = (uint8*)&p_out_scratch[0];
-      p_out_temp = (uint8*)ALIGN_PTR(p_out_temp, 8);
+      uint8_t* p_out_temp;
+      p_out_temp = &output_data[batch * out_length];

       err = xa_nn_conv2d_std_asym8xasym8(
           p_out_temp,
@@ -252,24 +301,24 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
           filter_width, output_depth, stride_width, stride_height, pad_width,
           pad_height, output_height, output_width, input_offset, filter_offset,
           output_multiplier, output_shift, output_offset, output_data_format,
-          p_scratch);
+          static_cast<void*>(p_scratch));

       CHECK_ERR_HIFI_NNLIB_KER(
           err, "conv2d_std_asym8: xa_nn_conv2d_std_asym8xasym8 failed");

-      for (int i = 0; i < out_length; i++) {
-        uint8* p_temp;
-        p_temp = &output_data[batch * out_length];
-        ACTIVATION_MIN_MAX_ASYM8(p_temp[i], p_out_temp[i],
-                                 output_activation_min, output_activation_max)
-      }
+      err = xa_nn_vec_activation_min_max_asym8_asym8(
+          p_out_temp, p_out_temp, output_activation_min, output_activation_max,
+          out_length);
+
+      CHECK_ERR_HIFI_NNLIB_KER(
+          err, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
     }
   } else {
+    // TODO(b/154032858): Investigate removing extra copies.
     ConvParams op_params;
     op_params.padding_type = RuntimePaddingType(params->padding);
-    op_params.padding_values.width = data->padding.width;
-    op_params.padding_values.height = data->padding.height;
+    op_params.padding_values.width = data.padding.width;
+    op_params.padding_values.height = data.padding.height;
     op_params.stride_width = params->stride_width;
     op_params.stride_height = params->stride_height;
     op_params.dilation_width_factor = params->dilation_width_factor;
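The replacement above removes an extra copy: the old code convolved into an aligned scratch area and clamped element by element (via the ACTIVATION_MIN_MAX_ASYM8 macro) while copying into the output tensor, whereas the new code convolves straight into the output and clamps the whole batch in place with one vectorized NNLib call. What both variants compute, as a scalar model:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Clamp every element of a batch's output to the activation range.
void ActivationMinMaxAsym8(uint8_t* out, const uint8_t* in,
                           int activation_min, int activation_max, int n) {
  for (int i = 0; i < n; ++i) {
    const int v = in[i];
    out[i] = static_cast<uint8_t>(
        std::min(activation_max, std::max(activation_min, v)));
  }
}

int main() {
  const uint8_t in[4] = {0, 100, 200, 255};
  uint8_t out[4];
  ActivationMinMaxAsym8(out, in, /*activation_min=*/50,
                        /*activation_max=*/210, 4);
  for (uint8_t v : out) std::printf("%u ", v);  // 50 100 200 210
  std::printf("\n");
  return 0;
}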
@@ -277,10 +326,10 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     op_params.input_offset = input_offset;
     op_params.weights_offset = filter_offset;
     op_params.output_offset = output_offset;
-    op_params.output_multiplier = data->output_multiplier;
-    op_params.output_shift = -data->output_shift;
-    op_params.quantized_activation_min = data->output_activation_min;
-    op_params.quantized_activation_max = data->output_activation_max;
+    op_params.output_multiplier = data.output_multiplier;
+    op_params.output_shift = -data.output_shift;
+    op_params.quantized_activation_min = data.output_activation_min;
+    op_params.quantized_activation_max = data.output_activation_max;
     reference_ops::Conv(op_params, GetTensorShape(input),
                         GetTensorData<uint8_t>(input), GetTensorShape(filter),
                         GetTensorData<uint8_t>(filter), GetTensorShape(bias),
@@ -292,11 +341,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 }

 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteConvParams* params, OpData* data,
+                             TfLiteConvParams* params, const OpData& data,
                              const TfLiteTensor* input,
                              const TfLiteTensor* filter,
                              const TfLiteTensor* bias, TfLiteTensor* output,
                              TfLiteTensor* im2col) {
+  // TODO(b/154032858): Investigate removing extra copies.
   ConvParams op_params;
   op_params.input_offset = -input->params.zero_point;
   op_params.output_offset = output->params.zero_point;
@@ -304,14 +354,14 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
   op_params.stride_width = params->stride_width;
   op_params.dilation_height_factor = params->dilation_height_factor;
   op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.padding_values.height = data->padding.height;
-  op_params.padding_values.width = data->padding.width;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
+  op_params.padding_values.height = data.padding.height;
+  op_params.padding_values.width = data.padding.width;
+  op_params.quantized_activation_min = data.output_activation_min;
+  op_params.quantized_activation_max = data.output_activation_max;

   reference_integer_ops::ConvPerChannel(
-      op_params, data->per_channel_output_multiplier,
-      data->per_channel_output_shift, GetTensorShape(input),
+      op_params, data.per_channel_output_multiplier,
+      data.per_channel_output_shift, GetTensorShape(input),
       GetTensorData<int8>(input), GetTensorShape(filter),
       GetTensorData<int8>(filter), GetTensorShape(bias),
       GetTensorData<int32>(bias), GetTensorShape(output),
@@ -319,7 +369,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
 }

 TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
-                       TfLiteConvParams* params, OpData* data,
+                       TfLiteConvParams* params, const OpData& data,
                        const TfLiteTensor* input, const TfLiteTensor* filter,
                        const TfLiteTensor* bias, TfLiteTensor* im2col,
                        TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
@@ -327,6 +377,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRange(params->activation, &output_activation_min,
                            &output_activation_max);

+#if HIFI_VFPU
   if ((params->dilation_width_factor == 1) &&
       (params->dilation_height_factor == 1)) {
     const float *input_data, *filter_data;
@@ -344,10 +395,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,

     const int stride_width = params->stride_width;
     const int stride_height = params->stride_height;
-    const int dilation_width_factor = 1;
-    const int dilation_height_factor = 1;
-    const int pad_width = data->padding.width;
-    const int pad_height = data->padding.height;
+    const int pad_width = data.padding.width;
+    const int pad_height = data.padding.height;

     TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
     TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
     TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
@@ -366,13 +415,14 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
     const int output_width = output_shape.Dims(2);
     const int filter_depth = filter_shape.Dims(3);
     int err, output_data_format = 0;
-    void* p_scratch;
-    float *p_filter, *p_out_scratch;
+    uint8_t* p_scratch;
+    float* p_filter;
     // Calculate filter_depth_padded as next near multiple of 2
     int filter_depth_padded = (filter_depth + 1) & (~1);
     int out_length = output_height * output_width * output_depth;
+    int filter_size_padded = filter_height * filter_width * filter_depth_padded;
     int required_scratch, input_precision = PREC_F32;
-    int h, w, c;
+    int h, c;

     required_scratch = xa_nn_conv2d_std_getsize(
         input_height, input_depth, filter_height, filter_width, stride_height,
@@ -387,19 +437,11 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
     ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
     p_scratch = xtensa_nnlib_scratch_buf;

-    p_filter = (float*)p_scratch;
-    p_out_scratch =
-        (float*)((uint8_t*)p_filter +
-                 ALIGNED_SIZE((sizeof(float) * filter_height * filter_width *
-                               filter_depth_padded * output_depth),
-                              8));
+    p_filter = reinterpret_cast<float*>(p_scratch);
+    p_scratch +=
+        ALIGNED_SIZE((sizeof(float) * filter_size_padded * output_depth), 8);
     required_scratch +=
-        ALIGNED_SIZE((sizeof(float) * filter_height * filter_width *
-                      filter_depth_padded * output_depth),
-                     8);
-    p_scratch = (float*)((uint8_t*)p_out_scratch +
-                         ALIGNED_SIZE(sizeof(float) * out_length, 8));
-    required_scratch += ALIGNED_SIZE(sizeof(float) * out_length, 8);
+        ALIGNED_SIZE((sizeof(float) * filter_size_padded * output_depth), 8);

     if (required_scratch > (int)XTENSA_NNLIB_MAX_SCRATCH_SIZE) {
       TF_LITE_KERNEL_LOG(context,
@@ -420,8 +462,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,

     for (int batch = 0; batch < batches; ++batch) {
       float* p_out_temp;
-      p_out_temp = (float*)&p_out_scratch[0];
-      p_out_temp = (float*)ALIGN_PTR(p_out_temp, 8);
+      p_out_temp = &output_data[batch * out_length];

       err = xa_nn_conv2d_std_f32(
           p_out_temp,
@@ -429,23 +470,26 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
           p_filter, bias_data, input_height, input_width, input_depth,
           filter_height, filter_width, output_depth, stride_width,
           stride_height, pad_width, pad_height, output_height, output_width,
-          output_data_format, p_scratch);
+          output_data_format, static_cast<void*>(p_scratch));

       CHECK_ERR_HIFI_NNLIB_KER(
           err, "conv2d_std_f32: xa_nn_conv2d_std_f32xf32 failed");

-      for (int i = 0; i < out_length; i++) {
-        float* p_temp;
-        p_temp = &output_data[batch * out_length];
-        ACTIVATION_MIN_MAX(float, p_temp[i], p_out_temp[i],
-                           output_activation_min, output_activation_max)
-      }
+      err = xa_nn_vec_activation_min_max_f32_f32(
+          p_out_temp, p_out_temp, output_activation_min, output_activation_max,
+          out_length);
+
+      CHECK_ERR_HIFI_NNLIB_KER(err,
+                               "xa_nn_vec_activation_min_max_f32_f32 failed");
     }
-  } else {
+  } else
+#endif /* HIFI_VFPU */
+  {
+    // TODO(b/154032858): Investigate removing extra copies.
     ConvParams op_params;
     op_params.padding_type = RuntimePaddingType(params->padding);
-    op_params.padding_values.width = data->padding.width;
-    op_params.padding_values.height = data->padding.height;
+    op_params.padding_values.width = data.padding.width;
+    op_params.padding_values.height = data.padding.height;
     op_params.stride_width = params->stride_width;
     op_params.stride_height = params->stride_height;
     op_params.dilation_width_factor = params->dilation_width_factor;
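The brace arrangement in the hunk above is deliberate: `} else` sits inside the #if HIFI_VFPU region while the opening `{` of the reference path sits outside it, so with the VFPU the reference code compiles as the else-branch of the dilation check, and without it the same block compiles as a plain unconditional scope. Reduced to a skeleton:

#include <cstdio>

#define HIFI_VFPU 1  // flip to 0 to see the other expansion

void Eval(bool fast_path_ok) {
#if HIFI_VFPU
  if (fast_path_ok) {
    std::printf("optimized NNLib path\n");
  } else
#endif /* HIFI_VFPU */
  {
    // With HIFI_VFPU this is the else-branch; without it, it always runs.
    std::printf("reference path\n");
  }
}

int main() {
  Eval(true);   // optimized path (when HIFI_VFPU is 1)
  Eval(false);  // reference path either way
  return 0;
}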
@@ -471,50 +515,20 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
   const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);

-  int input_width = input->dims->data[2];
-  int input_height = input->dims->data[1];
-  int filter_width = filter->dims->data[2];
-  int filter_height = filter->dims->data[1];
-  int output_width = output->dims->data[2];
-  int output_height = output->dims->data[1];
-
-  OpData data;
-
-  // All per-channel quantized tensors need valid zero point and scale arrays.
-  if (input->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
-                      kTfLiteAffineQuantization);
-
-    const auto* affine_quantization =
-        reinterpret_cast<TfLiteAffineQuantization*>(
-            filter->quantization.params);
-    TF_LITE_ENSURE(context, affine_quantization);
-    TF_LITE_ENSURE(context, affine_quantization->scale);
-    TF_LITE_ENSURE(context, affine_quantization->zero_point);
-
-    TF_LITE_ENSURE(context,
-                   affine_quantization->scale->size == 1 ||
-                       affine_quantization->scale->size ==
-                           filter->dims->data[kConvQuantizedDimension]);
-    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
-                      affine_quantization->zero_point->size);
-  }
-
-  TF_LITE_ENSURE_STATUS(CalculateOpData(
-      context, node, params, input_width, input_height, filter_width,
-      filter_height, output_width, output_height, input->type, &data));
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData& data = *(static_cast<const OpData*>(node->user_data));

   switch (input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
-      EvalFloat(context, node, params, &data, input, filter, bias, nullptr,
+      EvalFloat(context, node, params, data, input, filter, bias, nullptr,
                 nullptr, output);
       break;
     case kTfLiteInt8:
-      EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
+      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
                               output, nullptr);
       break;
     case kTfLiteUInt8:
-      EvalQuantized(context, node, params, &data, input, filter, bias, nullptr,
+      EvalQuantized(context, node, params, data, input, filter, bias, nullptr,
                     nullptr, output);
       break;
     default:
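The net effect of the Eval hunk: everything that used to be recomputed on every invocation (shape reads, quantization validation, CalculateOpData into a stack-local OpData) now happens once in Prepare, and Eval just pulls the cached struct back out of node->user_data. The calling contract, reduced to a standalone model with the interpreter's role shrunk to three calls:

#include <cstdio>

struct Node {
  void* user_data;  // the interpreter stores Init's return value here
};

struct OpData {
  int output_multiplier;
};

void* Init() {
  static OpData slot;  // stands in for the persistent-arena allocation
  return &slot;
}

void Prepare(Node* node) {
  auto* data = static_cast<OpData*>(node->user_data);
  data->output_multiplier = 42;  // expensive setup, done once
}

void Eval(const Node* node) {
  const auto& data = *static_cast<const OpData*>(node->user_data);
  std::printf("eval with multiplier %d\n", data.output_multiplier);
}

int main() {
  Node node{};
  node.user_data = Init();                  // once per node
  Prepare(&node);                           // once per allocation
  for (int i = 0; i < 3; ++i) Eval(&node);  // many times
  return 0;
}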
@@ -528,9 +542,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace conv

 TfLiteRegistration Register_CONV_2D() {
-  return {/*init=*/nullptr,
+  return {/*init=*/conv::Init,
           /*free=*/nullptr,
-          /*prepare=*/nullptr,
+          /*prepare=*/conv::Prepare,
           /*invoke=*/conv::Eval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
@ -1,24 +1,24 @@
|
|||||||
/******************************************************************************
|
/*******************************************************************************
|
||||||
* Copyright (C) 2019 Cadence Design Systems, Inc.
|
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
|
||||||
*
|
*
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining
|
* Permission is hereby granted, free of charge, to any person obtaining
|
||||||
* a copy of this software and associated documentation files (the
|
* a copy of this software and associated documentation files (the
|
||||||
* "Software"), to use this Software with Cadence processor cores only and
|
* "Software"), to use this Software with Cadence processor cores only and
|
||||||
* not with any other processors and platforms, subject to
|
* not with any other processors and platforms, subject to
|
||||||
* the following conditions:
|
* the following conditions:
|
||||||
*
|
*
|
||||||
* The above copyright notice and this permission notice shall be included
|
* The above copyright notice and this permission notice shall be included
|
||||||
* in all copies or substantial portions of the Software.
|
* in all copies or substantial portions of the Software.
|
||||||
*
|
*
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||||
******************************************************************************/
|
|
||||||
|
|
||||||
|
******************************************************************************/
|
||||||
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
@@ -45,7 +45,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
-#include "xtensa_tf_micro_common.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"

 namespace tflite {
 namespace ops {
@@ -57,8 +57,6 @@ constexpr int kInputTensor = 0;
 constexpr int kFilterTensor = 1;
 constexpr int kBiasTensor = 2;
 constexpr int kOutputTensor = 0;
-// Per channel quantization is not needed for any model on xtensa.
-constexpr int kMaxChannels = 256;

 // Depthwise conv is quantized along dimension 3:
 // https://www.tensorflow.org/lite/performance/quantization_spec
@ -72,10 +70,8 @@ struct OpData {
|
|||||||
int output_shift;
|
int output_shift;
|
||||||
|
|
||||||
// Per channel output multiplier and shift.
|
// Per channel output multiplier and shift.
|
||||||
// (b/141139247): Allocate these dynamically when possible.
|
int32_t* per_channel_output_multiplier;
|
||||||
int32_t per_channel_output_multiplier[kMaxChannels];
|
int32_t* per_channel_output_shift;
|
||||||
int32_t per_channel_output_shift[kMaxChannels];
|
|
||||||
|
|
||||||
// The range of the fused activation layer. For example for kNone and
|
// The range of the fused activation layer. For example for kNone and
|
||||||
// uint8_t these would be 0 and 255.
|
// uint8_t these would be 0 and 255.
|
||||||
int32_t output_activation_min;
|
int32_t output_activation_min;
|
||||||
@ -107,26 +103,88 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
|
|||||||
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||||
int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
|
int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
|
||||||
|
|
||||||
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
|
return tflite::PopulateConvolutionQuantizationParams(
|
||||||
context, input, filter, bias, output, params->activation,
|
context, input, filter, bias, output, params->activation,
|
||||||
&data->output_multiplier, &data->output_shift,
|
&data->output_multiplier, &data->output_shift,
|
||||||
&data->output_activation_min, &data->output_activation_max,
|
&data->output_activation_min, &data->output_activation_max,
|
||||||
data->per_channel_output_multiplier,
|
data->per_channel_output_multiplier,
|
||||||
reinterpret_cast<int*>(data->per_channel_output_shift), num_channels));
|
reinterpret_cast<int*>(data->per_channel_output_shift), num_channels);
|
||||||
}
|
}
|
||||||
return kTfLiteOk;
|
return kTfLiteOk;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* data = nullptr;
+  if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
+      kTfLiteError) {
+    return nullptr;
+  }
+  return data;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+  OpData* data = static_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+
+  const TfLiteType data_type = input->type;
+  int width = SizeOfDimension(input, 2);
+  int height = SizeOfDimension(input, 1);
+  int filter_width = SizeOfDimension(filter, 2);
+  int filter_height = SizeOfDimension(filter, 1);
+
+  // Per channel quantization is only needed for int8 inference. For other
+  // quantized types, only a single scale and zero point is needed.
+  const int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
+  // Dynamically allocate per-channel quantization parameters.
+  TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
+      context, num_channels * sizeof(int32_t),
+      reinterpret_cast<void**>(&data->per_channel_output_multiplier)));
+  TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer(
+      context, num_channels * sizeof(int32_t),
+      reinterpret_cast<void**>(&data->per_channel_output_shift)));
+
+  // All per-channel quantized tensors need valid zero point and scale arrays.
+  if (input->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+
+    const auto* affine_quantization =
+        reinterpret_cast<TfLiteAffineQuantization*>(
+            filter->quantization.params);
+    TF_LITE_ENSURE(context, affine_quantization);
+    TF_LITE_ENSURE(context, affine_quantization->scale);
+    TF_LITE_ENSURE(context, affine_quantization->zero_point);
+    TF_LITE_ENSURE(
+        context, affine_quantization->scale->size == 1 ||
+                     affine_quantization->scale->size ==
+                         filter->dims->data[kDepthwiseConvQuantizedDimension]);
+    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
+                      affine_quantization->zero_point->size);
+  }
+
+  return CalculateOpData(context, node, params, width, height, filter_width,
+                         filter_height, data_type, data);
+}
+
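The allocations in Init and Prepare above rely on the arena semantics of this TFLM snapshot: persistent buffers are carved out once and never freed, so the OpData and the per-channel arrays stay valid for every later Eval call. A minimal sketch of the same pattern, assuming only the AllocatePersistentBuffer signature used above (the struct and helper names here are invented for illustration):

// Sketch only: per-kernel persistent state in TFLM (this snapshot's API).
// AllocatePersistentBuffer carves bytes from the arena and never frees them,
// so pointers stored here stay valid across all Eval() invocations.
struct PerChannelQuant {
  int32_t* multiplier;  // one Q31 multiplier per output channel
  int32_t* shift;       // one power-of-two shift per output channel
};

static TfLiteStatus AllocPerChannel(TfLiteContext* ctx, int num_channels,
                                    PerChannelQuant* q) {
  TF_LITE_ENSURE_STATUS(ctx->AllocatePersistentBuffer(
      ctx, num_channels * sizeof(int32_t),
      reinterpret_cast<void**>(&q->multiplier)));
  return ctx->AllocatePersistentBuffer(
      ctx, num_channels * sizeof(int32_t),
      reinterpret_cast<void**>(&q->shift));
}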
 TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
-                       TfLiteDepthwiseConvParams* params, OpData* data,
+                       TfLiteDepthwiseConvParams* params, const OpData* data,
                        const TfLiteTensor* input, const TfLiteTensor* filter,
                        const TfLiteTensor* bias, TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRange(params->activation, &output_activation_min,
                            &output_activation_max);
 
+#if HIFI_VFPU
   if ((params->dilation_width_factor == 1) &&
       (params->dilation_height_factor == 1)) {
     const float *input_data, *filter_data, *bias_data;

@@ -143,10 +201,6 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
 
     const int stride_width = params->stride_width;
     const int stride_height = params->stride_height;
-    const int dilation_width_factor = 1;
-    const int dilation_height_factor = 1;
-    // const int dilation_width_factor = params->dilation_width_factor;;
-    // const int dilation_height_factor = params->dilation_height_factor;
     const int pad_width = data->padding.width;
     const int pad_height = data->padding.height;
     const int depth_multiplier = params->depth_multiplier;

@@ -168,7 +222,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
     TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
 
     int32_t err, input_data_format = 0, output_data_format = 0;
-    void* p_scratch;
+    uint8_t* p_scratch;
     float* p_filter;
     int filter_depth_padded, filter_size_padded, required_scratch;
     int input_precision = PREC_F32;

@@ -198,9 +252,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
       return kTfLiteError;
     }
 
-    p_filter = (float*)p_scratch;
-    p_scratch = (void*)((uint8_t*)p_filter +
-                        ALIGNED_SIZE(sizeof(float) * filter_size_padded, 8));
+    p_filter = reinterpret_cast<float*>(p_scratch);
+    p_scratch += ALIGNED_SIZE(sizeof(float) * filter_size_padded, 8);
 
     for (h = 0; h < filter_height * filter_width; h++) {
       for (c = 0; c < filter_depth; c++) {

@@ -220,37 +273,22 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
           input_height, input_width, input_depth, filter_height, filter_width,
           depth_multiplier, stride_width, stride_height, pad_width, pad_height,
           output_height, output_width, input_data_format, output_data_format,
-          p_scratch);
+          static_cast<void*>(p_scratch));
 
       CHECK_ERR_HIFI_NNLIB_KER(
           err, "DepthwiseConvFloat: xa_nn_conv2d_depthwise_f32 failed");
     }
 
-    // pre loop for activation_min_max to handle alignment
     int out_length = batches * output_height * output_width * output_depth;
-    uint32 p_unalign_val = (uint32)output_data, p_align_val;
-    p_align_val = (p_unalign_val + 7) & (~7);
-
-    int pre_loop_count = p_align_val - p_unalign_val;
-    pre_loop_count = MIN(pre_loop_count, out_length);
-
-    for (i = 0; i < pre_loop_count; i++) {
-      ACTIVATION_MIN_MAX(float, output_data[i], output_data[i],
-                         output_activation_min, output_activation_max)
-    }
-
-    out_length = out_length - pre_loop_count;
-
-    if (out_length) {
-      err = xa_nn_vec_activation_min_max_f32_f32(
-          &output_data[i], &output_data[i], output_activation_min,
-          output_activation_max, out_length);
-
-      CHECK_ERR_HIFI_NNLIB_KER(
-          err,
-          "DepthwiseConvFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
-    }
-  } else {
+    err = xa_nn_vec_activation_min_max_f32_f32(
+        output_data, output_data, output_activation_min, output_activation_max,
+        out_length);
+
+    CHECK_ERR_HIFI_NNLIB_KER(
+        err, "DepthwiseConvFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
+  } else
+#endif /* HIFI_VFPU */
+  {
     tflite::DepthwiseParams op_params;
     // Padding type is ignored, but still set.
     op_params.padding_type = PaddingType::kSame;
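The deleted pre-loop existed only to reach an 8-byte-aligned output pointer before calling the vector primitive; the new code applies xa_nn_vec_activation_min_max_f32_f32 to the whole buffer in one call. Functionally the call is an element-wise clamp. A reference-semantics sketch in plain C++ (not the NNLib implementation, which is vectorized):

// What xa_nn_vec_activation_min_max_f32_f32(out, in, min, max, n) computes,
// expressed as scalar code: clamp every element into [act_min, act_max].
static void ActivationMinMaxF32(float* out, const float* in, float act_min,
                                float act_max, int n) {
  for (int i = 0; i < n; ++i) {
    float v = in[i];
    out[i] = v < act_min ? act_min : (v > act_max ? act_max : v);
  }
}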
@@ -274,8 +312,8 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
 }
 
 void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteDepthwiseConvParams* params, OpData* data,
-                             const TfLiteTensor* input,
+                             TfLiteDepthwiseConvParams* params,
+                             const OpData* data, const TfLiteTensor* input,
                              const TfLiteTensor* filter,
                              const TfLiteTensor* bias, TfLiteTensor* output) {
   DepthwiseParams op_params;

@@ -290,7 +328,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
   op_params.input_offset = -input->params.zero_point;
   op_params.weights_offset = 0;
   op_params.output_offset = output->params.zero_point;
-  // (b/130439627): Use calculated value for clamping.
+  // TODO(b/130439627): Use calculated value for clamping.
   op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
   op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
 

@@ -304,8 +342,8 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
 }
 
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                           TfLiteDepthwiseConvParams* params, OpData* data,
-                           const TfLiteTensor* input,
+                           TfLiteDepthwiseConvParams* params,
+                           const OpData* data, const TfLiteTensor* input,
                            const TfLiteTensor* filter, const TfLiteTensor* bias,
                            TfLiteTensor* output) {
   const int32_t input_offset = -input->params.zero_point;
@@ -314,9 +352,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 
   if ((params->dilation_width_factor == 1) &&
       (params->dilation_height_factor == 1)) {
-    const uint8 *input_data, *filter_data;
+    const uint8_t *input_data, *filter_data;
     const int32_t* bias_data;
-    uint8* output_data;
+    uint8_t* output_data;
     const RuntimeShape& input_shape = GetTensorShape(input);
     const RuntimeShape& filter_shape = GetTensorShape(filter);
     const RuntimeShape& output_shape = GetTensorShape(output);

@@ -329,10 +367,6 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 
     const int stride_width = params->stride_width;
     const int stride_height = params->stride_height;
-    const int dilation_width_factor = 1;
-    const int dilation_height_factor = 1;
-    // const int dilation_width_factor = params->dilation_width_factor;
-    // const int dilation_height_factor = params->dilation_height_factor;
     const int pad_width = data->padding.width;
     const int pad_height = data->padding.height;
     const int depth_multiplier = params->depth_multiplier;

@@ -360,11 +394,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
 
     int32_t err, i, input_data_format = 0, output_data_format = 0;
-    void* p_scratch;
-    uint8* p_filter;
+    uint8_t* p_scratch;
+    uint8_t* p_filter;
     int filter_depth_padded, filter_size_padded, required_scratch;
     int input_precision = PREC_ASYM8;
-    int h, c;
+    int h;
 
     ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
     p_scratch = xtensa_nnlib_scratch_buf;

@@ -390,18 +424,15 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
       return kTfLiteError;
     }
 
-    p_filter = (uint8*)p_scratch;
-    p_scratch = (void*)(p_filter +
-                        ALIGNED_SIZE(sizeof(uint8_t) * filter_size_padded, 8));
+    p_filter = p_scratch;
+    p_scratch += ALIGNED_SIZE(sizeof(uint8_t) * filter_size_padded, 8);
+    int pad_value = filter_depth_padded - filter_depth;
 
     for (h = 0; h < filter_height * filter_width; h++) {
-      for (c = 0; c < filter_depth; c++) {
-        p_filter[h * filter_depth_padded + c] =
-            filter_data[h * filter_depth + c];
-      }
-      for (c = filter_depth; c < filter_depth_padded; c++) {
-        p_filter[h * filter_depth_padded + c] = -filter_offset;
-      }
+      memcpy(&p_filter[h * filter_depth_padded], &filter_data[h * filter_depth],
+             filter_depth);
+      memset(&p_filter[h * filter_depth_padded + filter_depth], -filter_offset,
+             pad_value);
     }
 
     for (i = 0; i < batches; i++) {

@@ -413,37 +444,22 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
           depth_multiplier, stride_width, stride_height, pad_width, pad_height,
           output_height, output_width, input_offset, filter_offset,
           output_multiplier, output_shift, output_offset, input_data_format,
-          output_data_format, p_scratch);
+          output_data_format, static_cast<void*>(p_scratch));
 
       CHECK_ERR_HIFI_NNLIB_KER(
           err, "DepthwiseConvAsym8: xa_nn_conv2d_depthwise_asym8xasym8 failed");
     }
 
-    // pre loop for activation_min_max to handle alignment
     int out_length = batches * output_height * output_width * output_depth;
-    uint32 p_unalign_val = (uint32)output_data, p_align_val;
-    p_align_val = (p_unalign_val + 7) & (~7);
-
-    int pre_loop_count = p_align_val - p_unalign_val;
-    pre_loop_count = MIN(pre_loop_count, out_length);
-
-    for (i = 0; i < pre_loop_count; i++) {
-      ACTIVATION_MIN_MAX_ASYM8(output_data[i], output_data[i],
-                               output_activation_min, output_activation_max)
-    }
-
-    out_length = out_length - pre_loop_count;
-
-    if (out_length > 0) {
-      err = xa_nn_vec_activation_min_max_asym8_asym8(
-          &output_data[i], &output_data[i], output_activation_min,
-          output_activation_max, out_length);
-
-      CHECK_ERR_HIFI_NNLIB_KER(
-          err,
-          "DepthwiseConvAsym8: xa_nn_vec_activation_min_max_asym8_asym8 "
-          "failed");
-    }
+    err = xa_nn_vec_activation_min_max_asym8_asym8(
+        output_data, output_data, output_activation_min, output_activation_max,
+        out_length);
+
+    CHECK_ERR_HIFI_NNLIB_KER(
+        err,
+        "DepthwiseConvAsym8: xa_nn_vec_activation_min_max_asym8_asym8 "
+        "failed");
   } else {
     tflite::DepthwiseParams op_params;
     // Padding type is ignored, but still set.
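The rewritten copy loop pads each filter tap's channel dimension out to filter_depth_padded, and fills the tail with -filter_offset so that once the conv kernel adds the filter zero point back, the padded weights contribute exactly zero. Note that memset stores only the low byte of its int argument, which matches the uint8 weight layout here. A condensed sketch of the same step, assuming <cstring> is available:

// Sketch: pad each (h,w) tap of a uint8 depthwise filter from `depth` to
// `depth_padded` channels. Padded lanes hold -zero_point (mod 256) so that
// (w + weights_offset) == 0 for them inside the quantized conv kernel.
static void PadFilterChannels(uint8_t* dst, const uint8_t* src, int taps,
                              int depth, int depth_padded,
                              int32_t zero_point) {
  for (int t = 0; t < taps; ++t) {
    memcpy(&dst[t * depth_padded], &src[t * depth], depth);
    memset(&dst[t * depth_padded + depth], static_cast<int>(-zero_point),
           depth_padded - depth);
  }
}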
@@ -474,8 +490,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
   auto* params =
       reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+  const OpData& data = *(static_cast<const OpData*>(node->user_data));
 
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);

@@ -483,38 +503,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bias =
       (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
 
-  const TfLiteType data_type = input->type;
-  int width = SizeOfDimension(input, 2);
-  int height = SizeOfDimension(input, 1);
-  int filter_width = SizeOfDimension(filter, 2);
-  int filter_height = SizeOfDimension(filter, 1);
-
-  OpData data;
-
-  // All per-channel quantized tensors need valid zero point and scale arrays.
-  if (input->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
-                      kTfLiteAffineQuantization);
-
-    const auto* affine_quantization =
-        reinterpret_cast<TfLiteAffineQuantization*>(
-            filter->quantization.params);
-    TF_LITE_ENSURE(context, affine_quantization);
-    TF_LITE_ENSURE(context, affine_quantization->scale);
-    TF_LITE_ENSURE(context, affine_quantization->zero_point);
-    TF_LITE_ENSURE(
-        context, affine_quantization->scale->size == 1 ||
-                     affine_quantization->scale->size ==
-                         filter->dims->data[kDepthwiseConvQuantizedDimension]);
-    TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
-                      affine_quantization->zero_point->size);
-  }
-
-  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
-                                        filter_width, filter_height, data_type,
-                                        &data));
-
-  // (aselle): Consider whether float conv and quantized conv should be
+  // TODO(aselle): Consider whether float conv and quantized conv should be
   // separate ops to avoid dispatch overhead here.
   switch (input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:

@@ -538,9 +527,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace depthwise_conv
 
 TfLiteRegistration Register_DEPTHWISE_CONV_2D() {
-  return {/*init=*/nullptr,
+  return {/*init=*/depthwise_conv::Init,
           /*free=*/nullptr,
-          /*prepare=*/nullptr,
+          /*prepare=*/depthwise_conv::Prepare,
           /*invoke=*/depthwise_conv::Eval,
           /*profiling_string=*/nullptr,
           /*builtin_code=*/0,
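With Init and Prepare wired into the registration, the interpreter now drives the kernel as Init, then Prepare (both once, during allocation), then Eval on every invocation, so the per-tensor validation and quantization math move out of the hot path. A minimal sketch of a registration with this shape; MyInit/MyPrepare/MyEval are placeholder names, and the assignment style is used to avoid depending on field order:

// Sketch: the TfLiteRegistration lifecycle this change enables.
// MyInit allocates OpData, MyPrepare fills it, MyEval reads it via
// node->user_data on every Invoke().
TfLiteRegistration MakeRegistration() {
  TfLiteRegistration r = {};  // null out all callbacks first
  r.init = MyInit;            // once per node: allocate persistent OpData
  r.prepare = MyPrepare;      // at AllocateTensors(): validate, fill OpData
  r.invoke = MyEval;          // per inference: read-only use of OpData
  return r;
}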
@@ -1,24 +1,24 @@
-/******************************************************************************
+/*******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
+ * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to use this Software with Cadence processor cores only and
  * not with any other processors and platforms, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
 
 
+ ******************************************************************************/
 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");

@@ -39,7 +39,7 @@ limitations under the License.
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "xtensa_tf_micro_common.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
 
 namespace tflite {
 namespace ops {

@@ -53,6 +53,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+#if HIFI_VFPU
   int err;
   const float* inp_data_ptr;
   float* out_data_ptr;

@@ -66,6 +67,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   err = xa_nn_elm_floor_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
 
   CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_floor_f32_f32 failed");
+#else
+  reference_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
+                       GetTensorShape(output), GetTensorData<float>(output));
+#endif /* HIFI_VFPU */
   return kTfLiteOk;
 }
 }  // namespace floor
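The #if HIFI_VFPU guard added here is the recurring pattern across these kernels: when the core is built with the HiFi vector FPU, the float path calls the NNLib primitive; otherwise it falls back to the portable reference op at compile time. Schematically, assuming the surrounding kernel's includes (<cmath> for the fallback) and the macros from xtensa_tf_micro_common.h:

// Sketch of the compile-time dispatch used throughout these kernels.
// HIFI_VFPU is assumed to be set (0/1) by the platform build flags.
TfLiteStatus FloorLike(const float* in, float* out, int n) {
#if HIFI_VFPU
  int err = xa_nn_elm_floor_f32_f32(out, in, n);  // NNLib fast path
  CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_floor_f32_f32 failed");
#else
  for (int i = 0; i < n; ++i) out[i] = std::floor(in[i]);  // portable path
#endif
  return kTfLiteOk;
}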
@@ -1,24 +1,24 @@
-/******************************************************************************
+/*******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
+ * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to use this Software with Cadence processor cores only and
  * not with any other processors and platforms, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
 
 
+ ******************************************************************************/
 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");

@@ -43,7 +43,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "xtensa_tf_micro_common.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
 
 namespace tflite {
 namespace ops {

@@ -70,7 +70,7 @@ constexpr int kBiasTensor = 2;
 constexpr int kOutputTensor = 0;
 
 TfLiteStatus CalculateOpData(TfLiteContext* context,
-                             TfLiteFullyConnectedParams* params,
+                             TfLiteFusedActivation activation,
                              TfLiteType data_type, const TfLiteTensor* input,
                              const TfLiteTensor* filter,
                              const TfLiteTensor* bias, TfLiteTensor* output,

@@ -84,7 +84,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context,
     QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
     data->output_shift = -exponent;
     TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
-        context, params->activation, output, &data->output_activation_min,
+        context, activation, output, &data->output_activation_min,
         &data->output_activation_max));
   }
   return status;
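For the quantized paths, CalculateOpData folds the three tensor scales into a single fixed-point rescale factor, real_multiplier = input_scale * filter_scale / output_scale, which QuantizeMultiplier splits into a Q31 mantissa and a power-of-two exponent (stored negated as output_shift). A worked sketch; the scales are invented for illustration:

// Sketch: how a floating-point rescale factor becomes (multiplier, exponent).
double input_scale = 0.5, filter_scale = 0.25, output_scale = 1.0;
double real_multiplier = input_scale * filter_scale / output_scale;  // 0.125
int32_t output_multiplier;  // Q31 mantissa in [2^30, 2^31)
int exponent;               // power-of-two part; here -2, since
                            // 0.125 == 0.5 * 2^-2
QuantizeMultiplier(real_multiplier, &output_multiplier, &exponent);
// The kernel then rescales an int32 accumulator as approximately
//   acc * real_multiplier == (acc * output_multiplier) * 2^(exponent - 31),
// with output_shift stored as -exponent in this file.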
@@ -92,20 +92,50 @@ TfLiteStatus CalculateOpData(TfLiteContext* context,
 
 }  // namespace
 
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* data = nullptr;
+  if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
+      kTfLiteError) {
+    return nullptr;
+  }
+  return data;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TFLITE_DCHECK(node->user_data != nullptr);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  OpData* data = static_cast<OpData*>(node->user_data);
+  const auto params =
+      static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE_MSG(context, input->type == filter->type,
+                     "Hybrid models are not supported on TFLite Micro.");
+
+  return CalculateOpData(context, params->activation, input->type, input,
+                         filter, bias, output, data);
+}
+
 TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
-                               TfLiteFullyConnectedParams* params, OpData* data,
-                               const TfLiteTensor* input,
+                               const OpData& data, const TfLiteTensor* input,
                                const TfLiteTensor* filter,
                                const TfLiteTensor* bias, TfLiteTensor* output) {
-  FullyConnectedParams op_params;
+  tflite::FullyConnectedParams op_params;
   op_params.input_offset = -input->params.zero_point;
   op_params.weights_offset = -filter->params.zero_point;
   op_params.output_offset = output->params.zero_point;
-  op_params.output_multiplier = data->output_multiplier;
-  // (b/138810107): Figure out whether output shift should be inverted
-  op_params.output_shift = -data->output_shift;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
+  op_params.output_multiplier = data.output_multiplier;
+  // TODO(b/138810107): Figure out whether output shift should be inverted
+  op_params.output_shift = -data.output_shift;
+  op_params.quantized_activation_min = data.output_activation_min;
+  op_params.quantized_activation_max = data.output_activation_max;
 
   reference_integer_ops::FullyConnected(
       op_params, GetTensorShape(input), GetTensorData<int8_t>(input),

@@ -116,8 +146,7 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
 }
 
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                           TfLiteFullyConnectedParams* params, OpData* data,
-                           const TfLiteTensor* input,
+                           const OpData& data, const TfLiteTensor* input,
                            const TfLiteTensor* filter, const TfLiteTensor* bias,
                            TfLiteTensor* output) {
   const int32_t input_offset = -input->params.zero_point;

@@ -128,11 +157,11 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   op_params.input_offset = input_offset;
   op_params.weights_offset = filter_offset;
   op_params.output_offset = output_offset;
-  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_multiplier = data.output_multiplier;
   // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = -data->output_shift;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
+  op_params.output_shift = -data.output_shift;
+  op_params.quantized_activation_min = data.output_activation_min;
+  op_params.quantized_activation_max = data.output_activation_max;
 
 #define TF_LITE_FULLY_CONNECTED(output_data_type) \
   reference_ops::FullyConnected( \

@@ -162,11 +191,12 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
       CHECK_ERR_HIFI_NNLIB_KER(
           ret, "xa_nn_fully_connected_asym8xasym8_asym8 failed");
     }
-      for (int i = 0; i < batches * out_depth; i++) {
-        ACTIVATION_MIN_MAX_ASYM8(p_out[i], p_out[i],
-                                 data->output_activation_min,
-                                 data->output_activation_max)
-      }
+      ret = xa_nn_vec_activation_min_max_asym8_asym8(
+          p_out, p_out, data.output_activation_min, data.output_activation_max,
+          batches * out_depth);
+
+      CHECK_ERR_HIFI_NNLIB_KER(
+          ret, "xa_nn_vec_activation_min_max_asym8_asym8 failed");
       break;
     }
     case kTfLiteInt16:

@@ -182,15 +212,16 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
 }
 
 TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
-                       TfLiteFullyConnectedParams* params, OpData* data,
+                       TfLiteFusedActivation activation,
                        const TfLiteTensor* input, const TfLiteTensor* filter,
                        const TfLiteTensor* bias, TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
+  CalculateActivationRange(activation, &output_activation_min,
                            &output_activation_max);
   tflite::FullyConnectedParams op_params;
   op_params.float_activation_min = output_activation_min;
   op_params.float_activation_max = output_activation_max;
+#if HIFI_VFPU
   int ret, b, weight_depth, out_depth, batches;
   weight_depth =
       GetTensorShape(filter).Dims(GetTensorShape(filter).DimensionsCount() - 1);

@@ -208,43 +239,48 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
     CHECK_ERR_HIFI_NNLIB_KER(ret, "xa_nn_fully_connected_f32 failed.");
   }
   float* p_out = GetTensorData<float>(output);
-  for (int i = 0; i < batches * out_depth; i++) {
-    ACTIVATION_MIN_MAX(float, p_out[i], p_out[i], output_activation_min,
-                       output_activation_max)
-  }
+  ret = xa_nn_vec_activation_min_max_f32_f32(
+      p_out, p_out, output_activation_min, output_activation_max,
+      batches * out_depth);
+  CHECK_ERR_HIFI_NNLIB_KER(ret, "xa_nn_vec_activation_min_max_f32_f32 failed");
+#else
+  tflite::reference_ops::FullyConnected(
+      op_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(filter), GetTensorData<float>(filter),
+      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+      GetTensorData<float>(output));
+#endif /* HIFI_VFPU */
   return kTfLiteOk;
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+  const auto* params =
+      static_cast<const TfLiteFullyConnectedParams*>(node->builtin_data);
 
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
   const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TfLiteType data_type = input->type;
-  OpData local_data_object;
-  OpData* data = &local_data_object;
-  TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
-                                        filter, bias, output, data));
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData& data = *(static_cast<const OpData*>(node->user_data));
 
-  switch (filter->type) {  // Already know in/out types are same.
+  // Checks in Prepare ensure input, output and filter types are all the same.
+  switch (input->type) {
     case kTfLiteFloat32:
-      return EvalFloat(context, node, params, data, input, filter, bias,
+      return EvalFloat(context, node, params->activation, input, filter, bias,
                        output);
     case kTfLiteInt8:
-      return EvalQuantizedInt8(context, node, params, data, input, filter, bias,
+      return EvalQuantizedInt8(context, node, data, input, filter, bias,
                                output);
 
     case kTfLiteUInt8:
-      return EvalQuantized(context, node, params, data, input, filter, bias,
-                           output);
+      return EvalQuantized(context, node, data, input, filter, bias, output);
 
     default:
       TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
-                         TfLiteTypeGetName(filter->type), filter->type);
+                         TfLiteTypeGetName(input->type), input->type);
       return kTfLiteError;
   }
   return kTfLiteOk;

@@ -253,9 +289,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace fully_connected
 
 TfLiteRegistration Register_FULLY_CONNECTED() {
-  return {/*init=*/nullptr,
+  return {/*init=*/fully_connected::Init,
          /*free=*/nullptr,
-          /*prepare=*/nullptr,
+          /*prepare=*/fully_connected::Prepare,
          /*invoke=*/fully_connected::Eval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
@@ -1,24 +1,24 @@
-/******************************************************************************
+/*******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
+ * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to use this Software with Cadence processor cores only and
  * not with any other processors and platforms, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
 
 
+ ******************************************************************************/
 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");

@@ -34,32 +34,68 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/kernels/internal/reference/logistic.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h"
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/logistic.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
-#include "xtensa_tf_micro_common.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
 
 namespace tflite {
 namespace ops {
 namespace micro {
 namespace activations {
+namespace {
 constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;
 
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+struct OpData {
+  int32_t input_zero_point;
+  int32_t input_range_radius;
+  int32_t input_multiplier;
+  int input_left_shift;
+};
+
+TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context,
+                                       TfLiteNode* node, OpData* data) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  if (input->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point,
+                      std::numeric_limits<int8_t>::min());
+
+    static constexpr int kInputIntegerBits = 4;
+    const double input_real_multiplier =
+        static_cast<double>(input->params.scale) *
+        static_cast<double>(1 << (31 - kInputIntegerBits));
+
+    const double q = std::frexp(input_real_multiplier, &data->input_left_shift);
+    data->input_multiplier = static_cast<int32_t>(TfLiteRound(q * (1ll << 31)));
+
+    data->input_range_radius =
+        CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31);
+  }
+  return kTfLiteOk;
+}
+}  // namespace
+
+TfLiteStatus LogisticEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  OpData data;
+  CalculateArithmeticOpData(context, node, &data);
+
   if (input->type == kTfLiteFloat32) {
     switch (output->type) {
       case kTfLiteFloat32: {
+#if HIFI_VFPU
         int err;
         const float* inp_data_ptr;
         float* out_data_ptr;
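CalculateArithmeticOpData prepares the int8 sigmoid by mapping the real input scale into fixed point: input_real_multiplier = input_scale * 2^(31 - kInputIntegerBits) is split by std::frexp into a mantissa q in [0.5, 1) and input_left_shift, and q is stored as a Q31 multiplier. A worked numeric example, assuming <cmath> and an illustrative input scale of 1/16:

// Sketch: deriving the fixed-point input parameters for int8 logistic.
constexpr int kInputIntegerBits = 4;  // matches the code above
double input_scale = 0.0625;          // illustrative only
double input_real_multiplier =
    input_scale * static_cast<double>(1 << (31 - kInputIntegerBits));  // 2^23
int input_left_shift;
double q = std::frexp(input_real_multiplier, &input_left_shift);
// q == 0.5 and input_left_shift == 24, since 2^23 == 0.5 * 2^24.
int32_t input_multiplier = static_cast<int32_t>(std::round(q * (1ll << 31)));
// input_multiplier == 1 << 30, the Q31 representation of 0.5.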
@@ -73,6 +109,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         err = xa_nn_vec_sigmoid_f32_f32(out_data_ptr, inp_data_ptr, flat_size);
 
         CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_sigmoid_f32_f32 failed");
+#else
+        reference_ops::Logistic(
+            GetTensorShape(input), GetTensorData<float>(input),
+            GetTensorShape(output), GetTensorData<float>(output));
+#endif /* HIFI_VFPU */
         return kTfLiteOk;
       }
       default:

@@ -84,11 +125,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   } else if (input->type == kTfLiteInt8) {
     switch (output->type) {
       case kTfLiteInt8: {
-        reference_ops::Logistic(
-            GetTensorShape(input), GetTensorData<int8_t>(input),
-            input->params.scale, input->params.zero_point,
-            GetTensorShape(output), GetTensorData<int8_t>(output),
-            output->params.scale, output->params.zero_point);
+        reference_integer_ops::Logistic(
+            input->params.zero_point, data.input_range_radius,
+            data.input_multiplier, data.input_left_shift,
+            NumElements(input->dims), GetTensorData<int8_t>(input),
+            GetTensorData<int8_t>(output));
         return kTfLiteOk;
       }
       default:

@@ -98,7 +139,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteError;
     }
   } else {
-    // (b/141211002): Also support other data types once we have supported
+    // TODO(b/141211002): Also support other data types once we have supported
     // temporary tensors in TFLM.
     TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
                        TfLiteTypeGetName(input->type),

@@ -114,7 +155,7 @@ TfLiteRegistration Register_LOGISTIC() {
   return {/*init=*/nullptr,
          /*free=*/nullptr,
          /*prepare=*/nullptr,
-          /*invoke=*/activations::Eval,
+          /*invoke=*/activations::LogisticEval,
          /*profiling_string=*/nullptr,
          /*builtin_code=*/0,
          /*custom_name=*/nullptr,
tensorflow/lite/micro/kernels/xtensa_hifi/mul.cc (new file, 229 lines)
@@ -0,0 +1,229 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/mul.h"
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
+#include "tensorflow/lite/micro/memory_helpers.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace mul {
+
+constexpr int kInput1Tensor = 0;
+constexpr int kInput2Tensor = 1;
+constexpr int kOutputTensor = 0;
+
+struct OpData {
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+
+  int32_t output_multiplier;
+  int output_shift;
+};
+
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteMulParams* params, OpData* data) {
+  const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
+  const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
+        context, params->activation, output, &data->output_activation_min,
+        &data->output_activation_max));
+
+    double real_multiplier = static_cast<double>(input1->params.scale) *
+                             static_cast<double>(input2->params.scale) /
+                             static_cast<double>(output->params.scale);
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier,
+                       &data->output_shift);
+  }
+
+  return kTfLiteOk;
+}
+
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
|
||||||
|
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
|
||||||
|
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
|
||||||
|
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||||
|
|
||||||
|
if (output->dims->size == 0) {
|
||||||
|
return AllocateOutputDimensionsFromInput(context, input1, input2, output);
|
||||||
|
}
|
||||||
|
|
||||||
|
return kTfLiteOk;
|
||||||
|
}
|
||||||
|
|
||||||
|
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
|
||||||
|
TfLiteMulParams* params, OpData* data,
|
||||||
|
const TfLiteTensor* input1,
|
||||||
|
const TfLiteTensor* input2, TfLiteTensor* output) {
|
||||||
|
if (output->type == kTfLiteInt8 || output->type == kTfLiteUInt8) {
|
||||||
|
tflite::ArithmeticParams op_params;
|
||||||
|
SetActivationParams(data->output_activation_min,
|
||||||
|
data->output_activation_max, &op_params);
|
||||||
|
op_params.input1_offset = -input1->params.zero_point;
|
||||||
|
op_params.input2_offset = -input2->params.zero_point;
|
||||||
|
op_params.output_offset = output->params.zero_point;
|
||||||
|
op_params.output_multiplier = data->output_multiplier;
|
||||||
|
op_params.output_shift = data->output_shift;
|
||||||
|
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
|
||||||
|
GetTensorShape(input1), GetTensorShape(input2), &op_params);
|
||||||
|
|
||||||
|
#define TF_LITE_MUL(type, opname, dtype) \
|
||||||
|
type::opname(op_params, GetTensorShape(input1), \
|
||||||
|
GetTensorData<dtype>(input1), GetTensorShape(input2), \
|
||||||
|
GetTensorData<dtype>(input2), GetTensorShape(output), \
|
||||||
|
GetTensorData<dtype>(output));
|
||||||
|
|
||||||
|
if (output->type == kTfLiteInt8) {
|
||||||
|
if (need_broadcast) {
|
||||||
|
TF_LITE_MUL(reference_integer_ops, BroadcastMul4DSlow, int8_t);
|
||||||
|
} else {
|
||||||
|
TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
|
||||||
|
}
|
||||||
|
} else if (output->type == kTfLiteUInt8) {
|
||||||
|
if (need_broadcast) {
|
||||||
|
TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, uint8_t);
|
||||||
|
} else {
|
||||||
|
int err;
|
||||||
|
const RuntimeShape& input1_shape = GetTensorShape(input1);
|
||||||
|
const RuntimeShape& input2_shape = GetTensorShape(input2);
|
||||||
|
const RuntimeShape& output_shape = GetTensorShape(output);
|
||||||
|
const int flat_size =
|
||||||
|
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||||
|
|
||||||
|
err = xa_nn_elm_mul_asym8xasym8_asym8(
|
||||||
|
GetTensorData<uint8_t>(output), op_params.output_offset,
|
||||||
|
op_params.output_shift, op_params.output_multiplier,
|
||||||
|
op_params.quantized_activation_min,
|
||||||
|
op_params.quantized_activation_max, GetTensorData<uint8_t>(input1),
|
||||||
|
op_params.input1_offset, GetTensorData<uint8_t>(input2),
|
||||||
|
op_params.input2_offset, flat_size);
|
||||||
|
|
||||||
|
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_mul_asym8xasym8_asym8 failed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#undef TF_LITE_MUL
|
||||||
|
}
|
||||||
|
return kTfLiteOk;
|
||||||
|
}
|
||||||
|
|
||||||
|
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
|
||||||
|
TfLiteMulParams* params, OpData* data,
|
||||||
|
const TfLiteTensor* input1, const TfLiteTensor* input2,
|
||||||
|
TfLiteTensor* output) {
|
||||||
|
float output_activation_min, output_activation_max;
|
||||||
|
CalculateActivationRange(params->activation, &output_activation_min,
|
||||||
|
&output_activation_max);
|
||||||
|
tflite::ArithmeticParams op_params;
|
||||||
|
SetActivationParams(output_activation_min, output_activation_max, &op_params);
|
||||||
|
|
||||||
|
bool need_broadcast = reference_ops::ProcessBroadcastShapes(
|
||||||
|
GetTensorShape(input1), GetTensorShape(input2), &op_params);
|
||||||
|
#define TF_LITE_MUL(opname) \
|
||||||
|
reference_ops::opname(op_params, GetTensorShape(input1), \
|
||||||
|
GetTensorData<float>(input1), GetTensorShape(input2), \
|
||||||
|
GetTensorData<float>(input2), GetTensorShape(output), \
|
||||||
|
GetTensorData<float>(output));
|
||||||
|
|
||||||
|
if (need_broadcast) {
|
||||||
|
TF_LITE_MUL(BroadcastMul4DSlow);
|
||||||
|
} else {
|
||||||
|
#if HIFI_VFPU
|
||||||
|
int err;
|
||||||
|
const RuntimeShape& input1_shape = GetTensorShape(input1);
|
||||||
|
const RuntimeShape& input2_shape = GetTensorShape(input2);
|
||||||
|
const RuntimeShape& output_shape = GetTensorShape(output);
|
||||||
|
const int flat_size =
|
||||||
|
MatchingElementsSize(input1_shape, input2_shape, output_shape);
|
||||||
|
|
||||||
|
err = xa_nn_elm_mul_f32xf32_f32(GetTensorData<float>(output),
|
||||||
|
GetTensorData<float>(input1),
|
||||||
|
GetTensorData<float>(input2), flat_size);
|
||||||
|
|
||||||
|
CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_elm_mul_f32xf32_f32 failed");
|
||||||
|
|
||||||
|
err = xa_nn_vec_activation_min_max_f32_f32(
|
||||||
|
GetTensorData<float>(output), GetTensorData<float>(output),
|
||||||
|
output_activation_min, output_activation_max, flat_size);
|
||||||
|
|
||||||
|
CHECK_ERR_HIFI_NNLIB_KER(err,
|
||||||
|
"xa_nn_vec_activation_min_max_f32_f32 failed");
|
||||||
|
#else
|
||||||
|
TF_LITE_MUL(Mul);
|
||||||
|
#endif /* HIFI_VFPU */
|
||||||
|
}
|
||||||
|
#undef TF_LITE_MUL
|
||||||
|
return kTfLiteOk;
|
||||||
|
}
|
||||||
|
|
||||||
|
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
|
||||||
|
auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
|
||||||
|
OpData data;
|
||||||
|
|
||||||
|
const TfLiteTensor* input1 = GetInput(context, node, kInput1Tensor);
|
||||||
|
const TfLiteTensor* input2 = GetInput(context, node, kInput2Tensor);
|
||||||
|
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
|
||||||
|
|
||||||
|
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, &data));
|
||||||
|
|
||||||
|
switch (input1->type) {
|
||||||
|
case kTfLiteUInt8:
|
||||||
|
case kTfLiteInt8:
|
||||||
|
TF_LITE_ENSURE_OK(context, EvalQuantized(context, node, params, &data,
|
||||||
|
input1, input2, output));
|
||||||
|
break;
|
||||||
|
case kTfLiteFloat32:
|
||||||
|
TF_LITE_ENSURE_OK(context, EvalFloat(context, node, params, &data, input1,
|
||||||
|
input2, output));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
|
||||||
|
TfLiteTypeGetName(input1->type), input1->type);
|
||||||
|
return kTfLiteError;
|
||||||
|
}
|
||||||
|
|
||||||
|
return kTfLiteOk;
|
||||||
|
}
|
||||||
|
} // namespace mul
|
||||||
|
|
||||||
|
TfLiteRegistration Register_MUL() {
|
||||||
|
return {/*init=*/nullptr,
|
||||||
|
/*free=*/nullptr,
|
||||||
|
/*prepare=*/nullptr,
|
||||||
|
/*invoke=*/mul::Eval,
|
||||||
|
/*profiling_string=*/nullptr,
|
||||||
|
/*builtin_code=*/0,
|
||||||
|
/*custom_name=*/nullptr,
|
||||||
|
/*version=*/0};
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace micro
|
||||||
|
} // namespace ops
|
||||||
|
} // namespace tflite
|
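
CalculateOpData() above folds the three tensor scales into one real multiplier (s1 * s2 / s_out) and hands it to QuantizeMultiplier(), after which Eval needs only integer arithmetic. A self-contained sketch of that decomposition and its integer application; this is my own illustration of the idea, not the library code, and it skips edge cases such as zero or very large multipliers (it assumes the exponent stays at or below 30):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose a positive real multiplier into a Q31 significand and a
// power-of-two exponent, mirroring what QuantizeMultiplier() stores in OpData.
void DecomposeMultiplier(double real, int32_t* quantized, int* shift) {
  const double q = std::frexp(real, shift);  // real = q * 2^shift, q in [0.5, 1)
  int64_t q31 = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q31 == (1LL << 31)) {  // rounding pushed q to 1.0: renormalize
    q31 /= 2;
    ++(*shift);
  }
  *quantized = static_cast<int32_t>(q31);
}

// Apply it: y ~= x * real, computed with one 64-bit product and a
// rounding right shift.
int32_t ApplyMultiplier(int32_t x, int32_t quantized, int shift) {
  int64_t prod = static_cast<int64_t>(x) * quantized;  // Q31 product
  const int total_shift = 31 - shift;                  // back to integer units
  const int64_t round = int64_t{1} << (total_shift - 1);
  return static_cast<int32_t>((prod + round) >> total_shift);
}

int main() {
  // Example: input scales 0.5 and 0.25, output scale 0.1 -> multiplier 1.25.
  int32_t m;
  int s;
  DecomposeMultiplier(1.25, &m, &s);
  std::printf("x=100 -> %d (expect ~125)\n", ApplyMultiplier(100, m, s));
}
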
tensorflow/lite/micro/kernels/xtensa_hifi/pooling.cc
@@ -1,24 +1,24 @@
-/******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
+/*******************************************************************************
+ * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to use this Software with Cadence processor cores only and
  * not with any other processors and platforms, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-******************************************************************************/
+
+******************************************************************************/
 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
@@ -40,7 +40,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
-#include "xtensa_tf_micro_common.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"

 namespace tflite {
 namespace ops {
@@ -83,6 +83,7 @@ TfLiteStatus AverageEvalFloat(TfLiteContext* context, const TfLiteNode* node,
   CalculateActivationRange(params->activation, &activation_min,
                            &activation_max);

+#if HIFI_VFPU
   const int stride_height = params->stride_height;
   const int stride_width = params->stride_width;
   const int pad_width = data->padding.width;
@@ -168,6 +169,20 @@ TfLiteStatus AverageEvalFloat(TfLiteContext* context, const TfLiteNode* node,
     CHECK_ERR_HIFI_NNLIB_KER(
         err, "AveragepoolFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
   }
+#else
+  PoolParams op_params;
+  op_params.stride_height = params->stride_height;
+  op_params.stride_width = params->stride_width;
+  op_params.filter_height = params->filter_height;
+  op_params.filter_width = params->filter_width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.padding_values.width = data->padding.width;
+  op_params.float_activation_min = activation_min;
+  op_params.float_activation_max = activation_max;
+  reference_ops::AveragePool(
+      op_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(output), GetTensorData<float>(output));
+#endif /* HIFI_VFPU */
   return kTfLiteOk;
 }
@@ -177,7 +192,6 @@ TfLiteStatus AverageEvalQuantized(TfLiteContext* context,
                                   const OpData* data, const TfLiteTensor* input,
                                   TfLiteTensor* output) {
   TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8);
-
   int32_t activation_min, activation_max;
   (void)CalculateActivationRangeQuantized(context, params->activation, output,
                                           &activation_min, &activation_max);
@@ -295,6 +309,7 @@ TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRange(params->activation, &activation_min,
                            &activation_max);

+#if HIFI_VFPU
   const int stride_height = params->stride_height;
   const int stride_width = params->stride_width;
   const int pad_width = data->padding.width;
@@ -378,6 +393,20 @@ TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
     CHECK_ERR_HIFI_NNLIB_KER(
         err, "MaxpoolFloat: xa_nn_vec_activation_min_max_f32_f32 failed");
   }
+#else
+  tflite::PoolParams op_params;
+  op_params.stride_height = params->stride_height;
+  op_params.stride_width = params->stride_width;
+  op_params.filter_height = params->filter_height;
+  op_params.filter_width = params->filter_width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.padding_values.width = data->padding.width;
+  op_params.float_activation_min = activation_min;
+  op_params.float_activation_max = activation_max;
+  reference_ops::MaxPool(op_params, GetTensorShape(input),
+                         GetTensorData<float>(input), GetTensorShape(output),
+                         GetTensorData<float>(output));
+#endif /* HIFI_VFPU */
   return kTfLiteOk;
 }
@@ -491,7 +520,6 @@ TfLiteStatus MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
   }
   return kTfLiteOk;
 }

 }  // namespace

-
@@ -504,7 +532,7 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {

   TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));

-  // Inputs and outputs share the same type, guarenteed by the converter.
+  // Inputs and outputs share the same type, guaranteed by the converter.
   switch (input->type) {
     case kTfLiteFloat32:
       AverageEvalFloat(context, node, params, &data, input, output);
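
The pooling hunks above follow the same structure as the other kernels in this change: the NNLib fast path compiles only when HIFI_VFPU is set, and the portable reference kernel sits in the #else branch. A minimal sketch of that idiom, with a hypothetical accelerated primitive fast_add() standing in for the xa_nn_* calls:

#include <cstddef>

#if HIFI_VFPU
// Hypothetical DSP-library routine; only the structure mirrors the kernels.
extern "C" int fast_add(float* out, const float* a, const float* b, int n);
#endif

int AddVectors(float* out, const float* a, const float* b, int n) {
#if HIFI_VFPU
  // Accelerated path: delegate to the DSP library and propagate its status,
  // the way CHECK_ERR_HIFI_NNLIB_KER guards the xa_nn_* calls above.
  return fast_add(out, a, b, n);
#else
  // Portable reference path, used when no vector FPU is available.
  for (int i = 0; i < n; ++i) out[i] = a[i] + b[i];
  return 0;
#endif
}
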
tensorflow/lite/micro/kernels/xtensa_hifi/softmax.cc
@@ -1,24 +1,24 @@
-/******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
+/*******************************************************************************
+ * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
  * "Software"), to use this Software with Cadence processor cores only and
  * not with any other processors and platforms, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be included
  * in all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-******************************************************************************/
+
+******************************************************************************/
 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
@@ -43,7 +43,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
-#include "xtensa_tf_micro_common.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
 namespace tflite {
 namespace ops {
 namespace micro {
@@ -105,6 +105,7 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
 // Takes a tensor and performs softmax along the last dimension.
 TfLiteStatus SoftmaxFloat(TfLiteContext* context, const TfLiteTensor* input,
                           TfLiteTensor* output, const SoftmaxParams& op_data) {
+#if HIFI_VFPU
   const RuntimeShape& input_shape = GetTensorShape(input);
   const float* input_data = GetTensorData<float>(input);
   const RuntimeShape& output_shape = GetTensorShape(output);
@@ -133,6 +134,11 @@ TfLiteStatus SoftmaxFloat(TfLiteContext* context, const TfLiteTensor* input,
         xa_nn_vec_softmax_f32_f32(&output_data[i * depth], p_scratch, depth);
     CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_softmax_f32_f32 failed");
   }
+#else
+  tflite::reference_ops::Softmax(
+      op_data, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(output), GetTensorData<float>(output));
+#endif /* HIFI_VFPU */
   return kTfLiteOk;
 }

tensorflow/lite/micro/kernels/xtensa_hifi/svdf.cc
@@ -1,5 +1,5 @@
-/******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
+/*******************************************************************************
+ * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
@@ -18,7 +18,6 @@
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  ******************************************************************************/
-
 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
@@ -44,8 +43,8 @@ limitations under the License.
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 #include "tensorflow/lite/micro/kernels/activation_utils.h"
+#include "tensorflow/lite/micro/kernels/xtensa_hifi/xtensa_tf_micro_common.h"
 #include "tensorflow/lite/micro/micro_utils.h"
-#include "xtensa_tf_micro_common.h"

 namespace tflite {
 namespace ops {
@@ -53,10 +52,6 @@ namespace micro {
 namespace svdf {
 namespace {

-// These constants represent constants specific to the hotword "OK G" model.
-// They exist until (b/132070898) is fixed.
-constexpr int kScratchTensorMaxSize = 64;
-
 struct OpData {
   int32 effective_scale_1_a;
   int32 effective_scale_2_a;
@@ -64,6 +59,8 @@ struct OpData {
   // shift value - typically between [-32, 32].
   int effective_scale_1_b;
   int effective_scale_2_b;
+  int scratch_tensor_index;
+  int scratch_output_tensor_index;
 };

 /**
@@ -84,6 +81,7 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
     float* const __restrict__ state_ptr, float* const __restrict__ scratch_ptr,
     float* const __restrict__ output_ptr) {
   // Compute matmul(activation_state, weights_time).
+#if HIFI_VFPU
   float* scratch_bias = scratch_ptr;
   if (bias_ptr) {
     const float* bias_data = bias_ptr;
@@ -111,6 +109,51 @@ static inline TfLiteStatus ApplyTimeWeightsBiasAndActivation(
       weights_time_vec += memory_size * rank;
     }
   }
+#else
+  for (int b = 0; b < batch_size; ++b) {
+    // Perform batched vector dot product:
+    float* scratch_ptr_batch = scratch_ptr + b * num_filters;
+    const float* vector1_ptr = weights_time_ptr;
+    const float* vector2_ptr = state_ptr + b * memory_size * num_filters;
+    for (int i = 0; i < num_filters; ++i) {
+      *scratch_ptr_batch = 0.f;
+      for (int j = 0; j < memory_size; ++j) {
+        *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++;
+      }
+      scratch_ptr_batch++;
+    }
+  }
+
+  // Initialize output with bias if provided.
+  if (bias_ptr) {
+    // VectorBatchVectorAssign
+    for (int i = 0; i < batch_size; ++i) {
+      float* output_data = output_ptr + i * num_units;
+      const float* bias_data = bias_ptr;
+      for (int j = 0; j < num_units; ++j) {
+        *output_data++ = *bias_data++;
+      }
+    }
+  } else {
+    float* output_data = output_ptr;
+    for (int i = 0; i < batch_size * num_units; ++i) {
+      *output_data++ = 0.0f;
+    }
+  }
+
+  // Reduction sum.
+  for (int b = 0; b < batch_size; ++b) {
+    float* output_ptr_batch = output_ptr + b * num_units;
+    float* scratch_ptr_batch = scratch_ptr + b * num_filters;

+    // Reduction sum vector
+    for (int i = 0; i < num_units; ++i) {
+      for (int j = 0; j < rank; j++) {
+        output_ptr_batch[i] += *scratch_ptr_batch++;
+      }
+    }
+  }
+#endif /* HIFI_VFPU */

   // Apply activation.
   for (int b = 0; b < batch_size; ++b) {
@@ -127,7 +170,8 @@ inline TfLiteStatus EvalFloatSVDF(
     TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input,
     const TfLiteTensor* weights_feature, const TfLiteTensor* weights_time,
     const TfLiteTensor* bias, const TfLiteSVDFParams* params,
-    TfLiteTensor* activation_state, TfLiteTensor* output) {
+    int scratch_tensor_index, TfLiteTensor* activation_state,
+    TfLiteTensor* output) {
   const int rank = params->rank;
   const int batch_size = input->dims->data[0];
   const int input_size = input->dims->data[1];
@@ -142,10 +186,11 @@ inline TfLiteStatus EvalFloatSVDF(

   float* state_ptr = GetTensorData<float>(activation_state);

-  // TODO(b/132070898): Move this temp variable to the new scratch buffer API
-  // when ready.
-  float scratch_tensor[kScratchTensorMaxSize];
-  float* scratch_ptr = scratch_tensor;
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
+
+  float* scratch_ptr = static_cast<float*>(
+      context->GetScratchBuffer(context, scratch_tensor_index));

   float* output_ptr = GetTensorData<float>(output);

@@ -174,6 +219,7 @@ inline TfLiteStatus EvalFloatSVDF(
     float* result = &state_ptr[memory_size - 1];
     float* result_in_batch = result;

+#if HIFI_VFPU
     float* out_scratch = scratch_ptr;
     float* bias_scratch = output_ptr;
     for (int i = 0; i < num_units; i++) bias_scratch[i] = 0.0f;
@@ -195,6 +241,20 @@ inline TfLiteStatus EvalFloatSVDF(
         result_in_batch += memory_size;
       }
     }
+#else
+    for (int i = 0; i < batch_size; ++i) {
+      const float* matrix_ptr = matrix;
+      for (int j = 0; j < num_filters; ++j) {
+        float dot_prod = 0.0f;
+        const float* vector_in_batch = vector + i * input_size;
+        for (int k = 0; k < input_size; ++k) {
+          dot_prod += *matrix_ptr++ * *vector_in_batch++;
+        }
+        *result_in_batch = dot_prod;
+        result_in_batch += memory_size;
+      }
+    }
+#endif /* HIFI_VFPU */
   }

   return ApplyTimeWeightsBiasAndActivation(
@@ -203,13 +263,15 @@ inline TfLiteStatus EvalFloatSVDF(
       output_ptr);
 }

-void EvalIntegerSVDF(
-    TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input_tensor,
-    const TfLiteTensor* weights_feature_tensor,
-    const TfLiteTensor* weights_time_tensor, const TfLiteTensor* bias_tensor,
-    const TfLiteSVDFParams* params, TfLiteTensor* activation_state_tensor,
-    TfLiteTensor* output_tensor, int32_t scale_1_a, int scale_1_b,
-    int32_t scale_2_a, int scale_2_b, int32_t input_zp, int32_t output_zp) {
+void EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
+                     const TfLiteTensor* input_tensor,
+                     const TfLiteTensor* weights_feature_tensor,
+                     const TfLiteTensor* weights_time_tensor,
+                     const TfLiteTensor* bias_tensor,
+                     const TfLiteSVDFParams* params,
+                     TfLiteTensor* activation_state_tensor,
+                     TfLiteTensor* output_tensor, const OpData& data,
+                     int32_t input_zp, int32_t output_zp) {
   const int n_rank = params->rank;
   const int n_batch = input_tensor->dims->data[0];
   const int n_input = input_tensor->dims->data[1];
@@ -217,10 +279,13 @@ void EvalIntegerSVDF(
   const int n_unit = n_filter / n_rank;
   const int n_memory = weights_time_tensor->dims->data[1];

-  // TODO(b/132070898): Move these temp variables to the new scratch buffer API
-  // when ready.
-  int32_t scratch_tensor[kScratchTensorMaxSize];
-  int32_t scratch_output_tensor[kScratchTensorMaxSize];
+  TFLITE_DCHECK(context != nullptr);
+  TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
+
+  int32_t* scratch_tensor = static_cast<int32_t*>(
+      context->GetScratchBuffer(context, data.scratch_tensor_index));
+  int32_t* scratch_output_tensor = static_cast<int32_t*>(
+      context->GetScratchBuffer(context, data.scratch_output_tensor_index));

   // Shift states.
   int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
@@ -254,8 +319,8 @@ void EvalIntegerSVDF(
       for (int c = 0; c < n_input; c++) {
         dot_prod += *matrix_ptr++ * (*vector_in_batch++ - input_zp);
       }
-      dot_prod =
-          MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
+      dot_prod = MultiplyByQuantizedMultiplier(
+          dot_prod, data.effective_scale_1_a, data.effective_scale_1_b);
       dot_prod = std::min(std::max(output_min, dot_prod), output_max);
       // This assumes state is symmetrically quantized. Otherwise last bit of
       // state should be initialized to its zero point and accumulate the
@@ -328,7 +393,8 @@ void EvalIntegerSVDF(
     const int32_t output_min = std::numeric_limits<int8_t>::min();
     for (int i = 0; i < n_batch * n_unit; ++i) {
       int32_t x1 = scratch_output_tensor[i];
-      int32_t x2 = MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b);
+      int32_t x2 = MultiplyByQuantizedMultiplier(x1, data.effective_scale_2_a,
+                                                 data.effective_scale_2_b);
       int32_t x3 = x2 + output_zp;
       int32_t x4 = std::min(std::max(output_min, x3), output_max);
       GetTensorData<int8_t>(output_tensor)[i] = static_cast<int8_t>(x4);
@@ -349,8 +415,20 @@ constexpr int kInputActivationStateTensor = 4;
 // Output tensor.
 constexpr int kOutputTensor = 0;

+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
+  void* data = nullptr;
+  if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) ==
+      kTfLiteError) {
+    return nullptr;
+  }
+  return data;
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  const auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
+  TFLITE_DCHECK(node->builtin_data != nullptr);
+
+  const auto* params = static_cast<const TfLiteSVDFParams*>(node->builtin_data);

   // Validate Tensor Inputs (dtype depends on quantization):
   // [0] = Input, {2, batch_size, input_size}
@@ -359,7 +437,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // [3] = Bias (optional), {1, num_units}
   // [4] = Activation State (variable),
   //       {2, batch_size, memory_size * num_filters}
-
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* weights_feature =
       GetInput(context, node, kWeightsFeatureTensor);
@@ -378,8 +455,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const int num_units = num_filters / rank;
   const int memory_size = weights_time->dims->data[1];

-  const bool is_full_integer = input->type == kTfLiteInt8;
-
   // Validate Input Tensor:
   TF_LITE_ENSURE(context,
                  input->type == kTfLiteFloat32 || input->type == kTfLiteInt8);
@@ -403,7 +478,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, weights_time->dims->data[1], memory_size);

   // Validate Optional Bias Input Tensor:
-  if (bias) {
+  if (bias != nullptr) {
     TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units);
   }

@@ -413,53 +488,77 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1],
                     memory_size * num_filters);

-  if (is_full_integer) {
-    TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
-
-    TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8);
-    TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16);
-
-    if (bias) {
-      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
-    }
-
-    TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
-
-    // Validate Scratch Tensors:
-    // [0] = (shared - see float block below for usage)
-    // [1] = Output Temp, int8_t, {2, num_units, batch_size}
-    // TODO(b/132070898): Scratch values are used as stack variables in
-    // EvalIntegerSVDF().
-
-    // Validate output tensor:
-    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
-  } else {
-    TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
-
-    // Validate Input Tensor dtypes:
-    TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
-    TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteFloat32);
-    TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteFloat32);
-
-    if (bias) {
-      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
-    }
-
-    // Validate shared Scratch Tensor:
-    // [0] = Holds dot-product of time-forward calculations in
-    //       ApplyTimeWeightsBiasAndActivation():
-    //         float/int32, {2, batch_size, num_filters}
-    // TODO(b/132070898): Scratch values are used as stack variables in
-    // EvalIntegerSVDF().
-
-    // Full-float SVDF only uses the one shared scratch tensor (see above for
-    // usage).
-    // TODO(b/132070898): Use input tensor as variable until scratch tensor
-    // allocation has been implemented.
-    // TF_LITE_ENSURE_EQ(context, node->temporaries->size, 1);
-    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
-  }
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
+
+  if (input->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8);
+    TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16);
+    TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
+    if (bias != nullptr) {
+      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
+    }
+
+    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt8);
+
+    const auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
+        input->quantization.params);
+    const auto* weights_feature_params =
+        static_cast<const TfLiteAffineQuantization*>(
+            weights_feature->quantization.params);
+    const auto* state_params = static_cast<const TfLiteAffineQuantization*>(
+        activation_state->quantization.params);
+    const auto* weight_time_params =
+        static_cast<const TfLiteAffineQuantization*>(
+            weights_time->quantization.params);
+    const auto* output_params = static_cast<const TfLiteAffineQuantization*>(
+        output->quantization.params);
+    const double effective_scale_1 =
+        static_cast<double>(input_params->scale->data[0] *
+                            weights_feature_params->scale->data[0] /
+                            state_params->scale->data[0]);
+    const double effective_scale_2 = static_cast<double>(
+        state_params->scale->data[0] * weight_time_params->scale->data[0] /
+        output_params->scale->data[0]);
+
+    TFLITE_DCHECK(node->user_data != nullptr);
+    OpData* data = static_cast<OpData*>(node->user_data);
+
+    QuantizeMultiplier(effective_scale_1, &(data->effective_scale_1_a),
+                       &(data->effective_scale_1_b));
+    QuantizeMultiplier(effective_scale_2, &(data->effective_scale_2_a),
+                       &(data->effective_scale_2_b));
+
+    TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
+
+    const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
+        context, batch_size * num_filters * sizeof(int32_t),
+        &(data->scratch_tensor_index));
+    TF_LITE_ENSURE_OK(context, scratch_status);
+
+    const TfLiteStatus scratch_output_status =
+        context->RequestScratchBufferInArena(
+            context, batch_size * num_units * sizeof(int32_t),
+            &(data->scratch_output_tensor_index));
+    TF_LITE_ENSURE_OK(context, scratch_output_status);
+  } else {
+    TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteFloat32);
+    TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteFloat32);
+    TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteFloat32);
+    if (bias != nullptr) {
+      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
+    }
+    TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32);
+
+    TFLITE_DCHECK(node->user_data != nullptr);
+    OpData* data = static_cast<OpData*>(node->user_data);
+
+    TFLITE_DCHECK(context->RequestScratchBufferInArena != nullptr);
+    const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
+        context, batch_size * num_filters * sizeof(float),
+        &(data->scratch_tensor_index));
+    TF_LITE_ENSURE_OK(context, scratch_status);
+  }

   return kTfLiteOk;
 }

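
The Prepare() rewrite above replaces the fixed kScratchTensorMaxSize stack arrays with arena-managed scratch buffers. A minimal sketch of that request/resolve lifecycle, using the same TfLiteContext hooks the diff uses; the OpData layout and byte count here are illustrative only:

#include "tensorflow/lite/c/common.h"

struct MyOpData {
  int scratch_index;  // filled in by Prepare, consumed by Eval
};

TfLiteStatus MyPrepare(TfLiteContext* context, TfLiteNode* node) {
  MyOpData* data = static_cast<MyOpData*>(node->user_data);
  // Prepare only reserves bytes in the arena and receives an index back;
  // no stable pointer exists yet at this point.
  return context->RequestScratchBufferInArena(context, /*bytes=*/1024,
                                              &data->scratch_index);
}

TfLiteStatus MyEval(TfLiteContext* context, TfLiteNode* node) {
  MyOpData* data = static_cast<MyOpData*>(node->user_data);
  // Eval resolves the index to a pointer that is valid for this call only.
  float* scratch = static_cast<float*>(
      context->GetScratchBuffer(context, data->scratch_index));
  scratch[0] = 0.0f;  // use as temporary workspace
  return kTfLiteOk;
}
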
@@ -476,56 +575,24 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetVariableInput(context, node, kInputActivationStateTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);

-  const bool is_full_integer = input->type == kTfLiteInt8;
+  TFLITE_DCHECK(node->user_data != nullptr);
+  const OpData& data = *(static_cast<const OpData*>(node->user_data));

   switch (weights_feature->type) {
     case kTfLiteFloat32: {
-      // TODO(b/132070898): Use input tensor as variable until scratch tensor
-      // allocation has been implemented.
-      // TfLiteTensor* scratch = GetTemporary(context, node, /*index=*/0);
       return EvalFloatSVDF(context, node, input, weights_feature, weights_time,
-                           bias, params, activation_state, output);
+                           bias, params, data.scratch_tensor_index,
+                           activation_state, output);
       break;
     }

    case kTfLiteInt8: {
-      if (is_full_integer) {
-        // TODO(b/132070898): Store these values in ::Prepare() instead of
-        // ::Eval():
-        // Calculate effective scales.
-        OpData op_data;
-        auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
-            input->quantization.params);
-        auto* weights_feature_params =
-            reinterpret_cast<TfLiteAffineQuantization*>(
-                weights_feature->quantization.params);
-        auto* state_params = reinterpret_cast<TfLiteAffineQuantization*>(
-            activation_state->quantization.params);
-        auto* weight_time_params = reinterpret_cast<TfLiteAffineQuantization*>(
-            weights_time->quantization.params);
-        auto* output_params = reinterpret_cast<TfLiteAffineQuantization*>(
-            output->quantization.params);
-        const double effective_scale_1 =
-            static_cast<double>(input_params->scale->data[0] *
-                                weights_feature_params->scale->data[0] /
-                                state_params->scale->data[0]);
-        const double effective_scale_2 = static_cast<double>(
-            state_params->scale->data[0] * weight_time_params->scale->data[0] /
-            output_params->scale->data[0]);
-        QuantizeMultiplier(effective_scale_1, &op_data.effective_scale_1_a,
-                           &op_data.effective_scale_1_b);
-        QuantizeMultiplier(effective_scale_2, &op_data.effective_scale_2_a,
-                           &op_data.effective_scale_2_b);
-
-        TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActRelu);
-        EvalIntegerSVDF(
-            context, node, input, weights_feature, weights_time, bias, params,
-            activation_state, output, op_data.effective_scale_1_a,
-            op_data.effective_scale_1_b, op_data.effective_scale_2_a,
-            op_data.effective_scale_2_b, input->params.zero_point,
-            output->params.zero_point);
-        return kTfLiteOk;
-      }
+      TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActRelu);
+      EvalIntegerSVDF(context, node, input, weights_feature, weights_time, bias,
+                      params, activation_state, output, data,
+                      input->params.zero_point, output->params.zero_point);
+      return kTfLiteOk;
+
       break;
     }

@@ -540,7 +607,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace svdf

 TfLiteRegistration Register_SVDF() {
-  return {/*init=*/nullptr,
+  return {/*init=*/svdf::Init,
          /*free=*/nullptr,
          /*prepare=*/svdf::Prepare,
          /*invoke=*/svdf::Eval,
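
Register_SVDF() now wires in svdf::Init, so the per-node OpData lives in the arena and the effective scales are computed once in Prepare instead of on every Eval. A short sketch of the init hook as TFLM invokes it; the names here are illustrative, and the AllocatePersistentBuffer shape matches the call in the diff above:

#include "tensorflow/lite/c/common.h"

struct MyOpData { int scratch_index; };

// init runs once per node; the returned pointer becomes node->user_data,
// which Prepare and Eval then read back.
void* MyInit(TfLiteContext* context, const char* buffer, size_t length) {
  (void)buffer;  // flatbuffer custom-op data, unused here
  (void)length;
  void* data = nullptr;
  if (context->AllocatePersistentBuffer(context, sizeof(MyOpData), &data) ==
      kTfLiteError) {
    return nullptr;  // signals allocation failure to the interpreter
  }
  return data;
}
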
@@ -4,6 +4,8 @@ ifneq ($(filter xtensa_hifi, $(ALL_TAGS)),)

   ifneq (,$(filter hifi4%, $(TARGET_ARCH)))

+    NNLIB = xa_nnlib_hifi4
+
     CCFLAGS += -DNNLIB_V2 \
                -DXTENSA_NNLIB_MAX_SCRATCH_SIZE=70*1024

@@ -11,56 +13,60 @@ ifneq ($(filter xtensa_hifi, $(ALL_TAGS)),)
                -DXTENSA_NNLIB_MAX_SCRATCH_SIZE=70*1024

     MICROLITE_CC_SRCS += \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_f32_f32.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_asym8_asym8.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_32_16.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_activations_32_8.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi4/xa_nn_softmax_asym8_asym8.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_floor_f32.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_circ_buf.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_asym8xasym8.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_f32.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_matXvec_asym8xasym8_asym8_circ.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_matXvec_f32_circ.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_f32.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_asym8xasym8.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/cnn/hifi4/xa_nn_circ_buf.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/fc/hifi4/xa_nn_fully_connected.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_f32.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_16x16.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x16.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x8.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi4/xa_nn_matXvec_asym8xasym8.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_f32.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_f32.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_f32_nhwc.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8_nhwc.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_f32_nhwc.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8_nhwc.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/kernels/pool/hifi4/xa_nn_inv_256_tbl.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_sigmoidf_hifi4.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_tanhf_hifi4.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_reluf_hifi4.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_softmaxf_hifi4.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/vec_alognf_hifi4.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/scl_sigmoidf_hifi4.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/scl_tanhf_hifi4.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/expf_tbl.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/pow2f_tbl.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/inff_tbl.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/tanhf_tbl.c \
-      $(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/src/nanf_tbl.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_f32_f32.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_asym8_asym8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_32_16.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_activations_32_8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/activations/hifi4/xa_nn_softmax_asym8_asym8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_floor_f32.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_add_f32.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_add_quant8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_mul_f32.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/basic/hifi4/xa_nn_elm_mul_quant8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_circ_buf.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_asym8xasym8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_std_f32.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_matXvec_asym8xasym8_asym8_circ.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_matXvec_f32_circ.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_f32.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_conv2d_depthwise_asym8xasym8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/cnn/hifi4/xa_nn_circ_buf.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/fc/hifi4/xa_nn_fully_connected.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_f32.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_16x16.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x16.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_8x8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/matXvec/hifi4/xa_nn_matXvec_asym8xasym8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_f32.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_f32.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_f32_nhwc.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_avgpool_asym8_nhwc.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_f32_nhwc.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_maxpool_asym8_nhwc.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/kernels/pool/hifi4/xa_nn_inv_256_tbl.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_sigmoidf_hifi4.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_tanhf_hifi4.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_reluf_hifi4.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_softmaxf_hifi4.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/vec_alognf_hifi4.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/scl_sigmoidf_hifi4.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/scl_tanhf_hifi4.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/expf_tbl.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/pow2f_tbl.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/inff_tbl.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/tanhf_tbl.c \
+      $(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/src/nanf_tbl.c \

-    INCLUDES += -I$(XTENSA_PATH)/xa_nnlib/algo/kernels/ \
-                -I$(XTENSA_PATH)/xa_nnlib/include/nnlib/ \
-                -I$(XTENSA_PATH)/xa_nnlib/include/ \
-                -I$(XTENSA_PATH)/xa_nnlib/algo/common/include/ \
-                -I$(XTENSA_PATH)/xa_nnlib/algo/ndsp/hifi4/include/ \
+    INCLUDES += -I$(XTENSA_PATH)/$(NNLIB)/algo/kernels/ \
+                -I$(XTENSA_PATH)/$(NNLIB)/include/nnlib/ \
+                -I$(XTENSA_PATH)/$(NNLIB)/include/ \
+                -I$(XTENSA_PATH)/$(NNLIB)/algo/common/include/ \
+                -I$(XTENSA_PATH)/$(NNLIB)/algo/ndsp/hifi4/include/ \

   endif

@@ -5,7 +5,7 @@
 ifeq ($(TARGET), xtensa_hifi)
   TARGET_ARCH := hifi3_bd5

-  $(eval $(call add_third_party_download,$(XTENSA_HIFI4_URL),$(XTENSA_HIFI4_MD5),xa_nnlib,))
+  $(eval $(call add_third_party_download,$(XTENSA_HIFI4_URL),$(XTENSA_HIFI4_MD5),xa_nnlib_hifi4,))

   PLATFORM_ARGS = \
     -mno-mul16 \
@@ -80,8 +80,8 @@ EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6"
 ZEPHYR_URL := "https://github.com/antmicro/zephyr/archive/55e36b9.zip"
 ZEPHYR_MD5 := "755622eb4812fde918a6382b65d50c3b"

-XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
-XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"
+XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_06_27.zip"
+XTENSA_HIFI4_MD5 :="45fdc1209a8da62ab568aa6040f7eabf"

 ETHOSU_URL := "https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git/snapshot/ethos-u-core-driver-bcb5aaa99756f1b5c1295b079ebdd60996bc75a5.tar.gz"
 ETHOSU_MD5 := "d2073c8d88fc167fd5c46b5dcda58ea1"