Merge pull request #39077 from foss-for-synopsys-dwc-arc-processors:arcmli_upstream

PiperOrigin-RevId: 312835786
Change-Id: I08f27121798a2a59c845d6b357e135716d690184
TensorFlower Gardener 2020-05-22 09:10:59 -07:00
commit 29fb4d12a7
44 changed files with 6275 additions and 1317 deletions

View File

@@ -0,0 +1,111 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/debug_log.h"
#include <cstdint>
#include <cstdio>
#include <cstring>
// Print to debug console by default. Define any of the following macros to
// extend the set of destinations:
// EMSDP_LOG_TO_MEMORY
//   : fill the .debug_log memory region (data section) with the passed chars.
// EMSDP_LOG_TO_HOST
//   : use MetaWare HostLink to print the output log. Requires the Synopsys
//     MetaWare debugger.
// EMSDP_LOG_TO_UART
//   : use the default debug UART (out to FTDI channel 0). The same USB port is
//     used for JTAG.
#define EMSDP_LOG_TO_UART
// Memory size for the symbol dump used by the EMSDP_LOG_TO_MEMORY destination
#define EMSDP_LOG_TO_MEMORY_SIZE (2 * 1024)
// EMSDP Debug UART related defines (registers and bits)
#define EMSDP_DBG_UART_BASE (0xF0004000U)
#define DW_UART_CPR_FIFO_STAT (1 << 10)
#define DW_UART_USR_TFNF (0x02)
#define DW_UART_LSR_TXD_EMPTY (0x20)
// EMSDP UART register map (only the necessary fields)
typedef volatile struct dw_uart_reg {
uint32_t DATA; /* data in/out and DLL */
uint32_t RES1[4];
uint32_t LSR; /* Line Status Register */
uint32_t RES2[25];
uint32_t USR; /* UART status register */
uint32_t RES3[29];
uint32_t CPR; /* Component parameter register */
} DW_UART_REG;
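// With the reserved gaps above, LSR lands at offset 0x14, USR at 0x7C and CPR
// at 0xF4 from EMSDP_DBG_UART_BASE, matching the DW_apb_uart register map.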
// For simplicity we assume U-Boot has already initialized the debug console
// during application loading (or on reset). Hence, we use only the status and
// data registers to implement a blocking loop for printing characters. No
// input and no IRQ handling. See the embarc_osp repository for the full EMSDP
// UART driver.
// (https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp)
void DbgUartSendStr(const char* s) {
DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE);
const char* src = s;
while (*src) {
// Check uart status to send char
bool uart_is_ready = false;
if (uart_reg_ptr->CPR & DW_UART_CPR_FIFO_STAT)
uart_is_ready = ((uart_reg_ptr->USR & DW_UART_USR_TFNF) != 0);
else
uart_is_ready = ((uart_reg_ptr->LSR & DW_UART_LSR_TXD_EMPTY) != 0);
// Send char if uart is ready.
if (uart_is_ready) uart_reg_ptr->DATA = *src++;
}
}
// Simple dump of characters to a pre-allocated memory region.
// When the total log exceeds the memory region size, the cursor wraps back to
// the beginning of the region. The memory region can be viewed/read with a
// debugger afterward.
void LogToMem(const char* s) {
static int cursor = 0;
#pragma Bss(".debug_log")
static volatile char debug_log_mem[EMSDP_LOG_TO_MEMORY_SIZE];
#pragma Bss()
const char* src = s;
while (*src) {
debug_log_mem[cursor] = *src++;
cursor = (cursor < EMSDP_LOG_TO_MEMORY_SIZE - 1) ? cursor + 1 : 0;
}
debug_log_mem[cursor] = '^';
}
extern "C" void DebugLog(const char* s) {
#ifndef TF_LITE_STRIP_ERROR_STRINGS
#if defined EMSDP_LOG_TO_UART
DbgUartSendStr(s);
#endif
#if defined EMSDP_LOG_TO_MEMORY
#warning \
"EMSDP_LOG_TO_MEMORY is defined. View .debug_log memory region for stdout"
LogToMem(s);
#endif
#if defined EMSDP_LOG_TO_HOST
#warning "EMSDP_LOG_TO_HOST is defined. Ensure hostlib is linked."
fprintf(stderr, "%s", s);
#endif
#endif // TF_LITE_STRIP_ERROR_STRINGS
}
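For context, here is a minimal usage sketch of how this hook is typically
reached from application code, assuming the 2020-era TFLM error-reporting API
(`tflite::MicroErrorReporter` formats the message and forwards the resulting
string to the platform-specific `DebugLog()` above):
```
#include "tensorflow/lite/micro/micro_error_reporter.h"

int main(int argc, char* argv[]) {
  // The micro error reporter formats the message and hands the resulting
  // string to the DebugLog() implementation defined in this file.
  tflite::MicroErrorReporter micro_error_reporter;
  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
  TF_LITE_REPORT_ERROR(error_reporter, "Hello from the EM SDP debug console");
  return 0;
}
```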

View File

@@ -14,6 +14,7 @@ of the device.
## Table of contents
- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp)
- [Deploy to Arduino](#deploy-to-arduino)
- [Deploy to ESP32](#deploy-to-esp32)
- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge)
@@ -21,6 +22,78 @@ of the device.
- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
- [Train your own model](#train-your-own-model)
## Deploy to ARC EM SDP
The following instructions will help you build and deploy this example to the
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
### Initial Setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all the required tools for working with the ARC EM SDP.
### Generate Example Project
The example project for the ARC EM SDP platform can be generated with the
following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_hello_world_make_project
```
### Build and Run Example
For more detailed information on building and running examples, see the
appropriate sections of the general description of
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
The generated project directory also contains a *README_ARC_EMSDP.md* file with
instructions and options for building and running. Here we only briefly mention
the main steps, which are typically enough to get started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/hello_world/make
```
3. Build the example using
```
make app
```
4. To generate artifacts for self-booting the example from the board, use
```
make flash
```
5. To run the application from the board using a microSD card:
* Copy the contents of the created /bin folder into the root of the microSD
card. Note that the card must be formatted as FAT32 with the default cluster
size (but less than 32 KB).
* Plug the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside the RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run the application from the console, type `make run`.
* To stop execution, press `Ctrl+C` in the console several times.
In both cases (steps 5 and 6) you will see the application output in the serial
terminal.
## Deploy to Arduino
The following instructions will help you build and deploy this sample

View File

@@ -16,6 +16,7 @@ kilobytes of Flash.
## Table of contents
- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp)
- [Deploy to Arduino](#deploy-to-arduino)
- [Deploy to ESP32](#deploy-to-esp32)
- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge)
@@ -25,6 +26,95 @@ kilobytes of Flash.
- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
- [Train your own model](#train-your-own-model)
## Deploy to ARC EM SDP
The following instructions will help you build and deploy this example to the
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
This example is quantized with a symmetric uint8 scheme. As noted in
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md),
embARC MLI supports optimized kernels for int8 quantization only. Therefore,
this example will only use TFLM reference kernels.
The ARC EM SDP board contains a rich set of extension interfaces. You can
choose any compatible microphone and modify
[audio_provider.cc](/tensorflow/lite/micro/examples/micro_speech/audio_provider.cc)
accordingly to use input from your specific microphone. By default, results of
running this example are printed to the console. If you would like to
implement some target-specific actions instead, you need to modify
[command_responder.cc](/tensorflow/lite/micro/examples/micro_speech/command_responder.cc)
accordingly.
The reference implementations of these files are used by default on the EM SDP.
### Initial setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all the required tools for working with the ARC EM SDP.
### Generate Example Project
As the default example doesn't provide any output without real audio, it is
recommended to get started with the example for mock data. The project for the
ARC EM SDP platform can be generated with the following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_micro_speech_mock_make_project
```
### Build and Run Example
For more detailed information on building and running examples, see the
appropriate sections of the general description of
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
The generated project directory also contains a *README_ARC_EMSDP.md* file with
instructions and options for building and running. Here we only briefly mention
the main steps, which are typically enough to get started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/micro_speech_mock/make
```
3. Build the example using
```
make app
```
4. To generate artifacts for self-booting the example from the board, use
```
make flash
```
5. To run the application from the board using a microSD card:
* Copy the contents of the created /bin folder into the root of the microSD
card. Note that the card must be formatted as FAT32 with the default cluster
size (but less than 32 KB).
* Plug the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside the RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run the application from the console, type `make run`.
* To stop execution, press `Ctrl+C` in the console several times.
In both cases (steps 5 and 6) you will see the application output in the serial
terminal.
## Deploy to Arduino
The following instructions will help you build and deploy this sample

View File

@@ -0,0 +1,28 @@
ifeq ($(TARGET), arc_emsdp)
# Patch of the ARC make project to adjust it specifically for the micro_speech
# example. In particular:
# - Extend heap and stack size for application needs
# - Use a linker command file with better usage of fast memory
# - In case the project was generated with MLI usage, reduce scratch buffers.
MICRO_SPEECH_HDRS += \
micro_speech_patch.txt
MICRO_SPEECH_TEST_HDRS += \
micro_speech_patch.txt
MICRO_SPEECH_MOCK_HDRS += \
micro_speech_patch.txt
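# The rule below replaces the generated emsdp.lcf with the alternative LCF and
# edits the generated Makefile in place: the first sed enlarges heap and stack
# to 16K each, the second appends compiler flags that zero the MLI scratch
# buffer sizes while keeping MLI_ONLY ?= false. The names of the patched files
# are recorded in the patch marker file.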
%/micro_speech_patch.txt: %/emsdp.lcf %/Makefile
@cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $<
@echo emsdp.lcf > $@
@sed -E -i 's#-Hheap=[^ ]*#\-Hheap=16K \-Hstack=16K#g' $(word 2, $^)
@sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\
CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\
CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\
$(word 2, $^)
@echo Makefile >> $@
endif

View File

@@ -5,7 +5,9 @@ network to recognize people in images captured by a camera. It is designed to
run on systems with small amounts of memory such as microcontrollers and DSPs.
## Table of contents
- [Getting started](#getting-started)
- [Running on ARC EM SDP](#running-on-arc-em-sdp)
- [Running on Arduino](#running-on-arduino)
- [Running on ESP32](#running-on-esp32)
- [Running on SparkFun Edge](#running-on-sparkfun-edge)
@@ -13,6 +15,94 @@ run on systems with small amounts of memory such as microcontrollers and DSPs.
- [Debugging image capture](#debugging-image-capture)
- [Training your own model](#training-your-own-model)
## Running on ARC EM SDP
The following instructions will help you build and deploy this example to the
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
This example is quantized with a symmetric uint8 scheme. As noted in
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md),
embARC MLI supports optimized kernels for int8 quantization only. Therefore,
this example will only use TFLM reference kernels.
The ARC EM SDP board contains a rich set of extension interfaces. You can
choose any compatible camera and modify
[image_provider.cc](/tensorflow/lite/micro/examples/person_detection/image_provider.cc)
accordingly to use input from your specific camera. By default, results of
running this example are printed to the console. If you would like to
implement some target-specific actions instead, you need to modify
[detection_responder.cc](/tensorflow/lite/micro/examples/person_detection/detection_responder.cc)
accordingly.
The reference implementations of these files are used by default on the EM SDP.
### Initial setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all the required tools for working with the ARC EM SDP.
### Generate Example Project
The example project for the ARC EM SDP platform can be generated with the
following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_make_project
```
### Build and Run Example
For more detailed information on building and running examples, see the
appropriate sections of the general description of
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
The generated project directory also contains a *README_ARC_EMSDP.md* file with
instructions and options for building and running. Here we only briefly mention
the main steps, which are typically enough to get started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make
```
3. Build the example using
```
make app
```
4. To generate artifacts for self-booting the example from the board, use
```
make flash
```
5. To run the application from the board using a microSD card:
* Copy the contents of the created /bin folder into the root of the microSD
card. Note that the card must be formatted as FAT32 with the default cluster
size (but less than 32 KB).
* Plug the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside the RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run the application from the console, type `make run`.
* To stop execution, press `Ctrl+C` in the console several times.
In both cases (steps 5 and 6) you will see the application output in the serial
terminal.
## Running on Arduino
The following instructions will help you build and deploy this sample

View File

@@ -0,0 +1,24 @@
ifeq ($(TARGET), arc_emsdp)
# Patch of the ARC make project to adjust it specifically
# for the person detection example. In particular:
# - Use a linker command file with better usage of fast memory
# - In case the project was generated with MLI usage, reduce scratch buffers.
person_detection_HDRS += \
person_detection_patch.txt
person_detection_TEST_HDRS += \
person_detection_patch.txt
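# The rule below replaces the generated emsdp.lcf with the alternative LCF and
# edits the generated Makefile in place, appending compiler flags that zero the
# MLI scratch buffer sizes while keeping MLI_ONLY ?= false. The names of the
# patched files are recorded in the patch marker file.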
%/person_detection_patch.txt: %/emsdp.lcf %/Makefile
@cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $<
@echo emsdp.lcf > $@
@sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\
CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\
CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\
$(word 2, $^)
@echo Makefile >> $@
endif

View File

@@ -6,13 +6,101 @@ run on systems with small amounts of memory such as microcontrollers and DSPs.
This uses the experimental int8 quantized version of the person detection model.
## Table of contents
- [Getting started](#getting-started)
- [Running on ARC EM SDP](#running-on-arc-em-sdp)
- [Running on Arduino](#running-on-arduino)
- [Running on SparkFun Edge](#running-on-sparkfun-edge)
- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
- [Debugging image capture](#debugging-image-capture)
- [Training your own model](#training-your-own-model)
## Running on ARC EM SDP
The following instructions will help you build and deploy this example to the
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
This example uses asymmetric int8 quantization and can therefore leverage
optimized int8 kernels from the embARC MLI library.
The ARC EM SDP board contains a rich set of extension interfaces. You can choose
any compatible camera and modify
[image_provider.cc](/tensorflow/lite/micro/examples/person_detection_experimental/image_provider.cc)
accordingly to use input from your specific camera. By default, results of
running this example are printed to the console. If you would like to
implement some target-specific actions instead, you need to modify
[detection_responder.cc](/tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.cc)
accordingly.
The reference implementations of these files are used by default on the EM SDP.
### Initial setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all the required tools for working with the ARC EM SDP.
### Generate Example Project
The example project for the ARC EM SDP platform can be generated with the
following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project
```
### Build and Run Example
For more detailed information on building and running examples, see the
appropriate sections of the general description of
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
The generated project directory also contains a *README_ARC_EMSDP.md* file with
instructions and options for building and running. Here we only briefly mention
the main steps, which are typically enough to get started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make
```
3. Build the example using
```
make app
```
4. To generate artifacts for self-booting the example from the board, use
```
make flash
```
5. To run the application from the board using a microSD card:
* Copy the contents of the created /bin folder into the root of the microSD
card. Note that the card must be formatted as FAT32 with the default cluster
size (but less than 32 KB).
* Plug the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside the RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run the application from the console, type `make run`.
* To stop execution, press `Ctrl+C` in the console several times.
In both cases (steps 5 and 6) you will see the application output in the serial
terminal.
## Running on Arduino
The following instructions will help you build and deploy this sample

View File

@@ -0,0 +1,21 @@
ifeq ($(TARGET), arc_emsdp)
# Patch of the ARC make project to adjust it specifically
# for the experimental person detection example. In particular:
# - Use a linker command file with better usage of fast memory
# - Strip out TFLM reference code by default.
person_detection_HDRS += \
person_detection_int8_patch.txt
person_detection_TEST_HDRS += \
person_detection_int8_patch.txt
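# The rule below replaces the generated emsdp.lcf with the example-specific LCF
# and flips MLI_ONLY to true in the generated Makefile, so the TFLM reference
# fallback code is stripped from the build. The names of the patched files are
# recorded in the patch marker file.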
%/person_detection_int8_patch.txt: %/emsdp.lcf %/Makefile
@cp tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf $<
@echo emsdp.lcf > $@
@sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= true#' $(word 2, $^)
@echo Makefile >> $@
endif

View File

@@ -0,0 +1,74 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Difference with common EMSDP LCF file (to reduce data access time):
# - move data from external PSRAM to on-chip memory
# - move text from SRAM to ICCM
#
# CCMWRAP memory regions indicate unusable portions of the address space
# due to CCM memory wrapping into upper addresses beyond its size
MEMORY {
PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400
SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000
IVT : ORIGIN = 0x60000000, LENGTH = 0x400
ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400)
# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000
DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000
# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000
XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000
# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000
YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000
# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000
}
SECTIONS {
GROUP BLOCK(4) : {
.vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4)
} > IVT
GROUP BLOCK(4): {
.text? : { *('.text$crt*') }
* (TEXT): {}
* (LIT): {}
} > ICCM0
GROUP BLOCK(4): {
.rodata_in_data? : {}
} > PSRAM
GROUP BLOCK(4): {
/* _SDA_BASE_ computed implicitly */
.sdata?: {}
.sbss?: {}
* (DATA): {}
* (BSS): {}
.debug_log? : {}
} > SRAM
GROUP BLOCK(4): {
.Zdata? : {}
.heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {}
.stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {}
} > DCCM
GROUP BLOCK(4): {
.Xdata? : {}
} > XCCM
GROUP BLOCK(4): {
.Ydata? : {}
} > YCCM
}
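For reference, the named input sections above (`.debug_log?`, `.Xdata?`,
`.Ydata?`, `.Zdata?`) are filled from C/C++ code via the MetaWare section
pragmas, in the same way debug_log.cc places its dump buffer. A minimal,
hypothetical sketch (buffer names and sizes are illustrative only, not part of
this change):
```
#include <cstdint>

// Hypothetical buffer placed in the XCCM bank through the .Xdata input section.
#pragma Bss(".Xdata")
static int8_t scratch_mem_x[2048];
#pragma Bss()

// Hypothetical buffer placed in DCCM through the .Zdata input section.
#pragma Bss(".Zdata")
static int8_t scratch_mem_z[4096];
#pragma Bss()
```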

View File

@@ -1,343 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/conv.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h"
namespace tflite {
namespace ops {
namespace micro {
namespace conv {
constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
// This file has 2 implementations of Conv.
const int kTensorNotAllocated = -1;
struct OpData {
TfLitePaddingValues padding;
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// Per channel output multiplier and shift.
// TODO(b/141139247): Allocate these dynamically when possible.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
};
inline PaddingType RuntimePaddingType(TfLitePadding padding) {
switch (padding) {
case TfLitePadding::kTfLitePaddingSame:
return PaddingType::kSame;
case TfLitePadding::kTfLitePaddingValid:
return PaddingType::kValid;
case TfLitePadding::kTfLitePaddingUnknown:
default:
return PaddingType::kNone;
}
}
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, int width, int height,
int filter_width, int filter_height, int out_width,
int out_height, const TfLiteType data_type,
OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
// Matching GetWindowedOutputSize in TensorFlow.
auto padding = params->padding;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width,
params->dilation_height_factor, params->dilation_width_factor, height,
width, filter_height, filter_width, padding, &out_height, &out_width);
// Note that quantized inference requires that all tensors have their
// parameters set. This is usually done during quantized training.
if (data_type != kTfLiteFloat32) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift)));
}
return kTfLiteOk;
}
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* im2col,
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(filter),
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
GetTensorData<int32_t>(bias), GetTensorShape(output),
GetTensorData<uint8_t>(output), GetTensorShape(im2col),
GetTensorData<uint8_t>(im2col), nullptr);
}
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
TfLiteTensor* im2col) {
// Run Conv MLI kernel
// MLI optimized version only supports int8 datatype and dilation factor of 1
if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
mli_conv2d_cfg cfg = {};
// reuse space allocated for OpData parameters
mli_weights.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_multiplier;
mli_bias.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_shift;
int16_t filter_zero_point = 0;
int16_t bias_zero_point = 0;
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
} else if (params->activation == kTfLiteActRelu6) {
cfg.relu.type = MLI_RELU_6;
} else if (params->activation == kTfLiteActRelu1) {
cfg.relu.type = MLI_RELU_1;
} else {
cfg.relu.type = MLI_RELU_NONE;
}
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
mli_point_to_subtsr_cfg substr_cfg_in = {
{0, 0}, 2, static_cast<uint8_t>(mli_in.shape[1])};
mli_point_to_subtsr_cfg substr_cfg_out = {
{0, 0}, 2, static_cast<uint8_t>(mli_out.shape[1])};
mli_tensor sub_mli_in = {0};
mli_tensor sub_mli_out = {0};
const int batches =
MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0);
for (int i = 0; i < batches; i++) {
substr_cfg_in.start_coord[0] = i;
substr_cfg_out.start_coord[0] = i;
mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in);
mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out);
mli_krn_conv2d_hwc_sa8_sa8_sa32(&sub_mli_in, &mli_weights, &mli_bias,
&cfg, &sub_mli_out);
}
} else {
ConvParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
reference_integer_ops::ConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
}
}
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* im2col,
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(filter),
GetTensorData<float>(filter), GetTensorShape(bias),
GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output), GetTensorShape(im2col),
GetTensorData<float>(im2col));
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
int input_width = input->dims->data[2];
int input_height = input->dims->data[1];
int filter_width = filter->dims->data[2];
int filter_height = filter->dims->data[1];
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
// Conv is quantized along dimension 0:
// https://www.tensorflow.org/lite/performance/quantization_spec
TF_LITE_ENSURE_EQ(context, filter->dims->data[0],
affine_quantization->scale->size);
TF_LITE_ENSURE_EQ(context, filter->dims->data[0],
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpData(
context, node, params, input_width, input_height, filter_width,
filter_height, output_width, output_height, input->type, &data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
EvalFloat(context, node, params, &data, input, filter, bias, nullptr,
nullptr, output);
break;
case kTfLiteInt8:
EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
output, nullptr);
break;
case kTfLiteUInt8:
EvalQuantized(context, node, params, &data, input, filter, bias, nullptr,
nullptr, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace conv
TfLiteRegistration* Register_CONV_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@@ -1,344 +0,0 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h"
namespace tflite {
namespace ops {
namespace micro {
namespace depthwise_conv {
namespace {
constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
struct OpData {
TfLitePaddingValues padding;
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// Per channel output multiplier and shift.
// TODO(b/141139247): Allocate these dynamically when possible.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
};
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, int width,
int height, int filter_width, int filter_height,
const TfLiteType data_type, OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
int unused_output_height, unused_output_width;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width, 1, 1, height, width,
filter_height, filter_width, params->padding, &unused_output_height,
&unused_output_width);
// Note that quantized inference requires that all tensors have their
// parameters set. This is usually done during quantized training.
if (data_type != kTfLiteFloat32) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
// Ensure filter and bias channel count does not exceed space reserved for
// quantization metadata.
const auto filter_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
const auto bias_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(bias->quantization.params);
TF_LITE_ENSURE(context, filter_quantization->scale->size <= kMaxChannels);
TF_LITE_ENSURE(context, bias_quantization->scale->size <= kMaxChannels);
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift)));
}
return kTfLiteOk;
}
} // namespace
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
tflite::reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
}
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
// Run Depthwise Conv MLI kernel
// MLI optimized version only supports int8 datatype and dilation factor of 1
if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
mli_conv2d_cfg cfg = {};
// reuse space allocated for OpData parameters
mli_weights.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_multiplier;
mli_bias.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_shift;
int16_t filter_zero_point = 0;
int16_t bias_zero_point = 0;
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
} else if (params->activation == kTfLiteActRelu6) {
cfg.relu.type = MLI_RELU_6;
} else if (params->activation == kTfLiteActRelu1) {
cfg.relu.type = MLI_RELU_1;
} else {
cfg.relu.type = MLI_RELU_NONE;
}
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
mli_point_to_subtsr_cfg substr_cfg_in = {
{0, 0}, 2, static_cast<uint8_t>(mli_in.shape[1])};
mli_point_to_subtsr_cfg substr_cfg_out = {
{0, 0}, 2, static_cast<uint8_t>(mli_out.shape[1])};
mli_tensor sub_mli_in = {0};
mli_tensor sub_mli_out = {0};
const int batches =
MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0);
for (int i = 0; i < batches; i++) {
substr_cfg_in.start_coord[0] = i;
substr_cfg_out.start_coord[0] = i;
mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in);
mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out);
mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(&sub_mli_in, &mli_weights,
&mli_bias, &cfg, &sub_mli_out);
}
} else {
DepthwiseParams op_params;
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = 0;
op_params.output_offset = output->params.zero_point;
// TODO(b/130439627): Use calculated value for clamping.
op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
reference_integer_ops::DepthwiseConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
}
}
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
tflite::reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<uint8_t>(output));
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
(NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
const TfLiteType data_type = input->type;
int width = SizeOfDimension(input, 2);
int height = SizeOfDimension(input, 1);
int filter_width = SizeOfDimension(filter, 2);
int filter_height = SizeOfDimension(filter, 1);
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
TF_LITE_ENSURE_EQ(context, filter->dims->data[3],
affine_quantization->scale->size);
TF_LITE_ENSURE_EQ(context, filter->dims->data[3],
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
filter_width, filter_height, data_type,
&data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
EvalFloat(context, node, params, &data, input, filter, bias, output);
break;
case kTfLiteInt8:
EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
output);
break;
case kTfLiteUInt8:
EvalQuantized(context, node, params, &data, input, filter, bias, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace depthwise_conv
TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/depthwise_conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@@ -1,248 +0,0 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h"
namespace tflite {
namespace ops {
namespace micro {
namespace fully_connected {
namespace {
struct OpData {
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
// The index of the temporary tensor where the quantized inputs are cached.
int input_quantized_index;
};
constexpr int kInputTensor = 0;
constexpr int kWeightsTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
TfLiteStatus CalculateOpData(TfLiteContext* context,
TfLiteFullyConnectedParams* params,
TfLiteType data_type, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
OpData* data) {
TfLiteStatus status = kTfLiteOk;
if (data_type != kTfLiteFloat32) {
double real_multiplier = 0.0;
TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
context, input, filter, bias, output, &real_multiplier));
int exponent;
QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
data->output_shift = -exponent;
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
}
return status;
}
} // namespace
TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
// Run Fully Connected MLI kernel
// MLI optimized version only supports int8 datatype and no fused Relu
// TODO: subject to add mli_saturate kernel
// Workaround for issue #35318: the MLI fully connected kernel only supports
// zero_point == 0 for weights. This check can be removed once issue #35318 is
// resolved.
if ((filter->params.zero_point == 0) &&
(input->type == kTfLiteInt8 && params->activation == kTfLiteActNone)) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(filter, &mli_weights);
ConvertToMliTensor<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
mli_point_to_subtsr_cfg substr_cfg_in = {
{0, 0}, 2, static_cast<uint8_t>(mli_in.shape[1])};
mli_point_to_subtsr_cfg substr_cfg_out = {
{0, 0}, 2, static_cast<uint8_t>(mli_out.shape[1])};
mli_tensor sub_mli_in = {0};
mli_tensor sub_mli_out = {0};
const int batches =
MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0);
for (int i = 0; i < batches; i++) {
substr_cfg_in.start_coord[0] = i;
substr_cfg_out.start_coord[0] = i;
mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in);
mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out);
mli_krn_fully_connected_sa8_sa8_sa32(&sub_mli_in, &mli_weights, &mli_bias,
&sub_mli_out);
}
} else {
FullyConnectedParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = -filter->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.output_multiplier = data->output_multiplier;
// TODO(b/138810107): Figure out whether output shift should be inverted
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_integer_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(filter), GetTensorData<int8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<int8_t>(output));
}
return kTfLiteOk;
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
tflite::FullyConnectedParams op_params;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
#define TF_LITE_FULLY_CONNECTED(output_data_type) \
reference_ops::FullyConnected( \
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
GetTensorShape(filter), GetTensorData<uint8_t>(filter), \
GetTensorShape(bias), GetTensorData<int32_t>(bias), \
GetTensorShape(output), GetTensorData<output_data_type>(output))
switch (output->type) {
case kTfLiteUInt8:
TF_LITE_FULLY_CONNECTED(uint8_t);
break;
case kTfLiteInt16:
TF_LITE_FULLY_CONNECTED(int16_t);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(output->type), output->type);
return kTfLiteError;
}
return kTfLiteOk;
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::FullyConnectedParams op_params;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
tflite::reference_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
return kTfLiteOk;
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteType data_type = input->type;
OpData local_data_object;
OpData* data = &local_data_object;
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
filter, bias, output, data));
switch (filter->type) { // Already know in/out types are same.
case kTfLiteFloat32:
return EvalFloat(context, node, params, data, input, filter, bias,
output);
case kTfLiteInt8:
return EvalQuantizedInt8(context, node, params, data, input, filter, bias,
output);
case kTfLiteUInt8:
return EvalQuantized(context, node, params, data, input, filter, bias,
output);
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(filter->type), filter->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace fully_connected
TfLiteRegistration* Register_FULLY_CONNECTED() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/fully_connected::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -1,292 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/pooling.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h"
namespace tflite {
namespace ops {
namespace micro {
namespace pooling {
namespace {
constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
struct OpData {
TfLitePaddingValues padding;
};
TfLiteStatus CalculateOpData(const TfLiteContext* context,
const TfLitePoolParams* params,
const TfLiteTensor* input,
const TfLiteTensor* output, OpData* data) {
// input: batch, height, width, channel
int height = SizeOfDimension(input, 1);
int width = SizeOfDimension(input, 2);
int out_height, out_width;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width,
/*dilation_rate_height=*/1,
/*dilation_rate_width=*/1, height, width, params->filter_height,
params->filter_width, params->padding, &out_height, &out_width);
return kTfLiteOk;
}
void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node,
const TfLitePoolParams* params, const OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
float activation_min, activation_max;
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
}
void AverageEvalUint8(TfLiteContext* context, const TfLiteNode* node,
const TfLitePoolParams* params, const OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(output), GetTensorData<uint8_t>(output));
}
void AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node,
const TfLitePoolParams* params, const OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
// Run Average Pooling MLI kernel
  // MLI optimized version only supports int8 datatype and no fused Relu.
  // TODO: extend with the mli_saturate kernel to support fused activations.
if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) {
mli_tensor mli_in = {0};
mli_tensor mli_out = {0};
mli_pool_cfg cfg = {0};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(output, &mli_out);
cfg.kernel_width = params->filter_width;
cfg.kernel_height = params->filter_height;
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
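      // SAME padding: ComputePaddingHeightWidth splits the total padding over
      // both sides and stores the extra odd pixel (if any) in
      // width_offset/height_offset, so it is applied to the right/bottom edge.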
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
mli_point_to_subtsr_cfg substr_cfg_in = {
{0, 0}, 2, static_cast<uint8_t>(mli_in.shape[1])};
mli_point_to_subtsr_cfg substr_cfg_out = {
{0, 0}, 2, static_cast<uint8_t>(mli_out.shape[1])};
mli_tensor sub_mli_in = {0};
mli_tensor sub_mli_out = {0};
const int batches =
MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0);
for (int i = 0; i < batches; i++) {
substr_cfg_in.start_coord[0] = i;
substr_cfg_out.start_coord[0] = i;
mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in);
mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out);
mli_krn_avepool_hwc_sa8(&sub_mli_in, &cfg, &sub_mli_out);
}
} else {
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
reference_integer_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(output), GetTensorData<int8_t>(output));
}
}
void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLitePoolParams* params, OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
float activation_min, activation_max;
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::MaxPool(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(output),
GetTensorData<float>(output));
}
void MaxEvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node,
TfLitePoolParams* params, OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
reference_ops::MaxPool(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(output),
GetTensorData<uint8_t>(output));
}
} // namespace
TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
  // Inputs and outputs share the same type, guaranteed by the converter.
switch (input->type) {
case kTfLiteFloat32:
AverageEvalFloat(context, node, params, &data, input, output);
break;
case kTfLiteUInt8:
AverageEvalUint8(context, node, params, &data, input, output);
break;
case kTfLiteInt8:
AverageEvalInt8(context, node, params, &data, input, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
return kTfLiteOk;
}
TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
switch (input->type) {
case kTfLiteFloat32:
MaxEvalFloat(context, node, params, &data, input, output);
break;
case kTfLiteUInt8:
MaxEvalQuantizedUInt8(context, node, params, &data, input, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace pooling
TfLiteRegistration* Register_AVERAGE_POOL_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pooling::AverageEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
TfLiteRegistration* Register_MAX_POOL_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pooling::MaxEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,96 @@
# EmbARC MLI Library Based Optimizations of TensorFlow Lite Micro Kernels for ARC Platforms.
This folder contains kernel implementations that use the optimized
[embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli).
It enables acceleration of inference operations that use int8 (asymmetric
quantization).
## Usage
The embARC MLI Library is used by default to speed up execution of some kernels
for asymmetrically quantized layers. This means that the usual project
generation for an ARC-specific target implies usage of embARC MLI. For example:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project
```
In case the MLI implementation can't be used, kernels in this folder fall back
to the TFLM reference implementations. For applications which may not benefit
from the MLI library, projects can be generated without these implementations
by adding `TAGS=no_arc_mli` to the command line, which can reduce overall code
size:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_int8_make_project
```
For the ARC EM SDP board, a pre-compiled MLI library is downloaded and used in
the application. For a custom ARC-based target platform, MLI sources are
downloaded and compiled during the project generation phase. To build the
library from sources for the ARC EM SDP platform, add the `BUILD_ARC_MLI=true`
option to the make command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp BUILD_ARC_MLI=true generate_person_detection_int8_make_project
```
If an application exclusively uses accelerated MLI kernel implementations, one
can strip out the TFLM reference kernel implementations to reduce the code size
of the application. Build the application with the `MLI_ONLY=true` option in
the generated project (after the project has been generated):
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make
make app MLI_ONLY=true
```
If you try this and application execution fails, then most probably MLI can't
be used for some nodes and you need to revert to using the TFLM reference
kernels.
## Limitations
Currently, the MLI Library provides optimized implementations only for int8
(asymmetric) versions of the following kernels:

1. Convolution 2D (per-axis quantization only, `dilation_ratio==1`)
2. Depthwise Convolution 2D (per-axis quantization only, `dilation_ratio==1`)
3. Average Pooling
4. Max Pooling
5. Fully Connected
Currently only
[/tensorflow/lite/micro/examples/person_detection_experimental](/tensorflow/lite/micro/examples/person_detection_experimental)
is quantized using this specification. Other examples can be executed on
ARC-based targets, but will only use reference kernels.
## Scratch Buffers and Slicing
The following information applies only to ARC EM SDP and other targets with XY
memory. embARC MLI uses specific optimizations which assume node operands are
in XY memory and/or DCCM (Data Closely Coupled Memory). As operands might be
quite big and may not fit in the available XY memory, special slicing logic is
applied which allows kernel calculations to be split into multiple parts. For
this reason, internal static buffers are allocated in these X, Y and DCCM
memory banks and used to execute sub-calculations.
All of this is performed automatically and is invisible to the user. Half of
the DCCM memory bank and the full XY banks are occupied for MLI-specific needs.
If the user needs space in XY memory for other tasks, these arrays can be
reduced by setting specific sizes. For this, add the following option to the
build command, replacing **<size_a>**, **<size_b>** and **<size_c>** with the
required values:
```
EXT_CFLAGS="-DSCRATCH_MEM_Z_SIZE=<size_a> -DSCRATCH_MEM_X_SIZE=<size_b> -DSCRATCH_MEM_Y_SIZE=<size_c>"
```
For example, to reduce the sizes of the arrays placed in DCCM and XCCM to 32KB
and 8KB respectively, use the following command:
```
make app EXT_CFLAGS="-DSCRATCH_MEM_Z_SIZE=32*1024 -DSCRATCH_MEM_X_SIZE=8*1024"
```
## License
TensorFlow's code is covered by the Apache 2.0 License included in the
repository. Third-party dependencies are covered by their respective licenses,
in the third_party folder of this package.

View File

@ -0,0 +1,490 @@
/* Copyright 2019-2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/conv.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace conv {
constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
// Conv is quantized along dimension 0:
// https://www.tensorflow.org/lite/performance/quantization_spec
constexpr int kConvQuantizedDimension = 0;
struct OpData {
TfLitePaddingValues padding;
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// Per channel output multiplier and shift.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
};
inline PaddingType RuntimePaddingType(TfLitePadding padding) {
switch (padding) {
case TfLitePadding::kTfLitePaddingSame:
return PaddingType::kSame;
case TfLitePadding::kTfLitePaddingValid:
return PaddingType::kValid;
case TfLitePadding::kTfLitePaddingUnknown:
default:
return PaddingType::kNone;
}
}
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
const TfLiteConvParams* params) {
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
  // MLI optimized version only supports int8 datatype, dilation factor of 1 and
// per-axis quantization of weights (no broadcasting/per-tensor)
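  // The (kMaxChannels * 2) bound exists because the MLI path reuses the int32
  // per-channel arrays in OpData as int16 scale storage (see
  // EvalMliQuantizedPerChannel), which doubles the number of usable entries.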
bool ret_val = (filter->type == kTfLiteInt8) &&
(input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
(params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1) &&
(affine_quantization->scale->size ==
filter->dims->data[kConvQuantizedDimension]) &&
affine_quantization->scale->size <= (kMaxChannels * 2);
return ret_val;
}
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, int width, int height,
int filter_width, int filter_height, int out_width,
int out_height, const TfLiteType data_type,
bool mli_is_applicable, OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
// Matching GetWindowedOutputSize in TensorFlow.
auto padding = params->padding;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width,
params->dilation_height_factor, params->dilation_width_factor, height,
width, filter_height, filter_width, padding, &out_height, &out_width);
// Note that quantized inference requires that all tensors have their
// parameters set. This is usually done during quantized training.
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
if (data_type != kTfLiteFloat32 && !mli_is_applicable) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
int output_channels = filter->dims->data[kConvQuantizedDimension];
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift),
output_channels));
}
#endif
return kTfLiteOk;
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(filter),
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
GetTensorData<int32_t>(bias), GetTensorShape(output),
GetTensorData<uint8_t>(output), GetTensorShape(im2col),
GetTensorData<uint8_t>(im2col), nullptr);
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus EvalMliQuantizedPerChannel(
TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params,
OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
// Run Conv MLI kernel
  // MLI optimized version only supports int8 datatype and dilation factor of 1
if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
mli_conv2d_cfg cfg = {};
// reuse space allocated for OpData parameters
mli_weights.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_multiplier;
mli_bias.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_shift;
int16_t filter_zero_point = 0;
int16_t bias_zero_point = 0;
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
} else if (params->activation == kTfLiteActRelu6) {
cfg.relu.type = MLI_RELU_6;
} else if (params->activation == kTfLiteActRelu1) {
cfg.relu.type = MLI_RELU_1;
} else {
cfg.relu.type = MLI_RELU_NONE;
}
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
// for height slicing
const int height_dimension = 1;
int in_slice_height = 0;
int out_slice_height = 0;
const int kernel_height =
static_cast<int>(mli_weights.shape[KRNL_H_DIM_HWC]);
const int overlap = kernel_height - cfg.stride_height;
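    // Adjacent input slices must overlap by (kernel_height - stride_height)
    // rows so that every output row in a slice sees a full filter window.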
// for weight slicing (on output channels)
    // NHWC layout for weights; the output channel dimension is the first
    // dimension.
const int weight_out_ch_dimension = 0;
int slice_channels =
static_cast<int>(mli_weights.shape[weight_out_ch_dimension]);
// Batch-Height-Width-Channel layout means last dimension is output
// channels.
const int out_tensor_ch_dimension = 3;
// Tensors for data in fast (local) memory and config to copy data from
// external to local memory
mli_tensor weights_local = mli_weights;
mli_tensor bias_local = mli_bias;
mli_tensor in_local = mli_in;
mli_tensor out_local = mli_out;
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, kernel_height, cfg.stride_height,
cfg.padding_top, cfg.padding_bottom, &in_slice_height,
&out_slice_height));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels));
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
and there is no need to copy it to the local tensor*/
const bool in_is_local = in_local.data == mli_in.data;
const bool out_is_local = out_local.data == mli_out.data;
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels);
TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, slice_channels);
TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels,
0, 0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
void* input_buffer_ptr = NULL;
int input_buffer_size = 0;
while (!w_slice.Done()) {
mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
      /* The mli_in tensor contains batches of HWC tensors, so it is a
      4-dimensional tensor. Because the MLI kernel will process one HWC tensor
      at a time, the 4-dimensional tensor needs to be sliced into nBatch
      3-dimensional tensors. On top of that, there could be a need to also
      slice in the height dimension. For that, the slice height has been
      calculated. The tensor slicer is configured so that it will completely
      slice the nBatch dimension (0) and slice the height dimension (1) in
      chunks of 'sliceHeight'. */
TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height,
cfg.padding_top, cfg.padding_bottom, overlap);
      /* The output tensor is already sliced in the output channel dimension.
      out_ch_slice.Sub() is the tensor for the output channels of this
      iteration of the weight slice loop. This tensor needs to be further
      sliced over the batch and height dimensions. */
TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
out_slice_height);
/* setup the pointers to the local or remote tensor to make the code
* inside the loop easier. */
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
while (!out_slice.Done()) {
TF_LITE_ENSURE(context, !in_slice.Done());
cfg.padding_top = in_slice.GetPaddingPre();
cfg.padding_bottom = in_slice.GetPaddingPost();
        // If the input is the same as in the previous iteration, skip copying
        // it again.
if ((in_slice.Sub()->data != input_buffer_ptr) ||
(mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data;
input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
}
mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr);
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
out_slice.Next();
}
w_slice.Next();
b_slice.Next();
out_ch_slice.Next();
TF_LITE_ENSURE(context, in_slice.Done());
}
}
return kTfLiteOk;
}
TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
ConvParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_integer_ops::ConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Node configuration is not supported by ARC MLI Library.");
return kTfLiteError;
#endif
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* im2col,
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(filter),
GetTensorData<float>(filter), GetTensorShape(bias),
GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output), GetTensorShape(im2col),
GetTensorData<float>(im2col));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
int input_width = input->dims->data[2];
int input_height = input->dims->data[1];
int filter_width = filter->dims->data[2];
int filter_height = filter->dims->data[1];
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(context,
affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
bool mli_is_applicable =
IsMliApplicable(context, input, filter, bias, params);
TF_LITE_ENSURE_STATUS(
CalculateOpData(context, node, params, input_width, input_height,
filter_width, filter_height, output_width, output_height,
input->type, mli_is_applicable, &data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
return EvalFloat(context, node, params, &data, input, filter, bias,
nullptr, nullptr, output);
break;
case kTfLiteInt8:
if (mli_is_applicable) {
return EvalMliQuantizedPerChannel(context, node, params, &data, input,
filter, bias, output);
} else {
return EvalQuantizedPerChannel(context, node, params, &data, input,
filter, bias, output);
}
break;
case kTfLiteUInt8:
return EvalQuantized(context, node, params, &data, input, filter, bias,
nullptr, nullptr, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace conv
TfLiteRegistration* Register_CONV_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,506 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test checks that the slicing logic doesn't affect the result of the
// convolution kernel.
//
// This test doesn't replace the default convolution test
// (tensorflow/lite/micro/kernels/conv_test.cc). It is added to the whole test
// set only in case MLI for the ARC platform is used during generation (which
// is handled in arc_mli.inc), so such tests won't be generated for other
// platforms.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_utils.h"
#include "tensorflow/lite/micro/testing/micro_test.h"
#include "tensorflow/lite/micro/testing/test_utils.h"
namespace tflite {
namespace testing {
namespace {
// Common inputs and outputs 1.
static const int kInput1Elements = 20;
static const int kInput1Shape[] = {4, 1, 5, 2, 2};
static const float kInput1Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
static const int kFilter1Elements = 36;
static const int kFilter1Shape[] = {4, 2, 3, 3, 2};
static const float kFilter1Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
static const int kBias1Elements = 2;
static const int kBias1Shape[] = {1, 2};
static const float kBias1Data[] = {2, 2};
static const int kOutput1Elements = 20;
static const int kOutput1Shape[] = {4, 1, 5, 2, 2};
static const float kGolden1Data[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
// Common inputs and outputs 2.
static const int kInput2Elements = 80;
static const int kInput2Shape[] = {4, 1, 20, 2, 2};
static const float kInput2Data[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
static const int kFilter2Elements = 36;
static const int kFilter2Shape[] = {4, 2, 3, 3, 2};
static const float kFilter2Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
static const int kBias2Elements = 2;
static const int kBias2Shape[] = {1, 2};
static const float kBias2Data[] = {2, 2};
static const int kOutput2Elements = 80;
static const int kOutput2Shape[] = {4, 1, 20, 2, 2};
static const float kGolden2Data[] = {
34, 34, 34, 34, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
// Common inputs and outputs 3.
static const int kInput3Elements = 40;
static const int kInput3Shape[] = {4, 1, 2, 2, 10};
static const float kInput3Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
static const int kFilter3Elements = 90;
static const int kFilter3Shape[] = {4, 1, 3, 3, 10}; // 1 3 3 10
static const float kFilter3Data[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
static const int kBias3Elements = 1;
static const int kBias3Shape[] = {1, 1};
static const float kBias3Data[] = {1};
static const int kOutput3Elements = 4;
static const int kOutput3Shape[] = {4, 1, 2, 2, 1}; // 2 2 1
static const float kGolden3Data[] = {41, 41, 41, 41};
// Common inputs and outputs 4.
static const int kInput4Elements = 80;
static const int kInput4Shape[] = {4, 1, 4, 2, 10};
static const float kInput4Data[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
static const int kFilter4Elements = 90;
static const int kFilter4Shape[] = {4, 1, 3, 3, 10};
static const float kFilter4Data[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
static const int kBias4Elements = 1;
static const int kBias4Shape[] = {1, 1};
static const float kBias4Data[] = {1};
static const int kOutput4Elements = 8;
static const int kOutput4Shape[] = {4, 1, 4, 2, 1};
static const float kGolden4Data[] = {41, 41, 61, 61, 61, 61, 41, 41};
static TfLiteConvParams common_conv_params = {
kTfLitePaddingSame, // padding
1, // stride_width
1, // stride_height
kTfLiteActNone, // activation
1, // dilation_width_factor
1, // dilation_height_factor
};
template <typename T>
TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size,
const T* expected_output_data, T* output_data,
int output_length,
TfLiteConvParams* conv_params,
float tolerance = 1e-5) {
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_CONV_2D, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
const char* init_data = reinterpret_cast<const char*>(conv_params);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {3, 0, 1, 2};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 3};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(conv_params);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TfLiteStatus return_val = registration->invoke(&context, &node);
if (return_val != kTfLiteOk) {
return return_val;
}
if (registration->free) {
registration->free(&context, user_data);
}
for (int i = 0; i < output_length; ++i) {
TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i],
tolerance);
}
return kTfLiteOk;
}
void TestConvQuantizedPerChannel(
const int* input_dims_data, const float* input_data,
int8_t* input_quantized, float input_scale, int input_zero_point,
const int* filter_dims_data, const float* filter_data,
int8_t* filter_data_quantized, const int* bias_dims_data,
const float* bias_data, int32_t* bias_data_quantized, float* bias_scales,
int* bias_zero_points, const int* output_dims_data,
const float* expected_output_data, int8_t* expected_output_data_quantized,
int8_t* output_data, float output_scale, int output_zero_point,
TfLiteConvParams* conv_params) {
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data);
TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
int filter_zero_points[5];
float filter_scales[5];
TfLiteAffineQuantization filter_quant;
TfLiteAffineQuantization bias_quant;
TfLiteTensor input_tensor =
CreateQuantizedTensor(input_data, input_quantized, input_dims,
input_scale, input_zero_point, "input_tensor");
TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor(
filter_data, filter_data_quantized, filter_dims, filter_scales,
filter_zero_points, &filter_quant, 0 /* quantized dimension */,
"filter_tensor");
  // Replace the scales and quantized data to avoid a second quantization.
int channel_count = filter_dims->data[0];
float true_filter_scales[5] = {1.0, 1.0, 1.0, 1.0, 1.0};
true_filter_scales[0] = static_cast<float>(channel_count);
TfLiteAffineQuantization* to_change =
(TfLiteAffineQuantization*)filter_tensor.quantization.params;
to_change->scale = FloatArrayFromFloats(true_filter_scales);
int filter_size = filter_tensor.bytes;
for (int i = 0; i < filter_size; ++i) {
filter_tensor.data.int8[i] = filter_data[i];
}
TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor(
bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1],
bias_scales, bias_zero_points, &bias_quant, 0 /* quantized dimension */,
"bias_tensor");
TfLiteTensor output_tensor =
CreateQuantizedTensor(output_data, output_dims, output_scale,
output_zero_point, "output_tensor");
float input_scales[] = {1, input_scale};
int input_zero_points[] = {1, input_zero_point};
TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales),
IntArrayFromInts(input_zero_points)};
input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant};
float output_scales[] = {1, output_scale};
int output_zero_points[] = {1, output_zero_point};
TfLiteAffineQuantization output_quant = {
FloatArrayFromFloats(output_scales),
IntArrayFromInts(output_zero_points)};
output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant};
constexpr int inputs_size = 3;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
input_tensor,
filter_tensor,
bias_tensor,
output_tensor,
};
tflite::AsymmetricQuantize(expected_output_data,
expected_output_data_quantized, output_dims_count,
output_scale, output_zero_point);
TF_LITE_MICRO_EXPECT_EQ(
kTfLiteOk,
ValidateConvGoldens(tensors, tensors_size, expected_output_data_quantized,
output_data, output_dims_count, conv_params,
1.0 /* tolerance */));
}
} // namespace
} // namespace testing
} // namespace tflite
TF_LITE_MICRO_TESTS_BEGIN
// Test group 1
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel1) {
const int output_dims_count = 20;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[tflite::testing::kInput1Elements];
int8_t filter_quantized[tflite::testing::kFilter1Elements];
int32_t bias_quantized[tflite::testing::kBias1Elements];
int8_t golden_quantized[tflite::testing::kOutput1Elements];
int8_t output_data[output_dims_count];
int zero_points[tflite::testing::kBias1Elements + 1];
float scales[tflite::testing::kBias1Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput1Shape, tflite::testing::kInput1Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter1Shape, tflite::testing::kFilter1Data,
filter_quantized, tflite::testing::kBias1Shape,
tflite::testing::kBias1Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput1Shape, tflite::testing::kGolden1Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel1) {
const int output_dims_count = 20;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
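  // Placing these buffers in the .Xdata section keeps the operands in local
  // (XY) data memory, so this "Local" test variant exercises the case where
  // tensors already reside in memory the MLI kernels can access directly.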
#pragma Bss(".Xdata")
static int8_t input_quantized[tflite::testing::kInput1Elements];
static int8_t filter_quantized[tflite::testing::kFilter1Elements];
static int32_t bias_quantized[tflite::testing::kBias1Elements];
static int8_t output_data[output_dims_count];
#pragma Bss()
int8_t golden_quantized[tflite::testing::kOutput1Elements];
int zero_points[tflite::testing::kBias1Elements + 1];
float scales[tflite::testing::kBias1Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput1Shape, tflite::testing::kInput1Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter1Shape, tflite::testing::kFilter1Data,
filter_quantized, tflite::testing::kBias1Shape,
tflite::testing::kBias1Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput1Shape, tflite::testing::kGolden1Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
// Test group 2
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel2) {
const int output_dims_count = 80;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[tflite::testing::kInput2Elements];
int8_t filter_quantized[tflite::testing::kFilter2Elements];
int32_t bias_quantized[tflite::testing::kBias2Elements];
int8_t golden_quantized[tflite::testing::kOutput2Elements];
int8_t output_data[output_dims_count];
int zero_points[tflite::testing::kBias2Elements + 1];
float scales[tflite::testing::kBias2Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput2Shape, tflite::testing::kInput2Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter2Shape, tflite::testing::kFilter2Data,
filter_quantized, tflite::testing::kBias2Shape,
tflite::testing::kBias2Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput2Shape, tflite::testing::kGolden2Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel2) {
const int output_dims_count = 80;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
#pragma Bss(".Xdata")
static int8_t input_quantized[tflite::testing::kInput2Elements];
static int8_t filter_quantized[tflite::testing::kFilter2Elements];
static int32_t bias_quantized[tflite::testing::kBias2Elements];
static int8_t output_data[output_dims_count];
#pragma Bss()
int8_t golden_quantized[tflite::testing::kOutput2Elements];
int zero_points[tflite::testing::kBias2Elements + 1];
float scales[tflite::testing::kBias2Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput2Shape, tflite::testing::kInput2Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter2Shape, tflite::testing::kFilter2Data,
filter_quantized, tflite::testing::kBias2Shape,
tflite::testing::kBias2Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput2Shape, tflite::testing::kGolden2Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
// Test group 3
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel3) {
const int output_dims_count = 4;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[tflite::testing::kInput3Elements];
int8_t filter_quantized[tflite::testing::kFilter3Elements];
int32_t bias_quantized[tflite::testing::kBias3Elements];
int8_t golden_quantized[tflite::testing::kOutput3Elements];
int8_t output_data[output_dims_count];
int zero_points[tflite::testing::kBias3Elements + 1];
float scales[tflite::testing::kBias3Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput3Shape, tflite::testing::kInput3Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter3Shape, tflite::testing::kFilter3Data,
filter_quantized, tflite::testing::kBias3Shape,
tflite::testing::kBias3Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput3Shape, tflite::testing::kGolden3Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel3) {
const int output_dims_count = 4;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
#pragma Bss(".Xdata")
static int8_t input_quantized[tflite::testing::kInput3Elements];
static int8_t filter_quantized[tflite::testing::kFilter3Elements];
static int32_t bias_quantized[tflite::testing::kBias3Elements];
static int8_t output_data[output_dims_count];
#pragma Bss()
int8_t golden_quantized[tflite::testing::kOutput3Elements];
int zero_points[tflite::testing::kBias3Elements + 1];
float scales[tflite::testing::kBias3Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput3Shape, tflite::testing::kInput3Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter3Shape, tflite::testing::kFilter3Data,
filter_quantized, tflite::testing::kBias3Shape,
tflite::testing::kBias3Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput3Shape, tflite::testing::kGolden3Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
// Test group 4
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel4) {
const int output_dims_count = 8;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[tflite::testing::kInput4Elements];
int8_t filter_quantized[tflite::testing::kFilter4Elements];
int32_t bias_quantized[tflite::testing::kBias4Elements];
int8_t golden_quantized[tflite::testing::kOutput4Elements];
int8_t output_data[output_dims_count];
int zero_points[tflite::testing::kBias4Elements + 1];
float scales[tflite::testing::kBias4Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput4Shape, tflite::testing::kInput4Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter4Shape, tflite::testing::kFilter4Data,
filter_quantized, tflite::testing::kBias4Shape,
tflite::testing::kBias4Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput4Shape, tflite::testing::kGolden4Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel4) {
const int output_dims_count = 8;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
#pragma Bss(".Xdata")
static int8_t input_quantized[tflite::testing::kInput4Elements];
static int8_t filter_quantized[tflite::testing::kFilter4Elements];
static int32_t bias_quantized[tflite::testing::kBias4Elements];
static int8_t output_data[output_dims_count];
#pragma Bss()
int8_t golden_quantized[tflite::testing::kOutput4Elements];
int zero_points[tflite::testing::kBias4Elements + 1];
float scales[tflite::testing::kBias4Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput4Shape, tflite::testing::kInput4Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter4Shape, tflite::testing::kFilter4Data,
filter_quantized, tflite::testing::kBias4Shape,
tflite::testing::kBias4Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput4Shape, tflite::testing::kGolden4Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TESTS_END

View File

@ -0,0 +1,515 @@
/* Copyright 2017-2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace depthwise_conv {
namespace {
constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
constexpr int kDepthwiseConvQuantizedDimension = 3;
struct OpData {
TfLitePaddingValues padding;
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// Per channel output multiplier and shift.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
};
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
const TfLiteDepthwiseConvParams* params) {
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
const int in_ch = SizeOfDimension(input, 3);
const int filters_num = SizeOfDimension(filter, 3);
// MLI optimized version only supports int8 datatype, dilation factor of 1 and
// per-axis quantization of weights (no broadcasting/per-tensor).
// The check ((in_ch == filters_num) || (in_ch == 1)) rules out the
// channel-multiplier logic for multichannel input.
bool ret_val = (filter->type == kTfLiteInt8) &&
(input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
(params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1) &&
(affine_quantization->scale->size ==
filter->dims->data[kDepthwiseConvQuantizedDimension]) &&
((in_ch == filters_num) || (in_ch == 1)) &&
affine_quantization->scale->size <= (kMaxChannels * 2);
return ret_val;
}
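// Illustrative examples of the routing decision above (hypothetical
// configurations, not taken from this change): an int8 depthwise convolution
// with int32 bias, per-axis quantized weights, 1x1 dilation and
// in_ch == filters_num is dispatched to the MLI kernel, while a uint8 model,
// a dilated depthwise convolution, or a channel multiplier on multichannel
// input falls back to the reference implementation below.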
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, int width,
int height, int filter_width, int filter_height,
const TfLiteType data_type, bool mli_is_applicable,
OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
int unused_output_height, unused_output_width;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width, 1, 1, height, width,
filter_height, filter_width, params->padding, &unused_output_height,
&unused_output_width);
// Note that quantized inference requires that all tensors have their
// parameters set. This is usually done during quantized training.
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
if (data_type != kTfLiteFloat32 && !mli_is_applicable) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
// Ensure filter and bias channel count does not exceed space reserved for
// quantization metadata.
const auto filter_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
const auto bias_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(bias->quantization.params);
TF_LITE_ENSURE(context, filter_quantization->scale->size <= kMaxChannels);
TF_LITE_ENSURE(context, bias_quantization->scale->size <= kMaxChannels);
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift), num_channels));
}
#endif
return kTfLiteOk;
}
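// Illustrative sketch (not part of this kernel) of how the 'real multiplier'
// mentioned in OpData is turned into the fixed-point multiplier plus shift
// used above. It relies on the existing QuantizeMultiplier() helper from
// quantization_util.h; the numbers are approximate.
//
//   double real_multiplier = input_scale * filter_scale / output_scale;
//   int32_t quantized_multiplier;
//   int shift;
//   QuantizeMultiplier(real_multiplier, &quantized_multiplier, &shift);
//   // e.g. real_multiplier == 0.75 yields quantized_multiplier close to
//   // 0.75 * 2^31 and shift == 0; the kernel then rescales each int32
//   // accumulator with MultiplyByQuantizedMultiplier(acc,
//   // quantized_multiplier, shift) before adding the output zero point.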
} // namespace
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
tflite::reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus EvalMliQuantizedPerChannel(
TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params,
OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
// Run Depthwise Conv MLI kernel
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
mli_conv2d_cfg cfg = {};
// reuse space allocated for OpData parameters
mli_weights.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_multiplier;
mli_bias.el_params.asym.scale.pi16 = (int16_t*)data->per_channel_output_shift;
int16_t filter_zero_point = 0;
int16_t bias_zero_point = 0;
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
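// Note on the reuse above: OpData stores the per-channel multipliers and
// shifts as int32 arrays of kMaxChannels entries. MLI expects int16
// per-channel scales, so the same storage is reinterpreted as int16, which
// gives room for up to kMaxChannels * 2 channels. This is why
// IsMliApplicable() accepts scale->size up to kMaxChannels * 2.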
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
} else if (params->activation == kTfLiteActRelu6) {
cfg.relu.type = MLI_RELU_6;
} else if (params->activation == kTfLiteActRelu1) {
cfg.relu.type = MLI_RELU_1;
} else {
cfg.relu.type = MLI_RELU_NONE;
}
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
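// Example of the SAME-padding split above (illustrative numbers): a 3x3
// kernel with stride 1 needs a total padding of 2 in each direction, so
// data->padding.width == 1 with width_offset == 0, giving padding_left = 1
// and padding_right = 1. When the total is odd, the extra row/column goes to
// the bottom/right via the *_offset term.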
// for height slicing
const int heightDimension = 1;
int inSliceHeight = 0;
int outSliceHeight = 0;
const int kernelHeight =
static_cast<int>(mli_weights.shape[KRNL_DW_H_DIM_HWC]);
const int overlap = kernelHeight - cfg.stride_height;
// for weight slicing (on output channels)
// HWCN layout for weights, so the output channel dimension is the last one.
const int weight_out_ch_dimension = 3;
// bias has only 1 dimension
const int bias_out_ch_dimension = 0;
// Batch-Height-Width-Channel layout means last dimension is output channels.
const int out_tensor_ch_dimension = 3;
const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension];
const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension];
int slice_channels =
static_cast<int>(mli_weights.shape[weight_out_ch_dimension]);
// Tensors for data in fast (local) memory
// and config to copy data from external to local memory
mli_tensor weights_local = mli_weights;
mli_tensor bias_local = mli_bias;
mli_tensor in_local = mli_in;
mli_tensor out_local = mli_out; // this assumes that output shape
// is already filled in the tensor struct.
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
and there is no need to copy it to the local tensor. */
const bool in_is_local = in_local.data == mli_in.data;
const bool out_is_local = out_local.data == mli_out.data;
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top,
cfg.padding_bottom, &inSliceHeight, &outSliceHeight));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels));
/* If the number of input channels is not equal to the number of output
channels, a channel multiplier is used. In this case the slice channels
need to be rounded down to a multiple of the input channels. */
if (in_channels != out_channels) {
slice_channels = (slice_channels / in_channels) * in_channels;
}
TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels);
TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0,
0, 0, true);
TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels,
0, 0, 0, true);
TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0,
0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
void* input_buffer_ptr = NULL;
int input_buffer_size = 0;
int padding_top = cfg.padding_top;
int padding_bottom = cfg.padding_bottom;
while (!w_slice.Done()) {
mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
/* The input tensor is already sliced in the channel dimension.
in_ch_slice.Sub() is the tensor for the number of channels of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimensions. in_ch_slice.Sub() contains
batches of HWC tensors, so it is a 4-dimensional tensor. Because the MLI
kernel processes one HWC tensor at a time, the 4-dimensional tensor
needs to be sliced into nBatch 3-dimensional tensors. On top of that,
there can also be a need to slice in the height dimension, for which the
slice height has been calculated. The tensor slicer is configured so that
it completely slices the nBatch dimension (0) and slices the height
dimension (1) in chunks of 'inSliceHeight'. */
TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight,
padding_top, padding_bottom, overlap);
/* The output tensor is already sliced in the output channel dimension.
out_ch_slice.Sub() is the tensor for the number of output channels of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimensions. */
TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight);
/* Set up the pointers to the local or remote tensor to make the code
* inside the loop easier. */
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
while (!out_slice.Done()) {
TF_LITE_ENSURE(context, !in_slice.Done());
cfg.padding_top = in_slice.GetPaddingPre();
cfg.padding_bottom = in_slice.GetPaddingPost();
// If the input is the same as in the previous iteration, skip the input copy.
if ((in_slice.Sub()->data != input_buffer_ptr) ||
(mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data;
input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
}
mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg,
out_ptr);
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
out_slice.Next();
}
w_slice.Next();
b_slice.Next();
out_ch_slice.Next();
in_ch_slice.Next();
TF_LITE_ENSURE(context, in_slice.Done());
}
return kTfLiteOk;
}
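// Schematic view of the slicing performed in EvalMliQuantizedPerChannel above
// (simplified, for illustration only):
//
//   for (each output-channel slice of weights/bias) {      // w_slice, b_slice
//     copy the weight and bias slices into local memory;
//     for (each batch/height slice of the input/output) {  // in_slice, out_slice
//       copy the input slice into local memory (skipped if unchanged);
//       run mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32 on the local tensors;
//       copy the output slice back to its destination;
//     }
//   }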
TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params,
OpData* data, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
DepthwiseParams op_params;
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = 0;
op_params.output_offset = output->params.zero_point;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_integer_ops::DepthwiseConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Node configuration is not supported by ARC MLI Library.");
return kTfLiteError;
#endif
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
tflite::reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<uint8_t>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
(NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
const TfLiteType data_type = input->type;
int width = SizeOfDimension(input, 2);
int height = SizeOfDimension(input, 1);
int filter_width = SizeOfDimension(filter, 2);
int filter_height = SizeOfDimension(filter, 1);
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(
context, affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kDepthwiseConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
bool mli_is_applicable =
IsMliApplicable(context, input, filter, bias, params);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
filter_width, filter_height, data_type,
mli_is_applicable, &data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
return EvalFloat(context, node, params, &data, input, filter, bias,
output);
break;
case kTfLiteInt8:
if (mli_is_applicable) {
return EvalMliQuantizedPerChannel(context, node, params, &data, input,
filter, bias, output);
} else {
return EvalQuantizedPerChannel(context, node, params, &data, input,
filter, bias, output);
}
break;
case kTfLiteUInt8:
return EvalQuantized(context, node, params, &data, input, filter, bias,
output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace depthwise_conv
TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/depthwise_conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,550 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test checks that the slicing logic doesn't affect the result of the
// depthwise convolution kernel.
//
// This test doesn't replace the default depthwise convolution test
// (tensorflow/lite/micro/kernels/depthwise_conv_test.cc). It is added to the
// whole test set only in case MLI for the ARC platform is used during
// generation (which is handled in arc_mli.inc), so such tests won't be
// generated for other platforms.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/micro/testing/micro_test.h"
#include "tensorflow/lite/micro/testing/test_utils.h"
namespace tflite {
namespace testing {
namespace {
constexpr int kMaxFilterChannels = 64;
constexpr int kMaxBiasChannels = 64;
// Index of the output tensor in context->tensors, specific to
// DepthwiseConv.
constexpr int kOutputTensorIndex = 3;
// Creates a DepthwiseConv operator, calls it with the provided input tensors
// and some default parameters, and compares the output with
// expected_output_data.
//
// The tensors parameter contains both the input tensors as well as a
// preallocated output tensor into which the output is stored.
template <typename T>
TfLiteStatus ValidateDepthwiseConvGoldens(const T* expected_output_data,
int output_length,
TfLiteFusedActivation activation,
float tolerance, int tensors_size,
TfLiteTensor* tensors) {
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
int input_depth = tensors[0].dims->data[3];
int output_depth = tensors[1].dims->data[3];
int depth_mul = output_depth / input_depth;
TfLiteDepthwiseConvParams builtin_data;
builtin_data.padding = kTfLitePaddingValid;
builtin_data.activation = activation;
builtin_data.stride_height = 1;
builtin_data.stride_width = 1;
builtin_data.dilation_height_factor = 1;
builtin_data.dilation_width_factor = 1;
builtin_data.depth_multiplier = depth_mul;
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {3, 0, 1, 2};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 3};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TfLiteStatus invoke_status = registration->invoke(&context, &node);
if (invoke_status != kTfLiteOk) {
return invoke_status;
}
if (registration->free) {
registration->free(&context, user_data);
}
const T* output_data = tflite::GetTensorData<T>(&tensors[kOutputTensorIndex]);
for (int i = 0; i < output_length; ++i) {
TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i],
tolerance);
}
return kTfLiteOk;
}
void TestDepthwiseConvQuantizedPerChannel(
const int* input_dims_data, const float* input_data,
int8_t* input_quantized, float input_scale, int input_zero_point,
const int* filter_dims_data, const float* filter_data,
int8_t* filter_data_quantized, const int* bias_dims_data,
const float* bias_data, int32_t* bias_data_quantized,
const int* output_dims_data, const float* expected_output_data,
int8_t* expected_output_data_quantized, int8_t* output_data,
float output_scale, int output_zero_point,
TfLiteFusedActivation activation) {
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data);
TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
int filter_zero_points[kMaxFilterChannels];
float filter_scales[kMaxFilterChannels];
int bias_zero_points[kMaxBiasChannels];
float bias_scales[kMaxBiasChannels];
TfLiteAffineQuantization filter_quant;
TfLiteAffineQuantization bias_quant;
TfLiteTensor input_tensor =
CreateQuantizedTensor(input_data, input_quantized, input_dims,
input_scale, input_zero_point, "input_tensor");
TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor(
filter_data, filter_data_quantized, filter_dims, filter_scales,
filter_zero_points, &filter_quant, 3 /* quantized dimension */,
"filter_tensor");
TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor(
bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1],
bias_scales, bias_zero_points, &bias_quant, 3 /* quantized dimension */,
"bias_tensor");
TfLiteTensor output_tensor =
CreateQuantizedTensor(output_data, output_dims, output_scale,
input_zero_point, "output_tensor");
float input_scales[] = {1, input_scale};
int input_zero_points[] = {1, input_zero_point};
TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales),
IntArrayFromInts(input_zero_points)};
input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant};
float output_scales[] = {1, output_scale};
int output_zero_points[] = {1, output_zero_point};
TfLiteAffineQuantization output_quant = {
FloatArrayFromFloats(output_scales),
IntArrayFromInts(output_zero_points)};
output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant};
constexpr int inputs_size = 3;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
input_tensor,
filter_tensor,
bias_tensor,
output_tensor,
};
AsymmetricQuantize(expected_output_data, expected_output_data_quantized,
output_dims_count, output_scale, output_zero_point);
TF_LITE_MICRO_EXPECT_EQ(
kTfLiteOk, ValidateDepthwiseConvGoldens(expected_output_data_quantized,
output_dims_count, activation,
1.0, tensors_size, tensors));
}
} // namespace
} // namespace testing
} // namespace tflite
TF_LITE_MICRO_TESTS_BEGIN
// Test group 1
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel1) {
const int input_elements = 20;
const int input_shape[] = {4, 1, 5, 2, 2};
const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int filter_elements = 36;
const int filter_shape[] = {4, 2, 3, 3, 2};
const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_elements = 2;
const int bias_shape[] = {4, 1, 1, 1, 2};
const int output_elements = 20;
const float bias_values[] = {2, 2};
const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
const int output_shape[] = {4, 1, 5, 2, 2};
const int output_dims_count = 20;
int8_t output_data[output_dims_count];
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel1) {
const int input_elements = 20;
const int input_shape[] = {4, 1, 5, 2, 2};
const int filter_elements = 36;
const int filter_shape[] = {4, 2, 3, 3, 2};
const int bias_elements = 2;
const int bias_shape[] = {4, 1, 1, 1, 2};
const int output_elements = 20;
const int output_shape[] = {4, 1, 5, 2, 2};
const int output_dims_count = 20;
#pragma Bss(".Zdata")
const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const float bias_values[] = {2, 2};
int8_t output_data[output_dims_count];
#pragma Bss()
const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
// Test group 2
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel2) {
const int input_elements = 80;
const int input_shape[] = {4, 1, 20, 2, 2};
const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int filter_elements = 36;
const int filter_shape[] = {4, 2, 3, 3, 2};
const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_elements = 2;
const int bias_shape[] = {4, 1, 1, 1, 2};
const int output_elements = 80;
const float bias_values[] = {2, 2};
const float golden[] = {
34, 34, 34, 34, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
const int output_shape[] = {4, 1, 20, 2, 2};
const int output_dims_count = 80;
int8_t output_data[output_dims_count];
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel2) {
const int input_elements = 80;
const int input_shape[] = {4, 1, 20, 2, 2};
const int filter_elements = 36;
const int filter_shape[] = {4, 2, 3, 3, 2};
const int bias_elements = 2;
const int bias_shape[] = {4, 1, 1, 1, 2};
const int output_elements = 80;
const int output_shape[] = {4, 1, 20, 2, 2};
const int output_dims_count = 80;
#pragma Bss(".Zdata")
float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
float bias_values[] = {2, 2};
int8_t output_data[output_dims_count];
#pragma Bss()
const float golden[] = {
34, 34, 34, 34, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
// Test group 3
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel3) {
const int input_elements = 40;
const int input_shape[] = {4, 1, 2, 2, 10};
const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int filter_elements = 90;
const int filter_shape[] = {4, 1, 3, 3, 10};
const float filter_values[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int bias_elements = 1;
const int bias_shape[] = {4, 1, 1, 1, 1};
const int output_elements = 4;
const float bias_values[] = {1};
const float golden[] = {41, 41, 41, 41};
const int output_shape[] = {4, 1, 2, 2, 1};
const int output_dims_count = 4;
int8_t output_data[output_dims_count];
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel3) {
const int input_elements = 40;
const int input_shape[] = {4, 1, 2, 2, 10};
const int filter_elements = 90;
const int filter_shape[] = {4, 1, 3, 3, 10};
const int bias_elements = 1;
const int bias_shape[] = {4, 1, 1, 1, 1};
const int output_elements = 4;
const int output_shape[] = {4, 1, 2, 2, 1};
const int output_dims_count = 4;
#pragma Bss(".Zdata")
float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float filter_values[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float bias_values[] = {1};
int8_t output_data[output_dims_count];
#pragma Bss()
const float golden[] = {41, 41, 41, 41};
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
// Test group 4
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel4) {
const int input_elements = 80;
const int input_shape[] = {4, 1, 4, 2, 10};
const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int filter_elements = 90;
const int filter_shape[] = {4, 1, 3, 3, 10};
const float filter_values[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int bias_elements = 1;
const int bias_shape[] = {4, 1, 1, 1, 1};
const int output_elements = 8;
const float bias_values[] = {1};
const float golden[] = {41, 41, 61, 61, 61, 61, 41, 41};
const int output_shape[] = {4, 1, 4, 2, 1};
const int output_dims_count = 8;
int8_t output_data[output_dims_count];
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel4) {
const int input_elements = 80;
const int input_shape[] = {4, 1, 4, 2, 10};
const int filter_elements = 90;
const int filter_shape[] = {4, 1, 3, 3, 10};
const int bias_elements = 1;
const int bias_shape[] = {4, 1, 1, 1, 1};
const int output_elements = 8;
const int output_shape[] = {4, 1, 4, 2, 1};
const int output_dims_count = 8;
#pragma Bss(".Zdata")
float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float filter_values[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float bias_values[] = {1};
int8_t output_data[output_dims_count];
#pragma Bss()
const float golden[] = {41, 41, 61, 61, 61, 61, 41, 41};
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TESTS_END

View File

@ -0,0 +1,385 @@
/* Copyright 2017-2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace fully_connected {
namespace {
struct OpData {
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
// The index of the temporary tensor where the quantized inputs are cached.
int input_quantized_index;
};
constexpr int kInputTensor = 0;
constexpr int kWeightsTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
const TfLiteFullyConnectedParams* params) {
// MLI optimized version only supports the int8 datatype, no fused ReLU, and
// symmetric per-tensor quantization of weights (not per-axis).
bool ret_val = (filter->type == kTfLiteInt8) &&
(input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
(params->activation == kTfLiteActNone) &&
(filter->params.zero_point == 0);
return ret_val;
}
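// Illustrative examples of the routing decision above (hypothetical
// configurations): an int8 fully connected layer with int32 bias, symmetric
// per-tensor weights (zero point 0) and no fused activation is dispatched to
// the MLI kernel; float or uint8 inputs, or a fused activation such as ReLU,
// take the reference paths below.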
TfLiteStatus CalculateOpData(TfLiteContext* context,
TfLiteFullyConnectedParams* params,
TfLiteType data_type, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
OpData* data) {
TfLiteStatus status = kTfLiteOk;
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
if (data_type != kTfLiteFloat32 &&
!IsMliApplicable(context, input, filter, bias, params)) {
double real_multiplier = 0.0;
TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
context, input, filter, bias, output, &real_multiplier));
int exponent;
QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
data->output_shift = -exponent;
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
}
#endif
return status;
}
} // namespace
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
OpData* data = nullptr;
TfLiteStatus status = context->AllocatePersistentBuffer(
context, sizeof(OpData), reinterpret_cast<void**>(&data));
if (status != kTfLiteOk || data == nullptr) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
OpData* data = reinterpret_cast<OpData*>(node->user_data);
auto* params =
reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE(context, data != nullptr);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
TF_LITE_ENSURE_MSG(context, input->type == filter->type,
"Hybrid models are not supported on TFLite Micro.");
TfLiteType data_type = input->type;
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
filter, bias, output, data));
return kTfLiteOk;
}
TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params,
OpData* data, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias,
TfLiteTensor* output) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(filter, &mli_weights);
ConvertToMliTensor<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
/* The input tensor can have more than 2 dimensions. For the computation this
doesn't make any difference, because all inputs of a batch entry will be
used anyway. Because the MLI kernel doesn't recognize the multiple
dimensions, the tensor shape is cast to a {batchnum, inputsize} shape. */
mli_in.shape[0] = mli_out.shape[0];
mli_in.shape[1] = mli_weights.shape[1];
mli_in.shape[2] = 0;
mli_in.shape[3] = 0;
mli_in.rank = 2;
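// Illustrative example (hypothetical shapes): an input of shape {2, 4, 5, 3}
// combined with weights of shape {N, 60} is presented to the MLI kernel as a
// 2-D tensor of shape {batchnum = 2, inputsize = 60}; only the shape metadata
// is rewritten, the underlying data is left untouched.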
// Tensors for data in fast (local) memory and config to copy data from
// external to local memory
mli_tensor weights_local = mli_weights;
mli_tensor bias_local = mli_bias;
mli_tensor in_local = mli_in;
mli_tensor out_local = mli_out;
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
const int weight_out_dimension = 0;
const int out_tensor_dimension = 1;
const int input_size_dimension = 1;
int slice_size = mli_weights.shape[weight_out_dimension];
/* allocate the local buffers, and compute the slice size */
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_dimension, &slice_size));
int max_out_slice_size =
out_local.capacity / mli_hlp_tensor_element_size(&out_local);
if (slice_size > max_out_slice_size) slice_size = max_out_slice_size;
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
and there is no need to copy it to the local tensor. */
const bool in_is_local = in_local.data == mli_in.data;
const bool out_is_local = out_local.data == mli_out.data;
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size);
TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size);
TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0,
true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
void* input_buffer_ptr = NULL;
while (!w_slice.Done()) {
mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
// Slice the input over the batches (one at a time with the size of a
// complete input)
TensorSlicer in_slice(&mli_in, input_size_dimension,
mli_in.shape[input_size_dimension]);
/* The output tensor is already sliced in the output size dimension.
out_ch_slice.Sub() is the tensor for the portion of the output size handled
in this iteration of the weight slice loop. This tensor needs to be further
sliced over the batch. */
TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension,
slice_size);
/* Set up the pointers to the local or remote tensor to make the code
* inside the loop easier. */
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
while (!out_slice.Done()) {
// If the input is the same as in the previous iteration, skip the input copy.
if (in_slice.Sub()->data != input_buffer_ptr) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data;
}
mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr);
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
out_slice.Next();
}
w_slice.Next();
b_slice.Next();
out_ch_slice.Next();
}
return kTfLiteOk;
}
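// Schematic view of the slicing performed in EvalMliQuantizedInt8 above
// (simplified, for illustration only):
//
//   for (each output-neuron slice of weights/bias) {   // w_slice, b_slice
//     copy the weight and bias slices into local memory;
//     for (each batch entry of the input/output) {     // in_slice, out_slice
//       copy the input row into local memory (skipped if unchanged);
//       run mli_krn_fully_connected_sa8_sa8_sa32 on the local tensors;
//       copy the output slice back to its destination;
//     }
//   }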
TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
FullyConnectedParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = -filter->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_integer_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(filter), GetTensorData<int8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<int8_t>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Node configuration is not supported by ARC MLI Library.");
return kTfLiteError;
#endif
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
tflite::FullyConnectedParams op_params;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
#define TF_LITE_FULLY_CONNECTED(output_data_type) \
reference_ops::FullyConnected( \
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
GetTensorShape(filter), GetTensorData<uint8_t>(filter), \
GetTensorShape(bias), GetTensorData<int32_t>(bias), \
GetTensorShape(output), GetTensorData<output_data_type>(output))
switch (output->type) {
case kTfLiteUInt8:
TF_LITE_FULLY_CONNECTED(uint8_t);
break;
case kTfLiteInt16:
TF_LITE_FULLY_CONNECTED(int16_t);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(output->type), output->type);
return kTfLiteError;
}
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::FullyConnectedParams op_params;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
tflite::reference_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
OpData* data = reinterpret_cast<OpData*>(node->user_data);
TF_LITE_ENSURE(context, data != nullptr);
// Checks in Prepare ensure input, output and filter types are all the same.
switch (input->type) {
case kTfLiteFloat32:
return EvalFloat(context, node, params, data, input, filter, bias,
output);
case kTfLiteInt8:
if (IsMliApplicable(context, input, filter, bias, params)) {
return EvalMliQuantizedInt8(context, node, params, data, input, filter,
bias, output);
} else {
return EvalQuantizedInt8(context, node, params, data, input, filter,
bias, output);
}
case kTfLiteUInt8:
return EvalQuantized(context, node, params, data, input, filter, bias,
output);
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(filter->type), filter->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace fully_connected
TfLiteRegistration* Register_FULLY_CONNECTED() {
static TfLiteRegistration r = {/*init=*/fully_connected::Init,
/*free=*/nullptr,
/*prepare=*/fully_connected::Prepare,
/*invoke=*/fully_connected::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,425 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test checks that the slicing logic doesn't affect the result of the
// fully connected kernel.
//
// This test doesn't replace the default fully connected test
// (tensorflow/lite/micro/kernels/fully_connected_test.cc). It is added to the
// whole test set only in case MLI for the ARC platform is used during
// generation (which is handled in arc_mli.inc), so such tests won't be
// generated for other platforms.
#include <cstdint>
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/micro/testing/micro_test.h"
#include "tensorflow/lite/micro/testing/test_utils.h"
namespace tflite {
namespace testing {
namespace {
template <typename T>
void TestFullyConnectedQuantized(
const int* input_dims_data, const T* input_data, const float input_min,
const float input_max, const int* weights_dims_data, const T* weights_data,
const float weights_min, const float weights_max, const int* bias_dims_data,
const int32_t* bias_data, const float bias_scale,
const T* expected_output_data, const int* output_dims_data,
const float output_min, const float output_max,
TfLiteFusedActivation activation, T* output_data) {
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data);
TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
constexpr int inputs_size = 3;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
input_max),
CreateQuantizedTensor(weights_data, weights_dims, "weights_tensor",
weights_min, weights_max),
CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_scale),
CreateQuantizedTensor(output_data, output_dims, "output_tensor",
output_min, output_max),
};
tensors[0].params.zero_point = 0;
tensors[1].params.zero_point = 0;
tensors[3].params.zero_point = 0;
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 4);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
TfLiteFullyConnectedParams builtin_data = {
activation,
kTfLiteFullyConnectedWeightsFormatDefault,
};
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {3, 0, 1, 2};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 3};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
if (registration->free) {
registration->free(&context, user_data);
}
for (int i = 0; i < output_dims_count; ++i) {
TF_LITE_MICRO_EXPECT_EQ(expected_output_data[i], output_data[i]);
}
}
} // namespace
} // namespace testing
} // namespace tflite
TF_LITE_MICRO_TESTS_BEGIN
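// The expected values below follow directly from the constant test data: with
// every input and weight element equal to 2 and every bias element equal to 1,
// each output element equals accum_depth * (2 * 2) + 1, i.e. 41 for
// accum_depth 10 (groups 1 and 4), 17 for accum_depth 4 (group 2) and 21 for
// accum_depth 5 (group 3).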
// Test group 1
TF_LITE_MICRO_TEST(SystemSimpleTestQuantized1) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data[] = {2, 2, 10};
const int8_t input_data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int weights_dims_data[] = {2, 3, 10};
const int8_t weights_data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_dims_data[] = {1, 3};
const int32_t bias_data[] = {1, 1, 1};
const int8_t expected_output_data[] = {41, 41, 41, 41, 41, 41};
const int output_dims_data[] = {2, 2, 3};
const int output_dims_count = 6;
int8_t output_data[output_dims_count];
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data, input_data, input_min, input_max, weights_dims_data,
weights_data, weights_min, weights_max, bias_dims_data, bias_data,
bias_scale, expected_output_data, output_dims_data, output_min,
output_max, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalSimpleTestQuantized1) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_local[] = {2, 2, 10};
const int weights_dims_data_local[] = {2, 3, 10};
const int bias_dims_data_local[] = {1, 3};
const int output_dims_data_local[] = {2, 2, 3};
const int output_dims_count = 6;
#pragma Bss(".Zdata")
const int8_t input_data_local[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int8_t weights_data_local[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int32_t bias_data_local[] = {1, 1, 1};
int8_t output_data_local[output_dims_count];
#pragma Bss()
const int8_t expected_output_data[] = {41, 41, 41, 41, 41, 41};
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_local, input_data_local, input_min, input_max,
weights_dims_data_local, weights_data_local, weights_min, weights_max,
bias_dims_data_local, bias_data_local, bias_scale, expected_output_data,
output_dims_data_local, output_min, output_max, kTfLiteActNone,
output_data_local);
}
// Test group 2
TF_LITE_MICRO_TEST(SystemSimpleTestQuantized2) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_2[] = {2, 10, 4};
const int8_t input_data_2[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int weights_dims_data_2[] = {2, 6, 4};
const int8_t weights_data_2[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_dims_data_2[] = {1, 6};
const int32_t bias_data_2[] = {1, 1, 1, 1, 1, 1};
const int8_t expected_output_data_2[] = {
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
const int output_dims_data_2[] = {2, 10, 6};
const int output_dims_count_2 = 60;
int8_t output_data_2[output_dims_count_2];
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_2, input_data_2, input_min, input_max,
weights_dims_data_2, weights_data_2, weights_min, weights_max,
bias_dims_data_2, bias_data_2, bias_scale, expected_output_data_2,
output_dims_data_2, output_min, output_max, kTfLiteActNone,
output_data_2);
}
TF_LITE_MICRO_TEST(LocalSimpleTestQuantized2) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_local_2[] = {2, 10, 4};
const int weights_dims_data_local_2[] = {2, 6, 4};
const int bias_dims_data_local_2[] = {1, 6};
const int output_dims_data_local_2[] = {2, 10, 6};
const int output_dims_count_local_2 = 60;
#pragma Bss(".Zdata")
const int8_t input_data_local_2[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int8_t weights_data_local_2[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int32_t bias_data_local_2[] = {1, 1, 1, 1, 1, 1};
int8_t output_data_local_2[output_dims_count_local_2];
#pragma Bss()
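// As in SystemSimpleTestQuantized2 above, each of the 60 output elements is
// expected to be 4 * (2 * 2) + 1 = 17.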
const int8_t expected_output_data_local_2[] = {
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_local_2, input_data_local_2, input_min, input_max,
weights_dims_data_local_2, weights_data_local_2, weights_min, weights_max,
bias_dims_data_local_2, bias_data_local_2, bias_scale,
expected_output_data_local_2, output_dims_data_local_2, output_min,
output_max, kTfLiteActNone, output_data_local_2);
}
// Test group 3
TF_LITE_MICRO_TEST(SystemSimpleTestQuantized3) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_3[] = {2, 2, 5};
const int8_t input_data_3[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int weights_dims_data_3[] = {2, 10, 5};
const int8_t weights_data_3[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_dims_data_3[] = {1, 10};
const int32_t bias_data_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int8_t expected_output_data_3[] = {21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21};
const int output_dims_data_3[] = {2, 2, 10};
const int output_dims_count_3 = 20;
int8_t output_data_3[output_dims_count_3];
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_3, input_data_3, input_min, input_max,
weights_dims_data_3, weights_data_3, weights_min, weights_max,
bias_dims_data_3, bias_data_3, bias_scale, expected_output_data_3,
output_dims_data_3, output_min, output_max, kTfLiteActNone,
output_data_3);
}
TF_LITE_MICRO_TEST(LocalSimpleTestQuantized3) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_local_3[] = {2, 2, 5};
const int weights_dims_data_local_3[] = {2, 10, 5};
const int bias_dims_data_local_3[] = {1, 10};
const int output_dims_data_local_3[] = {2, 2, 10};
const int output_dims_count_local_3 = 20;
#pragma Bss(".Zdata")
static int8_t input_data_local_3[10];
static int8_t weights_data_local_3[50];
static int32_t bias_data_local_3[10];
static int8_t output_data_local_3[output_dims_count_local_3];
#pragma Bss()
for (int i = 0; i < 10; ++i) {
input_data_local_3[i] = 2;
}
for (int i = 0; i < 50; ++i) {
weights_data_local_3[i] = 2;
}
for (int i = 0; i < 10; ++i) {
bias_data_local_3[i] = 1;
}
for (int i = 0; i < 20; ++i) {
output_data_local_3[i] = 0;
}
const int8_t expected_output_data_local_3[] = {21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21};
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_local_3, input_data_local_3, input_min, input_max,
weights_dims_data_local_3, weights_data_local_3, weights_min, weights_max,
bias_dims_data_local_3, bias_data_local_3, bias_scale,
expected_output_data_local_3, output_dims_data_local_3, output_min,
output_max, kTfLiteActNone, output_data_local_3);
}
// Test group 4
TF_LITE_MICRO_TEST(SystemSimpleTestQuantized4) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_4[] = {2, 5, 10};
const int8_t input_data_4[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int weights_dims_data_4[] = {2, 5, 10};
const int8_t weights_data_4[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_dims_data_4[] = {1, 5};
const int32_t bias_data_4[] = {1, 1, 1, 1, 1};
const int8_t expected_output_data_4[] = {41, 41, 41, 41, 41, 41, 41, 41, 41,
41, 41, 41, 41, 41, 41, 41, 41, 41,
41, 41, 41, 41, 41, 41, 41};
const int output_dims_data_4[] = {2, 5, 5};
const int output_dims_count_4 = 25;
int8_t output_data_4[output_dims_count_4];
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_4, input_data_4, input_min, input_max,
weights_dims_data_4, weights_data_4, weights_min, weights_max,
bias_dims_data_4, bias_data_4, bias_scale, expected_output_data_4,
output_dims_data_4, output_min, output_max, kTfLiteActNone,
output_data_4);
}
TF_LITE_MICRO_TEST(LocalSimpleTestQuantized4) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_local_4[] = {2, 5, 10};
const int weights_dims_data_local_4[] = {2, 5, 10};
const int bias_dims_data_local_4[] = {1, 5};
const int output_dims_data_local_4[] = {2, 5, 5};
const int output_dims_count_local_4 = 25;
#pragma Bss(".Zdata")
const int8_t input_data_local_4[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int8_t weights_data_local_4[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int32_t bias_data_local_4[] = {1, 1, 1, 1, 1};
int8_t output_data_local_4[output_dims_count_local_4];
#pragma Bss()
const int8_t expected_output_data_local_4[] = {
41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41};
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_local_4, input_data_local_4, input_min, input_max,
weights_dims_data_local_4, weights_data_local_4, weights_min, weights_max,
bias_dims_data_local_4, bias_data_local_4, bias_scale,
expected_output_data_local_4, output_dims_data_local_4, output_min,
output_max, kTfLiteActNone, output_data_local_4);
}
TF_LITE_MICRO_TESTS_END

View File

@ -0,0 +1,126 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "mli_slicers.h" // NOLINT
#include <algorithm>
namespace tflite {
namespace ops {
namespace micro {
TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim,
int slice_size, int padding_pre, int padding_post,
int overlap, bool interleave_mode)
: full_tensor_(full_tensor),
sliceDim_(slice_dim),
pad_pre_(padding_pre),
pad_post_(padding_post),
overlap_(overlap),
sub_cfg_{0},
sub_tensor_{0},
done_(false) {
/* In interleave mode, the slicing happens from the deepest dimension up
to the slice_dim. For example, in an HWC layout this mode can be used to
slice in the C dimension. In this mode the data is no longer contiguous
in memory. */
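/* Illustrative example (shapes chosen for explanation only): for a full HWC
tensor of shape {6, 10, 3} with slice_dim = 2 (C) and slice_size = 1,
interleave mode yields sub_cfg_.size = {6, 10, 1} and sub_tensor_rank = 3. */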
if (interleave_mode) {
for (int i = 0; i < full_tensor->rank; i++) {
if (i > slice_dim) {
sub_cfg_.size[i] = 1;
} else if (i == slice_dim) {
sub_cfg_.size[i] = slice_size;
} else {
sub_cfg_.size[i] = full_tensor->shape[i];
}
}
sub_cfg_.sub_tensor_rank = full_tensor->rank;
} else {
/* In the non-interleaved mode, the slicing happens from the outermost
dimension up to the slice_dim. For example, in an HWC layout this mode can be
used to slice in the H dimension. In this mode the data of the slice is
still contiguous in memory (if that was the case in the input tensor). */
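/* Illustrative example (shapes chosen for explanation only): for a full NHWC
tensor of shape {2, 6, 10, 3} with slice_dim = 1 (H) and slice_size = 2, this
yields sub_cfg_.size = {1, 2, 10, 3} and sub_tensor_rank = 4 - 1 = 3; the
pooling kernel slices its batched input along this height dimension. */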
for (int i = 0; i < full_tensor->rank; i++) {
if (i < slice_dim) {
sub_cfg_.size[i] = 1;
} else if (i == slice_dim) {
sub_cfg_.size[i] = slice_size;
} else {
sub_cfg_.size[i] = full_tensor->shape[i];
}
}
sub_cfg_.sub_tensor_rank = full_tensor->rank - slice_dim;
}
ComputeSubTensor();
}
void TensorSlicer::ComputeSubTensor(void) {
// sub_cfg_ is used to keep track of the iteration.
// A copy is created to update it with the correct clipping and padding for
// the current slice.
mli_sub_tensor_cfg cfg_new = sub_cfg_;
// begin and end span the complete input region including padding areas.
const int begin = (int)sub_cfg_.offset[sliceDim_] - pad_pre_;
// end is clipped to the end of the full input region. this is needed for
// cases where the last slice is smaller than the rest.
const int end = std::min(begin + sub_cfg_.size[sliceDim_] + overlap_,
full_tensor_->shape[sliceDim_] + pad_post_);
// The start coordinate of the subtensor is clipped to zero
cfg_new.offset[sliceDim_] = std::max(begin, 0);
// and the stop coordinate is clipped to the size of the full tensor
const int stop_coord =
std::min(end, static_cast<int>(full_tensor_->shape[sliceDim_]));
// compute the size of the subtensor
cfg_new.size[sliceDim_] = stop_coord - cfg_new.offset[sliceDim_];
// compute the padding configuration for the current slice.
actual_padding_pre = cfg_new.offset[sliceDim_] - begin;
actual_padding_post = end - stop_coord;
mli_hlp_create_subtensor(full_tensor_, &cfg_new, &sub_tensor_);
}
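/* Worked example of the clipping in ComputeSubTensor (illustrative values):
with full height 6, sub_cfg_.size[sliceDim_] = 2, pad_pre_ = 1, pad_post_ = 1
and overlap_ = 1, the first slice has offset 0, so begin = -1 and
end = min(-1 + 2 + 1, 6 + 1) = 2. The offset is clipped to 0 and the stop
coordinate to 2, giving size 2, actual_padding_pre = 1 and
actual_padding_post = 0. */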
void TensorSlicer::Next(void) {
for (int i = full_tensor_->rank - 1; i >= 0; i--) {
sub_cfg_.offset[i] += sub_cfg_.size[i];
if (sub_cfg_.offset[i] >= full_tensor_->shape[i]) {
// wrap
sub_cfg_.offset[i] = 0;
// and continue to the next dimension; if there is no next dimension, we are done.
if (i == 0) done_ = true;
continue;
} else {
// carry is false, so break from the loop
break;
}
}
if (!done_) ComputeSubTensor();
}
bool TensorSlicer::Done(void) { return done_; }
int TensorSlicer::GetPaddingPre(void) { return actual_padding_pre; }
int TensorSlicer::GetPaddingPost(void) { return actual_padding_post; }
mli_tensor* TensorSlicer::Sub(void) { return &sub_tensor_; }
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,56 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_
#define TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_
#include "mli_api.h" // NOLINT
namespace tflite {
namespace ops {
namespace micro {
class TensorSlicer {
public:
TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size,
int padding_pre = 0, int padding_post = 0, int overlap = 0,
bool interleave_mode = false);
~TensorSlicer() = default;
void Next();
bool Done();
int GetPaddingPre();
int GetPaddingPost();
mli_tensor* Sub();
// Default constructor is deleted
TensorSlicer() = delete;
private:
const mli_tensor* full_tensor_;
mli_tensor sub_tensor_;
mli_sub_tensor_cfg sub_cfg_;
bool done_;
int sliceDim_;
int pad_pre_, pad_post_, overlap_;
int actual_padding_pre, actual_padding_post;
void ComputeSubTensor();
};
} // namespace micro
} // namespace ops
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_

View File

@ -0,0 +1,376 @@
/* Copyright 2019-2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/pooling.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace pooling {
namespace {
constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
struct OpData {
TfLitePaddingValues padding;
};
enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 };
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLitePoolParams* params) {
// MLI optimized version only supports int8 datatype and no fused Relu
return (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone);
}
TfLiteStatus CalculateOpData(const TfLiteContext* context,
const TfLitePoolParams* params,
const TfLiteTensor* input,
const TfLiteTensor* output, OpData* data) {
// input: batch, height, width, channel
int height = SizeOfDimension(input, 1);
int width = SizeOfDimension(input, 2);
int out_height, out_width;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width,
/*dilation_rate_height=*/1,
/*dilation_rate_width=*/1, height, width, params->filter_height,
params->filter_width, params->padding, &out_height, &out_width);
return kTfLiteOk;
}
TfLiteStatus AverageEvalFloat(TfLiteContext* context, const TfLiteNode* node,
const TfLitePoolParams* params,
const OpData* data, const TfLiteTensor* input,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float activation_min, activation_max;
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
// Prepare MLI tensors and run Average or Max Pooling
TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params,
const OpData* data, const TfLiteTensor* input,
TfLiteTensor* output, const MliPoolingType pooling_type) {
mli_tensor mli_in = {0};
mli_tensor mli_out = {0};
mli_pool_cfg cfg = {0};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(output, &mli_out);
cfg.kernel_width = params->filter_width;
cfg.kernel_height = params->filter_height;
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
const int height_dimension = 1;
int in_slice_height = 0;
int out_slice_height = 0;
const int overlap = cfg.kernel_height - cfg.stride_height;
// Tensors for data in fast (local) memory and config to copy data from
// external to local memory
mli_tensor in_local = mli_in;
mli_tensor out_local = mli_out;
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors(
context, &in_local, &out_local));
bool in_is_local = in_local.data == mli_in.data;
bool out_is_local = out_local.data == mli_out.data;
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, cfg.kernel_height, cfg.stride_height,
cfg.padding_top, cfg.padding_bottom, &in_slice_height,
&out_slice_height));
/* The mli_in tensor contains batches of HWC tensors, so it is a 4-dimensional
tensor. Because the MLI kernel will process one HWC tensor at a time, the
4-dimensional tensor needs to be sliced into nBatch 3-dimensional tensors. On
top of that, there can be a need to also slice in the height dimension. For
that, the slice height has been calculated. The tensor slicer is configured so
that it will completely slice the nBatch dimension (0) and slice the height
dimension (1) in chunks of 'slice height'. */
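/* For instance (illustrative shapes only): a {2, 6, 10, 1} input with a 2x2
kernel and stride 1 gives overlap = 1, so consecutive input slices share one
row, and the per-slice top/bottom padding is taken from GetPaddingPre() /
GetPaddingPost() in the loop below. */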
TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height,
cfg.padding_top, cfg.padding_bottom, overlap);
TensorSlicer out_slice(&mli_out, height_dimension, out_slice_height);
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
and there is no need to copy it to the local tensor. */
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
while (!out_slice.Done()) {
cfg.padding_top = in_slice.GetPaddingPre();
cfg.padding_bottom = in_slice.GetPaddingPost();
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
if (pooling_type == AveragePooling)
mli_krn_avepool_hwc_sa8(in_ptr, &cfg, out_ptr);
else if (pooling_type == MaxPooling)
mli_krn_maxpool_hwc_sa8(in_ptr, &cfg, out_ptr);
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
out_slice.Next();
}
return kTfLiteOk;
}
TfLiteStatus AverageEvalQuantized(TfLiteContext* context,
const TfLiteNode* node,
const TfLitePoolParams* params,
const OpData* data, const TfLiteTensor* input,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8);
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
if (input->type == kTfLiteUInt8) {
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(output), GetTensorData<uint8_t>(output));
} else {
reference_integer_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(output), GetTensorData<int8_t>(output));
}
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(
context,
"Node configuration or type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLitePoolParams* params, OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float activation_min, activation_max;
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::MaxPool(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(output),
GetTensorData<float>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLitePoolParams* params, OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8);
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
if (input->type == kTfLiteUInt8) {
reference_ops::MaxPool(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(output), GetTensorData<uint8_t>(output));
} else {
reference_integer_ops::MaxPool(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(output), GetTensorData<int8_t>(output));
}
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(
context,
"Node configuration or type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
} // namespace
TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
// Inputs and outputs share the same type, guaranteed by the converter.
switch (input->type) {
case kTfLiteFloat32:
return AverageEvalFloat(context, node, params, &data, input, output);
break;
case kTfLiteUInt8:
case kTfLiteInt8:
if (IsMliApplicable(context, input, params)) {
return EvalMli(context, params, &data, input, output, AveragePooling);
} else {
return AverageEvalQuantized(context, node, params, &data, input,
output);
}
break;
default:
TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
return kTfLiteOk;
}
TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
switch (input->type) {
case kTfLiteFloat32:
return MaxEvalFloat(context, node, params, &data, input, output);
break;
case kTfLiteUInt8:
case kTfLiteInt8:
if (IsMliApplicable(context, input, params)) {
return EvalMli(context, params, &data, input, output, MaxPooling);
} else {
return MaxEvalQuantized(context, node, params, &data, input, output);
}
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace pooling
TfLiteRegistration* Register_AVERAGE_POOL_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pooling::AverageEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
TfLiteRegistration* Register_MAX_POOL_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pooling::MaxEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,422 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test checks that slicing logic doesn't affect the result of pooling
// kernels.
//
// This test doesn't replace the default pooling test
// (tensorflow/lite/micro/kernels/pooling_test.cc). It is added to the
// whole test set only in case MLI for the ARC platform is used during
// generation (which is handled in arc_mli.inc), so such tests won't be
// generated for other platforms.
#include <cstdint>
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/micro/testing/micro_test.h"
#include "tensorflow/lite/micro/testing/test_utils.h"
namespace tflite {
namespace testing {
namespace {
template <typename T>
void TestAveragePoolingQuantized(
const int* input_dims_data, const T* input_data, const float input_min,
const float input_max, const int filter_height, const int filter_width,
const int stride_height, const int stride_width,
const T* expected_output_data, const int* output_dims_data,
float output_min, float output_max, TfLitePadding padding,
TfLiteFusedActivation activation, T* output_data) {
static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed.");
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
constexpr int inputs_size = 1;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
input_max),
CreateQuantizedTensor(output_data, output_dims, "output_tensor",
output_min, output_max),
};
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
TfLitePoolParams builtin_data = {padding, stride_width, stride_height,
filter_width, filter_height, activation};
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {1, 0};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 1};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
if (registration->free) {
registration->free(&context, user_data);
}
for (int i = 0; i < output_dims_count; ++i) {
TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], 1e-5f);
}
}
template <typename T>
void TestMaxPoolQuantized(const int* input_dims_data, const T* input_data,
float input_min, float input_max, int filter_width,
int filter_height, int stride_width,
int stride_height, const T* expected_output_data,
float output_min, float output_max,
const int* output_dims_data, TfLitePadding padding,
TfLiteFusedActivation activation, T* output_data) {
static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed.");
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
constexpr int inputs_size = 1;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
input_max),
CreateQuantizedTensor(output_data, output_dims, "output_tensor",
output_min, output_max),
};
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
TfLitePoolParams builtin_data = {
padding, stride_width, stride_height,
filter_width, filter_height, activation,
};
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {1, 0};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 1};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
if (registration->free) {
registration->free(&context, user_data);
}
for (int i = 0; i < output_dims_count; ++i) {
TF_LITE_MICRO_EXPECT_EQ(expected_output_data[i], output_data[i]);
}
}
} // namespace
} // namespace testing
} // namespace tflite
TF_LITE_MICRO_TESTS_BEGIN
TF_LITE_MICRO_TEST(SystemAveragePoolTestInt1) {
using tflite::testing::F2QS;
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int8_t output_data[3];
const int kInput1Shape[] = {4, 1, 2, 4, 1};
const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput1Shape[] = {4, 1, 1, 3, 1};
const int8_t kGolden1Data[] = {1, 1, 1};
tflite::testing::TestAveragePoolingQuantized(
kInput1Shape, // Input shape
kInput1Data, input_min, input_max, // input quantization range
2, 2, // filter height, filter width
1, 1, // stride height, stride width
kGolden1Data,
kOutput1Shape, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalAveragePoolTestInt1) {
using tflite::testing::F2QS;
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int8_t output_data[3];
#pragma Bss(".Zdata")
const int kInput1Shape[] = {4, 1, 2, 4, 1};
const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput1Shape[] = {4, 1, 1, 3, 1};
const int8_t kGolden1Data[] = {1, 1, 1};
#pragma Bss()
tflite::testing::TestAveragePoolingQuantized(
kInput1Shape, // Input shape
kInput1Data, input_min, input_max, // input quantization range
2, 2, // filter height, filter width
1, 1, // stride height, stride width
kGolden1Data,
kOutput1Shape, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
// Test group AVG 2
TF_LITE_MICRO_TEST(SystemAveragePoolTestInt2) {
using tflite::testing::F2QS;
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int8_t output_data[45];
const int kInput2Shape[] = {4, 1, 6, 10, 1};
const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput2Shape[] = {4, 1, 5, 9, 1};
const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
tflite::testing::TestAveragePoolingQuantized(
kInput2Shape, // Input shape
kInput2Data, input_min, input_max, // input quantization range
2, 2, // filter height, filter width
1, 1, // stride height, stride width
kGolden2Data,
kOutput2Shape, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalAveragePoolTestInt2) {
using tflite::testing::F2QS;
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int8_t output_data[45];
#pragma Bss(".Zdata")
const int kInput2Shape[] = {4, 1, 6, 10, 1};
const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput2Shape[] = {4, 1, 5, 9, 1};
const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
#pragma Bss()
tflite::testing::TestAveragePoolingQuantized(
kInput2Shape, // Input shape
kInput2Data, input_min, input_max, // input quantization range
2, 2, // filter height, filter width
1, 1, // stride height, stride width
kGolden2Data,
kOutput2Shape, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
// Test group MAX 1
TF_LITE_MICRO_TEST(SystemMaxPoolTestInt1) {
using tflite::testing::F2QS;
int8_t output_data[3];
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int filter_width = 2;
int filter_height = 2;
int stride_width = 1;
int stride_height = 1;
const int kInput1Shape[] = {4, 1, 2, 4, 1};
const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput1Shape[] = {4, 1, 1, 3, 1};
const int8_t kGolden1Data[] = {1, 1, 1};
tflite::testing::TestMaxPoolQuantized(
kInput1Shape, // Input shape
kInput1Data, input_min, input_max, filter_width, filter_height,
stride_width, stride_height, kGolden1Data, output_min, output_max,
kOutput1Shape, // Output shape
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalMaxPoolTestInt1) {
using tflite::testing::F2QS;
int8_t output_data[3];
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int filter_width = 2;
int filter_height = 2;
int stride_width = 1;
int stride_height = 1;
#pragma Bss(".Zdata")
const int kInput1Shape[] = {4, 1, 2, 4, 1};
const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput1Shape[] = {4, 1, 1, 3, 1};
const int8_t kGolden1Data[] = {1, 1, 1};
#pragma Bss()
tflite::testing::TestMaxPoolQuantized(
kInput1Shape, // Input shape
kInput1Data, input_min, input_max, filter_width, filter_height,
stride_width, stride_height, kGolden1Data, output_min, output_max,
kOutput1Shape, // Output shape
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
// Test group MAX 2
TF_LITE_MICRO_TEST(SystemMaxPoolTestInt2) {
using tflite::testing::F2QS;
int8_t output_data[45];
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int filter_width = 2;
int filter_height = 2;
int stride_width = 1;
int stride_height = 1;
const int kInput2Shape[] = {4, 1, 6, 10, 1};
const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput2Shape[] = {4, 1, 5, 9, 1};
const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
tflite::testing::TestMaxPoolQuantized(
kInput2Shape, // Input shape
kInput2Data, input_min, input_max, filter_width, filter_height,
stride_width, stride_height, kGolden2Data, output_min, output_max,
kOutput2Shape, // Output shape
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalMaxPoolTestInt2) {
using tflite::testing::F2QS;
int8_t output_data[45];
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int filter_width = 2;
int filter_height = 2;
int stride_width = 1;
int stride_height = 1;
#pragma Bss(".Zdata")
const int kInput2Shape[] = {4, 1, 6, 10, 1};
const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput2Shape[] = {4, 1, 5, 9, 1};
const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
#pragma Bss()
tflite::testing::TestMaxPoolQuantized(
kInput2Shape, // Input shape
kInput2Data, input_min, input_max, filter_width, filter_height,
stride_width, stride_height, kGolden2Data, output_min, output_max,
kOutput2Shape, // Output shape
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TESTS_END

View File

@ -0,0 +1,338 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include <limits.h>
#include <algorithm>
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2,
int* grant_size_1, int* grant_size_2) {
int maxrequest = 0;
int secondrequest = 0;
int maxavailable = 0;
int secondavail = 0;
// determine the largest requested buffer.
if (request_size_1 > request_size_2) {
maxrequest = request_size_1;
secondrequest = request_size_2;
} else {
maxrequest = request_size_2;
secondrequest = request_size_1;
}
// find the two largest available buffers.
get_arc_scratch_buffer_two_max_sizes(&maxavailable, &secondavail);
// in case two buffers are available, the largest buffer can go to the largest
// request.
if (secondavail > 0) { // this condition can be enhanced to prevent cases
// where the second buffer is so small that it is
// better to use one buffer and split it.
if (request_size_1 > request_size_2) {
*grant_size_1 = maxavailable;
*grant_size_2 = secondavail;
} else {
*grant_size_1 = secondavail;
*grant_size_2 = maxavailable;
}
} else {
// In case only one buffer is available,
// use only the max buffer, and split it.
*grant_size_1 = maxavailable / 2;
*grant_size_2 = maxavailable / 2;
}
}
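/* Worked example (illustrative sizes): with request sizes 4000 and 1000 and
two available buffers of 6000 and 2000 bytes, the larger request gets the
larger buffer: *grant_size_1 = 6000, *grant_size_2 = 2000. If only a single
8000-byte buffer is available, it is split and both grants become 4000. */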
static TfLiteStatus get_arc_scratch_buffer_for_io_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* out) {
#ifdef __Xxy
int request_size_in = 0;
int request_size_out = 0;
int grant_size_in = 0;
int grant_size_out = 0;
if (!inside_arc_ccm(in->data)) {
// In case the input tensor contains multiple batches, it has rank 4.
// Because the MLI kernel cannot operate on batches, we need the size of a
// single HWC tensor. That is why the start_rank is 1 in case of input
// rank 4.
int start_rank = in->rank - 3;
request_size_in = mli_hlp_count_elem_num(in, start_rank) *
mli_hlp_tensor_element_size(in);
}
if (!inside_arc_ccm(out->data)) {
// In case the output tensor contains multiple batches, it has rank 4.
// Because the MLI kernel cannot operate on batches, we need the size of a
// single batch. That is why the start_rank is 1 in case of output rank 4.
int start_rank = out->rank - 3;
request_size_out = mli_hlp_count_elem_num(out, start_rank) *
mli_hlp_tensor_element_size(out);
}
get_arc_two_buffer_sizes(request_size_in, request_size_out, &grant_size_in,
&grant_size_out);
if (!inside_arc_ccm(in->data)) {
in->data = get_arc_scratch_buffer(grant_size_in);
in->capacity = grant_size_in;
if (in->data == NULL) return kTfLiteError;
}
if (!inside_arc_ccm(out->data)) {
out->data = get_arc_scratch_buffer(grant_size_out);
out->capacity = grant_size_out;
if (out->data == NULL) return kTfLiteError;
}
#endif
return kTfLiteOk;
}
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* weights,
mli_tensor* bias,
mli_tensor* out) {
TfLiteStatus ret_val = kTfLiteOk;
#ifdef __Xxy
init_arc_scratch_buffers();
if (!inside_arc_ccm(weights->data)) {
int weights_size = mli_hlp_count_elem_num(weights, 0) *
mli_hlp_tensor_element_size(weights);
int max_weights_size = 0;
weights->data = get_arc_scratch_buffer(weights_size);
weights->capacity = weights_size;
if (weights->data == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->data = get_arc_scratch_buffer(max_weights_size);
weights->capacity = max_weights_size;
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->data == NULL) ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->data)) {
uint32_t bias_mem_requirements =
mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias);
bias->data = get_arc_scratch_buffer(bias_mem_requirements);
bias->capacity = bias_mem_requirements;
}
if (ret_val == kTfLiteOk) {
ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out);
}
if (bias->data == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->data = get_arc_scratch_buffer(max_bias_size);
bias->capacity = max_bias_size;
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->data == NULL) ret_val = kTfLiteError;
#endif
return ret_val;
}
TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* weights,
mli_tensor* bias, mli_tensor* out) {
TfLiteStatus ret_val = kTfLiteOk;
#ifdef __Xxy
init_arc_scratch_buffers();
/* Strategy for FC kernels:
First allocate the input, because this cannot be sliced (in case of batch
processing, only a single input needs to be allocated). Then weights & bias,
because if fully loaded they can be reused over batches. Then the output.
The number of output channels (for weights slicing) depends on the size of
the output and the size of weights & bias. */
if (!inside_arc_ccm(in->data)) {
/* In case the input tensor contains multiple batches,
only count the size of the innermost dimension */
int size_in = mli_hlp_count_elem_num(in, in->rank - 1) *
mli_hlp_tensor_element_size(in);
in->data = get_arc_scratch_buffer(size_in);
in->capacity = size_in;
if (in->data == NULL) {
in->capacity = 0;
ret_val = kTfLiteError;
}
}
if (!inside_arc_ccm(weights->data)) {
int weights_size = mli_hlp_count_elem_num(weights, 0) *
mli_hlp_tensor_element_size(weights);
int max_weights_size = 0;
weights->data = get_arc_scratch_buffer(weights_size);
weights->capacity = weights_size;
if (weights->data == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->data = get_arc_scratch_buffer(max_weights_size);
weights->capacity = max_weights_size;
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->data == NULL) ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->data)) {
int bias_mem_requirements =
mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias);
bias->data = get_arc_scratch_buffer(bias_mem_requirements);
bias->capacity = bias_mem_requirements;
}
if (!inside_arc_ccm(out->data)) {
/* In case the output tensor contains multiple batches,
only count the size of the innermost dimension */
int out_size = mli_hlp_count_elem_num(out, out->rank - 1) *
mli_hlp_tensor_element_size(out);
int max_out_size = 0;
out->data = get_arc_scratch_buffer(out_size);
out->capacity = out_size;
if (out->data == NULL) {
get_arc_scratch_buffer_max_size(&max_out_size);
out->data = get_arc_scratch_buffer(max_out_size);
out->capacity = max_out_size;
if (max_out_size == 0) ret_val = kTfLiteError;
}
if (out->data == NULL) ret_val = kTfLiteError;
}
if (bias->data == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->data = get_arc_scratch_buffer(max_bias_size);
bias->capacity = max_bias_size;
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->data == NULL) ret_val = kTfLiteError;
#endif
return ret_val;
}
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
const mli_tensor* in, const mli_tensor* out, const int kernel_height,
const int stride_height, const int padding_top, const int padding_bot,
int* in_slice_height, int* out_slice_height) {
const int height_dimension = 1;
const int in_height = in->shape[height_dimension];
const int out_height = out->shape[height_dimension];
const int line_size_in = mli_hlp_count_elem_num(in, height_dimension + 1) *
mli_hlp_tensor_element_size(in);
const int line_size_out = mli_hlp_count_elem_num(out, height_dimension + 1) *
mli_hlp_tensor_element_size(out);
int max_lines_in = 0;
int max_lines_out = 0;
int max_out_lines_for_input = 0;
bool fit = (in->capacity >= in_height * line_size_in) &&
(out->capacity >= out_height * line_size_out);
if (fit) {
// in case both tensors completely fit in the capacity, there is no need for
// slicing
*in_slice_height = in_height;
*out_slice_height = out_height;
} else {
// First compute how many lines fit into the input tensor, and compute how
// many output lines can be computed with that.
max_lines_in =
std::min(in_height, static_cast<int>(in->capacity) / line_size_in);
if (max_lines_in >= in_height) {
max_out_lines_for_input = out_height;
} else if (2 * max_lines_in >= in_height) {
// in this case only two slices are needed, so both could benefit from
// padding. take the MIN to get the worst case.
max_out_lines_for_input =
(max_lines_in + std::min(padding_top, padding_bot) - kernel_height +
1) /
stride_height;
} else {
max_out_lines_for_input =
(max_lines_in - kernel_height + 1) / stride_height;
}
// Then compute how many output lines fit into the output tensor.
max_lines_out =
std::min(out_height, static_cast<int>(out->capacity) / line_size_out);
// The smaller of the two determines the slice height for the output, and
// the derived slice height for the input.
*out_slice_height = std::min(max_out_lines_for_input, max_lines_out);
*in_slice_height = *out_slice_height * stride_height;
}
if ((*in_slice_height > 0) && (*out_slice_height > 0)) {
return kTfLiteOk;
} else {
return kTfLiteError;
}
}
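/* Worked example (illustrative numbers): in_height = 64, kernel_height = 3,
stride_height = 1, and capacities that fit 20 input lines and at least 30
output lines. Then max_lines_in = 20 (and 2 * 20 < 64), so
max_out_lines_for_input = (20 - 3 + 1) / 1 = 18, max_lines_out = 30, and the
result is *out_slice_height = 18 and *in_slice_height = 18 * 1 = 18. */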
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
const mli_tensor* weights, const mli_tensor* bias,
const int weight_out_ch_dimension, int* slice_channels) {
const int channels = weights->shape[weight_out_ch_dimension];
const int ch_size_w = (mli_hlp_count_elem_num(weights, 0) / channels) *
mli_hlp_tensor_element_size(weights);
const int ch_size_b = (mli_hlp_count_elem_num(bias, 0) / channels) *
mli_hlp_tensor_element_size(bias);
int max_ch_weigths = 0;
int max_ch_bias = 0;
bool fit = (weights->capacity >= channels * ch_size_w) &&
(bias->capacity >= channels * ch_size_b);
if (fit) {
// in case both tensors completely fit in the capacity, there is no need for
// slicing
*slice_channels = channels;
} else {
// First compute how many channels fit into the weights tensor
max_ch_weigths =
std::min(channels, static_cast<int>(weights->capacity) / ch_size_w);
// Then compute how many channels fit into the bias tensor.
max_ch_bias =
std::min(channels, static_cast<int>(bias->capacity) / ch_size_b);
// the smallest of the two determines the slice size
*slice_channels = std::min(max_ch_weigths, max_ch_bias);
}
if (*slice_channels > 0) {
return kTfLiteOk;
} else {
return kTfLiteError;
}
}
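/* Worked example (illustrative numbers): with 32 output channels, a weights
capacity that fits 20 channels and a bias capacity that fits all 32,
max_ch_weigths = 20, max_ch_bias = 32 and *slice_channels = 20. */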
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* out) {
#ifdef __Xxy
init_arc_scratch_buffers();
return get_arc_scratch_buffer_for_io_tensors(context, in, out);
#else
return kTfLiteOk;
#endif
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,129 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/common.h"
namespace tflite {
namespace ops {
namespace micro {
/**
* @brief Function to allocate scratch buffers for the convolution tensors
*
* @detail This function will update the data pointers in the 4 tensors with
* pointers to scratch buffers in fast local memory.
*
* @param context [I] pointer to TfLite context (needed for error handling)
* @param in [IO] pointer to the input tensor
* @param weights [IO] pointer to the weights tensor
* @param bias [IO] pointer to the bias tensor
* @param output [IO] pointer to the output tensor
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* weights,
mli_tensor* bias,
mli_tensor* out);
/**
* @brief Function to allocate scratch buffers for pooling kernels with only
* input and output buffers
*
* @detail This function will update the data pointers in the 2 tensors with
* pointers to scratch buffers in fast local memory.
*
* @param context [I] pointer to TfLite context (needed for error handling)
* @param in [IO] pointer to the input tensor
* @param output [IO] pointer to the output tensor
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* out);
/**
 * @brief Function to allocate scratch buffers for the fully connected tensors
*
* @detail This function will update the data pointers in the 4 tensors with
* pointers to scratch buffers in fast local memory.
*
* @param context [I] pointer to TfLite context (needed for error handling)
* @param in [IO] pointer to the input tensor
* @param weights [IO] pointer to the weights tensor
* @param bias [IO] pointer to the bias tensor
* @param output [IO] pointer to the output tensor
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* weights,
mli_tensor* bias, mli_tensor* out);
/**
* @brief Function to calculate slice size for io tensors
*
* @detail This function will calculate the slice size in the height dimension
 * for input and output tensors. It takes into account the kernel size and the
 * padding. The function will look at the capacity field in the in and out
 * tensors to determine the available buffer size.
*
* @param in [I] pointer to the input tensor
* @param out [I] pointer to the output tensor
* @param kernelHeight [I] size of the kernel in height dimension
* @param strideHeight [I] input stride in height dimension
* @param padding_top [I] number of lines with zeros at the top
* @param padding_bot [I] number of lines with zeros at the bottom
 * @param in_slice_height [O] slice size in height dimension for the input tensor
 * @param out_slice_height [O] slice size in height dimension for the output
* tensor
*
* @return Tf Lite status code
*/
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
const mli_tensor* in, const mli_tensor* out, const int kernelHeight,
const int strideHeight, const int padding_top, const int padding_bot,
int* in_slice_height, int* out_slice_height);
/**
* @brief Function to calculate slice size for weight slicing
*
* @detail This function will calculate the slice size in the output channel
 * dimension for weight and bias tensors. The function will look at the
 * capacity field in the weights and bias tensors to determine the available
 * buffer size.
*
 * @param weights [I] pointer to the weights tensor
 * @param bias [I] pointer to the bias tensor
 * @param weight_out_ch_dimension [I] dimension of the output channels in the
 * weights tensor
 * @param slice_channels [O] slice size in output channel dimension
*
* @return Tf Lite status code
*/
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
const mli_tensor* weights, const mli_tensor* bias,
const int weight_out_ch_dimension, int* slice_channels);
} // namespace micro
} // namespace ops
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
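As a rough illustration of how these declarations fit together, the sketch below (not part of the TFLM sources; the helper name and the zero-padding values are assumptions) first maps the convolution tensors onto fast memory and then derives the height slice sizes:
```
TfLiteStatus prepare_conv_slicing(TfLiteContext* context, mli_tensor* in,
                                  mli_tensor* weights, mli_tensor* bias,
                                  mli_tensor* out, int kernel_height,
                                  int stride_height, int* in_slice_height,
                                  int* out_slice_height) {
  // Map the four tensors onto scratch buffers in fast local memory first.
  TfLiteStatus status =
      get_arc_scratch_buffer_for_conv_tensors(context, in, weights, bias, out);
  if (status != kTfLiteOk) return status;
  // Then derive how many lines fit per slice given the (possibly reduced)
  // buffer capacities; the caller would use the two slice heights to drive
  // its slicing loop.
  return arc_scratch_buffer_calc_slice_size_io(
      in, out, kernel_height, stride_height, /*padding_top=*/0,
      /*padding_bot=*/0, in_slice_height, out_slice_height);
}
```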

View File

@ -0,0 +1,135 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
#include <limits.h>
namespace tflite {
namespace ops {
namespace micro {
/* By default, use all of the XY memory and half of the DCCM, because DCCM is
 * also used for the data section and the stack. These values can be overridden
 * by adding a -D option to the makefile of the application.
*/
#ifndef SCRATCH_MEM_X_SIZE
#ifdef core_config_xy_size
#define SCRATCH_MEM_X_SIZE (core_config_xy_size)
#else
#define SCRATCH_MEM_X_SIZE (0)
#endif
#endif
#ifndef SCRATCH_MEM_Y_SIZE
#ifdef core_config_xy_size
#define SCRATCH_MEM_Y_SIZE (core_config_xy_size)
#else
#define SCRATCH_MEM_Y_SIZE (0)
#endif
#endif
#ifndef SCRATCH_MEM_Z_SIZE
#ifdef core_config_dccm_size
#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2)
#else
#define SCRATCH_MEM_Z_SIZE (0)
#endif
#endif
namespace {
#pragma Bss(".Xdata")
static int8_t scratch_mem_x[SCRATCH_MEM_X_SIZE];
#pragma Bss()
#pragma Bss(".Ydata")
static int8_t scratch_mem_y[SCRATCH_MEM_Y_SIZE];
#pragma Bss()
#pragma Bss(".Zdata")
static int8_t scratch_mem_z[SCRATCH_MEM_Z_SIZE];
#pragma Bss()
} // namespace
static int8_t *scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z};
static uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE,
SCRATCH_MEM_Z_SIZE};
void *get_arc_scratch_buffer(int size) {
  // Function to assign fast memory from one of 3 scratch buffers.
  // Best-fit strategy: memory is allocated from the memory bank that leaves
  // the least unused memory.
void *buf = NULL;
int best_mem_idx = -1;
int best_mem_delta = INT_MAX;
const int num_mem = sizeof(scratch_mem) / sizeof(scratch_mem[0]);
// find a local memory that fits the data size.
for (int mem_idx = 0; mem_idx < num_mem; ++mem_idx) {
// Best Fit
if ((size <= scratch_sizes[mem_idx]) &&
(scratch_sizes[mem_idx] - size < best_mem_delta)) {
best_mem_idx = mem_idx;
best_mem_delta = scratch_sizes[mem_idx] - size;
}
}
if (best_mem_idx >= 0) {
buf = static_cast<void *>(scratch_mem[best_mem_idx]);
scratch_mem[best_mem_idx] += size;
scratch_sizes[best_mem_idx] -= size;
}
return buf;
}
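// A best-fit example with hypothetical bank sizes: if 16 KB are free in X,
// 16 KB in Y and 64 KB in Z, a 12 KB request is served from X (the 4 KB it
// would leave unused beats the 52 KB Z would leave; Y ties with X, but the
// earlier bank wins because the comparison is strict). X's pointer is then
// advanced by 12 KB and its remaining size reduced accordingly.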
void get_arc_scratch_buffer_max_size(int *size) {
int maxavailable = 0;
const int num_mem = sizeof(scratch_mem) / sizeof(scratch_mem[0]);
// find the largest available buffer.
for (int i = 0; i < num_mem; i++) {
if (scratch_sizes[i] > maxavailable) {
maxavailable = scratch_sizes[i];
}
}
*size = maxavailable;
}
void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) {
int maxavailable = 0;
int secondavail = 0;
const int num_mem = sizeof(scratch_mem) / sizeof(scratch_mem[0]);
// find the two largest available buffers.
for (int i = 0; i < num_mem; i++) {
if (scratch_sizes[i] > maxavailable) {
secondavail = maxavailable;
maxavailable = scratch_sizes[i];
} else if (scratch_sizes[i] > secondavail) {
secondavail = scratch_sizes[i];
}
}
*size1 = maxavailable;
*size2 = secondavail;
}
void init_arc_scratch_buffers(void) {
scratch_mem[0] = scratch_mem_x;
scratch_mem[1] = scratch_mem_y;
scratch_mem[2] = scratch_mem_z;
scratch_sizes[0] = SCRATCH_MEM_X_SIZE;
scratch_sizes[1] = SCRATCH_MEM_Y_SIZE;
scratch_sizes[2] = SCRATCH_MEM_Z_SIZE;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,68 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_
#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/common.h"
namespace tflite {
namespace ops {
namespace micro {
void init_arc_scratch_buffers(void);
void* get_arc_scratch_buffer(
int size); // Function to assign fast memory from one of 3 scratch buffers.
void get_arc_scratch_buffer_max_size(int* size);
void get_arc_scratch_buffer_two_max_sizes(int* size1, int* size2);
static inline bool inside_arc_dccm(void* p) {
#if core_config_dccm_present
return ((unsigned)p >= core_config_dccm_base) &&
((unsigned)p < core_config_dccm_base + core_config_dccm_size);
#else
return false;
#endif
}
static inline bool inside_arc_xccm(void* p) {
#if core_config_xy
return ((unsigned)p >= core_config_xy_x_base) &&
((unsigned)p < core_config_xy_x_base + core_config_xy_size);
#else
return false;
#endif
}
static inline bool inside_arc_yccm(void* p) {
#if core_config_xy
return ((unsigned)p >= core_config_xy_y_base) &&
((unsigned)p < core_config_xy_y_base + core_config_xy_size);
#else
return false;
#endif
}
static inline bool inside_arc_ccm(void* p) {
return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p);
}
} // namespace micro
} // namespace ops
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_
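A minimal sketch (not part of the TFLM sources; the helper name is hypothetical and it assumes the MLI 1.x `mli_tensor` layout with `data` and `capacity` fields) of how these address-range checks might be combined with the allocator:
```
static void request_fast_mem_if_needed(mli_tensor* t) {
  // Only ask for a scratch buffer when the data is not already in CCM.
  if (!inside_arc_ccm(t->data)) {
    void* fast_buf = get_arc_scratch_buffer(static_cast<int>(t->capacity));
    // The caller is still responsible for copying the data into the buffer.
    if (fast_buf != NULL) t->data = fast_buf;
  }
}
```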

View File

@ -409,8 +409,9 @@ TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) {
TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannelRelu6) {
// conv params:
// padding, stride_<width,height>, dilation_<width, height>, activation
TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6};
// padding, stride_<width,height>, activation, dilation_<width, height>
TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1,
kTfLiteActRelu6, 1, 1};
const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C]
const int kInputElements =
kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4];

View File

@ -496,7 +496,7 @@ TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingSameStride1ActNone) {
F2QS(8.5, output_min, output_max), F2QS(7., output_min, output_max)},
{4, 1, 2, 4, 1}, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
kTfLitePaddingSame, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloat) {

View File

@ -90,7 +90,7 @@ patch_cifar10_dataset() {
}
build_embarc_mli() {
gmake -j 4 -C ${1}/lib/make TCF_FILE=${2}
make -j 4 -C ${1}/lib/make TCF_FILE=${2}
}
# Main function handling the download, verify, extract, and patch process.
@ -173,7 +173,12 @@ download_and_extract() {
elif [[ ${action} == "patch_cifar10_dataset" ]]; then
patch_cifar10_dataset ${dir}
elif [[ ${action} == "build_embarc_mli" ]]; then
build_embarc_mli ${dir} ${action_param1}
if [[ "${action_param1}" == *.tcf ]]; then
cp ${action_param1} ${dir}/hw/arc.tcf
build_embarc_mli ${dir} ../../hw/arc.tcf
else
build_embarc_mli ${dir} ${action_param1}
fi
elif [[ ${action} ]]; then
echo "Unknown action '${action}'"
exit 1

View File

@ -0,0 +1,104 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Settings for embARC MLI library for ARC platform.
ifeq ($(TARGET_ARCH), arc)
# The MLI Library is used by default for the ARC platform whenever possible.
# To use the TFLM reference implementations, MLI should be intentionally turned off
# by passing the 'no_arc_mli' tag (make -f <tflm_main_makefile> TAGS=no_arc_mli ...)
ifeq ($(filter no_arc_mli,$(ALL_TAGS)),)
ALL_TAGS += arc_mli
ifeq ($(BUILD_ARC_MLI),true)
MLI_LIB_DIR ?= arc_mli_$(basename $(TCF_FILE_NAME))
$(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE)))
MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include
MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a
MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_LIB_DIR)/LICENSE
else
ifneq ($(ARC_MLI_PRE_COMPILED_TARGET),)
MLI_LIB_DIR ?= arc_mli_package
$(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),))
MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include
MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/$(ARC_MLI_PRE_COMPILED_TARGET)/release/libmli.a
MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/$(ARC_MLI_PRE_COMPILED_TARGET)/release/libmli.a
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_LIB_DIR)/LICENSE
else
$(error Target for pre compiled ARC MLI library is not defined)
endif
endif
THIRD_PARTY_CC_HDRS += $(MLI_LIB)
GENERATED_PROJECT_LIBS += $(MLI_LIB)
INCLUDES += \
-I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \
-I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api
GENERATED_PROJECT_INCLUDES += \
-I. \
-I./third_party/$(MLI_INCLUDE_FOLDER) \
-I./third_party/$(MLI_INCLUDE_FOLDER)/api
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \
third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h
MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h
MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc
MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h
MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc
MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h
MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc
MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h
MICROLITE_TEST_SRCS += $(wildcard tensorflow/lite/micro/kernels/arc_mli/*test.cc)
ARC_MLI_TESTS := conv depthwise_conv pooling fully_connected
ARC_MLI_TESTS += $(foreach TEST,$(ARC_MLI_TESTS), $(TEST)_slicing)
generate_arc_mli_test_projects: $(foreach TEST,$(ARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project)
ARC_EXTRA_APP_SETTINGS += \
\nMLI_ONLY ?= false\n\
\nifeq \($(DLR)\(MLI_ONLY\), true\)\
\nCCFLAGS += -DTF_LITE_STRIP_REFERENCE_IMPL\
\nCXXFLAGS += -DTF_LITE_STRIP_REFERENCE_IMPL\
\nendif\n
endif # no_arc_mli
endif # TARGET_ARCH
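In a project generated with these settings, the option surfaces as a regular make variable; a hedged example of stripping the reference fallbacks when building such a project:
```
make app MLI_ONLY=true
```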

View File

@ -130,24 +130,37 @@ endef
define generate_arc_project
ifeq ($(TARGET_ARCH), arc)
$(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/Makefile.tpl
$(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl
@mkdir -p $$(dir $$@)
@sed -E 's#\%\{SRCS\}\%#$(4)#g' $$< | \
sed -E '1 i\CC = ccac\nCXX = ccac\nLD = ccac\n' | \
sed -E 's#\%\{CC\}\%#$(CC_TOOL)#g' | \
sed -E 's#\%\{CXX\}\%#$(CXX_TOOL)#g' | \
sed -E 's#\%\{LD\}\%#$(LD_TOOL)#g' | \
sed -E 's#\%\{EXECUTABLE\}\%#$(3).elf#g' | \
sed -E 's#\%\{LINKER_FLAGS\}\%#$(6)#g' | \
sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' | \
sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' > $$@
sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' | \
sed -E 's#\%\{EXTRA_APP_SETTINGS\}\%#$(ARC_EXTRA_APP_SETTINGS)#g' | \
sed -E 's#\%\{EXTRA_APP_RULES\}\%#$(ARC_EXTRA_APP_RULES)#g' | \
sed -E 's#\%\{BIN_DEPEND\}\%#$(ARC_BIN_DEPEND)#g' | \
sed -E 's#\%\{BIN_RULE\}\%#$(ARC_BIN_RULE)#g' | \
sed -E 's#\%\{EXTRA_RM_TARGETS\}\%#$(ARC_EXTRA_RM_TARGETS)#g' | \
sed -E 's#\%\{APP_RUN_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \
sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_DEBUG_CMD)#g' | \
sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@
# Special rule to copy TCF in case the local filesystem file name has been defined
ifneq ($(TCF_FILE_NAME), )
$(PRJDIR)$(3)/$(1)/$(TCF_FILE_NAME): $(TCF_FILE)
$(PRJDIR)$(3)/$(1)/%: tensorflow/lite/micro/tools/make/templates/arc/%.tpl
@cp $$< $$@
endif
$(foreach var,$(ARC_TARGET_FILES_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var))))
endif
endef
# Creates a set of rules to build a standalone Arduino project for an
# executable, including all of the source and header files required in a
# separate folder and a simple makefile.

View File

@ -0,0 +1,315 @@
# Building TensorFlow Lite for Microcontrollers for Synopsys DesignWare ARC EM/HS Processors
This document contains the general information on building and running
TensorFlow Lite Micro for targets based on the Synopsys ARC EM/HS Processors.
## Table of Contents
- [Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit)
- [ARC EM Software Development Platform (ARC EM SDP)](#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
- [Custom ARC EM or HS Platform](#Custom-ARC-EMHS-Platform)
## Install the Synopsys DesignWare ARC MetaWare Development Toolkit
The Synopsys DesignWare ARC MetaWare Development Toolkit (MWDT) is required to
build and run TensorFlow Lite Micro applications for all ARC EM/HS targets.
To license MWDT, please see further details
[here](https://www.synopsys.com/dw/ipdir.php?ds=sw_metaware).
To request an evaluation version of MWDT, please use the
[Synopsys Eval Portal](https://eval.synopsys.com/) and follow the link for the
MetaWare Development Toolkit (Important: Do not confuse this with MetaWare EV
Development Toolkit or the MetaWare Lite options that are also available on this page).
Run the downloaded installer and follow the instructions to set up the toolchain
on your platform.
TensorFlow Lite for Microcontrollers builds are divided into two phases:
Application Project Generation and Application Project Building/Running. The
former phase requires a \*nix environment while the latter does not.
For basic project generation targeting
[ARC EM Software Development Platform](#ARC-EM-Software-Development-Platform-ARC-EM-SDP),
MetaWare is NOT required for the Project Generation Phase. However, it is
required in the following cases:
- For project generation for custom (not EM SDP) targets
- To build the microlib target library with all required TFLM objects for
  external use
Please consider the above when choosing whether to install the Windows or Linux
(or both) versions of MWDT.
## ARC EM Software Development Platform (ARC EM SDP)
This section describes how to deploy on an
[ARC EM SDP board](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
### Initial Setup
To use the EM SDP, you need the following hardware and software:
#### ARC EM SDP
More information on the platform, including ordering information, can be found
[here](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform).
#### MetaWare Development Toolkit
See
[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit)
section for instructions on toolchain installation.
#### Digilent Adept 2 System Software Package
If you wish to use the MetaWare Debugger to debug your code, you need to also
install the Digilent Adept 2 software, which includes the necessary drivers for
connecting to the targets. This is available from the official
[Digilent site](https://reference.digilentinc.com/reference/software/adept/start?redirect=1#software_downloads).
You should install the “System” component and the Runtime. The Utilities and
SDK are NOT required.
Digilent installation is NOT required if you plan to deploy to EM SDP via the SD
card instead of using the debugger.
#### Make Tool
A `make` tool is required for both phases of deploying TensorFlow Lite Micro
applications on the ARC EM SDP:
1. Application project generation
2. Working with the generated application (build and run)
For the first phase you need an environment and make tool compatible with the
TensorFlow Lite for Microcontrollers build system. At the time of this writing,
this requires make >= 3.82 and a *nix-like environment which supports shell and
native commands for file manipulation. The MWDT toolkit is not required for
this phase.
For the second phase, the requirements are less strict. The gmake version
delivered with the MetaWare Development Toolkit is sufficient. There are no
shell or *nix command dependencies, so Windows can be used.
#### Serial Terminal Emulation Application
The Debug UART port of the EM SDP is used to print application output. The USB
connection provides both the debug channel and RS232 transport. You can use any
terminal emulation program (like [PuTTY](https://www.putty.org/)) to view UART
output from the EM SDP.
#### microSD Card
If you want to self-boot your application (start it independently from a
debugger connection), you also need a microSD card with a minimum size of 512 MB
and a way to write to the card from your development host.
### Connect the Board
1. Make sure the boot switches of the board (S3) are configured as follows:
Switch # | Switch position
:------: | :-------------:
1 | Low (0)
2 | Low (0)
3 | High (1)
4 | Low (0)
2. Connect the power supply included in the product package to the ARC EM SDP.
3. Connect the USB cable to connector J10 on the ARC EM SDP (near the RST and
   CFG buttons) and to an available USB port on your development host.
4. Determine the COM port assigned to the USB Serial Port (on Windows, using
   Device Manager is an easy way to do this).
5. Execute the serial terminal application you installed in the previous step
   and open the serial connection with the previously defined COM port (speed
   115200 baud; 8 bits; 1 stop bit; no parity).
6. Push the CFG button on the board. After a few seconds you should see the
   boot log in the terminal, which begins as follows:
```
U-Boot <Versioning info>
CPU: ARC EM11D v5.0 at 40 MHz
Subsys:ARC Data Fusion IP Subsystem
Model: snps,emsdp
Board: ARC EM Software Development Platform v1.0
```
### Generate Application Project for ARC EM SDP
Before building an example or test application, you need to generate a TFLM
project for this application from TensorFlow sources and external dependencies.
To generate it for ARC EM SDP board you need to set `TARGET=arc_emsdp` on the
make command line. For instance, to build the Person Detect test application,
use a shell to execute the following command from the root directory of the
TensorFlow repo:
```
make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET=arc_emsdp
```
The application project will be generated into
*tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_test_int8/make*
Info on generating and building example applications for EM SDP
(*tensorflow/lite/micro/examples*) can be found in the appropriate README file
placed in the same directory as the examples. In general, it is the same
process as described in this README.
The
[embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli)
is used by default to speed up execution of some kernels for asymmetrically
quantized layers. Kernels which use MLI-based implementations are kept in the
*tensorflow/lite/micro/kernels/arc_mli* folder. For applications which may not
benefit from the MLI library, the project can be generated without these
implementations by adding `TAGS=no_arc_mli` in the command line. This can reduce
code size when the optimized kernels are not required.
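For example, to generate the same Person Detection test project for the EM SDP without the MLI-specific kernel implementations, the command might look like this (the same target as above, with the tag added):
```
make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET=arc_emsdp TAGS=no_arc_mli
```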
For more options on embARC MLI usage see
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md).
### Build the Application
You may need to adjust the following commands in order to use the appropriate
make tool available in your environment (i.e. `make` or `gmake`):
1. Open a command shell and change the working directory to the location which
   contains the generated project, as described in the previous section.
2. Clean previous build artifacts (optional):
   make clean
3. Build the application:
   make app
### Run the Application on the Board Using MetaWare Debugger
In case you do not have access to the MetaWare Debugger or have chosen not to
install the Digilent drivers, you can skip to the next section.
To run the application from the console, use the following command:
```
make run
```
If the application runs in an infinite loop, type `Ctrl+C` several times to exit the
debugger.
To run the application in the GUI debugger, use the following command:
```
make debug
```
In both cases you will see the application output in the serial terminal.
### Run the Application on the Board from the microSD Card
1. Use the following command in the same command shell you used for building
   the application, as described in the previous step:
   make flash
2. Copy the content of the created *./bin* folder into the root of the microSD
   card. Note that the card must be formatted as FAT32 with the default cluster
   size (but less than 32 KB).
3. Plug the microSD card into the J11 connector.
4. Push the RST button. If a red LED is lit beside the RST button, push the CFG
   button.
You will see the application output in the serial terminal.
## Custom ARC EM/HS Platform
This section describes how to deploy on a Custom ARC EM/HS platform defined only
by a TCF (Tool Configuration File, created at CPU configuration time) and
optional LCF (Linker Command File). In this case, the real hardware is unknown,
and applications can be run only in the nSIM simulator included with the
MetaWare toolkit.
### Initial Setup
To work with a custom ARC EM/HS platform, you need the following:
- Synopsys MetaWare Development Toolkit version 2019.12 or higher
- A make tool (make or gmake)
See the
[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit)
section for instructions on toolchain installation, and the
[MetaWare Development Toolkit](#MetaWare-Development-Toolkit) and
[Make Tool](#Make-Tool) sections for additional comments about make versions.
### Generate Application Project
Before building the application itself, you need to generate the project for
this application from TensorFlow sources and external dependencies. To generate
it for a custom TCF you need to set the following variables on the make command
line:
- TARGET_ARCH=arc
- TCF_FILE=<path to TCF file>
- (optional) LCF_FILE=<path to LCF file>
If you don't supply an external LCF file, the one embedded in the TCF will be
used instead.
For instance, to build **Person Detection** test application, use the following
command from the root directory of the TensorFlow repo:
```
make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET_ARCH=arc TCF_FILE=<path_to_tcf_file> LCF_FILE=<path_to_lcf_file>
```
The application project will be generated into
*tensorflow/lite/micro/tools/make/gen/<tcf_file_basename>_arc/prj/person_detection_test_int8/make*
The
[embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli)
is used by default to speed up execution of some kernels for asymmetrically
quantized layers. Kernels which use MLI-based implementations are kept in the
*tensorflow/lite/micro/kernels/arc_mli* folder. For applications which may not
benefit from the MLI library, the project can be generated without these
implementations by adding `TAGS=no_arc_mli` in the command line. This can reduce
code size when the optimized kernels are not required.
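For example, the generation command above with the MLI-specific kernels excluded might look like:
```
make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET_ARCH=arc TCF_FILE=<path_to_tcf_file> TAGS=no_arc_mli
```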
For more options on embARC MLI usage see
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md).
### Build the Application
You may need to adjust the following commands in order to use the appropriate
make tool available in your environment (i.e. `make` or `gmake`):
1. Open a command shell and change the working directory to the location which
   contains the generated project, as described in the previous section.
2. Clean previous build artifacts (optional):
   make clean
3. Build the application:
   make app
### Run the Application with the MetaWare Debugger on the nSIM Simulator
To run the application from the console, use the following command:
```
make run
```
If the application runs in an infinite loop, type `Ctrl+C` several times to exit the
debugger.
To run the application in the GUI debugger, use the following command:
```
make debug
```
You will see the application output in the same console where you ran it.
## License
TensorFlow's code is covered by the Apache2 License included in the repository,
and third-party dependencies are covered by their respective licenses, in the
third_party folder of this package.

View File

@ -0,0 +1,138 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Common Settings for ARC platform and its projects.
# Might be reused across different targets
ifeq ($(TARGET_ARCH), arc)
DLR := $$$$
# List of folders to search project files for copy with path changing
# For instance, TCF and LCF files are copied into the root of generated project
ARC_TARGET_FILES_DIRS ?=
# For the following variables see arc_app_makefile.tpl for usage
# Additional text into application settings section of arc makefile project
ARC_EXTRA_APP_SETTINGS ?=
# Additional text into application general rules of arc makefile project
ARC_EXTRA_APP_RULES ?=
# Additional arguments for the RM command of the "clean" target rule ("make clean" command)
ARC_EXTRA_RM_TARGETS ?=
# Dependencies of "flash" target rule ("make flash" command)
ARC_BIN_DEPEND ?=
# Commands in "flash" target rule ("make flash" command)
ARC_BIN_RULE ?= \t$(DLR)\(error Flash rule isnt defined for this ARC target\)
# Command to run app on "make run" command of generated project
ARC_APP_RUN_CMD ?=
# Command to run app on "make debug" command of generated project
ARC_APP_DEBUG_CMD ?=
# Additional text into application execution rules of arc makefile project
ARC_EXTRA_EXECUTE_RULES ?=
# We override the project generator to exclude everything not relevant to the ARC platform.
# ARC targets cannot work with non-ARC development tools.
# The basic make project is updated to be applicable to the general ARC platform.
define generate_microlite_projects
$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES),$(TARGET_TOOLCHAIN_ROOT),$(TARGET_TOOLCHAIN_PREFIX))
$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES))
endef
# Copy rule generator to do file copies with changing paths in generated project
# Arguments are:
# 1 - Path to files in the generated project.
# 2 - Path to files in the source repo.
# Used in helper_functions.inc for arc projects to copy files
define path_changing_copy_file
$(1)/%: $(2)/%
@mkdir -p $$(dir $$@)
@cp $$< $$@
endef
# These are microcontroller-specific rules for converting the ELF output
# of the linker into a binary image that can be loaded directly.
# Not applicable for ARC, leaving it empty.
$(BINDIR)%.bin:
ifeq ($(ARC_TOOLCHAIN), mwdt)
CC_TOOL := ccac
AR_TOOL := arac
CXX_TOOL := ccac
LD_TOOL := ccac
ARC_APP_RUN_CMD = mdb -run -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS\)
ARC_APP_DEBUG_CMD = mdb -OK -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS\)
# The variable TCF_FILE stores path to Tool Configuration File (*.tcf).
# This file is used by MWDT toolchain to properly compile/run code
TCF_FILE ?=
LCF_FILE ?=
BUILD_ARC_MLI ?= true
# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension),
# this variable is used later to add the option to the linker/compiler flags.
# This condition also handles the case when the user/makefile specifies
# the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying.
ifneq (,$(findstring .tcf,$(TCF_FILE)))
TCF_FILE_NAME = $(notdir $(TCF_FILE))
ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE))
MAKE_PROJECT_FILES += $(TCF_FILE_NAME)
else
TCF_FILE_NAME = $(TCF_FILE)
endif
PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config
PLATFORM_FLAGS += -Hnocopyr -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections
# Use compact CRT. It requires pre-defined heap size
PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset
PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME)
PLATFORM_LDFLAGS += -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K
ifneq ($(LCF_FILE), )
PLATFORM_LDFLAGS += $(notdir $(LCF_FILE))
MAKE_PROJECT_FILES += $(notdir $(LCF_FILE))
ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(LCF_FILE))),)
ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE))
endif
endif
CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS))
CCFLAGS := $(filter-out -std=c11,$(CCFLAGS))
MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS))
CXXFLAGS += $(PLATFORM_FLAGS)
CCFLAGS += $(PLATFORM_FLAGS)
LDFLAGS += $(PLATFORM_LDFLAGS)
endif # ARC_TOOLCHAIN
endif # TARGET_ARCH

View File

@ -0,0 +1,85 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Common EMSDP LCF File for applications
#
# External SRAM memory is used for code, because some TFLM applications include the whole
# set of supported kernels, which doesn't fit into ICCM0.
# This could slow performance a bit. Smaller applications can use ICCM0 instead.
#
# External PSRAM is used for potentially big sections. In particular:
# - .rodata_in_data, which typically includes the serialized model.
# - other .data, which typically includes the tensor arena.
#
# The stack and heap are kept in DCCM, which is the memory closest to the core.
# CCMWRAP memory regions indicate unusable portions of the address space
# due to CCM memory wrapping into upper addresses beyond its size
MEMORY {
PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400
SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000
IVT : ORIGIN = 0x60000000, LENGTH = 0x400
ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400)
# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000
DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000
# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000
XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000
# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000
YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000
# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000
}
SECTIONS {
GROUP BLOCK(4) : {
.vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4)
} > IVT
GROUP BLOCK(4): {
.text? : { *('.text$crt*') }
* (TEXT): {}
* (LIT): {}
} > SRAM
GROUP BLOCK(4): {
.Zdata? : {}
.stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32K): {}
.heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {}
} > DCCM
GROUP BLOCK(4): {
.Xdata? : {}
} > XCCM
GROUP BLOCK(4): {
.Ydata? : {}
} > YCCM
GROUP BLOCK(4): {
/* _SDA_BASE_ computed implicitly */
.sdata?: {}
.sbss?: {}
* (DATA): {}
* (BSS): {}
} > PSRAM
GROUP BLOCK(4): {
.rodata_in_data? : {}
} > PSRAM
GROUP BLOCK(4): {
.debug_log? : {}
} > SRAM
}

View File

@ -0,0 +1,74 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Differences from the common EMSDP LCF file (to reduce data access time):
# - data is moved from external PSRAM to DCCM
# - text is moved from SRAM to ICCM
#
# CCMWRAP memory regions indicate unusable portions of the address space
# due to CCM memory wrapping into upper addresses beyond its size
MEMORY {
PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400
SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000
IVT : ORIGIN = 0x60000000, LENGTH = 0x400
ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400)
# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000
DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000
# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000
XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000
# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000
YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000
# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000
}
SECTIONS {
GROUP BLOCK(4) : {
.vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4)
} > IVT
GROUP BLOCK(4): {
.text? : { *('.text$crt*') }
* (TEXT): {}
* (LIT): {}
} > ICCM0
GROUP BLOCK(4): {
.rodata_in_data? : {}
} > PSRAM
GROUP BLOCK(4): {
.debug_log? : {}
} > SRAM
GROUP BLOCK(4): {
/* _SDA_BASE_ computed implicitly */
.sdata?: {}
.sbss?: {}
* (DATA): {}
* (BSS): {}
.Zdata? : {}
.stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {}
.heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {}
} > DCCM
GROUP BLOCK(4): {
.Xdata? : {}
} > XCCM
GROUP BLOCK(4): {
.Ydata? : {}
} > YCCM
}

View File

@ -0,0 +1,73 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Settings for EMSDP target (ARC processor)
ifeq ($(TARGET), arc_emsdp)
TARGET_ARCH := arc
ARC_TOOLCHAIN := mwdt
BUILD_ARC_MLI := false
ARC_MLI_PRE_COMPILED_TARGET := emsdp_em11d_em9d_dfss
ifneq ($(filter no_arc_mli,$(ALL_TAGS)),)
MLI_LIB_DIR = arc_mli_package
$(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),))
else ifeq ($(BUILD_ARC_MLI), true)
MLI_LIB_DIR = arc_mli_$(ARC_MLI_PRE_COMPILED_TARGET)
endif
TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/hw/emsdp_em11d_em9d_dfss.tcf
LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf
UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env
UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE))
include $(MAKEFILE_DIR)/targets/arc/arc_common.inc
ARC_EXTRA_APP_SETTINGS = \
BIN_DIR = .$(DLR)\(PS\)bin\n\
BIN_FILE = $(DLR)\(BIN_DIR\)$(DLR)\(PS\)app.elf\n
ARC_EXTRA_APP_RULES = \
$(DLR)\(BIN_FILE\): $(DLR)\(BIN_DIR\) $(DLR)\(OUT_NAME\)\
\n\t\@$(DLR)\(CP\) $(DLR)\(OUT_NAME\) $(DLR)\(BIN_FILE\)\
\n\t\@$(DLR)\(CP\) $(UBOOT_FILE_NAME) $(DLR)\(BIN_DIR\)$(DLR)\(PS\)$(UBOOT_FILE_NAME)\
\n \
\n$(DLR)\(BIN_DIR\):\
\n\t\@$(DLR)\(MKDIR\) $(DLR)\(BIN_DIR\)\
ARC_EXTRA_RM_TARGETS = $(DLR)\(BIN_DIR\)
ARC_BIN_DEPEND = $(DLR)\(BIN_DIR\) $(DLR)\(BIN_FILE\)
ARC_BIN_RULE = \t@echo Copy content of $(DLR)\(BIN_DIR\) into the root of SD card and follow instructions
ARC_APP_RUN_CMD = mdb -run -digilent -nooptions $(DLR)\(DBG_ARGS\)
ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS\)
ARC_EXTRA_EXECUTE_RULES =
MAKE_PROJECT_FILES += $(UBOOT_FILE_NAME)
ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(UBOOT_FILE))),)
ARC_TARGET_FILES_DIRS += $(dir $(UBOOT_FILE))
endif
MAKE_PROJECT_FILES := $(filter-out README_MAKE.md, $(MAKE_PROJECT_FILES)) README_ARC_EMSDP.md
# For the default EMSDP configuration we can use the em9d_va runtime libs.
# For better performance, the runtime should be built for the EMSDP configuration.
# No hostlink library is used, to keep code size smaller.
PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio -Hhostlib=
endif

View File

@ -1,86 +1,40 @@
# Settings for arc processors
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Settings for ARC processors that are not pre-defined.
# The user needs to specify the ARC target with a Tool Configuration File (*.tcf).
# The path to this file must be passed through the TCF_FILE variable.
# Otherwise, the default em7d_voice_audio configuration is used.
ifeq ($(TARGET_ARCH), arc)
CC_TOOL = ccac
AR_TOOL = arac
CXX_TOOL = ccac
# Known targets are specified with their own make configurations.
ifeq ($(filter $(TARGET), arc_emsdp),)
ARC_TOOLCHAIN := mwdt
ifneq ($(TCF_FILE), )
TARGET = $(basename $(notdir $(TCF_FILE)))
else
$(warning TCF_FILE variable is not specified. Use default em7d_voice_audio configuration)
TARGET = em7d_voice_audio
TCF_FILE = em7d_voice_audio
endif
# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags.
# This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying.
ifneq (,$(findstring .tcf,$(TCF_FILE)))
TCF_FILE_NAME = $(notdir $(TCF_FILE))
THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME)
else
TCF_FILE_NAME = $(TCF_FILE)
endif
include $(MAKEFILE_DIR)/targets/arc/arc_common.inc
PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -fslp-vectorize-aggressive -ffunction-sections -fdata-sections
PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map
MAKE_PROJECT_FILES := $(filter-out README_MAKE.md, $(MAKE_PROJECT_FILES)) README_ARC.md
CXXFLAGS += $(PLATFORM_FLAGS)
CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS))
CCFLAGS += $(PLATFORM_FLAGS)
LDFLAGS += $(PLATFORM_LDFLAGS)
endif # $(TARGET)
endif # $(TARGET_ARCH)...
MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS))
USE_EMBARC_MLI ?= true
ifeq ($(USE_EMBARC_MLI), true)
ALL_TAGS += arc
ifeq ($(PRE_COMPILED_MLI),true)
$(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,))
MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include
MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a
THIRD_PARTY_CC_HDRS += \
third_party/embarc_osp/LICENSE
else
MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME))
$(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE)))
MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include
MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a
MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_LIB_DIR)/LICENSE
endif
THIRD_PARTY_CC_HDRS += $(MLI_LIB)
GENERATED_PROJECT_LIBS += $(MLI_LIB)
INCLUDES += \
-I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \
-I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api
GENERATED_PROJECT_INCLUDES += \
-I. \
-I./third_party/$(MLI_INCLUDE_FOLDER) \
-I./third_party/$(MLI_INCLUDE_FOLDER)/api
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \
third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \
endif # USE_EMBARC_MLI
endif

View File

@ -0,0 +1,45 @@
# TensorFlow Lite Micro ARC Make Project
This folder has been autogenerated by TensorFlow, and contains the sources, headers, and project files needed to build a single TensorFlow Lite Micro application using the make tool and a Synopsys DesignWare ARC processor compatible toolchain, specifically the ARC MetaWare Development Toolkit (MWDT).
This project has been generated for a target defined by TCF file only (Tool Configuration File). The real target board is unspecified, and applications can be run only in the nSIM simulator included with MWDT.
See
[tensorflow/lite/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro)
for details on how projects like this can be generated from the main source tree.
## Usage
See [Custom ARC EM/HS Platform](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#Custom-ARC-EMHS-Platform) section for more detailed information on requirements and usage of this project.
The Makefile contains all the information on building and running the project. You can modify it to satisfy your specific needs. The following actions are available out of the box. You may need to adjust the commands to use the appropriate make tool available in your environment (i.e. `make` or `gmake`):
1. Build the application.
make app
2. Build the application passing additional flags to compiler.
make app EXT_CFLAGS=[additional compiler flags]
3. Build the application and strip out the TFLM reference kernel fallback implementations in order to reduce code size. This only has an effect if the project was generated with MLI support. See more info in [EmbARC MLI Library Based Optimizations](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/kernels/arc_mli/README.md). `false` is the default value.
make app MLI_ONLY=[true|false]
4. Delete all artifacts created during build.
make clean
5. Run the application with the nSIM simulator in console mode.
make run
6. Run the application with the nSIM simulator, but using the MetaWare Debugger GUI for further execution/debugging capabilities.
make debug
## License
TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package.

View File

@ -0,0 +1,48 @@
# TensorFlow Lite Micro ARC Make Project for EM SDP Board.
This folder has been autogenerated by TensorFlow, and contains the source, header, and project files needed to build a single TensorFlow Lite Micro target using the make tool and a Synopsys DesignWare ARC processor compatible toolchain, specifically the ARC MetaWare Development Toolkit (MWDT).
This project has been generated for the ARC EM Software Development Platform (EM SDP). The built application can be run only on this platform.
See
[tensorflow/lite/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro)
for details on how projects like this can be generated from the main source tree.
## Usage
See [ARC EM Software Development Platform](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) section for more detailed information on requirements and usage of this project.
The Makefile contains all the information on building and running the project. You can modify it to satisfy your specific needs. The following actions are available out of the box. You may need to adjust the commands to use the appropriate make tool available in your environment (i.e. `make` or `gmake`):
1. Build the application.
make app
2. Build the application passing additional flags to compiler.
make app EXT_CFLAGS=[additional compiler flags]
3. Build the application and strip out the TFLM reference kernel fallback implementations in order to reduce code size. This only has an effect if the project was generated with MLI support. See more info in [EmbARC MLI Library Based Optimizations](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/kernels/arc_mli/README.md). `false` is the default value.
make app MLI_ONLY=[true|false]
4. Delete all artifacts created during build.
make clean
5. Run the application with the nSIM simulator in console mode.
make run
6. Load the application and open MetaWare Debugger GUI for further execution/debugging.
make debug
7. Generate necessary artefacts for self-booting execution from flash. See [reference to Run the application on the board from the micro SD card](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#Run-the-Application-on-the-Board-from-the-microSD-Card).
make flash
## License
TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package.

View File

@ -0,0 +1,114 @@
#=============================================================
# OS-specific definitions
#=============================================================
COMMA=,
OPEN_PAREN=(
CLOSE_PAREN=)
BACKSLASH=\$(nullstring)
ifneq ($(ComSpec)$(COMSPEC),)
O_SYS=Windows
RM=del /F /Q
MKDIR=mkdir
CP=copy /Y
TYPE=type
PS=$(BACKSLASH)
Q=
coQ=\$(nullstring)
fix_platform_path = $(subst /,$(PS), $(1))
DEV_NULL = nul
else
O_SYS=Unix
RM=rm -rf
MKDIR=mkdir -p
CP=cp
TYPE=cat
PS=/
Q=$(BACKSLASH)
coQ=
fix_platform_path=$(1)
DEV_NULL=/dev/null
endif
#=============================================================
# Toolchain definitions
#=============================================================
CC = %{CC}%
CXX = %{CXX}%
LD = %{LD}%
#=============================================================
# Applications settings
#=============================================================
OUT_NAME = %{EXECUTABLE}%
DBG_ARGS ?=
RUN_ARGS ?=
EXT_CFLAGS ?=
CXXFLAGS += %{CXX_FLAGS}%
CCFLAGS += %{CC_FLAGS}%
LDFLAGS += %{LINKER_FLAGS}%
%{EXTRA_APP_SETTINGS}%
#=============================================================
# Files and directories
#=============================================================
SRCS := \
%{SRCS}%
OBJS := \
$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS)))
#=============================================================
# Common rules
#=============================================================
.PHONY: all app flash clean run debug
%.o: %.cc
$(CXX) $(CXXFLAGS) $(EXT_CFLAGS) $(INCLUDES) -c $< -o $@
%.o: %.c
$(CC) $(CCFLAGS) $(EXT_CFLAGS) $(INCLUDES) -c $< -o $@
$(OUT_NAME): $(OBJS)
$(LD) $(CXXFLAGS) -o $@ -Ccrossref $(OBJS) $(LDFLAGS)
%{EXTRA_APP_RULES}%
#=================================================================
# Global rules
#=================================================================
all: $(OUT_NAME)
app: $(OUT_NAME)
flash: %{BIN_DEPEND}%
%{BIN_RULE}%
clean:
-@$(RM) $(call fix_platform_path,$(OBJS))
-@$(RM) $(OUT_NAME) %{EXTRA_RM_TARGETS}%
#=================================================================
# Execution rules
#=================================================================
APP_RUN := %{APP_RUN_CMD}%
APP_DEBUG := %{APP_DEBUG_CMD}%
run: $(OUT_NAME)
$(APP_RUN) $(OUT_NAME) $(RUN_ARGS)
debug: $(OUT_NAME)
$(APP_DEBUG) $(OUT_NAME) $(RUN_ARGS)
%{EXTRA_EXECUTE_RULES}%

View File

@ -71,11 +71,11 @@ PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab"
PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip"
PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc"
EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip"
EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776"
EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/58284867ca52d1f43b25045e8601999d7359d986.zip"
EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef"
EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip"
EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56"
EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC2/embARC_MLI_package.zip"
EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"