Merge pull request #39077 from foss-for-synopsys-dwc-arc-processors:arcmli_upstream

PiperOrigin-RevId: 312835786
Change-Id: I08f27121798a2a59c845d6b357e135716d690184
TensorFlower Gardener 2020-05-22 09:10:59 -07:00
commit 29fb4d12a7
44 changed files with 6275 additions and 1317 deletions

View File

@@ -0,0 +1,111 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/debug_log.h"
#include <cstdint>
#include <cstdio>
#include <cstring>
// Print to debug console by default. Define any of the following macros to
// extend the set of destinations:
// EMSDP_LOG_TO_MEMORY
//   : fill the .debug_log memory region (data section) with the passed chars.
// EMSDP_LOG_TO_HOST
//   : use MetaWare HostLink to print the output log. Requires the Synopsys
//     MetaWare debugger.
// EMSDP_LOG_TO_UART
//   : use the default debug UART (out to FTDI channel 0). The same USB port is
//     used for JTAG.
#define EMSDP_LOG_TO_UART
// Memory size for the symbol dump used by the EMSDP_LOG_TO_MEMORY destination
#define EMSDP_LOG_TO_MEMORY_SIZE (2 * 1024)
// EMSDP Debug UART related defines (registers and bits)
#define EMSDP_DBG_UART_BASE (0xF0004000U)
#define DW_UART_CPR_FIFO_STAT (1 << 10)
#define DW_UART_USR_TFNF (0x02)
#define DW_UART_LSR_TXD_EMPTY (0x20)
// EMSDP UART register map (only the necessary fields)
typedef volatile struct dw_uart_reg {
uint32_t DATA; /* data in/out and DLL */
uint32_t RES1[4];
uint32_t LSR; /* Line Status Register */
uint32_t RES2[25];
uint32_t USR; /* UART status register */
uint32_t RES3[29];
uint32_t CPR; /* Component parameter register */
} DW_UART_REG;
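// With the reserved gaps above, LSR lands at offset 0x14, USR at 0x7C and CPR
// at 0xF4 from EMSDP_DBG_UART_BASE, matching the DW_apb_uart register map.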
// For simplicity we assume U-Boot has already initialized the debug console
// during application loading (or on reset). Hence, we use only the status and
// data registers to implement a blocking loop for printing characters. No
// input and no IRQ handling. See the embarc_osp repository for the full EMSDP
// UART driver.
// (https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp)
void DbgUartSendStr(const char* s) {
DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE);
const char* src = s;
while (*src) {
// Check uart status to send char
bool uart_is_ready = false;
if (uart_reg_ptr->CPR & DW_UART_CPR_FIFO_STAT)
uart_is_ready = ((uart_reg_ptr->USR & DW_UART_USR_TFNF) != 0);
else
uart_is_ready = ((uart_reg_ptr->LSR & DW_UART_LSR_TXD_EMPTY) != 0);
// Send char if uart is ready.
if (uart_is_ready) uart_reg_ptr->DATA = *src++;
}
}
// Simple dump of characters to a pre-allocated memory region.
// When the total log exceeds the memory region size, the cursor wraps back to
// the beginning of the region. The memory region can be viewed/read with a
// debugger afterward.
void LogToMem(const char* s) {
static int cursor = 0;
#pragma Bss(".debug_log")
static volatile char debug_log_mem[EMSDP_LOG_TO_MEMORY_SIZE];
#pragma Bss()
const char* src = s;
while (*src) {
debug_log_mem[cursor] = *src++;
cursor = (cursor < EMSDP_LOG_TO_MEMORY_SIZE - 1) ? cursor + 1 : 0;
}
debug_log_mem[cursor] = '^';
}
extern "C" void DebugLog(const char* s) {
#ifndef TF_LITE_STRIP_ERROR_STRINGS
#if defined EMSDP_LOG_TO_UART
DbgUartSendStr(s);
#endif
#if defined EMSDP_LOG_TO_MEMORY
#warning \
"EMSDP_LOG_TO_MEMORY is defined. View .debug_log memory region for stdout"
LogToMem(s);
#endif
#if defined EMSDP_LOG_TO_HOST
#warning "EMSDP_LOG_TO_HOST is defined. Ensure hostlib is linked."
fprintf(stderr, "%s", s);
#endif
#endif // TF_LITE_STRIP_ERROR_STRINGS
}
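For context, here is a minimal usage sketch of how this hook is typically
reached from application code, assuming the 2020-era TFLM error-reporting API
(`tflite::MicroErrorReporter` formats the message and forwards the resulting
string to the platform-specific `DebugLog()` above):
```
#include "tensorflow/lite/micro/micro_error_reporter.h"

int main(int argc, char* argv[]) {
  // The micro error reporter formats the message and hands the resulting
  // string to the DebugLog() implementation defined in this file.
  tflite::MicroErrorReporter micro_error_reporter;
  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
  TF_LITE_REPORT_ERROR(error_reporter, "Hello from the EM SDP debug console");
  return 0;
}
```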

View File

@@ -14,6 +14,7 @@ of the device.
## Table of contents
- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp)
- [Deploy to Arduino](#deploy-to-arduino)
- [Deploy to ESP32](#deploy-to-esp32)
- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge)
@@ -21,6 +22,78 @@ of the device.
- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
- [Train your own model](#train-your-own-model)
## Deploy to ARC EM SDP
The following instructions will help you build and deploy this example to the
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
### Initial Setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all the required tools for working with the ARC EM SDP.
### Generate Example Project
The example project for the ARC EM SDP platform can be generated with the
following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_hello_world_make_project
```
### Build and Run Example
For more detailed information on building and running examples, see the
appropriate sections of the general description of
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
The generated project directory also contains a *README_ARC_EMSDP.md* file with
instructions and options for building and running. Here we only briefly mention
the main steps, which are typically enough to get started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/hello_world/make
```
3. Build the example using
```
make app
```
4. To generate artifacts for self-booting the example from the board, use
```
make flash
```
5. To run the application from the board using a microSD card:
* Copy the contents of the created /bin folder into the root of the microSD
card. Note that the card must be formatted as FAT32 with the default cluster
size (but less than 32 KB).
* Plug the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside the RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run the application from the console, type `make run`.
* To stop execution, press `Ctrl+C` in the console several times.
In both cases (steps 5 and 6) you will see the application output in the serial
terminal.
## Deploy to Arduino
The following instructions will help you build and deploy this sample

View File

@@ -16,6 +16,7 @@ kilobytes of Flash.
## Table of contents
- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp)
- [Deploy to Arduino](#deploy-to-arduino)
- [Deploy to ESP32](#deploy-to-esp32)
- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge)
@@ -25,6 +26,95 @@ kilobytes of Flash.
- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
- [Train your own model](#train-your-own-model)
## Deploy to ARC EM SDP
The following instructions will help you build and deploy this example to the
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
This example is quantized with a symmetric uint8 scheme. As noted in
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md),
embARC MLI supports optimized kernels for int8 quantization only. Therefore,
this example will only use TFLM reference kernels.
The ARC EM SDP board contains a rich set of extension interfaces. You can
choose any compatible microphone and modify
[audio_provider.cc](/tensorflow/lite/micro/examples/micro_speech/audio_provider.cc)
accordingly to use input from your specific microphone. By default, results of
running this example are printed to the console. If you would like to
implement some target-specific actions instead, you need to modify
[command_responder.cc](/tensorflow/lite/micro/examples/micro_speech/command_responder.cc)
accordingly.
The reference implementations of these files are used by default on the EM SDP.
### Initial setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all the required tools for working with the ARC EM SDP.
### Generate Example Project
As the default example doesn't provide any output without real audio, it is
recommended to get started with the example for mock data. The project for the
ARC EM SDP platform can be generated with the following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_micro_speech_mock_make_project
```
### Build and Run Example
For more detailed information on building and running examples, see the
appropriate sections of the general description of
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
The generated project directory also contains a *README_ARC_EMSDP.md* file with
instructions and options for building and running. Here we only briefly mention
the main steps, which are typically enough to get started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/micro_speech_mock/make
```
3. Build the example using
```
make app
```
4. To generate artifacts for self-booting the example from the board, use
```
make flash
```
5. To run the application from the board using a microSD card:
* Copy the contents of the created /bin folder into the root of the microSD
card. Note that the card must be formatted as FAT32 with the default cluster
size (but less than 32 KB).
* Plug the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside the RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run the application from the console, type `make run`.
* To stop execution, press `Ctrl+C` in the console several times.
In both cases (steps 5 and 6) you will see the application output in the serial
terminal.
## Deploy to Arduino
The following instructions will help you build and deploy this sample

View File

@@ -0,0 +1,28 @@
ifeq ($(TARGET), arc_emsdp)
# Patch of the ARC make project to adjust it specifically for the micro_speech
# example. In particular:
# - Extend heap and stack size for application needs
# - Use a linker command file with better usage of fast memory
# - In case the project was generated with MLI usage, reduce scratch buffers.
MICRO_SPEECH_HDRS += \
micro_speech_patch.txt
MICRO_SPEECH_TEST_HDRS += \
micro_speech_patch.txt
MICRO_SPEECH_MOCK_HDRS += \
micro_speech_patch.txt
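# The rule below replaces the generated emsdp.lcf with the alternative LCF and
# edits the generated Makefile in place: the first sed enlarges heap and stack
# to 16K each, the second appends compiler flags that zero the MLI scratch
# buffer sizes while keeping MLI_ONLY ?= false. The names of the patched files
# are recorded in the patch marker file.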
%/micro_speech_patch.txt: %/emsdp.lcf %/Makefile
@cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $<
@echo emsdp.lcf > $@
@sed -E -i 's#-Hheap=[^ ]*#\-Hheap=16K \-Hstack=16K#g' $(word 2, $^)
@sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\
CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\
CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\
$(word 2, $^)
@echo Makefile >> $@
endif

View File

@@ -5,7 +5,9 @@ network to recognize people in images captured by a camera. It is designed to
run on systems with small amounts of memory such as microcontrollers and DSPs.
## Table of contents
- [Getting started](#getting-started)
- [Running on ARC EM SDP](#running-on-arc-em-sdp)
- [Running on Arduino](#running-on-arduino)
- [Running on ESP32](#running-on-esp32)
- [Running on SparkFun Edge](#running-on-sparkfun-edge)
@@ -13,6 +15,94 @@ run on systems with small amounts of memory such as microcontrollers and DSPs.
- [Debugging image capture](#debugging-image-capture)
- [Training your own model](#training-your-own-model)
## Running on ARC EM SDP
The following instructions will help you build and deploy this example to the
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
This example is quantized with a symmetric uint8 scheme. As noted in
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md),
embARC MLI supports optimized kernels for int8 quantization only. Therefore,
this example will only use TFLM reference kernels.
The ARC EM SDP board contains a rich set of extension interfaces. You can
choose any compatible camera and modify
[image_provider.cc](/tensorflow/lite/micro/examples/person_detection/image_provider.cc)
accordingly to use input from your specific camera. By default, results of
running this example are printed to the console. If you would like to
implement some target-specific actions instead, you need to modify
[detection_responder.cc](/tensorflow/lite/micro/examples/person_detection/detection_responder.cc)
accordingly.
The reference implementations of these files are used by default on the EM SDP.
### Initial setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all the required tools for working with the ARC EM SDP.
### Generate Example Project
The example project for the ARC EM SDP platform can be generated with the
following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_make_project
```
### Build and Run Example
For more detailed information on building and running examples, see the
appropriate sections of the general description of
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
The generated project directory also contains a *README_ARC_EMSDP.md* file with
instructions and options for building and running. Here we only briefly mention
the main steps, which are typically enough to get started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make
```
3. Build the example using
```
make app
```
4. To generate artifacts for self-booting the example from the board, use
```
make flash
```
5. To run the application from the board using a microSD card:
* Copy the contents of the created /bin folder into the root of the microSD
card. Note that the card must be formatted as FAT32 with the default cluster
size (but less than 32 KB).
* Plug the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside the RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run the application from the console, type `make run`.
* To stop execution, press `Ctrl+C` in the console several times.
In both cases (steps 5 and 6) you will see the application output in the serial
terminal.
## Running on Arduino
The following instructions will help you build and deploy this sample

View File

@@ -0,0 +1,24 @@
ifeq ($(TARGET), arc_emsdp)
# Patch of the ARC make project to adjust it specifically
# for the person detection example. In particular:
# - Use a linker command file with better usage of fast memory
# - In case the project was generated with MLI usage, reduce scratch buffers.
person_detection_HDRS += \
person_detection_patch.txt
person_detection_TEST_HDRS += \
person_detection_patch.txt
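# The rule below replaces the generated emsdp.lcf with the alternative LCF and
# edits the generated Makefile in place, appending compiler flags that zero the
# MLI scratch buffer sizes while keeping MLI_ONLY ?= false. The names of the
# patched files are recorded in the patch marker file.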
%/person_detection_patch.txt: %/emsdp.lcf %/Makefile
@cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $<
@echo emsdp.lcf > $@
@sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\
CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\
CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\
$(word 2, $^)
@echo Makefile >> $@
endif

View File

@@ -6,13 +6,101 @@ run on systems with small amounts of memory such as microcontrollers and DSPs.
This uses the experimental int8 quantized version of the person detection model.
## Table of contents
- [Getting started](#getting-started)
- [Running on ARC EM SDP](#running-on-arc-em-sdp)
- [Running on Arduino](#running-on-arduino)
- [Running on SparkFun Edge](#running-on-sparkfun-edge)
- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
- [Debugging image capture](#debugging-image-capture)
- [Training your own model](#training-your-own-model)
## Running on ARC EM SDP
The following instructions will help you build and deploy this example to the
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
This example uses asymmetric int8 quantization and can therefore leverage
optimized int8 kernels from the embARC MLI library.
The ARC EM SDP board contains a rich set of extension interfaces. You can choose
any compatible camera and modify
[image_provider.cc](/tensorflow/lite/micro/examples/person_detection_experimental/image_provider.cc)
accordingly to use input from your specific camera. By default, results of
running this example are printed to the console. If you would like to
implement some target-specific actions instead, you need to modify
[detection_responder.cc](/tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.cc)
accordingly.
The reference implementations of these files are used by default on the EM SDP.
### Initial setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all the required tools for working with the ARC EM SDP.
### Generate Example Project
The example project for the ARC EM SDP platform can be generated with the
following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project
```
### Build and Run Example
For more detailed information on building and running examples, see the
appropriate sections of the general description of
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
The generated project directory also contains a *README_ARC_EMSDP.md* file with
instructions and options for building and running. Here we only briefly mention
the main steps, which are typically enough to get started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make
```
3. Build the example using
```
make app
```
4. To generate artifacts for self-booting the example from the board, use
```
make flash
```
5. To run the application from the board using a microSD card:
* Copy the contents of the created /bin folder into the root of the microSD
card. Note that the card must be formatted as FAT32 with the default cluster
size (but less than 32 KB).
* Plug the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside the RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run the application from the console, type `make run`.
* To stop execution, press `Ctrl+C` in the console several times.
In both cases (steps 5 and 6) you will see the application output in the serial
terminal.
## Running on Arduino
The following instructions will help you build and deploy this sample

View File

@@ -0,0 +1,21 @@
ifeq ($(TARGET), arc_emsdp)
# Patch of the ARC make project to adjust it specifically
# for the experimental person detection example. In particular:
# - Use a linker command file with better usage of fast memory
# - Strip out TFLM reference code by default.
person_detection_HDRS += \
person_detection_int8_patch.txt
person_detection_TEST_HDRS += \
person_detection_int8_patch.txt
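# The rule below replaces the generated emsdp.lcf with the example-specific LCF
# and flips MLI_ONLY to true in the generated Makefile, so the TFLM reference
# fallback code is stripped from the build. The names of the patched files are
# recorded in the patch marker file.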
%/person_detection_int8_patch.txt: %/emsdp.lcf %/Makefile
@cp tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf $<
@echo emsdp.lcf > $@
@sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= true#' $(word 2, $^)
@echo Makefile >> $@
endif

View File

@@ -0,0 +1,74 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Difference with common EMSDP LCF file (to reduce data access time):
# - move data from external PSRAM to on-chip memory
# - move text from SRAM to ICCM
#
# CCMWRAP memory regions indicate unusable portions of the address space
# due to CCM memory wrapping into upper addresses beyond its size
MEMORY {
PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400
SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000
IVT : ORIGIN = 0x60000000, LENGTH = 0x400
ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400)
# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000
DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000
# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000
XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000
# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000
YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000
# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000
}
SECTIONS {
GROUP BLOCK(4) : {
.vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4)
} > IVT
GROUP BLOCK(4): {
.text? : { *('.text$crt*') }
* (TEXT): {}
* (LIT): {}
} > ICCM0
GROUP BLOCK(4): {
.rodata_in_data? : {}
} > PSRAM
GROUP BLOCK(4): {
/* _SDA_BASE_ computed implicitly */
.sdata?: {}
.sbss?: {}
* (DATA): {}
* (BSS): {}
.debug_log? : {}
} > SRAM
GROUP BLOCK(4): {
.Zdata? : {}
.heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {}
.stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {}
} > DCCM
GROUP BLOCK(4): {
.Xdata? : {}
} > XCCM
GROUP BLOCK(4): {
.Ydata? : {}
} > YCCM
}
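For reference, the named input sections above (`.debug_log?`, `.Xdata?`,
`.Ydata?`, `.Zdata?`) are filled from C/C++ code via the MetaWare section
pragmas, in the same way debug_log.cc places its dump buffer. A minimal,
hypothetical sketch (buffer names and sizes are illustrative only, not part of
this change):
```
#include <cstdint>

// Hypothetical buffer placed in the XCCM bank through the .Xdata input section.
#pragma Bss(".Xdata")
static int8_t scratch_mem_x[2048];
#pragma Bss()

// Hypothetical buffer placed in DCCM through the .Zdata input section.
#pragma Bss(".Zdata")
static int8_t scratch_mem_z[4096];
#pragma Bss()
```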

View File

@@ -1,343 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/conv.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h"
namespace tflite {
namespace ops {
namespace micro {
namespace conv {
constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
// This file has 2 implementations of Conv.
const int kTensorNotAllocated = -1;
struct OpData {
TfLitePaddingValues padding;
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// Per channel output multiplier and shift.
// TODO(b/141139247): Allocate these dynamically when possible.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
};
inline PaddingType RuntimePaddingType(TfLitePadding padding) {
switch (padding) {
case TfLitePadding::kTfLitePaddingSame:
return PaddingType::kSame;
case TfLitePadding::kTfLitePaddingValid:
return PaddingType::kValid;
case TfLitePadding::kTfLitePaddingUnknown:
default:
return PaddingType::kNone;
}
}
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, int width, int height,
int filter_width, int filter_height, int out_width,
int out_height, const TfLiteType data_type,
OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
// Matching GetWindowedOutputSize in TensorFlow.
auto padding = params->padding;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width,
params->dilation_height_factor, params->dilation_width_factor, height,
width, filter_height, filter_width, padding, &out_height, &out_width);
// Note that quantized inference requires that all tensors have their
// parameters set. This is usually done during quantized training.
if (data_type != kTfLiteFloat32) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift)));
}
return kTfLiteOk;
}
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* im2col,
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(filter),
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
GetTensorData<int32_t>(bias), GetTensorShape(output),
GetTensorData<uint8_t>(output), GetTensorShape(im2col),
GetTensorData<uint8_t>(im2col), nullptr);
}
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
TfLiteTensor* im2col) {
// Run Conv MLI kernel
// MLI optimized version only supports int8 datatype and dilation factor of 1
if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
mli_conv2d_cfg cfg = {};
// reuse space allocated for OpData parameters
mli_weights.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_multiplier;
mli_bias.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_shift;
int16_t filter_zero_point = 0;
int16_t bias_zero_point = 0;
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
} else if (params->activation == kTfLiteActRelu6) {
cfg.relu.type = MLI_RELU_6;
} else if (params->activation == kTfLiteActRelu1) {
cfg.relu.type = MLI_RELU_1;
} else {
cfg.relu.type = MLI_RELU_NONE;
}
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
mli_point_to_subtsr_cfg substr_cfg_in = {
{0, 0}, 2, static_cast<uint8_t>(mli_in.shape[1])};
mli_point_to_subtsr_cfg substr_cfg_out = {
{0, 0}, 2, static_cast<uint8_t>(mli_out.shape[1])};
mli_tensor sub_mli_in = {0};
mli_tensor sub_mli_out = {0};
const int batches =
MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0);
for (int i = 0; i < batches; i++) {
substr_cfg_in.start_coord[0] = i;
substr_cfg_out.start_coord[0] = i;
mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in);
mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out);
mli_krn_conv2d_hwc_sa8_sa8_sa32(&sub_mli_in, &mli_weights, &mli_bias,
&cfg, &sub_mli_out);
}
} else {
ConvParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
reference_integer_ops::ConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
}
}
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* im2col,
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(filter),
GetTensorData<float>(filter), GetTensorShape(bias),
GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output), GetTensorShape(im2col),
GetTensorData<float>(im2col));
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
int input_width = input->dims->data[2];
int input_height = input->dims->data[1];
int filter_width = filter->dims->data[2];
int filter_height = filter->dims->data[1];
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
// Conv is quantized along dimension 0:
// https://www.tensorflow.org/lite/performance/quantization_spec
TF_LITE_ENSURE_EQ(context, filter->dims->data[0],
affine_quantization->scale->size);
TF_LITE_ENSURE_EQ(context, filter->dims->data[0],
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpData(
context, node, params, input_width, input_height, filter_width,
filter_height, output_width, output_height, input->type, &data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
EvalFloat(context, node, params, &data, input, filter, bias, nullptr,
nullptr, output);
break;
case kTfLiteInt8:
EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
output, nullptr);
break;
case kTfLiteUInt8:
EvalQuantized(context, node, params, &data, input, filter, bias, nullptr,
nullptr, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace conv
TfLiteRegistration* Register_CONV_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@@ -1,344 +0,0 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h"
namespace tflite {
namespace ops {
namespace micro {
namespace depthwise_conv {
namespace {
constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
struct OpData {
TfLitePaddingValues padding;
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// Per channel output multiplier and shift.
// TODO(b/141139247): Allocate these dynamically when possible.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
};
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, int width,
int height, int filter_width, int filter_height,
const TfLiteType data_type, OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
int unused_output_height, unused_output_width;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width, 1, 1, height, width,
filter_height, filter_width, params->padding, &unused_output_height,
&unused_output_width);
// Note that quantized inference requires that all tensors have their
// parameters set. This is usually done during quantized training.
if (data_type != kTfLiteFloat32) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
// Ensure filter and bias channel count does not exceed space reserved for
// quantization metadata.
const auto filter_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
const auto bias_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(bias->quantization.params);
TF_LITE_ENSURE(context, filter_quantization->scale->size <= kMaxChannels);
TF_LITE_ENSURE(context, bias_quantization->scale->size <= kMaxChannels);
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift)));
}
return kTfLiteOk;
}
} // namespace
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
tflite::reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
}
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
// Run Depthwise Conv MLI kernel
// MLI optimized version only supports int8 datatype and dilation factor of 1
if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
mli_conv2d_cfg cfg = {};
// reuse space allocated for OpData parameters
mli_weights.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_multiplier;
mli_bias.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_shift;
int16_t filter_zero_point = 0;
int16_t bias_zero_point = 0;
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
} else if (params->activation == kTfLiteActRelu6) {
cfg.relu.type = MLI_RELU_6;
} else if (params->activation == kTfLiteActRelu1) {
cfg.relu.type = MLI_RELU_1;
} else {
cfg.relu.type = MLI_RELU_NONE;
}
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
mli_point_to_subtsr_cfg substr_cfg_in = {
{0, 0}, 2, static_cast<uint8_t>(mli_in.shape[1])};
mli_point_to_subtsr_cfg substr_cfg_out = {
{0, 0}, 2, static_cast<uint8_t>(mli_out.shape[1])};
mli_tensor sub_mli_in = {0};
mli_tensor sub_mli_out = {0};
const int batches =
MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0);
for (int i = 0; i < batches; i++) {
substr_cfg_in.start_coord[0] = i;
substr_cfg_out.start_coord[0] = i;
mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in);
mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out);
mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(&sub_mli_in, &mli_weights,
&mli_bias, &cfg, &sub_mli_out);
}
} else {
DepthwiseParams op_params;
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = 0;
op_params.output_offset = output->params.zero_point;
// TODO(b/130439627): Use calculated value for clamping.
op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
reference_integer_ops::DepthwiseConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
}
}
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
tflite::reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<uint8_t>(output));
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
(NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
const TfLiteType data_type = input->type;
int width = SizeOfDimension(input, 2);
int height = SizeOfDimension(input, 1);
int filter_width = SizeOfDimension(filter, 2);
int filter_height = SizeOfDimension(filter, 1);
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
TF_LITE_ENSURE_EQ(context, filter->dims->data[3],
affine_quantization->scale->size);
TF_LITE_ENSURE_EQ(context, filter->dims->data[3],
affine_quantization->zero_point->size);
}
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
filter_width, filter_height, data_type,
&data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
EvalFloat(context, node, params, &data, input, filter, bias, output);
break;
case kTfLiteInt8:
EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias,
output);
break;
case kTfLiteUInt8:
EvalQuantized(context, node, params, &data, input, filter, bias, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace depthwise_conv
TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/depthwise_conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@@ -1,248 +0,0 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h"
namespace tflite {
namespace ops {
namespace micro {
namespace fully_connected {
namespace {
struct OpData {
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
// The index of the temporary tensor where the quantized inputs are cached.
int input_quantized_index;
};
constexpr int kInputTensor = 0;
constexpr int kWeightsTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
TfLiteStatus CalculateOpData(TfLiteContext* context,
TfLiteFullyConnectedParams* params,
TfLiteType data_type, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
OpData* data) {
TfLiteStatus status = kTfLiteOk;
if (data_type != kTfLiteFloat32) {
double real_multiplier = 0.0;
TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
context, input, filter, bias, output, &real_multiplier));
int exponent;
QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
data->output_shift = -exponent;
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
}
return status;
}
} // namespace
TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
// Run Fully Connected MLI kernel
// MLI optimized version only supports int8 datatype and no fused Relu
// TODO: subject to add mli_saturate kernel
// Workaround for issue #35318: the MLI fully connected kernel only supports
// zero_point == 0 for weights. This check can be removed once issue #35318 is
// resolved.
if ((filter->params.zero_point == 0) &&
(input->type == kTfLiteInt8 && params->activation == kTfLiteActNone)) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(filter, &mli_weights);
ConvertToMliTensor<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
mli_point_to_subtsr_cfg substr_cfg_in = {
{0, 0}, 2, static_cast<uint8_t>(mli_in.shape[1])};
mli_point_to_subtsr_cfg substr_cfg_out = {
{0, 0}, 2, static_cast<uint8_t>(mli_out.shape[1])};
mli_tensor sub_mli_in = {0};
mli_tensor sub_mli_out = {0};
const int batches =
MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0);
for (int i = 0; i < batches; i++) {
substr_cfg_in.start_coord[0] = i;
substr_cfg_out.start_coord[0] = i;
mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in);
mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out);
mli_krn_fully_connected_sa8_sa8_sa32(&sub_mli_in, &mli_weights, &mli_bias,
&sub_mli_out);
}
} else {
FullyConnectedParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = -filter->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.output_multiplier = data->output_multiplier;
// TODO(b/138810107): Figure out whether output shift should be inverted
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_integer_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(filter), GetTensorData<int8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<int8_t>(output));
}
return kTfLiteOk;
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
tflite::FullyConnectedParams op_params;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
#define TF_LITE_FULLY_CONNECTED(output_data_type) \
reference_ops::FullyConnected( \
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
GetTensorShape(filter), GetTensorData<uint8_t>(filter), \
GetTensorShape(bias), GetTensorData<int32_t>(bias), \
GetTensorShape(output), GetTensorData<output_data_type>(output))
switch (output->type) {
case kTfLiteUInt8:
TF_LITE_FULLY_CONNECTED(uint8_t);
break;
case kTfLiteInt16:
TF_LITE_FULLY_CONNECTED(int16_t);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(output->type), output->type);
return kTfLiteError;
}
return kTfLiteOk;
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::FullyConnectedParams op_params;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
tflite::reference_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
return kTfLiteOk;
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TfLiteType data_type = input->type;
OpData local_data_object;
OpData* data = &local_data_object;
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
filter, bias, output, data));
switch (filter->type) { // Already know in/out types are same.
case kTfLiteFloat32:
return EvalFloat(context, node, params, data, input, filter, bias,
output);
case kTfLiteInt8:
return EvalQuantizedInt8(context, node, params, data, input, filter, bias,
output);
case kTfLiteUInt8:
return EvalQuantized(context, node, params, data, input, filter, bias,
output);
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(filter->type), filter->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace fully_connected
TfLiteRegistration* Register_FULLY_CONNECTED() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/fully_connected::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -1,292 +0,0 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/pooling.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h"
namespace tflite {
namespace ops {
namespace micro {
namespace pooling {
namespace {
constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
struct OpData {
TfLitePaddingValues padding;
};
TfLiteStatus CalculateOpData(const TfLiteContext* context,
const TfLitePoolParams* params,
const TfLiteTensor* input,
const TfLiteTensor* output, OpData* data) {
// input: batch, height, width, channel
int height = SizeOfDimension(input, 1);
int width = SizeOfDimension(input, 2);
int out_height, out_width;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width,
/*dilation_rate_height=*/1,
/*dilation_rate_width=*/1, height, width, params->filter_height,
params->filter_width, params->padding, &out_height, &out_width);
return kTfLiteOk;
}
void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node,
const TfLitePoolParams* params, const OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
float activation_min, activation_max;
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
}
void AverageEvalUint8(TfLiteContext* context, const TfLiteNode* node,
const TfLitePoolParams* params, const OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(output), GetTensorData<uint8_t>(output));
}
void AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node,
const TfLitePoolParams* params, const OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
// Run Average Pooling MLI kernel
  // MLI optimized version only supports int8 datatype and no fused Relu.
  // TODO: extend with the mli_saturate kernel to support fused activations.
if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) {
mli_tensor mli_in = {0};
mli_tensor mli_out = {0};
mli_pool_cfg cfg = {0};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(output, &mli_out);
cfg.kernel_width = params->filter_width;
cfg.kernel_height = params->filter_height;
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
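      // SAME padding: ComputePaddingHeightWidth splits the total padding over
      // both sides and stores the extra odd pixel (if any) in
      // width_offset/height_offset, so it is applied to the right/bottom edge.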
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
mli_point_to_subtsr_cfg substr_cfg_in = {
{0, 0}, 2, static_cast<uint8_t>(mli_in.shape[1])};
mli_point_to_subtsr_cfg substr_cfg_out = {
{0, 0}, 2, static_cast<uint8_t>(mli_out.shape[1])};
mli_tensor sub_mli_in = {0};
mli_tensor sub_mli_out = {0};
const int batches =
MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0);
for (int i = 0; i < batches; i++) {
substr_cfg_in.start_coord[0] = i;
substr_cfg_out.start_coord[0] = i;
mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in);
mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out);
mli_krn_avepool_hwc_sa8(&sub_mli_in, &cfg, &sub_mli_out);
}
} else {
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
reference_integer_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(output), GetTensorData<int8_t>(output));
}
}
void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLitePoolParams* params, OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
float activation_min, activation_max;
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::MaxPool(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(output),
GetTensorData<float>(output));
}
void MaxEvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node,
TfLitePoolParams* params, OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
reference_ops::MaxPool(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(output),
GetTensorData<uint8_t>(output));
}
} // namespace
TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
  // Inputs and outputs share the same type, guaranteed by the converter.
switch (input->type) {
case kTfLiteFloat32:
AverageEvalFloat(context, node, params, &data, input, output);
break;
case kTfLiteUInt8:
AverageEvalUint8(context, node, params, &data, input, output);
break;
case kTfLiteInt8:
AverageEvalInt8(context, node, params, &data, input, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
return kTfLiteOk;
}
TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
switch (input->type) {
case kTfLiteFloat32:
MaxEvalFloat(context, node, params, &data, input, output);
break;
case kTfLiteUInt8:
MaxEvalQuantizedUInt8(context, node, params, &data, input, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace pooling
TfLiteRegistration* Register_AVERAGE_POOL_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pooling::AverageEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
TfLiteRegistration* Register_MAX_POOL_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pooling::MaxEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,96 @@
# EmbARC MLI Library Based Optimizations of TensorFlow Lite Micro Kernels for ARC Platforms.
This folder contains kernel implementations that use the optimized
[embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli).
It enables acceleration of inference operations that use int8 (asymmetric
quantization).
## Usage
The embARC MLI Library is used by default to speed up execution of some kernels
for asymmetrically quantized layers. This means that the usual project
generation for an ARC-specific target implies usage of embARC MLI. For example:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project
```
In case the MLI implementation can't be used, kernels in this folder fall back
to the TFLM reference implementations. For applications which may not benefit
from the MLI library, projects can be generated without these implementations
by adding `TAGS=no_arc_mli` to the command line, which can reduce overall code
size:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_int8_make_project
```
For the ARC EM SDP board, a pre-compiled MLI library is downloaded and used in
the application. For a custom ARC-based target platform, MLI sources are
downloaded and compiled during the project generation phase. To build the
library from sources for the ARC EM SDP platform, add the `BUILD_ARC_MLI=true`
option to the make command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp BUILD_ARC_MLI=true generate_person_detection_int8_make_project
```
If an application exclusively uses accelerated MLI kernel implementations, one
can strip out the TFLM reference kernel implementations to reduce the code size
of the application. Build the application with the `MLI_ONLY=true` option in
the generated project (after the project has been generated):
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make
make app MLI_ONLY=true
```
If you try this and application execution fails, then most probably MLI can't
be used for some nodes and you need to revert to using the TFLM reference
kernels.
## Limitations
Currently, the MLI Library provides optimized implementations only for int8
(asymmetric) versions of the following kernels:

1. Convolution 2D (per-axis quantization only, `dilation_ratio==1`)
2. Depthwise Convolution 2D (per-axis quantization only, `dilation_ratio==1`)
3. Average Pooling
4. Max Pooling
5. Fully Connected
Currently only
[/tensorflow/lite/micro/examples/person_detection_experimental](/tensorflow/lite/micro/examples/person_detection_experimental)
is quantized using this specification. Other examples can be executed on
ARC-based targets, but will only use reference kernels.
## Scratch Buffers and Slicing
The following information applies only to ARC EM SDP and other targets with XY
memory. embARC MLI uses specific optimizations which assume node operands are
in XY memory and/or DCCM (Data Closely Coupled Memory). As operands might be
quite big and may not fit in the available XY memory, special slicing logic is
applied which allows kernel calculations to be split into multiple parts. For
this reason, internal static buffers are allocated in these X, Y and DCCM
memory banks and used to execute sub-calculations.
All of this is performed automatically and is invisible to the user. Half of
the DCCM memory bank and the full XY banks are occupied for MLI-specific needs.
If the user needs space in XY memory for other tasks, these arrays can be
reduced by setting specific sizes. For this, add the following option to the
build command, replacing **<size_a>**, **<size_b>** and **<size_c>** with the
required values:
```
EXT_CFLAGS="-DSCRATCH_MEM_Z_SIZE=<size_a> -DSCRATCH_MEM_X_SIZE=<size_b> -DSCRATCH_MEM_Y_SIZE=<size_c>"
```
For example, to reduce the sizes of the arrays placed in DCCM and XCCM to 32KB
and 8KB respectively, use the following command:
```
make app EXT_CFLAGS="-DSCRATCH_MEM_Z_SIZE=32*1024 -DSCRATCH_MEM_X_SIZE=8*1024"
```
## License
TensorFlow's code is covered by the Apache 2.0 License included in the
repository. Third-party dependencies are covered by their respective licenses,
in the third_party folder of this package.

View File

@ -0,0 +1,490 @@
/* Copyright 2019-2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/conv.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace conv {
constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
// Conv is quantized along dimension 0:
// https://www.tensorflow.org/lite/performance/quantization_spec
constexpr int kConvQuantizedDimension = 0;
struct OpData {
TfLitePaddingValues padding;
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// Per channel output multiplier and shift.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
};
inline PaddingType RuntimePaddingType(TfLitePadding padding) {
switch (padding) {
case TfLitePadding::kTfLitePaddingSame:
return PaddingType::kSame;
case TfLitePadding::kTfLitePaddingValid:
return PaddingType::kValid;
case TfLitePadding::kTfLitePaddingUnknown:
default:
return PaddingType::kNone;
}
}
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
const TfLiteConvParams* params) {
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
  // MLI optimized version only supports int8 datatype, dilation factor of 1 and
// per-axis quantization of weights (no broadcasting/per-tensor)
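  // The (kMaxChannels * 2) bound exists because the MLI path reuses the int32
  // per-channel arrays in OpData as int16 scale storage (see
  // EvalMliQuantizedPerChannel), which doubles the number of usable entries.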
bool ret_val = (filter->type == kTfLiteInt8) &&
(input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
(params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1) &&
(affine_quantization->scale->size ==
filter->dims->data[kConvQuantizedDimension]) &&
affine_quantization->scale->size <= (kMaxChannels * 2);
return ret_val;
}
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, int width, int height,
int filter_width, int filter_height, int out_width,
int out_height, const TfLiteType data_type,
bool mli_is_applicable, OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
// Matching GetWindowedOutputSize in TensorFlow.
auto padding = params->padding;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width,
params->dilation_height_factor, params->dilation_width_factor, height,
width, filter_height, filter_width, padding, &out_height, &out_width);
// Note that quantized inference requires that all tensors have their
// parameters set. This is usually done during quantized training.
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
if (data_type != kTfLiteFloat32 && !mli_is_applicable) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
int output_channels = filter->dims->data[kConvQuantizedDimension];
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift),
output_channels));
}
#endif
return kTfLiteOk;
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<uint8_t>(input), GetTensorShape(filter),
GetTensorData<uint8_t>(filter), GetTensorShape(bias),
GetTensorData<int32_t>(bias), GetTensorShape(output),
GetTensorData<uint8_t>(output), GetTensorShape(im2col),
GetTensorData<uint8_t>(im2col), nullptr);
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus EvalMliQuantizedPerChannel(
TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params,
OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
// Run Conv MLI kernel
  // MLI optimized version only supports int8 datatype and dilation factor of 1
if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1)) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
mli_conv2d_cfg cfg = {};
// reuse space allocated for OpData parameters
mli_weights.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_multiplier;
mli_bias.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_shift;
int16_t filter_zero_point = 0;
int16_t bias_zero_point = 0;
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
} else if (params->activation == kTfLiteActRelu6) {
cfg.relu.type = MLI_RELU_6;
} else if (params->activation == kTfLiteActRelu1) {
cfg.relu.type = MLI_RELU_1;
} else {
cfg.relu.type = MLI_RELU_NONE;
}
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
// for height slicing
const int height_dimension = 1;
int in_slice_height = 0;
int out_slice_height = 0;
const int kernel_height =
static_cast<int>(mli_weights.shape[KRNL_H_DIM_HWC]);
const int overlap = kernel_height - cfg.stride_height;
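    // Adjacent input slices must overlap by (kernel_height - stride_height)
    // rows so that every output row in a slice sees a full filter window.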
// for weight slicing (on output channels)
    // NHWC layout for weights; the output channel dimension is the first
    // dimension.
const int weight_out_ch_dimension = 0;
int slice_channels =
static_cast<int>(mli_weights.shape[weight_out_ch_dimension]);
// Batch-Height-Width-Channel layout means last dimension is output
// channels.
const int out_tensor_ch_dimension = 3;
// Tensors for data in fast (local) memory and config to copy data from
// external to local memory
mli_tensor weights_local = mli_weights;
mli_tensor bias_local = mli_bias;
mli_tensor in_local = mli_in;
mli_tensor out_local = mli_out;
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, kernel_height, cfg.stride_height,
cfg.padding_top, cfg.padding_bottom, &in_slice_height,
&out_slice_height));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels));
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
and there is no need to copy it to the local tensor*/
const bool in_is_local = in_local.data == mli_in.data;
const bool out_is_local = out_local.data == mli_out.data;
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels);
TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, slice_channels);
TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels,
0, 0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
void* input_buffer_ptr = NULL;
int input_buffer_size = 0;
while (!w_slice.Done()) {
mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
      /* The mli_in tensor contains batches of HWC tensors, so it is a
      4-dimensional tensor. Because the MLI kernel will process one HWC tensor
      at a time, the 4-dimensional tensor needs to be sliced into nBatch
      3-dimensional tensors. On top of that, there could be a need to also
      slice in the height dimension. For that, the slice height has been
      calculated. The tensor slicer is configured so that it will completely
      slice the nBatch dimension (0) and slice the height dimension (1) in
      chunks of 'sliceHeight'. */
TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height,
cfg.padding_top, cfg.padding_bottom, overlap);
      /* The output tensor is already sliced in the output channel dimension.
      out_ch_slice.Sub() is the tensor for the output channels of this
      iteration of the weight slice loop. This tensor needs to be further
      sliced over the batch and height dimensions. */
TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension,
out_slice_height);
/* setup the pointers to the local or remote tensor to make the code
* inside the loop easier. */
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
while (!out_slice.Done()) {
TF_LITE_ENSURE(context, !in_slice.Done());
cfg.padding_top = in_slice.GetPaddingPre();
cfg.padding_bottom = in_slice.GetPaddingPost();
        // If the input is the same as in the previous iteration, skip copying
        // it again.
if ((in_slice.Sub()->data != input_buffer_ptr) ||
(mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data;
input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
}
mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr);
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
out_slice.Next();
}
w_slice.Next();
b_slice.Next();
out_ch_slice.Next();
TF_LITE_ENSURE(context, in_slice.Done());
}
}
return kTfLiteOk;
}
TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
ConvParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_integer_ops::ConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Node configuration is not supported by ARC MLI Library.");
return kTfLiteError;
#endif
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* im2col,
TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
ConvParams op_params;
op_params.padding_type = RuntimePaddingType(params->padding);
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
reference_ops::Conv(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(filter),
GetTensorData<float>(filter), GetTensorShape(bias),
GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output), GetTensorShape(im2col),
GetTensorData<float>(im2col));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
int input_width = input->dims->data[2];
int input_height = input->dims->data[1];
int filter_width = filter->dims->data[2];
int filter_height = filter->dims->data[1];
int output_width = output->dims->data[2];
int output_height = output->dims->data[1];
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(context,
affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
bool mli_is_applicable =
IsMliApplicable(context, input, filter, bias, params);
TF_LITE_ENSURE_STATUS(
CalculateOpData(context, node, params, input_width, input_height,
filter_width, filter_height, output_width, output_height,
input->type, mli_is_applicable, &data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
return EvalFloat(context, node, params, &data, input, filter, bias,
nullptr, nullptr, output);
break;
case kTfLiteInt8:
if (mli_is_applicable) {
return EvalMliQuantizedPerChannel(context, node, params, &data, input,
filter, bias, output);
} else {
return EvalQuantizedPerChannel(context, node, params, &data, input,
filter, bias, output);
}
break;
case kTfLiteUInt8:
return EvalQuantized(context, node, params, &data, input, filter, bias,
nullptr, nullptr, output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace conv
TfLiteRegistration* Register_CONV_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,506 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test checks that the slicing logic doesn't affect the result of the
// convolution kernel.
//
// This test doesn't replace the default convolution test
// (tensorflow/lite/micro/kernels/conv_test.cc). It is added to the whole test
// set only in case MLI for the ARC platform is used during generation (which
// is handled in arc_mli.inc), so such tests won't be generated for other
// platforms.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_utils.h"
#include "tensorflow/lite/micro/testing/micro_test.h"
#include "tensorflow/lite/micro/testing/test_utils.h"
namespace tflite {
namespace testing {
namespace {
// Common inputs and outputs 1.
static const int kInput1Elements = 20;
static const int kInput1Shape[] = {4, 1, 5, 2, 2};
static const float kInput1Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
static const int kFilter1Elements = 36;
static const int kFilter1Shape[] = {4, 2, 3, 3, 2};
static const float kFilter1Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
static const int kBias1Elements = 2;
static const int kBias1Shape[] = {1, 2};
static const float kBias1Data[] = {2, 2};
static const int kOutput1Elements = 20;
static const int kOutput1Shape[] = {4, 1, 5, 2, 2};
static const float kGolden1Data[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
// Common inputs and outputs 2.
static const int kInput2Elements = 80;
static const int kInput2Shape[] = {4, 1, 20, 2, 2};
static const float kInput2Data[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
static const int kFilter2Elements = 36;
static const int kFilter2Shape[] = {4, 2, 3, 3, 2};
static const float kFilter2Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
static const int kBias2Elements = 2;
static const int kBias2Shape[] = {1, 2};
static const float kBias2Data[] = {2, 2};
static const int kOutput2Elements = 80;
static const int kOutput2Shape[] = {4, 1, 20, 2, 2};
static const float kGolden2Data[] = {
34, 34, 34, 34, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
// Common inputs and outputs 3.
static const int kInput3Elements = 40;
static const int kInput3Shape[] = {4, 1, 2, 2, 10};
static const float kInput3Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
static const int kFilter3Elements = 90;
static const int kFilter3Shape[] = {4, 1, 3, 3, 10}; // 1 3 3 10
static const float kFilter3Data[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
static const int kBias3Elements = 1;
static const int kBias3Shape[] = {1, 1};
static const float kBias3Data[] = {1};
static const int kOutput3Elements = 4;
static const int kOutput3Shape[] = {4, 1, 2, 2, 1}; // 2 2 1
static const float kGolden3Data[] = {41, 41, 41, 41};
// Common inputs and outputs 4.
static const int kInput4Elements = 80;
static const int kInput4Shape[] = {4, 1, 4, 2, 10};
static const float kInput4Data[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
static const int kFilter4Elements = 90;
static const int kFilter4Shape[] = {4, 1, 3, 3, 10};
static const float kFilter4Data[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
static const int kBias4Elements = 1;
static const int kBias4Shape[] = {1, 1};
static const float kBias4Data[] = {1};
static const int kOutput4Elements = 8;
static const int kOutput4Shape[] = {4, 1, 4, 2, 1};
static const float kGolden4Data[] = {41, 41, 61, 61, 61, 61, 41, 41};
static TfLiteConvParams common_conv_params = {
kTfLitePaddingSame, // padding
1, // stride_width
1, // stride_height
kTfLiteActNone, // activation
1, // dilation_width_factor
1, // dilation_height_factor
};
template <typename T>
TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size,
const T* expected_output_data, T* output_data,
int output_length,
TfLiteConvParams* conv_params,
float tolerance = 1e-5) {
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_CONV_2D, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
const char* init_data = reinterpret_cast<const char*>(conv_params);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {3, 0, 1, 2};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 3};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(conv_params);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TfLiteStatus return_val = registration->invoke(&context, &node);
if (return_val != kTfLiteOk) {
return return_val;
}
if (registration->free) {
registration->free(&context, user_data);
}
for (int i = 0; i < output_length; ++i) {
TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i],
tolerance);
}
return kTfLiteOk;
}
void TestConvQuantizedPerChannel(
const int* input_dims_data, const float* input_data,
int8_t* input_quantized, float input_scale, int input_zero_point,
const int* filter_dims_data, const float* filter_data,
int8_t* filter_data_quantized, const int* bias_dims_data,
const float* bias_data, int32_t* bias_data_quantized, float* bias_scales,
int* bias_zero_points, const int* output_dims_data,
const float* expected_output_data, int8_t* expected_output_data_quantized,
int8_t* output_data, float output_scale, int output_zero_point,
TfLiteConvParams* conv_params) {
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data);
TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
int filter_zero_points[5];
float filter_scales[5];
TfLiteAffineQuantization filter_quant;
TfLiteAffineQuantization bias_quant;
TfLiteTensor input_tensor =
CreateQuantizedTensor(input_data, input_quantized, input_dims,
input_scale, input_zero_point, "input_tensor");
TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor(
filter_data, filter_data_quantized, filter_dims, filter_scales,
filter_zero_points, &filter_quant, 0 /* quantized dimension */,
"filter_tensor");
  // Replace the scales and quantized data to avoid a second quantization.
int channel_count = filter_dims->data[0];
float true_filter_scales[5] = {1.0, 1.0, 1.0, 1.0, 1.0};
true_filter_scales[0] = static_cast<float>(channel_count);
TfLiteAffineQuantization* to_change =
(TfLiteAffineQuantization*)filter_tensor.quantization.params;
to_change->scale = FloatArrayFromFloats(true_filter_scales);
int filter_size = filter_tensor.bytes;
for (int i = 0; i < filter_size; ++i) {
filter_tensor.data.int8[i] = filter_data[i];
}
TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor(
bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1],
bias_scales, bias_zero_points, &bias_quant, 0 /* quantized dimension */,
"bias_tensor");
TfLiteTensor output_tensor =
CreateQuantizedTensor(output_data, output_dims, output_scale,
output_zero_point, "output_tensor");
float input_scales[] = {1, input_scale};
int input_zero_points[] = {1, input_zero_point};
TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales),
IntArrayFromInts(input_zero_points)};
input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant};
float output_scales[] = {1, output_scale};
int output_zero_points[] = {1, output_zero_point};
TfLiteAffineQuantization output_quant = {
FloatArrayFromFloats(output_scales),
IntArrayFromInts(output_zero_points)};
output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant};
constexpr int inputs_size = 3;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
input_tensor,
filter_tensor,
bias_tensor,
output_tensor,
};
tflite::AsymmetricQuantize(expected_output_data,
expected_output_data_quantized, output_dims_count,
output_scale, output_zero_point);
TF_LITE_MICRO_EXPECT_EQ(
kTfLiteOk,
ValidateConvGoldens(tensors, tensors_size, expected_output_data_quantized,
output_data, output_dims_count, conv_params,
1.0 /* tolerance */));
}
} // namespace
} // namespace testing
} // namespace tflite
TF_LITE_MICRO_TESTS_BEGIN
// Test group 1
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel1) {
const int output_dims_count = 20;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[tflite::testing::kInput1Elements];
int8_t filter_quantized[tflite::testing::kFilter1Elements];
int32_t bias_quantized[tflite::testing::kBias1Elements];
int8_t golden_quantized[tflite::testing::kOutput1Elements];
int8_t output_data[output_dims_count];
int zero_points[tflite::testing::kBias1Elements + 1];
float scales[tflite::testing::kBias1Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput1Shape, tflite::testing::kInput1Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter1Shape, tflite::testing::kFilter1Data,
filter_quantized, tflite::testing::kBias1Shape,
tflite::testing::kBias1Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput1Shape, tflite::testing::kGolden1Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel1) {
const int output_dims_count = 20;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
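  // Placing these buffers in the .Xdata section keeps the operands in local
  // (XY) data memory, so this "Local" test variant exercises the case where
  // tensors already reside in memory the MLI kernels can access directly.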
#pragma Bss(".Xdata")
static int8_t input_quantized[tflite::testing::kInput1Elements];
static int8_t filter_quantized[tflite::testing::kFilter1Elements];
static int32_t bias_quantized[tflite::testing::kBias1Elements];
static int8_t output_data[output_dims_count];
#pragma Bss()
int8_t golden_quantized[tflite::testing::kOutput1Elements];
int zero_points[tflite::testing::kBias1Elements + 1];
float scales[tflite::testing::kBias1Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput1Shape, tflite::testing::kInput1Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter1Shape, tflite::testing::kFilter1Data,
filter_quantized, tflite::testing::kBias1Shape,
tflite::testing::kBias1Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput1Shape, tflite::testing::kGolden1Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
// Test group 2
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel2) {
const int output_dims_count = 80;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[tflite::testing::kInput2Elements];
int8_t filter_quantized[tflite::testing::kFilter2Elements];
int32_t bias_quantized[tflite::testing::kBias2Elements];
int8_t golden_quantized[tflite::testing::kOutput2Elements];
int8_t output_data[output_dims_count];
int zero_points[tflite::testing::kBias2Elements + 1];
float scales[tflite::testing::kBias2Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput2Shape, tflite::testing::kInput2Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter2Shape, tflite::testing::kFilter2Data,
filter_quantized, tflite::testing::kBias2Shape,
tflite::testing::kBias2Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput2Shape, tflite::testing::kGolden2Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel2) {
const int output_dims_count = 80;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
#pragma Bss(".Xdata")
static int8_t input_quantized[tflite::testing::kInput2Elements];
static int8_t filter_quantized[tflite::testing::kFilter2Elements];
static int32_t bias_quantized[tflite::testing::kBias2Elements];
static int8_t output_data[output_dims_count];
#pragma Bss()
int8_t golden_quantized[tflite::testing::kOutput2Elements];
int zero_points[tflite::testing::kBias2Elements + 1];
float scales[tflite::testing::kBias2Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput2Shape, tflite::testing::kInput2Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter2Shape, tflite::testing::kFilter2Data,
filter_quantized, tflite::testing::kBias2Shape,
tflite::testing::kBias2Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput2Shape, tflite::testing::kGolden2Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
// Test group 3
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel3) {
const int output_dims_count = 4;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[tflite::testing::kInput3Elements];
int8_t filter_quantized[tflite::testing::kFilter3Elements];
int32_t bias_quantized[tflite::testing::kBias3Elements];
int8_t golden_quantized[tflite::testing::kOutput3Elements];
int8_t output_data[output_dims_count];
int zero_points[tflite::testing::kBias3Elements + 1];
float scales[tflite::testing::kBias3Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput3Shape, tflite::testing::kInput3Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter3Shape, tflite::testing::kFilter3Data,
filter_quantized, tflite::testing::kBias3Shape,
tflite::testing::kBias3Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput3Shape, tflite::testing::kGolden3Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel3) {
const int output_dims_count = 4;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
#pragma Bss(".Xdata")
static int8_t input_quantized[tflite::testing::kInput3Elements];
static int8_t filter_quantized[tflite::testing::kFilter3Elements];
static int32_t bias_quantized[tflite::testing::kBias3Elements];
static int8_t output_data[output_dims_count];
#pragma Bss()
int8_t golden_quantized[tflite::testing::kOutput3Elements];
int zero_points[tflite::testing::kBias3Elements + 1];
float scales[tflite::testing::kBias3Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput3Shape, tflite::testing::kInput3Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter3Shape, tflite::testing::kFilter3Data,
filter_quantized, tflite::testing::kBias3Shape,
tflite::testing::kBias3Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput3Shape, tflite::testing::kGolden3Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
// Test group 4
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel4) {
const int output_dims_count = 8;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[tflite::testing::kInput4Elements];
int8_t filter_quantized[tflite::testing::kFilter4Elements];
int32_t bias_quantized[tflite::testing::kBias4Elements];
int8_t golden_quantized[tflite::testing::kOutput4Elements];
int8_t output_data[output_dims_count];
int zero_points[tflite::testing::kBias4Elements + 1];
float scales[tflite::testing::kBias4Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput4Shape, tflite::testing::kInput4Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter4Shape, tflite::testing::kFilter4Data,
filter_quantized, tflite::testing::kBias4Shape,
tflite::testing::kBias4Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput4Shape, tflite::testing::kGolden4Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel4) {
const int output_dims_count = 8;
const float input_scale = 1.0f;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
#pragma Bss(".Xdata")
static int8_t input_quantized[tflite::testing::kInput4Elements];
static int8_t filter_quantized[tflite::testing::kFilter4Elements];
static int32_t bias_quantized[tflite::testing::kBias4Elements];
static int8_t output_data[output_dims_count];
#pragma Bss()
int8_t golden_quantized[tflite::testing::kOutput4Elements];
int zero_points[tflite::testing::kBias4Elements + 1];
float scales[tflite::testing::kBias4Elements + 1];
tflite::testing::TestConvQuantizedPerChannel(
tflite::testing::kInput4Shape, tflite::testing::kInput4Data,
input_quantized, input_scale, input_zero_point,
tflite::testing::kFilter4Shape, tflite::testing::kFilter4Data,
filter_quantized, tflite::testing::kBias4Shape,
tflite::testing::kBias4Data, bias_quantized, scales, zero_points,
tflite::testing::kOutput4Shape, tflite::testing::kGolden4Data,
golden_quantized, output_data, output_scale, output_zero_point,
&tflite::testing::common_conv_params);
}
TF_LITE_MICRO_TESTS_END

View File

@ -0,0 +1,515 @@
/* Copyright 2017-2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace depthwise_conv {
namespace {
constexpr int kInputTensor = 0;
constexpr int kFilterTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
constexpr int kMaxChannels = 256;
// Depthwise conv is quantized along dimension 3:
// https://www.tensorflow.org/lite/performance/quantization_spec
constexpr int kDepthwiseConvQuantizedDimension = 3;
struct OpData {
TfLitePaddingValues padding;
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// Per channel output multiplier and shift.
int32_t per_channel_output_multiplier[kMaxChannels];
int32_t per_channel_output_shift[kMaxChannels];
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
};
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
const TfLiteDepthwiseConvParams* params) {
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
const int in_ch = SizeOfDimension(input, 3);
const int filters_num = SizeOfDimension(filter, 3);
// MLI optimized version only supports int8 datatype, dilation factor of 1 and
// per-axis quantization of weights (no broadcasting/per-tensor).
// The check ((in_ch == filters_num) || (in_ch == 1)) rules out the
// channel-multiplier logic for multichannel input.
bool ret_val = (filter->type == kTfLiteInt8) &&
(input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
(params->dilation_width_factor == 1) &&
(params->dilation_height_factor == 1) &&
(affine_quantization->scale->size ==
filter->dims->data[kDepthwiseConvQuantizedDimension]) &&
((in_ch == filters_num) || (in_ch == 1)) &&
affine_quantization->scale->size <= (kMaxChannels * 2);
return ret_val;
}
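// Illustrative examples of the routing decision above (hypothetical
// configurations, not taken from this change): an int8 depthwise convolution
// with int32 bias, per-axis quantized weights, 1x1 dilation and
// in_ch == filters_num is dispatched to the MLI kernel, while a uint8 model,
// a dilated depthwise convolution, or a channel multiplier on multichannel
// input falls back to the reference implementation below.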
TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, int width,
int height, int filter_width, int filter_height,
const TfLiteType data_type, bool mli_is_applicable,
OpData* data) {
bool has_bias = node->inputs->size == 3;
// Check number of inputs/outputs
TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
int unused_output_height, unused_output_width;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width, 1, 1, height, width,
filter_height, filter_width, params->padding, &unused_output_height,
&unused_output_width);
// Note that quantized inference requires that all tensors have their
// parameters set. This is usually done during quantized training.
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
if (data_type != kTfLiteFloat32 && !mli_is_applicable) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension];
// Ensure filter and bias channel count does not exceed space reserved for
// quantization metadata.
const auto filter_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
const auto bias_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(bias->quantization.params);
TF_LITE_ENSURE(context, filter_quantization->scale->size <= kMaxChannels);
TF_LITE_ENSURE(context, bias_quantization->scale->size <= kMaxChannels);
TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
context, input, filter, bias, output, params->activation,
&data->output_multiplier, &data->output_shift,
&data->output_activation_min, &data->output_activation_max,
data->per_channel_output_multiplier,
reinterpret_cast<int*>(data->per_channel_output_shift), num_channels));
}
#endif
return kTfLiteOk;
}
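// Illustrative sketch (not part of this kernel) of how the 'real multiplier'
// mentioned in OpData is turned into the fixed-point multiplier plus shift
// used above. It relies on the existing QuantizeMultiplier() helper from
// quantization_util.h; the numbers are approximate.
//
//   double real_multiplier = input_scale * filter_scale / output_scale;
//   int32_t quantized_multiplier;
//   int shift;
//   QuantizeMultiplier(real_multiplier, &quantized_multiplier, &shift);
//   // e.g. real_multiplier == 0.75 yields quantized_multiplier close to
//   // 0.75 * 2^31 and shift == 0; the kernel then rescales each int32
//   // accumulator with MultiplyByQuantizedMultiplier(acc,
//   // quantized_multiplier, shift) before adding the output zero point.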
} // namespace
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
tflite::reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus EvalMliQuantizedPerChannel(
TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params,
OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
// Run Depthwise Conv MLI kernel
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
mli_conv2d_cfg cfg = {};
// reuse space allocated for OpData parameters
mli_weights.el_params.asym.scale.pi16 =
(int16_t*)data->per_channel_output_multiplier;
mli_bias.el_params.asym.scale.pi16 = (int16_t*)data->per_channel_output_shift;
int16_t filter_zero_point = 0;
int16_t bias_zero_point = 0;
mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point;
mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point;
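// Note on the reuse above: OpData stores the per-channel multipliers and
// shifts as int32 arrays of kMaxChannels entries. MLI expects int16
// per-channel scales, so the same storage is reinterpreted as int16, which
// gives room for up to kMaxChannels * 2 channels. This is why
// IsMliApplicable() accepts scale->size up to kMaxChannels * 2.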
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensorPerChannel<int8_t>(filter, &mli_weights);
ConvertToMliTensorPerChannel<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
if (params->activation == kTfLiteActRelu) {
cfg.relu.type = MLI_RELU_GEN;
} else if (params->activation == kTfLiteActRelu6) {
cfg.relu.type = MLI_RELU_6;
} else if (params->activation == kTfLiteActRelu1) {
cfg.relu.type = MLI_RELU_1;
} else {
cfg.relu.type = MLI_RELU_NONE;
}
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
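// Example of the SAME-padding split above (illustrative numbers): a 3x3
// kernel with stride 1 needs a total padding of 2 in each direction, so
// data->padding.width == 1 with width_offset == 0, giving padding_left = 1
// and padding_right = 1. When the total is odd, the extra row/column goes to
// the bottom/right via the *_offset term.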
// for height slicing
const int heightDimension = 1;
int inSliceHeight = 0;
int outSliceHeight = 0;
const int kernelHeight =
static_cast<int>(mli_weights.shape[KRNL_DW_H_DIM_HWC]);
const int overlap = kernelHeight - cfg.stride_height;
// for weight slicing (on output channels)
// HWCN layout for weights, so the output channel dimension is the last one.
const int weight_out_ch_dimension = 3;
// bias has only 1 dimension
const int bias_out_ch_dimension = 0;
// Batch-Height-Width-Channel layout means last dimension is output channels.
const int out_tensor_ch_dimension = 3;
const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension];
const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension];
int slice_channels =
static_cast<int>(mli_weights.shape[weight_out_ch_dimension]);
// Tensors for data in fast (local) memory
// and config to copy data from external to local memory
mli_tensor weights_local = mli_weights;
mli_tensor bias_local = mli_bias;
mli_tensor in_local = mli_in;
mli_tensor out_local = mli_out; // this assumes that output shape
// is already filled in the tensor struct.
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
and there is no need to copy it to the local tensor. */
const bool in_is_local = in_local.data == mli_in.data;
const bool out_is_local = out_local.data == mli_out.data;
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top,
cfg.padding_bottom, &inSliceHeight, &outSliceHeight));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels));
/* If the number of input channels is not equal to the number of output
channels, a channel multiplier is used. In this case the slice channels
need to be rounded down to a multiple of the input channels. */
if (in_channels != out_channels) {
slice_channels = (slice_channels / in_channels) * in_channels;
}
TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels);
TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0,
0, 0, true);
TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels,
0, 0, 0, true);
TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0,
0, 0, true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
void* input_buffer_ptr = NULL;
int input_buffer_size = 0;
int padding_top = cfg.padding_top;
int padding_bottom = cfg.padding_bottom;
while (!w_slice.Done()) {
mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
/* The input tensor is already sliced in the channel dimension.
in_ch_slice.Sub() is the tensor for the number of channels of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimensions. in_ch_slice.Sub() contains
batches of HWC tensors, so it is a 4-dimensional tensor. Because the MLI
kernel processes one HWC tensor at a time, the 4-dimensional tensor
needs to be sliced into nBatch 3-dimensional tensors. On top of that,
there can also be a need to slice in the height dimension, for which the
slice height has been calculated. The tensor slicer is configured so that
it completely slices the nBatch dimension (0) and slices the height
dimension (1) in chunks of 'inSliceHeight'. */
TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight,
padding_top, padding_bottom, overlap);
/* The output tensor is already sliced in the output channel dimension.
out_ch_slice.Sub() is the tensor for the number of output channels of this
iteration of the weight slice loop. This tensor needs to be further
sliced over the batch and height dimensions. */
TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight);
/* Set up the pointers to the local or remote tensor to make the code
* inside the loop easier. */
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
while (!out_slice.Done()) {
TF_LITE_ENSURE(context, !in_slice.Done());
cfg.padding_top = in_slice.GetPaddingPre();
cfg.padding_bottom = in_slice.GetPaddingPost();
// If the input is the same as in the previous iteration, skip the input copy.
if ((in_slice.Sub()->data != input_buffer_ptr) ||
(mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data;
input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0);
}
mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg,
out_ptr);
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
out_slice.Next();
}
w_slice.Next();
b_slice.Next();
out_ch_slice.Next();
in_ch_slice.Next();
TF_LITE_ENSURE(context, in_slice.Done());
}
return kTfLiteOk;
}
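// Schematic view of the slicing performed in EvalMliQuantizedPerChannel above
// (simplified, for illustration only):
//
//   for (each output-channel slice of weights/bias) {      // w_slice, b_slice
//     copy the weight and bias slices into local memory;
//     for (each batch/height slice of the input/output) {  // in_slice, out_slice
//       copy the input slice into local memory (skipped if unchanged);
//       run mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32 on the local tensors;
//       copy the output slice back to its destination;
//     }
//   }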
TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params,
OpData* data, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
DepthwiseParams op_params;
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = 0;
op_params.output_offset = output->params.zero_point;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_integer_ops::DepthwiseConvPerChannel(
op_params, data->per_channel_output_multiplier,
data->per_channel_output_shift, GetTensorShape(input),
GetTensorData<int8>(input), GetTensorShape(filter),
GetTensorData<int8>(filter), GetTensorShape(bias),
GetTensorData<int32>(bias), GetTensorShape(output),
GetTensorData<int8>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Node configuration is not supported by ARC MLI Library.");
return kTfLiteError;
#endif
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteDepthwiseConvParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
tflite::DepthwiseParams op_params;
// Padding type is ignored, but still set.
op_params.padding_type = PaddingType::kSame;
op_params.padding_values.width = data->padding.width;
op_params.padding_values.height = data->padding.height;
op_params.stride_width = params->stride_width;
op_params.stride_height = params->stride_height;
op_params.dilation_width_factor = params->dilation_width_factor;
op_params.dilation_height_factor = params->dilation_height_factor;
op_params.depth_multiplier = params->depth_multiplier;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
tflite::reference_ops::DepthwiseConv(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(filter), GetTensorData<uint8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<uint8_t>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
const TfLiteTensor* bias =
(NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
const TfLiteType data_type = input->type;
int width = SizeOfDimension(input, 2);
int height = SizeOfDimension(input, 1);
int filter_width = SizeOfDimension(filter, 2);
int filter_height = SizeOfDimension(filter, 1);
OpData data;
// All per-channel quantized tensors need valid zero point and scale arrays.
if (input->type == kTfLiteInt8) {
TF_LITE_ENSURE_EQ(context, filter->quantization.type,
kTfLiteAffineQuantization);
const auto* affine_quantization =
reinterpret_cast<TfLiteAffineQuantization*>(
filter->quantization.params);
TF_LITE_ENSURE(context, affine_quantization);
TF_LITE_ENSURE(context, affine_quantization->scale);
TF_LITE_ENSURE(context, affine_quantization->zero_point);
TF_LITE_ENSURE(
context, affine_quantization->scale->size == 1 ||
affine_quantization->scale->size ==
filter->dims->data[kDepthwiseConvQuantizedDimension]);
TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size,
affine_quantization->zero_point->size);
}
bool mli_is_applicable =
IsMliApplicable(context, input, filter, bias, params);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
filter_width, filter_height, data_type,
mli_is_applicable, &data));
switch (input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
return EvalFloat(context, node, params, &data, input, filter, bias,
output);
break;
case kTfLiteInt8:
if (mli_is_applicable) {
return EvalMliQuantizedPerChannel(context, node, params, &data, input,
filter, bias, output);
} else {
return EvalQuantizedPerChannel(context, node, params, &data, input,
filter, bias, output);
}
break;
case kTfLiteUInt8:
return EvalQuantized(context, node, params, &data, input, filter, bias,
output);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace depthwise_conv
TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/depthwise_conv::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,550 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test checks that the slicing logic doesn't affect the result of the
// depthwise convolution kernel.
//
// This test doesn't replace the default depthwise convolution test
// (tensorflow/lite/micro/kernels/depthwise_conv_test.cc). It is added to the
// whole test set only in case MLI for the ARC platform is used during
// generation (which is handled in arc_mli.inc), so such tests won't be
// generated for other platforms.
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/micro/testing/micro_test.h"
#include "tensorflow/lite/micro/testing/test_utils.h"
namespace tflite {
namespace testing {
namespace {
constexpr int kMaxFilterChannels = 64;
constexpr int kMaxBiasChannels = 64;
// Index of the output tensor in context->tensors, specific to
// DepthwiseConv.
constexpr int kOutputTensorIndex = 3;
// Creates a DepthwiseConv operator, calls it with the provided input tensors
// and some default parameters, and compares the output with
// expected_output_data.
//
// The tensors parameter contains both the input tensors as well as a
// preallocated output tensor into which the output is stored.
template <typename T>
TfLiteStatus ValidateDepthwiseConvGoldens(const T* expected_output_data,
int output_length,
TfLiteFusedActivation activation,
float tolerance, int tensors_size,
TfLiteTensor* tensors) {
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
int input_depth = tensors[0].dims->data[3];
int output_depth = tensors[1].dims->data[3];
int depth_mul = output_depth / input_depth;
TfLiteDepthwiseConvParams builtin_data;
builtin_data.padding = kTfLitePaddingValid;
builtin_data.activation = activation;
builtin_data.stride_height = 1;
builtin_data.stride_width = 1;
builtin_data.dilation_height_factor = 1;
builtin_data.dilation_width_factor = 1;
builtin_data.depth_multiplier = depth_mul;
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {3, 0, 1, 2};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 3};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TfLiteStatus invoke_status = registration->invoke(&context, &node);
if (invoke_status != kTfLiteOk) {
return invoke_status;
}
if (registration->free) {
registration->free(&context, user_data);
}
const T* output_data = tflite::GetTensorData<T>(&tensors[kOutputTensorIndex]);
for (int i = 0; i < output_length; ++i) {
TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i],
tolerance);
}
return kTfLiteOk;
}
void TestDepthwiseConvQuantizedPerChannel(
const int* input_dims_data, const float* input_data,
int8_t* input_quantized, float input_scale, int input_zero_point,
const int* filter_dims_data, const float* filter_data,
int8_t* filter_data_quantized, const int* bias_dims_data,
const float* bias_data, int32_t* bias_data_quantized,
const int* output_dims_data, const float* expected_output_data,
int8_t* expected_output_data_quantized, int8_t* output_data,
float output_scale, int output_zero_point,
TfLiteFusedActivation activation) {
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data);
TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
int filter_zero_points[kMaxFilterChannels];
float filter_scales[kMaxFilterChannels];
int bias_zero_points[kMaxBiasChannels];
float bias_scales[kMaxBiasChannels];
TfLiteAffineQuantization filter_quant;
TfLiteAffineQuantization bias_quant;
TfLiteTensor input_tensor =
CreateQuantizedTensor(input_data, input_quantized, input_dims,
input_scale, input_zero_point, "input_tensor");
TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor(
filter_data, filter_data_quantized, filter_dims, filter_scales,
filter_zero_points, &filter_quant, 3 /* quantized dimension */,
"filter_tensor");
TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor(
bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1],
bias_scales, bias_zero_points, &bias_quant, 3 /* quantized dimension */,
"bias_tensor");
TfLiteTensor output_tensor =
CreateQuantizedTensor(output_data, output_dims, output_scale,
input_zero_point, "output_tensor");
float input_scales[] = {1, input_scale};
int input_zero_points[] = {1, input_zero_point};
TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales),
IntArrayFromInts(input_zero_points)};
input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant};
float output_scales[] = {1, output_scale};
int output_zero_points[] = {1, output_zero_point};
TfLiteAffineQuantization output_quant = {
FloatArrayFromFloats(output_scales),
IntArrayFromInts(output_zero_points)};
output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant};
constexpr int inputs_size = 3;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
input_tensor,
filter_tensor,
bias_tensor,
output_tensor,
};
AsymmetricQuantize(expected_output_data, expected_output_data_quantized,
output_dims_count, output_scale, output_zero_point);
TF_LITE_MICRO_EXPECT_EQ(
kTfLiteOk, ValidateDepthwiseConvGoldens(expected_output_data_quantized,
output_dims_count, activation,
1.0, tensors_size, tensors));
}
} // namespace
} // namespace testing
} // namespace tflite
TF_LITE_MICRO_TESTS_BEGIN
// Test group 1
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel1) {
const int input_elements = 20;
const int input_shape[] = {4, 1, 5, 2, 2};
const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int filter_elements = 36;
const int filter_shape[] = {4, 2, 3, 3, 2};
const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_elements = 2;
const int bias_shape[] = {4, 1, 1, 1, 2};
const int output_elements = 20;
const float bias_values[] = {2, 2};
const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
const int output_shape[] = {4, 1, 5, 2, 2};
const int output_dims_count = 20;
int8_t output_data[output_dims_count];
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel1) {
const int input_elements = 20;
const int input_shape[] = {4, 1, 5, 2, 2};
const int filter_elements = 36;
const int filter_shape[] = {4, 2, 3, 3, 2};
const int bias_elements = 2;
const int bias_shape[] = {4, 1, 1, 1, 2};
const int output_elements = 20;
const int output_shape[] = {4, 1, 5, 2, 2};
const int output_dims_count = 20;
#pragma Bss(".Zdata")
const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const float bias_values[] = {2, 2};
int8_t output_data[output_dims_count];
#pragma Bss()
const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
// Test group 2
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel2) {
const int input_elements = 80;
const int input_shape[] = {4, 1, 20, 2, 2};
const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int filter_elements = 36;
const int filter_shape[] = {4, 2, 3, 3, 2};
const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_elements = 2;
const int bias_shape[] = {4, 1, 1, 1, 2};
const int output_elements = 80;
const float bias_values[] = {2, 2};
const float golden[] = {
34, 34, 34, 34, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
const int output_shape[] = {4, 1, 20, 2, 2};
const int output_dims_count = 80;
int8_t output_data[output_dims_count];
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel2) {
const int input_elements = 80;
const int input_shape[] = {4, 1, 20, 2, 2};
const int filter_elements = 36;
const int filter_shape[] = {4, 2, 3, 3, 2};
const int bias_elements = 2;
const int bias_shape[] = {4, 1, 1, 1, 2};
const int output_elements = 80;
const int output_shape[] = {4, 1, 20, 2, 2};
const int output_dims_count = 80;
#pragma Bss(".Zdata")
float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
float bias_values[] = {2, 2};
int8_t output_data[output_dims_count];
#pragma Bss()
const float golden[] = {
34, 34, 34, 34, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 34, 34, 34, 34};
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
// Test group 3
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel3) {
const int input_elements = 40;
const int input_shape[] = {4, 1, 2, 2, 10};
const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int filter_elements = 90;
const int filter_shape[] = {4, 1, 3, 3, 10};
const float filter_values[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int bias_elements = 1;
const int bias_shape[] = {4, 1, 1, 1, 1};
const int output_elements = 4;
const float bias_values[] = {1};
const float golden[] = {41, 41, 41, 41};
const int output_shape[] = {4, 1, 2, 2, 1};
const int output_dims_count = 4;
int8_t output_data[output_dims_count];
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel3) {
const int input_elements = 40;
const int input_shape[] = {4, 1, 2, 2, 10};
const int filter_elements = 90;
const int filter_shape[] = {4, 1, 3, 3, 10};
const int bias_elements = 1;
const int bias_shape[] = {4, 1, 1, 1, 1};
const int output_elements = 4;
const int output_shape[] = {4, 1, 2, 2, 1};
const int output_dims_count = 4;
#pragma Bss(".Zdata")
float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float filter_values[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float bias_values[] = {1};
int8_t output_data[output_dims_count];
#pragma Bss()
const float golden[] = {41, 41, 41, 41};
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
// Test group 4
TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel4) {
const int input_elements = 80;
const int input_shape[] = {4, 1, 4, 2, 10};
const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int filter_elements = 90;
const int filter_shape[] = {4, 1, 3, 3, 10};
const float filter_values[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int bias_elements = 1;
const int bias_shape[] = {4, 1, 1, 1, 1};
const int output_elements = 8;
const float bias_values[] = {1};
const float golden[] = {41, 41, 61, 61, 61, 61, 41, 41};
const int output_shape[] = {4, 1, 4, 2, 1};
const int output_dims_count = 8;
int8_t output_data[output_dims_count];
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel4) {
const int input_elements = 80;
const int input_shape[] = {4, 1, 4, 2, 10};
const int filter_elements = 90;
const int filter_shape[] = {4, 1, 3, 3, 10};
const int bias_elements = 1;
const int bias_shape[] = {4, 1, 1, 1, 1};
const int output_elements = 8;
const int output_shape[] = {4, 1, 4, 2, 1};
const int output_dims_count = 8;
#pragma Bss(".Zdata")
float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float filter_values[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
float bias_values[] = {1};
int8_t output_data[output_dims_count];
#pragma Bss()
const float golden[] = {41, 41, 61, 61, 61, 61, 41, 41};
const float input_scale = 1.0;
const float output_scale = 1.0f;
const int input_zero_point = 0;
const int output_zero_point = 0;
int8_t input_quantized[input_elements];
int8_t filter_quantized[filter_elements];
int32_t bias_quantized[bias_elements];
int8_t golden_quantized[output_elements];
int zero_points[bias_elements + 1];
float scales[bias_elements + 1];
tflite::testing::TestDepthwiseConvQuantizedPerChannel(
input_shape, input_values, input_quantized, input_scale, input_zero_point,
filter_shape, filter_values, filter_quantized, bias_shape, bias_values,
bias_quantized, output_shape, golden, golden_quantized, output_data,
output_scale, output_zero_point, kTfLiteActNone);
}
TF_LITE_MICRO_TESTS_END

View File

@ -0,0 +1,385 @@
/* Copyright 2017-2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace fully_connected {
namespace {
struct OpData {
// The scaling factor from input to output (aka the 'real multiplier') can
// be represented as a fixed point multiplier plus a left shift.
int32_t output_multiplier;
int output_shift;
// The range of the fused activation layer. For example for kNone and
// uint8_t these would be 0 and 255.
int32_t output_activation_min;
int32_t output_activation_max;
// The index of the temporary tensor where the quantized inputs are cached.
int input_quantized_index;
};
constexpr int kInputTensor = 0;
constexpr int kWeightsTensor = 1;
constexpr int kBiasTensor = 2;
constexpr int kOutputTensor = 0;
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
const TfLiteFullyConnectedParams* params) {
// MLI optimized version only supports the int8 datatype, no fused ReLU, and
// symmetric per-tensor quantization of weights (not per-axis).
bool ret_val = (filter->type == kTfLiteInt8) &&
(input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) &&
(params->activation == kTfLiteActNone) &&
(filter->params.zero_point == 0);
return ret_val;
}
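// Illustrative examples of the routing decision above (hypothetical
// configurations): an int8 fully connected layer with int32 bias, symmetric
// per-tensor weights (zero point 0) and no fused activation is dispatched to
// the MLI kernel; float or uint8 inputs, or a fused activation such as ReLU,
// take the reference paths below.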
TfLiteStatus CalculateOpData(TfLiteContext* context,
TfLiteFullyConnectedParams* params,
TfLiteType data_type, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output,
OpData* data) {
TfLiteStatus status = kTfLiteOk;
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
if (data_type != kTfLiteFloat32 &&
!IsMliApplicable(context, input, filter, bias, params)) {
double real_multiplier = 0.0;
TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
context, input, filter, bias, output, &real_multiplier));
int exponent;
QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
data->output_shift = -exponent;
TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
context, params->activation, output, &data->output_activation_min,
&data->output_activation_max));
}
#endif
return status;
}
} // namespace
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
OpData* data = nullptr;
TfLiteStatus status = context->AllocatePersistentBuffer(
context, sizeof(OpData), reinterpret_cast<void**>(&data));
if (status != kTfLiteOk || data == nullptr) {
return nullptr;
}
return data;
}
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
OpData* data = reinterpret_cast<OpData*>(node->user_data);
auto* params =
reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE(context, data != nullptr);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
TF_LITE_ENSURE_MSG(context, input->type == filter->type,
"Hybrid models are not supported on TFLite Micro.");
TfLiteType data_type = input->type;
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
filter, bias, output, data));
return kTfLiteOk;
}
TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params,
OpData* data, const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias,
TfLiteTensor* output) {
mli_tensor mli_in = {0};
mli_tensor mli_weights = {0};
mli_tensor mli_bias = {0};
mli_tensor mli_out = {0};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(filter, &mli_weights);
ConvertToMliTensor<int32_t>(bias, &mli_bias);
ConvertToMliTensor<int8_t>(output, &mli_out);
/* The input tensor can have more than 2 dimensions. For the computation this
doesn't make any difference, because all inputs of a batch entry will be
used anyway. Because the MLI kernel doesn't recognize the multiple
dimensions, the tensor shape is cast to a {batchnum, inputsize} shape. */
mli_in.shape[0] = mli_out.shape[0];
mli_in.shape[1] = mli_weights.shape[1];
mli_in.shape[2] = 0;
mli_in.shape[3] = 0;
mli_in.rank = 2;
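// Illustrative example (hypothetical shapes): an input of shape {2, 4, 5, 3}
// combined with weights of shape {N, 60} is presented to the MLI kernel as a
// 2-D tensor of shape {batchnum = 2, inputsize = 60}; only the shape metadata
// is rewritten, the underlying data is left untouched.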
// Tensors for data in fast (local) memory and config to copy data from
// external to local memory
mli_tensor weights_local = mli_weights;
mli_tensor bias_local = mli_bias;
mli_tensor in_local = mli_in;
mli_tensor out_local = mli_out;
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
const int weight_out_dimension = 0;
const int out_tensor_dimension = 1;
const int input_size_dimension = 1;
int slice_size = mli_weights.shape[weight_out_dimension];
/* allocate the local buffers, and compute the slice size */
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors(
context, &in_local, &weights_local, &bias_local, &out_local));
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(
&weights_local, &bias_local, weight_out_dimension, &slice_size));
int max_out_slice_size =
out_local.capacity / mli_hlp_tensor_element_size(&out_local);
if (slice_size > max_out_slice_size) slice_size = max_out_slice_size;
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
and there is no need to copy it to the local tensor. */
const bool in_is_local = in_local.data == mli_in.data;
const bool out_is_local = out_local.data == mli_out.data;
const bool w_is_local = weights_local.data == mli_weights.data;
const bool b_is_local = bias_local.data == mli_bias.data;
TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size);
TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size);
TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0,
true);
mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local;
mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local;
void* input_buffer_ptr = NULL;
while (!w_slice.Done()) {
mli_mov_tensor_sync(w_slice.Sub(), &copy_config, w_ptr);
mli_mov_tensor_sync(b_slice.Sub(), &copy_config, b_ptr);
// Slice the input over the batches (one at a time with the size of a
// complete input)
TensorSlicer in_slice(&mli_in, input_size_dimension,
mli_in.shape[input_size_dimension]);
/* The output tensor is already sliced in the output size dimension.
out_ch_slice.Sub() is the tensor for the portion of the output size handled
in this iteration of the weight slice loop. This tensor needs to be further
sliced over the batch. */
TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension,
slice_size);
/* Set up the pointers to the local or remote tensor to make the code
* inside the loop easier. */
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
while (!out_slice.Done()) {
// If the input is the same as in the previous iteration, skip the input copy.
if (in_slice.Sub()->data != input_buffer_ptr) {
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
input_buffer_ptr = in_slice.Sub()->data;
}
mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr);
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
out_slice.Next();
}
w_slice.Next();
b_slice.Next();
out_ch_slice.Next();
}
return kTfLiteOk;
}
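// Schematic view of the slicing performed in EvalMliQuantizedInt8 above
// (simplified, for illustration only):
//
//   for (each output-neuron slice of weights/bias) {   // w_slice, b_slice
//     copy the weight and bias slices into local memory;
//     for (each batch entry of the input/output) {     // in_slice, out_slice
//       copy the input row into local memory (skipped if unchanged);
//       run mli_krn_fully_connected_sa8_sa8_sa32 on the local tensors;
//       copy the output slice back to its destination;
//     }
//   }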
TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
FullyConnectedParams op_params;
op_params.input_offset = -input->params.zero_point;
op_params.weights_offset = -filter->params.zero_point;
op_params.output_offset = output->params.zero_point;
op_params.output_multiplier = data->output_multiplier;
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
reference_integer_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(filter), GetTensorData<int8_t>(filter),
GetTensorShape(bias), GetTensorData<int32_t>(bias),
GetTensorShape(output), GetTensorData<int8_t>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Node configuration is not supported by ARC MLI Library.");
return kTfLiteError;
#endif
}
TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input,
const TfLiteTensor* filter, const TfLiteTensor* bias,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
const int32_t input_offset = -input->params.zero_point;
const int32_t filter_offset = -filter->params.zero_point;
const int32_t output_offset = output->params.zero_point;
tflite::FullyConnectedParams op_params;
op_params.input_offset = input_offset;
op_params.weights_offset = filter_offset;
op_params.output_offset = output_offset;
op_params.output_multiplier = data->output_multiplier;
// Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
op_params.output_shift = -data->output_shift;
op_params.quantized_activation_min = data->output_activation_min;
op_params.quantized_activation_max = data->output_activation_max;
#define TF_LITE_FULLY_CONNECTED(output_data_type) \
reference_ops::FullyConnected( \
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
GetTensorShape(filter), GetTensorData<uint8_t>(filter), \
GetTensorShape(bias), GetTensorData<int32_t>(bias), \
GetTensorShape(output), GetTensorData<output_data_type>(output))
switch (output->type) {
case kTfLiteUInt8:
TF_LITE_FULLY_CONNECTED(uint8_t);
break;
case kTfLiteInt16:
TF_LITE_FULLY_CONNECTED(int16_t);
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(output->type), output->type);
return kTfLiteError;
}
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLiteFullyConnectedParams* params, OpData* data,
const TfLiteTensor* input, const TfLiteTensor* filter,
const TfLiteTensor* bias, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float output_activation_min, output_activation_max;
CalculateActivationRange(params->activation, &output_activation_min,
&output_activation_max);
tflite::FullyConnectedParams op_params;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;
tflite::reference_ops::FullyConnected(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(filter), GetTensorData<float>(filter),
GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
GetTensorData<float>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
auto* params =
reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
OpData* data = reinterpret_cast<OpData*>(node->user_data);
TF_LITE_ENSURE(context, data != nullptr);
// Checks in Prepare ensure input, output and filter types are all the same.
switch (input->type) {
case kTfLiteFloat32:
return EvalFloat(context, node, params, data, input, filter, bias,
output);
case kTfLiteInt8:
if (IsMliApplicable(context, input, filter, bias, params)) {
return EvalMliQuantizedInt8(context, node, params, data, input, filter,
bias, output);
} else {
return EvalQuantizedInt8(context, node, params, data, input, filter,
bias, output);
}
case kTfLiteUInt8:
return EvalQuantized(context, node, params, data, input, filter, bias,
output);
default:
TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
TfLiteTypeGetName(filter->type), filter->type);
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace fully_connected
TfLiteRegistration* Register_FULLY_CONNECTED() {
static TfLiteRegistration r = {/*init=*/fully_connected::Init,
/*free=*/nullptr,
/*prepare=*/fully_connected::Prepare,
/*invoke=*/fully_connected::Eval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,425 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test checks that the slicing logic doesn't affect the result of the
// fully connected kernel.
//
// This test doesn't replace the default fully connected test
// (tensorflow/lite/micro/kernels/fully_connected_test.cc). It is added to the
// whole test set only in case MLI for the ARC platform is used during
// generation (which is handled in arc_mli.inc), so such tests won't be
// generated for other platforms.
#include <cstdint>
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/micro/testing/micro_test.h"
#include "tensorflow/lite/micro/testing/test_utils.h"
namespace tflite {
namespace testing {
namespace {
template <typename T>
void TestFullyConnectedQuantized(
const int* input_dims_data, const T* input_data, const float input_min,
const float input_max, const int* weights_dims_data, const T* weights_data,
const float weights_min, const float weights_max, const int* bias_dims_data,
const int32_t* bias_data, const float bias_scale,
const T* expected_output_data, const int* output_dims_data,
const float output_min, const float output_max,
TfLiteFusedActivation activation, T* output_data) {
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data);
TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
constexpr int inputs_size = 3;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
input_max),
CreateQuantizedTensor(weights_data, weights_dims, "weights_tensor",
weights_min, weights_max),
CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_scale),
CreateQuantizedTensor(output_data, output_dims, "output_tensor",
output_min, output_max),
};
tensors[0].params.zero_point = 0;
tensors[1].params.zero_point = 0;
tensors[3].params.zero_point = 0;
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 4);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
TfLiteFullyConnectedParams builtin_data = {
activation,
kTfLiteFullyConnectedWeightsFormatDefault,
};
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {3, 0, 1, 2};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 3};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
if (registration->free) {
registration->free(&context, user_data);
}
for (int i = 0; i < output_dims_count; ++i) {
TF_LITE_MICRO_EXPECT_EQ(expected_output_data[i], output_data[i]);
}
}
} // namespace
} // namespace testing
} // namespace tflite
TF_LITE_MICRO_TESTS_BEGIN
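// The expected values below follow directly from the constant test data: with
// every input and weight element equal to 2 and every bias element equal to 1,
// each output element equals accum_depth * (2 * 2) + 1, i.e. 41 for
// accum_depth 10 (groups 1 and 4), 17 for accum_depth 4 (group 2) and 21 for
// accum_depth 5 (group 3).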
// Test group 1
TF_LITE_MICRO_TEST(SystemSimpleTestQuantized1) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data[] = {2, 2, 10};
const int8_t input_data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int weights_dims_data[] = {2, 3, 10};
const int8_t weights_data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_dims_data[] = {1, 3};
const int32_t bias_data[] = {1, 1, 1};
const int8_t expected_output_data[] = {41, 41, 41, 41, 41, 41};
const int output_dims_data[] = {2, 2, 3};
const int output_dims_count = 6;
int8_t output_data[output_dims_count];
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data, input_data, input_min, input_max, weights_dims_data,
weights_data, weights_min, weights_max, bias_dims_data, bias_data,
bias_scale, expected_output_data, output_dims_data, output_min,
output_max, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalSimpleTestQuantized1) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_local[] = {2, 2, 10};
const int weights_dims_data_local[] = {2, 3, 10};
const int bias_dims_data_local[] = {1, 3};
const int output_dims_data_local[] = {2, 2, 3};
const int output_dims_count = 6;
#pragma Bss(".Zdata")
const int8_t input_data_local[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int8_t weights_data_local[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int32_t bias_data_local[] = {1, 1, 1};
int8_t output_data_local[output_dims_count];
#pragma Bss()
const int8_t expected_output_data[] = {41, 41, 41, 41, 41, 41};
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_local, input_data_local, input_min, input_max,
weights_dims_data_local, weights_data_local, weights_min, weights_max,
bias_dims_data_local, bias_data_local, bias_scale, expected_output_data,
output_dims_data_local, output_min, output_max, kTfLiteActNone,
output_data_local);
}
// Test group 2
TF_LITE_MICRO_TEST(SystemSimpleTestQuantized2) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_2[] = {2, 10, 4};
const int8_t input_data_2[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int weights_dims_data_2[] = {2, 6, 4};
const int8_t weights_data_2[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_dims_data_2[] = {1, 6};
const int32_t bias_data_2[] = {1, 1, 1, 1, 1, 1};
const int8_t expected_output_data_2[] = {
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
const int output_dims_data_2[] = {2, 10, 6};
const int output_dims_count_2 = 60;
int8_t output_data_2[output_dims_count_2];
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_2, input_data_2, input_min, input_max,
weights_dims_data_2, weights_data_2, weights_min, weights_max,
bias_dims_data_2, bias_data_2, bias_scale, expected_output_data_2,
output_dims_data_2, output_min, output_max, kTfLiteActNone,
output_data_2);
}
TF_LITE_MICRO_TEST(LocalSimpleTestQuantized2) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_local_2[] = {2, 10, 4};
const int weights_dims_data_local_2[] = {2, 6, 4};
const int bias_dims_data_local_2[] = {1, 6};
const int output_dims_data_local_2[] = {2, 10, 6};
const int output_dims_count_local_2 = 60;
#pragma Bss(".Zdata")
const int8_t input_data_local_2[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int8_t weights_data_local_2[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int32_t bias_data_local_2[] = {1, 1, 1, 1, 1, 1};
int8_t output_data_local_2[output_dims_count_local_2];
#pragma Bss()
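// As in SystemSimpleTestQuantized2 above, each of the 60 output elements is
// expected to be 4 * (2 * 2) + 1 = 17.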
const int8_t expected_output_data_local_2[] = {
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_local_2, input_data_local_2, input_min, input_max,
weights_dims_data_local_2, weights_data_local_2, weights_min, weights_max,
bias_dims_data_local_2, bias_data_local_2, bias_scale,
expected_output_data_local_2, output_dims_data_local_2, output_min,
output_max, kTfLiteActNone, output_data_local_2);
}
// Test group 3
TF_LITE_MICRO_TEST(SystemSimpleTestQuantized3) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_3[] = {2, 2, 5};
const int8_t input_data_3[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int weights_dims_data_3[] = {2, 10, 5};
const int8_t weights_data_3[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_dims_data_3[] = {1, 10};
const int32_t bias_data_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int8_t expected_output_data_3[] = {21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21};
const int output_dims_data_3[] = {2, 2, 10};
const int output_dims_count_3 = 20;
int8_t output_data_3[output_dims_count_3];
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_3, input_data_3, input_min, input_max,
weights_dims_data_3, weights_data_3, weights_min, weights_max,
bias_dims_data_3, bias_data_3, bias_scale, expected_output_data_3,
output_dims_data_3, output_min, output_max, kTfLiteActNone,
output_data_3);
}
TF_LITE_MICRO_TEST(LocalSimpleTestQuantized3) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_local_3[] = {2, 2, 5};
const int weights_dims_data_local_3[] = {2, 10, 5};
const int bias_dims_data_local_3[] = {1, 10};
const int output_dims_data_local_3[] = {2, 2, 10};
const int output_dims_count_local_3 = 20;
#pragma Bss(".Zdata")
static int8_t input_data_local_3[10];
static int8_t weights_data_local_3[50];
static int32_t bias_data_local_3[10];
static int8_t output_data_local_3[output_dims_count_local_3];
#pragma Bss()
for (int i = 0; i < 10; ++i) {
input_data_local_3[i] = 2;
}
for (int i = 0; i < 50; ++i) {
weights_data_local_3[i] = 2;
}
for (int i = 0; i < 10; ++i) {
bias_data_local_3[i] = 1;
}
for (int i = 0; i < 20; ++i) {
output_data_local_3[i] = 0;
}
const int8_t expected_output_data_local_3[] = {21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21, 21,
21, 21, 21, 21, 21, 21};
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_local_3, input_data_local_3, input_min, input_max,
weights_dims_data_local_3, weights_data_local_3, weights_min, weights_max,
bias_dims_data_local_3, bias_data_local_3, bias_scale,
expected_output_data_local_3, output_dims_data_local_3, output_min,
output_max, kTfLiteActNone, output_data_local_3);
}
// Test group 4
TF_LITE_MICRO_TEST(SystemSimpleTestQuantized4) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_4[] = {2, 5, 10};
const int8_t input_data_4[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int weights_dims_data_4[] = {2, 5, 10};
const int8_t weights_data_4[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int bias_dims_data_4[] = {1, 5};
const int32_t bias_data_4[] = {1, 1, 1, 1, 1};
const int8_t expected_output_data_4[] = {41, 41, 41, 41, 41, 41, 41, 41, 41,
41, 41, 41, 41, 41, 41, 41, 41, 41,
41, 41, 41, 41, 41, 41, 41};
const int output_dims_data_4[] = {2, 5, 5};
const int output_dims_count_4 = 25;
int8_t output_data_4[output_dims_count_4];
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_4, input_data_4, input_min, input_max,
weights_dims_data_4, weights_data_4, weights_min, weights_max,
bias_dims_data_4, bias_data_4, bias_scale, expected_output_data_4,
output_dims_data_4, output_min, output_max, kTfLiteActNone,
output_data_4);
}
TF_LITE_MICRO_TEST(LocalSimpleTestQuantized4) {
const float input_min = -128.0f;
const float input_max = 127.0f;
const float weights_min = -128.0f;
const float weights_max = 127.0f;
const float bias_scale = 1.0f;
const float output_min = -128.0f;
const float output_max = 127.0f;
const int input_dims_data_local_4[] = {2, 5, 10};
const int weights_dims_data_local_4[] = {2, 5, 10};
const int bias_dims_data_local_4[] = {1, 5};
const int output_dims_data_local_4[] = {2, 5, 5};
const int output_dims_count_local_4 = 25;
#pragma Bss(".Zdata")
const int8_t input_data_local_4[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int8_t weights_data_local_4[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
const int32_t bias_data_local_4[] = {1, 1, 1, 1, 1};
int8_t output_data_local_4[output_dims_count_local_4];
#pragma Bss()
const int8_t expected_output_data_local_4[] = {
41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41};
tflite::testing::TestFullyConnectedQuantized<int8_t>(
input_dims_data_local_4, input_data_local_4, input_min, input_max,
weights_dims_data_local_4, weights_data_local_4, weights_min, weights_max,
bias_dims_data_local_4, bias_data_local_4, bias_scale,
expected_output_data_local_4, output_dims_data_local_4, output_min,
output_max, kTfLiteActNone, output_data_local_4);
}
TF_LITE_MICRO_TESTS_END

View File

@ -0,0 +1,126 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "mli_slicers.h" // NOLINT
#include <algorithm>
namespace tflite {
namespace ops {
namespace micro {
TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim,
int slice_size, int padding_pre, int padding_post,
int overlap, bool interleave_mode)
: full_tensor_(full_tensor),
sliceDim_(slice_dim),
pad_pre_(padding_pre),
pad_post_(padding_post),
overlap_(overlap),
sub_cfg_{0},
sub_tensor_{0},
done_(false) {
/* In interleave mode, the slicing happens from the deepest dimension up
to the slice_dim. For example, in an HWC layout this mode can be used to
slice in the C dimension. In this mode the data is no longer contiguous
in memory. */
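/* Illustrative example (shapes chosen for explanation only): for a full HWC
tensor of shape {6, 10, 3} with slice_dim = 2 (C) and slice_size = 1,
interleave mode yields sub_cfg_.size = {6, 10, 1} and sub_tensor_rank = 3. */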
if (interleave_mode) {
for (int i = 0; i < full_tensor->rank; i++) {
if (i > slice_dim) {
sub_cfg_.size[i] = 1;
} else if (i == slice_dim) {
sub_cfg_.size[i] = slice_size;
} else {
sub_cfg_.size[i] = full_tensor->shape[i];
}
}
sub_cfg_.sub_tensor_rank = full_tensor->rank;
} else {
/* In the non-interleaved mode, the slicing happens from the outermost
dimension up to the slice_dim. For example, in an HWC layout this mode can be
used to slice in the H dimension. In this mode the data of the slice is
still contiguous in memory (if that was the case in the input tensor). */
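/* Illustrative example (shapes chosen for explanation only): for a full NHWC
tensor of shape {2, 6, 10, 3} with slice_dim = 1 (H) and slice_size = 2, this
yields sub_cfg_.size = {1, 2, 10, 3} and sub_tensor_rank = 4 - 1 = 3; the
pooling kernel slices its batched input along this height dimension. */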
for (int i = 0; i < full_tensor->rank; i++) {
if (i < slice_dim) {
sub_cfg_.size[i] = 1;
} else if (i == slice_dim) {
sub_cfg_.size[i] = slice_size;
} else {
sub_cfg_.size[i] = full_tensor->shape[i];
}
}
sub_cfg_.sub_tensor_rank = full_tensor->rank - slice_dim;
}
ComputeSubTensor();
}
void TensorSlicer::ComputeSubTensor(void) {
// sub_cfg_ is used to keep track of the iteration.
// A copy is created to update it with the correct clipping and padding for
// the current slice.
mli_sub_tensor_cfg cfg_new = sub_cfg_;
// begin and end span the complete input region including padding areas.
const int begin = (int)sub_cfg_.offset[sliceDim_] - pad_pre_;
// end is clipped to the end of the full input region. this is needed for
// cases where the last slice is smaller than the rest.
const int end = std::min(begin + sub_cfg_.size[sliceDim_] + overlap_,
full_tensor_->shape[sliceDim_] + pad_post_);
// The start coordinate of the subtensor is clipped to zero
cfg_new.offset[sliceDim_] = std::max(begin, 0);
// and the stop coordinate is clipped to the size of the full tensor
const int stop_coord =
std::min(end, static_cast<int>(full_tensor_->shape[sliceDim_]));
// compute the size of the subtensor
cfg_new.size[sliceDim_] = stop_coord - cfg_new.offset[sliceDim_];
// compute the padding configuration for the current slice.
actual_padding_pre = cfg_new.offset[sliceDim_] - begin;
actual_padding_post = end - stop_coord;
mli_hlp_create_subtensor(full_tensor_, &cfg_new, &sub_tensor_);
}
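/* Worked example of the clipping in ComputeSubTensor (illustrative values):
with full height 6, sub_cfg_.size[sliceDim_] = 2, pad_pre_ = 1, pad_post_ = 1
and overlap_ = 1, the first slice has offset 0, so begin = -1 and
end = min(-1 + 2 + 1, 6 + 1) = 2. The offset is clipped to 0 and the stop
coordinate to 2, giving size 2, actual_padding_pre = 1 and
actual_padding_post = 0. */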
void TensorSlicer::Next(void) {
for (int i = full_tensor_->rank - 1; i >= 0; i--) {
sub_cfg_.offset[i] += sub_cfg_.size[i];
if (sub_cfg_.offset[i] >= full_tensor_->shape[i]) {
// wrap
sub_cfg_.offset[i] = 0;
// and continue to the next dimension; if there is no next dimension, we are done.
if (i == 0) done_ = true;
continue;
} else {
// carry is false, so break from the loop
break;
}
}
if (!done_) ComputeSubTensor();
}
bool TensorSlicer::Done(void) { return done_; }
int TensorSlicer::GetPaddingPre(void) { return actual_padding_pre; }
int TensorSlicer::GetPaddingPost(void) { return actual_padding_post; }
mli_tensor* TensorSlicer::Sub(void) { return &sub_tensor_; }
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,56 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_
#define TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_
#include "mli_api.h" // NOLINT
namespace tflite {
namespace ops {
namespace micro {
class TensorSlicer {
public:
TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size,
int padding_pre = 0, int padding_post = 0, int overlap = 0,
bool interleave_mode = false);
~TensorSlicer() = default;
void Next();
bool Done();
int GetPaddingPre();
int GetPaddingPost();
mli_tensor* Sub();
// Default constructor is deleted
TensorSlicer() = delete;
private:
const mli_tensor* full_tensor_;
mli_tensor sub_tensor_;
mli_sub_tensor_cfg sub_cfg_;
bool done_;
int sliceDim_;
int pad_pre_, pad_post_, overlap_;
int actual_padding_pre, actual_padding_post;
void ComputeSubTensor();
};
} // namespace micro
} // namespace ops
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_

View File

@ -0,0 +1,376 @@
/* Copyright 2019-2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/reference/pooling.h"
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h"
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
namespace pooling {
namespace {
constexpr int kInputTensor = 0;
constexpr int kOutputTensor = 0;
struct OpData {
TfLitePaddingValues padding;
};
enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 };
bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input,
const TfLitePoolParams* params) {
// MLI optimized version only supports int8 datatype and no fused Relu
return (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone);
}
TfLiteStatus CalculateOpData(const TfLiteContext* context,
const TfLitePoolParams* params,
const TfLiteTensor* input,
const TfLiteTensor* output, OpData* data) {
// input: batch, height, width, channel
int height = SizeOfDimension(input, 1);
int width = SizeOfDimension(input, 2);
int out_height, out_width;
data->padding = ComputePaddingHeightWidth(
params->stride_height, params->stride_width,
/*dilation_rate_height=*/1,
/*dilation_rate_width=*/1, height, width, params->filter_height,
params->filter_width, params->padding, &out_height, &out_width);
return kTfLiteOk;
}
TfLiteStatus AverageEvalFloat(TfLiteContext* context, const TfLiteNode* node,
const TfLitePoolParams* params,
const OpData* data, const TfLiteTensor* input,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float activation_min, activation_max;
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<float>(input),
GetTensorShape(output), GetTensorData<float>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
// Prepare MLI tensors and run Average or Max Pooling
TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params,
const OpData* data, const TfLiteTensor* input,
TfLiteTensor* output, const MliPoolingType pooling_type) {
mli_tensor mli_in = {0};
mli_tensor mli_out = {0};
mli_pool_cfg cfg = {0};
ConvertToMliTensor<int8_t>(input, &mli_in);
ConvertToMliTensor<int8_t>(output, &mli_out);
cfg.kernel_width = params->filter_width;
cfg.kernel_height = params->filter_height;
cfg.stride_width = params->stride_width;
cfg.stride_height = params->stride_height;
if (params->padding == kTfLitePaddingValid) {
cfg.padding_left = 0;
cfg.padding_right = 0;
cfg.padding_top = 0;
cfg.padding_bottom = 0;
} else {
cfg.padding_left = data->padding.width;
cfg.padding_right = data->padding.width + data->padding.width_offset;
cfg.padding_top = data->padding.height;
cfg.padding_bottom = data->padding.height + data->padding.height_offset;
}
const int height_dimension = 1;
int in_slice_height = 0;
int out_slice_height = 0;
const int overlap = cfg.kernel_height - cfg.stride_height;
// Tensors for data in fast (local) memory and config to copy data from
// external to local memory
mli_tensor in_local = mli_in;
mli_tensor out_local = mli_out;
mli_mov_cfg_t copy_config;
mli_mov_cfg_for_copy(&copy_config);
TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors(
context, &in_local, &out_local));
bool in_is_local = in_local.data == mli_in.data;
bool out_is_local = out_local.data == mli_out.data;
TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(
&in_local, &out_local, cfg.kernel_height, cfg.stride_height,
cfg.padding_top, cfg.padding_bottom, &in_slice_height,
&out_slice_height));
/* The mli_in tensor contains batches of HWC tensors, so it is a 4-dimensional
tensor. Because the MLI kernel will process one HWC tensor at a time, the
4-dimensional tensor needs to be sliced into nBatch 3-dimensional tensors. On
top of that, there can be a need to also slice in the height dimension. For
that, the slice height has been calculated. The tensor slicer is configured so
that it will completely slice the nBatch dimension (0) and slice the height
dimension (1) in chunks of 'slice height'. */
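/* For instance (illustrative shapes only): a {2, 6, 10, 1} input with a 2x2
kernel and stride 1 gives overlap = 1, so consecutive input slices share one
row, and the per-slice top/bottom padding is taken from GetPaddingPre() /
GetPaddingPost() in the loop below. */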
TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height,
cfg.padding_top, cfg.padding_bottom, overlap);
TensorSlicer out_slice(&mli_out, height_dimension, out_slice_height);
/* is_local indicates that the tensor is already in local memory,
so in that case the original tensor can be used,
and there is no need to copy it to the local tensor. */
mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local;
mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local;
while (!out_slice.Done()) {
cfg.padding_top = in_slice.GetPaddingPre();
cfg.padding_bottom = in_slice.GetPaddingPost();
mli_mov_tensor_sync(in_slice.Sub(), &copy_config, in_ptr);
if (pooling_type == AveragePooling)
mli_krn_avepool_hwc_sa8(in_ptr, &cfg, out_ptr);
else if (pooling_type == MaxPooling)
mli_krn_maxpool_hwc_sa8(in_ptr, &cfg, out_ptr);
mli_mov_tensor_sync(out_ptr, &copy_config, out_slice.Sub());
in_slice.Next();
out_slice.Next();
}
return kTfLiteOk;
}
TfLiteStatus AverageEvalQuantized(TfLiteContext* context,
const TfLiteNode* node,
const TfLitePoolParams* params,
const OpData* data, const TfLiteTensor* input,
TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8);
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
if (input->type == kTfLiteUInt8) {
reference_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(output), GetTensorData<uint8_t>(output));
} else {
reference_integer_ops::AveragePool(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(output), GetTensorData<int8_t>(output));
}
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(
context,
"Node configuration or type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
TfLitePoolParams* params, OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
float activation_min, activation_max;
CalculateActivationRange(params->activation, &activation_min,
&activation_max);
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.float_activation_min = activation_min;
op_params.float_activation_max = activation_max;
reference_ops::MaxPool(op_params, GetTensorShape(input),
GetTensorData<float>(input), GetTensorShape(output),
GetTensorData<float>(output));
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(context,
"Type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
TfLiteStatus MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
TfLitePoolParams* params, OpData* data,
const TfLiteTensor* input, TfLiteTensor* output) {
#if !defined(TF_LITE_STRIP_REFERENCE_IMPL)
TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8);
int32_t activation_min, activation_max;
(void)CalculateActivationRangeQuantized(context, params->activation, output,
&activation_min, &activation_max);
tflite::PoolParams op_params;
op_params.stride_height = params->stride_height;
op_params.stride_width = params->stride_width;
op_params.filter_height = params->filter_height;
op_params.filter_width = params->filter_width;
op_params.padding_values.height = data->padding.height;
op_params.padding_values.width = data->padding.width;
op_params.quantized_activation_min = activation_min;
op_params.quantized_activation_max = activation_max;
if (input->type == kTfLiteUInt8) {
reference_ops::MaxPool(
op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
GetTensorShape(output), GetTensorData<uint8_t>(output));
} else {
reference_integer_ops::MaxPool(
op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
GetTensorShape(output), GetTensorData<int8_t>(output));
}
return kTfLiteOk;
#else
TF_LITE_KERNEL_LOG(
context,
"Node configuration or type %s (%d) is not supported by ARC MLI Library.",
TfLiteTypeGetName(input->type), input->type);
return kTfLiteError;
#endif
}
} // namespace
TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
// Inputs and outputs share the same type, guaranteed by the converter.
switch (input->type) {
case kTfLiteFloat32:
return AverageEvalFloat(context, node, params, &data, input, output);
break;
case kTfLiteUInt8:
case kTfLiteInt8:
if (IsMliApplicable(context, input, params)) {
return EvalMli(context, params, &data, input, output, AveragePooling);
} else {
return AverageEvalQuantized(context, node, params, &data, input,
output);
}
break;
default:
TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
return kTfLiteOk;
}
TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
OpData data;
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data));
switch (input->type) {
case kTfLiteFloat32:
return MaxEvalFloat(context, node, params, &data, input, output);
break;
case kTfLiteUInt8:
case kTfLiteInt8:
if (IsMliApplicable(context, input, params)) {
return EvalMli(context, params, &data, input, output, MaxPooling);
} else {
return MaxEvalQuantized(context, node, params, &data, input, output);
}
break;
default:
TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
TfLiteTypeGetName(input->type));
return kTfLiteError;
}
return kTfLiteOk;
}
} // namespace pooling
TfLiteRegistration* Register_AVERAGE_POOL_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pooling::AverageEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
TfLiteRegistration* Register_MAX_POOL_2D() {
static TfLiteRegistration r = {/*init=*/nullptr,
/*free=*/nullptr,
/*prepare=*/nullptr,
/*invoke=*/pooling::MaxEval,
/*profiling_string=*/nullptr,
/*builtin_code=*/0,
/*custom_name=*/nullptr,
/*version=*/0};
return &r;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,422 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// This test checks that slicing logic doesn't affect the result of pooling
// kernels.
//
// This test doesn't replace the default pooling test
// (tensorflow/lite/micro/kernels/pooling_test.cc). It is added to the
// whole test set only in case MLI for the ARC platform is used during
// generation (which is handled in arc_mli.inc), so such tests won't be
// generated for other platforms.
#include <cstdint>
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
#include "tensorflow/lite/micro/testing/micro_test.h"
#include "tensorflow/lite/micro/testing/test_utils.h"
namespace tflite {
namespace testing {
namespace {
template <typename T>
void TestAveragePoolingQuantized(
const int* input_dims_data, const T* input_data, const float input_min,
const float input_max, const int filter_height, const int filter_width,
const int stride_height, const int stride_width,
const T* expected_output_data, const int* output_dims_data,
float output_min, float output_max, TfLitePadding padding,
TfLiteFusedActivation activation, T* output_data) {
static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed.");
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
constexpr int inputs_size = 1;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
input_max),
CreateQuantizedTensor(output_data, output_dims, "output_tensor",
output_min, output_max),
};
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
TfLitePoolParams builtin_data = {padding, stride_width, stride_height,
filter_width, filter_height, activation};
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {1, 0};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 1};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
if (registration->free) {
registration->free(&context, user_data);
}
for (int i = 0; i < output_dims_count; ++i) {
TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], 1e-5f);
}
}
template <typename T>
void TestMaxPoolQuantized(const int* input_dims_data, const T* input_data,
float input_min, float input_max, int filter_width,
int filter_height, int stride_width,
int stride_height, const T* expected_output_data,
float output_min, float output_max,
const int* output_dims_data, TfLitePadding padding,
TfLiteFusedActivation activation, T* output_data) {
static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed.");
TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data);
TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data);
const int output_dims_count = ElementCount(*output_dims);
constexpr int inputs_size = 1;
constexpr int outputs_size = 1;
constexpr int tensors_size = inputs_size + outputs_size;
TfLiteTensor tensors[tensors_size] = {
CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
input_max),
CreateQuantizedTensor(output_data, output_dims, "output_tensor",
output_min, output_max),
};
TfLiteContext context;
PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
::tflite::ops::micro::AllOpsResolver resolver;
const TfLiteRegistration* registration =
resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1);
TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
TfLitePoolParams builtin_data = {
padding, stride_width, stride_height,
filter_width, filter_height, activation,
};
const char* init_data = reinterpret_cast<const char*>(&builtin_data);
size_t init_data_size = 0;
void* user_data = nullptr;
if (registration->init) {
user_data = registration->init(&context, init_data, init_data_size);
}
int inputs_array_data[] = {1, 0};
TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
int outputs_array_data[] = {1, 1};
TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
int temporaries_array_data[] = {0};
TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
TfLiteNode node;
node.inputs = inputs_array;
node.outputs = outputs_array;
node.temporaries = temporaries_array;
node.user_data = user_data;
node.builtin_data = reinterpret_cast<void*>(&builtin_data);
node.custom_initial_data = nullptr;
node.custom_initial_data_size = 0;
node.delegate = nullptr;
if (registration->prepare) {
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
}
TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
if (registration->free) {
registration->free(&context, user_data);
}
for (int i = 0; i < output_dims_count; ++i) {
TF_LITE_MICRO_EXPECT_EQ(expected_output_data[i], output_data[i]);
}
}
} // namespace
} // namespace testing
} // namespace tflite
TF_LITE_MICRO_TESTS_BEGIN
TF_LITE_MICRO_TEST(SystemAveragePoolTestInt1) {
using tflite::testing::F2QS;
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int8_t output_data[3];
const int kInput1Shape[] = {4, 1, 2, 4, 1};
const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput1Shape[] = {4, 1, 1, 3, 1};
const int8_t kGolden1Data[] = {1, 1, 1};
tflite::testing::TestAveragePoolingQuantized(
kInput1Shape, // Input shape
kInput1Data, input_min, input_max, // input quantization range
2, 2, // filter height, filter width
1, 1, // stride height, stride width
kGolden1Data,
kOutput1Shape, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalAveragePoolTestInt1) {
using tflite::testing::F2QS;
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int8_t output_data[3];
#pragma Bss(".Zdata")
const int kInput1Shape[] = {4, 1, 2, 4, 1};
const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput1Shape[] = {4, 1, 1, 3, 1};
const int8_t kGolden1Data[] = {1, 1, 1};
#pragma Bss()
tflite::testing::TestAveragePoolingQuantized(
kInput1Shape, // Input shape
kInput1Data, input_min, input_max, // input quantization range
2, 2, // filter height, filter width
1, 1, // stride height, stride width
kGolden1Data,
kOutput1Shape, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
// Test group AVG 2
TF_LITE_MICRO_TEST(SystemAveragePoolTestInt2) {
using tflite::testing::F2QS;
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int8_t output_data[45];
const int kInput2Shape[] = {4, 1, 6, 10, 1};
const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput2Shape[] = {4, 1, 5, 9, 1};
const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
tflite::testing::TestAveragePoolingQuantized(
kInput2Shape, // Input shape
kInput2Data, input_min, input_max, // input quantization range
2, 2, // filter height, filter width
1, 1, // stride height, stride width
kGolden2Data,
kOutput2Shape, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalAveragePoolTestInt2) {
using tflite::testing::F2QS;
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int8_t output_data[45];
#pragma Bss(".Zdata")
const int kInput2Shape[] = {4, 1, 6, 10, 1};
const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput2Shape[] = {4, 1, 5, 9, 1};
const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
#pragma Bss()
tflite::testing::TestAveragePoolingQuantized(
kInput2Shape, // Input shape
kInput2Data, input_min, input_max, // input quantization range
2, 2, // filter height, filter width
1, 1, // stride height, stride width
kGolden2Data,
kOutput2Shape, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
// Test group MAX 1
TF_LITE_MICRO_TEST(SystemMaxPoolTestInt1) {
using tflite::testing::F2QS;
int8_t output_data[3];
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int filter_width = 2;
int filter_height = 2;
int stride_width = 1;
int stride_height = 1;
const int kInput1Shape[] = {4, 1, 2, 4, 1};
const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput1Shape[] = {4, 1, 1, 3, 1};
const int8_t kGolden1Data[] = {1, 1, 1};
tflite::testing::TestMaxPoolQuantized(
kInput1Shape, // Input shape
kInput1Data, input_min, input_max, filter_width, filter_height,
stride_width, stride_height, kGolden1Data, output_min, output_max,
kOutput1Shape, // Output shape
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalMaxPoolTestInt1) {
using tflite::testing::F2QS;
int8_t output_data[3];
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int filter_width = 2;
int filter_height = 2;
int stride_width = 1;
int stride_height = 1;
#pragma Bss(".Zdata")
const int kInput1Shape[] = {4, 1, 2, 4, 1};
const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput1Shape[] = {4, 1, 1, 3, 1};
const int8_t kGolden1Data[] = {1, 1, 1};
#pragma Bss()
tflite::testing::TestMaxPoolQuantized(
kInput1Shape, // Input shape
kInput1Data, input_min, input_max, filter_width, filter_height,
stride_width, stride_height, kGolden1Data, output_min, output_max,
kOutput1Shape, // Output shape
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
// Test group MAX 2
TF_LITE_MICRO_TEST(SystemMaxPoolTestInt2) {
using tflite::testing::F2QS;
int8_t output_data[45];
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int filter_width = 2;
int filter_height = 2;
int stride_width = 1;
int stride_height = 1;
const int kInput2Shape[] = {4, 1, 6, 10, 1};
const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput2Shape[] = {4, 1, 5, 9, 1};
const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
tflite::testing::TestMaxPoolQuantized(
kInput2Shape, // Input shape
kInput2Data, input_min, input_max, filter_width, filter_height,
stride_width, stride_height, kGolden2Data, output_min, output_max,
kOutput2Shape, // Output shape
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(LocalMaxPoolTestInt2) {
using tflite::testing::F2QS;
int8_t output_data[45];
const float input_min = -128;
const float input_max = 127;
const float output_min = -128;
const float output_max = 127;
int filter_width = 2;
int filter_height = 2;
int stride_width = 1;
int stride_height = 1;
#pragma Bss(".Zdata")
const int kInput2Shape[] = {4, 1, 6, 10, 1};
const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
const int kOutput2Shape[] = {4, 1, 5, 9, 1};
const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
#pragma Bss()
tflite::testing::TestMaxPoolQuantized(
kInput2Shape, // Input shape
kInput2Data, input_min, input_max, filter_width, filter_height,
stride_width, stride_height, kGolden2Data, output_min, output_max,
kOutput2Shape, // Output shape
kTfLitePaddingValid, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TESTS_END

View File

@ -0,0 +1,338 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h"
#include <limits.h>
#include <algorithm>
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
namespace tflite {
namespace ops {
namespace micro {
static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2,
int* grant_size_1, int* grant_size_2) {
int maxrequest = 0;
int secondrequest = 0;
int maxavailable = 0;
int secondavail = 0;
// determine the largest requested buffer.
if (request_size_1 > request_size_2) {
maxrequest = request_size_1;
secondrequest = request_size_2;
} else {
maxrequest = request_size_2;
secondrequest = request_size_1;
}
// find the two largest available buffers.
get_arc_scratch_buffer_two_max_sizes(&maxavailable, &secondavail);
// in case two buffers are available, the largest buffer can go to the largest
// request.
if (secondavail > 0) { // this condition can be enhanced to prevent cases
// where the second buffer is so small that it is
// better to use one buffer and split it.
if (request_size_1 > request_size_2) {
*grant_size_1 = maxavailable;
*grant_size_2 = secondavail;
} else {
*grant_size_1 = secondavail;
*grant_size_2 = maxavailable;
}
} else {
// In case only one buffer is available,
// use only the max buffer, and split it.
*grant_size_1 = maxavailable / 2;
*grant_size_2 = maxavailable / 2;
}
}
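/* Worked example (illustrative sizes): with request sizes 4000 and 1000 and
two available buffers of 6000 and 2000 bytes, the larger request gets the
larger buffer: *grant_size_1 = 6000, *grant_size_2 = 2000. If only a single
8000-byte buffer is available, it is split and both grants become 4000. */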
static TfLiteStatus get_arc_scratch_buffer_for_io_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* out) {
#ifdef __Xxy
int request_size_in = 0;
int request_size_out = 0;
int grant_size_in = 0;
int grant_size_out = 0;
if (!inside_arc_ccm(in->data)) {
// In case the input tensor contains multiple batches, it has rank 4.
// Because the MLI kernel cannot operate on batches, we need the size of a
// single HWC tensor. That is why the start_rank is 1 in case of input
// rank 4.
int start_rank = in->rank - 3;
request_size_in = mli_hlp_count_elem_num(in, start_rank) *
mli_hlp_tensor_element_size(in);
}
if (!inside_arc_ccm(out->data)) {
// In case the output tensor contains multiple batches, it has rank 4.
// Because the MLI kernel cannot operate on batches, we need the size of a
// single batch. That is why the start_rank is 1 in case of output rank 4.
int start_rank = out->rank - 3;
request_size_out = mli_hlp_count_elem_num(out, start_rank) *
mli_hlp_tensor_element_size(out);
}
get_arc_two_buffer_sizes(request_size_in, request_size_out, &grant_size_in,
&grant_size_out);
if (!inside_arc_ccm(in->data)) {
in->data = get_arc_scratch_buffer(grant_size_in);
in->capacity = grant_size_in;
if (in->data == NULL) return kTfLiteError;
}
if (!inside_arc_ccm(out->data)) {
out->data = get_arc_scratch_buffer(grant_size_out);
out->capacity = grant_size_out;
if (out->data == NULL) return kTfLiteError;
}
#endif
return kTfLiteOk;
}
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* weights,
mli_tensor* bias,
mli_tensor* out) {
TfLiteStatus ret_val = kTfLiteOk;
#ifdef __Xxy
init_arc_scratch_buffers();
if (!inside_arc_ccm(weights->data)) {
int weights_size = mli_hlp_count_elem_num(weights, 0) *
mli_hlp_tensor_element_size(weights);
int max_weights_size = 0;
weights->data = get_arc_scratch_buffer(weights_size);
weights->capacity = weights_size;
if (weights->data == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->data = get_arc_scratch_buffer(max_weights_size);
weights->capacity = max_weights_size;
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->data == NULL) ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->data)) {
uint32_t bias_mem_requirements =
mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias);
bias->data = get_arc_scratch_buffer(bias_mem_requirements);
bias->capacity = bias_mem_requirements;
}
if (ret_val == kTfLiteOk) {
ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out);
}
if (bias->data == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->data = get_arc_scratch_buffer(max_bias_size);
bias->capacity = max_bias_size;
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->data == NULL) ret_val = kTfLiteError;
#endif
return ret_val;
}
TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* weights,
mli_tensor* bias, mli_tensor* out) {
TfLiteStatus ret_val = kTfLiteOk;
#ifdef __Xxy
init_arc_scratch_buffers();
/* Strategy for FC kernels:
First allocate the input, because this cannot be sliced (in case of batch
processing, only a single input needs to be allocated). Then weights & bias,
because if fully loaded they can be reused over batches. Then the output.
The number of output channels (for weights slicing) depends on the size of
the output and the size of weights & bias. */
if (!inside_arc_ccm(in->data)) {
/* In case the input tensor contains multiple batches,
only count the size of the innermost dimension */
int size_in = mli_hlp_count_elem_num(in, in->rank - 1) *
mli_hlp_tensor_element_size(in);
in->data = get_arc_scratch_buffer(size_in);
in->capacity = size_in;
if (in->data == NULL) {
in->capacity = 0;
ret_val = kTfLiteError;
}
}
if (!inside_arc_ccm(weights->data)) {
int weights_size = mli_hlp_count_elem_num(weights, 0) *
mli_hlp_tensor_element_size(weights);
int max_weights_size = 0;
weights->data = get_arc_scratch_buffer(weights_size);
weights->capacity = weights_size;
if (weights->data == NULL) {
get_arc_scratch_buffer_max_size(&max_weights_size);
weights->data = get_arc_scratch_buffer(max_weights_size);
weights->capacity = max_weights_size;
if (max_weights_size == 0) ret_val = kTfLiteError;
}
if (weights->data == NULL) ret_val = kTfLiteError;
}
if (!inside_arc_ccm(bias->data)) {
int bias_mem_requirements =
mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias);
bias->data = get_arc_scratch_buffer(bias_mem_requirements);
bias->capacity = bias_mem_requirements;
}
if (!inside_arc_ccm(out->data)) {
/* In case the output tensor contains multiple batches,
only count the size of the innermost dimension */
int out_size = mli_hlp_count_elem_num(out, out->rank - 1) *
mli_hlp_tensor_element_size(out);
int max_out_size = 0;
out->data = get_arc_scratch_buffer(out_size);
out->capacity = out_size;
if (out->data == NULL) {
get_arc_scratch_buffer_max_size(&max_out_size);
out->data = get_arc_scratch_buffer(max_out_size);
out->capacity = max_out_size;
if (max_out_size == 0) ret_val = kTfLiteError;
}
if (out->data == NULL) ret_val = kTfLiteError;
}
if (bias->data == NULL) {
int max_bias_size = 0;
get_arc_scratch_buffer_max_size(&max_bias_size);
bias->data = get_arc_scratch_buffer(max_bias_size);
bias->capacity = max_bias_size;
if (max_bias_size == 0) ret_val = kTfLiteError;
}
if (bias->data == NULL) ret_val = kTfLiteError;
#endif
return ret_val;
}
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
const mli_tensor* in, const mli_tensor* out, const int kernel_height,
const int stride_height, const int padding_top, const int padding_bot,
int* in_slice_height, int* out_slice_height) {
const int height_dimension = 1;
const int in_height = in->shape[height_dimension];
const int out_height = out->shape[height_dimension];
const int line_size_in = mli_hlp_count_elem_num(in, height_dimension + 1) *
mli_hlp_tensor_element_size(in);
const int line_size_out = mli_hlp_count_elem_num(out, height_dimension + 1) *
mli_hlp_tensor_element_size(out);
int max_lines_in = 0;
int max_lines_out = 0;
int max_out_lines_for_input = 0;
bool fit = (in->capacity >= in_height * line_size_in) &&
(out->capacity >= out_height * line_size_out);
if (fit) {
// in case both tensors completely fit in the capacity, there is no need for
// slicing
*in_slice_height = in_height;
*out_slice_height = out_height;
} else {
// First compute how many lines fit into the input tensor, and compute how
// many output lines can be computed with that.
max_lines_in =
std::min(in_height, static_cast<int>(in->capacity) / line_size_in);
if (max_lines_in >= in_height) {
max_out_lines_for_input = out_height;
} else if (2 * max_lines_in >= in_height) {
// in this case only two slices are needed, so both could benefit from
// padding. take the MIN to get the worst case.
max_out_lines_for_input =
(max_lines_in + std::min(padding_top, padding_bot) - kernel_height +
1) /
stride_height;
} else {
max_out_lines_for_input =
(max_lines_in - kernel_height + 1) / stride_height;
}
// Then compute how many output lines fit into the output tensor.
max_lines_out =
std::min(out_height, static_cast<int>(out->capacity) / line_size_out);
// The smaller of the two determines the slice height for the output, and
// the derived slice height for the input.
*out_slice_height = std::min(max_out_lines_for_input, max_lines_out);
*in_slice_height = *out_slice_height * stride_height;
}
if ((*in_slice_height > 0) && (*out_slice_height > 0)) {
return kTfLiteOk;
} else {
return kTfLiteError;
}
}
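/* Worked example (illustrative numbers): in_height = 64, kernel_height = 3,
stride_height = 1, and capacities that fit 20 input lines and at least 30
output lines. Then max_lines_in = 20 (and 2 * 20 < 64), so
max_out_lines_for_input = (20 - 3 + 1) / 1 = 18, max_lines_out = 30, and the
result is *out_slice_height = 18 and *in_slice_height = 18 * 1 = 18. */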
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
const mli_tensor* weights, const mli_tensor* bias,
const int weight_out_ch_dimension, int* slice_channels) {
const int channels = weights->shape[weight_out_ch_dimension];
const int ch_size_w = (mli_hlp_count_elem_num(weights, 0) / channels) *
mli_hlp_tensor_element_size(weights);
const int ch_size_b = (mli_hlp_count_elem_num(bias, 0) / channels) *
mli_hlp_tensor_element_size(bias);
int max_ch_weigths = 0;
int max_ch_bias = 0;
bool fit = (weights->capacity >= channels * ch_size_w) &&
(bias->capacity >= channels * ch_size_b);
if (fit) {
// in case both tensors completely fit in the capacity, there is no need for
// slicing
*slice_channels = channels;
} else {
// First compute how many channels fit into the weights tensor
max_ch_weigths =
std::min(channels, static_cast<int>(weights->capacity) / ch_size_w);
// Then compute how many channels fit into the bias tensor.
max_ch_bias =
std::min(channels, static_cast<int>(bias->capacity) / ch_size_b);
// the smallest of the two determines the slice size
*slice_channels = std::min(max_ch_weigths, max_ch_bias);
}
if (*slice_channels > 0) {
return kTfLiteOk;
} else {
return kTfLiteError;
}
}
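/* Worked example (illustrative numbers): with 32 output channels, a weights
capacity that fits 20 channels and a bias capacity that fits all 32,
max_ch_weigths = 20, max_ch_bias = 32 and *slice_channels = 20. */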
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* out) {
#ifdef __Xxy
init_arc_scratch_buffers();
return get_arc_scratch_buffer_for_io_tensors(context, in, out);
#else
return kTfLiteOk;
#endif
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,129 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/common.h"
namespace tflite {
namespace ops {
namespace micro {
/**
* @brief Function to allocate scratch buffers for the convolution tensors
*
* @detail This function will update the data pointers in the 4 tensors with
* pointers to scratch buffers in fast local memory.
*
* @param context [I] pointer to TfLite context (needed for error handling)
* @param in [IO] pointer to the input tensor
* @param weights [IO] pointer to the weights tensor
* @param bias [IO] pointer to the bias tensor
* @param output [IO] pointer to the output tensor
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* weights,
mli_tensor* bias,
mli_tensor* out);
/**
* @brief Function to allocate scratch buffers for pooling kernels with only
* input and output buffers
*
* @detail This function will update the data pointers in the 2 tensors with
* pointers to scratch buffers in fast local memory.
*
* @param context [I] pointer to TfLite context (needed for error handling)
* @param in [IO] pointer to the input tensor
* @param output [IO] pointer to the output tensor
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* out);
/**
 * @brief Function to allocate scratch buffers for the fully connected tensors
*
* @detail This function will update the data pointers in the 4 tensors with
* pointers to scratch buffers in fast local memory.
*
* @param context [I] pointer to TfLite context (needed for error handling)
* @param in [IO] pointer to the input tensor
* @param weights [IO] pointer to the weights tensor
* @param bias [IO] pointer to the bias tensor
* @param output [IO] pointer to the output tensor
*
* @return Tf Lite status code
*/
TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* weights,
mli_tensor* bias, mli_tensor* out);
/**
* @brief Function to calculate slice size for io tensors
*
* @detail This function will calculate the slice size in the height dimension
 * for input and output tensors. It takes into account the kernel size and the
 * padding. The function will look at the capacity field in the in and out
 * tensors to determine the available buffer size.
*
* @param in [I] pointer to the input tensor
* @param out [I] pointer to the output tensor
* @param kernelHeight [I] size of the kernel in height dimension
* @param strideHeight [I] input stride in height dimension
* @param padding_top [I] number of lines with zeros at the top
* @param padding_bot [I] number of lines with zeros at the bottom
 * @param in_slice_height [O] slice size in height dimension for the input tensor
 * @param out_slice_height [O] slice size in height dimension for the output
* tensor
*
* @return Tf Lite status code
*/
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
const mli_tensor* in, const mli_tensor* out, const int kernelHeight,
const int strideHeight, const int padding_top, const int padding_bot,
int* in_slice_height, int* out_slice_height);
/**
* @brief Function to calculate slice size for weight slicing
*
* @detail This function will calculate the slice size in the output channel
 * dimension for weight and bias tensors. The function will look at the
 * capacity field in the weights and bias tensors to determine the available
 * buffer size.
*
 * @param weights [I] pointer to the weights tensor
 * @param bias [I] pointer to the bias tensor
 * @param weight_out_ch_dimension [I] dimension of the output channels in the
 * weights tensor
 * @param slice_channels [O] slice size in output channel dimension
*
* @return Tf Lite status code
*/
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
const mli_tensor* weights, const mli_tensor* bias,
const int weight_out_ch_dimension, int* slice_channels);
} // namespace micro
} // namespace ops
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
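As a rough illustration of how these declarations fit together, the sketch below (not part of the TFLM sources; the helper name and the zero-padding values are assumptions) first maps the convolution tensors onto fast memory and then derives the height slice sizes:
```
TfLiteStatus prepare_conv_slicing(TfLiteContext* context, mli_tensor* in,
                                  mli_tensor* weights, mli_tensor* bias,
                                  mli_tensor* out, int kernel_height,
                                  int stride_height, int* in_slice_height,
                                  int* out_slice_height) {
  // Map the four tensors onto scratch buffers in fast local memory first.
  TfLiteStatus status =
      get_arc_scratch_buffer_for_conv_tensors(context, in, weights, bias, out);
  if (status != kTfLiteOk) return status;
  // Then derive how many lines fit per slice given the (possibly reduced)
  // buffer capacities; the caller would use the two slice heights to drive
  // its slicing loop.
  return arc_scratch_buffer_calc_slice_size_io(
      in, out, kernel_height, stride_height, /*padding_top=*/0,
      /*padding_bot=*/0, in_slice_height, out_slice_height);
}
```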

View File

@ -0,0 +1,135 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h"
#include <limits.h>
namespace tflite {
namespace ops {
namespace micro {
/* By default, use all of the XY memory and half of the DCCM, because DCCM is
 * also used for the data section and the stack. These values can be overridden
 * by adding a -D option to the makefile of the application.
*/
#ifndef SCRATCH_MEM_X_SIZE
#ifdef core_config_xy_size
#define SCRATCH_MEM_X_SIZE (core_config_xy_size)
#else
#define SCRATCH_MEM_X_SIZE (0)
#endif
#endif
#ifndef SCRATCH_MEM_Y_SIZE
#ifdef core_config_xy_size
#define SCRATCH_MEM_Y_SIZE (core_config_xy_size)
#else
#define SCRATCH_MEM_Y_SIZE (0)
#endif
#endif
#ifndef SCRATCH_MEM_Z_SIZE
#ifdef core_config_dccm_size
#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2)
#else
#define SCRATCH_MEM_Z_SIZE (0)
#endif
#endif
namespace {
#pragma Bss(".Xdata")
static int8_t scratch_mem_x[SCRATCH_MEM_X_SIZE];
#pragma Bss()
#pragma Bss(".Ydata")
static int8_t scratch_mem_y[SCRATCH_MEM_Y_SIZE];
#pragma Bss()
#pragma Bss(".Zdata")
static int8_t scratch_mem_z[SCRATCH_MEM_Z_SIZE];
#pragma Bss()
} // namespace
static int8_t *scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z};
static uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE,
SCRATCH_MEM_Z_SIZE};
void *get_arc_scratch_buffer(int size) {
  // Function to assign fast memory from one of 3 scratch buffers.
  // Best-fit strategy: memory is allocated from the memory bank that leaves
  // the least unused memory.
void *buf = NULL;
int best_mem_idx = -1;
int best_mem_delta = INT_MAX;
const int num_mem = sizeof(scratch_mem) / sizeof(scratch_mem[0]);
// find a local memory that fits the data size.
for (int mem_idx = 0; mem_idx < num_mem; ++mem_idx) {
// Best Fit
if ((size <= scratch_sizes[mem_idx]) &&
(scratch_sizes[mem_idx] - size < best_mem_delta)) {
best_mem_idx = mem_idx;
best_mem_delta = scratch_sizes[mem_idx] - size;
}
}
if (best_mem_idx >= 0) {
buf = static_cast<void *>(scratch_mem[best_mem_idx]);
scratch_mem[best_mem_idx] += size;
scratch_sizes[best_mem_idx] -= size;
}
return buf;
}
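// A best-fit example with hypothetical bank sizes: if 16 KB are free in X,
// 16 KB in Y and 64 KB in Z, a 12 KB request is served from X (the 4 KB it
// would leave unused beats the 52 KB Z would leave; Y ties with X, but the
// earlier bank wins because the comparison is strict). X's pointer is then
// advanced by 12 KB and its remaining size reduced accordingly.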
void get_arc_scratch_buffer_max_size(int *size) {
int maxavailable = 0;
const int num_mem = sizeof(scratch_mem) / sizeof(scratch_mem[0]);
// find the largest available buffer.
for (int i = 0; i < num_mem; i++) {
if (scratch_sizes[i] > maxavailable) {
maxavailable = scratch_sizes[i];
}
}
*size = maxavailable;
}
void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) {
int maxavailable = 0;
int secondavail = 0;
const int num_mem = sizeof(scratch_mem) / sizeof(scratch_mem[0]);
// find the two largest available buffers.
for (int i = 0; i < num_mem; i++) {
if (scratch_sizes[i] > maxavailable) {
secondavail = maxavailable;
maxavailable = scratch_sizes[i];
} else if (scratch_sizes[i] > secondavail) {
secondavail = scratch_sizes[i];
}
}
*size1 = maxavailable;
*size2 = secondavail;
}
void init_arc_scratch_buffers(void) {
scratch_mem[0] = scratch_mem_x;
scratch_mem[1] = scratch_mem_y;
scratch_mem[2] = scratch_mem_z;
scratch_sizes[0] = SCRATCH_MEM_X_SIZE;
scratch_sizes[1] = SCRATCH_MEM_Y_SIZE;
scratch_sizes[2] = SCRATCH_MEM_Z_SIZE;
}
} // namespace micro
} // namespace ops
} // namespace tflite

View File

@ -0,0 +1,68 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_
#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/common.h"
namespace tflite {
namespace ops {
namespace micro {
void init_arc_scratch_buffers(void);
void* get_arc_scratch_buffer(
int size); // Function to assign fast memory from one of 3 scratch buffers.
void get_arc_scratch_buffer_max_size(int* size);
void get_arc_scratch_buffer_two_max_sizes(int* size1, int* size2);
static inline bool inside_arc_dccm(void* p) {
#if core_config_dccm_present
return ((unsigned)p >= core_config_dccm_base) &&
((unsigned)p < core_config_dccm_base + core_config_dccm_size);
#else
return false;
#endif
}
static inline bool inside_arc_xccm(void* p) {
#if core_config_xy
return ((unsigned)p >= core_config_xy_x_base) &&
((unsigned)p < core_config_xy_x_base + core_config_xy_size);
#else
return false;
#endif
}
static inline bool inside_arc_yccm(void* p) {
#if core_config_xy
return ((unsigned)p >= core_config_xy_y_base) &&
((unsigned)p < core_config_xy_y_base + core_config_xy_size);
#else
return false;
#endif
}
static inline bool inside_arc_ccm(void* p) {
return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p);
}
} // namespace micro
} // namespace ops
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_
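A minimal sketch (not part of the TFLM sources; the helper name is hypothetical and it assumes the MLI 1.x `mli_tensor` layout with `data` and `capacity` fields) of how these address-range checks might be combined with the allocator:
```
static void request_fast_mem_if_needed(mli_tensor* t) {
  // Only ask for a scratch buffer when the data is not already in CCM.
  if (!inside_arc_ccm(t->data)) {
    void* fast_buf = get_arc_scratch_buffer(static_cast<int>(t->capacity));
    // The caller is still responsible for copying the data into the buffer.
    if (fast_buf != NULL) t->data = fast_buf;
  }
}
```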

View File

@ -409,8 +409,9 @@ TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) {
TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannelRelu6) {
// conv params:
// padding, stride_<width,height>, dilation_<width, height>, activation
TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6};
// padding, stride_<width,height>, activation, dilation_<width, height>
TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1,
kTfLiteActRelu6, 1, 1};
const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C]
const int kInputElements =
kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4];

View File

@ -496,7 +496,7 @@ TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingSameStride1ActNone) {
F2QS(8.5, output_min, output_max), F2QS(7., output_min, output_max)},
{4, 1, 2, 4, 1}, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
kTfLitePaddingSame, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloat) {

View File

@ -90,7 +90,7 @@ patch_cifar10_dataset() {
}
build_embarc_mli() {
gmake -j 4 -C ${1}/lib/make TCF_FILE=${2}
make -j 4 -C ${1}/lib/make TCF_FILE=${2}
}
# Main function handling the download, verify, extract, and patch process.
@ -173,7 +173,12 @@ download_and_extract() {
elif [[ ${action} == "patch_cifar10_dataset" ]]; then
patch_cifar10_dataset ${dir}
elif [[ ${action} == "build_embarc_mli" ]]; then
build_embarc_mli ${dir} ${action_param1}
if [[ "${action_param1}" == *.tcf ]]; then
cp ${action_param1} ${dir}/hw/arc.tcf
build_embarc_mli ${dir} ../../hw/arc.tcf
else
build_embarc_mli ${dir} ${action_param1}
fi
elif [[ ${action} ]]; then
echo "Unknown action '${action}'"
exit 1

View File

@ -0,0 +1,104 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Settings for embARC MLI library for ARC platform.
ifeq ($(TARGET_ARCH), arc)
# The MLI Library is used by default for the ARC platform whenever possible.
# To use the TFLM reference implementations, MLI should be intentionally turned off
# by passing the 'no_arc_mli' tag (make -f <tflm_main_makefile> TAGS=no_arc_mli ...)
ifeq ($(filter no_arc_mli,$(ALL_TAGS)),)
ALL_TAGS += arc_mli
ifeq ($(BUILD_ARC_MLI),true)
MLI_LIB_DIR ?= arc_mli_$(basename $(TCF_FILE_NAME))
$(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE)))
MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include
MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a
MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_LIB_DIR)/LICENSE
else
ifneq ($(ARC_MLI_PRE_COMPILED_TARGET),)
MLI_LIB_DIR ?= arc_mli_package
$(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),))
MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include
MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/$(ARC_MLI_PRE_COMPILED_TARGET)/release/libmli.a
MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/$(ARC_MLI_PRE_COMPILED_TARGET)/release/libmli.a
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_LIB_DIR)/LICENSE
else
$(error Target for pre compiled ARC MLI library is not defined)
endif
endif
THIRD_PARTY_CC_HDRS += $(MLI_LIB)
GENERATED_PROJECT_LIBS += $(MLI_LIB)
INCLUDES += \
-I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \
-I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api
GENERATED_PROJECT_INCLUDES += \
-I. \
-I./third_party/$(MLI_INCLUDE_FOLDER) \
-I./third_party/$(MLI_INCLUDE_FOLDER)/api
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \
third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h
MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h
MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc
MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h
MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc
MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h
MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc
MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h
MICROLITE_TEST_SRCS += $(wildcard tensorflow/lite/micro/kernels/arc_mli/*test.cc)
ARC_MLI_TESTS := conv depthwise_conv pooling fully_connected
ARC_MLI_TESTS += $(foreach TEST,$(ARC_MLI_TESTS), $(TEST)_slicing)
generate_arc_mli_test_projects: $(foreach TEST,$(ARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project)
ARC_EXTRA_APP_SETTINGS += \
\nMLI_ONLY ?= false\n\
\nifeq \($(DLR)\(MLI_ONLY\), true\)\
\nCCFLAGS += -DTF_LITE_STRIP_REFERENCE_IMPL\
\nCXXFLAGS += -DTF_LITE_STRIP_REFERENCE_IMPL\
\nendif\n
endif # no_arc_mli
endif # TARGET_ARCH
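In a project generated with these settings, the option surfaces as a regular make variable; a hedged example of stripping the reference fallbacks when building such a project:
```
make app MLI_ONLY=true
```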

View File

@ -130,24 +130,37 @@ endef
define generate_arc_project
ifeq ($(TARGET_ARCH), arc)
$(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/Makefile.tpl
$(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl
@mkdir -p $$(dir $$@)
@sed -E 's#\%\{SRCS\}\%#$(4)#g' $$< | \
sed -E '1 i\CC = ccac\nCXX = ccac\nLD = ccac\n' | \
sed -E 's#\%\{CC\}\%#$(CC_TOOL)#g' | \
sed -E 's#\%\{CXX\}\%#$(CXX_TOOL)#g' | \
sed -E 's#\%\{LD\}\%#$(LD_TOOL)#g' | \
sed -E 's#\%\{EXECUTABLE\}\%#$(3).elf#g' | \
sed -E 's#\%\{LINKER_FLAGS\}\%#$(6)#g' | \
sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' | \
sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' > $$@
sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' | \
sed -E 's#\%\{EXTRA_APP_SETTINGS\}\%#$(ARC_EXTRA_APP_SETTINGS)#g' | \
sed -E 's#\%\{EXTRA_APP_RULES\}\%#$(ARC_EXTRA_APP_RULES)#g' | \
sed -E 's#\%\{BIN_DEPEND\}\%#$(ARC_BIN_DEPEND)#g' | \
sed -E 's#\%\{BIN_RULE\}\%#$(ARC_BIN_RULE)#g' | \
sed -E 's#\%\{EXTRA_RM_TARGETS\}\%#$(ARC_EXTRA_RM_TARGETS)#g' | \
sed -E 's#\%\{APP_RUN_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \
sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_DEBUG_CMD)#g' | \
sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@
# Special rule to copy TCF in case the local filesystem file name has been defined
ifneq ($(TCF_FILE_NAME), )
$(PRJDIR)$(3)/$(1)/$(TCF_FILE_NAME): $(TCF_FILE)
$(PRJDIR)$(3)/$(1)/%: tensorflow/lite/micro/tools/make/templates/arc/%.tpl
@cp $$< $$@
endif
$(foreach var,$(ARC_TARGET_FILES_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var))))
endif
endef
# Creates a set of rules to build a standalone Arduino project for an
# executable, including all of the source and header files required in a
# separate folder and a simple makefile.

View File

@ -0,0 +1,315 @@
# Building TensorFlow Lite for Microcontrollers for Synopsys DesignWare ARC EM/HS Processors
This document contains the general information on building and running
TensorFlow Lite Micro for targets based on the Synopsys ARC EM/HS Processors.
## Table of Contents
- [Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit)
- [ARC EM Software Development Platform (ARC EM SDP)](#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
- [Custom ARC EM or HS Platform](#Custom-ARC-EMHS-Platform)
## Install the Synopsys DesignWare ARC MetaWare Development Toolkit
The Synopsys DesignWare ARC MetaWare Development Toolkit (MWDT) is required to
build and run TensorFlow Lite Micro applications for all ARC EM/HS targets.
To license MWDT, please see further details
[here](https://www.synopsys.com/dw/ipdir.php?ds=sw_metaware).
To request an evaluation version of MWDT, please use the
[Synopsys Eval Portal](https://eval.synopsys.com/) and follow the link for the
MetaWare Development Toolkit (Important: Do not confuse this with MetaWare EV
Development Toolkit or the MetaWare Lite options that are also available on this page).
Run the downloaded installer and follow the instructions to set up the toolchain
on your platform.
TensorFlow Lite for Microcontrollers builds are divided into two phases:
Application Project Generation and Application Project Building/Running. The
former phase requires a \*nix environment while the latter does not.
For basic project generation targeting
[ARC EM Software Development Platform](#ARC-EM-Software-Development-Platform-ARC-EM-SDP),
MetaWare is NOT required for the Project Generation Phase. However, it is
required in the following cases:
- For project generation for custom (not EM SDP) targets
- To build the microlib target library with all required TFLM objects for
  external use
Please consider the above when choosing whether to install the Windows or Linux
(or both) versions of MWDT.
## ARC EM Software Development Platform (ARC EM SDP)
This section describes how to deploy on an
[ARC EM SDP board](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
### Initial Setup
To use the EM SDP, you need the following hardware and software:
#### ARC EM SDP
More information on the platform, including ordering information, can be found
[here](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform).
#### MetaWare Development Toolkit
See
[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit)
section for instructions on toolchain installation.
#### Digilent Adept 2 System Software Package
If you wish to use the MetaWare Debugger to debug your code, you need to also
install the Digilent Adept 2 software, which includes the necessary drivers for
connecting to the targets. This is available from the official
[Digilent site](https://reference.digilentinc.com/reference/software/adept/start?redirect=1#software_downloads).
You should install the “System” component and the Runtime. The Utilities and
SDK are NOT required.
Digilent installation is NOT required if you plan to deploy to EM SDP via the SD
card instead of using the debugger.
#### Make Tool
A `make` tool is required for both phases of deploying TensorFlow Lite Micro
applications on the ARC EM SDP:
1. Application project generation
2. Working with the generated application (build and run)
For the first phase you need an environment and make tool compatible with the
TensorFlow Lite for Microcontrollers build system. At the time of this writing,
this requires make >= 3.82 and a *nix-like environment which supports shell and
native commands for file manipulation. The MWDT toolkit is not required for
this phase.
For the second phase, the requirements are less strict. The gmake version
delivered with the MetaWare Development Toolkit is sufficient. There are no
shell or *nix command dependencies, so Windows can be used.
#### Serial Terminal Emulation Application
The Debug UART port of the EM SDP is used to print application output. The USB
connection provides both the debug channel and RS232 transport. You can use any
terminal emulation program (like [PuTTY](https://www.putty.org/)) to view UART
output from the EM SDP.
#### microSD Card
If you want to self-boot your application (start it independently from a
debugger connection), you also need a microSD card with a minimum size of 512 MB
and a way to write to the card from your development host.
### Connect the Board
1. Make sure the boot switches of the board (S3) are configured as follows:
Switch # | Switch position
:------: | :-------------:
1 | Low (0)
2 | Low (0)
3 | High (1)
4 | Low (0)
2. Connect the power supply included in the product package to the ARC EM SDP.
3. Connect the USB cable to connector J10 on the ARC EM SDP (near the RST and
   CFG buttons) and to an available USB port on your development host.
4. Determine the COM port assigned to the USB Serial Port (on Windows, using
   Device Manager is an easy way to do this).
5. Execute the serial terminal application you installed in the previous step
   and open the serial connection with the previously defined COM port (speed
   115200 baud; 8 bits; 1 stop bit; no parity).
6. Push the CFG button on the board. After a few seconds you should see the
   boot log in the terminal, which begins as follows:
```
U-Boot <Versioning info>
CPU: ARC EM11D v5.0 at 40 MHz
Subsys:ARC Data Fusion IP Subsystem
Model: snps,emsdp
Board: ARC EM Software Development Platform v1.0
```
### Generate Application Project for ARC EM SDP
Before building an example or test application, you need to generate a TFLM
project for this application from TensorFlow sources and external dependencies.
To generate it for ARC EM SDP board you need to set `TARGET=arc_emsdp` on the
make command line. For instance, to build the Person Detect test application,
use a shell to execute the following command from the root directory of the
TensorFlow repo:
```
make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET=arc_emsdp
```
The application project will be generated into
*tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_test_int8/make*
Info on generating and building example applications for EM SDP
(*tensorflow/lite/micro/examples*) can be found in the appropriate README file
placed in the same directory as the examples. In general, it is the same
process as described in this README.
The
[embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli)
is used by default to speed up execution of some kernels for asymmetrically
quantized layers. Kernels which use MLI-based implementations are kept in the
*tensorflow/lite/micro/kernels/arc_mli* folder. For applications which may not
benefit from the MLI library, the project can be generated without these
implementations by adding `TAGS=no_arc_mli` in the command line. This can reduce
code size when the optimized kernels are not required.
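For example, to generate the same Person Detection test project for the EM SDP without the MLI-specific kernel implementations, the command might look like this (the same target as above, with the tag added):
```
make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET=arc_emsdp TAGS=no_arc_mli
```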
For more options on embARC MLI usage see
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md).
### Build the Application
You may need to adjust the following commands in order to use the appropriate
make tool available in your environment (i.e. `make` or `gmake`):
1. Open a command shell and change the working directory to the location which
   contains the generated project, as described in the previous section.
2. Clean previous build artifacts (optional):
   make clean
3. Build the application:
   make app
### Run the Application on the Board Using MetaWare Debugger
In case you do not have access to the MetaWare Debugger or have chosen not to
install the Digilent drivers, you can skip to the next section.
To run the application from the console, use the following command:
```
make run
```
If the application runs in an infinite loop, type `Ctrl+C` several times to exit the
debugger.
To run the application in the GUI debugger, use the following command:
```
make debug
```
In both cases you will see the application output in the serial terminal.
### Run the Application on the Board from the microSD Card
1. Use the following command in the same command shell you used for building
   the application, as described in the previous step:
   make flash
2. Copy the content of the created *./bin* folder into the root of the microSD
   card. Note that the card must be formatted as FAT32 with the default cluster
   size (but less than 32 KB).
3. Plug the microSD card into the J11 connector.
4. Push the RST button. If a red LED is lit beside the RST button, push the CFG
   button.
You will see the application output in the serial terminal.
## Custom ARC EM/HS Platform
This section describes how to deploy on a Custom ARC EM/HS platform defined only
by a TCF (Tool Configuration File, created at CPU configuration time) and
optional LCF (Linker Command File). In this case, the real hardware is unknown,
and applications can be run only in the nSIM simulator included with the
MetaWare toolkit.
### Initial Setup
To work with a custom ARC EM/HS platform, you need the following:
- Synopsys MetaWare Development Toolkit version 2019.12 or higher
- A make tool (make or gmake)
See the
[Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit)
section for instructions on toolchain installation, and the
[MetaWare Development Toolkit](#MetaWare-Development-Toolkit) and
[Make Tool](#Make-Tool) sections for additional comments about make versions.
### Generate Application Project
Before building the application itself, you need to generate the project for
this application from TensorFlow sources and external dependencies. To generate
it for a custom TCF you need to set the following variables on the make command
line:
- TARGET_ARCH=arc
- TCF_FILE=<path to TCF file>
- (optional) LCF_FILE=<path to LCF file>
If you don't supply an external LCF file, the one embedded in the TCF will be
used instead.
For instance, to build **Person Detection** test application, use the following
command from the root directory of the TensorFlow repo:
```
make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET_ARCH=arc TCF_FILE=<path_to_tcf_file> LCF_FILE=<path_to_lcf_file>
```
The application project will be generated into
*tensorflow/lite/micro/tools/make/gen/<tcf_file_basename>_arc/prj/person_detection_test_int8/make*
The
[embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli)
is used by default to speed up execution of some kernels for asymmetrically
quantized layers. Kernels which use MLI-based implementations are kept in the
*tensorflow/lite/micro/kernels/arc_mli* folder. For applications which may not
benefit from the MLI library, the project can be generated without these
implementations by adding `TAGS=no_arc_mli` in the command line. This can reduce
code size when the optimized kernels are not required.
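For example, the generation command above with the MLI-specific kernels excluded might look like:
```
make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET_ARCH=arc TCF_FILE=<path_to_tcf_file> TAGS=no_arc_mli
```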
For more options on embARC MLI usage see
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md).
### Build the Application
You may need to adjust the following commands in order to use the appropriate
make tool available in your environment (i.e. `make` or `gmake`):
1. Open a command shell and change the working directory to the location which
   contains the generated project, as described in the previous section.
2. Clean previous build artifacts (optional):
   make clean
3. Build the application:
   make app
### Run the Application with the MetaWare Debugger on the nSIM Simulator
To run the application from the console, use the following command:
```
make run
```
If the application runs in an infinite loop, type `Ctrl+C` several times to exit the
debugger.
To run the application in the GUI debugger, use the following command:
```
make debug
```
You will see the application output in the same console where you ran it.
## License
TensorFlow's code is covered by the Apache2 License included in the repository,
and third-party dependencies are covered by their respective licenses, in the
third_party folder of this package.

View File

@ -0,0 +1,138 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Common Settings for ARC platform and its projects.
# Might be reused across different targets
ifeq ($(TARGET_ARCH), arc)
DLR := $$$$
# List of folders to search project files for copy with path changing
# For instance, TCF and LCF files are copied into the root of generated project
ARC_TARGET_FILES_DIRS ?=
# For the following variables see arc_app_makefile.tpl for usage
# Additional text into application settings section of arc makefile project
ARC_EXTRA_APP_SETTINGS ?=
# Additional text into application general rules of arc makefile project
ARC_EXTRA_APP_RULES ?=
# Additional arguments for the RM command of the "clean" target rule ("make clean" command)
ARC_EXTRA_RM_TARGETS ?=
# Dependencies of "flash" target rule ("make flash" command)
ARC_BIN_DEPEND ?=
# Commands in "flash" target rule ("make flash" command)
ARC_BIN_RULE ?= \t$(DLR)\(error Flash rule isnt defined for this ARC target\)
# Command to run app on "make run" command of generated project
ARC_APP_RUN_CMD ?=
# Command to run app on "make debug" command of generated project
ARC_APP_DEBUG_CMD ?=
# Additional text into application execution rules of arc makefile project
ARC_EXTRA_EXECUTE_RULES ?=
# We override the project generator to exclude everything not relevant to the ARC platform.
# ARC targets cannot work with non-ARC development tools.
# The basic make project is updated to be applicable to the general ARC platform.
define generate_microlite_projects
$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES),$(TARGET_TOOLCHAIN_ROOT),$(TARGET_TOOLCHAIN_PREFIX))
$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES))
endef
# Copy rule generator to do file copies with changing paths in generated project
# Arguments are:
# 1 - Path to files in the generated project.
# 2 - Path to files in the source repo.
# Used in helper_functions.inc for arc projects to copy files
define path_changing_copy_file
$(1)/%: $(2)/%
@mkdir -p $$(dir $$@)
@cp $$< $$@
endef
# These are microcontroller-specific rules for converting the ELF output
# of the linker into a binary image that can be loaded directly.
# Not applicable for ARC, leaving it empty.
$(BINDIR)%.bin:
ifeq ($(ARC_TOOLCHAIN), mwdt)
CC_TOOL := ccac
AR_TOOL := arac
CXX_TOOL := ccac
LD_TOOL := ccac
ARC_APP_RUN_CMD = mdb -run -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS\)
ARC_APP_DEBUG_CMD = mdb -OK -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS\)
# The variable TCF_FILE stores path to Tool Configuration File (*.tcf).
# This file is used by MWDT toolchain to properly compile/run code
TCF_FILE ?=
LCF_FILE ?=
BUILD_ARC_MLI ?= true
# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension),
# this variable is used later to add the option to the linker/compiler flags.
# This condition also handles the case when the user/makefile specifies
# the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying.
ifneq (,$(findstring .tcf,$(TCF_FILE)))
TCF_FILE_NAME = $(notdir $(TCF_FILE))
ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE))
MAKE_PROJECT_FILES += $(TCF_FILE_NAME)
else
TCF_FILE_NAME = $(TCF_FILE)
endif
PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config
PLATFORM_FLAGS += -Hnocopyr -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections
# Use compact CRT. It requires pre-defined heap size
PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset
PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME)
PLATFORM_LDFLAGS += -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K
ifneq ($(LCF_FILE), )
PLATFORM_LDFLAGS += $(notdir $(LCF_FILE))
MAKE_PROJECT_FILES += $(notdir $(LCF_FILE))
ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(LCF_FILE))),)
ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE))
endif
endif
CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS))
CCFLAGS := $(filter-out -std=c11,$(CCFLAGS))
MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS))
CXXFLAGS += $(PLATFORM_FLAGS)
CCFLAGS += $(PLATFORM_FLAGS)
LDFLAGS += $(PLATFORM_LDFLAGS)
endif # ARC_TOOLCHAIN
endif # TARGET_ARCH

View File

@ -0,0 +1,85 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Common EMSDP LCF File for applications
#
# External SRAM memory is used for code, because some TFLM applications include the whole
# set of supported kernels, which doesn't fit into ICCM0.
# This could slow performance a bit. Smaller applications can use ICCM0 instead.
#
# External PSRAM is used for potentially big sections. In particular:
# - .rodata_in_data, which typically includes the serialized model.
# - other .data, which typically includes the tensor arena.
#
# The stack and heap are kept in DCCM, which is the memory closest to the core.
# CCMWRAP memory regions indicate unusable portions of the address space
# due to CCM memory wrapping into upper addresses beyond its size
MEMORY {
PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400
SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000
IVT : ORIGIN = 0x60000000, LENGTH = 0x400
ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400)
# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000
DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000
# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000
XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000
# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000
YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000
# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000
}
SECTIONS {
GROUP BLOCK(4) : {
.vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4)
} > IVT
GROUP BLOCK(4): {
.text? : { *('.text$crt*') }
* (TEXT): {}
* (LIT): {}
} > SRAM
GROUP BLOCK(4): {
.Zdata? : {}
.stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32K): {}
.heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {}
} > DCCM
GROUP BLOCK(4): {
.Xdata? : {}
} > XCCM
GROUP BLOCK(4): {
.Ydata? : {}
} > YCCM
GROUP BLOCK(4): {
/* _SDA_BASE_ computed implicitly */
.sdata?: {}
.sbss?: {}
* (DATA): {}
* (BSS): {}
} > PSRAM
GROUP BLOCK(4): {
.rodata_in_data? : {}
} > PSRAM
GROUP BLOCK(4): {
.debug_log? : {}
} > SRAM
}

View File

@ -0,0 +1,74 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Differences from the common EMSDP LCF file (to reduce data access time):
# - data is moved from external PSRAM to DCCM
# - text is moved from SRAM to ICCM
#
# CCMWRAP memory regions indicate unusable portions of the address space
# due to CCM memory wrapping into upper addresses beyond its size
MEMORY {
PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400
SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000
IVT : ORIGIN = 0x60000000, LENGTH = 0x400
ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400)
# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000
DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000
# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000
XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000
# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000
YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000
# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000
}
SECTIONS {
GROUP BLOCK(4) : {
.vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4)
} > IVT
GROUP BLOCK(4): {
.text? : { *('.text$crt*') }
* (TEXT): {}
* (LIT): {}
} > ICCM0
GROUP BLOCK(4): {
.rodata_in_data? : {}
} > PSRAM
GROUP BLOCK(4): {
.debug_log? : {}
} > SRAM
GROUP BLOCK(4): {
/* _SDA_BASE_ computed implicitly */
.sdata?: {}
.sbss?: {}
* (DATA): {}
* (BSS): {}
.Zdata? : {}
.stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {}
.heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {}
} > DCCM
GROUP BLOCK(4): {
.Xdata? : {}
} > XCCM
GROUP BLOCK(4): {
.Ydata? : {}
} > YCCM
}

View File

@ -0,0 +1,73 @@
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Settings for EMSDP target (ARC processor)
ifeq ($(TARGET), arc_emsdp)
TARGET_ARCH := arc
ARC_TOOLCHAIN := mwdt
BUILD_ARC_MLI := false
ARC_MLI_PRE_COMPILED_TARGET := emsdp_em11d_em9d_dfss
ifneq ($(filter no_arc_mli,$(ALL_TAGS)),)
MLI_LIB_DIR = arc_mli_package
$(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),))
else ifeq ($(BUILD_ARC_MLI), true)
MLI_LIB_DIR = arc_mli_$(ARC_MLI_PRE_COMPILED_TARGET)
endif
TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/hw/emsdp_em11d_em9d_dfss.tcf
LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf
UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env
UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE))
include $(MAKEFILE_DIR)/targets/arc/arc_common.inc
ARC_EXTRA_APP_SETTINGS = \
BIN_DIR = .$(DLR)\(PS\)bin\n\
BIN_FILE = $(DLR)\(BIN_DIR\)$(DLR)\(PS\)app.elf\n
ARC_EXTRA_APP_RULES = \
$(DLR)\(BIN_FILE\): $(DLR)\(BIN_DIR\) $(DLR)\(OUT_NAME\)\
\n\t\@$(DLR)\(CP\) $(DLR)\(OUT_NAME\) $(DLR)\(BIN_FILE\)\
\n\t\@$(DLR)\(CP\) $(UBOOT_FILE_NAME) $(DLR)\(BIN_DIR\)$(DLR)\(PS\)$(UBOOT_FILE_NAME)\
\n \
\n$(DLR)\(BIN_DIR\):\
\n\t\@$(DLR)\(MKDIR\) $(DLR)\(BIN_DIR\)\
ARC_EXTRA_RM_TARGETS = $(DLR)\(BIN_DIR\)
ARC_BIN_DEPEND = $(DLR)\(BIN_DIR\) $(DLR)\(BIN_FILE\)
ARC_BIN_RULE = \t@echo Copy content of $(DLR)\(BIN_DIR\) into the root of SD card and follow instructions
ARC_APP_RUN_CMD = mdb -run -digilent -nooptions $(DLR)\(DBG_ARGS\)
ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS\)
ARC_EXTRA_EXECUTE_RULES =
MAKE_PROJECT_FILES += $(UBOOT_FILE_NAME)
ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(UBOOT_FILE))),)
ARC_TARGET_FILES_DIRS += $(dir $(UBOOT_FILE))
endif
MAKE_PROJECT_FILES := $(filter-out README_MAKE.md, $(MAKE_PROJECT_FILES)) README_ARC_EMSDP.md
# For the default EMSDP configuration we can use the em9d_va runtime libs.
# For better performance, the runtime should be built for the EMSDP configuration.
# No hostlink library is used, to keep code size smaller.
PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio -Hhostlib=
endif

View File

@ -1,86 +1,40 @@
# Settings for arc processors
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Settings for ARC processors that are not pre-defined.
# The user needs to specify the ARC target with a Tool Configuration File (*.tcf).
# The path to this file must be passed through the TCF_FILE variable.
# Otherwise, the default em7d_voice_audio configuration is used.
ifeq ($(TARGET_ARCH), arc)
CC_TOOL = ccac
AR_TOOL = arac
CXX_TOOL = ccac
# Known targets are specified with their own make configurations.
ifeq ($(filter $(TARGET), arc_emsdp),)
ARC_TOOLCHAIN := mwdt
ifneq ($(TCF_FILE), )
TARGET = $(basename $(notdir $(TCF_FILE)))
else
$(warning TCF_FILE variable is not specified. Use default em7d_voice_audio configuration)
TARGET = em7d_voice_audio
TCF_FILE = em7d_voice_audio
endif
# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags.
# This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying.
ifneq (,$(findstring .tcf,$(TCF_FILE)))
TCF_FILE_NAME = $(notdir $(TCF_FILE))
THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME)
else
TCF_FILE_NAME = $(TCF_FILE)
endif
include $(MAKEFILE_DIR)/targets/arc/arc_common.inc
PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -fslp-vectorize-aggressive -ffunction-sections -fdata-sections
PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map
MAKE_PROJECT_FILES := $(filter-out README_MAKE.md, $(MAKE_PROJECT_FILES)) README_ARC.md
CXXFLAGS += $(PLATFORM_FLAGS)
CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS))
CCFLAGS += $(PLATFORM_FLAGS)
LDFLAGS += $(PLATFORM_LDFLAGS)
endif # $(TARGET)
endif # $(TARGET_ARCH)...
MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS))
USE_EMBARC_MLI ?= true
ifeq ($(USE_EMBARC_MLI), true)
ALL_TAGS += arc
ifeq ($(PRE_COMPILED_MLI),true)
$(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,))
MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include
MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a
THIRD_PARTY_CC_HDRS += \
third_party/embarc_osp/LICENSE
else
MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME))
$(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE)))
MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include
MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a
MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_LIB_DIR)/LICENSE
endif
THIRD_PARTY_CC_HDRS += $(MLI_LIB)
GENERATED_PROJECT_LIBS += $(MLI_LIB)
INCLUDES += \
-I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \
-I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api
GENERATED_PROJECT_INCLUDES += \
-I. \
-I./third_party/$(MLI_INCLUDE_FOLDER) \
-I./third_party/$(MLI_INCLUDE_FOLDER)/api
THIRD_PARTY_CC_HDRS += \
third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \
third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \
third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \
endif # USE_EMBARC_MLI
endif

View File

@ -0,0 +1,45 @@
# TensorFlow Lite Micro ARC Make Project
This folder has been autogenerated by TensorFlow, and contains the sources, headers, and project files needed to build a single TensorFlow Lite Micro application using the make tool and a Synopsys DesignWare ARC processor compatible toolchain, specifically the ARC MetaWare Development Toolkit (MWDT).
This project has been generated for a target defined by TCF file only (Tool Configuration File). The real target board is unspecified, and applications can be run only in the nSIM simulator included with MWDT.
See
[tensorflow/lite/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro)
for details on how projects like this can be generated from the main source tree.
## Usage
See [Custom ARC EM/HS Platform](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#Custom-ARC-EMHS-Platform) section for more detailed information on requirements and usage of this project.
The Makefile contains all the information on building and running the project. You can modify it to satisfy your specific needs. The following actions are available out of the box. You may need to adjust the commands to use the appropriate make tool available in your environment (i.e. `make` or `gmake`):
1. Build the application.
make app
2. Build the application passing additional flags to compiler.
make app EXT_CFLAGS=[additional compiler flags]
3. Build the application and strip out the TFLM reference kernel fallback implementations in order to reduce code size. This only has an effect if the project was generated with MLI support. See more info in [EmbARC MLI Library Based Optimizations](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/kernels/arc_mli/README.md). `false` is the default value.
make app MLI_ONLY=[true|false]
4. Delete all artifacts created during build.
make clean
5. Run the application with the nSIM simulator in console mode.
make run
6. Run the application with the nSIM simulator, but using the MetaWare Debugger GUI for further execution/debugging capabilities.
make debug
## License
TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package.

View File

@ -0,0 +1,48 @@
# TensorFlow Lite Micro ARC Make Project for EM SDP Board.
This folder has been autogenerated by TensorFlow, and contains the source, header, and project files needed to build a single TensorFlow Lite Micro target using the make tool and a Synopsys DesignWare ARC processor compatible toolchain, specifically the ARC MetaWare Development Toolkit (MWDT).
This project has been generated for the ARC EM Software Development Platform (EM SDP). The built application can be run only on this platform.
See
[tensorflow/lite/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro)
for details on how projects like this can be generated from the main source tree.
## Usage
See [ARC EM Software Development Platform](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) section for more detailed information on requirements and usage of this project.
The Makefile contains all the information on building and running the project. You can modify it to satisfy your specific needs. The following actions are available out of the box. You may need to adjust the commands to use the appropriate make tool available in your environment (i.e. `make` or `gmake`):
1. Build the application.
make app
2. Build the application passing additional flags to compiler.
make app EXT_CFLAGS=[additional compiler flags]
3. Build the application and strip out the TFLM reference kernel fallback implementations in order to reduce code size. This only has an effect if the project was generated with MLI support. See more info in [EmbARC MLI Library Based Optimizations](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/kernels/arc_mli/README.md). `false` is the default value.
make app MLI_ONLY=[true|false]
4. Delete all artifacts created during build.
make clean
5. Run the application with the nSIM simulator in console mode.
make run
6. Load the application and open MetaWare Debugger GUI for further execution/debugging.
make debug
7. Generate necessary artefacts for self-booting execution from flash. See [reference to Run the application on the board from the micro SD card](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#Run-the-Application-on-the-Board-from-the-microSD-Card).
make flash
## License
TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package.

View File

@ -0,0 +1,114 @@
#=============================================================
# OS-specific definitions
#=============================================================
COMMA=,
OPEN_PAREN=(
CLOSE_PAREN=)
BACKSLASH=\$(nullstring)
ifneq ($(ComSpec)$(COMSPEC),)
O_SYS=Windows
RM=del /F /Q
MKDIR=mkdir
CP=copy /Y
TYPE=type
PS=$(BACKSLASH)
Q=
coQ=\$(nullstring)
fix_platform_path = $(subst /,$(PS), $(1))
DEV_NULL = nul
else
O_SYS=Unix
RM=rm -rf
MKDIR=mkdir -p
CP=cp
TYPE=cat
PS=/
Q=$(BACKSLASH)
coQ=
fix_platform_path=$(1)
DEV_NULL=/dev/null
endif
#=============================================================
# Toolchain definitions
#=============================================================
CC = %{CC}%
CXX = %{CXX}%
LD = %{LD}%
#=============================================================
# Applications settings
#=============================================================
OUT_NAME = %{EXECUTABLE}%
DBG_ARGS ?=
RUN_ARGS ?=
EXT_CFLAGS ?=
CXXFLAGS += %{CXX_FLAGS}%
CCFLAGS += %{CC_FLAGS}%
LDFLAGS += %{LINKER_FLAGS}%
%{EXTRA_APP_SETTINGS}%
#=============================================================
# Files and directories
#=============================================================
SRCS := \
%{SRCS}%
OBJS := \
$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS)))
#=============================================================
# Common rules
#=============================================================
.PHONY: all app flash clean run debug
%.o: %.cc
$(CXX) $(CXXFLAGS) $(EXT_CFLAGS) $(INCLUDES) -c $< -o $@
%.o: %.c
$(CC) $(CCFLAGS) $(EXT_CFLAGS) $(INCLUDES) -c $< -o $@
$(OUT_NAME): $(OBJS)
$(LD) $(CXXFLAGS) -o $@ -Ccrossref $(OBJS) $(LDFLAGS)
%{EXTRA_APP_RULES}%
#=================================================================
# Global rules
#=================================================================
all: $(OUT_NAME)
app: $(OUT_NAME)
flash: %{BIN_DEPEND}%
%{BIN_RULE}%
clean:
-@$(RM) $(call fix_platform_path,$(OBJS))
-@$(RM) $(OUT_NAME) %{EXTRA_RM_TARGETS}%
#=================================================================
# Execution rules
#=================================================================
APP_RUN := %{APP_RUN_CMD}%
APP_DEBUG := %{APP_DEBUG_CMD}%
run: $(OUT_NAME)
$(APP_RUN) $(OUT_NAME) $(RUN_ARGS)
debug: $(OUT_NAME)
$(APP_DEBUG) $(OUT_NAME) $(RUN_ARGS)
%{EXTRA_EXECUTE_RULES}%

View File

@ -71,11 +71,11 @@ PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab"
PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip"
PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc"
EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip"
EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776"
EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/58284867ca52d1f43b25045e8601999d7359d986.zip"
EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef"
EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip"
EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56"
EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC2/embARC_MLI_package.zip"
EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6"
XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip"
XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b"