From f3f5afbe7bf3499e8735df0655344e7dc7ae554e Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Wed, 26 Feb 2020 10:38:36 +0000 Subject: [PATCH 001/557] docs: add tip to prefer tf.shape(x) over x.shape when writing custom layers/models See #36991 for details. --- tensorflow/python/ops/array_ops.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 50afcfbc6e0..4f03b985b69 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -557,6 +557,14 @@ def shape_v2(input, out_type=dtypes.int32, name=None): >>> a.shape TensorShape([None, None, 10]) + + However, when defining custom layers and models that will be run in graph mode + at some point, prefer `tf.shape(x)` over `x.shape`. `x.shape` is the static shape + of `x` and usually evaluates to `None` in the first dimension during graph + construction (to represent the as yet unknown batch size). This can cause problems in + function calls like `tf.zeros(x.shape[0])` which don't support `None` values. + `tf.shape(x)` on the other hand gives the dynamic shape of `x` which isn't + evaluated until training/predicting begins where the full shape of `x` is known. `tf.shape` and `Tensor.shape` should be identical in eager mode. Within `tf.function` or within a `compat.v1` context, not all dimensions may be From 6f042c81d73079d226c10cc21832d4b2e61ca32a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Fri, 20 Mar 2020 07:22:09 +0100 Subject: [PATCH 002/557] TFLu: remove -fno-builtin compiler flag The flag may cause performance issues, since it disables special handling and optimizations of standard C library functions. --- tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc | 1 - 6 files changed, 6 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc index 9494158cd50..aa221174d0c 100644 --- a/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc @@ -40,7 +40,6 @@ $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)/$(SF_BSPS_DEST): $(MAKEFILE_DIR)/downlo -fmessage-length=0 \ -fno-exceptions \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ diff --git a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc index 878067cf083..3f3e2ce425d 100644 --- a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc @@ -19,7 +19,6 @@ ifeq ($(TARGET), bluepill) -fmessage-length=0 \ -fno-exceptions \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ diff --git a/tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc b/tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc index 8b24f5beb92..e899cbd0672 100644 --- a/tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc @@ -27,7 +27,6 @@ ifeq ($(TARGET), ecm3531) -fmessage-length=0 \ -fno-exceptions \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ diff --git a/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc b/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc index 85e5aa7154d..bfeec5e55a2 100644 --- a/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc @@ -39,7 +39,6 @@ ifeq ($(TARGET), hexagon) -fdata-sections \ -ffunction-sections \ -fmessage-length=0 \ - -fno-builtin \ -fno-delete-null-pointer-checks \ -fno-exceptions \ -fno-register-global-dtors-with-atexit \ diff --git a/tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc b/tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc index 7336c520b11..9062f25254e 100644 --- a/tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc @@ -16,7 +16,6 @@ ifeq ($(TARGET), riscv32_mcu) -DTF_LITE_MCU_DEBUG_LOG \ -DTF_LITE_USE_GLOBAL_ROUND \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ diff --git a/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc index 539f4528d06..24b36f119a2 100644 --- a/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc @@ -16,7 +16,6 @@ ifeq ($(TARGET), stm32f4) -fmessage-length=0 \ -fno-exceptions \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ From 5e4ce4f0776772798cbe0036b3b42a4aa416fabe Mon Sep 17 00:00:00 2001 From: Marcin Sielski Date: Mon, 13 Apr 2020 16:46:53 +0200 Subject: [PATCH 003/557] Fix a bug related to build TF Lite on RPI Zero. Why: * Enable to build TF Lite on RPI Zero. This change addresses the need by: * Changing compiler from arm-linux-gnueabi- to arm-linux-gnueabihf-. --- tensorflow/lite/tools/make/targets/rpi_makefile.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/make/targets/rpi_makefile.inc b/tensorflow/lite/tools/make/targets/rpi_makefile.inc index 2225848ae64..71046d08131 100644 --- a/tensorflow/lite/tools/make/targets/rpi_makefile.inc +++ b/tensorflow/lite/tools/make/targets/rpi_makefile.inc @@ -32,7 +32,7 @@ ifeq ($(TARGET),rpi) # TODO(petewarden) In the future, we'll want to use OpenBLAS as a faster # alternative to Eigen on non-NEON ARM hardware like armv6. ifeq ($(TARGET_ARCH), armv6) - TARGET_TOOLCHAIN_PREFIX := arm-linux-gnueabi- + TARGET_TOOLCHAIN_PREFIX := arm-linux-gnueabihf- CXXFLAGS += \ -march=armv6 \ -mfpu=vfp \ From 966ed1cafc770e81e6a56be3f5715e0fe257b742 Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Thu, 16 Apr 2020 18:41:20 +0800 Subject: [PATCH 004/557] Use provided host name/ip instead of localhost if possible --- .../distributed_runtime/rpc/grpc_server_lib.cc | 15 +++++++++++---- .../distributed_runtime/rpc/grpc_server_lib.h | 5 ++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 32083fc272f..7e2c42dabea 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -132,8 +132,9 @@ GrpcServer::~GrpcServer() { void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {} // Look up the port that has been requested for this task in `server_def`. -Status GrpcServer::GetPort(const ServerDef& server_def, int* port) const { +Status GrpcServer::GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const { *port = -1; + *host_name = "localhost"; for (const auto& job : server_def.cluster().job()) { if (job.name() == server_def.job_name()) { auto iter = job.tasks().find(server_def.task_index()); @@ -153,6 +154,10 @@ Status GrpcServer::GetPort(const ServerDef& server_def, int* port) const { "Could not parse port for local server from \"", iter->second, "\"."); } + + if (colon_index != string::npos && !iter->second.substr(0, colon_index).empty()) { + *host_name = iter->second.substr(0, colon_index); + } } break; } @@ -175,7 +180,9 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { // otherwise if 'task_index=-1' the program will abort. int requested_port; - TF_RETURN_IF_ERROR(GetPort(server_def_, &requested_port)); + string host_name; + TF_RETURN_IF_ERROR(GetHostAndPort(server_def_, &host_name, &requested_port)); + host_name_ = host_name; SessionOptions sess_opts; ConfigProto config = server_def_.default_session_config(); @@ -325,7 +332,7 @@ Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options, task.second); } if (job.name() == *options.job_name && task.first == options.task_index) { - host_port = strings::StrCat("localhost:", bound_port_); + host_port = strings::StrCat(host_name_, ":", bound_port_); } else { host_port = task.second; } @@ -478,7 +485,7 @@ Status GrpcServer::Join() { } const string GrpcServer::target() const { - return strings::StrCat("grpc://localhost:", bound_port_); + return strings::StrCat("grpc://", host_name_, ":", bound_port_); } std::shared_ptr<::grpc::ServerCredentials> GrpcServer::GetServerCredentials( diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h index 8e25b8835eb..feb174cde4e 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -104,7 +104,7 @@ class GrpcServer : public ServerInterface { Status UpdateServerDef(const ServerDef& server_def); protected: - virtual Status GetPort(const ServerDef& server_def, int* port) const; + virtual Status GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const; Status Init(const GrpcServerOptions& opts = GrpcServerOptions()); // A subclass can override this method to support secure credentials. @@ -136,6 +136,9 @@ class GrpcServer : public ServerInterface { // The port to which this server is bound. int bound_port_ = 0; + // The host name of this server + string host_name_; + // Guards server configuration, server, and state. mutex mu_; From 8a25f427db3d3dc5c9ddffc775b4c7dd4a96a6f9 Mon Sep 17 00:00:00 2001 From: Teng Lu Date: Fri, 17 Apr 2020 16:36:57 +0800 Subject: [PATCH 005/557] Enabe BF16 SoftmaxGrad(Sum), and fix accuracy by accum type. --- tensorflow/core/kernels/reduction_ops.h | 25 ++++++++++++++++++++++++- tensorflow/core/ops/nn_grad.cc | 4 ++++ tensorflow/python/ops/math_ops_test.py | 10 ++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h index 46d8051fff1..8814a2eb467 100644 --- a/tensorflow/core/kernels/reduction_ops.h +++ b/tensorflow/core/kernels/reduction_ops.h @@ -19,9 +19,9 @@ limitations under the License. // Functor definitions for Reduction ops, must be compilable by nvcc. #include -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { namespace functor { @@ -58,6 +58,29 @@ struct ReduceEigenImpl { } }; +// Specialization for BF16 Reducer to fix accuracy. +// TODO: all BF16 Reducer should have specialization to fix accuracy. +#define CASTING_SPECIALIZATION(Reducer, ScalarType, IntermediateType) \ + template \ + struct ReduceEigenImpl> { \ + void operator()(const Device& d, OUT_T out, IN_T in, \ + const ReductionAxes& reduction_axes, \ + const Reducer& reducer) { \ + static_assert(std::is_same::value, \ + ""); \ + Reducer intermediate_reducer; \ + auto in_as_intermediate = in.template cast(); \ + out.device(d) = \ + in_as_intermediate.reduce(reduction_axes, intermediate_reducer) \ + .template cast(); \ + } \ + }; + +CASTING_SPECIALIZATION(Eigen::internal::SumReducer, bfloat16, float); +#undef CASTING_SPECIALIZATION + template struct ReduceEigenImpl Date: Sun, 26 Apr 2020 18:36:07 -0700 Subject: [PATCH 006/557] Update examples in docstring to use TF 2.x code The examples in docstrings of two APIs, tf.histogram_fixed_width_bins and tf.histogram_fixed_width still used TF 1.x code. This PR updates the docstring to use TF 2.x code in examples. Signed-off-by: Yong Tang --- tensorflow/python/ops/histogram_ops.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 92f3e7a24ba..009f9f63f48 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -68,10 +68,8 @@ def histogram_fixed_width_bins(values, value_range = [0.0, 5.0] new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - with tf.compat.v1.get_default_session() as sess: - indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) - variables.global_variables_initializer().run() - sess.run(indices) # [0, 0, 1, 2, 4, 4] + indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) + indices # [0, 0, 1, 2, 4, 4] ``` """ with ops.name_scope(name, 'histogram_fixed_width_bins', @@ -137,10 +135,8 @@ def histogram_fixed_width(values, value_range = [0.0, 5.0] new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - with tf.compat.v1.get_default_session() as sess: - hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) - variables.global_variables_initializer().run() - sess.run(hist) => [2, 1, 1, 0, 2] + hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) + hist # [2, 1, 1, 0, 2] ``` """ with ops.name_scope(name, 'histogram_fixed_width', From c7a16159f71bc5beb9a3fc35cc97a9e5b9f94d40 Mon Sep 17 00:00:00 2001 From: Kayou Date: Mon, 27 Apr 2020 14:18:08 +0200 Subject: [PATCH 007/557] Update check_cuda_libs.py --- third_party/gpus/check_cuda_libs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/gpus/check_cuda_libs.py b/third_party/gpus/check_cuda_libs.py index b7b36e6466e..728d178afec 100644 --- a/third_party/gpus/check_cuda_libs.py +++ b/third_party/gpus/check_cuda_libs.py @@ -59,7 +59,7 @@ def check_cuda_lib(path, check_soname=True): objdump = which("objdump") if check_soname and objdump is not None and not _is_windows(): # Decode is necessary as in py3 the return type changed from str to bytes - output = subprocess.check_output([objdump, "-p", path]).decode("ascii") + output = subprocess.check_output([objdump, "-p", path]).decode("utf-8") output = [line for line in output.splitlines() if "SONAME" in line] sonames = [line.strip().split(" ")[-1] for line in output] if not any([soname == os.path.basename(path) for soname in sonames]): From fe3a4bcf2f7d0be92b6b70de43cd05d61cb0e025 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 28 Apr 2020 09:00:24 -0700 Subject: [PATCH 008/557] Update tf.histogram_fixed_width docstring to comform to Python doctest Signed-off-by: Yong Tang --- tensorflow/python/ops/histogram_ops.py | 30 ++++++++++++++------------ 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 009f9f63f48..3ef711a838f 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -63,13 +63,14 @@ def histogram_fixed_width_bins(values, Examples: ```python - # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - nbins = 5 - value_range = [0.0, 5.0] - new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - - indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) - indices # [0, 0, 1, 2, 4, 4] + >>> # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) + ... + >>> nbins = 5 + >>> value_range = [0.0, 5.0] + >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] + >>> indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) + >>> print(indices) + tf.Tensor([0 0 1 2 4 4], shape=(6,), dtype=int32) ``` """ with ops.name_scope(name, 'histogram_fixed_width_bins', @@ -130,13 +131,14 @@ def histogram_fixed_width(values, Examples: ```python - # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - nbins = 5 - value_range = [0.0, 5.0] - new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - - hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) - hist # [2, 1, 1, 0, 2] + >>> # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) + ... + >>> nbins = 5 + >>> value_range = [0.0, 5.0] + >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] + >>> hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) + >>> print(hist) + tf.Tensor([2 1 1 0 2], shape=(5,), dtype=int32) ``` """ with ops.name_scope(name, 'histogram_fixed_width', From 58a378f9f608c942ffe66ba12cc85f8d8fc3e7a4 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 28 Apr 2020 12:49:13 -0700 Subject: [PATCH 009/557] Remove `print` in docstring as it causes discrepancy in doctest Signed-off-by: Yong Tang --- tensorflow/python/ops/histogram_ops.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 3ef711a838f..ffdd900ec71 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -68,9 +68,8 @@ def histogram_fixed_width_bins(values, >>> nbins = 5 >>> value_range = [0.0, 5.0] >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - >>> indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) - >>> print(indices) - tf.Tensor([0 0 1 2 4 4], shape=(6,), dtype=int32) + >>> tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) + ``` """ with ops.name_scope(name, 'histogram_fixed_width_bins', @@ -136,9 +135,8 @@ def histogram_fixed_width(values, >>> nbins = 5 >>> value_range = [0.0, 5.0] >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - >>> hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) - >>> print(hist) - tf.Tensor([2 1 1 0 2], shape=(5,), dtype=int32) + >>> tf.histogram_fixed_width(new_values, value_range, nbins=5) + ``` """ with ops.name_scope(name, 'histogram_fixed_width', From 3a8b6ba5c1c8c2111c53490eba3f0c1a07f2494a Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Wed, 29 Apr 2020 10:35:01 +0800 Subject: [PATCH 010/557] Edit according to PR comments --- .../core/distributed_runtime/rpc/grpc_server_lib.cc | 8 ++++---- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 7e2c42dabea..2cfdde5f56f 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -132,7 +132,9 @@ GrpcServer::~GrpcServer() { void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {} // Look up the port that has been requested for this task in `server_def`. -Status GrpcServer::GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const { +Status GrpcServer::GetHostAndPort(const ServerDef& server_def, + string* host_name, + int* port) const { *port = -1; *host_name = "localhost"; for (const auto& job : server_def.cluster().job()) { @@ -180,9 +182,7 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { // otherwise if 'task_index=-1' the program will abort. int requested_port; - string host_name; - TF_RETURN_IF_ERROR(GetHostAndPort(server_def_, &host_name, &requested_port)); - host_name_ = host_name; + TF_RETURN_IF_ERROR(GetHostAndPort(server_def_, &host_name_, &requested_port)); SessionOptions sess_opts; ConfigProto config = server_def_.default_session_config(); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h index feb174cde4e..8ecf0e158bf 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -104,7 +104,9 @@ class GrpcServer : public ServerInterface { Status UpdateServerDef(const ServerDef& server_def); protected: - virtual Status GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const; + virtual Status GetHostAndPort(const ServerDef& server_def, + string* host_name, + int* port) const; Status Init(const GrpcServerOptions& opts = GrpcServerOptions()); // A subclass can override this method to support secure credentials. From 1c1203f4566d085f1ca8fd37c8313bb7b00170b1 Mon Sep 17 00:00:00 2001 From: Ajay P Date: Wed, 29 Apr 2020 06:10:49 +0000 Subject: [PATCH 011/557] Fixed eager mode gradient checkpointing by eliminating unecessary persistence of intermediate activations in memory --- tensorflow/python/ops/custom_gradient.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index 4040a4db038..a20619f5be7 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -406,14 +406,17 @@ def _graph_mode_decorator(f, args, kwargs): def _eager_mode_decorator(f, args, kwargs): """Implement custom gradient decorator for eager mode.""" - with tape_lib.VariableWatcher() as variable_watcher: - result, grad_fn = f(*args, **kwargs) + + trainable_vars = [] + if 'trainable_variables' in kwargs: + trainable_vars = kwargs.pop('trainable_variables') + result, grad_fn = f(*args, **kwargs) all_inputs = list(args) + list(kwargs.values()) # The variables that grad_fn needs to return gradients for are the set of # variables used that are *not* part of the inputs. variables = [ v.deref() # pylint: disable=g-complex-comprehension - for v in set(v.ref() for v in variable_watcher.watched_variables()) + for v in set(v.ref() for v in trainable_vars) if all(v.deref() is not i for i in all_inputs) ] grad_argspec = tf_inspect.getfullargspec(grad_fn) @@ -483,7 +486,8 @@ def recompute_grad(f): """Inner function closure for calculating gradients.""" current_var_scope = variable_scope.get_variable_scope() - result = f(*args, **kwargs) + with tape_lib.stop_recording(): + result = f(*args, **kwargs) def grad(*dresult, **grad_kwargs): """Gradient function calculation for inner function.""" From 441d6983812af97104aa3453b09f3f411117d6c3 Mon Sep 17 00:00:00 2001 From: jacco Date: Tue, 14 Jan 2020 09:52:26 +0100 Subject: [PATCH 012/557] Use datamove in conv wrapper --- tensorflow/lite/micro/kernels/arc/conv.cc | 57 +++++-- .../lite/micro/kernels/arc/scratch_buffers.cc | 146 ++++++++++++++++++ .../lite/micro/kernels/arc/scratch_buffers.h | 42 +++++ .../micro/tools/make/targets/arc_makefile.inc | 5 + .../tools/make/third_party_downloads.inc | 4 +- 5 files changed, 235 insertions(+), 19 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/arc/scratch_buffers.cc create mode 100644 tensorflow/lite/micro/kernels/arc/scratch_buffers.h diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 69542e12e90..46be76a407b 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -25,6 +25,9 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" + +#include "mli_api.h" namespace tflite { namespace ops { @@ -139,7 +142,7 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorData(im2col), nullptr); } -void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, +TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, @@ -195,24 +198,43 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - mli_point_to_subtsr_cfg substr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg substr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; + // Get first input from batch + mli_point_to_subtsr_cfg subtsr_cfg_in = { {0, 0}, 2, static_cast(mli_in.shape[1]) }; + mli_point_to_subtsr_cfg subtsr_cfg_out = { {0, 0}, 2, static_cast(mli_out.shape[1]) }; + mli_tensor sub_mli_in = { 0 }; + mli_tensor sub_mli_out = { 0 }; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - const int batches = - MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + + mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); + mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); + const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); for (int i = 0; i < batches; i++) { - substr_cfg_in.start_coord[0] = i; - substr_cfg_out.start_coord[0] = i; - mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out); - - mli_krn_conv2d_hwc_sa8_sa8_sa32(&sub_mli_in, &mli_weights, &mli_bias, - &cfg, &sub_mli_out); + mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); + mli_krn_conv2d_hwc_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &cfg, &out_local); + mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + subtsr_cfg_in.start_coord[0]++; + subtsr_cfg_out.start_coord[0]++; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + if (in_is_local) { + in_local.data = sub_mli_in.data; + } + if (out_is_local) { + out_local.data = sub_mli_out.data; + } } } else { ConvParams op_params; @@ -233,6 +255,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); } + return kTfLiteOk; } void EvalFloat(TfLiteContext* context, TfLiteNode* node, @@ -309,7 +332,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { nullptr, output); break; case kTfLiteInt8: - EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, + return EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, output, nullptr); break; case kTfLiteUInt8: diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc new file mode 100644 index 00000000000..2ac60dd0f25 --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -0,0 +1,146 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include + +/* by default use all the XY memory, and half of the DCCM because DCCM is also used + * for the data section and the stack. + * the values can be overruled by adding a -D option to the makefile of the application + */ +#ifndef SCRATCH_MEM_X_SIZE +#ifdef core_config_xy_size +#define SCRATCH_MEM_X_SIZE (core_config_xy_size) +#else +#define SCRATCH_MEM_X_SIZE (0) +#endif +#endif + +#ifndef SCRATCH_MEM_Y_SIZE +#ifdef core_config_xy_size +#define SCRATCH_MEM_Y_SIZE (core_config_xy_size) +#else +#define SCRATCH_MEM_Y_SIZE (0) +#endif +#endif + +#ifndef SCRATCH_MEM_Z_SIZE +#ifdef core_config_dccm_size +#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) +#else +#define SCRATCH_MEM_Z_SIZE (0) +#endif +#endif + +namespace { +#pragma Data(".Xdata") + static int8_t scratch_mem_x[SCRATCH_MEM_X_SIZE]; +#pragma Data() + +#pragma Data(".Ydata") + static int8_t scratch_mem_y[SCRATCH_MEM_Y_SIZE]; +#pragma Data() + +#pragma Data(".Zdata") + static int8_t scratch_mem_z[SCRATCH_MEM_Z_SIZE]; +#pragma Data() +} + +static inline +bool inside_arc_dccm(void* p) { +#if core_config_dccm_present + return ((unsigned)p >= core_config_dccm_base) && ((unsigned)p < core_config_dccm_base + core_config_dccm_size); +#else + return false; +#endif +} +static inline +bool inside_arc_xccm(void* p) { +#if core_config_xy + return ((unsigned)p >= core_config_xy_x_base) && ((unsigned)p < core_config_xy_x_base + core_config_xy_size); +#else + return false; +#endif +} +static inline +bool inside_arc_yccm(void* p) { +#if core_config_xy + return ((unsigned)p >= core_config_xy_y_base) && ((unsigned)p < core_config_xy_y_base + core_config_xy_size); +#else + return false; +#endif +} + +static inline +bool inside_arc_ccm(void* p) { + return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p); +} + +TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out) { +#ifdef __Xxy + // Function to assign fast memory from one of 3 scratch buffers. + // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused + mli_tensor* tensors[3] = { weights, in, out }; + uint32_t tensor_sizes[3] = { + mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0), mli_hlp_count_elem_num(tensors[2], 0) }; + bool mem_is_free[3] = { true, true, true }; + int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; + uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; + + for (int i = 0; i < 3; ++i) { + int best_mem_idx = -1; + int best_mem_delta = INT_MAX; + // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. + if (inside_arc_ccm(tensors[i]->data)) continue; + for (int j = 0; j < 3; ++j) { + // Best Fit + if (mem_is_free[j] && tensor_sizes[i] <= scratch_sizes[j] && scratch_sizes[j] - tensor_sizes[i] < best_mem_delta) { + best_mem_idx = j; + best_mem_delta = scratch_sizes[j] - tensor_sizes[i]; + } + } + if (best_mem_idx >= 0) { + tensors[i]->data = static_cast(scratch_mem[best_mem_idx]); + tensors[i]->capacity = scratch_sizes[best_mem_idx]; + mem_is_free[best_mem_idx] = false; + } else { + return kTfLiteError; + } + } + + // Bias is expected to be much smaller than other operands, not affect performance and can be placed + // in the end of some of already used memory bank (to occupy free space of it) + bool is_bias_allocated = inside_arc_ccm(bias->data); + if (!is_bias_allocated) { + uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); + for (int i = 0; i < 3; ++i) { + if (tensors[i]->capacity - tensor_sizes[i] > bias_mem_requirements) { + bias->data = &((char*)tensors[i]->data)[tensor_sizes[i]]; + bias->capacity = bias_mem_requirements; + tensors[i]->capacity = tensor_sizes[i]; + is_bias_allocated = true; + break; + } + } + } + return (is_bias_allocated) ? kTfLiteOk : kTfLiteError; +#else + return kTfLiteOk; +#endif +} diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h new file mode 100644 index 00000000000..198cc5b83cf --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -0,0 +1,42 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ +#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ + +#include "tensorflow/lite/c/common.h" +#include "mli_api.h" + +/** + * @brief Function to allocate scratch buffers for the convolution tensors + * + * @detail This function will update the data pointers in the 4 tensors with pointers + * to scratch buffers in fast local memory. + * + * @param context [I] pointer to TfLite context (needed for error handling) + * @param in [IO] pointer to the input tensor + * @param weights [IO] pointer to the weights tensor + * @param bias [IO] pointer to the bias tensor + * @param output [IO] pointer to the output tensor + * + * @return Tf Lite status code + */ +TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out); + +#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 0f56e5f4641..16e89266614 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -22,6 +22,7 @@ else endif PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS += -tcf_core_config PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map CXXFLAGS += $(PLATFORM_FLAGS) @@ -80,6 +81,10 @@ endif third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc endif # USE_EMBARC_MLI diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index b331db2c80e..69e7910f6c2 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -74,8 +74,8 @@ PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip" -EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/428cfd6a89f848e403a8b8ca02eab2a897ae8cd3.zip" +EMBARC_MLI_MD5 := "9c6c8f8877fa6dd738d7ab62665b3a6e" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From 9d6f2440471312a44914db75e77dbe91ab532e7e Mon Sep 17 00:00:00 2001 From: jacco Date: Thu, 16 Jan 2020 15:39:33 +0100 Subject: [PATCH 013/557] add data move functionality to depthwise, fc, pooling --- .../person_detection_test.cc | 2 + .../lite/micro/kernels/arc/depthwise_conv.cc | 57 +++++++++++++------ .../lite/micro/kernels/arc/fully_connected.cc | 46 +++++++++++---- tensorflow/lite/micro/kernels/arc/pooling.cc | 44 ++++++++++---- .../lite/micro/kernels/arc/scratch_buffers.cc | 44 +++++++++++++- .../lite/micro/kernels/arc/scratch_buffers.h | 16 ++++++ 6 files changed, 168 insertions(+), 41 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc index b0979735d4f..cac5596cd83 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc @@ -28,7 +28,9 @@ limitations under the License. // Create an area of memory to use for input, output, and intermediate arrays. constexpr int tensor_arena_size = 125 * 1024; +#pragma Data(".System") uint8_t tensor_arena[tensor_arena_size]; +#pragma Data() TF_LITE_MICRO_TESTS_BEGIN diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index 6322414f5c6..4cf7b08bda8 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -26,6 +26,9 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" + +#include "mli_api.h" namespace tflite { namespace ops { @@ -131,7 +134,7 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorData(output)); } -void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, +TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, @@ -186,24 +189,43 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - mli_point_to_subtsr_cfg substr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg substr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; + // Get first input from batch + mli_point_to_subtsr_cfg subtsr_cfg_in = { {0, 0}, 2, static_cast(mli_in.shape[1]) }; + mli_point_to_subtsr_cfg subtsr_cfg_out = { {0, 0}, 2, static_cast(mli_out.shape[1]) }; + mli_tensor sub_mli_in = { 0 }; + mli_tensor sub_mli_out = { 0 }; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - const int batches = - MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + + mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); + mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); + const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); for (int i = 0; i < batches; i++) { - substr_cfg_in.start_coord[0] = i; - substr_cfg_out.start_coord[0] = i; - mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out); - - mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(&sub_mli_in, &mli_weights, - &mli_bias, &cfg, &sub_mli_out); + mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); + mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &cfg, &out_local); + mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + subtsr_cfg_in.start_coord[0]++; + subtsr_cfg_out.start_coord[0]++; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + if (in_is_local) { + in_local.data = sub_mli_in.data; + } + if (out_is_local) { + out_local.data = sub_mli_out.data; + } } } else { DepthwiseParams op_params; @@ -230,6 +252,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); } + return kTfLiteOk; } void EvalQuantized(TfLiteContext* context, TfLiteNode* node, @@ -311,7 +334,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { EvalFloat(context, node, params, &data, input, filter, bias, output); break; case kTfLiteInt8: - EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, + return EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, output); break; case kTfLiteUInt8: diff --git a/tensorflow/lite/micro/kernels/arc/fully_connected.cc b/tensorflow/lite/micro/kernels/arc/fully_connected.cc index 57203f10487..9c484718b25 100644 --- a/tensorflow/lite/micro/kernels/arc/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc/fully_connected.cc @@ -24,6 +24,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" + +#include "mli_api.h" + namespace tflite { namespace ops { @@ -95,24 +99,44 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, ConvertToMliTensor(bias, &mli_bias); ConvertToMliTensor(output, &mli_out); - mli_point_to_subtsr_cfg substr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg substr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_in = {{0, 0}, 2, static_cast(mli_in.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_out = {{0, 0}, 2, static_cast(mli_out.shape[1])}; mli_tensor sub_mli_in = {0}; mli_tensor sub_mli_out = {0}; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + + mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); + mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); for (int i = 0; i < batches; i++) { - substr_cfg_in.start_coord[0] = i; - substr_cfg_out.start_coord[0] = i; - mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out); - - mli_krn_fully_connected_sa8_sa8_sa32(&sub_mli_in, &mli_weights, &mli_bias, - &sub_mli_out); + mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); + mli_krn_fully_connected_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &out_local); + mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + subtsr_cfg_in.start_coord[0]++; + subtsr_cfg_out.start_coord[0]++; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + if (in_is_local) { + in_local.data = sub_mli_in.data; + } + if (out_is_local) { + out_local.data = sub_mli_out.data; + } } } else { FullyConnectedParams op_params; diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/arc/pooling.cc index 55452013028..ef72a6c0649 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc/pooling.cc @@ -21,6 +21,9 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" + +#include "mli_api.h" namespace tflite { namespace ops { @@ -97,7 +100,7 @@ void AverageEvalUint8(TfLiteContext* context, const TfLiteNode* node, GetTensorShape(output), GetTensorData(output)); } -void AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, +TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, const TfLitePoolParams* params, const OpData* data, const TfLiteTensor* input, TfLiteTensor* output) { // Run Average Pooling MLI kernel @@ -128,23 +131,39 @@ void AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - mli_point_to_subtsr_cfg substr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg substr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_in = {{0,0}, 2, static_cast(mli_in.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_out = {{0,0}, 2, static_cast(mli_out.shape[1])}; mli_tensor sub_mli_in = {0}; mli_tensor sub_mli_out = {0}; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_io_tensors(context, &in_local, &out_local)); + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); for (int i = 0; i < batches; i++) { - substr_cfg_in.start_coord[0] = i; - substr_cfg_out.start_coord[0] = i; - mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out); - - mli_krn_avepool_hwc_sa8(&sub_mli_in, &cfg, &sub_mli_out); + mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); + mli_krn_avepool_hwc_sa8(&in_local, &cfg, &out_local); + mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + subtsr_cfg_in.start_coord[0]++; + subtsr_cfg_out.start_coord[0]++; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + if (in_is_local) { + in_local.data = sub_mli_in.data; + } + if (out_is_local) { + out_local.data = sub_mli_out.data; + } } } else { int32_t activation_min, activation_max; @@ -163,6 +182,7 @@ void AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); } + return kTfLiteOk; } void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, @@ -227,7 +247,7 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { AverageEvalUint8(context, node, params, &data, input, output); break; case kTfLiteInt8: - AverageEvalInt8(context, node, params, &data, input, output); + return AverageEvalInt8(context, node, params, &data, input, output); break; default: TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported", diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 2ac60dd0f25..5bcc4752260 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -38,7 +38,9 @@ limitations under the License. #ifndef SCRATCH_MEM_Z_SIZE #ifdef core_config_dccm_size -#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) +// temporary disable the use of dccm scratch mem +//#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) +#define SCRATCH_MEM_Z_SIZE (0) #else #define SCRATCH_MEM_Z_SIZE (0) #endif @@ -144,3 +146,43 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, return kTfLiteOk; #endif } + +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out) { +#ifdef __Xxy + // Function to assign fast memory from one of 3 scratch buffers. + // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused + mli_tensor* tensors[2] = { in, out }; + uint32_t tensor_sizes[2] = { + mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0)}; + bool mem_is_free[3] = { true, true, true }; + int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; + uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; + int num_tensors = 2; + int num_memories = 3; + + + for (int i = 0; i < num_tensors; ++i) { + int best_mem_idx = -1; + int best_mem_delta = INT_MAX; + // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. + if (inside_arc_ccm(tensors[i]->data)) continue; + for (int j = 0; j < num_memories; ++j) { + // Best Fit + if (mem_is_free[j] && tensor_sizes[i] <= scratch_sizes[j] && scratch_sizes[j] - tensor_sizes[i] < best_mem_delta) { + best_mem_idx = j; + best_mem_delta = scratch_sizes[j] - tensor_sizes[i]; + } + } + if (best_mem_idx >= 0) { + tensors[i]->data = static_cast(scratch_mem[best_mem_idx]); + tensors[i]->capacity = scratch_sizes[best_mem_idx]; + mem_is_free[best_mem_idx] = false; + } else { + return kTfLiteError; + } + } +#endif + return kTfLiteOk; +} \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h index 198cc5b83cf..d92ecc02d3a 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -39,4 +39,20 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, mli_tensor* bias, mli_tensor* out); +/** + * @brief Function to allocate scratch buffers for kernels with only input and output buffers + * + * @detail This function will update the data pointers in the 2 tensors with pointers + * to scratch buffers in fast local memory. + * + * @param context [I] pointer to TfLite context (needed for error handling) + * @param in [IO] pointer to the input tensor + * @param output [IO] pointer to the output tensor + * + * @return Tf Lite status code + */ +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out); + #endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ From bf8b8ac71ca40917a9ba09933179343f03879edb Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 20 Jan 2020 18:41:26 +0300 Subject: [PATCH 014/557] person_detection example: wrap data with named bss section --- .../person_detection_experimental/main_functions.cc | 2 ++ .../person_detection_test.cc | 4 ++-- tensorflow/lite/micro/kernels/arc/scratch_buffers.cc | 12 ++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc index 719f16b2d36..552b52c9c51 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc @@ -42,7 +42,9 @@ TfLiteTensor* input = nullptr; // An area of memory to use for input, output, and intermediate arrays. constexpr int kTensorArenaSize = 125 * 1024; +#pragma Bss(".tensor_arena") static uint8_t tensor_arena[kTensorArenaSize]; +#pragma Bss() } // namespace // The name of this function is important for Arduino compatibility. diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc index cac5596cd83..9c7212648cc 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc @@ -28,9 +28,9 @@ limitations under the License. // Create an area of memory to use for input, output, and intermediate arrays. constexpr int tensor_arena_size = 125 * 1024; -#pragma Data(".System") +#pragma Bss(".tensor_arena") uint8_t tensor_arena[tensor_arena_size]; -#pragma Data() +#pragma Bss() TF_LITE_MICRO_TESTS_BEGIN diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 5bcc4752260..477f4f37b2b 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -47,17 +47,17 @@ limitations under the License. #endif namespace { -#pragma Data(".Xdata") +#pragma Bss(".Xdata") static int8_t scratch_mem_x[SCRATCH_MEM_X_SIZE]; -#pragma Data() +#pragma Bss() -#pragma Data(".Ydata") +#pragma Bss(".Ydata") static int8_t scratch_mem_y[SCRATCH_MEM_Y_SIZE]; -#pragma Data() +#pragma Bss() -#pragma Data(".Zdata") +#pragma Bss(".Zdata") static int8_t scratch_mem_z[SCRATCH_MEM_Z_SIZE]; -#pragma Data() +#pragma Bss() } static inline From d6917614dd5d5d3d58e699ab113b08ff07a1b2d6 Mon Sep 17 00:00:00 2001 From: jacco Date: Mon, 20 Jan 2020 16:56:53 +0100 Subject: [PATCH 015/557] add LCF file for ARC target --- .../micro/tools/make/targets/arc/memory.lcf | 49 +++++++++++++++++++ .../micro/tools/make/targets/arc_makefile.inc | 4 +- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/memory.lcf diff --git a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf new file mode 100644 index 00000000000..1d967bde0fa --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf @@ -0,0 +1,49 @@ + # SYSTEM memory regions indicate where external memory might be located. + # The TCF has no specific knowledge of whether SYSTEM regions contain + # external memory or not. + # CCMWRAP memory regions indicate unusable portions of the address space + # due to CCM memory wrapping into upper addresses beyond its size + + MEMORY { + ICCM0 : ORIGIN = 0x00000000, LENGTH = 0x00080000 + # CCMWRAP0: ORIGIN = 0x00080000, LENGTH = 0x00080000 + # SYSTEM0 : ORIGIN = 0x00100000, LENGTH = 0x00700000 + DCCM : ORIGIN = 0x00800000, LENGTH = 0x00080000 + # CCMWRAP1: ORIGIN = 0x00880000, LENGTH = 0x00080000 + # SYSTEM1 : ORIGIN = 0x00900000, LENGTH = 0x00300000 + XCCM : ORIGIN = 0x00c00000, LENGTH = 0x00010000 + # CCMWRAP2: ORIGIN = 0x00c10000, LENGTH = 0x000f0000 + # SYSTEM2 : ORIGIN = 0x00d00000, LENGTH = 0x00100000 + YCCM : ORIGIN = 0x00e00000, LENGTH = 0x00010000 + # CCMWRAP3: ORIGIN = 0x00e10000, LENGTH = 0x000f0000 + # SYSTEM3 : ORIGIN = 0x00f00000, LENGTH = 0x00100000 + } + SECTIONS { + GROUP BLOCK(4): { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:132): {} + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + .tensor_arena?: {} + } > ICCM0 + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + .protobuf?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > DCCM + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + GROUP BLOCK(4): { + .Ydata? : {} + } > YCCM + } + + + diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 16e89266614..09fabd5e2cf 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -23,7 +23,7 @@ endif PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -fslp-vectorize-aggressive -ffunction-sections -fdata-sections PLATFORM_FLAGS += -tcf_core_config - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -default_lcf=$(MAKEFILE_DIR)/targets/arc/memory.lcf CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) @@ -86,6 +86,8 @@ endif MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf + endif # USE_EMBARC_MLI endif From bab1f34a3cb829a900f30178cda321b418909ff1 Mon Sep 17 00:00:00 2001 From: jacco Date: Mon, 20 Jan 2020 17:05:42 +0100 Subject: [PATCH 016/557] Update URL to latest MLI lib with optimizations for person detect example --- tensorflow/lite/micro/tools/make/third_party_downloads.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 69e7910f6c2..8c8684ebec6 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -74,8 +74,8 @@ PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/428cfd6a89f848e403a8b8ca02eab2a897ae8cd3.zip" -EMBARC_MLI_MD5 := "9c6c8f8877fa6dd738d7ab62665b3a6e" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/d8702db473472764dcc8d2dff1f68c690d368be3.zip" +EMBARC_MLI_MD5 := "7a798dfe1424971b9ae50cd019e03616" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From 279e034660d296ca3dc3eed1ea604ce61e96a58b Mon Sep 17 00:00:00 2001 From: jacco Date: Wed, 22 Jan 2020 14:46:58 +0100 Subject: [PATCH 017/557] fix memory allocation issue for person detect example --- .../lite/micro/kernels/arc/scratch_buffers.cc | 15 ++++++-- .../micro/tools/make/targets/arc/memory.lcf | 35 ++++++++++--------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 477f4f37b2b..4c75a0a0fd4 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -38,9 +38,7 @@ limitations under the License. #ifndef SCRATCH_MEM_Z_SIZE #ifdef core_config_dccm_size -// temporary disable the use of dccm scratch mem -//#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) -#define SCRATCH_MEM_Z_SIZE (0) +#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) #else #define SCRATCH_MEM_Z_SIZE (0) #endif @@ -141,6 +139,17 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, } } } + if (!is_bias_allocated) { + uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); + for (int i = 0; i < 3; ++i) { + if (mem_is_free[i]) { + bias->data = static_cast(scratch_mem[i]); + bias->capacity = bias_mem_requirements; + is_bias_allocated = true; + break; + } + } + } return (is_bias_allocated) ? kTfLiteOk : kTfLiteError; #else return kTfLiteOk; diff --git a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf index 1d967bde0fa..00cf0a3050b 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf @@ -5,30 +5,30 @@ # due to CCM memory wrapping into upper addresses beyond its size MEMORY { - ICCM0 : ORIGIN = 0x00000000, LENGTH = 0x00080000 - # CCMWRAP0: ORIGIN = 0x00080000, LENGTH = 0x00080000 - # SYSTEM0 : ORIGIN = 0x00100000, LENGTH = 0x00700000 - DCCM : ORIGIN = 0x00800000, LENGTH = 0x00080000 - # CCMWRAP1: ORIGIN = 0x00880000, LENGTH = 0x00080000 - # SYSTEM1 : ORIGIN = 0x00900000, LENGTH = 0x00300000 - XCCM : ORIGIN = 0x00c00000, LENGTH = 0x00010000 - # CCMWRAP2: ORIGIN = 0x00c10000, LENGTH = 0x000f0000 - # SYSTEM2 : ORIGIN = 0x00d00000, LENGTH = 0x00100000 - YCCM : ORIGIN = 0x00e00000, LENGTH = 0x00010000 - # CCMWRAP3: ORIGIN = 0x00e10000, LENGTH = 0x000f0000 - # SYSTEM3 : ORIGIN = 0x00f00000, LENGTH = 0x00100000 + ICCM0 : ORIGIN = 0x00000000, LENGTH = 0x00010000 + # CCMWRAP0: ORIGIN = 0x00010000, LENGTH = 0x0fff0000 + ICCM1 : ORIGIN = 0x10000000, LENGTH = 0x00080000 + # CCMWRAP1: ORIGIN = 0x10080000, LENGTH = 0x0ff80000 + # SYSTEM0 : ORIGIN = 0x20000000, LENGTH = 0x60000000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00080000 + # CCMWRAP2: ORIGIN = 0x80080000, LENGTH = 0x0ff80000 + XCCM : ORIGIN = 0x90000000, LENGTH = 0x00008000 + # CCMWRAP3: ORIGIN = 0x90008000, LENGTH = 0x0fff8000 + YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00008000 + # CCMWRAP4: ORIGIN = 0xa0008000, LENGTH = 0x0fff8000 + # SYSTEM1 : ORIGIN = 0xb0000000, LENGTH = 0x50000000 } SECTIONS { GROUP BLOCK(4): { .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:132): {} - .text? : { *('.text$crt*') } + .text? : { *('.text$crt*') } * (TEXT): {} * (LIT): {} - .tensor_arena?: {} - } > ICCM0 + .rodata_in_data?:{} + } > ICCM1 GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ + /* _SDA_BASE_ computed implicitly */ .sdata?: {} .sbss?: {} .protobuf?: {} @@ -36,7 +36,8 @@ * (BSS): {} .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > DCCM + .tensor_arena?: {} + } > DCCM GROUP BLOCK(4): { .Xdata? : {} } > XCCM From b045244f289aacf22c51c9202b68e9ea311e9554 Mon Sep 17 00:00:00 2001 From: jacco Date: Mon, 10 Feb 2020 10:37:30 +0100 Subject: [PATCH 018/557] update MLI lib to performance optimized MLI1.1 pre-release --- tensorflow/lite/micro/tools/make/third_party_downloads.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 8c8684ebec6..6141efedbee 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -74,8 +74,8 @@ PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/d8702db473472764dcc8d2dff1f68c690d368be3.zip" -EMBARC_MLI_MD5 := "7a798dfe1424971b9ae50cd019e03616" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/97c09b81bd1c4d0455de298626c271d75faedba2.zip" +EMBARC_MLI_MD5 := "f7c5555a15e7837806cfaeb22d3c7b50" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From f110cdd8303a2365fafa7c9ffab984d27f7538e5 Mon Sep 17 00:00:00 2001 From: jacco Date: Fri, 6 Mar 2020 15:00:54 +0100 Subject: [PATCH 019/557] Add slicing logic for convolution layers in case the tensors don't fit completely in local memory, slicing is used to split the tensors. --- tensorflow/lite/micro/kernels/arc/conv.cc | 61 +++--- .../lite/micro/kernels/arc/depthwise_conv.cc | 63 +++--- .../lite/micro/kernels/arc/fully_connected.cc | 1 + .../lite/micro/kernels/arc/mli_slicers.cc | 93 +++++++++ .../lite/micro/kernels/arc/mli_slicers.h | 56 +++++ tensorflow/lite/micro/kernels/arc/pooling.cc | 3 + .../lite/micro/kernels/arc/scratch_buf_mgr.cc | 192 ++++++++++++++++++ .../lite/micro/kernels/arc/scratch_buf_mgr.h | 75 +++++++ .../lite/micro/kernels/arc/scratch_buffers.cc | 179 +++++----------- .../lite/micro/kernels/arc/scratch_buffers.h | 75 +++---- .../micro/tools/make/targets/arc_makefile.inc | 4 + 11 files changed, 588 insertions(+), 214 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/arc/mli_slicers.cc create mode 100644 tensorflow/lite/micro/kernels/arc/mli_slicers.h create mode 100644 tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc create mode 100644 tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 46be76a407b..8141154147b 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -26,6 +26,8 @@ limitations under the License. #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" #include "mli_api.h" @@ -198,44 +200,51 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - // Get first input from batch - mli_point_to_subtsr_cfg subtsr_cfg_in = { {0, 0}, 2, static_cast(mli_in.shape[1]) }; - mli_point_to_subtsr_cfg subtsr_cfg_out = { {0, 0}, 2, static_cast(mli_out.shape[1]) }; - mli_tensor sub_mli_in = { 0 }; - mli_tensor sub_mli_out = { 0 }; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + const int heightDimension = 1; + int inSliceHeight = 0; + int outSliceHeight = 0; + const int kernelHeight = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); + const int overlap = kernelHeight - cfg.stride_height; // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, &inSliceHeight, &outSliceHeight)); + + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. + The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); + + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); - const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); - for (int i = 0; i < batches; i++) { - mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); - mli_krn_conv2d_hwc_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &cfg, &out_local); - mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); - subtsr_cfg_in.start_coord[0]++; - subtsr_cfg_out.start_coord[0]++; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - if (in_is_local) { - in_local.data = sub_mli_in.data; - } - if (out_is_local) { - out_local.data = sub_mli_out.data; - } + while (!out_slice.Done()) { + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + mli_krn_conv2d_hwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); } + free_arc_scratch_buffers(); } else { ConvParams op_params; op_params.input_offset = -input->params.zero_point; diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index 4cf7b08bda8..5921c4e4dff 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -27,6 +27,8 @@ limitations under the License. #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" #include "mli_api.h" @@ -189,44 +191,53 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - // Get first input from batch - mli_point_to_subtsr_cfg subtsr_cfg_in = { {0, 0}, 2, static_cast(mli_in.shape[1]) }; - mli_point_to_subtsr_cfg subtsr_cfg_out = { {0, 0}, 2, static_cast(mli_out.shape[1]) }; - mli_tensor sub_mli_in = { 0 }; - mli_tensor sub_mli_out = { 0 }; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + const int heightDimension = 1; + int inSliceHeight = 0; + int outSliceHeight = 0; + const int kernelHeight = static_cast(mli_weights.shape[KRNL_DW_H_DIM_HWC]); + const int overlap = kernelHeight - cfg.stride_height; // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; // this assumes that output shape is already filled in the tensor struct. mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + /* if the tensor is already in local memory, is_local is true */ + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, &inSliceHeight, &outSliceHeight)); + + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. + The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); + + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); - const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); - for (int i = 0; i < batches; i++) { - mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); - mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &cfg, &out_local); - mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); - subtsr_cfg_in.start_coord[0]++; - subtsr_cfg_out.start_coord[0]++; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - if (in_is_local) { - in_local.data = sub_mli_in.data; - } - if (out_is_local) { - out_local.data = sub_mli_out.data; - } + while (!out_slice.Done()) { + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); } + free_arc_scratch_buffers(); } else { DepthwiseParams op_params; op_params.padding_type = PaddingType::kSame; diff --git a/tensorflow/lite/micro/kernels/arc/fully_connected.cc b/tensorflow/lite/micro/kernels/arc/fully_connected.cc index 9c484718b25..42921037481 100644 --- a/tensorflow/lite/micro/kernels/arc/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc/fully_connected.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc new file mode 100644 index 00000000000..0ae80d1afc3 --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc @@ -0,0 +1,93 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mli_slicers.h" + +#define MAX(A,B) (((A) > (B))? (A): (B)) +#define MIN(A,B) (((A) > (B))? (B): (A)) + +namespace tflite { +namespace ops { +namespace micro { + +TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre, int padding_post, int overlap) + : full_tensor_(full_tensor) + , sliceDim_(slice_dim) + , pad_pre_(padding_pre) + , pad_post_(padding_post) + , overlap_(overlap) + , subtsr_cfg_{ {0, 0}, static_cast(slice_dim + 1), static_cast(slice_size) } + , sub_tensor_{0} + , done_(false){ + + ComputeSubTensor(); +} + +void TensorSlicer::ComputeSubTensor(void) { + // subtsr_cfg_ is used to keep track of the itteration. + // A copy is created to update it with the correct clipping and padding for the current slice + mli_point_to_subtsr_cfg cfg_new = subtsr_cfg_; + // add clipping of first_out_dim_size to not exceed total size in that dimensions + // add padding logic + + // begin and end spans the complete input region including padding areas. + const int begin = (int)subtsr_cfg_.start_coord[1] - pad_pre_; + // end is clipped to the end of the full input region. this is needed for cases where the last slice is smaller than the rest. + const int end = MIN(begin + subtsr_cfg_.first_out_dim_size + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); + // The start coordinate of the subtensor is clipped to zero + cfg_new.start_coord[sliceDim_] = MAX(begin, 0); + // and the stop coordinate is clipped to the size of the full tensor + const int stop_coord = MIN(end, full_tensor_->shape[sliceDim_]); + // compute the size of the subtensor + cfg_new.first_out_dim_size = stop_coord - cfg_new.start_coord[sliceDim_]; + + // compute the padding configuration for the current slice. + actual_padding_pre = cfg_new.start_coord[sliceDim_] - begin; + actual_padding_post = end - stop_coord; + + mli_hlp_point_to_subtensor(full_tensor_, &cfg_new, &sub_tensor_); +} +void TensorSlicer::Next(void){ + // TODO make generic for any number of dimensions. + subtsr_cfg_.start_coord[1]+= subtsr_cfg_.first_out_dim_size; + if (subtsr_cfg_.start_coord[1] >= full_tensor_->shape[1]) { + subtsr_cfg_.start_coord[1] = 0; + subtsr_cfg_.start_coord[0]++; + if (subtsr_cfg_.start_coord[0] >= full_tensor_->shape[0]) { + done_ = true; + } + } + if (!done_) ComputeSubTensor(); +} + +bool TensorSlicer::Done(void) { + return done_; +} + +int TensorSlicer::GetPaddingPre(void) { + return actual_padding_pre; +} + +int TensorSlicer::GetPaddingPost(void) { + return actual_padding_post; +} + +mli_tensor* TensorSlicer::Sub(void) { + return &sub_tensor_; +} + +} // namespace micro +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.h b/tensorflow/lite/micro/kernels/arc/mli_slicers.h new file mode 100644 index 00000000000..40f948a07ef --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.h @@ -0,0 +1,56 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_ + +#include "mli_api.h" +namespace tflite { +namespace ops { +namespace micro { + +class TensorSlicer { +public: + + TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre = 0, int padding_post = 0, int overlap = 0); + ~TensorSlicer() = default; + + void Next(); + bool Done(); + int GetPaddingPre(); + int GetPaddingPost(); + + mli_tensor *Sub(); + + // Default constructor is deleted + TensorSlicer() = delete; + + +private: + const mli_tensor* full_tensor_; + mli_tensor sub_tensor_; + mli_point_to_subtsr_cfg subtsr_cfg_; + bool done_; + int sliceDim_; + int pad_pre_, pad_post_, overlap_; + int actual_padding_pre, actual_padding_post; + + void ComputeSubTensor(); +}; + +} // namespace micro +} // namespace ops +} // namespace tflite +#endif //TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_ diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/arc/pooling.cc index ef72a6c0649..dab0ad7e314 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc/pooling.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" #include "mli_api.h" @@ -154,6 +155,7 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); mli_krn_avepool_hwc_sa8(&in_local, &cfg, &out_local); mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + if (i == batches -1) break; subtsr_cfg_in.start_coord[0]++; subtsr_cfg_out.start_coord[0]++; mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); @@ -165,6 +167,7 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, out_local.data = sub_mli_out.data; } } + free_arc_scratch_buffers(); } else { int32_t activation_min, activation_max; (void)CalculateActivationRangeQuantized(context, params->activation, output, diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc new file mode 100644 index 00000000000..26f4f45f17f --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc @@ -0,0 +1,192 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include +#define MAX(A,B) (((A) > (B))? (A): (B)) +#define MIN(A,B) (((A) > (B))? (B): (A)) + +namespace tflite { +namespace ops { +namespace micro { + + + +void get_arc_two_buffer_sizes(int requestsize1, int requestsize2, int *grantsize1, int *grantsize2) { + int maxrequest = 0; + int secondrequest = 0; + int maxavailable = 0; + int secondavail = 0; + + // determine the largest requested buffer. + if (requestsize1 > requestsize2) { + maxrequest = requestsize1; + secondrequest = requestsize2; + } else { + maxrequest = requestsize2; + secondrequest = requestsize1; + } + + // find the two largest available buffers. + get_arc_scratch_buffer_two_max_sizes(&maxavailable, &secondavail); + + // in case two buffers are available, the largest buffer can go to the largest request. + if (secondavail > 0) { // this condition can be enhanced to prevent cases where the second buffer is so small that it is better to use one buffer and split it. + if (requestsize1 > requestsize2) { + *grantsize1 = maxavailable; + *grantsize2 = secondavail; + } else { + *grantsize1 = secondavail; + *grantsize2 = maxavailable; + } + } else { + // In case only one buffer is available, + // use only the max buffer, and split it. + // TODO compute optimal split ratio based on request ratio. + *grantsize1 = maxavailable / 2; + *grantsize2 = maxavailable / 2; + } +} + +TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out) { +#ifdef __Xxy + + if (!inside_arc_ccm(weights->data)) { + int weights_size = mli_hlp_count_elem_num(weights, 0) * mli_hlp_tensor_element_size(weights); + weights->data = get_arc_scratch_buffer(weights_size); + weights->capacity = weights_size; + if (weights->data == NULL) return kTfLiteError; + } + + if (!inside_arc_ccm(bias->data)) { + uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); + bias->data = get_arc_scratch_buffer(bias_mem_requirements); + bias->capacity = bias_mem_requirements; + if (bias->data == NULL) return kTfLiteError; + } + + int requestSizeIn = 0; + int requestSizeOut = 0; + int grantsizeIn = 0; + int grantsizeOut = 0; + if (!inside_arc_ccm(in->data)) { + // In case the input tensor contains multiple batches, it has rank 4 + // because the mli kernel cannot operate on batches, we need to have the size + // of a single batch. that is why the startRank is 1 in case of input rank 4 + int startRank = in->rank - 3; // tOdo explain + requestSizeIn = mli_hlp_count_elem_num(in, startRank) * mli_hlp_tensor_element_size(in); + } + if (!inside_arc_ccm(out->data)) { + // In case the input tensor contains multiple batches, it has rank 4 + // because the mli kernel cannot operate on batches, we need to have the size + // of a single batch. that is why the startRank is 1 in case of input rank 4 + int startRank = out->rank - 3; + requestSizeOut = mli_hlp_count_elem_num(out, startRank) * mli_hlp_tensor_element_size(out); + } + + get_arc_two_buffer_sizes(requestSizeIn, requestSizeOut, &grantsizeIn, &grantsizeOut); + + if (!inside_arc_ccm(in->data)) { + in->data = get_arc_scratch_buffer(grantsizeIn); + in->capacity = grantsizeIn; + if (in->data == NULL) return kTfLiteError; + } + if (!inside_arc_ccm(out->data)) { + out->data = get_arc_scratch_buffer(grantsizeOut); + out->capacity = grantsizeOut; + if (out->data == NULL) return kTfLiteError; + } + + return kTfLiteOk; +#else + return kTfLiteOk; +#endif +} + +TfLiteStatus arc_scratch_buffer_calc_slice_size_io( + const mli_tensor *in, + const mli_tensor *out, + const int kernelHeight, + const int strideHeight, + int *inSliceHeight, + int *outSliceHeight) { + const int heightDimension = 1; // todo: compute from rank + const int inHeight = in->shape[heightDimension]; + const int outHeight = out->shape[heightDimension]; + const int lineSizeIn = mli_hlp_count_elem_num(in, heightDimension + 1) * mli_hlp_tensor_element_size(in); + const int lineSizeOut = mli_hlp_count_elem_num(out, heightDimension + 1) * mli_hlp_tensor_element_size(out); + int maxLinesIn = 0; + int maxLinesOut = 0; + int maxOutLinesForInput = 0; + bool fit = (in->capacity >= inHeight * lineSizeIn) && (out->capacity >= outHeight * lineSizeOut); + if (fit) { + // in case both tensors completely fit in the capacity, there is no need for slicing + *inSliceHeight = inHeight; + *outSliceHeight = outHeight; + } else { + // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. + maxLinesIn = MIN(inHeight, in->capacity / lineSizeIn); + if (maxLinesIn >= inHeight) { + maxOutLinesForInput = outHeight; + } else { + maxOutLinesForInput = (maxLinesIn - kernelHeight + 1) / strideHeight; // TODO add padding exceptions and test by makin fit=false; + } + // Ten compute how many ouput lines fit into the output tensor. + maxLinesOut = MIN(outHeight, out->capacity / lineSizeOut); + // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. + *outSliceHeight = MIN(maxOutLinesForInput, maxLinesOut); + *inSliceHeight = *outSliceHeight * strideHeight; + } + if ((*inSliceHeight > 0) && (*outSliceHeight > 0)) { + return kTfLiteOk; + } else { + return kTfLiteError; + } +} + +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out) { +#ifdef __Xxy + // Function to assign fast memory from one of 3 scratch buffers. + // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused + mli_tensor* tensors[2] = { in, out }; + uint32_t tensor_sizes[2] = { + mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0)}; + int num_tensors = 2; + + + for (int i = 0; i < num_tensors; ++i) { + // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. + if (inside_arc_ccm(tensors[i]->data)) continue; + tensors[i]->data = get_arc_scratch_buffer(tensor_sizes[i]); + tensors[i]->capacity = tensor_sizes[i]; + + if (tensors[i]->data == NULL) { + return kTfLiteError; + } + } +#endif + return kTfLiteOk; +} + +} // namespace micro +} // namespace ops +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h new file mode 100644 index 00000000000..a27df8a5358 --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h @@ -0,0 +1,75 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_ +#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_ + +#include "tensorflow/lite/c/common.h" +#include "mli_api.h" + +namespace tflite { +namespace ops { +namespace micro { + +/** + * @brief Function to allocate scratch buffers for the convolution tensors + * + * @detail This function will update the data pointers in the 4 tensors with pointers + * to scratch buffers in fast local memory. + * + * @param context [I] pointer to TfLite context (needed for error handling) + * @param in [IO] pointer to the input tensor + * @param weights [IO] pointer to the weights tensor + * @param bias [IO] pointer to the bias tensor + * @param output [IO] pointer to the output tensor + * + * @return Tf Lite status code + */ +TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out); + +/** + * @brief Function to allocate scratch buffers for kernels with only input and output buffers + * + * @detail This function will update the data pointers in the 2 tensors with pointers + * to scratch buffers in fast local memory. + * + * @param context [I] pointer to TfLite context (needed for error handling) + * @param in [IO] pointer to the input tensor + * @param output [IO] pointer to the output tensor + * + * @return Tf Lite status code + */ +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out); + +TfLiteStatus arc_scratch_buffer_calc_slice_size_io( + const mli_tensor *in, + const mli_tensor *out, + const int kernelHeight, + const int strideHeight, + int *inSliceHeight, + int *outSliceHeight); + + +} // namespace micro +} // namespace ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_ diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 4c75a0a0fd4..5ef1b445a22 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -15,6 +15,12 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" #include +#define MAX(A,B) (((A) > (B))? (A): (B)) +#define MIN(A,B) (((A) > (B))? (B): (A)) + +namespace tflite { +namespace ops { +namespace micro { /* by default use all the XY memory, and half of the DCCM because DCCM is also used * for the data section and the stack. @@ -58,140 +64,57 @@ namespace { #pragma Bss() } -static inline -bool inside_arc_dccm(void* p) { -#if core_config_dccm_present - return ((unsigned)p >= core_config_dccm_base) && ((unsigned)p < core_config_dccm_base + core_config_dccm_size); -#else - return false; -#endif -} -static inline -bool inside_arc_xccm(void* p) { -#if core_config_xy - return ((unsigned)p >= core_config_xy_x_base) && ((unsigned)p < core_config_xy_x_base + core_config_xy_size); -#else - return false; -#endif -} -static inline -bool inside_arc_yccm(void* p) { -#if core_config_xy - return ((unsigned)p >= core_config_xy_y_base) && ((unsigned)p < core_config_xy_y_base + core_config_xy_size); -#else - return false; -#endif -} +static int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; +static uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; -static inline -bool inside_arc_ccm(void* p) { - return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p); -} -TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* weights, - mli_tensor* bias, - mli_tensor* out) { -#ifdef __Xxy - // Function to assign fast memory from one of 3 scratch buffers. - // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused - mli_tensor* tensors[3] = { weights, in, out }; - uint32_t tensor_sizes[3] = { - mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0), mli_hlp_count_elem_num(tensors[2], 0) }; - bool mem_is_free[3] = { true, true, true }; - int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; - uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; - - for (int i = 0; i < 3; ++i) { - int best_mem_idx = -1; - int best_mem_delta = INT_MAX; - // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. - if (inside_arc_ccm(tensors[i]->data)) continue; - for (int j = 0; j < 3; ++j) { - // Best Fit - if (mem_is_free[j] && tensor_sizes[i] <= scratch_sizes[j] && scratch_sizes[j] - tensor_sizes[i] < best_mem_delta) { - best_mem_idx = j; - best_mem_delta = scratch_sizes[j] - tensor_sizes[i]; - } - } - if (best_mem_idx >= 0) { - tensors[i]->data = static_cast(scratch_mem[best_mem_idx]); - tensors[i]->capacity = scratch_sizes[best_mem_idx]; - mem_is_free[best_mem_idx] = false; - } else { - return kTfLiteError; +void *get_arc_scratch_buffer(int size) { + // Function to asign fast memory from one of 3 scratch buffers. + // Best Fit strategy - memory is allocated from that memory bank that leaves the least unused memory. + void *buf = NULL; + int best_mem_idx = -1; + int best_mem_delta = INT_MAX; + // find a local memory that fits the data size. + for (int mem_idx = 0; mem_idx < sizeof(scratch_mem)/sizeof(scratch_mem[0]); ++mem_idx) { + // Best Fit + if ((size <= scratch_sizes[mem_idx]) && (scratch_sizes[mem_idx] - size < best_mem_delta)) { + best_mem_idx = mem_idx; + best_mem_delta = scratch_sizes[mem_idx] - size; } } - - // Bias is expected to be much smaller than other operands, not affect performance and can be placed - // in the end of some of already used memory bank (to occupy free space of it) - bool is_bias_allocated = inside_arc_ccm(bias->data); - if (!is_bias_allocated) { - uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); - for (int i = 0; i < 3; ++i) { - if (tensors[i]->capacity - tensor_sizes[i] > bias_mem_requirements) { - bias->data = &((char*)tensors[i]->data)[tensor_sizes[i]]; - bias->capacity = bias_mem_requirements; - tensors[i]->capacity = tensor_sizes[i]; - is_bias_allocated = true; - break; - } - } + if (best_mem_idx >= 0) { + buf = static_cast(scratch_mem[best_mem_idx]); + scratch_mem[best_mem_idx] += size; + scratch_sizes[best_mem_idx] -= size; } - if (!is_bias_allocated) { - uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); - for (int i = 0; i < 3; ++i) { - if (mem_is_free[i]) { - bias->data = static_cast(scratch_mem[i]); - bias->capacity = bias_mem_requirements; - is_bias_allocated = true; - break; - } - } - } - return (is_bias_allocated) ? kTfLiteOk : kTfLiteError; -#else - return kTfLiteOk; -#endif + return buf; } -TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* out) { -#ifdef __Xxy - // Function to assign fast memory from one of 3 scratch buffers. - // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused - mli_tensor* tensors[2] = { in, out }; - uint32_t tensor_sizes[2] = { - mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0)}; - bool mem_is_free[3] = { true, true, true }; - int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; - uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; - int num_tensors = 2; - int num_memories = 3; - - - for (int i = 0; i < num_tensors; ++i) { - int best_mem_idx = -1; - int best_mem_delta = INT_MAX; - // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. - if (inside_arc_ccm(tensors[i]->data)) continue; - for (int j = 0; j < num_memories; ++j) { - // Best Fit - if (mem_is_free[j] && tensor_sizes[i] <= scratch_sizes[j] && scratch_sizes[j] - tensor_sizes[i] < best_mem_delta) { - best_mem_idx = j; - best_mem_delta = scratch_sizes[j] - tensor_sizes[i]; - } - } - if (best_mem_idx >= 0) { - tensors[i]->data = static_cast(scratch_mem[best_mem_idx]); - tensors[i]->capacity = scratch_sizes[best_mem_idx]; - mem_is_free[best_mem_idx] = false; - } else { - return kTfLiteError; +void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) { + int maxavailable = 0; + int secondavail = 0; + // find the two largest available buffers. + for (int i = 0; i < 3; i++) { + if (scratch_sizes[i] > maxavailable) { + secondavail = maxavailable; + maxavailable = scratch_sizes[i]; + } else if (scratch_sizes[i] > secondavail) { + secondavail = scratch_sizes[i]; } } -#endif - return kTfLiteOk; -} \ No newline at end of file + *size1 = maxavailable; + *size2 = secondavail; +} + +void free_arc_scratch_buffers(void) { + scratch_mem[0] = scratch_mem_x; + scratch_mem[1] = scratch_mem_y; + scratch_mem[2] = scratch_mem_z; + scratch_sizes[0] = SCRATCH_MEM_X_SIZE; + scratch_sizes[1] = SCRATCH_MEM_Y_SIZE; + scratch_sizes[2] = SCRATCH_MEM_Z_SIZE; +} + +} // namespace micro +} // namespace ops +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h index d92ecc02d3a..52a12c7899d 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -19,40 +19,47 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "mli_api.h" -/** - * @brief Function to allocate scratch buffers for the convolution tensors - * - * @detail This function will update the data pointers in the 4 tensors with pointers - * to scratch buffers in fast local memory. - * - * @param context [I] pointer to TfLite context (needed for error handling) - * @param in [IO] pointer to the input tensor - * @param weights [IO] pointer to the weights tensor - * @param bias [IO] pointer to the bias tensor - * @param output [IO] pointer to the output tensor - * - * @return Tf Lite status code - */ -TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* weights, - mli_tensor* bias, - mli_tensor* out); +namespace tflite { +namespace ops { +namespace micro { -/** - * @brief Function to allocate scratch buffers for kernels with only input and output buffers - * - * @detail This function will update the data pointers in the 2 tensors with pointers - * to scratch buffers in fast local memory. - * - * @param context [I] pointer to TfLite context (needed for error handling) - * @param in [IO] pointer to the input tensor - * @param output [IO] pointer to the output tensor - * - * @return Tf Lite status code - */ -TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* out); + +void free_arc_scratch_buffers(void); +void *get_arc_scratch_buffer(int size);// Function to assign fast memory from one of 3 scratch buffers. + +void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2); + +static inline bool inside_arc_dccm(void* p) { +#if core_config_dccm_present + return ((unsigned)p >= core_config_dccm_base) && ((unsigned)p < core_config_dccm_base + core_config_dccm_size); +#else + return false; +#endif +} + +static inline bool inside_arc_xccm(void* p) { +#if core_config_xy + return ((unsigned)p >= core_config_xy_x_base) && ((unsigned)p < core_config_xy_x_base + core_config_xy_size); +#else + return false; +#endif +} + +static inline bool inside_arc_yccm(void* p) { +#if core_config_xy + return ((unsigned)p >= core_config_xy_y_base) && ((unsigned)p < core_config_xy_y_base + core_config_xy_size); +#else + return false; +#endif +} + +static inline +bool inside_arc_ccm(void* p) { + return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p); +} + +} // namespace micro +} // namespace ops +} // namespace tflite #endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 09fabd5e2cf..a1f5546b8f5 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -85,6 +85,10 @@ endif MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf From c2e501e017b31b94c30bc5903bc613a8b0d7e109 Mon Sep 17 00:00:00 2001 From: jacco Date: Wed, 4 Mar 2020 09:58:48 +0100 Subject: [PATCH 020/557] Fix for upstream merge conflict the location of the header file was changed in the upstream archive. but the makefile was not updated. --- tensorflow/lite/micro/tools/make/targets/arc_makefile.inc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index a1f5546b8f5..5ce2e03bfc3 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -89,6 +89,7 @@ endif MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf From 210253668472888264a9c8f6eef9f58e3d7f3e34 Mon Sep 17 00:00:00 2001 From: jacco Date: Thu, 26 Mar 2020 17:26:19 +0100 Subject: [PATCH 021/557] update to new version of MLI needed for slicing --- tensorflow/lite/micro/kernels/arc/conv.cc | 2 +- tensorflow/lite/micro/kernels/arc/depthwise_conv.cc | 2 +- tensorflow/lite/micro/tools/make/third_party_downloads.inc | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 8141154147b..06be9384125 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -238,7 +238,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = in_slice.GetPaddingPost(); mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_conv2d_hwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); + mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); in_slice.Next(); diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index 5921c4e4dff..fe47c7f25e0 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -231,7 +231,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = in_slice.GetPaddingPost(); mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); + mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); in_slice.Next(); diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 6141efedbee..ce24ba29542 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -74,8 +74,8 @@ PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/97c09b81bd1c4d0455de298626c271d75faedba2.zip" -EMBARC_MLI_MD5 := "f7c5555a15e7837806cfaeb22d3c7b50" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/4b6c6eed65395dced1564006be8188781af16035.zip" +EMBARC_MLI_MD5 := "47167553c17ff8c7cd59fb1afb90c304" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From b4bcc4e5743fbe031406745f2474bb27bc49ba2e Mon Sep 17 00:00:00 2001 From: jacco Date: Fri, 20 Mar 2020 16:32:14 +0100 Subject: [PATCH 022/557] add slicing logic for weight slicing in conv kernel for ARC backend --- tensorflow/lite/micro/kernels/arc/conv.cc | 78 ++++++-- .../lite/micro/kernels/arc/depthwise_conv.cc | 2 +- .../lite/micro/kernels/arc/mli_slicers.cc | 74 +++++-- .../lite/micro/kernels/arc/mli_slicers.h | 4 +- tensorflow/lite/micro/kernels/arc/pooling.cc | 48 +++-- .../lite/micro/kernels/arc/scratch_buf_mgr.cc | 184 +++++++++++------- .../lite/micro/kernels/arc/scratch_buf_mgr.h | 6 + .../lite/micro/kernels/arc/scratch_buffers.cc | 18 +- .../lite/micro/kernels/arc/scratch_buffers.h | 1 + 9 files changed, 278 insertions(+), 137 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 06be9384125..9e9a37821e8 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -200,12 +200,18 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } + // for height slicing const int heightDimension = 1; int inSliceHeight = 0; int outSliceHeight = 0; const int kernelHeight = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); const int overlap = kernelHeight - cfg.stride_height; + // for weight slicing (on output channels) + const int weightOutChDimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. + int sliceChannels = static_cast(mli_weights.shape[weightOutChDimension]); + const int outTensorChDimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; @@ -214,36 +220,68 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, &sliceChannels)); + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ const bool in_is_local = in_local.data == mli_in.data; const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; - /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. - on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); - TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); + TensorSlicer w_slice(&mli_weights, weightOutChDimension, sliceChannels); + TensorSlicer b_slice(&mli_bias, weightOutChDimension, sliceChannels); + TensorSlicer out_ch_slice(&mli_out, outTensorChDimension, sliceChannels, 0, 0, 0, true); - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); - mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); + void *inputBufferPtr = NULL; - while (!out_slice.Done()) { - cfg.padding_top = in_slice.GetPaddingPre(); - cfg.padding_bottom = in_slice.GetPaddingPost(); + while (!w_slice.Done()){ + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. + The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); - in_slice.Next(); - out_slice.Next(); + /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of + output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and + height dimension. */ + TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); + + /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + TF_LITE_ENSURE(context, !in_slice.Done()); + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + // if same input copy as previous iteration, skip the copy of input + if (in_slice.Sub()->data != inputBufferPtr) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + inputBufferPtr = in_slice.Sub()->data; + } + mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); + TF_LITE_ENSURE(context, in_slice.Done()); } + free_arc_scratch_buffers(); } else { ConvParams op_params; diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index fe47c7f25e0..00c46c442b7 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -210,7 +210,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const bool in_is_local = in_local.data == mli_in.data; const bool out_is_local = out_local.data == mli_out.data; - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc index 0ae80d1afc3..6c6c89715f8 100644 --- a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc @@ -22,53 +22,89 @@ namespace tflite { namespace ops { namespace micro { -TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre, int padding_post, int overlap) +TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre, int padding_post, int overlap, bool interleave_mode) : full_tensor_(full_tensor) , sliceDim_(slice_dim) , pad_pre_(padding_pre) , pad_post_(padding_post) , overlap_(overlap) - , subtsr_cfg_{ {0, 0}, static_cast(slice_dim + 1), static_cast(slice_size) } + , sub_cfg_{0} , sub_tensor_{0} , done_(false){ + /* In the interleave mode, the slicing happens from the deepest dimension up to the slice_dim + for example in an HWC layout this can mode can be used to slice in the C dimenstion. + in this mode the data is not contiguous in memory anymore */ + if (interleave_mode) { + for (int i = 0; i< full_tensor->rank; i++){ + if (i > slice_dim) { + sub_cfg_.size[i] = 1; + } else if (i == slice_dim) { + sub_cfg_.size[i] = slice_size; + } else { + sub_cfg_.size[i] = full_tensor->shape[i]; + } + } + sub_cfg_.sub_tensor_rank = full_tensor->rank; + + } else { + /* In the not interlevaed mode, the slicing happens from the outer most dimension up to the slice_dim + for example in an HWC layout this mode can be used to slice in the H dimension. + in this mode the data of the slice is still contiguous in memory (if that was the case in the input tensor */ + for (int i = 0; i< full_tensor->rank; i++){ + if (i < slice_dim) { + sub_cfg_.size[i] = 1; + } else if (i == slice_dim) { + sub_cfg_.size[i] = slice_size; + }else { + sub_cfg_.size[i] = full_tensor->shape[i]; + } + } + sub_cfg_.sub_tensor_rank = full_tensor->rank - slice_dim; + } + ComputeSubTensor(); } void TensorSlicer::ComputeSubTensor(void) { - // subtsr_cfg_ is used to keep track of the itteration. + + // subtsr_cfg_ is used to keep track of the iteration. // A copy is created to update it with the correct clipping and padding for the current slice - mli_point_to_subtsr_cfg cfg_new = subtsr_cfg_; - // add clipping of first_out_dim_size to not exceed total size in that dimensions - // add padding logic + mli_sub_tensor_cfg cfg_new = sub_cfg_; // begin and end spans the complete input region including padding areas. - const int begin = (int)subtsr_cfg_.start_coord[1] - pad_pre_; + const int begin = (int)sub_cfg_.offset[sliceDim_] - pad_pre_; // end is clipped to the end of the full input region. this is needed for cases where the last slice is smaller than the rest. - const int end = MIN(begin + subtsr_cfg_.first_out_dim_size + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); + const int end = MIN(begin + sub_cfg_.size[sliceDim_] + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); // The start coordinate of the subtensor is clipped to zero - cfg_new.start_coord[sliceDim_] = MAX(begin, 0); + cfg_new.offset[sliceDim_] = MAX(begin, 0); // and the stop coordinate is clipped to the size of the full tensor const int stop_coord = MIN(end, full_tensor_->shape[sliceDim_]); // compute the size of the subtensor - cfg_new.first_out_dim_size = stop_coord - cfg_new.start_coord[sliceDim_]; + cfg_new.size[sliceDim_] = stop_coord - cfg_new.offset[sliceDim_]; // compute the padding configuration for the current slice. - actual_padding_pre = cfg_new.start_coord[sliceDim_] - begin; + actual_padding_pre = cfg_new.offset[sliceDim_] - begin; actual_padding_post = end - stop_coord; - mli_hlp_point_to_subtensor(full_tensor_, &cfg_new, &sub_tensor_); + mli_hlp_create_subtensor(full_tensor_, &cfg_new, &sub_tensor_); } + void TensorSlicer::Next(void){ - // TODO make generic for any number of dimensions. - subtsr_cfg_.start_coord[1]+= subtsr_cfg_.first_out_dim_size; - if (subtsr_cfg_.start_coord[1] >= full_tensor_->shape[1]) { - subtsr_cfg_.start_coord[1] = 0; - subtsr_cfg_.start_coord[0]++; - if (subtsr_cfg_.start_coord[0] >= full_tensor_->shape[0]) { - done_ = true; + for (int i = full_tensor_->rank - 1; i >= 0; i--) { + sub_cfg_.offset[i] += sub_cfg_.size[i]; + if (sub_cfg_.offset[i] >= full_tensor_->shape[i]){ + // wrap + sub_cfg_.offset[i] = 0; + // and continue to the next dimension, if no next dimension we are done. + if (i == 0) done_ = true; + continue; + } else { + // carry is false, so break from the loop + break; } } + if (!done_) ComputeSubTensor(); } diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.h b/tensorflow/lite/micro/kernels/arc/mli_slicers.h index 40f948a07ef..3fc7d911fa5 100644 --- a/tensorflow/lite/micro/kernels/arc/mli_slicers.h +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.h @@ -24,7 +24,7 @@ namespace micro { class TensorSlicer { public: - TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre = 0, int padding_post = 0, int overlap = 0); + TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre = 0, int padding_post = 0, int overlap = 0, bool interleave_mode = false); ~TensorSlicer() = default; void Next(); @@ -41,7 +41,7 @@ public: private: const mli_tensor* full_tensor_; mli_tensor sub_tensor_; - mli_point_to_subtsr_cfg subtsr_cfg_; + mli_sub_tensor_cfg sub_cfg_; bool done_; int sliceDim_; int pad_pre_, pad_post_, overlap_; diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/arc/pooling.cc index dab0ad7e314..0cfa5363d69 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc/pooling.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" #include "mli_api.h" @@ -139,33 +140,42 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + const int heightDimension = 1; + int inSliceHeight = 0; + int outSliceHeight = 0; + const int overlap = cfg.kernel_height - cfg.stride_height; + // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor in_local = sub_mli_in; mli_tensor out_local = sub_mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_io_tensors(context, &in_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); - const int batches = - MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. + The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); - for (int i = 0; i < batches; i++) { - mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); - mli_krn_avepool_hwc_sa8(&in_local, &cfg, &out_local); - mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); - if (i == batches -1) break; - subtsr_cfg_in.start_coord[0]++; - subtsr_cfg_out.start_coord[0]++; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - if (in_is_local) { - in_local.data = sub_mli_in.data; - } - if (out_is_local) { - out_local.data = sub_mli_out.data; - } + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + mli_krn_avepool_hwc_sa8(in_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); } free_arc_scratch_buffers(); } else { diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc index 26f4f45f17f..e9adbb37e9e 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc @@ -66,22 +66,128 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, mli_tensor* weights, mli_tensor* bias, mli_tensor* out) { +TfLiteStatus ret_val = kTfLiteOk; #ifdef __Xxy if (!inside_arc_ccm(weights->data)) { int weights_size = mli_hlp_count_elem_num(weights, 0) * mli_hlp_tensor_element_size(weights); + int maxWeightsSize = 0; weights->data = get_arc_scratch_buffer(weights_size); weights->capacity = weights_size; - if (weights->data == NULL) return kTfLiteError; + if (weights->data == NULL) { + get_arc_scratch_buffer_max_size(&maxWeightsSize); + weights->data = get_arc_scratch_buffer(maxWeightsSize); + weights->capacity = maxWeightsSize; + if (maxWeightsSize == 0) ret_val = kTfLiteError; + } + if (weights->data == NULL) ret_val = kTfLiteError; } if (!inside_arc_ccm(bias->data)) { uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); bias->data = get_arc_scratch_buffer(bias_mem_requirements); bias->capacity = bias_mem_requirements; - if (bias->data == NULL) return kTfLiteError; + } + if (ret_val == kTfLiteOk) { + ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out); + } + if (bias->data == NULL) { + int maxBiasSize = 0; + get_arc_scratch_buffer_max_size(&maxBiasSize); + bias->data = get_arc_scratch_buffer(maxBiasSize); + bias->capacity = maxBiasSize; + if (maxBiasSize == 0) ret_val = kTfLiteError; + } + if (bias->data == NULL) ret_val = kTfLiteError; + +#endif + return ret_val; +} + +TfLiteStatus arc_scratch_buffer_calc_slice_size_io( + const mli_tensor *in, + const mli_tensor *out, + const int kernelHeight, + const int strideHeight, + const int padding_top, + const int padding_bot, + int *inSliceHeight, + int *outSliceHeight) { + const int heightDimension = 1; // todo: compute from rank + const int inHeight = in->shape[heightDimension]; + const int outHeight = out->shape[heightDimension]; + const int lineSizeIn = mli_hlp_count_elem_num(in, heightDimension + 1) * mli_hlp_tensor_element_size(in); + const int lineSizeOut = mli_hlp_count_elem_num(out, heightDimension + 1) * mli_hlp_tensor_element_size(out); + int maxLinesIn = 0; + int maxLinesOut = 0; + int maxOutLinesForInput = 0; + bool fit = (in->capacity >= inHeight * lineSizeIn) && (out->capacity >= outHeight * lineSizeOut); + if (fit) { + // in case both tensors completely fit in the capacity, there is no need for slicing + *inSliceHeight = inHeight; + *outSliceHeight = outHeight; + } else { + // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. + maxLinesIn = MIN(inHeight, in->capacity / lineSizeIn); + if (maxLinesIn >= inHeight) { + maxOutLinesForInput = outHeight; + } else if (2 * maxLinesIn >= inHeight) { + // in this case only two slices are needed, so both could benefit from padding. take the MIN to get the worst case. + maxOutLinesForInput = (maxLinesIn + MIN(padding_top, padding_bot) - kernelHeight + 1) / strideHeight; + } else { + maxOutLinesForInput = (maxLinesIn - kernelHeight + 1) / strideHeight; // TODO add padding exceptions and test by makin fit=false; + } + // Ten compute how many ouput lines fit into the output tensor. + maxLinesOut = MIN(outHeight, out->capacity / lineSizeOut); + // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. + *outSliceHeight = MIN(maxOutLinesForInput, maxLinesOut); + *inSliceHeight = *outSliceHeight * strideHeight; } + if ((*inSliceHeight > 0) && (*outSliceHeight > 0)) { + return kTfLiteOk; + } else { + return kTfLiteError; + } +} + +TfLiteStatus arc_scratch_buffer_calc_slice_size_weights( + const mli_tensor *weights, + const mli_tensor *bias, + int *sliceChannels) { + const int weightOutChDimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. + const int channels = weights->shape[weightOutChDimension]; + + + const int chSizeW = mli_hlp_count_elem_num(weights, weightOutChDimension + 1) * mli_hlp_tensor_element_size(weights); + const int chSizeB = mli_hlp_count_elem_num(bias, weightOutChDimension + 1) * mli_hlp_tensor_element_size(bias); + int maxChWeights = 0; + int maxChBias = 0; + + bool fit = (weights->capacity >= channels * chSizeW) && (bias->capacity >= channels * chSizeB); + if (fit) { + // in case both tensors completely fit in the capacity, there is no need for slicing + *sliceChannels = channels; + } else { + // First compute how many channels fit into the weights tensor + maxChWeights = MIN(channels, weights->capacity / chSizeW); + // Ten compute how many channels fit into the bias tensor. + maxChBias = MIN(channels, bias->capacity / chSizeB); + // the smallest of the two determines the slice size + *sliceChannels = MIN(maxChWeights, maxChBias); + } + + if (*sliceChannels > 0) { + return kTfLiteOk; + } else { + return kTfLiteError; + } +} + +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out) { +#ifdef __Xxy int requestSizeIn = 0; int requestSizeOut = 0; int grantsizeIn = 0; @@ -89,8 +195,8 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, if (!inside_arc_ccm(in->data)) { // In case the input tensor contains multiple batches, it has rank 4 // because the mli kernel cannot operate on batches, we need to have the size - // of a single batch. that is why the startRank is 1 in case of input rank 4 - int startRank = in->rank - 3; // tOdo explain + // of a single HWC tensor. that is why the startRank is 1 in case of input rank 4 + int startRank = in->rank - 3; requestSizeIn = mli_hlp_count_elem_num(in, startRank) * mli_hlp_tensor_element_size(in); } if (!inside_arc_ccm(out->data)) { @@ -113,76 +219,6 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, out->capacity = grantsizeOut; if (out->data == NULL) return kTfLiteError; } - - return kTfLiteOk; -#else - return kTfLiteOk; -#endif -} - -TfLiteStatus arc_scratch_buffer_calc_slice_size_io( - const mli_tensor *in, - const mli_tensor *out, - const int kernelHeight, - const int strideHeight, - int *inSliceHeight, - int *outSliceHeight) { - const int heightDimension = 1; // todo: compute from rank - const int inHeight = in->shape[heightDimension]; - const int outHeight = out->shape[heightDimension]; - const int lineSizeIn = mli_hlp_count_elem_num(in, heightDimension + 1) * mli_hlp_tensor_element_size(in); - const int lineSizeOut = mli_hlp_count_elem_num(out, heightDimension + 1) * mli_hlp_tensor_element_size(out); - int maxLinesIn = 0; - int maxLinesOut = 0; - int maxOutLinesForInput = 0; - bool fit = (in->capacity >= inHeight * lineSizeIn) && (out->capacity >= outHeight * lineSizeOut); - if (fit) { - // in case both tensors completely fit in the capacity, there is no need for slicing - *inSliceHeight = inHeight; - *outSliceHeight = outHeight; - } else { - // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. - maxLinesIn = MIN(inHeight, in->capacity / lineSizeIn); - if (maxLinesIn >= inHeight) { - maxOutLinesForInput = outHeight; - } else { - maxOutLinesForInput = (maxLinesIn - kernelHeight + 1) / strideHeight; // TODO add padding exceptions and test by makin fit=false; - } - // Ten compute how many ouput lines fit into the output tensor. - maxLinesOut = MIN(outHeight, out->capacity / lineSizeOut); - // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. - *outSliceHeight = MIN(maxOutLinesForInput, maxLinesOut); - *inSliceHeight = *outSliceHeight * strideHeight; - } - if ((*inSliceHeight > 0) && (*outSliceHeight > 0)) { - return kTfLiteOk; - } else { - return kTfLiteError; - } -} - -TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* out) { -#ifdef __Xxy - // Function to assign fast memory from one of 3 scratch buffers. - // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused - mli_tensor* tensors[2] = { in, out }; - uint32_t tensor_sizes[2] = { - mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0)}; - int num_tensors = 2; - - - for (int i = 0; i < num_tensors; ++i) { - // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. - if (inside_arc_ccm(tensors[i]->data)) continue; - tensors[i]->data = get_arc_scratch_buffer(tensor_sizes[i]); - tensors[i]->capacity = tensor_sizes[i]; - - if (tensors[i]->data == NULL) { - return kTfLiteError; - } - } #endif return kTfLiteOk; } diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h index a27df8a5358..fc348229235 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h @@ -64,9 +64,15 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( const mli_tensor *out, const int kernelHeight, const int strideHeight, + const int padding_top, + const int padding_bot, int *inSliceHeight, int *outSliceHeight); +TfLiteStatus arc_scratch_buffer_calc_slice_size_weights( + const mli_tensor *weights, + const mli_tensor *bias, + int *sliceChannels); } // namespace micro } // namespace ops diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 5ef1b445a22..106743cf471 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -74,8 +74,9 @@ void *get_arc_scratch_buffer(int size) { void *buf = NULL; int best_mem_idx = -1; int best_mem_delta = INT_MAX; + const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); // find a local memory that fits the data size. - for (int mem_idx = 0; mem_idx < sizeof(scratch_mem)/sizeof(scratch_mem[0]); ++mem_idx) { + for (int mem_idx = 0; mem_idx < numMem; ++mem_idx) { // Best Fit if ((size <= scratch_sizes[mem_idx]) && (scratch_sizes[mem_idx] - size < best_mem_delta)) { best_mem_idx = mem_idx; @@ -90,11 +91,24 @@ void *get_arc_scratch_buffer(int size) { return buf; } +void get_arc_scratch_buffer_max_size(int *size) { + int maxavailable = 0; + const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); + // find the largest available buffer. + for (int i = 0; i < numMem; i++) { + if (scratch_sizes[i] > maxavailable) { + maxavailable = scratch_sizes[i]; + } + } + *size = maxavailable; +} + void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) { int maxavailable = 0; int secondavail = 0; + const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); // find the two largest available buffers. - for (int i = 0; i < 3; i++) { + for (int i = 0; i < numMem; i++) { if (scratch_sizes[i] > maxavailable) { secondavail = maxavailable; maxavailable = scratch_sizes[i]; diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h index 52a12c7899d..927e480da5a 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -27,6 +27,7 @@ namespace micro { void free_arc_scratch_buffers(void); void *get_arc_scratch_buffer(int size);// Function to assign fast memory from one of 3 scratch buffers. +void get_arc_scratch_buffer_max_size(int *size); void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2); static inline bool inside_arc_dccm(void* p) { From 330c649075978d1718c7b590da38dea640f67698 Mon Sep 17 00:00:00 2001 From: jacco Date: Thu, 26 Mar 2020 17:25:37 +0100 Subject: [PATCH 023/557] weight slicing for depthwise and fully connected in ARC backend --- tensorflow/lite/micro/kernels/arc/conv.cc | 39 +-- .../lite/micro/kernels/arc/depthwise_conv.cc | 106 +++++-- .../lite/micro/kernels/arc/fully_connected.cc | 93 ++++-- .../lite/micro/kernels/arc/mli_slicers.cc | 2 +- tensorflow/lite/micro/kernels/arc/pooling.cc | 19 +- .../lite/micro/kernels/arc/scratch_buf_mgr.cc | 288 +++++++++++------- .../lite/micro/kernels/arc/scratch_buf_mgr.h | 71 ++++- .../lite/micro/kernels/arc/scratch_buffers.cc | 14 +- .../lite/micro/kernels/arc/scratch_buffers.h | 2 +- 9 files changed, 434 insertions(+), 200 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 9e9a37821e8..6cf26c7d6d9 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -201,16 +201,16 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, } // for height slicing - const int heightDimension = 1; - int inSliceHeight = 0; - int outSliceHeight = 0; - const int kernelHeight = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); - const int overlap = kernelHeight - cfg.stride_height; + const int height_dimension = 1; + int in_slice_height = 0; + int out_slice_height = 0; + const int kernel_height = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); + const int overlap = kernel_height - cfg.stride_height; // for weight slicing (on output channels) - const int weightOutChDimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. - int sliceChannels = static_cast(mli_weights.shape[weightOutChDimension]); - const int outTensorChDimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + const int weight_out_ch_dimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. + int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); + const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; @@ -220,8 +220,8 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, &sliceChannels)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, &out_slice_height)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); /* is_local indicates that the tensor is already in local memory, so in that case the original tensor can be used, @@ -231,14 +231,15 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const bool w_is_local = weights_local.data == mli_weights.data; const bool b_is_local = bias_local.data == mli_bias.data; - TensorSlicer w_slice(&mli_weights, weightOutChDimension, sliceChannels); - TensorSlicer b_slice(&mli_bias, weightOutChDimension, sliceChannels); - TensorSlicer out_ch_slice(&mli_out, outTensorChDimension, sliceChannels, 0, 0, 0, true); + TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels); + TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, slice_channels); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - void *inputBufferPtr = NULL; + void *input_buffer_ptr = NULL; + int input_buffer_size = 0; while (!w_slice.Done()){ mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); @@ -249,12 +250,12 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, cfg.padding_top, cfg.padding_bottom, overlap); /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and height dimension. */ - TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); + TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension, out_slice_height); /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; @@ -266,9 +267,10 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = in_slice.GetPaddingPost(); // if same input copy as previous iteration, skip the copy of input - if (in_slice.Sub()->data != inputBufferPtr) { + if ((in_slice.Sub()->data != input_buffer_ptr) || (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - inputBufferPtr = in_slice.Sub()->data; + input_buffer_ptr = in_slice.Sub()->data; + input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); } mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); @@ -282,7 +284,6 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TF_LITE_ENSURE(context, in_slice.Done()); } - free_arc_scratch_buffers(); } else { ConvParams op_params; op_params.input_offset = -input->params.zero_point; diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index 00c46c442b7..74e48c8c064 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -191,12 +191,21 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } + // for height slicing const int heightDimension = 1; int inSliceHeight = 0; int outSliceHeight = 0; const int kernelHeight = static_cast(mli_weights.shape[KRNL_DW_H_DIM_HWC]); const int overlap = kernelHeight - cfg.stride_height; + // for weight slicing (on output channels) + const int weight_out_ch_dimension = 3; // HWCN layout for weigths, output channel dimension is the first dimension. + const int bias_out_ch_dimension = 0; // bias has only 1 dimension + const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension]; + const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension]; + int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); + // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; @@ -206,38 +215,83 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - /* if the tensor is already in local memory, is_local is true */ + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ const bool in_is_local = in_local.data == mli_in.data; const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); - /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. - on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); - TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); - - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; - - mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); - mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); - - while (!out_slice.Done()) { - cfg.padding_top = in_slice.GetPaddingPre(); - cfg.padding_bottom = in_slice.GetPaddingPost(); - - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); - - in_slice.Next(); - out_slice.Next(); + /* if input channels is not equal to output channels, a channel multiplier is used. + in this case the slice channels needs to be rounded down to a multiple of the input channels */ + if (in_channels != out_channels) { + slice_channels = (slice_channels / in_channels) * in_channels; } - free_arc_scratch_buffers(); + + TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + + mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + + void *input_buffer_ptr = NULL; + int input_buffer_size = 0; + int padding_top = cfg.padding_top; + int padding_bottom = cfg.padding_bottom; + + while (!w_slice.Done()){ + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); + + /* input tensor is alreade sliced in the channel dimension. out_ch_slice.Sub() is the tensor for the amount of + channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and + height dimension. + in_ch_slice.Sub() tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. + The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, padding_top, padding_bottom, overlap); + + /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of + output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and + height dimension. */ + TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); + + /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + TF_LITE_ENSURE(context, !in_slice.Done()); + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + // if same input copy as previous iteration, skip the copy of input + if ((in_slice.Sub()->data != input_buffer_ptr) || (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + input_buffer_ptr = in_slice.Sub()->data; + input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); + } + mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); + in_ch_slice.Next(); + TF_LITE_ENSURE(context, in_slice.Done()); + } + } else { DepthwiseParams op_params; op_params.padding_type = PaddingType::kSame; diff --git a/tensorflow/lite/micro/kernels/arc/fully_connected.cc b/tensorflow/lite/micro/kernels/arc/fully_connected.cc index 42921037481..cc9b95c570a 100644 --- a/tensorflow/lite/micro/kernels/arc/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc/fully_connected.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" #include "mli_api.h" @@ -100,44 +101,80 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, ConvertToMliTensor(bias, &mli_bias); ConvertToMliTensor(output, &mli_out); - mli_point_to_subtsr_cfg subtsr_cfg_in = {{0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg subtsr_cfg_out = {{0, 0}, 2, static_cast(mli_out.shape[1])}; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + /* The input tensor can have more than 2 dimensions. for the compute this doesn't make any difference + because all the inputs or a batch entry will be used anyway. because the MLI kernel doesn't recognize + the multiple dimensions, the tensor shape is casted to a {batchnum, inputsize} shape. */ + mli_in.shape[0] = mli_out.shape[0]; + mli_in.shape[1] = mli_weights.shape[1]; + mli_in.shape[2] = 0; + mli_in.shape[3] = 0; + mli_in.rank = 2; // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + const int weight_out_dimension = 0; + const int out_tensor_dimension = 1; + const int batch_dimension = 0; + int slice_size = mli_weights.shape[weight_out_dimension]; - mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); - mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); + /* allocate the local buffers, and compute the slice size */ + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_dimension, &slice_size)); + int max_out_slice_size = out_local.capacity / mli_hlp_tensor_element_size(&out_local); + if (slice_size > max_out_slice_size) slice_size = max_out_slice_size; - const int batches = - MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; - for (int i = 0; i < batches; i++) { - mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); - mli_krn_fully_connected_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &out_local); - mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); - subtsr_cfg_in.start_coord[0]++; - subtsr_cfg_out.start_coord[0]++; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - if (in_is_local) { - in_local.data = sub_mli_in.data; - } - if (out_is_local) { - out_local.data = sub_mli_out.data; + TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size); + TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size); + TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0, true); + + mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + + void *input_buffer_ptr = NULL; + + while (!w_slice.Done()){ + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); + + TensorSlicer in_slice(&mli_in, batch_dimension, 1); + + /* output tensor is alreade sliced in the output size dimension. out_ch_slice.Sub() is the tensor for the amount of + output size of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch */ + TensorSlicer out_slice(out_ch_slice.Sub(), batch_dimension, 1); + + /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + + // if same input copy as previous iteration, skip the copy of input + if (in_slice.Sub()->data != input_buffer_ptr) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + input_buffer_ptr = in_slice.Sub()->data; + } + mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); } } else { FullyConnectedParams op_params; diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc index 6c6c89715f8..91bae5caa38 100644 --- a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc @@ -48,7 +48,7 @@ TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int sli sub_cfg_.sub_tensor_rank = full_tensor->rank; } else { - /* In the not interlevaed mode, the slicing happens from the outer most dimension up to the slice_dim + /* In the not interleaved mode, the slicing happens from the outer most dimension up to the slice_dim for example in an HWC layout this mode can be used to slice in the H dimension. in this mode the data of the slice is still contiguous in memory (if that was the case in the input tensor */ for (int i = 0; i< full_tensor->rank; i++){ diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/arc/pooling.cc index 0cfa5363d69..7a26a10e23b 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc/pooling.cc @@ -140,9 +140,9 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - const int heightDimension = 1; - int inSliceHeight = 0; - int outSliceHeight = 0; + const int height_dimension = 1; + int in_slice_height = 0; + int out_slice_height = 0; const int overlap = cfg.kernel_height - cfg.stride_height; // Tensors for data in fast (local) memory and config to copy data from external to local memory @@ -150,19 +150,22 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, mli_tensor out_local = sub_mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_io_tensors(context, &in_local, &out_local)); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors(context, &in_local, &out_local)); bool in_is_local = in_local.data == sub_mli_in.data; bool out_is_local = out_local.data == sub_mli_out.data; - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, &out_slice_height)); /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); - TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); + TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, height_dimension, out_slice_height); + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; @@ -177,7 +180,7 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, in_slice.Next(); out_slice.Next(); } - free_arc_scratch_buffers(); + } else { int32_t activation_min, activation_max; (void)CalculateActivationRangeQuantized(context, params->activation, output, diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc index e9adbb37e9e..5bd2d6aed22 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc @@ -23,21 +23,19 @@ namespace tflite { namespace ops { namespace micro { - - -void get_arc_two_buffer_sizes(int requestsize1, int requestsize2, int *grantsize1, int *grantsize2) { +static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2, int *grant_size_1, int *grant_size_2) { int maxrequest = 0; int secondrequest = 0; int maxavailable = 0; int secondavail = 0; // determine the largest requested buffer. - if (requestsize1 > requestsize2) { - maxrequest = requestsize1; - secondrequest = requestsize2; + if (request_size_1 > request_size_2) { + maxrequest = request_size_1; + secondrequest = request_size_2; } else { - maxrequest = requestsize2; - secondrequest = requestsize1; + maxrequest = request_size_2; + secondrequest = request_size_1; } // find the two largest available buffers. @@ -45,40 +43,79 @@ void get_arc_two_buffer_sizes(int requestsize1, int requestsize2, int *grantsize // in case two buffers are available, the largest buffer can go to the largest request. if (secondavail > 0) { // this condition can be enhanced to prevent cases where the second buffer is so small that it is better to use one buffer and split it. - if (requestsize1 > requestsize2) { - *grantsize1 = maxavailable; - *grantsize2 = secondavail; + if (request_size_1 > request_size_2) { + *grant_size_1 = maxavailable; + *grant_size_2 = secondavail; } else { - *grantsize1 = secondavail; - *grantsize2 = maxavailable; + *grant_size_1 = secondavail; + *grant_size_2 = maxavailable; } } else { // In case only one buffer is available, // use only the max buffer, and split it. // TODO compute optimal split ratio based on request ratio. - *grantsize1 = maxavailable / 2; - *grantsize2 = maxavailable / 2; + *grant_size_1 = maxavailable / 2; + *grant_size_2 = maxavailable / 2; } } +static TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out) { +#ifdef __Xxy + int request_size_in = 0; + int request_size_out = 0; + int grant_size_in = 0; + int grant_size_out = 0; + if (!inside_arc_ccm(in->data)) { + // In case the input tensor contains multiple batches, it has rank 4 + // because the mli kernel cannot operate on batches, we need to have the size + // of a single HWC tensor. that is why the start_rank is 1 in case of input rank 4 + int start_rank = in->rank - 3; + request_size_in = mli_hlp_count_elem_num(in, start_rank) * mli_hlp_tensor_element_size(in); + } + if (!inside_arc_ccm(out->data)) { + // In case the input tensor contains multiple batches, it has rank 4 + // because the mli kernel cannot operate on batches, we need to have the size + // of a single batch. that is why the start_rank is 1 in case of input rank 4 + int start_rank = out->rank - 3; + request_size_out = mli_hlp_count_elem_num(out, start_rank) * mli_hlp_tensor_element_size(out); + } + + get_arc_two_buffer_sizes(request_size_in, request_size_out, &grant_size_in, &grant_size_out); + + if (!inside_arc_ccm(in->data)) { + in->data = get_arc_scratch_buffer(grant_size_in); + in->capacity = grant_size_in; + if (in->data == NULL) return kTfLiteError; + } + if (!inside_arc_ccm(out->data)) { + out->data = get_arc_scratch_buffer(grant_size_out); + out->capacity = grant_size_out; + if (out->data == NULL) return kTfLiteError; + } +#endif + return kTfLiteOk; +} + TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* weights, - mli_tensor* bias, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, mli_tensor* out) { TfLiteStatus ret_val = kTfLiteOk; #ifdef __Xxy - + init_arc_scratch_buffers(); if (!inside_arc_ccm(weights->data)) { int weights_size = mli_hlp_count_elem_num(weights, 0) * mli_hlp_tensor_element_size(weights); - int maxWeightsSize = 0; + int max_weights_size = 0; weights->data = get_arc_scratch_buffer(weights_size); weights->capacity = weights_size; if (weights->data == NULL) { - get_arc_scratch_buffer_max_size(&maxWeightsSize); - weights->data = get_arc_scratch_buffer(maxWeightsSize); - weights->capacity = maxWeightsSize; - if (maxWeightsSize == 0) ret_val = kTfLiteError; + get_arc_scratch_buffer_max_size(&max_weights_size); + weights->data = get_arc_scratch_buffer(max_weights_size); + weights->capacity = max_weights_size; + if (max_weights_size == 0) ret_val = kTfLiteError; } if (weights->data == NULL) ret_val = kTfLiteError; } @@ -88,15 +125,92 @@ TfLiteStatus ret_val = kTfLiteOk; bias->data = get_arc_scratch_buffer(bias_mem_requirements); bias->capacity = bias_mem_requirements; } + if (ret_val == kTfLiteOk) { ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out); } + if (bias->data == NULL) { - int maxBiasSize = 0; - get_arc_scratch_buffer_max_size(&maxBiasSize); - bias->data = get_arc_scratch_buffer(maxBiasSize); - bias->capacity = maxBiasSize; - if (maxBiasSize == 0) ret_val = kTfLiteError; + int max_bias_size = 0; + get_arc_scratch_buffer_max_size(&max_bias_size); + bias->data = get_arc_scratch_buffer(max_bias_size); + bias->capacity = max_bias_size; + if (max_bias_size == 0) ret_val = kTfLiteError; + } + if (bias->data == NULL) ret_val = kTfLiteError; + +#endif + return ret_val; +} + +TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out) { +TfLiteStatus ret_val = kTfLiteOk; +#ifdef __Xxy + init_arc_scratch_buffers(); + /* strategy for FC kernels: + first allocate input, because this cannot be sliced. (in case of batch processing, only a single input needs to be allocated) + then weigths & bias because if fully loaded, they can be reused over batches. + then output. + The number of output channels (for weights slicing) depends on size of output and size of weights&bias */ + + if (!inside_arc_ccm(in->data)) { + /* In case the input tensor contains multiple batches, + only count the size if the inner most dimension */ + int size_in = mli_hlp_count_elem_num(in, in->rank - 1) * mli_hlp_tensor_element_size(in); + in->data = get_arc_scratch_buffer(size_in); + in->capacity = size_in; + if (in->data == NULL) { + in->capacity = 0; + ret_val = kTfLiteError; + } + } + + if (!inside_arc_ccm(weights->data)) { + int weights_size = mli_hlp_count_elem_num(weights, 0) * mli_hlp_tensor_element_size(weights); + int max_weights_size = 0; + weights->data = get_arc_scratch_buffer(weights_size); + weights->capacity = weights_size; + if (weights->data == NULL) { + get_arc_scratch_buffer_max_size(&max_weights_size); + weights->data = get_arc_scratch_buffer(max_weights_size); + weights->capacity = max_weights_size; + if (max_weights_size == 0) ret_val = kTfLiteError; + } + if (weights->data == NULL) ret_val = kTfLiteError; + } + + if (!inside_arc_ccm(bias->data)) { + int bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); + bias->data = get_arc_scratch_buffer(bias_mem_requirements); + bias->capacity = bias_mem_requirements; + } + + if (!inside_arc_ccm(out->data)) { + /* In case the input tensor contains multiple batches, + only count the size if the inner most dimension */ + int out_size = mli_hlp_count_elem_num(out, out->rank - 1) * mli_hlp_tensor_element_size(out); + int max_out_size = 0; + out->data = get_arc_scratch_buffer(out_size); + out->capacity = out_size; + if (out->data == NULL) { + get_arc_scratch_buffer_max_size(&max_out_size); + out->data = get_arc_scratch_buffer(max_out_size); + out->capacity = max_out_size; + if (max_out_size == 0) ret_val = kTfLiteError; + } + if (out->data == NULL) ret_val = kTfLiteError; + } + + if (bias->data == NULL) { + int max_bias_size = 0; + get_arc_scratch_buffer_max_size(&max_bias_size); + bias->data = get_arc_scratch_buffer(max_bias_size); + bias->capacity = max_bias_size; + if (max_bias_size == 0) ret_val = kTfLiteError; } if (bias->data == NULL) ret_val = kTfLiteError; @@ -107,44 +221,44 @@ TfLiteStatus ret_val = kTfLiteOk; TfLiteStatus arc_scratch_buffer_calc_slice_size_io( const mli_tensor *in, const mli_tensor *out, - const int kernelHeight, - const int strideHeight, + const int kernel_height, + const int stride_height, const int padding_top, const int padding_bot, - int *inSliceHeight, - int *outSliceHeight) { - const int heightDimension = 1; // todo: compute from rank - const int inHeight = in->shape[heightDimension]; - const int outHeight = out->shape[heightDimension]; - const int lineSizeIn = mli_hlp_count_elem_num(in, heightDimension + 1) * mli_hlp_tensor_element_size(in); - const int lineSizeOut = mli_hlp_count_elem_num(out, heightDimension + 1) * mli_hlp_tensor_element_size(out); - int maxLinesIn = 0; - int maxLinesOut = 0; - int maxOutLinesForInput = 0; - bool fit = (in->capacity >= inHeight * lineSizeIn) && (out->capacity >= outHeight * lineSizeOut); + int *in_slice_height, + int *out_slice_height) { + const int height_dimension = 1; // todo: compute from rank + const int in_height = in->shape[height_dimension]; + const int out_height = out->shape[height_dimension]; + const int line_size_in = mli_hlp_count_elem_num(in, height_dimension + 1) * mli_hlp_tensor_element_size(in); + const int line_size_out = mli_hlp_count_elem_num(out, height_dimension + 1) * mli_hlp_tensor_element_size(out); + int max_lines_in = 0; + int max_lines_out = 0; + int max_out_lines_for_input = 0; + bool fit = (in->capacity >= in_height * line_size_in) && (out->capacity >= out_height * line_size_out); if (fit) { // in case both tensors completely fit in the capacity, there is no need for slicing - *inSliceHeight = inHeight; - *outSliceHeight = outHeight; + *in_slice_height = in_height; + *out_slice_height = out_height; } else { // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. - maxLinesIn = MIN(inHeight, in->capacity / lineSizeIn); - if (maxLinesIn >= inHeight) { - maxOutLinesForInput = outHeight; - } else if (2 * maxLinesIn >= inHeight) { + max_lines_in = MIN(in_height, in->capacity / line_size_in); + if (max_lines_in >= in_height) { + max_out_lines_for_input = out_height; + } else if (2 * max_lines_in >= in_height) { // in this case only two slices are needed, so both could benefit from padding. take the MIN to get the worst case. - maxOutLinesForInput = (maxLinesIn + MIN(padding_top, padding_bot) - kernelHeight + 1) / strideHeight; + max_out_lines_for_input = (max_lines_in + MIN(padding_top, padding_bot) - kernel_height + 1) / stride_height; } else { - maxOutLinesForInput = (maxLinesIn - kernelHeight + 1) / strideHeight; // TODO add padding exceptions and test by makin fit=false; + max_out_lines_for_input = (max_lines_in - kernel_height + 1) / stride_height; // TODO add padding exceptions and test by makin fit=false; } // Ten compute how many ouput lines fit into the output tensor. - maxLinesOut = MIN(outHeight, out->capacity / lineSizeOut); + max_lines_out = MIN(out_height, out->capacity / line_size_out); // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. - *outSliceHeight = MIN(maxOutLinesForInput, maxLinesOut); - *inSliceHeight = *outSliceHeight * strideHeight; + *out_slice_height = MIN(max_out_lines_for_input, max_lines_out); + *in_slice_height = *out_slice_height * stride_height; } - if ((*inSliceHeight > 0) && (*outSliceHeight > 0)) { + if ((*in_slice_height > 0) && (*out_slice_height > 0)) { return kTfLiteOk; } else { return kTfLiteError; @@ -154,73 +268,43 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( TfLiteStatus arc_scratch_buffer_calc_slice_size_weights( const mli_tensor *weights, const mli_tensor *bias, - int *sliceChannels) { - const int weightOutChDimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. - const int channels = weights->shape[weightOutChDimension]; + const int weight_out_ch_dimension, + int *slice_channels) { + const int channels = weights->shape[weight_out_ch_dimension]; + const int ch_size_w = (mli_hlp_count_elem_num(weights, 0) / channels) * mli_hlp_tensor_element_size(weights); + const int ch_size_b = (mli_hlp_count_elem_num(bias, 0) / channels) * mli_hlp_tensor_element_size(bias); + int max_ch_weigths = 0; + int max_ch_bias = 0; - - const int chSizeW = mli_hlp_count_elem_num(weights, weightOutChDimension + 1) * mli_hlp_tensor_element_size(weights); - const int chSizeB = mli_hlp_count_elem_num(bias, weightOutChDimension + 1) * mli_hlp_tensor_element_size(bias); - int maxChWeights = 0; - int maxChBias = 0; - - bool fit = (weights->capacity >= channels * chSizeW) && (bias->capacity >= channels * chSizeB); + bool fit = (weights->capacity >= channels * ch_size_w) && (bias->capacity >= channels * ch_size_b); if (fit) { // in case both tensors completely fit in the capacity, there is no need for slicing - *sliceChannels = channels; + *slice_channels = channels; } else { // First compute how many channels fit into the weights tensor - maxChWeights = MIN(channels, weights->capacity / chSizeW); + max_ch_weigths = MIN(channels, weights->capacity / ch_size_w); // Ten compute how many channels fit into the bias tensor. - maxChBias = MIN(channels, bias->capacity / chSizeB); + max_ch_bias = MIN(channels, bias->capacity / ch_size_b); // the smallest of the two determines the slice size - *sliceChannels = MIN(maxChWeights, maxChBias); + *slice_channels = MIN(max_ch_weigths, max_ch_bias); } - if (*sliceChannels > 0) { + if (*slice_channels > 0) { return kTfLiteOk; } else { return kTfLiteError; } } -TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, +TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context, mli_tensor* in, mli_tensor* out) { #ifdef __Xxy - int requestSizeIn = 0; - int requestSizeOut = 0; - int grantsizeIn = 0; - int grantsizeOut = 0; - if (!inside_arc_ccm(in->data)) { - // In case the input tensor contains multiple batches, it has rank 4 - // because the mli kernel cannot operate on batches, we need to have the size - // of a single HWC tensor. that is why the startRank is 1 in case of input rank 4 - int startRank = in->rank - 3; - requestSizeIn = mli_hlp_count_elem_num(in, startRank) * mli_hlp_tensor_element_size(in); - } - if (!inside_arc_ccm(out->data)) { - // In case the input tensor contains multiple batches, it has rank 4 - // because the mli kernel cannot operate on batches, we need to have the size - // of a single batch. that is why the startRank is 1 in case of input rank 4 - int startRank = out->rank - 3; - requestSizeOut = mli_hlp_count_elem_num(out, startRank) * mli_hlp_tensor_element_size(out); - } - - get_arc_two_buffer_sizes(requestSizeIn, requestSizeOut, &grantsizeIn, &grantsizeOut); - - if (!inside_arc_ccm(in->data)) { - in->data = get_arc_scratch_buffer(grantsizeIn); - in->capacity = grantsizeIn; - if (in->data == NULL) return kTfLiteError; - } - if (!inside_arc_ccm(out->data)) { - out->data = get_arc_scratch_buffer(grantsizeOut); - out->capacity = grantsizeOut; - if (out->data == NULL) return kTfLiteError; - } -#endif + init_arc_scratch_buffers(); + return get_arc_scratch_buffer_for_io_tensors(context, in, out); +#else return kTfLiteOk; +#endif } } // namespace micro diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h index fc348229235..276f976cf0f 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h @@ -38,13 +38,13 @@ namespace micro { * @return Tf Lite status code */ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* weights, - mli_tensor* bias, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, mli_tensor* out); /** - * @brief Function to allocate scratch buffers for kernels with only input and output buffers + * @brief Function to allocate scratch buffers for pooling kernels with only input and output buffers * * @detail This function will update the data pointers in the 2 tensors with pointers * to scratch buffers in fast local memory. @@ -55,10 +55,49 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, * * @return Tf Lite status code */ -TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, +TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context, mli_tensor* in, mli_tensor* out); +/** + * @brief Function to allocate scratch buffers for the fully connect tensors + * + * @detail This function will update the data pointers in the 4 tensors with pointers + * to scratch buffers in fast local memory. + * + * @param context [I] pointer to TfLite context (needed for error handling) + * @param in [IO] pointer to the input tensor + * @param weights [IO] pointer to the weights tensor + * @param bias [IO] pointer to the bias tensor + * @param output [IO] pointer to the output tensor + * + * @return Tf Lite status code + */ +TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out); + +/** + * @brief Function to calculate slice size for io tensors + * + * @detail This function will calculate the slice size in the height dimension + * for input and output tensors. it takes into account the kernel size and the padding. + * the function will look at the capacity filed in the in and out tensor to + * determine the available buffersize. + * + * @param in [I] pointer to the input tensor + * @param out [I] pointer to the output tensor + * @param kernelHeight [I] size of the kernel in height dimension + * @param strideHeight [I] input stride in height dimension + * @param padding_top [I] number of lines with zeros at the top + * @param padding_bot [I] number of lines with zeros at the bottom + * @param inSliceHeight [O] slice size in height dimension for the input tensor + * @param outSliceHeight [O] slice size in height dimension for the output tensor + * + * @return Tf Lite status code + */ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( const mli_tensor *in, const mli_tensor *out, @@ -66,13 +105,29 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( const int strideHeight, const int padding_top, const int padding_bot, - int *inSliceHeight, - int *outSliceHeight); + int *in_slice_height, + int *out_slice_height); +/** + * @brief Function to calculate slice size for weight slicing + * + * @detail This function will calculate the slice size in the output channel dimension + * for weight and bias tensors. + * the function will look at the capacity filed in the weights and bias tensor to + * determine the available buffersize. + * + * @param weights [I] pointer to the input tensor + * @param bias [I] pointer to the output tensor + * @param weightOutChDimension [I] dimension of the output channels in the weights tensor + * @param sliceChannels [O] slice size in output channel dimension + * + * @return Tf Lite status code + */ TfLiteStatus arc_scratch_buffer_calc_slice_size_weights( const mli_tensor *weights, const mli_tensor *bias, - int *sliceChannels); + const int weight_out_ch_dimension, + int *slice_channels); } // namespace micro } // namespace ops diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 106743cf471..f36059f82d2 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -74,9 +74,9 @@ void *get_arc_scratch_buffer(int size) { void *buf = NULL; int best_mem_idx = -1; int best_mem_delta = INT_MAX; - const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); + const int num_mem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); // find a local memory that fits the data size. - for (int mem_idx = 0; mem_idx < numMem; ++mem_idx) { + for (int mem_idx = 0; mem_idx < num_mem; ++mem_idx) { // Best Fit if ((size <= scratch_sizes[mem_idx]) && (scratch_sizes[mem_idx] - size < best_mem_delta)) { best_mem_idx = mem_idx; @@ -93,9 +93,9 @@ void *get_arc_scratch_buffer(int size) { void get_arc_scratch_buffer_max_size(int *size) { int maxavailable = 0; - const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); + const int num_mem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); // find the largest available buffer. - for (int i = 0; i < numMem; i++) { + for (int i = 0; i < num_mem; i++) { if (scratch_sizes[i] > maxavailable) { maxavailable = scratch_sizes[i]; } @@ -106,9 +106,9 @@ void get_arc_scratch_buffer_max_size(int *size) { void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) { int maxavailable = 0; int secondavail = 0; - const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); + const int num_mem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); // find the two largest available buffers. - for (int i = 0; i < numMem; i++) { + for (int i = 0; i < num_mem; i++) { if (scratch_sizes[i] > maxavailable) { secondavail = maxavailable; maxavailable = scratch_sizes[i]; @@ -120,7 +120,7 @@ void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) { *size2 = secondavail; } -void free_arc_scratch_buffers(void) { +void init_arc_scratch_buffers(void) { scratch_mem[0] = scratch_mem_x; scratch_mem[1] = scratch_mem_y; scratch_mem[2] = scratch_mem_z; diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h index 927e480da5a..703c164e077 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -24,7 +24,7 @@ namespace ops { namespace micro { -void free_arc_scratch_buffers(void); +void init_arc_scratch_buffers(void); void *get_arc_scratch_buffer(int size);// Function to assign fast memory from one of 3 scratch buffers. void get_arc_scratch_buffer_max_size(int *size); From 0b15d4264d6cc5695fca35b7f68dcf64e4353bcf Mon Sep 17 00:00:00 2001 From: jacco Date: Fri, 17 Jan 2020 19:30:30 +0300 Subject: [PATCH 024/557] Minor fixes to restore 'generate_projects' target functionality --- tensorflow/lite/micro/tools/make/targets/arc_makefile.inc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 5ce2e03bfc3..eb890ef1999 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -95,4 +95,10 @@ endif endif # USE_EMBARC_MLI +# These are microcontroller-specific rules for converting the ELF output +# of the linker into a binary image that can be loaded directly. + +# Not applicable for ARC, leaving it empty. +$(BINDIR)%.bin: + endif From e6f9f08acb00745c429baf199486cb8a6e07c08c Mon Sep 17 00:00:00 2001 From: jacco Date: Tue, 21 Jan 2020 20:11:27 +0300 Subject: [PATCH 025/557] Initial implementation of TCF and LCF files support for IoTDK and EMSDP platforms --- .../micro/tools/make/helper_functions.inc | 7 + .../tools/make/targets/arc/emsdp/emsdp.lcf | 47 + .../targets/arc/emsdp/emsdp_em11d_dfss.tcf | 4907 +++++++++++++++++ .../tools/make/targets/arc/iotdk/iotdk.lcf | 47 + .../tools/make/targets/arc/iotdk/iotdk.tcf | 4621 ++++++++++++++++ .../micro/tools/make/targets/arc_makefile.inc | 15 + 6 files changed, 9644 insertions(+) create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 09771419843..a7f9bd788e3 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -145,6 +145,13 @@ ifneq ($(TCF_FILE_NAME), ) $(PRJDIR)$(3)/$(1)/$(TCF_FILE_NAME): $(TCF_FILE) @cp $$< $$@ endif + +# Special rule to copy LCF in case the local filesystem file name has been defined +ifneq ($(LCF_FILE), ) +$(PRJDIR)$(3)/$(1)/$(notdir $(LCF_FILE)): $(LCF_FILE) + @cp $$< $$@ +endif + endif endef diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf new file mode 100644 index 00000000000..fc34759d745 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -0,0 +1,47 @@ +# SYSTEM memory regions indicate where external memory might be located. +# The TCF has no specific knowledge of whether SYSTEM regions contain +# external memory or not. +# CCMWRAP memory regions indicate unusable portions of the address space +# due to CCM memory wrapping into upper addresses beyond its size + +MEMORY { + IVT : ORIGIN = 0x00000000, LENGTH = 0x60000000 + ICCM0 : ORIGIN = 0x60000000, LENGTH = 0x00020000 +# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000 +# SYSTEM1 : ORIGIN = 0x70000000, LENGTH = 0x10000000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 +# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 + XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000 +# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000 + YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000 +# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000 + SYSTEM2 : ORIGIN = 0xb0000000, LENGTH = 0x50000000 + } +SECTIONS { + GROUP BLOCK(4): { + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > SYSTEM2 + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + GROUP BLOCK(4): { + .Ydata? : {} + } > YCCM + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT + } + + diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf new file mode 100644 index 00000000000..833fa9ca9b9 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf @@ -0,0 +1,4907 @@ + + + + + + + + + + + + + + + + + + + + + + + +# +# option 16/L32/U32 Instructions +# ------ ---------- --------------------- +# +# none -/-/- None +# wlh1 1/1/1 MPYW/U, MPY/U, MPYH/U +# wlh2 2/2/2 MPYW/U, MPY/U, MPYH/U +# wlh3 2/3/3 MPYW/U, MPY/U, MPYH/U +# wlh4 2/4/5 MPYW/U, MPY/U, MPYH/U +# wlh5 5/9/9 MPYW/U, MPY/U, MPYH/U +# +# +-mpy_option none + +# code_protection --- The ARC EM architecture divides the memory into 16 regions, which can be protected individually. This feature adds a 16-bit input to the processor core, one bit per region. When the protect bit is set, the processor disables any load or store to the corresponding region. An attempt to access a protected region raises an EV_ProtV exception. +-code_protection false + +# stack_checking --- Stack checking is a mechanism for checking stack accesses and raising an exception when a stack overflow or underflow is detected. +-stack_checking true + +# unaligned_option --- This enables unaligned loads and stores. +-unaligned_option true + +# intvbase_preset --- This sets the interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE. +-intvbase_preset 0x0 + +# intvbase_preset_s --- This sets the secure interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE_S.This is effective only when 2+2 mode is enabled. +-intvbase_preset_s 0x0 + +# intvbase_ext --- Set this option to drive the upper 22 bits of the interrupt base vector externally, into signal intvbase_in. +-intvbase_ext false + +# nmi_option --- add Non-maskable external exception support +-nmi_option false + +# rgf_impl --- This defines whether the register file is implemented using flip-flops, or with a hard macro. +-rgf_impl flip_flops + +# rgf_num_regs --- This defines the size (in 32b register) of the processor register file. +-rgf_num_regs 32 + +# rgf_wr_ports --- This defines the number of write ports on the register file. +-rgf_wr_ports 2 + +# rgf_num_banks --- Dual register banks are useful if Fast IRQ has been configured, but may be selected even if not. +-rgf_num_banks 2 + +# rgf_banked_regs --- This selects the number of registers that are replicated in the second register-file bank. +-rgf_banked_regs 32 + +# turbo_boost --- This enables the Turbo Boost synthesis option. By enabling this option, the achievable clock frequency is increased, but at the cost of an additional cycle latency on branch instructions. +-turbo_boost false + +# infer_alu_adder --- infer: datapath is described as behavioral code: A + B +# instantiate: datapath is instantiated as a detailed multi-stage code of a carry-lookahead adder. It is generally preferable to use the infer option and add directives for your target synthesizer. +-infer_alu_adder infer + +# infer_mpy_wtree --- infer: datapath is described as behavioral code: A * B (applies to only wlh3, wlh4 and wlh5 designs) +# instantiate: datapath is instantiated as a detailed multi-stage code of a Wallace Tree multiplier It is generally preferable to use the infer option and add directives for your target synthesizer. +-infer_mpy_wtree instantiate + +# scantest_ram_bypass_mux --- This mux is used to make logic trapped between flops and memory (aka shadow logic) to be covered by scantest without requiring advanced sequential ATPG on the memory to be applied. Will add delay to functional access time +-scantest_ram_bypass_mux false + +# logic_bist --- This option will OR LBIST_EN with test_mode +-logic_bist false + +# power_domains --- Adds three separate power domains to the core, and propagates power-gate control signals to the top level of the core. Also generates UPF constraints and commands in the low-power scripts +-power_domains false + +# dvfs --- Adds logic to the core to allow dynamic controlling of voltage and frequency and propagates the associated control signals to the top level of core +-dvfs false + +# voltage_domains --- Creates a voltage domain split between RAM and std cell parts to support Ultra Low Voltage on cells and generates UPF constraints +-voltage_domains false + +# mem_bus_option --- The core supports two bus protocols for accessing external memory: AHB & AHB-Lite. AHB-Lite-single means instruction fetch and data access share a single AHB-Lite port. AHB-Lite-dual means separate AHB-Lite port for each initiator if present. +-mem_bus_option AHB + +# mem_bus_reg_interface --- Specifies whether the memory bus interface is registered. +-mem_bus_reg_interface true + +# dmi_burst_option --- This will enable high-throughput burst support on the DMI slave interfaces. By enabling this option, the peak DMI read throughput goes from 1 word per 3 cycles to N words per N+2 cycles, in which N is the AHB burst lengthDMI write throughput goes from 1 word per 3 cycles to 1 word per cycle. +-dmi_burst_option true + +# has_dmp_peripheral --- This option enables the redirection of load/store accesses to one segment (1/16) of the addressable space to a dedicated peripheral bus. This offers high system integration and reduces overall system cost. +-has_dmp_peripheral true + +# per0_base --- This option specifies the memory region assignment for this peripheral aperture +-per0_base 15 + +# per0_limit --- This option specifies the end of this peripheral aperture +-per0_limit 0 + +# per_bus_option --- The core supports one bus protocol for accessing the peripheral space, when enabled: AHB-Lite. +-per_bus_option AHB-Lite + +# per_bus_reg_interface --- Specifies whether the peripheral bus interface is registered. +-per_bus_reg_interface true + +# clock_gating --- This enables the insertion of architectural clock gate elements in the design. By enabling this option, the clocks to various parts of the design will be disabled when the logic they drive is not in use to save power. +-clock_gating false + +# back_compat --- This enables the addition of rst_a input in the clkgate module to support backward compatibility with the older EM and Subsystem releases. +-back_compat true + +# byte_parity --- If parity protection on the CCMs or Cache is configured, this option enables parity protection on a per-byte basis. Otherwise, parity is per word basis +-byte_parity false + +# prot_pipelined --- Check the box if CCM memories are configured for ECC, and you want single-bit errors to be corrected, written back to memory, and re-fetched. When unchecked, single bit errors are corrected when read from memory, but the offending memory location itself is not corrected with a writeback, no influence on Cache protection +-prot_pipelined false + +# cct_test_ena --- When ECC is configured, this option enables single bit error injection in CCT RAM models to demonstrate ECC protection on the RAMs. When enabled, the RAM models can only be used in HDL CCT simulation (no xCAM support) and are not intended for use in SoC level integration. +-cct_test_ena false + +# err_prot_ehce --- Enabled enhanced ECC architecture for CCM. Instruction fetch with single bit error is not replayed; ecc cac modules are shared to reduce area and timing opt. +-err_prot_ehce false + + +######## dsp_trig --- com.arc.hardware.dfss.dsp_trig.1_0 ######## + +# Create dsp_trig +-create com.arc.hardware.dfss.dsp_trig.1_0 System.CPUisle.ARCv2EM.dsp_trig + +# dsp_trig --- Command line option for EIA extension component 'dsp_trig'. +-dsp_trig true + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio0 --- com.arc.hardware.dfss.io_gpio0.1_0 ######## + +# Create io_gpio0 +-create com.arc.hardware.dfss.io_gpio0.1_0 System.CPUisle.ARCv2EM.io_gpio0 + +# io_gpio0 --- Command line option for EIA extension component 'io_gpio0'. +-io_gpio0 true + +# io_gpio0_debounce --- Selects the inclusion of Debounce logic +-io_gpio0_debounce 1 + +# io_gpio0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio0_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + +# io_gpio0_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. +-io_gpio0_direction_rst_value 0 + +# io_gpio0_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. +-io_gpio0_output_rst_value 0x0 + + +######## io_i2c_mst0 --- com.arc.hardware.dfss.io_i2c_mst0.1_0 ######## + +# Create io_i2c_mst0 +-create com.arc.hardware.dfss.io_i2c_mst0.1_0 System.CPUisle.ARCv2EM.io_i2c_mst0 + +# io_i2c_mst0 --- Command line option for APEX extension component 'io_i2c_mst0'. +-io_i2c_mst0 true + +# io_i2c_mst0_fs --- RX/TX FIFO size +-io_i2c_mst0_fs 16 + +# io_i2c_mst0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst0_dma_support None + +# io_i2c_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst0_cdc_included 0 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_slv0 --- com.arc.hardware.dfss.io_i2c_slv0.1_0 ######## + +# Create io_i2c_slv0 +-create com.arc.hardware.dfss.io_i2c_slv0.1_0 System.CPUisle.ARCv2EM.io_i2c_slv0 + +# io_i2c_slv0 --- Command line option for APEX extension component 'io_i2c_slv0'. +-io_i2c_slv0 true + +# io_i2c_slv0_fs --- RX/TX FIFO size +-io_i2c_slv0_fs 16 + +# io_i2c_slv0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_slv0_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst0 --- com.arc.hardware.dfss.io_spi_mst0.1_0 ######## + +# Create io_spi_mst0 +-create com.arc.hardware.dfss.io_spi_mst0.1_0 System.CPUisle.ARCv2EM.io_spi_mst0 + +# io_spi_mst0 --- Command line option for APEX extension component 'io_spi_mst0'. +-io_spi_mst0 true + +# io_spi_mst0_fz --- RX/TX FIFO depth +-io_spi_mst0_fs 16 + +# io_spi_mst0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst0_max_xfer_size 16 + +# io_spi_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst0_cdc_included 0 + +# io_spi_mst0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## subsys_bcr --- com.arc.hardware.dfss.subsys_bcr.1_0 ######## + +# Create subsys_bcr +-create com.arc.hardware.dfss.subsys_bcr.1_0 System.CPUisle.ARCv2EM.subsys_bcr + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst1 --- com.arc.hardware.dfss.io_spi_mst1.1_0 ######## + +# Create io_spi_mst1 +-create com.arc.hardware.dfss.io_spi_mst1.1_0 System.CPUisle.ARCv2EM.io_spi_mst1 + +# io_spi_mst1 --- Command line option for APEX extension component 'io_spi_mst1'. +-io_spi_mst1 true + +# io_spi_mst1_fz --- RX/TX FIFO depth +-io_spi_mst1_fs 16 + +# io_spi_mst1_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst1_max_xfer_size 16 + +# io_spi_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst1_cdc_included 0 + +# io_spi_mst1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst1_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst2 --- com.arc.hardware.dfss.io_spi_mst2.1_0 ######## + +# Create io_spi_mst2 +-create com.arc.hardware.dfss.io_spi_mst2.1_0 System.CPUisle.ARCv2EM.io_spi_mst2 + +# io_spi_mst2 --- Command line option for APEX extension component 'io_spi_mst2'. +-io_spi_mst2 true + +# io_spi_mst2_fz --- RX/TX FIFO depth +-io_spi_mst2_fs 16 + +# io_spi_mst2_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst2_max_xfer_size 16 + +# io_spi_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst2_cdc_included 0 + +# io_spi_mst2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst2_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_slv0 --- com.arc.hardware.dfss.io_spi_slv0.1_0 ######## + +# Create io_spi_slv0 +-create com.arc.hardware.dfss.io_spi_slv0.1_0 System.CPUisle.ARCv2EM.io_spi_slv0 + +# io_spi_slv0 --- Command line option for APEX extension component 'io_spi_slv0'. +-io_spi_slv0 true + +# io_spi_slv0_fz --- RX/TX FIFO depth +-io_spi_slv0_fs 16 + +# io_spi_slv0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_slv0_max_xfer_size 16 + +# io_spi_slv0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_slv0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio1 --- com.arc.hardware.dfss.io_gpio1.1_0 ######## + +# Create io_gpio1 +-create com.arc.hardware.dfss.io_gpio1.1_0 System.CPUisle.ARCv2EM.io_gpio1 + +# io_gpio1 --- Command line option for EIA extension component 'io_gpio1'. +-io_gpio1 true + +# io_gpio1_debounce --- Selects the inclusion of Debounce logic +-io_gpio1_debounce 1 + +# io_gpio1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio1_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + +# io_gpio1_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. +-io_gpio1_direction_rst_value 0 + +# io_gpio1_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. +-io_gpio1_output_rst_value 0x0 + + +######## io_gpio2 --- com.arc.hardware.dfss.io_gpio2.1_0 ######## + +# Create io_gpio2 +-create com.arc.hardware.dfss.io_gpio2.1_0 System.CPUisle.ARCv2EM.io_gpio2 + +# io_gpio2 --- Command line option for EIA extension component 'io_gpio2'. +-io_gpio2 true + +# io_gpio2_debounce --- Selects the inclusion of Debounce logic +-io_gpio2_debounce 1 + +# io_gpio2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio2_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + +# io_gpio2_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. +-io_gpio2_direction_rst_value 0 + +# io_gpio2_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. +-io_gpio2_output_rst_value 0x0 + + +######## io_i2c_mst1 --- com.arc.hardware.dfss.io_i2c_mst1.1_0 ######## + +# Create io_i2c_mst1 +-create com.arc.hardware.dfss.io_i2c_mst1.1_0 System.CPUisle.ARCv2EM.io_i2c_mst1 + +# io_i2c_mst1 --- Command line option for APEX extension component 'io_i2c_mst1'. +-io_i2c_mst1 true + +# io_i2c_mst1_fs --- RX/TX FIFO size +-io_i2c_mst1_fs 16 + +# io_i2c_mst1_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst1_dma_support None + +# io_i2c_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst1_cdc_included 0 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_mst2 --- com.arc.hardware.dfss.io_i2c_mst2.1_0 ######## + +# Create io_i2c_mst2 +-create com.arc.hardware.dfss.io_i2c_mst2.1_0 System.CPUisle.ARCv2EM.io_i2c_mst2 + +# io_i2c_mst2 --- Command line option for APEX extension component 'io_i2c_mst2'. +-io_i2c_mst2 true + +# io_i2c_mst2_fs --- RX/TX FIFO size +-io_i2c_mst2_fs 16 + +# io_i2c_mst2_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst2_dma_support None + +# io_i2c_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst2_cdc_included 0 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart0 --- com.arc.hardware.dfss.io_uart0.1_0 ######## + +# Create io_uart0 +-create com.arc.hardware.dfss.io_uart0.1_0 System.CPUisle.ARCv2EM.io_uart0 + +# io_uart0 --- Command line option for EIA extension component 'io_uart0'. +-io_uart0 true + +# io_uart0_fifo_mode --- Set the UART FIFO mode +-io_uart0_fifo_mode 16 + +# io_uart0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart0_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart1 --- com.arc.hardware.dfss.io_uart1.1_0 ######## + +# Create io_uart1 +-create com.arc.hardware.dfss.io_uart1.1_0 System.CPUisle.ARCv2EM.io_uart1 + +# io_uart1 --- Command line option for EIA extension component 'io_uart1'. +-io_uart1 true + +# io_uart1_fifo_mode --- Set the UART FIFO mode +-io_uart1_fifo_mode 16 + +# io_uart1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart1_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart2 --- com.arc.hardware.dfss.io_uart2.1_0 ######## + +# Create io_uart2 +-create com.arc.hardware.dfss.io_uart2.1_0 System.CPUisle.ARCv2EM.io_uart2 + +# io_uart2 --- Command line option for EIA extension component 'io_uart2'. +-io_uart2 true + +# io_uart2_fifo_mode --- Set the UART FIFO mode +-io_uart2_fifo_mode 16 + +# io_uart2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart2_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart3 --- com.arc.hardware.dfss.io_uart3.1_0 ######## + +# Create io_uart3 +-create com.arc.hardware.dfss.io_uart3.1_0 System.CPUisle.ARCv2EM.io_uart3 + +# io_uart3 --- Command line option for EIA extension component 'io_uart3'. +-io_uart3 true + +# io_uart3_fifo_mode --- Set the UART FIFO mode +-io_uart3_fifo_mode 16 + +# io_uart3_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart3_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2s_rx_mst0 --- com.arc.hardware.dfss.io_i2s_rx_mst0.1_0 ######## + +# Create io_i2s_rx_mst0 +-create com.arc.hardware.dfss.io_i2s_rx_mst0.1_0 System.CPUisle.ARCv2EM.io_i2s_rx_mst0 + +# io_i2s_rx_mst0 --- Command line option for APEX extension component 'io_i2s_rx_mst0'. +-io_i2s_rx_mst0 true + +# io_i2s_rx_mst0_fs --- RX FIFO size +-io_i2s_rx_mst0_fs 8 + +# io_i2s_rx_mst0_fw --- RX FIFO width +-io_i2s_rx_mst0_fw 16 + +# io_i2s_rx_mst0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2s_rx_mst0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2s_tx_mst0 --- com.arc.hardware.dfss.io_i2s_tx_mst0.1_0 ######## + +# Create io_i2s_tx_mst0 +-create com.arc.hardware.dfss.io_i2s_tx_mst0.1_0 System.CPUisle.ARCv2EM.io_i2s_tx_mst0 + +# io_i2s_tx_mst0 --- Command line option for APEX extension component 'io_i2s_tx_mst0'. +-io_i2s_tx_mst0 true + +# io_i2s_tx_mst0_fs --- TX FIFO size +-io_i2s_tx_mst0_fs 8 + +# io_i2s_tx_mst0_fw --- TX FIFO width +-io_i2s_tx_mst0_fw 16 + +# io_i2s_tx_mst0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2s_tx_mst0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_pdm_rx0 --- com.arc.hardware.dfss.io_pdm_rx0.1_0 ######## + +# Create io_pdm_rx0 +-create com.arc.hardware.dfss.io_pdm_rx0.1_0 System.CPUisle.ARCv2EM.io_pdm_rx0 + +# io_pdm_rx0 --- Command line option for APEX extension component 'io_pdm_rx0'. +-io_pdm_rx0 true + +# io_pdm_rx0_ch --- Number of Stereo Channels +-io_pdm_rx0_ch 1 + +# io_pdm_rx0_fs --- RX FIFO size +-io_pdm_rx0_fs 16 + +# io_pdm_rx0_ns --- Maximum number of CIC stages +-io_pdm_rx0_ns 4 + +# io_pdm_rx0_ds --- Maximum delay in the COMB filter of the CIC filter +-io_pdm_rx0_ds 2 + +# io_pdm_rx0_dma_support --- Specifies whether the DMA handshake interface is included +-io_pdm_rx0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## DCCM --- com.arc.hardware.DCCM.1_0 ######## + +# Create DCCM +-create com.arc.hardware.DCCM.1_0 System.CPUisle.ARCv2EM.DCCM + +# dccm_size --- This defines the size of the Data Closely Coupled Memory (DCCM) in bytes +-dccm_size 131072 + +# dccm_base --- Sets the initial memory region assignment for DCCM +-dccm_base 8 + +# dccm_interleave --- Split DCCM into even/odd memory banks. +-dccm_interleave false + +# dccm_prot --- Specifies the type of protection built for the DCCM. +-dccm_prot None + +# dccm_prot_level --- Specifies the level protection. +-dccm_prot_level Data_Only + +# dccm_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the DCCM +-dccm_prot_exceptions true + +# dccm_sec_lvl --- Specifies the level of secure DCCM. +-dccm_sec_lvl Non_Secure + +# dccm_dmi --- This enables external access through a DMI (direct memory interface) port. +-dccm_dmi true + + +######## DMA Controller --- com.arc.hardware.DMA_Controller.1_0 ######## + +# Create DMA Controller +-create com.arc.hardware.DMA_Controller.1_0 "System.CPUisle.ARCv2EM.DMA Controller" + +# dmac_channels --- This options specifies the number of DMA channels implemented in the DMA controller +-dmac_channels 16 + +# dmac_fifo_depth --- This option specifies the DMA transfer FIFO depth in 32b words. +-dmac_fifo_depth 2 + +# dmac_int_config --- None: the DMA controller cannot raise an interrupt +# Single-External: single done and single error interrupt signal for all DMA channels, and the interrupt signals are routed to a port at the top of the EM logical hierarchy +# Multiple-External: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are routed to ports at the top of the EM logical hierarchy +# Single-Internal: single done and single error interrupt signals for all DMA channels, and the interrupt signals are internal to the EM core +# Multiple-Internal: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are internal to the EM core +-dmac_int_config Multiple-Internal + +# dmac_separate_error_interrupts --- This specifies whether there is a separate error interrupt per DMA channel, or just one. +-dmac_separate_error_interrupts false + +# dmac_registers --- This option defines the number of DMA channels with their registers located in auxiliary space. +-dmac_registers 0 + +# dmac_mem_if --- This option specifies whether the DMA controller system memory interface is integrated into the existing EM system memory interfaces or has its own interface. +-dmac_mem_if integrated + +# dmac_per_if --- Internal vs DW peripheral interface. Specify (in hex) which channels have the DW interface, where bit 0 corresponds to DMA channel 0, bit 1 for DMA channel 1, etc. +# Example: 4 channel DMA controller where -dmac_per_if is set to 0x9 = DMA Channels 0 and 3 configured with the DW req interface, DMA Channels 1 and 2 configured with the internal req interface. +-dmac_per_if 0x7e00 + + +######## DSP --- com.arc.hardware.DSP.1_0 ######## + +# Create DSP +-create com.arc.hardware.DSP.1_0 System.CPUisle.ARCv2EM.DSP + +# dsp_complex --- Enable/disable support for single cycle 16b+16b complex instructions and butterfly operations, else 2-cycle complex instructions only without butterfly support +-dsp_complex true + +# dsp_itu --- Enable/disable support for ITU bit-accurate 1 bit fractional shift before accumulation, else 1-bit fractional shift result after accumulation only +-dsp_itu true + +# dsp_divsqrt --- Enable/disable support for divide and square root operations: DIV(U), REM(U), SQRT +-dsp_divsqrt radix2 + +# dsp_accshift --- Select support for accumulator shift operations: no supported, limited shift support only or full shift support and convergent rounding +-dsp_accshift full + +# dsp_impl --- The datapath components may be inferred from Verilog for better area or optimized using carry-save components for better timing +-dsp_impl optimized + + +######## Data Cache --- com.arc.hardware.Data_Cache.1_0 ######## + +# Create Data Cache +-create com.arc.hardware.Data_Cache.1_0 "System.CPUisle.ARCv2EM.Data Cache" + +# dc_size --- This defines the total size of the Data Cache in bytes. +-dc_size 16384 + +# dc_ways --- This defines the number of cache ways. +-dc_ways 2 + +# dc_bsize --- This defines the cache line length in bytes. +-dc_bsize 32 + +# dc_feature_level --- Feature Level, indicates locking and debug feature level 00 = Basic cache, with no locking or debug features 01 = Lock and flush features supported 10 = Lock, flush and advanced debug features supported 11 = Reserved +-dc_feature_level 2 + +# dc_uncached_region --- Enable an uncached region defined by aux reg +-dc_uncached_region false + +# dc_prot --- Specifies the type of protection built for DCACHE. +-dc_prot None + +# dc_prot_level --- Specifies the level of protection. +-dc_prot_level Data_Only + +# dc_prot_exceptions --- Builds exception generation hardware for uncorrectable (fatal) errors detected on DCACHE. +-dc_prot_exceptions true + + +######## Debug Interface --- com.arc.hardware.Debug_Interface.1_0 ######## + +# Create Debug Interface +-create com.arc.hardware.Debug_Interface.1_0 "System.CPUisle.ARCv2EM.Debug Interface" + +# dbg_en_option --- Adds an enable pin to the existing debug interface +-dbg_en_option false + +# secure_debug --- This enables secure debug feature +-secure_debug false + +# scdbg_aux_unlk --- An internal demo module will be included when enable +-scdbg_aux_unlk false + +# dbg_apb_option --- Adds an additional APB debug port alongside the BVCI one +-dbg_apb_option false + + +######## ICCM0 --- com.arc.hardware.ICCM0.1_0 ######## + +# Create ICCM0 +-create com.arc.hardware.ICCM0.1_0 System.CPUisle.ARCv2EM.ICCM0 + +# iccm0_size --- This defines the size of ICCM0 in bytes.This ICCM has 0 wait states. +-iccm0_size 131072 + +# iccm0_base --- Sets the initial memory region assignment for ICCM0 +-iccm0_base 6 + +# iccm0_wide --- Creates ICCM0 as 64b memory to reduce accesses. +-iccm0_wide false + +# iccm0_prot --- Specifies the type of protection built for ICCM0. +-iccm0_prot None + +# iccm0_prot_level --- Specifies the level of protection. +-iccm0_prot_level Data_Only + +# iccm0_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the ICCM0 +-iccm0_prot_exceptions true + +# iccm0_sec_lvl --- Specifies the level of secure ICCM0. +-iccm0_sec_lvl Non_Secure + +# iccm0_dmi --- This enables external access through a DMI (direct memory interface) port. +-iccm0_dmi true + + +######## Instruction Cache --- com.arc.hardware.Instruction_Cache.1_0 ######## + +# Create Instruction Cache +-create com.arc.hardware.Instruction_Cache.1_0 "System.CPUisle.ARCv2EM.Instruction Cache" + +# ic_size --- This defines the total size of the instruction cache in bytes. +-ic_size 16384 + +# ic_ways --- This defines the number of cache ways +-ic_ways 2 + +# ic_bsize --- This defines the cache line length in bytes. +-ic_bsize 64 + +# ic_disable_on_reset --- The instruction cache may be enabled immediately after reset, depending on this option. If this option is enabled, the last cache operation is set to failed, and the direct cache-RAM access is enabled. Furthermore, the instruction cache is invalidated all cache lines are invalidated and unlocked, and the tag RAM is cleared. +-ic_disable_on_reset false + +# ic_feature_level --- This defines the feature level of the cache. +-ic_feature_level 1 + +# ic_pwr_opt_level --- This selects power-optimization options in the micro-architecture of the instruction cache. +-ic_pwr_opt_level 0 + +# ic_prot --- Specifies the type of protection built for ICACHE. +-ic_prot None + +# ic_prot_level --- Specifies the level of protection. +-ic_prot_level Data_Only + +# ic_prot_exceptions --- Builds exception generation hardware for uncorrectable (fatal) errors detected on ICACHE. +-ic_prot_exceptions true + + +######## Interrupt Controller --- com.arc.hardware.Interrupt_Controller.1_0 ######## + +# Create Interrupt Controller +-create com.arc.hardware.Interrupt_Controller.1_0 "System.CPUisle.ARCv2EM.Interrupt Controller" + +# number_of_interrupts --- This is the total number of interrupts available to the core. Some interrupts are allocated statically to a specific interrupt line (for example, timer interrupts). For more information on Interrupt and register-file options, see DesignWare ARCv2 ISA Programmers Reference Manual. +-number_of_interrupts 96 + +# number_of_levels --- Priority levels in the interrupt controller. +-number_of_levels 4 + +# external_interrupts --- This is the total number of interrupt pins available for external system components. This parameter must be less than the total number of interrupts. +-external_interrupts 77 + +# firq_option --- This enables the fast-interrupts option, (priority level 0 interrupts), which uses an alternate register bank (if configured) instead of saving the context to memory. +-firq_option true + + +######## JTAG Interface --- com.arc.hardware.JTAG_Interface.1_0 ######## + +# Create JTAG Interface +-create com.arc.hardware.JTAG_Interface.1_0 "System.CPUisle.ARCv2EM.JTAG Interface" + +######## Timer 0 --- com.arc.hardware.Timer_0.1_0 ######## + +# Create Timer 0 +-create com.arc.hardware.Timer_0.1_0 "System.CPUisle.ARCv2EM.Timer 0" + +# timer_0_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 0. +-timer_0_int_level 1 + + +######## Watchdog Timer --- com.arc.hardware.Watchdog_Timer.1_0 ######## + +# Create Watchdog Timer +-create com.arc.hardware.Watchdog_Timer.1_0 "System.CPUisle.ARCv2EM.Watchdog Timer" + +# watchdog_size --- Specifies the bit width of timer's internal counter. +-watchdog_size 32 + +# watchdog_clk --- Specifies whether the timer should be driven from a separate clock. +-watchdog_clk false + + +######## Real-time Counter --- com.arc.hardware.Real_time_Counter.1_0 ######## + +# Create Real-time Counter +-create com.arc.hardware.Real_time_Counter.1_0 "System.CPUisle.ARCv2EM.Real-time Counter" + +######## Performance Monitor --- com.arc.hardware.Performance_Monitor.1_0 ######## + +# Create Performance Monitor +-create com.arc.hardware.Performance_Monitor.1_0 "System.CPUisle.ARCv2EM.Performance Monitor" + +# pct_counters --- Number of counters for performance monitoring. +-pct_counters 8 + + +######## SmaRT --- com.arc.hardware.SmaRT.1_0 ######## + +# Create SmaRT +-create com.arc.hardware.SmaRT.1_0 System.CPUisle.ARCv2EM.SmaRT + +# smart_stack_entries --- This specifies the number of entries in the trace buffer. +-smart_stack_entries 8 + +# smart_implementation --- Flip-flop = FF-based design. Memory = memory-based design (provides better density for larger trace buffers). +-smart_implementation flip-flop + + +######## XY --- com.arc.hardware.XY.1_0 ######## + +# Create XY +-create com.arc.hardware.XY.1_0 System.CPUisle.ARCv2EM.XY + +# xy_config --- XY memory configuration: +# One memory: DCCM only. +# Two memories: DCCM + Y. +# Three memories: DCCM + X + Y. +-xy_config dccm_x_y + +# xy_size --- Size of X and Y memories if included. +# X and Y memories both have the same configured size. +-xy_size 16384 + +# xy_interleave --- Split XY memories into odd/even instances to enable single cycle unaligned access. +-xy_interleave false + +# xy_x_base --- Base region for X memory. All accesses to this region will initiate a transfer on the X memory. +-xy_x_base 9 + +# xy_y_base --- Base region for Y memory. All accesses to this region will initiate a transfer on the Y memory. +-xy_y_base 10 + + +######## AGU --- com.arc.hardware.AGU.1_0 ######## + +# Create AGU +-create com.arc.hardware.AGU.1_0 System.CPUisle.ARCv2EM.AGU + +# agu_size --- Predefined configurations of modifiers, address +# pointers and offset registers +#

+# 
+#         address     address                     
+#         pointers    offset regs      modifiers  
+#        ----------- --------------- ------------ 
+# small:     4           2                 4      
+# medium:    8           4                 12     
+# large:     12          8                 24     
+# 
+# +-agu_size large + +# agu_accord --- Enable the accordion stage if operating frequency is critical +-agu_accord true + +# agu_wb_depth --- Write buffer depth +-agu_wb_depth 4 + + +######## Actionpoints --- com.arc.hardware.Actionpoints.1_0 ######## + +# Create Actionpoints +-create com.arc.hardware.Actionpoints.1_0 System.CPUisle.ARCv2EM.Actionpoints + +# num_actionpoints --- This is the number of trigger events available. +-num_actionpoints 8 + +# aps_feature --- Selects Actionpoint feature set +-aps_feature min + + +######## Bit stream --- com.arc.hardware.Bit_stream.1_0 ######## + +# Create Bit stream +-create com.arc.hardware.Bit_stream.1_0 "System.CPUisle.ARCv2EM.Bit stream" + +######## Floating-point unit --- com.arc.hardware.Floating_point_unit.1_0 ######## + +# Create Floating-point unit +-create com.arc.hardware.Floating_point_unit.1_0 "System.CPUisle.ARCv2EM.Floating-point unit" + +# fpu_dp_assist --- This enables double-precision acceleration instructions. +-fpu_dp_assist true + +# fpu_fma_option --- This enables the fused multiply-add & multiply-subtract instructions. +-fpu_fma_option true + +# fpu_mas_cycles --- Make mul/add/sub multicycle to achieve a higher clock speed. +-fpu_mas_cycles 2 + +# fpu_pipe_impl --- FPU pipelined implementation +-fpu_pipe_impl true + +# fpu_div_option --- This enables divide & square-root acceleration +-fpu_div_option true + +# fpu_div_cycles --- Controls div/sqrt implementation. +-fpu_div_cycles 17 + + +######## Memory Protection Unit --- com.arc.hardware.Memory_Protection_Unit.1_0 ######## + +# Create Memory Protection Unit +-create com.arc.hardware.Memory_Protection_Unit.1_0 "System.CPUisle.ARCv2EM.Memory Protection Unit" + +# mpu_num_regions --- Number of configured memory regions. +-mpu_num_regions 16 + +# mpu_32b --- Set the minimal region size to be 32 byte instead of 2KB. +-mpu_32b false + +# mpu_sid_option --- It will enable SID support in Secure Shield +-mpu_sid_option false + + +######## Real-time trace producer --- com.arc.hardware.Real_time_trace_producer.1_0 ######## + +# Create Real-time trace producer +-create com.arc.hardware.Real_time_trace_producer.1_0 "System.CPUisle.ARCv2EM.Real-time trace producer" + +# rtt_feature_level --- 'small' means that program trace only is available. `medium' adds data trace. `full' adds core and aux register trace. +-rtt_feature_level full + + +######## ARCv2EM CCT --- cct.1_0 ######## + +# Create ARCv2EM CCT +-create cct.1_0 "System.ARCv2EM CCT" + +# cct --- +# Option used to add a CCT to the design for command-line builds +# Without this architect can't add this component to a build +# via a cmdline -create command. +# with old scripts. +# +-cct true + +# no_hostlink --- +# This prevents the inclusion of the hostlink library when compiling +# C or C++ programs. The resultant executable, if it contains printfs, +# will print to an internal fixed buffer __mwwrite_buf. +# Other hostlink operations that require debugger assistance, such as file +# opens, will fail. +# +# Hostlink references incur memory cycles at unpredictable times and +# so can perturb cycle-timing results. Without hostlink, +# the debugger will not in any way interfere with the target while it is running. +# Therefore this option is useful for simulation in which you want precisely the +# same cycle timing to occur each time you run, or for accurate power consumption results. +# +-cct_no_hostlink false + +# has_subsystem_cct_flow --- +# The above option will check for the presence of subsystem component in the build configuration and suitably modifies the Makefile for the sub-system environment. +# +-has_subsystem_cct_flow false + + +######## BusFabric --- com.arc.hardware.ARCv2MSS.BusFabric.1_0 ######## + +# Create BusFabric +-create com.arc.hardware.ARCv2MSS.BusFabric.1_0 System.BusFabric + +######## ClkCtrl --- com.arc.hardware.ARCv2MSS.ClkCtrl.1_0 ######## + +# Create ClkCtrl +-create com.arc.hardware.ARCv2MSS.ClkCtrl.1_0 System.ClkCtrl + +######## DSP Software --- com.arc.software.dfss.sw_dsp.1_0 ######## + +# Create DSP Software +-create com.arc.software.dfss.sw_dsp.1_0 "System.DSP Software" + +# sw_dsp --- Command line option for Software element 'DSP Software' +-sw_dsp true + + +######## EMSDP_BOARD --- com.arc.hardware.ARCv2MSS.EMSDP_BOARD.1_0 ######## + +# Create EMSDP_BOARD +-create com.arc.hardware.ARCv2MSS.EMSDP_BOARD.1_0 System.EMSDP_BOARD + +# emsdp_sys_freq --- Select the core frequency. +-emsdp_sys_freq 40 + + +######## IO Software --- com.arc.software.dfss.sw_io.1_0 ######## + +# Create IO Software +-create com.arc.software.dfss.sw_io.1_0 "System.IO Software" + +# sw_io --- Command line option for Software element 'IO Software' +-sw_io true + + +######## Implementation --- com.arc.hardware.implementation.1_0 ######## + +# Create Implementation +-create com.arc.hardware.implementation.1_0 System.Implementation + +# ClockSpeed --- Target clock speed of the system +-clock_speed 10 + +# DDR2_clk_Ratio --- DDR2 Clock Vs System Clock Ratio +# 2x +# 3x +# 4x +-ddr2_clk_ratio 3x + +# ClockSkew --- The clock skew for the system +-clock_skew 0.2 + +# HoldMargin --- Margin for hold time checks +-hold_margin 0.05 + +# Floorplan --- Floorplan definition for relative placement of RAMs (at CPU-level) or the placement of the rams and CPU hard cores (at multicore level) +-floorplan em4_sensor + +# JTAGFrequency --- Select the frequency of the JTAG clock Tck (in MHz). +# +# The JTAG clock speed has to be less than 1/2 of the cpu clock otherwise the signals on the BVCI interface are not guaranteed to be valid. +# +# NOTE: The RTL simulations will work when the JTAG clock frequency is set to half the CPU clock, however this may not be the case when simulating at gate level due to delays on the IO pads. +# +# The default is set to 10 MHz so that there is no conflict when simulating with an ARCangel3 at 30MHz. (30 > 10*2) +# +# The speed of simulation can be greatly increased by using a faster JTAG clock, but a dependency will warn if it exceeds 1/2 of the cpu clock. +# +-jtag_tclk 4 + +# execution_trace_level --- +# This traces committed instructions as they execute, and gathers statistics +# visible in the debugger for counting instructions & cycle delays. +# At the "stats" level ony the statistics are gathered and no trace is printed. +# "file" is equivalent to "full", but the results go to a trace .txt file instead. +# +-execution_trace_level stats + +# tb_trace --- +# Enable instruction execution trace. +# This is available to arc_dev licensees (internal developers) only. +# +-tb_trace false + +# zero_based_arcnum --- +# In a multicore build, number ARCs from 0. +# If this is not selected, arcs are numbered from 1. +# (This provides the initial value to the arcnum signal.) +# +-zero_based_arcnum true + +# generate_ipxact --- +# Generate ipxact.xml file describing the CPUisle or archipelago frontier +# +-generate_ipxact false + +# ipxact_relative_path_names --- +# Use relative path names for Verilog files in the ipxact. +# Otherwise, absolute path names are used. +# +-ipxact_relative_path_names true + +# optional_encryption --- +# When selected, encrypted RTL output is generated. +# +-optional_encryption false + +# ignore_encrypt_license --- +# When selected, pretend the encryption license is missing. For testing. +# +-ignore_encrypt_license false + +# ignore_clear_license --- +# When selected, pretend the cleartest license is missing. For testing. +# +-ignore_clear_license false + +# OPTION_require_archipelago --- +# When selected, force use of archipelago. This is for testing purposes. +# +-require_archipelago false + + +######## Infrastructure Software --- com.arc.software.dfss.sw_infra.1_0 ######## + +# Create Infrastructure Software +-create com.arc.software.dfss.sw_infra.1_0 "System.Infrastructure Software" + +# sw_infra --- Command line option for Software element 'Infrastructure Software' +-sw_infra true + +# templateName --- Template name +-template_name siss_combo_sensor_dsp + + +######## subsys_infra --- com.arc.hardware.dfss.subsys_infra.1_0 ######## + +# Create subsys_infra +-create com.arc.hardware.dfss.subsys_infra.1_0 System.subsys_infra + +# subsys_infra --- Command line option for EIA glue logic. +-subsys_infra true + +# internal_interrupt --- Connect the IO interrupts internally +-internal_interrupt true + +# internal_dma_handshake --- Connect the DMA handshake signals internally +-internal_dma_handshake true + +# spi_tb_sw_test_mode --- +# This is a secret option, not seen by customers. +# If you check this, the SPI peripheral's testbenches will be set to SW test mode: +# The serial interface of the first SPI master io_spi_mstN peripheral is connected to all SPI slave peripherals io_spi_slvN. +# This is used for testing the SW drivers. +# +-spi_tb_sw_test_mode false + +# i3c_tb_sw_test_mode --- +# This is a secret option, not seen by customers. +# If you check this, the I3C peripheral's testbenches will be set to SW test mode: +# The serial interface of the io_i3cN peripheral is connected to the I2C slave peripherals io_i2c_slv0. +# This is used for testing the SW drivers. +# +-i3c_tb_sw_test_mode false + +# subsys_apex_offset --- Subsystem APEX address offset in the AUX address space. The aperture used by the subsystem is fixed to 0x0010_0000. In general, the APEX address offset must be in the range from 0x0010_0000 to 0xFFF0_0000. However, if your design includes the "UAUX Interface" component, then the APEX address offset must be in the range from 0x0010_0000 to 0x7FF0_0000 to avoid address conflicts with any UAUX components. +-subsys_apex_offset 0x8000_0000 + +# subsys_uaux_offset --- Subsystem UAUX address offset in the UAUX address space. The UAUX address offset must be an integer multiple of 0x0010_0000 in the range from 0x0000_0000 to 0x7FF0_0000. The aperture reserved for the subsystem is fixed to 0x0010_0000. +-subsys_uaux_offset 0x10_0000 + + +######## ARC_RTT --- com.arc.hardware.ARC_RTT.1_0 ######## + +# Create ARC_RTT +-create com.arc.hardware.ARC_RTT.1_0 System.ARC_RTT + +# has_nexus_if --- Please select Nexus interface to offload the data from RTT +-has_nexus_if true + +# has_on_chip_mem --- Please select the on-chip memory option to store the trace data in shared memory +-has_on_chip_mem true + +# nexus_data_wdt --- Please select the Nexus Data Width to offload the data from RTT +-nexus_data_wdt 16 + +# internal_memory_size --- Please select internal memory size to capture the trace data +-internal_memory_size 16k + +# ram_type --- Please select Types of internal memories to be inferred for the logic +-ram_type 1_PORT + +# power_domains --- Adds isolation signal inputs/power switch controls for use in UPF flow when configuring power domains. +-rtt_power_domains false + + +######## Tool Configuration --- cgen.1_0 ######## + +# Create Tool Configuration +-create cgen.1_0 "System.Tool Configuration" + +# mwdt_version --- Selects the MetaWare version to be used with the TCF file. +# Change from the default to an older or newer toolset version if you want the TCF file to be used with an older or newer version of the MetaWare tools. +-mwdt_version O-2018.09 + +# code_base_addr --- +# The base address to assign to the executable code segment in the linker command file when there is no ICCM in the build. This value is ignored when there is an ICCM. +# +-code_base_addr 0x0 + +# data_base_addr --- +# The base address to assign to the data segment in the linker command file when the data is not being mapped to a DCCM. This value is ignored when the data segment is mapped to a DCCM, as in that case the base address of the DCCM memory is used. +# +# A value of 0xffffffff means that the data segment will not be mapped to any specific address. +# +-data_base_addr 0xffff_ffff + +# underscores_in_numbers --- Use underscores in hex numbers to improve readability. +-underscores_in_numbers false + +# tcf_rebrand --- Alternate branding of TCF (not used) +-rebrand false + + +]]>
+
+ + + + + + + + + + + + + + ICCM0 + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > SYSTEM2 + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + GROUP BLOCK(4): { + .Ydata? : {} + } > YCCM + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT + } + +]]> + + + + + + 0x07, sub_opcode => 0x1E , latency_cycles => 8) + +// User extension instruction - dsp_sin +extern long dsp_sin(long); +#pragma intrinsic(dsp_sin, opcode => 0x07, sub_opcode => 0x1F , latency_cycles => 8) + +// User extension instruction - dsp_tan +extern long dsp_tan(long); +#pragma intrinsic(dsp_tan, opcode => 0x07, sub_opcode => 0x22 , latency_cycles => 11) + +// User extension instruction - dsp_acos +extern long dsp_acos(long); +#pragma intrinsic(dsp_acos, opcode => 0x07, sub_opcode => 0x23 , latency_cycles => 31) + +// User extension instruction - dsp_asin +extern long dsp_asin(long); +#pragma intrinsic(dsp_asin, opcode => 0x07, sub_opcode => 0x24 , latency_cycles => 31) + +// User extension instruction - dsp_atan +extern long dsp_atan(long); +#pragma intrinsic(dsp_atan, opcode => 0x07, sub_opcode => 0x25 , latency_cycles => 13) + +// User extension instruction - dsp_sqrt +extern long dsp_sqrt(long); +#pragma intrinsic(dsp_sqrt, opcode => 0x07, sub_opcode => 0x20 , latency_cycles => 31) + +// User extension instruction - dsp_sqrt15 +extern long dsp_sqrt15(long); +#pragma intrinsic(dsp_sqrt15, opcode => 0x07, sub_opcode => 0x21 , latency_cycles => 15) + +#define APEX_COM_ARC_HARDWARE_DFSS_DSP_TRIG_PRESENT 1 +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO0_PRESENT 1 + +// User extension aux register io_gpio0_debounce +#define AR_IO_GPIO0_DEBOUNCE 0x80017048 +#pragma Aux_register(0x80017048, name=>"io_gpio0_debounce") + +// User extension aux register io_gpio0_clken +#define AR_IO_GPIO0_CLKEN 0x80017080 +#pragma Aux_register(0x80017080, name=>"io_gpio0_clken") + +// User extension aux register io_gpio0_swporta_dr +#define AR_IO_GPIO0_SWPORTA_DR 0x80017000 +#pragma Aux_register(0x80017000, name=>"io_gpio0_swporta_dr") + +// User extension aux register io_gpio0_swporta_ddr +#define AR_IO_GPIO0_SWPORTA_DDR 0x80017004 +#pragma Aux_register(0x80017004, name=>"io_gpio0_swporta_ddr") + +// User extension aux register io_gpio0_inten +#define AR_IO_GPIO0_INTEN 0x80017030 +#pragma Aux_register(0x80017030, name=>"io_gpio0_inten") + +// User extension aux register io_gpio0_intmask +#define AR_IO_GPIO0_INTMASK 0x80017034 +#pragma Aux_register(0x80017034, name=>"io_gpio0_intmask") + +// User extension aux register io_gpio0_inttype_level +#define AR_IO_GPIO0_INTTYPE_LEVEL 0x80017038 +#pragma Aux_register(0x80017038, name=>"io_gpio0_inttype_level") + +// User extension aux register io_gpio0_int_polarity +#define AR_IO_GPIO0_INT_POLARITY 0x8001703c +#pragma Aux_register(0x8001703c, name=>"io_gpio0_int_polarity") + +// User extension aux register io_gpio0_intstatus +#define AR_IO_GPIO0_INTSTATUS 0x80017040 +#pragma Aux_register(0x80017040, name=>"io_gpio0_intstatus") + +// User extension aux register io_gpio0_raw_intstatus +#define AR_IO_GPIO0_RAW_INTSTATUS 0x80017044 +#pragma Aux_register(0x80017044, name=>"io_gpio0_raw_intstatus") + +// User extension aux register io_gpio0_porta_eoi +#define AR_IO_GPIO0_PORTA_EOI 0x8001704c +#pragma Aux_register(0x8001704c, name=>"io_gpio0_porta_eoi") + +// User extension aux register io_gpio0_ext_porta +#define AR_IO_GPIO0_EXT_PORTA 0x80017050 +#pragma Aux_register(0x80017050, name=>"io_gpio0_ext_porta") + +// User extension aux register io_gpio0_ls_sync +#define AR_IO_GPIO0_LS_SYNC 0x80017060 +#pragma Aux_register(0x80017060, name=>"io_gpio0_ls_sync") + +// User extension aux register io_gpio0_int_bothedge +#define AR_IO_GPIO0_INT_BOTHEDGE 0x80017068 +#pragma Aux_register(0x80017068, name=>"io_gpio0_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST0_PRESENT 1 + +// User extension aux register io_i2c_mst0_clken +#define AR_IO_I2C_MST0_CLKEN 0x800120c0 +#pragma Aux_register(0x800120c0, name=>"io_i2c_mst0_clken") + +// User extension aux register io_i2c_mst0_con +#define AR_IO_I2C_MST0_CON 0x80012000 +#pragma Aux_register(0x80012000, name=>"io_i2c_mst0_con") + +// User extension aux register io_i2c_mst0_tar +#define AR_IO_I2C_MST0_TAR 0x80012004 +#pragma Aux_register(0x80012004, name=>"io_i2c_mst0_tar") + +// User extension aux register io_i2c_mst0_data_cmd +#define AR_IO_I2C_MST0_DATA_CMD 0x80012010 +#pragma Aux_register(0x80012010, name=>"io_i2c_mst0_data_cmd") + +// User extension aux register io_i2c_mst0_ss_scl_hcnt +#define AR_IO_I2C_MST0_SS_SCL_HCNT 0x80012014 +#pragma Aux_register(0x80012014, name=>"io_i2c_mst0_ss_scl_hcnt") + +// User extension aux register io_i2c_mst0_ss_scl_lcnt +#define AR_IO_I2C_MST0_SS_SCL_LCNT 0x80012018 +#pragma Aux_register(0x80012018, name=>"io_i2c_mst0_ss_scl_lcnt") + +// User extension aux register io_i2c_mst0_fs_scl_hcnt +#define AR_IO_I2C_MST0_FS_SCL_HCNT 0x8001201c +#pragma Aux_register(0x8001201c, name=>"io_i2c_mst0_fs_scl_hcnt") + +// User extension aux register io_i2c_mst0_fs_scl_lcnt +#define AR_IO_I2C_MST0_FS_SCL_LCNT 0x80012020 +#pragma Aux_register(0x80012020, name=>"io_i2c_mst0_fs_scl_lcnt") + +// User extension aux register io_i2c_mst0_intr_stat +#define AR_IO_I2C_MST0_INTR_STAT 0x8001202c +#pragma Aux_register(0x8001202c, name=>"io_i2c_mst0_intr_stat") + +// User extension aux register io_i2c_mst0_intr_mask +#define AR_IO_I2C_MST0_INTR_MASK 0x80012030 +#pragma Aux_register(0x80012030, name=>"io_i2c_mst0_intr_mask") + +// User extension aux register io_i2c_mst0_raw_intr_stat +#define AR_IO_I2C_MST0_RAW_INTR_STAT 0x80012034 +#pragma Aux_register(0x80012034, name=>"io_i2c_mst0_raw_intr_stat") + +// User extension aux register io_i2c_mst0_rx_tl +#define AR_IO_I2C_MST0_RX_TL 0x80012038 +#pragma Aux_register(0x80012038, name=>"io_i2c_mst0_rx_tl") + +// User extension aux register io_i2c_mst0_tx_tl +#define AR_IO_I2C_MST0_TX_TL 0x8001203c +#pragma Aux_register(0x8001203c, name=>"io_i2c_mst0_tx_tl") + +// User extension aux register io_i2c_mst0_clr_intr +#define AR_IO_I2C_MST0_CLR_INTR 0x80012040 +#pragma Aux_register(0x80012040, name=>"io_i2c_mst0_clr_intr") + +// User extension aux register io_i2c_mst0_clr_rx_under +#define AR_IO_I2C_MST0_CLR_RX_UNDER 0x80012044 +#pragma Aux_register(0x80012044, name=>"io_i2c_mst0_clr_rx_under") + +// User extension aux register io_i2c_mst0_clr_rx_over +#define AR_IO_I2C_MST0_CLR_RX_OVER 0x80012048 +#pragma Aux_register(0x80012048, name=>"io_i2c_mst0_clr_rx_over") + +// User extension aux register io_i2c_mst0_clr_tx_over +#define AR_IO_I2C_MST0_CLR_TX_OVER 0x8001204c +#pragma Aux_register(0x8001204c, name=>"io_i2c_mst0_clr_tx_over") + +// User extension aux register io_i2c_mst0_clr_tx_abrt +#define AR_IO_I2C_MST0_CLR_TX_ABRT 0x80012054 +#pragma Aux_register(0x80012054, name=>"io_i2c_mst0_clr_tx_abrt") + +// User extension aux register io_i2c_mst0_clr_activity +#define AR_IO_I2C_MST0_CLR_ACTIVITY 0x8001205c +#pragma Aux_register(0x8001205c, name=>"io_i2c_mst0_clr_activity") + +// User extension aux register io_i2c_mst0_clr_stop_det +#define AR_IO_I2C_MST0_CLR_STOP_DET 0x80012060 +#pragma Aux_register(0x80012060, name=>"io_i2c_mst0_clr_stop_det") + +// User extension aux register io_i2c_mst0_clr_start_det +#define AR_IO_I2C_MST0_CLR_START_DET 0x80012064 +#pragma Aux_register(0x80012064, name=>"io_i2c_mst0_clr_start_det") + +// User extension aux register io_i2c_mst0_enable +#define AR_IO_I2C_MST0_ENABLE 0x8001206c +#pragma Aux_register(0x8001206c, name=>"io_i2c_mst0_enable") + +// User extension aux register io_i2c_mst0_status +#define AR_IO_I2C_MST0_STATUS 0x80012070 +#pragma Aux_register(0x80012070, name=>"io_i2c_mst0_status") + +// User extension aux register io_i2c_mst0_txflr +#define AR_IO_I2C_MST0_TXFLR 0x80012074 +#pragma Aux_register(0x80012074, name=>"io_i2c_mst0_txflr") + +// User extension aux register io_i2c_mst0_rxflr +#define AR_IO_I2C_MST0_RXFLR 0x80012078 +#pragma Aux_register(0x80012078, name=>"io_i2c_mst0_rxflr") + +// User extension aux register io_i2c_mst0_sda_hold +#define AR_IO_I2C_MST0_SDA_HOLD 0x8001207c +#pragma Aux_register(0x8001207c, name=>"io_i2c_mst0_sda_hold") + +// User extension aux register io_i2c_mst0_tx_abrt_source +#define AR_IO_I2C_MST0_TX_ABRT_SOURCE 0x80012080 +#pragma Aux_register(0x80012080, name=>"io_i2c_mst0_tx_abrt_source") + +// User extension aux register io_i2c_mst0_enable_status +#define AR_IO_I2C_MST0_ENABLE_STATUS 0x8001209c +#pragma Aux_register(0x8001209c, name=>"io_i2c_mst0_enable_status") + +// User extension aux register io_i2c_mst0_fs_spklen +#define AR_IO_I2C_MST0_FS_SPKLEN 0x800120a0 +#pragma Aux_register(0x800120a0, name=>"io_i2c_mst0_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_SLV0_PRESENT 1 + +// User extension aux register io_i2c_slv0_clken +#define AR_IO_I2C_SLV0_CLKEN 0x800130c0 +#pragma Aux_register(0x800130c0, name=>"io_i2c_slv0_clken") + +// User extension aux register io_i2c_slv0_con +#define AR_IO_I2C_SLV0_CON 0x80013000 +#pragma Aux_register(0x80013000, name=>"io_i2c_slv0_con") + +// User extension aux register io_i2c_slv0_sar +#define AR_IO_I2C_SLV0_SAR 0x80013008 +#pragma Aux_register(0x80013008, name=>"io_i2c_slv0_sar") + +// User extension aux register io_i2c_slv0_data_cmd +#define AR_IO_I2C_SLV0_DATA_CMD 0x80013010 +#pragma Aux_register(0x80013010, name=>"io_i2c_slv0_data_cmd") + +// User extension aux register io_i2c_slv0_intr_stat +#define AR_IO_I2C_SLV0_INTR_STAT 0x8001302c +#pragma Aux_register(0x8001302c, name=>"io_i2c_slv0_intr_stat") + +// User extension aux register io_i2c_slv0_intr_mask +#define AR_IO_I2C_SLV0_INTR_MASK 0x80013030 +#pragma Aux_register(0x80013030, name=>"io_i2c_slv0_intr_mask") + +// User extension aux register io_i2c_slv0_raw_intr_stat +#define AR_IO_I2C_SLV0_RAW_INTR_STAT 0x80013034 +#pragma Aux_register(0x80013034, name=>"io_i2c_slv0_raw_intr_stat") + +// User extension aux register io_i2c_slv0_rx_tl +#define AR_IO_I2C_SLV0_RX_TL 0x80013038 +#pragma Aux_register(0x80013038, name=>"io_i2c_slv0_rx_tl") + +// User extension aux register io_i2c_slv0_tx_tl +#define AR_IO_I2C_SLV0_TX_TL 0x8001303c +#pragma Aux_register(0x8001303c, name=>"io_i2c_slv0_tx_tl") + +// User extension aux register io_i2c_slv0_clr_intr +#define AR_IO_I2C_SLV0_CLR_INTR 0x80013040 +#pragma Aux_register(0x80013040, name=>"io_i2c_slv0_clr_intr") + +// User extension aux register io_i2c_slv0_clr_rx_under +#define AR_IO_I2C_SLV0_CLR_RX_UNDER 0x80013044 +#pragma Aux_register(0x80013044, name=>"io_i2c_slv0_clr_rx_under") + +// User extension aux register io_i2c_slv0_clr_rx_over +#define AR_IO_I2C_SLV0_CLR_RX_OVER 0x80013048 +#pragma Aux_register(0x80013048, name=>"io_i2c_slv0_clr_rx_over") + +// User extension aux register io_i2c_slv0_clr_tx_over +#define AR_IO_I2C_SLV0_CLR_TX_OVER 0x8001304c +#pragma Aux_register(0x8001304c, name=>"io_i2c_slv0_clr_tx_over") + +// User extension aux register io_i2c_slv0_clr_rd_req +#define AR_IO_I2C_SLV0_CLR_RD_REQ 0x80013050 +#pragma Aux_register(0x80013050, name=>"io_i2c_slv0_clr_rd_req") + +// User extension aux register io_i2c_slv0_clr_tx_abrt +#define AR_IO_I2C_SLV0_CLR_TX_ABRT 0x80013054 +#pragma Aux_register(0x80013054, name=>"io_i2c_slv0_clr_tx_abrt") + +// User extension aux register io_i2c_slv0_clr_rx_done +#define AR_IO_I2C_SLV0_CLR_RX_DONE 0x80013058 +#pragma Aux_register(0x80013058, name=>"io_i2c_slv0_clr_rx_done") + +// User extension aux register io_i2c_slv0_clr_activity +#define AR_IO_I2C_SLV0_CLR_ACTIVITY 0x8001305c +#pragma Aux_register(0x8001305c, name=>"io_i2c_slv0_clr_activity") + +// User extension aux register io_i2c_slv0_clr_stop_det +#define AR_IO_I2C_SLV0_CLR_STOP_DET 0x80013060 +#pragma Aux_register(0x80013060, name=>"io_i2c_slv0_clr_stop_det") + +// User extension aux register io_i2c_slv0_clr_start_det +#define AR_IO_I2C_SLV0_CLR_START_DET 0x80013064 +#pragma Aux_register(0x80013064, name=>"io_i2c_slv0_clr_start_det") + +// User extension aux register io_i2c_slv0_enable +#define AR_IO_I2C_SLV0_ENABLE 0x8001306c +#pragma Aux_register(0x8001306c, name=>"io_i2c_slv0_enable") + +// User extension aux register io_i2c_slv0_status +#define AR_IO_I2C_SLV0_STATUS 0x80013070 +#pragma Aux_register(0x80013070, name=>"io_i2c_slv0_status") + +// User extension aux register io_i2c_slv0_txflr +#define AR_IO_I2C_SLV0_TXFLR 0x80013074 +#pragma Aux_register(0x80013074, name=>"io_i2c_slv0_txflr") + +// User extension aux register io_i2c_slv0_rxflr +#define AR_IO_I2C_SLV0_RXFLR 0x80013078 +#pragma Aux_register(0x80013078, name=>"io_i2c_slv0_rxflr") + +// User extension aux register io_i2c_slv0_sda_hold +#define AR_IO_I2C_SLV0_SDA_HOLD 0x8001307c +#pragma Aux_register(0x8001307c, name=>"io_i2c_slv0_sda_hold") + +// User extension aux register io_i2c_slv0_tx_abrt_source +#define AR_IO_I2C_SLV0_TX_ABRT_SOURCE 0x80013080 +#pragma Aux_register(0x80013080, name=>"io_i2c_slv0_tx_abrt_source") + +// User extension aux register io_i2c_slv0_sda_setup +#define AR_IO_I2C_SLV0_SDA_SETUP 0x80013094 +#pragma Aux_register(0x80013094, name=>"io_i2c_slv0_sda_setup") + +// User extension aux register io_i2c_slv0_enable_status +#define AR_IO_I2C_SLV0_ENABLE_STATUS 0x8001309c +#pragma Aux_register(0x8001309c, name=>"io_i2c_slv0_enable_status") + +// User extension aux register io_i2c_slv0_fs_spklen +#define AR_IO_I2C_SLV0_FS_SPKLEN 0x800130a0 +#pragma Aux_register(0x800130a0, name=>"io_i2c_slv0_fs_spklen") + +// User extension aux register io_i2c_slv0_clr_restart_det +#define AR_IO_I2C_SLV0_CLR_RESTART_DET 0x800130a8 +#pragma Aux_register(0x800130a8, name=>"io_i2c_slv0_clr_restart_det") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST0_PRESENT 1 + +// User extension aux register io_spi_mst0_ctrlr0 +#define AR_IO_SPI_MST0_CTRLR0 0x80010000 +#pragma Aux_register(0x80010000, name=>"io_spi_mst0_ctrlr0") + +// User extension aux register io_spi_mst0_ctrlr1 +#define AR_IO_SPI_MST0_CTRLR1 0x80010001 +#pragma Aux_register(0x80010001, name=>"io_spi_mst0_ctrlr1") + +// User extension aux register io_spi_mst0_spien +#define AR_IO_SPI_MST0_SPIEN 0x80010002 +#pragma Aux_register(0x80010002, name=>"io_spi_mst0_spien") + +// User extension aux register io_spi_mst0_ser +#define AR_IO_SPI_MST0_SER 0x80010004 +#pragma Aux_register(0x80010004, name=>"io_spi_mst0_ser") + +// User extension aux register io_spi_mst0_baudr +#define AR_IO_SPI_MST0_BAUDR 0x80010005 +#pragma Aux_register(0x80010005, name=>"io_spi_mst0_baudr") + +// User extension aux register io_spi_mst0_txftlr +#define AR_IO_SPI_MST0_TXFTLR 0x80010006 +#pragma Aux_register(0x80010006, name=>"io_spi_mst0_txftlr") + +// User extension aux register io_spi_mst0_rxftlr +#define AR_IO_SPI_MST0_RXFTLR 0x80010007 +#pragma Aux_register(0x80010007, name=>"io_spi_mst0_rxftlr") + +// User extension aux register io_spi_mst0_txflr +#define AR_IO_SPI_MST0_TXFLR 0x80010008 +#pragma Aux_register(0x80010008, name=>"io_spi_mst0_txflr") + +// User extension aux register io_spi_mst0_rxflr +#define AR_IO_SPI_MST0_RXFLR 0x80010009 +#pragma Aux_register(0x80010009, name=>"io_spi_mst0_rxflr") + +// User extension aux register io_spi_mst0_sr +#define AR_IO_SPI_MST0_SR 0x8001000a +#pragma Aux_register(0x8001000a, name=>"io_spi_mst0_sr") + +// User extension aux register io_spi_mst0_imr +#define AR_IO_SPI_MST0_IMR 0x8001000b +#pragma Aux_register(0x8001000b, name=>"io_spi_mst0_imr") + +// User extension aux register io_spi_mst0_isr +#define AR_IO_SPI_MST0_ISR 0x8001000c +#pragma Aux_register(0x8001000c, name=>"io_spi_mst0_isr") + +// User extension aux register io_spi_mst0_risr +#define AR_IO_SPI_MST0_RISR 0x8001000d +#pragma Aux_register(0x8001000d, name=>"io_spi_mst0_risr") + +// User extension aux register io_spi_mst0_txoicr +#define AR_IO_SPI_MST0_TXOICR 0x8001000e +#pragma Aux_register(0x8001000e, name=>"io_spi_mst0_txoicr") + +// User extension aux register io_spi_mst0_rxoicr +#define AR_IO_SPI_MST0_RXOICR 0x8001000f +#pragma Aux_register(0x8001000f, name=>"io_spi_mst0_rxoicr") + +// User extension aux register io_spi_mst0_rxuicr +#define AR_IO_SPI_MST0_RXUICR 0x80010010 +#pragma Aux_register(0x80010010, name=>"io_spi_mst0_rxuicr") + +// User extension aux register io_spi_mst0_icr +#define AR_IO_SPI_MST0_ICR 0x80010012 +#pragma Aux_register(0x80010012, name=>"io_spi_mst0_icr") + +// User extension aux register io_spi_mst0_clken +#define AR_IO_SPI_MST0_CLKEN 0x80010016 +#pragma Aux_register(0x80010016, name=>"io_spi_mst0_clken") + +// User extension aux register io_spi_mst0_dr +#define AR_IO_SPI_MST0_DR 0x80010018 +#pragma Aux_register(0x80010018, name=>"io_spi_mst0_dr") + +// User extension aux register io_spi_mst0_rx_sample_dly +#define AR_IO_SPI_MST0_RX_SAMPLE_DLY 0x8001003c +#pragma Aux_register(0x8001003c, name=>"io_spi_mst0_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_SUBSYS_BCR_PRESENT 1 + +// User extension aux register SUBSYS_BUILD +#define AR_SUBSYS_BUILD 0xf0 +#pragma Aux_register(0xf0, name=>"SUBSYS_BUILD") + +// User extension aux register SUBSYS_DSP_0_BUILD +#define AR_SUBSYS_DSP_0_BUILD 0xa00 +#pragma Aux_register(0xa00, name=>"SUBSYS_DSP_0_BUILD") + +// User extension aux register SUBSYS_DSP_0_CONFIG +#define AR_SUBSYS_DSP_0_CONFIG 0xa02 +#pragma Aux_register(0xa02, name=>"SUBSYS_DSP_0_CONFIG") + +// User extension aux register SUBSYS_IO_0_BUILD +#define AR_SUBSYS_IO_0_BUILD 0xa04 +#pragma Aux_register(0xa04, name=>"SUBSYS_IO_0_BUILD") + +// User extension aux register SUBSYS_IO_1_BUILD +#define AR_SUBSYS_IO_1_BUILD 0xa05 +#pragma Aux_register(0xa05, name=>"SUBSYS_IO_1_BUILD") + +// User extension aux register SUBSYS_IO_2_BUILD +#define AR_SUBSYS_IO_2_BUILD 0xa06 +#pragma Aux_register(0xa06, name=>"SUBSYS_IO_2_BUILD") + +// User extension aux register SUBSYS_UAUX_OFFSET +#define AR_SUBSYS_UAUX_OFFSET 0xa1e +#pragma Aux_register(0xa1e, name=>"SUBSYS_UAUX_OFFSET") + +// User extension aux register SUBSYS_APEX_OFFSET +#define AR_SUBSYS_APEX_OFFSET 0xa1f +#pragma Aux_register(0xa1f, name=>"SUBSYS_APEX_OFFSET") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST1_PRESENT 1 + +// User extension aux register io_spi_mst1_ctrlr0 +#define AR_IO_SPI_MST1_CTRLR0 0x80010100 +#pragma Aux_register(0x80010100, name=>"io_spi_mst1_ctrlr0") + +// User extension aux register io_spi_mst1_ctrlr1 +#define AR_IO_SPI_MST1_CTRLR1 0x80010101 +#pragma Aux_register(0x80010101, name=>"io_spi_mst1_ctrlr1") + +// User extension aux register io_spi_mst1_spien +#define AR_IO_SPI_MST1_SPIEN 0x80010102 +#pragma Aux_register(0x80010102, name=>"io_spi_mst1_spien") + +// User extension aux register io_spi_mst1_ser +#define AR_IO_SPI_MST1_SER 0x80010104 +#pragma Aux_register(0x80010104, name=>"io_spi_mst1_ser") + +// User extension aux register io_spi_mst1_baudr +#define AR_IO_SPI_MST1_BAUDR 0x80010105 +#pragma Aux_register(0x80010105, name=>"io_spi_mst1_baudr") + +// User extension aux register io_spi_mst1_txftlr +#define AR_IO_SPI_MST1_TXFTLR 0x80010106 +#pragma Aux_register(0x80010106, name=>"io_spi_mst1_txftlr") + +// User extension aux register io_spi_mst1_rxftlr +#define AR_IO_SPI_MST1_RXFTLR 0x80010107 +#pragma Aux_register(0x80010107, name=>"io_spi_mst1_rxftlr") + +// User extension aux register io_spi_mst1_txflr +#define AR_IO_SPI_MST1_TXFLR 0x80010108 +#pragma Aux_register(0x80010108, name=>"io_spi_mst1_txflr") + +// User extension aux register io_spi_mst1_rxflr +#define AR_IO_SPI_MST1_RXFLR 0x80010109 +#pragma Aux_register(0x80010109, name=>"io_spi_mst1_rxflr") + +// User extension aux register io_spi_mst1_sr +#define AR_IO_SPI_MST1_SR 0x8001010a +#pragma Aux_register(0x8001010a, name=>"io_spi_mst1_sr") + +// User extension aux register io_spi_mst1_imr +#define AR_IO_SPI_MST1_IMR 0x8001010b +#pragma Aux_register(0x8001010b, name=>"io_spi_mst1_imr") + +// User extension aux register io_spi_mst1_isr +#define AR_IO_SPI_MST1_ISR 0x8001010c +#pragma Aux_register(0x8001010c, name=>"io_spi_mst1_isr") + +// User extension aux register io_spi_mst1_risr +#define AR_IO_SPI_MST1_RISR 0x8001010d +#pragma Aux_register(0x8001010d, name=>"io_spi_mst1_risr") + +// User extension aux register io_spi_mst1_txoicr +#define AR_IO_SPI_MST1_TXOICR 0x8001010e +#pragma Aux_register(0x8001010e, name=>"io_spi_mst1_txoicr") + +// User extension aux register io_spi_mst1_rxoicr +#define AR_IO_SPI_MST1_RXOICR 0x8001010f +#pragma Aux_register(0x8001010f, name=>"io_spi_mst1_rxoicr") + +// User extension aux register io_spi_mst1_rxuicr +#define AR_IO_SPI_MST1_RXUICR 0x80010110 +#pragma Aux_register(0x80010110, name=>"io_spi_mst1_rxuicr") + +// User extension aux register io_spi_mst1_icr +#define AR_IO_SPI_MST1_ICR 0x80010112 +#pragma Aux_register(0x80010112, name=>"io_spi_mst1_icr") + +// User extension aux register io_spi_mst1_clken +#define AR_IO_SPI_MST1_CLKEN 0x80010116 +#pragma Aux_register(0x80010116, name=>"io_spi_mst1_clken") + +// User extension aux register io_spi_mst1_dr +#define AR_IO_SPI_MST1_DR 0x80010118 +#pragma Aux_register(0x80010118, name=>"io_spi_mst1_dr") + +// User extension aux register io_spi_mst1_rx_sample_dly +#define AR_IO_SPI_MST1_RX_SAMPLE_DLY 0x8001013c +#pragma Aux_register(0x8001013c, name=>"io_spi_mst1_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST2_PRESENT 1 + +// User extension aux register io_spi_mst2_ctrlr0 +#define AR_IO_SPI_MST2_CTRLR0 0x80010200 +#pragma Aux_register(0x80010200, name=>"io_spi_mst2_ctrlr0") + +// User extension aux register io_spi_mst2_ctrlr1 +#define AR_IO_SPI_MST2_CTRLR1 0x80010201 +#pragma Aux_register(0x80010201, name=>"io_spi_mst2_ctrlr1") + +// User extension aux register io_spi_mst2_spien +#define AR_IO_SPI_MST2_SPIEN 0x80010202 +#pragma Aux_register(0x80010202, name=>"io_spi_mst2_spien") + +// User extension aux register io_spi_mst2_ser +#define AR_IO_SPI_MST2_SER 0x80010204 +#pragma Aux_register(0x80010204, name=>"io_spi_mst2_ser") + +// User extension aux register io_spi_mst2_baudr +#define AR_IO_SPI_MST2_BAUDR 0x80010205 +#pragma Aux_register(0x80010205, name=>"io_spi_mst2_baudr") + +// User extension aux register io_spi_mst2_txftlr +#define AR_IO_SPI_MST2_TXFTLR 0x80010206 +#pragma Aux_register(0x80010206, name=>"io_spi_mst2_txftlr") + +// User extension aux register io_spi_mst2_rxftlr +#define AR_IO_SPI_MST2_RXFTLR 0x80010207 +#pragma Aux_register(0x80010207, name=>"io_spi_mst2_rxftlr") + +// User extension aux register io_spi_mst2_txflr +#define AR_IO_SPI_MST2_TXFLR 0x80010208 +#pragma Aux_register(0x80010208, name=>"io_spi_mst2_txflr") + +// User extension aux register io_spi_mst2_rxflr +#define AR_IO_SPI_MST2_RXFLR 0x80010209 +#pragma Aux_register(0x80010209, name=>"io_spi_mst2_rxflr") + +// User extension aux register io_spi_mst2_sr +#define AR_IO_SPI_MST2_SR 0x8001020a +#pragma Aux_register(0x8001020a, name=>"io_spi_mst2_sr") + +// User extension aux register io_spi_mst2_imr +#define AR_IO_SPI_MST2_IMR 0x8001020b +#pragma Aux_register(0x8001020b, name=>"io_spi_mst2_imr") + +// User extension aux register io_spi_mst2_isr +#define AR_IO_SPI_MST2_ISR 0x8001020c +#pragma Aux_register(0x8001020c, name=>"io_spi_mst2_isr") + +// User extension aux register io_spi_mst2_risr +#define AR_IO_SPI_MST2_RISR 0x8001020d +#pragma Aux_register(0x8001020d, name=>"io_spi_mst2_risr") + +// User extension aux register io_spi_mst2_txoicr +#define AR_IO_SPI_MST2_TXOICR 0x8001020e +#pragma Aux_register(0x8001020e, name=>"io_spi_mst2_txoicr") + +// User extension aux register io_spi_mst2_rxoicr +#define AR_IO_SPI_MST2_RXOICR 0x8001020f +#pragma Aux_register(0x8001020f, name=>"io_spi_mst2_rxoicr") + +// User extension aux register io_spi_mst2_rxuicr +#define AR_IO_SPI_MST2_RXUICR 0x80010210 +#pragma Aux_register(0x80010210, name=>"io_spi_mst2_rxuicr") + +// User extension aux register io_spi_mst2_icr +#define AR_IO_SPI_MST2_ICR 0x80010212 +#pragma Aux_register(0x80010212, name=>"io_spi_mst2_icr") + +// User extension aux register io_spi_mst2_clken +#define AR_IO_SPI_MST2_CLKEN 0x80010216 +#pragma Aux_register(0x80010216, name=>"io_spi_mst2_clken") + +// User extension aux register io_spi_mst2_dr +#define AR_IO_SPI_MST2_DR 0x80010218 +#pragma Aux_register(0x80010218, name=>"io_spi_mst2_dr") + +// User extension aux register io_spi_mst2_rx_sample_dly +#define AR_IO_SPI_MST2_RX_SAMPLE_DLY 0x8001023c +#pragma Aux_register(0x8001023c, name=>"io_spi_mst2_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_SLV0_PRESENT 1 + +// User extension aux register io_spi_slv0_ctrlr0 +#define AR_IO_SPI_SLV0_CTRLR0 0x80011000 +#pragma Aux_register(0x80011000, name=>"io_spi_slv0_ctrlr0") + +// User extension aux register io_spi_slv0_spien +#define AR_IO_SPI_SLV0_SPIEN 0x80011002 +#pragma Aux_register(0x80011002, name=>"io_spi_slv0_spien") + +// User extension aux register io_spi_slv0_txftlr +#define AR_IO_SPI_SLV0_TXFTLR 0x80011006 +#pragma Aux_register(0x80011006, name=>"io_spi_slv0_txftlr") + +// User extension aux register io_spi_slv0_rxftlr +#define AR_IO_SPI_SLV0_RXFTLR 0x80011007 +#pragma Aux_register(0x80011007, name=>"io_spi_slv0_rxftlr") + +// User extension aux register io_spi_slv0_txflr +#define AR_IO_SPI_SLV0_TXFLR 0x80011008 +#pragma Aux_register(0x80011008, name=>"io_spi_slv0_txflr") + +// User extension aux register io_spi_slv0_rxflr +#define AR_IO_SPI_SLV0_RXFLR 0x80011009 +#pragma Aux_register(0x80011009, name=>"io_spi_slv0_rxflr") + +// User extension aux register io_spi_slv0_sr +#define AR_IO_SPI_SLV0_SR 0x8001100a +#pragma Aux_register(0x8001100a, name=>"io_spi_slv0_sr") + +// User extension aux register io_spi_slv0_imr +#define AR_IO_SPI_SLV0_IMR 0x8001100b +#pragma Aux_register(0x8001100b, name=>"io_spi_slv0_imr") + +// User extension aux register io_spi_slv0_isr +#define AR_IO_SPI_SLV0_ISR 0x8001100c +#pragma Aux_register(0x8001100c, name=>"io_spi_slv0_isr") + +// User extension aux register io_spi_slv0_risr +#define AR_IO_SPI_SLV0_RISR 0x8001100d +#pragma Aux_register(0x8001100d, name=>"io_spi_slv0_risr") + +// User extension aux register io_spi_slv0_txoicr +#define AR_IO_SPI_SLV0_TXOICR 0x8001100e +#pragma Aux_register(0x8001100e, name=>"io_spi_slv0_txoicr") + +// User extension aux register io_spi_slv0_rxoicr +#define AR_IO_SPI_SLV0_RXOICR 0x8001100f +#pragma Aux_register(0x8001100f, name=>"io_spi_slv0_rxoicr") + +// User extension aux register io_spi_slv0_rxuicr +#define AR_IO_SPI_SLV0_RXUICR 0x80011010 +#pragma Aux_register(0x80011010, name=>"io_spi_slv0_rxuicr") + +// User extension aux register io_spi_slv0_icr +#define AR_IO_SPI_SLV0_ICR 0x80011012 +#pragma Aux_register(0x80011012, name=>"io_spi_slv0_icr") + +// User extension aux register io_spi_slv0_clken +#define AR_IO_SPI_SLV0_CLKEN 0x80011016 +#pragma Aux_register(0x80011016, name=>"io_spi_slv0_clken") + +// User extension aux register io_spi_slv0_dr +#define AR_IO_SPI_SLV0_DR 0x80011018 +#pragma Aux_register(0x80011018, name=>"io_spi_slv0_dr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO1_PRESENT 1 + +// User extension aux register io_gpio1_debounce +#define AR_IO_GPIO1_DEBOUNCE 0x80017148 +#pragma Aux_register(0x80017148, name=>"io_gpio1_debounce") + +// User extension aux register io_gpio1_clken +#define AR_IO_GPIO1_CLKEN 0x80017180 +#pragma Aux_register(0x80017180, name=>"io_gpio1_clken") + +// User extension aux register io_gpio1_swporta_dr +#define AR_IO_GPIO1_SWPORTA_DR 0x80017100 +#pragma Aux_register(0x80017100, name=>"io_gpio1_swporta_dr") + +// User extension aux register io_gpio1_swporta_ddr +#define AR_IO_GPIO1_SWPORTA_DDR 0x80017104 +#pragma Aux_register(0x80017104, name=>"io_gpio1_swporta_ddr") + +// User extension aux register io_gpio1_inten +#define AR_IO_GPIO1_INTEN 0x80017130 +#pragma Aux_register(0x80017130, name=>"io_gpio1_inten") + +// User extension aux register io_gpio1_intmask +#define AR_IO_GPIO1_INTMASK 0x80017134 +#pragma Aux_register(0x80017134, name=>"io_gpio1_intmask") + +// User extension aux register io_gpio1_inttype_level +#define AR_IO_GPIO1_INTTYPE_LEVEL 0x80017138 +#pragma Aux_register(0x80017138, name=>"io_gpio1_inttype_level") + +// User extension aux register io_gpio1_int_polarity +#define AR_IO_GPIO1_INT_POLARITY 0x8001713c +#pragma Aux_register(0x8001713c, name=>"io_gpio1_int_polarity") + +// User extension aux register io_gpio1_intstatus +#define AR_IO_GPIO1_INTSTATUS 0x80017140 +#pragma Aux_register(0x80017140, name=>"io_gpio1_intstatus") + +// User extension aux register io_gpio1_raw_intstatus +#define AR_IO_GPIO1_RAW_INTSTATUS 0x80017144 +#pragma Aux_register(0x80017144, name=>"io_gpio1_raw_intstatus") + +// User extension aux register io_gpio1_porta_eoi +#define AR_IO_GPIO1_PORTA_EOI 0x8001714c +#pragma Aux_register(0x8001714c, name=>"io_gpio1_porta_eoi") + +// User extension aux register io_gpio1_ext_porta +#define AR_IO_GPIO1_EXT_PORTA 0x80017150 +#pragma Aux_register(0x80017150, name=>"io_gpio1_ext_porta") + +// User extension aux register io_gpio1_ls_sync +#define AR_IO_GPIO1_LS_SYNC 0x80017160 +#pragma Aux_register(0x80017160, name=>"io_gpio1_ls_sync") + +// User extension aux register io_gpio1_int_bothedge +#define AR_IO_GPIO1_INT_BOTHEDGE 0x80017168 +#pragma Aux_register(0x80017168, name=>"io_gpio1_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO2_PRESENT 1 + +// User extension aux register io_gpio2_debounce +#define AR_IO_GPIO2_DEBOUNCE 0x80017248 +#pragma Aux_register(0x80017248, name=>"io_gpio2_debounce") + +// User extension aux register io_gpio2_clken +#define AR_IO_GPIO2_CLKEN 0x80017280 +#pragma Aux_register(0x80017280, name=>"io_gpio2_clken") + +// User extension aux register io_gpio2_swporta_dr +#define AR_IO_GPIO2_SWPORTA_DR 0x80017200 +#pragma Aux_register(0x80017200, name=>"io_gpio2_swporta_dr") + +// User extension aux register io_gpio2_swporta_ddr +#define AR_IO_GPIO2_SWPORTA_DDR 0x80017204 +#pragma Aux_register(0x80017204, name=>"io_gpio2_swporta_ddr") + +// User extension aux register io_gpio2_inten +#define AR_IO_GPIO2_INTEN 0x80017230 +#pragma Aux_register(0x80017230, name=>"io_gpio2_inten") + +// User extension aux register io_gpio2_intmask +#define AR_IO_GPIO2_INTMASK 0x80017234 +#pragma Aux_register(0x80017234, name=>"io_gpio2_intmask") + +// User extension aux register io_gpio2_inttype_level +#define AR_IO_GPIO2_INTTYPE_LEVEL 0x80017238 +#pragma Aux_register(0x80017238, name=>"io_gpio2_inttype_level") + +// User extension aux register io_gpio2_int_polarity +#define AR_IO_GPIO2_INT_POLARITY 0x8001723c +#pragma Aux_register(0x8001723c, name=>"io_gpio2_int_polarity") + +// User extension aux register io_gpio2_intstatus +#define AR_IO_GPIO2_INTSTATUS 0x80017240 +#pragma Aux_register(0x80017240, name=>"io_gpio2_intstatus") + +// User extension aux register io_gpio2_raw_intstatus +#define AR_IO_GPIO2_RAW_INTSTATUS 0x80017244 +#pragma Aux_register(0x80017244, name=>"io_gpio2_raw_intstatus") + +// User extension aux register io_gpio2_porta_eoi +#define AR_IO_GPIO2_PORTA_EOI 0x8001724c +#pragma Aux_register(0x8001724c, name=>"io_gpio2_porta_eoi") + +// User extension aux register io_gpio2_ext_porta +#define AR_IO_GPIO2_EXT_PORTA 0x80017250 +#pragma Aux_register(0x80017250, name=>"io_gpio2_ext_porta") + +// User extension aux register io_gpio2_ls_sync +#define AR_IO_GPIO2_LS_SYNC 0x80017260 +#pragma Aux_register(0x80017260, name=>"io_gpio2_ls_sync") + +// User extension aux register io_gpio2_int_bothedge +#define AR_IO_GPIO2_INT_BOTHEDGE 0x80017268 +#pragma Aux_register(0x80017268, name=>"io_gpio2_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST1_PRESENT 1 + +// User extension aux register io_i2c_mst1_clken +#define AR_IO_I2C_MST1_CLKEN 0x800121c0 +#pragma Aux_register(0x800121c0, name=>"io_i2c_mst1_clken") + +// User extension aux register io_i2c_mst1_con +#define AR_IO_I2C_MST1_CON 0x80012100 +#pragma Aux_register(0x80012100, name=>"io_i2c_mst1_con") + +// User extension aux register io_i2c_mst1_tar +#define AR_IO_I2C_MST1_TAR 0x80012104 +#pragma Aux_register(0x80012104, name=>"io_i2c_mst1_tar") + +// User extension aux register io_i2c_mst1_data_cmd +#define AR_IO_I2C_MST1_DATA_CMD 0x80012110 +#pragma Aux_register(0x80012110, name=>"io_i2c_mst1_data_cmd") + +// User extension aux register io_i2c_mst1_ss_scl_hcnt +#define AR_IO_I2C_MST1_SS_SCL_HCNT 0x80012114 +#pragma Aux_register(0x80012114, name=>"io_i2c_mst1_ss_scl_hcnt") + +// User extension aux register io_i2c_mst1_ss_scl_lcnt +#define AR_IO_I2C_MST1_SS_SCL_LCNT 0x80012118 +#pragma Aux_register(0x80012118, name=>"io_i2c_mst1_ss_scl_lcnt") + +// User extension aux register io_i2c_mst1_fs_scl_hcnt +#define AR_IO_I2C_MST1_FS_SCL_HCNT 0x8001211c +#pragma Aux_register(0x8001211c, name=>"io_i2c_mst1_fs_scl_hcnt") + +// User extension aux register io_i2c_mst1_fs_scl_lcnt +#define AR_IO_I2C_MST1_FS_SCL_LCNT 0x80012120 +#pragma Aux_register(0x80012120, name=>"io_i2c_mst1_fs_scl_lcnt") + +// User extension aux register io_i2c_mst1_intr_stat +#define AR_IO_I2C_MST1_INTR_STAT 0x8001212c +#pragma Aux_register(0x8001212c, name=>"io_i2c_mst1_intr_stat") + +// User extension aux register io_i2c_mst1_intr_mask +#define AR_IO_I2C_MST1_INTR_MASK 0x80012130 +#pragma Aux_register(0x80012130, name=>"io_i2c_mst1_intr_mask") + +// User extension aux register io_i2c_mst1_raw_intr_stat +#define AR_IO_I2C_MST1_RAW_INTR_STAT 0x80012134 +#pragma Aux_register(0x80012134, name=>"io_i2c_mst1_raw_intr_stat") + +// User extension aux register io_i2c_mst1_rx_tl +#define AR_IO_I2C_MST1_RX_TL 0x80012138 +#pragma Aux_register(0x80012138, name=>"io_i2c_mst1_rx_tl") + +// User extension aux register io_i2c_mst1_tx_tl +#define AR_IO_I2C_MST1_TX_TL 0x8001213c +#pragma Aux_register(0x8001213c, name=>"io_i2c_mst1_tx_tl") + +// User extension aux register io_i2c_mst1_clr_intr +#define AR_IO_I2C_MST1_CLR_INTR 0x80012140 +#pragma Aux_register(0x80012140, name=>"io_i2c_mst1_clr_intr") + +// User extension aux register io_i2c_mst1_clr_rx_under +#define AR_IO_I2C_MST1_CLR_RX_UNDER 0x80012144 +#pragma Aux_register(0x80012144, name=>"io_i2c_mst1_clr_rx_under") + +// User extension aux register io_i2c_mst1_clr_rx_over +#define AR_IO_I2C_MST1_CLR_RX_OVER 0x80012148 +#pragma Aux_register(0x80012148, name=>"io_i2c_mst1_clr_rx_over") + +// User extension aux register io_i2c_mst1_clr_tx_over +#define AR_IO_I2C_MST1_CLR_TX_OVER 0x8001214c +#pragma Aux_register(0x8001214c, name=>"io_i2c_mst1_clr_tx_over") + +// User extension aux register io_i2c_mst1_clr_tx_abrt +#define AR_IO_I2C_MST1_CLR_TX_ABRT 0x80012154 +#pragma Aux_register(0x80012154, name=>"io_i2c_mst1_clr_tx_abrt") + +// User extension aux register io_i2c_mst1_clr_activity +#define AR_IO_I2C_MST1_CLR_ACTIVITY 0x8001215c +#pragma Aux_register(0x8001215c, name=>"io_i2c_mst1_clr_activity") + +// User extension aux register io_i2c_mst1_clr_stop_det +#define AR_IO_I2C_MST1_CLR_STOP_DET 0x80012160 +#pragma Aux_register(0x80012160, name=>"io_i2c_mst1_clr_stop_det") + +// User extension aux register io_i2c_mst1_clr_start_det +#define AR_IO_I2C_MST1_CLR_START_DET 0x80012164 +#pragma Aux_register(0x80012164, name=>"io_i2c_mst1_clr_start_det") + +// User extension aux register io_i2c_mst1_enable +#define AR_IO_I2C_MST1_ENABLE 0x8001216c +#pragma Aux_register(0x8001216c, name=>"io_i2c_mst1_enable") + +// User extension aux register io_i2c_mst1_status +#define AR_IO_I2C_MST1_STATUS 0x80012170 +#pragma Aux_register(0x80012170, name=>"io_i2c_mst1_status") + +// User extension aux register io_i2c_mst1_txflr +#define AR_IO_I2C_MST1_TXFLR 0x80012174 +#pragma Aux_register(0x80012174, name=>"io_i2c_mst1_txflr") + +// User extension aux register io_i2c_mst1_rxflr +#define AR_IO_I2C_MST1_RXFLR 0x80012178 +#pragma Aux_register(0x80012178, name=>"io_i2c_mst1_rxflr") + +// User extension aux register io_i2c_mst1_sda_hold +#define AR_IO_I2C_MST1_SDA_HOLD 0x8001217c +#pragma Aux_register(0x8001217c, name=>"io_i2c_mst1_sda_hold") + +// User extension aux register io_i2c_mst1_tx_abrt_source +#define AR_IO_I2C_MST1_TX_ABRT_SOURCE 0x80012180 +#pragma Aux_register(0x80012180, name=>"io_i2c_mst1_tx_abrt_source") + +// User extension aux register io_i2c_mst1_enable_status +#define AR_IO_I2C_MST1_ENABLE_STATUS 0x8001219c +#pragma Aux_register(0x8001219c, name=>"io_i2c_mst1_enable_status") + +// User extension aux register io_i2c_mst1_fs_spklen +#define AR_IO_I2C_MST1_FS_SPKLEN 0x800121a0 +#pragma Aux_register(0x800121a0, name=>"io_i2c_mst1_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST2_PRESENT 1 + +// User extension aux register io_i2c_mst2_clken +#define AR_IO_I2C_MST2_CLKEN 0x800122c0 +#pragma Aux_register(0x800122c0, name=>"io_i2c_mst2_clken") + +// User extension aux register io_i2c_mst2_con +#define AR_IO_I2C_MST2_CON 0x80012200 +#pragma Aux_register(0x80012200, name=>"io_i2c_mst2_con") + +// User extension aux register io_i2c_mst2_tar +#define AR_IO_I2C_MST2_TAR 0x80012204 +#pragma Aux_register(0x80012204, name=>"io_i2c_mst2_tar") + +// User extension aux register io_i2c_mst2_data_cmd +#define AR_IO_I2C_MST2_DATA_CMD 0x80012210 +#pragma Aux_register(0x80012210, name=>"io_i2c_mst2_data_cmd") + +// User extension aux register io_i2c_mst2_ss_scl_hcnt +#define AR_IO_I2C_MST2_SS_SCL_HCNT 0x80012214 +#pragma Aux_register(0x80012214, name=>"io_i2c_mst2_ss_scl_hcnt") + +// User extension aux register io_i2c_mst2_ss_scl_lcnt +#define AR_IO_I2C_MST2_SS_SCL_LCNT 0x80012218 +#pragma Aux_register(0x80012218, name=>"io_i2c_mst2_ss_scl_lcnt") + +// User extension aux register io_i2c_mst2_fs_scl_hcnt +#define AR_IO_I2C_MST2_FS_SCL_HCNT 0x8001221c +#pragma Aux_register(0x8001221c, name=>"io_i2c_mst2_fs_scl_hcnt") + +// User extension aux register io_i2c_mst2_fs_scl_lcnt +#define AR_IO_I2C_MST2_FS_SCL_LCNT 0x80012220 +#pragma Aux_register(0x80012220, name=>"io_i2c_mst2_fs_scl_lcnt") + +// User extension aux register io_i2c_mst2_intr_stat +#define AR_IO_I2C_MST2_INTR_STAT 0x8001222c +#pragma Aux_register(0x8001222c, name=>"io_i2c_mst2_intr_stat") + +// User extension aux register io_i2c_mst2_intr_mask +#define AR_IO_I2C_MST2_INTR_MASK 0x80012230 +#pragma Aux_register(0x80012230, name=>"io_i2c_mst2_intr_mask") + +// User extension aux register io_i2c_mst2_raw_intr_stat +#define AR_IO_I2C_MST2_RAW_INTR_STAT 0x80012234 +#pragma Aux_register(0x80012234, name=>"io_i2c_mst2_raw_intr_stat") + +// User extension aux register io_i2c_mst2_rx_tl +#define AR_IO_I2C_MST2_RX_TL 0x80012238 +#pragma Aux_register(0x80012238, name=>"io_i2c_mst2_rx_tl") + +// User extension aux register io_i2c_mst2_tx_tl +#define AR_IO_I2C_MST2_TX_TL 0x8001223c +#pragma Aux_register(0x8001223c, name=>"io_i2c_mst2_tx_tl") + +// User extension aux register io_i2c_mst2_clr_intr +#define AR_IO_I2C_MST2_CLR_INTR 0x80012240 +#pragma Aux_register(0x80012240, name=>"io_i2c_mst2_clr_intr") + +// User extension aux register io_i2c_mst2_clr_rx_under +#define AR_IO_I2C_MST2_CLR_RX_UNDER 0x80012244 +#pragma Aux_register(0x80012244, name=>"io_i2c_mst2_clr_rx_under") + +// User extension aux register io_i2c_mst2_clr_rx_over +#define AR_IO_I2C_MST2_CLR_RX_OVER 0x80012248 +#pragma Aux_register(0x80012248, name=>"io_i2c_mst2_clr_rx_over") + +// User extension aux register io_i2c_mst2_clr_tx_over +#define AR_IO_I2C_MST2_CLR_TX_OVER 0x8001224c +#pragma Aux_register(0x8001224c, name=>"io_i2c_mst2_clr_tx_over") + +// User extension aux register io_i2c_mst2_clr_tx_abrt +#define AR_IO_I2C_MST2_CLR_TX_ABRT 0x80012254 +#pragma Aux_register(0x80012254, name=>"io_i2c_mst2_clr_tx_abrt") + +// User extension aux register io_i2c_mst2_clr_activity +#define AR_IO_I2C_MST2_CLR_ACTIVITY 0x8001225c +#pragma Aux_register(0x8001225c, name=>"io_i2c_mst2_clr_activity") + +// User extension aux register io_i2c_mst2_clr_stop_det +#define AR_IO_I2C_MST2_CLR_STOP_DET 0x80012260 +#pragma Aux_register(0x80012260, name=>"io_i2c_mst2_clr_stop_det") + +// User extension aux register io_i2c_mst2_clr_start_det +#define AR_IO_I2C_MST2_CLR_START_DET 0x80012264 +#pragma Aux_register(0x80012264, name=>"io_i2c_mst2_clr_start_det") + +// User extension aux register io_i2c_mst2_enable +#define AR_IO_I2C_MST2_ENABLE 0x8001226c +#pragma Aux_register(0x8001226c, name=>"io_i2c_mst2_enable") + +// User extension aux register io_i2c_mst2_status +#define AR_IO_I2C_MST2_STATUS 0x80012270 +#pragma Aux_register(0x80012270, name=>"io_i2c_mst2_status") + +// User extension aux register io_i2c_mst2_txflr +#define AR_IO_I2C_MST2_TXFLR 0x80012274 +#pragma Aux_register(0x80012274, name=>"io_i2c_mst2_txflr") + +// User extension aux register io_i2c_mst2_rxflr +#define AR_IO_I2C_MST2_RXFLR 0x80012278 +#pragma Aux_register(0x80012278, name=>"io_i2c_mst2_rxflr") + +// User extension aux register io_i2c_mst2_sda_hold +#define AR_IO_I2C_MST2_SDA_HOLD 0x8001227c +#pragma Aux_register(0x8001227c, name=>"io_i2c_mst2_sda_hold") + +// User extension aux register io_i2c_mst2_tx_abrt_source +#define AR_IO_I2C_MST2_TX_ABRT_SOURCE 0x80012280 +#pragma Aux_register(0x80012280, name=>"io_i2c_mst2_tx_abrt_source") + +// User extension aux register io_i2c_mst2_enable_status +#define AR_IO_I2C_MST2_ENABLE_STATUS 0x8001229c +#pragma Aux_register(0x8001229c, name=>"io_i2c_mst2_enable_status") + +// User extension aux register io_i2c_mst2_fs_spklen +#define AR_IO_I2C_MST2_FS_SPKLEN 0x800122a0 +#pragma Aux_register(0x800122a0, name=>"io_i2c_mst2_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART0_PRESENT 1 + +// User extension aux register io_uart0_clken +#define AR_IO_UART0_CLKEN 0x800140c0 +#pragma Aux_register(0x800140c0, name=>"io_uart0_clken") + +// User extension aux register io_uart0_rbr_thr_dll +#define AR_IO_UART0_RBR_THR_DLL 0x80014000 +#pragma Aux_register(0x80014000, name=>"io_uart0_rbr_thr_dll") + +// User extension aux register io_uart0_ier_dlh +#define AR_IO_UART0_IER_DLH 0x80014004 +#pragma Aux_register(0x80014004, name=>"io_uart0_ier_dlh") + +// User extension aux register io_uart0_iir_fcr +#define AR_IO_UART0_IIR_FCR 0x80014008 +#pragma Aux_register(0x80014008, name=>"io_uart0_iir_fcr") + +// User extension aux register io_uart0_lcr +#define AR_IO_UART0_LCR 0x8001400c +#pragma Aux_register(0x8001400c, name=>"io_uart0_lcr") + +// User extension aux register io_uart0_mcr +#define AR_IO_UART0_MCR 0x80014010 +#pragma Aux_register(0x80014010, name=>"io_uart0_mcr") + +// User extension aux register io_uart0_lsr +#define AR_IO_UART0_LSR 0x80014014 +#pragma Aux_register(0x80014014, name=>"io_uart0_lsr") + +// User extension aux register io_uart0_msr +#define AR_IO_UART0_MSR 0x80014018 +#pragma Aux_register(0x80014018, name=>"io_uart0_msr") + +// User extension aux register io_uart0_usr +#define AR_IO_UART0_USR 0x8001407c +#pragma Aux_register(0x8001407c, name=>"io_uart0_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART1_PRESENT 1 + +// User extension aux register io_uart1_clken +#define AR_IO_UART1_CLKEN 0x800141c0 +#pragma Aux_register(0x800141c0, name=>"io_uart1_clken") + +// User extension aux register io_uart1_rbr_thr_dll +#define AR_IO_UART1_RBR_THR_DLL 0x80014100 +#pragma Aux_register(0x80014100, name=>"io_uart1_rbr_thr_dll") + +// User extension aux register io_uart1_ier_dlh +#define AR_IO_UART1_IER_DLH 0x80014104 +#pragma Aux_register(0x80014104, name=>"io_uart1_ier_dlh") + +// User extension aux register io_uart1_iir_fcr +#define AR_IO_UART1_IIR_FCR 0x80014108 +#pragma Aux_register(0x80014108, name=>"io_uart1_iir_fcr") + +// User extension aux register io_uart1_lcr +#define AR_IO_UART1_LCR 0x8001410c +#pragma Aux_register(0x8001410c, name=>"io_uart1_lcr") + +// User extension aux register io_uart1_mcr +#define AR_IO_UART1_MCR 0x80014110 +#pragma Aux_register(0x80014110, name=>"io_uart1_mcr") + +// User extension aux register io_uart1_lsr +#define AR_IO_UART1_LSR 0x80014114 +#pragma Aux_register(0x80014114, name=>"io_uart1_lsr") + +// User extension aux register io_uart1_msr +#define AR_IO_UART1_MSR 0x80014118 +#pragma Aux_register(0x80014118, name=>"io_uart1_msr") + +// User extension aux register io_uart1_usr +#define AR_IO_UART1_USR 0x8001417c +#pragma Aux_register(0x8001417c, name=>"io_uart1_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART2_PRESENT 1 + +// User extension aux register io_uart2_clken +#define AR_IO_UART2_CLKEN 0x800142c0 +#pragma Aux_register(0x800142c0, name=>"io_uart2_clken") + +// User extension aux register io_uart2_rbr_thr_dll +#define AR_IO_UART2_RBR_THR_DLL 0x80014200 +#pragma Aux_register(0x80014200, name=>"io_uart2_rbr_thr_dll") + +// User extension aux register io_uart2_ier_dlh +#define AR_IO_UART2_IER_DLH 0x80014204 +#pragma Aux_register(0x80014204, name=>"io_uart2_ier_dlh") + +// User extension aux register io_uart2_iir_fcr +#define AR_IO_UART2_IIR_FCR 0x80014208 +#pragma Aux_register(0x80014208, name=>"io_uart2_iir_fcr") + +// User extension aux register io_uart2_lcr +#define AR_IO_UART2_LCR 0x8001420c +#pragma Aux_register(0x8001420c, name=>"io_uart2_lcr") + +// User extension aux register io_uart2_mcr +#define AR_IO_UART2_MCR 0x80014210 +#pragma Aux_register(0x80014210, name=>"io_uart2_mcr") + +// User extension aux register io_uart2_lsr +#define AR_IO_UART2_LSR 0x80014214 +#pragma Aux_register(0x80014214, name=>"io_uart2_lsr") + +// User extension aux register io_uart2_msr +#define AR_IO_UART2_MSR 0x80014218 +#pragma Aux_register(0x80014218, name=>"io_uart2_msr") + +// User extension aux register io_uart2_usr +#define AR_IO_UART2_USR 0x8001427c +#pragma Aux_register(0x8001427c, name=>"io_uart2_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART3_PRESENT 1 + +// User extension aux register io_uart3_clken +#define AR_IO_UART3_CLKEN 0x800143c0 +#pragma Aux_register(0x800143c0, name=>"io_uart3_clken") + +// User extension aux register io_uart3_rbr_thr_dll +#define AR_IO_UART3_RBR_THR_DLL 0x80014300 +#pragma Aux_register(0x80014300, name=>"io_uart3_rbr_thr_dll") + +// User extension aux register io_uart3_ier_dlh +#define AR_IO_UART3_IER_DLH 0x80014304 +#pragma Aux_register(0x80014304, name=>"io_uart3_ier_dlh") + +// User extension aux register io_uart3_iir_fcr +#define AR_IO_UART3_IIR_FCR 0x80014308 +#pragma Aux_register(0x80014308, name=>"io_uart3_iir_fcr") + +// User extension aux register io_uart3_lcr +#define AR_IO_UART3_LCR 0x8001430c +#pragma Aux_register(0x8001430c, name=>"io_uart3_lcr") + +// User extension aux register io_uart3_mcr +#define AR_IO_UART3_MCR 0x80014310 +#pragma Aux_register(0x80014310, name=>"io_uart3_mcr") + +// User extension aux register io_uart3_lsr +#define AR_IO_UART3_LSR 0x80014314 +#pragma Aux_register(0x80014314, name=>"io_uart3_lsr") + +// User extension aux register io_uart3_msr +#define AR_IO_UART3_MSR 0x80014318 +#pragma Aux_register(0x80014318, name=>"io_uart3_msr") + +// User extension aux register io_uart3_usr +#define AR_IO_UART3_USR 0x8001437c +#pragma Aux_register(0x8001437c, name=>"io_uart3_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2S_RX_MST0_PRESENT 1 + +// User extension aux register io_i2s_rx_mst0_ier +#define AR_IO_I2S_RX_MST0_IER 0x8001a000 +#pragma Aux_register(0x8001a000, name=>"io_i2s_rx_mst0_ier") + +// User extension aux register io_i2s_rx_mst0_irer +#define AR_IO_I2S_RX_MST0_IRER 0x8001a004 +#pragma Aux_register(0x8001a004, name=>"io_i2s_rx_mst0_irer") + +// User extension aux register io_i2s_rx_mst0_cer +#define AR_IO_I2S_RX_MST0_CER 0x8001a00c +#pragma Aux_register(0x8001a00c, name=>"io_i2s_rx_mst0_cer") + +// User extension aux register io_i2s_rx_mst0_ccr +#define AR_IO_I2S_RX_MST0_CCR 0x8001a010 +#pragma Aux_register(0x8001a010, name=>"io_i2s_rx_mst0_ccr") + +// User extension aux register io_i2s_rx_mst0_rxffr +#define AR_IO_I2S_RX_MST0_RXFFR 0x8001a014 +#pragma Aux_register(0x8001a014, name=>"io_i2s_rx_mst0_rxffr") + +// User extension aux register io_i2s_rx_mst0_lrbr +#define AR_IO_I2S_RX_MST0_LRBR 0x8001a020 +#pragma Aux_register(0x8001a020, name=>"io_i2s_rx_mst0_lrbr") + +// User extension aux register io_i2s_rx_mst0_rrbr +#define AR_IO_I2S_RX_MST0_RRBR 0x8001a024 +#pragma Aux_register(0x8001a024, name=>"io_i2s_rx_mst0_rrbr") + +// User extension aux register io_i2s_rx_mst0_rer +#define AR_IO_I2S_RX_MST0_RER 0x8001a028 +#pragma Aux_register(0x8001a028, name=>"io_i2s_rx_mst0_rer") + +// User extension aux register io_i2s_rx_mst0_rcr +#define AR_IO_I2S_RX_MST0_RCR 0x8001a030 +#pragma Aux_register(0x8001a030, name=>"io_i2s_rx_mst0_rcr") + +// User extension aux register io_i2s_rx_mst0_isr +#define AR_IO_I2S_RX_MST0_ISR 0x8001a038 +#pragma Aux_register(0x8001a038, name=>"io_i2s_rx_mst0_isr") + +// User extension aux register io_i2s_rx_mst0_imr +#define AR_IO_I2S_RX_MST0_IMR 0x8001a03c +#pragma Aux_register(0x8001a03c, name=>"io_i2s_rx_mst0_imr") + +// User extension aux register io_i2s_rx_mst0_ror +#define AR_IO_I2S_RX_MST0_ROR 0x8001a040 +#pragma Aux_register(0x8001a040, name=>"io_i2s_rx_mst0_ror") + +// User extension aux register io_i2s_rx_mst0_rfcr +#define AR_IO_I2S_RX_MST0_RFCR 0x8001a048 +#pragma Aux_register(0x8001a048, name=>"io_i2s_rx_mst0_rfcr") + +// User extension aux register io_i2s_rx_mst0_rff +#define AR_IO_I2S_RX_MST0_RFF 0x8001a050 +#pragma Aux_register(0x8001a050, name=>"io_i2s_rx_mst0_rff") + +// User extension aux register io_i2s_rx_mst0_rxdma +#define AR_IO_I2S_RX_MST0_RXDMA 0x8001a1c0 +#pragma Aux_register(0x8001a1c0, name=>"io_i2s_rx_mst0_rxdma") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2S_TX_MST0_PRESENT 1 + +// User extension aux register io_i2s_tx_mst0_ier +#define AR_IO_I2S_TX_MST0_IER 0x80019000 +#pragma Aux_register(0x80019000, name=>"io_i2s_tx_mst0_ier") + +// User extension aux register io_i2s_tx_mst0_iter +#define AR_IO_I2S_TX_MST0_ITER 0x80019008 +#pragma Aux_register(0x80019008, name=>"io_i2s_tx_mst0_iter") + +// User extension aux register io_i2s_tx_mst0_cer +#define AR_IO_I2S_TX_MST0_CER 0x8001900c +#pragma Aux_register(0x8001900c, name=>"io_i2s_tx_mst0_cer") + +// User extension aux register io_i2s_tx_mst0_ccr +#define AR_IO_I2S_TX_MST0_CCR 0x80019010 +#pragma Aux_register(0x80019010, name=>"io_i2s_tx_mst0_ccr") + +// User extension aux register io_i2s_tx_mst0_txffr +#define AR_IO_I2S_TX_MST0_TXFFR 0x80019018 +#pragma Aux_register(0x80019018, name=>"io_i2s_tx_mst0_txffr") + +// User extension aux register io_i2s_tx_mst0_lthr +#define AR_IO_I2S_TX_MST0_LTHR 0x80019020 +#pragma Aux_register(0x80019020, name=>"io_i2s_tx_mst0_lthr") + +// User extension aux register io_i2s_tx_mst0_rthr +#define AR_IO_I2S_TX_MST0_RTHR 0x80019024 +#pragma Aux_register(0x80019024, name=>"io_i2s_tx_mst0_rthr") + +// User extension aux register io_i2s_tx_mst0_ter +#define AR_IO_I2S_TX_MST0_TER 0x8001902c +#pragma Aux_register(0x8001902c, name=>"io_i2s_tx_mst0_ter") + +// User extension aux register io_i2s_tx_mst0_tcr +#define AR_IO_I2S_TX_MST0_TCR 0x80019034 +#pragma Aux_register(0x80019034, name=>"io_i2s_tx_mst0_tcr") + +// User extension aux register io_i2s_tx_mst0_isr +#define AR_IO_I2S_TX_MST0_ISR 0x80019038 +#pragma Aux_register(0x80019038, name=>"io_i2s_tx_mst0_isr") + +// User extension aux register io_i2s_tx_mst0_imr +#define AR_IO_I2S_TX_MST0_IMR 0x8001903c +#pragma Aux_register(0x8001903c, name=>"io_i2s_tx_mst0_imr") + +// User extension aux register io_i2s_tx_mst0_tor +#define AR_IO_I2S_TX_MST0_TOR 0x80019044 +#pragma Aux_register(0x80019044, name=>"io_i2s_tx_mst0_tor") + +// User extension aux register io_i2s_tx_mst0_tfcr +#define AR_IO_I2S_TX_MST0_TFCR 0x8001904c +#pragma Aux_register(0x8001904c, name=>"io_i2s_tx_mst0_tfcr") + +// User extension aux register io_i2s_tx_mst0_tff +#define AR_IO_I2S_TX_MST0_TFF 0x80019054 +#pragma Aux_register(0x80019054, name=>"io_i2s_tx_mst0_tff") + +// User extension aux register io_i2s_tx_mst0_txdma +#define AR_IO_I2S_TX_MST0_TXDMA 0x800191c8 +#pragma Aux_register(0x800191c8, name=>"io_i2s_tx_mst0_txdma") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_PDM_RX0_PRESENT 1 + +// User extension aux register io_pdm_rx0_pdm_en +#define AR_IO_PDM_RX0_PDM_EN 0x8001b000 +#pragma Aux_register(0x8001b000, name=>"io_pdm_rx0_pdm_en") + +// User extension aux register io_pdm_rx0_pdm_ren +#define AR_IO_PDM_RX0_PDM_REN 0x8001b004 +#pragma Aux_register(0x8001b004, name=>"io_pdm_rx0_pdm_ren") + +// User extension aux register io_pdm_rx0_cer +#define AR_IO_PDM_RX0_CER 0x8001b00c +#pragma Aux_register(0x8001b00c, name=>"io_pdm_rx0_cer") + +// User extension aux register io_pdm_rx0_rxffr +#define AR_IO_PDM_RX0_RXFFR 0x8001b014 +#pragma Aux_register(0x8001b014, name=>"io_pdm_rx0_rxffr") + +// User extension aux register io_pdm_rx0_rer0 +#define AR_IO_PDM_RX0_RER0 0x8001b028 +#pragma Aux_register(0x8001b028, name=>"io_pdm_rx0_rer0") + +// User extension aux register io_pdm_rx0_isr +#define AR_IO_PDM_RX0_ISR 0x8001b038 +#pragma Aux_register(0x8001b038, name=>"io_pdm_rx0_isr") + +// User extension aux register io_pdm_rx0_imr +#define AR_IO_PDM_RX0_IMR 0x8001b03c +#pragma Aux_register(0x8001b03c, name=>"io_pdm_rx0_imr") + +// User extension aux register io_pdm_rx0_ror +#define AR_IO_PDM_RX0_ROR 0x8001b040 +#pragma Aux_register(0x8001b040, name=>"io_pdm_rx0_ror") + +// User extension aux register io_pdm_rx0_rfcr +#define AR_IO_PDM_RX0_RFCR 0x8001b048 +#pragma Aux_register(0x8001b048, name=>"io_pdm_rx0_rfcr") + +// User extension aux register io_pdm_rx0_rxdma +#define AR_IO_PDM_RX0_RXDMA 0x8001b1c0 +#pragma Aux_register(0x8001b1c0, name=>"io_pdm_rx0_rxdma") + +// User extension aux register io_pdm_rx0_pdm_rr +#define AR_IO_PDM_RX0_PDM_RR 0x8001b1d0 +#pragma Aux_register(0x8001b1d0, name=>"io_pdm_rx0_pdm_rr") + +// User extension aux register io_pdm_rx0_cic_n +#define AR_IO_PDM_RX0_CIC_N 0x8001b1d4 +#pragma Aux_register(0x8001b1d4, name=>"io_pdm_rx0_cic_n") + +// User extension aux register io_pdm_rx0_cic_d +#define AR_IO_PDM_RX0_CIC_D 0x8001b1d8 +#pragma Aux_register(0x8001b1d8, name=>"io_pdm_rx0_cic_d") + +// User extension aux register io_pdm_rx0_dcrc +#define AR_IO_PDM_RX0_DCRC 0x8001b1dc +#pragma Aux_register(0x8001b1dc, name=>"io_pdm_rx0_dcrc") + +// User extension aux register io_pdm_rx0_brc_b0 +#define AR_IO_PDM_RX0_BRC_B0 0x8001b1e0 +#pragma Aux_register(0x8001b1e0, name=>"io_pdm_rx0_brc_b0") + +// User extension aux register io_pdm_rx0_brc_clp +#define AR_IO_PDM_RX0_BRC_CLP 0x8001b1f0 +#pragma Aux_register(0x8001b1f0, name=>"io_pdm_rx0_brc_clp") +#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_PRESENT 1 + +// User extension aux register fpu_build +#define AR_FPU_BUILD 0xc8 +#pragma Aux_register(0xc8, name=>"fpu_build") + +// User extension aux register fpu_ctrl +#define AR_FPU_CTRL 0x300 +#pragma Aux_register(0x300, name=>"fpu_ctrl") + +// User extension aux register fpu_status +#define AR_FPU_STATUS 0x301 +#pragma Aux_register(0x301, name=>"fpu_status") + +// User extension instruction fsmadd +extern int fsmadd(int,int); +#pragma intrinsic(fsmadd,opcode=>6,sub_opcode=>5, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsmsub +extern int fsmsub(int,int); +#pragma intrinsic(fsmsub,opcode=>6,sub_opcode=>6, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsmul +extern int fsmul(int,int); +#pragma intrinsic(fsmul,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsadd +extern int fsadd(int,int); +#pragma intrinsic(fsadd,opcode=>6,sub_opcode=>1, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fssub +extern int fssub(int,int); +#pragma intrinsic(fssub,opcode=>6,sub_opcode=>2, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fcvt32 +extern int fcvt32(int,int); +#pragma intrinsic(fcvt32,opcode=>6,sub_opcode=>8, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsdiv +extern int fsdiv(int,int); +#pragma intrinsic(fsdiv,opcode=>6,sub_opcode=>7, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmp +extern int fscmp(int,int); +#pragma intrinsic(fscmp,opcode=>6,sub_opcode=>3, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmp +extern int fscmp_f(int,int); +#pragma intrinsic(fscmp_f,opcode=>6,sub_opcode=>3, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmpf +extern int fscmpf(int,int); +#pragma intrinsic(fscmpf,opcode=>6,sub_opcode=>4, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmpf +extern int fscmpf_f(int,int); +#pragma intrinsic(fscmpf_f,opcode=>6,sub_opcode=>4, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fssqrt +extern int fssqrt(int); +#pragma intrinsic(fssqrt,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") +#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_DP_ASSIST_PRESENT 1 + +// User extension aux register aux_dpfp1l +#define AR_AUX_DPFP1L 0x302 +#pragma Aux_register(0x302, name=>"aux_dpfp1l") + +// User extension aux register aux_dpfp1h +#define AR_AUX_DPFP1H 0x303 +#pragma Aux_register(0x303, name=>"aux_dpfp1h") + +// User extension aux register aux_dpfp2l +#define AR_AUX_DPFP2L 0x304 +#pragma Aux_register(0x304, name=>"aux_dpfp2l") + +// User extension aux register aux_dpfp2h +#define AR_AUX_DPFP2H 0x305 +#pragma Aux_register(0x305, name=>"aux_dpfp2h") + +// User extension instruction dmulh11 +extern int dmulh11(int,int); +#pragma intrinsic(dmulh11,opcode=>6,sub_opcode=>48,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh11 +extern int dmulh11_f(int,int); +#pragma intrinsic(dmulh11_f,opcode=>6,sub_opcode=>48, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh12 +extern int dmulh12(int,int); +#pragma intrinsic(dmulh12,opcode=>6,sub_opcode=>49,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh12 +extern int dmulh12_f(int,int); +#pragma intrinsic(dmulh12_f,opcode=>6,sub_opcode=>49, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh21 +extern int dmulh21(int,int); +#pragma intrinsic(dmulh21,opcode=>6,sub_opcode=>50,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh21 +extern int dmulh21_f(int,int); +#pragma intrinsic(dmulh21_f,opcode=>6,sub_opcode=>50, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh22 +extern int dmulh22(int,int); +#pragma intrinsic(dmulh22,opcode=>6,sub_opcode=>51,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh22 +extern int dmulh22_f(int,int); +#pragma intrinsic(dmulh22_f,opcode=>6,sub_opcode=>51, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh11 +extern int daddh11(int,int); +#pragma intrinsic(daddh11,opcode=>6,sub_opcode=>52,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh11 +extern int daddh11_f(int,int); +#pragma intrinsic(daddh11_f,opcode=>6,sub_opcode=>52, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh12 +extern int daddh12(int,int); +#pragma intrinsic(daddh12,opcode=>6,sub_opcode=>53,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh12 +extern int daddh12_f(int,int); +#pragma intrinsic(daddh12_f,opcode=>6,sub_opcode=>53, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh21 +extern int daddh21(int,int); +#pragma intrinsic(daddh21,opcode=>6,sub_opcode=>54,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh21 +extern int daddh21_f(int,int); +#pragma intrinsic(daddh21_f,opcode=>6,sub_opcode=>54, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh22 +extern int daddh22(int,int); +#pragma intrinsic(daddh22,opcode=>6,sub_opcode=>55,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh22 +extern int daddh22_f(int,int); +#pragma intrinsic(daddh22_f,opcode=>6,sub_opcode=>55, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh11 +extern int dsubh11(int,int); +#pragma intrinsic(dsubh11,opcode=>6,sub_opcode=>56,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh11 +extern int dsubh11_f(int,int); +#pragma intrinsic(dsubh11_f,opcode=>6,sub_opcode=>56, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh12 +extern int dsubh12(int,int); +#pragma intrinsic(dsubh12,opcode=>6,sub_opcode=>57,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh12 +extern int dsubh12_f(int,int); +#pragma intrinsic(dsubh12_f,opcode=>6,sub_opcode=>57, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh21 +extern int dsubh21(int,int); +#pragma intrinsic(dsubh21,opcode=>6,sub_opcode=>58,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh21 +extern int dsubh21_f(int,int); +#pragma intrinsic(dsubh21_f,opcode=>6,sub_opcode=>58, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh22 +extern int dsubh22(int,int); +#pragma intrinsic(dsubh22,opcode=>6,sub_opcode=>59,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh22 +extern int dsubh22_f(int,int); +#pragma intrinsic(dsubh22_f,opcode=>6,sub_opcode=>59, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dexcl1 +extern int dexcl1(int,int); +#pragma intrinsic(dexcl1,opcode=>6,sub_opcode=>60, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dexcl2 +extern int dexcl2(int,int); +#pragma intrinsic(dexcl2,opcode=>6,sub_opcode=>61, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + + +#endif + + +]]> + + + + +
+ diff --git a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf new file mode 100644 index 00000000000..da39ae911ff --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf @@ -0,0 +1,47 @@ +# SYSTEM memory regions indicate where external memory might be located. +# The TCF has no specific knowledge of whether SYSTEM regions contain +# external memory or not. +# CCMWRAP memory regions indicate unusable portions of the address space +# due to CCM memory wrapping into upper addresses beyond its size + +MEMORY { +# SYSTEM0 : ORIGIN = 0x00000000, LENGTH = 0x20000000 + ICCM0 : ORIGIN = 0x20000000, LENGTH = 0x00040000 +# CCMWRAP0: ORIGIN = 0x20040000, LENGTH = 0x0ffc0000 +# SYSTEM1 : ORIGIN = 0x30000000, LENGTH = 0x50000000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 +# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 +# SYSTEM2 : ORIGIN = 0x90000000, LENGTH = 0x30000000 + XCCM : ORIGIN = 0xc0000000, LENGTH = 0x00008000 +# CCMWRAP2: ORIGIN = 0xc0008000, LENGTH = 0x0fff8000 +# SYSTEM3 : ORIGIN = 0xd0000000, LENGTH = 0x10000000 + YCCM : ORIGIN = 0xe0000000, LENGTH = 0x00008000 +# CCMWRAP3: ORIGIN = 0xe0008000, LENGTH = 0x0fff8000 +# SYSTEM4 : ORIGIN = 0xf0000000, LENGTH = 0x10000000 + } +SECTIONS { + GROUP: { + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP: { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32768): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > DCCM + GROUP: { + .Xdata? : {} + } > XCCM + GROUP: { + .Ydata? : {} + } > YCCM + GROUP BIND(0x0): { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:684): {} = FILL(0xa5a5a5a5,4) + } + } diff --git a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf new file mode 100644 index 00000000000..004215a2f6a --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf @@ -0,0 +1,4621 @@ + + + + + + + + + + + + + + + + + + + + 10*2) +# +# The speed of simulation can be greatly increased by using a faster JTAG clock, but a dependency will warn if it exceeds 1/2 of the cpu clock. +# +-jtag_tclk 4 + +# execution_trace_level --- +# This traces committed instructions as they execute, and gathers statistics +# visible in the debugger for counting instructions & cycle delays. +# At the "stats" level ony the statistics are gathered and no trace is printed. +# "file" is equivalent to "full", but the results go to a trace .txt file instead. +# +-execution_trace_level stats + +# generate_ipxact --- +# Generate ipxact.xml file describing the CPUisle or archipelago frontier +# +-generate_ipxact false + +# ipxact_relative_path_names --- +# Use relative path names for Verilog files in the ipxact. +# Otherwise, absolute path names are used. +# +-ipxact_relative_path_names true + +# optional_encryption --- +# When selected, encrypted RTL output is generated. +# +-optional_encryption false + +# ignore_encrypt_license --- +# When selected, pretend the encryption license is missing. For testing. +# +-ignore_encrypt_license false + +# ignore_clear_license --- +# When selected, pretend the cleartest license is missing. For testing. +# +-ignore_clear_license false + + +######## Tool Configuration --- cgen.1_0 ######## + +# Create Tool Configuration +-create cgen.1_0 "System.Tool Configuration" + +# mwdt_version --- Selects the MetaWare version to be used with the TCF file. +# Change from the default to an older or newer toolset version if you want the TCF file to be used with an older or newer version of the MetaWare tools. +-mwdt_version K-2015.09 + +# code_base_addr --- +# The base address to assign to the executable code segment in the linker command file when there is no ICCM in the build. This value is ignored when there is an ICCM. +# +-code_base_addr 0 + +# data_base_addr --- +# The base address to assign to the data segment in the linker command file when the data is not being mapped to a DCCM. This value is ignored when the data segment is mapped to a DCCM, as in that case the base address of the DCCM memory is used. +# +# A value of 0xffffffff means that the data segment will not be mapped to any specific address. +# +-data_base_addr 4294967295 + + +######## IO Software --- com.arc.software.dfss.sw_io.1_0 ######## + +# Create IO Software +-create com.arc.software.dfss.sw_io.1_0 "System.IO Software" + +# sw_io --- Command line option for Software element 'IO Software' +-sw_io true + + +######## DSP Software --- com.arc.software.dfss.sw_dsp.1_0 ######## + +# Create DSP Software +-create com.arc.software.dfss.sw_dsp.1_0 "System.DSP Software" + +# sw_dsp --- Command line option for Software element 'DSP Software' +-sw_dsp true + + +######## Infrastructure Software --- com.arc.software.dfss.sw_infra.1_0 ######## + +# Create Infrastructure Software +-create com.arc.software.dfss.sw_infra.1_0 "System.Infrastructure Software" + +# sw_infra --- Command line option for Software element 'Infrastructure Software' +-sw_infra true + + +######## CPUisle --- com.arc.hardware.CPU_isle.1_0 ######## + +# Create CPUisle +-create com.arc.hardware.CPU_isle.1_0 System.CPUisle + +# unique_name --- verilog module modifier prefix +-unique_name "" + +# ArcNum --- The processor number as read back in the ARCNUM field of the IDENTITY register. +-arc_num 1 + +# instances --- +# The number of instantiations of this core. +# +-instances 1 + +# CPUFloorplan --- Floorplan giving relative placement of the RAMs for the given configuration of ARCv2HS or ARCv2EM in this CPUisle +-cpu_floorplan em9d_xyccm + +# userCPUFloorplanPath --- Pathname of user floorplan for the CPU when using a hierarchical implementation +-usercpufloorplan_path "" + +# pinLocationConstraintsFile --- Pathname+filename of the physical pin location constraints file or just "side1" (all pins on l.h.s) or "side2" (pins on top only) or "side3" (pins on r.h.s. only) or "side4" (pins on bottom only) to get a template file generated +-pin_location_constraints_file "" + + +######## ARCv2EM --- com.arc.hardware.ARCv2EM.1_0 ######## + +# Create ARCv2EM +-create com.arc.hardware.ARCv2EM.1_0 System.CPUisle.ARCv2EM + +# arcv2em --- Description to follow +-arcv2em true + +# def_div2ref --- This specifies the clock division factor at reset. It is used for mss clock controller to generate core clock, and the value N means core is running at (1/N) x ref_clk. +-def_div2ref 1 + +# addr_size --- This defines the address bus width (in bits). +-addr_size 32 + +# pc_size --- This defines the program counter (in bits). +-pc_size 32 + +# lpc_size --- This defines the size of the loop counter (in bits). +-lpc_size 32 + +# halt_on_reset --- This defines whether the core is halted initially on reset. +-halt_on_reset true + +# byte_order --- This defines the endianness of the core. +-byte_order little + +# code_density_option --- This reduces the size of program memory by adding instructions that condense commonly used instruction patterns with some marginal increase in processor gate count. The added instructions are ENTER_S, LEAVE_S, JLI_S, BI, BIH. +-code_density_option true + +# bitscan_option --- This adds instructions for efficient search of bits within a 32 bit word, including normalize (NORM, NORMH, NORMW) and find first or last set bit (FFS, FLS) instructions. +-bitscan_option true + +# shift_option --- The Shift ISA option adds variable and multi-length shift rotation instructions: (0) No shift/rotation instructions (1) ASR16, ASR8, LSR8, LSL8, ROL8, ROR8 (2) ASRM, ASLM, LSRM, RORM (3) ASR16, ASR8, LSR8, LSL8, ROL8, ROR8, ASRM, ASLM, LSRM, RORM +-shift_option 3 + +# swap_option --- This adds two instructions used to swap half-words or bytes in a 32b word. Useful for converting between little to big endianess and vice-versa. +-swap_option true + +# div_rem_option --- The DIV/REM option adds non-blocking multi-cycle implementation of integer divide/remainder functions. Added instructions are DIV, DIVU (integer divide), REM and REMU (integer divide remainder).radix2 takes 33 cycles. radix4_enhanced takes 3 to 19 cycles per operation. +-div_rem_option none + +# mpy_option --- The Multiplier ISA option allows selection between several multiplier configurations to tradeoff performance with silicon area. +# For select multiply options, when the DIV/REM option is also selected, some datapath resources will be shared between the multiply and divide pipeline to minimize total area. +# +# Cycle count (16-bit, lower 32-bit or upper 32-bit) for the different configurations is as follows: +#
+# 
+# option  16/L32/U32  Instructions
+# ------  ----------  ---------------------
+#       
+# none	  -/-/-     None
+# wlh1	  1/1/1     MPYW/U, MPY/U, MPYH/U
+# wlh2	  2/2/2     MPYW/U, MPY/U, MPYH/U
+# wlh3	  2/3/3     MPYW/U, MPY/U, MPYH/U
+# wlh4	  2/4/5     MPYW/U, MPY/U, MPYH/U
+# wlh5	  5/9/9     MPYW/U, MPY/U, MPYH/U
+# 
+# +-mpy_option none + +# code_protection --- The ARC EM architecture divides the memory into 16 regions, which can be protected individually. This feature adds a 16-bit input to the processor core, one bit per region. When the protect bit is set, the processor disables any load or store to the corresponding region. An attempt to access a protected region raises an EV_ProtV exception. +-code_protection true + +# stack_checking --- Stack checking is a mechanism for checking stack accesses and raising an exception when a stack overflow or underflow is detected. +-stack_checking true + +# unaligned_option --- This enables unaligned loads and stores. +-unaligned_option true + +# intvbase_preset --- This sets the interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE. +-intvbase_preset 0 + +# rgf_impl --- This defines whether the register file is implemented using flip-flops, or with a hard macro. +-rgf_impl flip_flops + +# rgf_num_regs --- This defines the size (in 32b register) of the processor register file. +-rgf_num_regs 32 + +# rgf_wr_ports --- This defines the number of write ports on the register file. +-rgf_wr_ports 2 + +# rgf_num_banks --- Dual register banks are useful if Fast IRQ has been configured, but may be selected even if not. +-rgf_num_banks 2 + +# rgf_banked_regs --- This selects the number of registers that are replicated in the second register-file bank. +-rgf_banked_regs 32 + +# turbo_boost --- This enables the Turbo Boost synthesis option. By enabling this option, the achievable clock frequency is increased, but at the cost of an additional cycle latency on branch instructions. +-turbo_boost false + +# infer_alu_adder --- infer: datapath is described as behavioral code: A + B +# instantiate: datapath is instantiated as a detailed multi-stage code of a carry-lookahead adder. It is generally preferable to use the infer option and add directives for your target synthesizer. +-infer_alu_adder infer + +# infer_mpy_wtree --- infer: datapath is described as behavioral code: A * B (applies to only wlh3, wlh4 and wlh5 designs) +# instantiate: datapath is instantiated as a detailed multi-stage code of a Wallace Tree multiplier It is generally preferable to use the infer option and add directives for your target synthesizer. +-infer_mpy_wtree instantiate + +# power_domains --- Adds three separate power domains to the core, and propagates power-gate control signals to the top level of the core. Also generates UPF constraints and commands in the low-power scripts +-power_domains true + +# dvfs --- Adds logic to the core to allow dynamic controlling of voltage and frequency and propagates the associated control signals to the top level of core +-dvfs true + +# voltage_domains --- Creates a voltage domain split between RAM and std cell parts to support Ultra Low Voltage on cells and generates UPF constraints +-voltage_domains false + +# mem_bus_option --- The core supports three bus protocols for accessing external memory: AHB & AHB-Lite. AHB-Lite-single means instruction fetch and data access share a single AHB-Lite port. AHB-Lite-dual means separate AHB-Lite port for each initiator. +-mem_bus_option AHB-Lite-dual + +# mem_bus_reg_interface --- Specifies whether the memory bus interface is registered. +-mem_bus_reg_interface true + +# dmi_burst_option --- This will enable high-throughput burst support on the DMI slave interfaces. By enabling this option, the peak DMI read throughput goes from 1 word per 3 cycles to N words per N+2 cycles, in which N is the AHB burst lengthDMI write throughput goes from 1 word per 3 cycles to 1 word per cycle. +-dmi_burst_option false + +# has_dmp_peripheral --- This option enables the redirection of load/store accesses to one segment (1/16) of the addressable space to a dedicated peripheral bus. This offers high system integration and reduces overall system cost. +-has_dmp_peripheral false + +# per_bus_option --- The core supports one bus protocol for accessing the peripheral space, when enabled: AHB-Lite. +-per_bus_option AHB-Lite + +# per_bus_reg_interface --- Specifies whether the peripheral bus interface is registered. +-per_bus_reg_interface false + +# clock_gating --- This enables the insertion of architectural clock gate elements in the design. By enabling this option, the clocks to various parts of the design will be disabled when the logic they drive is not in use to save power. +-clock_gating true + +# byte_parity --- If parity protection on the CCMs is configured, this option is used to enable parity protection on a per-byte basis. Otherwise, parity will be per word basis +-byte_parity false + +# prot_pipelined --- Check the box if CCM memories are configured for ECC, and you want single-bit errors to be corrected, written back to memory, and re-fetched. When unchecked, single bit errors are corrected when read from memory, but the offending memory location itself is not corrected with a writeback +-prot_pipelined false + +# cct_test_ena --- When ECC is configured, this option enables automatic generation of error conditions in relevant testbench memories to exercise error detection and correction features +-cct_test_ena false + + +######## AGU --- com.arc.hardware.AGU.1_0 ######## + +# Create AGU +-create com.arc.hardware.AGU.1_0 System.CPUisle.ARCv2EM.AGU + +# agu_size --- Predefined configurations of modifiers, address +# pointers and offset registers +#
+# 
+#         address     address                     
+#         pointers    offset regs      modifiers  
+#        ----------- --------------- ------------ 
+# small:     4           2                 4      
+# medium:    8           4                 12     
+# large:     12          8                 24     
+# 
+# +-agu_size small + +# agu_accord --- Enable the accordion stage if operating frequency is critical +-agu_accord true + +# agu_wb_depth --- Write buffer depth +-agu_wb_depth 2 + + +######## DSP --- com.arc.hardware.DSP.1_0 ######## + +# Create DSP +-create com.arc.hardware.DSP.1_0 System.CPUisle.ARCv2EM.DSP + +# dsp_complex --- Enable/disable support for single cycle 16b+16b complex instructions and butterfly operations, else 2-cycle complex instructions only without butterfly support +-dsp_complex true + +# dsp_itu --- Enable/disable support for ITU bit-accurate 1 bit fractional shift before accumulation, else 1-bit fractional shift result after accumulation only +-dsp_itu true + +# dsp_divsqrt --- Enable/disable support for divide and square root operations: DIV(U), REM(U), SQRT +-dsp_divsqrt radix2 + +# dsp_accshift --- Select support for accumulator shift operations: no supported, limited shift support only or full shift support and convergent rounding +-dsp_accshift full + +# dsp_impl --- The datapath components may be inferred from Verilog for better area or optimized using carry-save components for better timing +-dsp_impl optimized + + +######## Interrupt Controller --- com.arc.hardware.Interrupt_Controller.1_0 ######## + +# Create Interrupt Controller +-create com.arc.hardware.Interrupt_Controller.1_0 "System.CPUisle.ARCv2EM.Interrupt Controller" + +# number_of_interrupts --- This is the total number of interrupts available to the core. Some interrupts are allocated statically to a specific interrupt line (for example, timer interrupts). For more information on Interrupt and register-file options, see DesignWare ARCv2 ISA Programmers Reference Manual. +-number_of_interrupts 95 + +# number_of_levels --- Priority levels in the interrupt controller. +-number_of_levels 4 + +# external_interrupts --- This is the total number of interrupt pins available for external system components. This parameter must be less than the total number of interrupts. +-external_interrupts 60 + +# firq_option --- This enables the fast-interrupts option, (priority level 0 interrupts), which uses an alternate register bank (if configured) instead of saving the context to memory. +-firq_option true + + +######## Timer 0 --- com.arc.hardware.Timer_0.1_0 ######## + +# Create Timer 0 +-create com.arc.hardware.Timer_0.1_0 "System.CPUisle.ARCv2EM.Timer 0" + +# timer_0_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 0. +-timer_0_int_level 1 + + +######## Timer 1 --- com.arc.hardware.Timer_1.1_0 ######## + +# Create Timer 1 +-create com.arc.hardware.Timer_1.1_0 "System.CPUisle.ARCv2EM.Timer 1" + +# timer_1_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 1. +-timer_1_int_level 0 + + +######## Watchdog Timer --- com.arc.hardware.Watchdog_Timer.1_0 ######## + +# Create Watchdog Timer +-create com.arc.hardware.Watchdog_Timer.1_0 "System.CPUisle.ARCv2EM.Watchdog Timer" + +# watchdog_size --- Specifies the bit width of the internal counter used within the timer. +-watchdog_size 16 + +# watchdog_clk --- Specifies whether the timer should be driven from a separate clock. +-watchdog_clk true + + +######## Data Memory Initiator --- com.arc.hardware.Data_Memory_Initiator.1_0 ######## + +# Create Data Memory Initiator +-create com.arc.hardware.Data_Memory_Initiator.1_0 "System.CPUisle.ARCv2EM.Data Memory Initiator" + +######## Instruction Fetch Queue --- com.arc.hardware.Instruction_Fetch_Queue.1_0 ######## + +# Create Instruction Fetch Queue +-create com.arc.hardware.Instruction_Fetch_Queue.1_0 "System.CPUisle.ARCv2EM.Instruction Fetch Queue" + +# ifqueue_size --- This defines the number of entires in the Instruction Fetch Queue. +-ifqueue_size 4 + +# ifqueue_burst_size --- This sets the burst size for bus data transfers (in 32-bit words). It cannot exceed the number of entries. +-ifqueue_burst_size 2 + + +######## DCCM --- com.arc.hardware.DCCM.1_0 ######## + +# Create DCCM +-create com.arc.hardware.DCCM.1_0 System.CPUisle.ARCv2EM.DCCM + +# dccm_size --- This defines the size of the Data Closely Coupled Memory (DCCM) in bytes +-dccm_size 131072 + +# dccm_base --- Sets the initial memory region assignment for DCCM +-dccm_base 8 + +# dccm_interleave --- Split DCCM into even/odd memory banks. +-dccm_interleave false + +# dccm_prot --- Specifies the type of protection built for the DCCM. +-dccm_prot None + +# dccm_prot_level --- Specifies the level protection. +-dccm_prot_level Data_Only + +# dccm_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the DCCM +-dccm_prot_exceptions true + +# dccm_dmi --- This enables external access through a DMI (direct memory interface) port. +-dccm_dmi true + + +######## ICCM0 --- com.arc.hardware.ICCM0.1_0 ######## + +# Create ICCM0 +-create com.arc.hardware.ICCM0.1_0 System.CPUisle.ARCv2EM.ICCM0 + +# iccm0_size --- This defines the size of ICCM0 in bytes.This ICCM has 0 wait states. +-iccm0_size 262144 + +# iccm0_base --- Sets the initial memory region assignment for ICCM0 +-iccm0_base 2 + +# iccm0_wide --- Creates ICCM0 as 64b memory to reduce accesses. +-iccm0_wide true + +# iccm0_prot --- Specifies the type of protection built for ICCM0. +-iccm0_prot None + +# iccm0_prot_level --- Specifies the level of protection. +-iccm0_prot_level Data_Only + +# iccm0_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the ICCM0 +-iccm0_prot_exceptions true + +# iccm0_dmi --- This enables external access through a DMI (direct memory interface) port. +-iccm0_dmi true + + +######## XY --- com.arc.hardware.XY.1_0 ######## + +# Create XY +-create com.arc.hardware.XY.1_0 System.CPUisle.ARCv2EM.XY + +# xy_config --- XY memory configuration: +# One memory: DCCM only. +# Two memories: DCCM + Y. +# Three memories: DCCM + X + Y. +-xy_config dccm_x_y + +# xy_size --- Size of X and Y memories if included. +# X and Y memories both have the same configured size. +-xy_size 32768 + +# xy_interleave --- Split XY memories into odd/even instances to enable single cycle unaligned access. +-xy_interleave true + +# xy_x_base --- Base region for X memory. All accesses to this region will initiate a transfer on the X memory. +-xy_x_base 12 + +# xy_y_base --- Base region for Y memory. All accesses to this region will initiate a transfer on the Y memory. +-xy_y_base 14 + + +######## DMA Controller --- com.arc.hardware.DMA_Controller.1_0 ######## + +# Create DMA Controller +-create com.arc.hardware.DMA_Controller.1_0 "System.CPUisle.ARCv2EM.DMA Controller" + +# dmac_channels --- This options specifies the number of DMA channels implemented in the DMA controller +-dmac_channels 16 + +# dmac_fifo_depth --- This option specifies the DMA transfer FIFO depth in 32b words. +-dmac_fifo_depth 4 + +# dmac_int_config --- None: the DMA controller cannot raise an interrupt +# Single-External: single done and single error interrupt signal for all DMA channels, and the interrupt signals are routed to a port at the top of the EM logical hierarchy +# Multiple-External: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are routed to ports at the top of the EM logical hierarchy +# Single-Internal: single done and single error interrupt signals for all DMA channels, and the interrupt signals are internal to the EM core +# Multiple-Internal: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are internal to the EM core +-dmac_int_config Multiple-Internal + +# dmac_registers --- This option defines the number of DMA channels with their registers located in auxiliary space. +-dmac_registers 16 + +# dmac_mem_if --- This option specifies whether the DMA controller system memory interface is integrated into the existing EM system memory interfaces or has its own interface. +-dmac_mem_if separate + + +######## JTAG Interface --- com.arc.hardware.JTAG_Interface.1_0 ######## + +# Create JTAG Interface +-create com.arc.hardware.JTAG_Interface.1_0 "System.CPUisle.ARCv2EM.JTAG Interface" + +######## Debug Interface --- com.arc.hardware.Debug_Interface.1_0 ######## + +# Create Debug Interface +-create com.arc.hardware.Debug_Interface.1_0 "System.CPUisle.ARCv2EM.Debug Interface" + +######## Actionpoints --- com.arc.hardware.Actionpoints.1_0 ######## + +# Create Actionpoints +-create com.arc.hardware.Actionpoints.1_0 System.CPUisle.ARCv2EM.Actionpoints + +# num_actionpoints --- This is the number of trigger events available. +-num_actionpoints 8 + +# aps_feature --- Selects Actionpoint feature set +-aps_feature min + + +######## SmaRT --- com.arc.hardware.SmaRT.1_0 ######## + +# Create SmaRT +-create com.arc.hardware.SmaRT.1_0 System.CPUisle.ARCv2EM.SmaRT + +# smart_stack_entries --- This specifies the number of entries in the trace buffer. +-smart_stack_entries 64 + +# smart_implementation --- Flip-flop = FF-based design. Memory = memory-based design (provides better density for larger trace buffers). +-smart_implementation memory + + +######## Memory Protection Unit --- com.arc.hardware.Memory_Protection_Unit.1_0 ######## + +# Create Memory Protection Unit +-create com.arc.hardware.Memory_Protection_Unit.1_0 "System.CPUisle.ARCv2EM.Memory Protection Unit" + +# mpu_num_regions --- Number of configured memory regions. +-mpu_num_regions 16 + +# mpu_32b --- Set the minimal region size to be 32 byte instead of 2KB. +-mpu_32b false + + +######## Floating-point unit --- com.arc.hardware.Floating_point_unit.1_0 ######## + +# Create Floating-point unit +-create com.arc.hardware.Floating_point_unit.1_0 "System.CPUisle.ARCv2EM.Floating-point unit" + +# fpu_dp_assist --- This enables double-precision acceleration instructions. +-fpu_dp_assist true + +# fpu_fma_option --- This enables the fused multiply-add & multiply-subtract instructions. +-fpu_fma_option true + +# fpu_mas_cycles --- Make mul/add/sub multicycle to achieve a higher clock speed. +-fpu_mas_cycles 2 + +# fpu_div_option --- This enables divide & square-root acceleration +-fpu_div_option true + +# fpu_div_cycles --- "inferred" option infers DSP datapath elements from verilog operators for better area and "optimized" option selects hardware for better timing +-fpu_div_cycles 17 + + +######## Performance Monitor --- com.arc.hardware.Performance_Monitor.1_0 ######## + +# Create Performance Monitor +-create com.arc.hardware.Performance_Monitor.1_0 "System.CPUisle.ARCv2EM.Performance Monitor" + +# pct_counters --- Number of counters for performance monitoring. +-pct_counters 8 + + +######## dsp_trig --- com.arc.hardware.dfss.dsp_trig.1_0 ######## + +# Create dsp_trig +-create com.arc.hardware.dfss.dsp_trig.1_0 System.CPUisle.ARCv2EM.dsp_trig + +# dsp_trig --- Command line option for EIA extension component 'dsp_trig'. +-dsp_trig true + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_4b0 --- com.arc.hardware.dfss.io_gpio_4b0.1_0 ######## + +# Create io_gpio_4b0 +-create com.arc.hardware.dfss.io_gpio_4b0.1_0 System.CPUisle.ARCv2EM.io_gpio_4b0 + +# io_gpio_4b0 --- Command line option for EIA extension component 'io_gpio_4b0'. +-io_gpio_4b0 true + +# io_gpio_4b0_debounce --- Selects the inclusion of Debounce logic +-io_gpio_4b0_debounce 1 + +# io_gpio_4b0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_4b0_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_4b1 --- com.arc.hardware.dfss.io_gpio_4b1.1_0 ######## + +# Create io_gpio_4b1 +-create com.arc.hardware.dfss.io_gpio_4b1.1_0 System.CPUisle.ARCv2EM.io_gpio_4b1 + +# io_gpio_4b1 --- Command line option for EIA extension component 'io_gpio_4b1'. +-io_gpio_4b1 true + +# io_gpio_4b1_debounce --- Selects the inclusion of Debounce logic +-io_gpio_4b1_debounce 1 + +# io_gpio_4b1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_4b1_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_4b2 --- com.arc.hardware.dfss.io_gpio_4b2.1_0 ######## + +# Create io_gpio_4b2 +-create com.arc.hardware.dfss.io_gpio_4b2.1_0 System.CPUisle.ARCv2EM.io_gpio_4b2 + +# io_gpio_4b2 --- Command line option for EIA extension component 'io_gpio_4b2'. +-io_gpio_4b2 true + +# io_gpio_4b2_debounce --- Selects the inclusion of Debounce logic +-io_gpio_4b2_debounce 1 + +# io_gpio_4b2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_4b2_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_8b0 --- com.arc.hardware.dfss.io_gpio_8b0.1_0 ######## + +# Create io_gpio_8b0 +-create com.arc.hardware.dfss.io_gpio_8b0.1_0 System.CPUisle.ARCv2EM.io_gpio_8b0 + +# io_gpio_8b0 --- Command line option for EIA extension component 'io_gpio_8b0'. +-io_gpio_8b0 true + +# io_gpio_8b0_debounce --- Selects the inclusion of Debounce logic +-io_gpio_8b0_debounce 1 + +# io_gpio_8b0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_8b0_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_8b1 --- com.arc.hardware.dfss.io_gpio_8b1.1_0 ######## + +# Create io_gpio_8b1 +-create com.arc.hardware.dfss.io_gpio_8b1.1_0 System.CPUisle.ARCv2EM.io_gpio_8b1 + +# io_gpio_8b1 --- Command line option for EIA extension component 'io_gpio_8b1'. +-io_gpio_8b1 true + +# io_gpio_8b1_debounce --- Selects the inclusion of Debounce logic +-io_gpio_8b1_debounce 1 + +# io_gpio_8b1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_8b1_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_8b2 --- com.arc.hardware.dfss.io_gpio_8b2.1_0 ######## + +# Create io_gpio_8b2 +-create com.arc.hardware.dfss.io_gpio_8b2.1_0 System.CPUisle.ARCv2EM.io_gpio_8b2 + +# io_gpio_8b2 --- Command line option for EIA extension component 'io_gpio_8b2'. +-io_gpio_8b2 true + +# io_gpio_8b2_debounce --- Selects the inclusion of Debounce logic +-io_gpio_8b2_debounce 1 + +# io_gpio_8b2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_8b2_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_8b3 --- com.arc.hardware.dfss.io_gpio_8b3.1_0 ######## + +# Create io_gpio_8b3 +-create com.arc.hardware.dfss.io_gpio_8b3.1_0 System.CPUisle.ARCv2EM.io_gpio_8b3 + +# io_gpio_8b3 --- Command line option for EIA extension component 'io_gpio_8b3'. +-io_gpio_8b3 true + +# io_gpio_8b3_debounce --- Selects the inclusion of Debounce logic +-io_gpio_8b3_debounce 1 + +# io_gpio_8b3_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_8b3_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_mst0 --- com.arc.hardware.dfss.io_i2c_mst0.1_0 ######## + +# Create io_i2c_mst0 +-create com.arc.hardware.dfss.io_i2c_mst0.1_0 System.CPUisle.ARCv2EM.io_i2c_mst0 + +# io_i2c_mst0 --- Command line option for APEX extension component 'io_i2c_mst0'. +-io_i2c_mst0 true + +# io_i2c_mst0_fs --- RX/TX FIFO size +-io_i2c_mst0_fs 16 + +# io_i2c_mst0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst0_dma_support None + +# io_i2c_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst0_cdc_included 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_mst1 --- com.arc.hardware.dfss.io_i2c_mst1.1_0 ######## + +# Create io_i2c_mst1 +-create com.arc.hardware.dfss.io_i2c_mst1.1_0 System.CPUisle.ARCv2EM.io_i2c_mst1 + +# io_i2c_mst1 --- Command line option for APEX extension component 'io_i2c_mst1'. +-io_i2c_mst1 true + +# io_i2c_mst1_fs --- RX/TX FIFO size +-io_i2c_mst1_fs 16 + +# io_i2c_mst1_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst1_dma_support None + +# io_i2c_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst1_cdc_included 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_mst2 --- com.arc.hardware.dfss.io_i2c_mst2.1_0 ######## + +# Create io_i2c_mst2 +-create com.arc.hardware.dfss.io_i2c_mst2.1_0 System.CPUisle.ARCv2EM.io_i2c_mst2 + +# io_i2c_mst2 --- Command line option for APEX extension component 'io_i2c_mst2'. +-io_i2c_mst2 true + +# io_i2c_mst2_fs --- RX/TX FIFO size +-io_i2c_mst2_fs 16 + +# io_i2c_mst2_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst2_dma_support None + +# io_i2c_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst2_cdc_included 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst0 --- com.arc.hardware.dfss.io_spi_mst0.1_0 ######## + +# Create io_spi_mst0 +-create com.arc.hardware.dfss.io_spi_mst0.1_0 System.CPUisle.ARCv2EM.io_spi_mst0 + +# io_spi_mst0 --- Command line option for APEX extension component 'io_spi_mst0'. +-io_spi_mst0 true + +# io_spi_mst0_fz --- RX/TX FIFO depth +-io_spi_mst0_fs 16 + +# io_spi_mst0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst0_max_xfer_size 16 + +# io_spi_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst0_cdc_included 1 + +# io_spi_mst0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst0_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst1 --- com.arc.hardware.dfss.io_spi_mst1.1_0 ######## + +# Create io_spi_mst1 +-create com.arc.hardware.dfss.io_spi_mst1.1_0 System.CPUisle.ARCv2EM.io_spi_mst1 + +# io_spi_mst1 --- Command line option for APEX extension component 'io_spi_mst1'. +-io_spi_mst1 true + +# io_spi_mst1_fz --- RX/TX FIFO depth +-io_spi_mst1_fs 16 + +# io_spi_mst1_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst1_max_xfer_size 16 + +# io_spi_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst1_cdc_included 1 + +# io_spi_mst1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst1_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst2 --- com.arc.hardware.dfss.io_spi_mst2.1_0 ######## + +# Create io_spi_mst2 +-create com.arc.hardware.dfss.io_spi_mst2.1_0 System.CPUisle.ARCv2EM.io_spi_mst2 + +# io_spi_mst2 --- Command line option for APEX extension component 'io_spi_mst2'. +-io_spi_mst2 true + +# io_spi_mst2_fz --- RX/TX FIFO depth +-io_spi_mst2_fs 16 + +# io_spi_mst2_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst2_max_xfer_size 16 + +# io_spi_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst2_cdc_included 1 + +# io_spi_mst2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst2_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_slv0 --- com.arc.hardware.dfss.io_spi_slv0.1_0 ######## + +# Create io_spi_slv0 +-create com.arc.hardware.dfss.io_spi_slv0.1_0 System.CPUisle.ARCv2EM.io_spi_slv0 + +# io_spi_slv0 --- Command line option for APEX extension component 'io_spi_slv0'. +-io_spi_slv0 true + +# io_spi_slv0_fz --- RX/TX FIFO depth +-io_spi_slv0_fs 16 + +# io_spi_slv0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_slv0_max_xfer_size 16 + +# io_spi_slv0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_slv0_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart0 --- com.arc.hardware.dfss.io_uart0.1_0 ######## + +# Create io_uart0 +-create com.arc.hardware.dfss.io_uart0.1_0 System.CPUisle.ARCv2EM.io_uart0 + +# io_uart0 --- Command line option for EIA extension component 'io_uart0'. +-io_uart0 true + +# io_uart0_fifo_mode --- Set the UART FIFO mode +-io_uart0_fifo_mode 16 + +# io_uart0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart0_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart1 --- com.arc.hardware.dfss.io_uart1.1_0 ######## + +# Create io_uart1 +-create com.arc.hardware.dfss.io_uart1.1_0 System.CPUisle.ARCv2EM.io_uart1 + +# io_uart1 --- Command line option for EIA extension component 'io_uart1'. +-io_uart1 true + +# io_uart1_fifo_mode --- Set the UART FIFO mode +-io_uart1_fifo_mode 16 + +# io_uart1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart1_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart2 --- com.arc.hardware.dfss.io_uart2.1_0 ######## + +# Create io_uart2 +-create com.arc.hardware.dfss.io_uart2.1_0 System.CPUisle.ARCv2EM.io_uart2 + +# io_uart2 --- Command line option for EIA extension component 'io_uart2'. +-io_uart2 true + +# io_uart2_fifo_mode --- Set the UART FIFO mode +-io_uart2_fifo_mode 16 + +# io_uart2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart2_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart3 --- com.arc.hardware.dfss.io_uart3.1_0 ######## + +# Create io_uart3 +-create com.arc.hardware.dfss.io_uart3.1_0 System.CPUisle.ARCv2EM.io_uart3 + +# io_uart3 --- Command line option for EIA extension component 'io_uart3'. +-io_uart3 true + +# io_uart3_fifo_mode --- Set the UART FIFO mode +-io_uart3_fifo_mode 16 + +# io_uart3_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart3_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_creg_mst0 --- com.arc.hardware.dfss.io_creg_mst0.1_0 ######## + +# Create io_creg_mst0 +-create com.arc.hardware.dfss.io_creg_mst0.1_0 System.CPUisle.ARCv2EM.io_creg_mst0 + +# io_creg_mst0 --- Command line option for EIA extension component 'io_creg_mst0'. +-io_creg_mst0 true + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_creg_slv0 --- com.arc.hardware.dfss.io_creg_slv0.1_0 ######## + +# Create io_creg_slv0 +-create com.arc.hardware.dfss.io_creg_slv0.1_0 System.CPUisle.ARCv2EM.io_creg_slv0 + +# io_creg_slv0 --- Command line option for EIA extension component 'io_creg_slv0'. +-io_creg_slv0 true + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## subsys_bcr --- com.arc.hardware.dfss.subsys_bcr.1_0 ######## + +# Create subsys_bcr +-create com.arc.hardware.dfss.subsys_bcr.1_0 System.CPUisle.ARCv2EM.subsys_bcr + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## subsys_infra --- com.arc.hardware.dfss.subsys_infra.1_0 ######## + +# Create subsys_infra +-create com.arc.hardware.dfss.subsys_infra.1_0 System.subsys_infra + +# subsys_infra --- Command line option for EIA glue logic. +-subsys_infra true + +# internal_interrupt --- Connect the IO interrupts internally +-internal_interrupt true + +# internal_dma_handshake --- Connect the DMA handshake signals internally +-internal_dma_handshake true + + +######## ARConnect --- com.arc.hardware.ARConnect.1_0 ######## + +# Create ARConnect +-create com.arc.hardware.ARConnect.1_0 System.ARConnect + +# mcip_def_div2ref --- This specifies the clock division factor at reset. It is used for mss clock controller to generate ARConnect clock, and the value N means ARConnect is running at (1/N) x ref_clk. +-mcip_def_div2ref 1 + +# mcip_has_intrpt --- This specifies whether the Inter-core Interrupt Unit exists +-mcip_has_intrpt false + +# mcip_has_sema --- This specifies whether the Inter-core Semaphore Unit exists +-mcip_has_sema false + +# mcip_sema_num --- This specifies the number of semaphores in the Inter-core Semaphores Unit +-mcip_sema_num 16 + +# mcip_has_msg_sram --- This specifies whether the Inter-core Message Unit exists +-mcip_has_msg_sram false + +# mcip_msg_sram_size --- This specifies the bytes of SRAM in the Inter-core Message Unit +-mcip_msg_sram_size 512 + +# mcip_msg_1cycle --- True: The access path to message SRAM is 1 clock cycle; False: The access path to message SRAM 1.5 cycles. Note: The 1.5 cycles path use clock negetive edge for SRAM, but can acheive higher frequency. No performance difference caused by the value of this option +-mcip_msg_1cycle false + +# mcip_has_debug --- This specifies whether the Inter-core Debug Unit exists +-mcip_has_debug false + +# mcip_has_grtc --- This specifies whether the Global Real-Time Counter Unit exists +-mcip_has_grtc false + +# mcip_has_pmu --- This specifies whether the external Power Management Unit exists +-mcip_has_pmu true + +# mcip_power_domains --- This specifies whether the ARConnect Power Domain Management Unit exists +-mcip_power_domains true + +# mcip_llm_size --- This specifies the KBytes of SRAM in the Low Latency Memory Unit +-mcip_llm_size 32 + +# mcip_llm_base --- This specifies the default memory region of Low Latency Memory Unit +-mcip_llm_base 2 + +# mcip_llm_ecc --- This specifies the ECC mode of SRAM in Low Latency Memory Unit. none = No checking; parity = Parity only; SECDED = single-error correction and double-error detection (SECDED) +-mcip_llm_ecc SECDED + +# mcip_idu_cirq_num --- This specifies the number of common interrupts supported by IDU +-mcip_idu_cirq_num 4 + +# mcip_bsu_dbw --- This specifies the data bus width of Bus Slave Unit +-mcip_bsu_dbw 64 + +# mcip_bsu_type --- This specifies the bus protocol of Bus Slave Unit +-mcip_bsu_type AXI + + +]]> + + + + + + + + + + + + + + + ICCM0 + + GROUP: { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32768): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > DCCM + GROUP: { + .Xdata? : {} + } > XCCM + GROUP: { + .Ydata? : {} + } > YCCM + GROUP BIND(0x0): { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:684): {} = FILL(0xa5a5a5a5,4) + } + } + +]]> + + + + + + 0x07, sub_opcode => 0x1E , latency_cycles => 8) + +// User extension instruction - dsp_sin +extern long dsp_sin(long); +#pragma intrinsic(dsp_sin, opcode => 0x07, sub_opcode => 0x1F , latency_cycles => 8) + +// User extension instruction - dsp_tan +extern long dsp_tan(long); +#pragma intrinsic(dsp_tan, opcode => 0x07, sub_opcode => 0x22 , latency_cycles => 11) + +// User extension instruction - dsp_acos +extern long dsp_acos(long); +#pragma intrinsic(dsp_acos, opcode => 0x07, sub_opcode => 0x23 , latency_cycles => 31) + +// User extension instruction - dsp_asin +extern long dsp_asin(long); +#pragma intrinsic(dsp_asin, opcode => 0x07, sub_opcode => 0x24 , latency_cycles => 31) + +// User extension instruction - dsp_atan +extern long dsp_atan(long); +#pragma intrinsic(dsp_atan, opcode => 0x07, sub_opcode => 0x25 , latency_cycles => 13) + +// User extension instruction - dsp_sqrt +extern long dsp_sqrt(long); +#pragma intrinsic(dsp_sqrt, opcode => 0x07, sub_opcode => 0x20 , latency_cycles => 31) + +// User extension instruction - dsp_sqrt15 +extern long dsp_sqrt15(long); +#pragma intrinsic(dsp_sqrt15, opcode => 0x07, sub_opcode => 0x21 , latency_cycles => 15) + +#define APEX_COM_ARC_HARDWARE_DFSS_DSP_TRIG_PRESENT 1 +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B0_IO_GPIO_4B0_PRESENT 1 + +// User extension aux register io_gpio_4b0_debounce +#define AR_IO_GPIO_4B0_DEBOUNCE 0x80017c48 +#pragma Aux_register(0x80017c48, name=>"io_gpio_4b0_debounce") + +// User extension aux register io_gpio_4b0_clken +#define AR_IO_GPIO_4B0_CLKEN 0x80017c80 +#pragma Aux_register(0x80017c80, name=>"io_gpio_4b0_clken") + +// User extension aux register io_gpio_4b0_swporta_dr +#define AR_IO_GPIO_4B0_SWPORTA_DR 0x80017c00 +#pragma Aux_register(0x80017c00, name=>"io_gpio_4b0_swporta_dr") + +// User extension aux register io_gpio_4b0_swporta_ddr +#define AR_IO_GPIO_4B0_SWPORTA_DDR 0x80017c04 +#pragma Aux_register(0x80017c04, name=>"io_gpio_4b0_swporta_ddr") + +// User extension aux register io_gpio_4b0_inten +#define AR_IO_GPIO_4B0_INTEN 0x80017c30 +#pragma Aux_register(0x80017c30, name=>"io_gpio_4b0_inten") + +// User extension aux register io_gpio_4b0_intmask +#define AR_IO_GPIO_4B0_INTMASK 0x80017c34 +#pragma Aux_register(0x80017c34, name=>"io_gpio_4b0_intmask") + +// User extension aux register io_gpio_4b0_inttype_level +#define AR_IO_GPIO_4B0_INTTYPE_LEVEL 0x80017c38 +#pragma Aux_register(0x80017c38, name=>"io_gpio_4b0_inttype_level") + +// User extension aux register io_gpio_4b0_int_polarity +#define AR_IO_GPIO_4B0_INT_POLARITY 0x80017c3c +#pragma Aux_register(0x80017c3c, name=>"io_gpio_4b0_int_polarity") + +// User extension aux register io_gpio_4b0_intstatus +#define AR_IO_GPIO_4B0_INTSTATUS 0x80017c40 +#pragma Aux_register(0x80017c40, name=>"io_gpio_4b0_intstatus") + +// User extension aux register io_gpio_4b0_raw_intstatus +#define AR_IO_GPIO_4B0_RAW_INTSTATUS 0x80017c44 +#pragma Aux_register(0x80017c44, name=>"io_gpio_4b0_raw_intstatus") + +// User extension aux register io_gpio_4b0_porta_eoi +#define AR_IO_GPIO_4B0_PORTA_EOI 0x80017c4c +#pragma Aux_register(0x80017c4c, name=>"io_gpio_4b0_porta_eoi") + +// User extension aux register io_gpio_4b0_ext_porta +#define AR_IO_GPIO_4B0_EXT_PORTA 0x80017c50 +#pragma Aux_register(0x80017c50, name=>"io_gpio_4b0_ext_porta") + +// User extension aux register io_gpio_4b0_ls_sync +#define AR_IO_GPIO_4B0_LS_SYNC 0x80017c60 +#pragma Aux_register(0x80017c60, name=>"io_gpio_4b0_ls_sync") + +// User extension aux register io_gpio_4b0_int_bothedge +#define AR_IO_GPIO_4B0_INT_BOTHEDGE 0x80017c68 +#pragma Aux_register(0x80017c68, name=>"io_gpio_4b0_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B1_IO_GPIO_4B1_PRESENT 1 + +// User extension aux register io_gpio_4b1_debounce +#define AR_IO_GPIO_4B1_DEBOUNCE 0x80017d48 +#pragma Aux_register(0x80017d48, name=>"io_gpio_4b1_debounce") + +// User extension aux register io_gpio_4b1_clken +#define AR_IO_GPIO_4B1_CLKEN 0x80017d80 +#pragma Aux_register(0x80017d80, name=>"io_gpio_4b1_clken") + +// User extension aux register io_gpio_4b1_swporta_dr +#define AR_IO_GPIO_4B1_SWPORTA_DR 0x80017d00 +#pragma Aux_register(0x80017d00, name=>"io_gpio_4b1_swporta_dr") + +// User extension aux register io_gpio_4b1_swporta_ddr +#define AR_IO_GPIO_4B1_SWPORTA_DDR 0x80017d04 +#pragma Aux_register(0x80017d04, name=>"io_gpio_4b1_swporta_ddr") + +// User extension aux register io_gpio_4b1_inten +#define AR_IO_GPIO_4B1_INTEN 0x80017d30 +#pragma Aux_register(0x80017d30, name=>"io_gpio_4b1_inten") + +// User extension aux register io_gpio_4b1_intmask +#define AR_IO_GPIO_4B1_INTMASK 0x80017d34 +#pragma Aux_register(0x80017d34, name=>"io_gpio_4b1_intmask") + +// User extension aux register io_gpio_4b1_inttype_level +#define AR_IO_GPIO_4B1_INTTYPE_LEVEL 0x80017d38 +#pragma Aux_register(0x80017d38, name=>"io_gpio_4b1_inttype_level") + +// User extension aux register io_gpio_4b1_int_polarity +#define AR_IO_GPIO_4B1_INT_POLARITY 0x80017d3c +#pragma Aux_register(0x80017d3c, name=>"io_gpio_4b1_int_polarity") + +// User extension aux register io_gpio_4b1_intstatus +#define AR_IO_GPIO_4B1_INTSTATUS 0x80017d40 +#pragma Aux_register(0x80017d40, name=>"io_gpio_4b1_intstatus") + +// User extension aux register io_gpio_4b1_raw_intstatus +#define AR_IO_GPIO_4B1_RAW_INTSTATUS 0x80017d44 +#pragma Aux_register(0x80017d44, name=>"io_gpio_4b1_raw_intstatus") + +// User extension aux register io_gpio_4b1_porta_eoi +#define AR_IO_GPIO_4B1_PORTA_EOI 0x80017d4c +#pragma Aux_register(0x80017d4c, name=>"io_gpio_4b1_porta_eoi") + +// User extension aux register io_gpio_4b1_ext_porta +#define AR_IO_GPIO_4B1_EXT_PORTA 0x80017d50 +#pragma Aux_register(0x80017d50, name=>"io_gpio_4b1_ext_porta") + +// User extension aux register io_gpio_4b1_ls_sync +#define AR_IO_GPIO_4B1_LS_SYNC 0x80017d60 +#pragma Aux_register(0x80017d60, name=>"io_gpio_4b1_ls_sync") + +// User extension aux register io_gpio_4b1_int_bothedge +#define AR_IO_GPIO_4B1_INT_BOTHEDGE 0x80017d68 +#pragma Aux_register(0x80017d68, name=>"io_gpio_4b1_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B2_IO_GPIO_4B2_PRESENT 1 + +// User extension aux register io_gpio_4b2_debounce +#define AR_IO_GPIO_4B2_DEBOUNCE 0x80017e48 +#pragma Aux_register(0x80017e48, name=>"io_gpio_4b2_debounce") + +// User extension aux register io_gpio_4b2_clken +#define AR_IO_GPIO_4B2_CLKEN 0x80017e80 +#pragma Aux_register(0x80017e80, name=>"io_gpio_4b2_clken") + +// User extension aux register io_gpio_4b2_swporta_dr +#define AR_IO_GPIO_4B2_SWPORTA_DR 0x80017e00 +#pragma Aux_register(0x80017e00, name=>"io_gpio_4b2_swporta_dr") + +// User extension aux register io_gpio_4b2_swporta_ddr +#define AR_IO_GPIO_4B2_SWPORTA_DDR 0x80017e04 +#pragma Aux_register(0x80017e04, name=>"io_gpio_4b2_swporta_ddr") + +// User extension aux register io_gpio_4b2_inten +#define AR_IO_GPIO_4B2_INTEN 0x80017e30 +#pragma Aux_register(0x80017e30, name=>"io_gpio_4b2_inten") + +// User extension aux register io_gpio_4b2_intmask +#define AR_IO_GPIO_4B2_INTMASK 0x80017e34 +#pragma Aux_register(0x80017e34, name=>"io_gpio_4b2_intmask") + +// User extension aux register io_gpio_4b2_inttype_level +#define AR_IO_GPIO_4B2_INTTYPE_LEVEL 0x80017e38 +#pragma Aux_register(0x80017e38, name=>"io_gpio_4b2_inttype_level") + +// User extension aux register io_gpio_4b2_int_polarity +#define AR_IO_GPIO_4B2_INT_POLARITY 0x80017e3c +#pragma Aux_register(0x80017e3c, name=>"io_gpio_4b2_int_polarity") + +// User extension aux register io_gpio_4b2_intstatus +#define AR_IO_GPIO_4B2_INTSTATUS 0x80017e40 +#pragma Aux_register(0x80017e40, name=>"io_gpio_4b2_intstatus") + +// User extension aux register io_gpio_4b2_raw_intstatus +#define AR_IO_GPIO_4B2_RAW_INTSTATUS 0x80017e44 +#pragma Aux_register(0x80017e44, name=>"io_gpio_4b2_raw_intstatus") + +// User extension aux register io_gpio_4b2_porta_eoi +#define AR_IO_GPIO_4B2_PORTA_EOI 0x80017e4c +#pragma Aux_register(0x80017e4c, name=>"io_gpio_4b2_porta_eoi") + +// User extension aux register io_gpio_4b2_ext_porta +#define AR_IO_GPIO_4B2_EXT_PORTA 0x80017e50 +#pragma Aux_register(0x80017e50, name=>"io_gpio_4b2_ext_porta") + +// User extension aux register io_gpio_4b2_ls_sync +#define AR_IO_GPIO_4B2_LS_SYNC 0x80017e60 +#pragma Aux_register(0x80017e60, name=>"io_gpio_4b2_ls_sync") + +// User extension aux register io_gpio_4b2_int_bothedge +#define AR_IO_GPIO_4B2_INT_BOTHEDGE 0x80017e68 +#pragma Aux_register(0x80017e68, name=>"io_gpio_4b2_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B0_IO_GPIO_8B0_PRESENT 1 + +// User extension aux register io_gpio_8b0_debounce +#define AR_IO_GPIO_8B0_DEBOUNCE 0x80017848 +#pragma Aux_register(0x80017848, name=>"io_gpio_8b0_debounce") + +// User extension aux register io_gpio_8b0_clken +#define AR_IO_GPIO_8B0_CLKEN 0x80017880 +#pragma Aux_register(0x80017880, name=>"io_gpio_8b0_clken") + +// User extension aux register io_gpio_8b0_swporta_dr +#define AR_IO_GPIO_8B0_SWPORTA_DR 0x80017800 +#pragma Aux_register(0x80017800, name=>"io_gpio_8b0_swporta_dr") + +// User extension aux register io_gpio_8b0_swporta_ddr +#define AR_IO_GPIO_8B0_SWPORTA_DDR 0x80017804 +#pragma Aux_register(0x80017804, name=>"io_gpio_8b0_swporta_ddr") + +// User extension aux register io_gpio_8b0_inten +#define AR_IO_GPIO_8B0_INTEN 0x80017830 +#pragma Aux_register(0x80017830, name=>"io_gpio_8b0_inten") + +// User extension aux register io_gpio_8b0_intmask +#define AR_IO_GPIO_8B0_INTMASK 0x80017834 +#pragma Aux_register(0x80017834, name=>"io_gpio_8b0_intmask") + +// User extension aux register io_gpio_8b0_inttype_level +#define AR_IO_GPIO_8B0_INTTYPE_LEVEL 0x80017838 +#pragma Aux_register(0x80017838, name=>"io_gpio_8b0_inttype_level") + +// User extension aux register io_gpio_8b0_int_polarity +#define AR_IO_GPIO_8B0_INT_POLARITY 0x8001783c +#pragma Aux_register(0x8001783c, name=>"io_gpio_8b0_int_polarity") + +// User extension aux register io_gpio_8b0_intstatus +#define AR_IO_GPIO_8B0_INTSTATUS 0x80017840 +#pragma Aux_register(0x80017840, name=>"io_gpio_8b0_intstatus") + +// User extension aux register io_gpio_8b0_raw_intstatus +#define AR_IO_GPIO_8B0_RAW_INTSTATUS 0x80017844 +#pragma Aux_register(0x80017844, name=>"io_gpio_8b0_raw_intstatus") + +// User extension aux register io_gpio_8b0_porta_eoi +#define AR_IO_GPIO_8B0_PORTA_EOI 0x8001784c +#pragma Aux_register(0x8001784c, name=>"io_gpio_8b0_porta_eoi") + +// User extension aux register io_gpio_8b0_ext_porta +#define AR_IO_GPIO_8B0_EXT_PORTA 0x80017850 +#pragma Aux_register(0x80017850, name=>"io_gpio_8b0_ext_porta") + +// User extension aux register io_gpio_8b0_ls_sync +#define AR_IO_GPIO_8B0_LS_SYNC 0x80017860 +#pragma Aux_register(0x80017860, name=>"io_gpio_8b0_ls_sync") + +// User extension aux register io_gpio_8b0_int_bothedge +#define AR_IO_GPIO_8B0_INT_BOTHEDGE 0x80017868 +#pragma Aux_register(0x80017868, name=>"io_gpio_8b0_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B1_IO_GPIO_8B1_PRESENT 1 + +// User extension aux register io_gpio_8b1_debounce +#define AR_IO_GPIO_8B1_DEBOUNCE 0x80017948 +#pragma Aux_register(0x80017948, name=>"io_gpio_8b1_debounce") + +// User extension aux register io_gpio_8b1_clken +#define AR_IO_GPIO_8B1_CLKEN 0x80017980 +#pragma Aux_register(0x80017980, name=>"io_gpio_8b1_clken") + +// User extension aux register io_gpio_8b1_swporta_dr +#define AR_IO_GPIO_8B1_SWPORTA_DR 0x80017900 +#pragma Aux_register(0x80017900, name=>"io_gpio_8b1_swporta_dr") + +// User extension aux register io_gpio_8b1_swporta_ddr +#define AR_IO_GPIO_8B1_SWPORTA_DDR 0x80017904 +#pragma Aux_register(0x80017904, name=>"io_gpio_8b1_swporta_ddr") + +// User extension aux register io_gpio_8b1_inten +#define AR_IO_GPIO_8B1_INTEN 0x80017930 +#pragma Aux_register(0x80017930, name=>"io_gpio_8b1_inten") + +// User extension aux register io_gpio_8b1_intmask +#define AR_IO_GPIO_8B1_INTMASK 0x80017934 +#pragma Aux_register(0x80017934, name=>"io_gpio_8b1_intmask") + +// User extension aux register io_gpio_8b1_inttype_level +#define AR_IO_GPIO_8B1_INTTYPE_LEVEL 0x80017938 +#pragma Aux_register(0x80017938, name=>"io_gpio_8b1_inttype_level") + +// User extension aux register io_gpio_8b1_int_polarity +#define AR_IO_GPIO_8B1_INT_POLARITY 0x8001793c +#pragma Aux_register(0x8001793c, name=>"io_gpio_8b1_int_polarity") + +// User extension aux register io_gpio_8b1_intstatus +#define AR_IO_GPIO_8B1_INTSTATUS 0x80017940 +#pragma Aux_register(0x80017940, name=>"io_gpio_8b1_intstatus") + +// User extension aux register io_gpio_8b1_raw_intstatus +#define AR_IO_GPIO_8B1_RAW_INTSTATUS 0x80017944 +#pragma Aux_register(0x80017944, name=>"io_gpio_8b1_raw_intstatus") + +// User extension aux register io_gpio_8b1_porta_eoi +#define AR_IO_GPIO_8B1_PORTA_EOI 0x8001794c +#pragma Aux_register(0x8001794c, name=>"io_gpio_8b1_porta_eoi") + +// User extension aux register io_gpio_8b1_ext_porta +#define AR_IO_GPIO_8B1_EXT_PORTA 0x80017950 +#pragma Aux_register(0x80017950, name=>"io_gpio_8b1_ext_porta") + +// User extension aux register io_gpio_8b1_ls_sync +#define AR_IO_GPIO_8B1_LS_SYNC 0x80017960 +#pragma Aux_register(0x80017960, name=>"io_gpio_8b1_ls_sync") + +// User extension aux register io_gpio_8b1_int_bothedge +#define AR_IO_GPIO_8B1_INT_BOTHEDGE 0x80017968 +#pragma Aux_register(0x80017968, name=>"io_gpio_8b1_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B2_IO_GPIO_8B2_PRESENT 1 + +// User extension aux register io_gpio_8b2_debounce +#define AR_IO_GPIO_8B2_DEBOUNCE 0x80017a48 +#pragma Aux_register(0x80017a48, name=>"io_gpio_8b2_debounce") + +// User extension aux register io_gpio_8b2_clken +#define AR_IO_GPIO_8B2_CLKEN 0x80017a80 +#pragma Aux_register(0x80017a80, name=>"io_gpio_8b2_clken") + +// User extension aux register io_gpio_8b2_swporta_dr +#define AR_IO_GPIO_8B2_SWPORTA_DR 0x80017a00 +#pragma Aux_register(0x80017a00, name=>"io_gpio_8b2_swporta_dr") + +// User extension aux register io_gpio_8b2_swporta_ddr +#define AR_IO_GPIO_8B2_SWPORTA_DDR 0x80017a04 +#pragma Aux_register(0x80017a04, name=>"io_gpio_8b2_swporta_ddr") + +// User extension aux register io_gpio_8b2_inten +#define AR_IO_GPIO_8B2_INTEN 0x80017a30 +#pragma Aux_register(0x80017a30, name=>"io_gpio_8b2_inten") + +// User extension aux register io_gpio_8b2_intmask +#define AR_IO_GPIO_8B2_INTMASK 0x80017a34 +#pragma Aux_register(0x80017a34, name=>"io_gpio_8b2_intmask") + +// User extension aux register io_gpio_8b2_inttype_level +#define AR_IO_GPIO_8B2_INTTYPE_LEVEL 0x80017a38 +#pragma Aux_register(0x80017a38, name=>"io_gpio_8b2_inttype_level") + +// User extension aux register io_gpio_8b2_int_polarity +#define AR_IO_GPIO_8B2_INT_POLARITY 0x80017a3c +#pragma Aux_register(0x80017a3c, name=>"io_gpio_8b2_int_polarity") + +// User extension aux register io_gpio_8b2_intstatus +#define AR_IO_GPIO_8B2_INTSTATUS 0x80017a40 +#pragma Aux_register(0x80017a40, name=>"io_gpio_8b2_intstatus") + +// User extension aux register io_gpio_8b2_raw_intstatus +#define AR_IO_GPIO_8B2_RAW_INTSTATUS 0x80017a44 +#pragma Aux_register(0x80017a44, name=>"io_gpio_8b2_raw_intstatus") + +// User extension aux register io_gpio_8b2_porta_eoi +#define AR_IO_GPIO_8B2_PORTA_EOI 0x80017a4c +#pragma Aux_register(0x80017a4c, name=>"io_gpio_8b2_porta_eoi") + +// User extension aux register io_gpio_8b2_ext_porta +#define AR_IO_GPIO_8B2_EXT_PORTA 0x80017a50 +#pragma Aux_register(0x80017a50, name=>"io_gpio_8b2_ext_porta") + +// User extension aux register io_gpio_8b2_ls_sync +#define AR_IO_GPIO_8B2_LS_SYNC 0x80017a60 +#pragma Aux_register(0x80017a60, name=>"io_gpio_8b2_ls_sync") + +// User extension aux register io_gpio_8b2_int_bothedge +#define AR_IO_GPIO_8B2_INT_BOTHEDGE 0x80017a68 +#pragma Aux_register(0x80017a68, name=>"io_gpio_8b2_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B3_IO_GPIO_8B3_PRESENT 1 + +// User extension aux register io_gpio_8b3_debounce +#define AR_IO_GPIO_8B3_DEBOUNCE 0x80017b48 +#pragma Aux_register(0x80017b48, name=>"io_gpio_8b3_debounce") + +// User extension aux register io_gpio_8b3_clken +#define AR_IO_GPIO_8B3_CLKEN 0x80017b80 +#pragma Aux_register(0x80017b80, name=>"io_gpio_8b3_clken") + +// User extension aux register io_gpio_8b3_swporta_dr +#define AR_IO_GPIO_8B3_SWPORTA_DR 0x80017b00 +#pragma Aux_register(0x80017b00, name=>"io_gpio_8b3_swporta_dr") + +// User extension aux register io_gpio_8b3_swporta_ddr +#define AR_IO_GPIO_8B3_SWPORTA_DDR 0x80017b04 +#pragma Aux_register(0x80017b04, name=>"io_gpio_8b3_swporta_ddr") + +// User extension aux register io_gpio_8b3_inten +#define AR_IO_GPIO_8B3_INTEN 0x80017b30 +#pragma Aux_register(0x80017b30, name=>"io_gpio_8b3_inten") + +// User extension aux register io_gpio_8b3_intmask +#define AR_IO_GPIO_8B3_INTMASK 0x80017b34 +#pragma Aux_register(0x80017b34, name=>"io_gpio_8b3_intmask") + +// User extension aux register io_gpio_8b3_inttype_level +#define AR_IO_GPIO_8B3_INTTYPE_LEVEL 0x80017b38 +#pragma Aux_register(0x80017b38, name=>"io_gpio_8b3_inttype_level") + +// User extension aux register io_gpio_8b3_int_polarity +#define AR_IO_GPIO_8B3_INT_POLARITY 0x80017b3c +#pragma Aux_register(0x80017b3c, name=>"io_gpio_8b3_int_polarity") + +// User extension aux register io_gpio_8b3_intstatus +#define AR_IO_GPIO_8B3_INTSTATUS 0x80017b40 +#pragma Aux_register(0x80017b40, name=>"io_gpio_8b3_intstatus") + +// User extension aux register io_gpio_8b3_raw_intstatus +#define AR_IO_GPIO_8B3_RAW_INTSTATUS 0x80017b44 +#pragma Aux_register(0x80017b44, name=>"io_gpio_8b3_raw_intstatus") + +// User extension aux register io_gpio_8b3_porta_eoi +#define AR_IO_GPIO_8B3_PORTA_EOI 0x80017b4c +#pragma Aux_register(0x80017b4c, name=>"io_gpio_8b3_porta_eoi") + +// User extension aux register io_gpio_8b3_ext_porta +#define AR_IO_GPIO_8B3_EXT_PORTA 0x80017b50 +#pragma Aux_register(0x80017b50, name=>"io_gpio_8b3_ext_porta") + +// User extension aux register io_gpio_8b3_ls_sync +#define AR_IO_GPIO_8B3_LS_SYNC 0x80017b60 +#pragma Aux_register(0x80017b60, name=>"io_gpio_8b3_ls_sync") + +// User extension aux register io_gpio_8b3_int_bothedge +#define AR_IO_GPIO_8B3_INT_BOTHEDGE 0x80017b68 +#pragma Aux_register(0x80017b68, name=>"io_gpio_8b3_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST0_IO_I2C_MST0_PRESENT 1 + +// User extension aux register io_i2c_mst0_clken +#define AR_IO_I2C_MST0_CLKEN 0x800120c0 +#pragma Aux_register(0x800120c0, name=>"io_i2c_mst0_clken") + +// User extension aux register io_i2c_mst0_con +#define AR_IO_I2C_MST0_CON 0x80012000 +#pragma Aux_register(0x80012000, name=>"io_i2c_mst0_con") + +// User extension aux register io_i2c_mst0_tar +#define AR_IO_I2C_MST0_TAR 0x80012004 +#pragma Aux_register(0x80012004, name=>"io_i2c_mst0_tar") + +// User extension aux register io_i2c_mst0_data_cmd +#define AR_IO_I2C_MST0_DATA_CMD 0x80012010 +#pragma Aux_register(0x80012010, name=>"io_i2c_mst0_data_cmd") + +// User extension aux register io_i2c_mst0_ss_scl_hcnt +#define AR_IO_I2C_MST0_SS_SCL_HCNT 0x80012014 +#pragma Aux_register(0x80012014, name=>"io_i2c_mst0_ss_scl_hcnt") + +// User extension aux register io_i2c_mst0_ss_scl_lcnt +#define AR_IO_I2C_MST0_SS_SCL_LCNT 0x80012018 +#pragma Aux_register(0x80012018, name=>"io_i2c_mst0_ss_scl_lcnt") + +// User extension aux register io_i2c_mst0_fs_scl_hcnt +#define AR_IO_I2C_MST0_FS_SCL_HCNT 0x8001201c +#pragma Aux_register(0x8001201c, name=>"io_i2c_mst0_fs_scl_hcnt") + +// User extension aux register io_i2c_mst0_fs_scl_lcnt +#define AR_IO_I2C_MST0_FS_SCL_LCNT 0x80012020 +#pragma Aux_register(0x80012020, name=>"io_i2c_mst0_fs_scl_lcnt") + +// User extension aux register io_i2c_mst0_intr_stat +#define AR_IO_I2C_MST0_INTR_STAT 0x8001202c +#pragma Aux_register(0x8001202c, name=>"io_i2c_mst0_intr_stat") + +// User extension aux register io_i2c_mst0_intr_mask +#define AR_IO_I2C_MST0_INTR_MASK 0x80012030 +#pragma Aux_register(0x80012030, name=>"io_i2c_mst0_intr_mask") + +// User extension aux register io_i2c_mst0_raw_intr_stat +#define AR_IO_I2C_MST0_RAW_INTR_STAT 0x80012034 +#pragma Aux_register(0x80012034, name=>"io_i2c_mst0_raw_intr_stat") + +// User extension aux register io_i2c_mst0_rx_tl +#define AR_IO_I2C_MST0_RX_TL 0x80012038 +#pragma Aux_register(0x80012038, name=>"io_i2c_mst0_rx_tl") + +// User extension aux register io_i2c_mst0_tx_tl +#define AR_IO_I2C_MST0_TX_TL 0x8001203c +#pragma Aux_register(0x8001203c, name=>"io_i2c_mst0_tx_tl") + +// User extension aux register io_i2c_mst0_clr_intr +#define AR_IO_I2C_MST0_CLR_INTR 0x80012040 +#pragma Aux_register(0x80012040, name=>"io_i2c_mst0_clr_intr") + +// User extension aux register io_i2c_mst0_clr_rx_under +#define AR_IO_I2C_MST0_CLR_RX_UNDER 0x80012044 +#pragma Aux_register(0x80012044, name=>"io_i2c_mst0_clr_rx_under") + +// User extension aux register io_i2c_mst0_clr_rx_over +#define AR_IO_I2C_MST0_CLR_RX_OVER 0x80012048 +#pragma Aux_register(0x80012048, name=>"io_i2c_mst0_clr_rx_over") + +// User extension aux register io_i2c_mst0_clr_tx_over +#define AR_IO_I2C_MST0_CLR_TX_OVER 0x8001204c +#pragma Aux_register(0x8001204c, name=>"io_i2c_mst0_clr_tx_over") + +// User extension aux register io_i2c_mst0_clr_tx_abrt +#define AR_IO_I2C_MST0_CLR_TX_ABRT 0x80012054 +#pragma Aux_register(0x80012054, name=>"io_i2c_mst0_clr_tx_abrt") + +// User extension aux register io_i2c_mst0_clr_activity +#define AR_IO_I2C_MST0_CLR_ACTIVITY 0x8001205c +#pragma Aux_register(0x8001205c, name=>"io_i2c_mst0_clr_activity") + +// User extension aux register io_i2c_mst0_clr_stop_det +#define AR_IO_I2C_MST0_CLR_STOP_DET 0x80012060 +#pragma Aux_register(0x80012060, name=>"io_i2c_mst0_clr_stop_det") + +// User extension aux register io_i2c_mst0_clr_start_det +#define AR_IO_I2C_MST0_CLR_START_DET 0x80012064 +#pragma Aux_register(0x80012064, name=>"io_i2c_mst0_clr_start_det") + +// User extension aux register io_i2c_mst0_enable +#define AR_IO_I2C_MST0_ENABLE 0x8001206c +#pragma Aux_register(0x8001206c, name=>"io_i2c_mst0_enable") + +// User extension aux register io_i2c_mst0_status +#define AR_IO_I2C_MST0_STATUS 0x80012070 +#pragma Aux_register(0x80012070, name=>"io_i2c_mst0_status") + +// User extension aux register io_i2c_mst0_txflr +#define AR_IO_I2C_MST0_TXFLR 0x80012074 +#pragma Aux_register(0x80012074, name=>"io_i2c_mst0_txflr") + +// User extension aux register io_i2c_mst0_rxflr +#define AR_IO_I2C_MST0_RXFLR 0x80012078 +#pragma Aux_register(0x80012078, name=>"io_i2c_mst0_rxflr") + +// User extension aux register io_i2c_mst0_sda_hold +#define AR_IO_I2C_MST0_SDA_HOLD 0x8001207c +#pragma Aux_register(0x8001207c, name=>"io_i2c_mst0_sda_hold") + +// User extension aux register io_i2c_mst0_tx_abrt_source +#define AR_IO_I2C_MST0_TX_ABRT_SOURCE 0x80012080 +#pragma Aux_register(0x80012080, name=>"io_i2c_mst0_tx_abrt_source") + +// User extension aux register io_i2c_mst0_enable_status +#define AR_IO_I2C_MST0_ENABLE_STATUS 0x8001209c +#pragma Aux_register(0x8001209c, name=>"io_i2c_mst0_enable_status") + +// User extension aux register io_i2c_mst0_fs_spklen +#define AR_IO_I2C_MST0_FS_SPKLEN 0x800120a0 +#pragma Aux_register(0x800120a0, name=>"io_i2c_mst0_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST1_IO_I2C_MST1_PRESENT 1 + +// User extension aux register io_i2c_mst1_clken +#define AR_IO_I2C_MST1_CLKEN 0x800121c0 +#pragma Aux_register(0x800121c0, name=>"io_i2c_mst1_clken") + +// User extension aux register io_i2c_mst1_con +#define AR_IO_I2C_MST1_CON 0x80012100 +#pragma Aux_register(0x80012100, name=>"io_i2c_mst1_con") + +// User extension aux register io_i2c_mst1_tar +#define AR_IO_I2C_MST1_TAR 0x80012104 +#pragma Aux_register(0x80012104, name=>"io_i2c_mst1_tar") + +// User extension aux register io_i2c_mst1_data_cmd +#define AR_IO_I2C_MST1_DATA_CMD 0x80012110 +#pragma Aux_register(0x80012110, name=>"io_i2c_mst1_data_cmd") + +// User extension aux register io_i2c_mst1_ss_scl_hcnt +#define AR_IO_I2C_MST1_SS_SCL_HCNT 0x80012114 +#pragma Aux_register(0x80012114, name=>"io_i2c_mst1_ss_scl_hcnt") + +// User extension aux register io_i2c_mst1_ss_scl_lcnt +#define AR_IO_I2C_MST1_SS_SCL_LCNT 0x80012118 +#pragma Aux_register(0x80012118, name=>"io_i2c_mst1_ss_scl_lcnt") + +// User extension aux register io_i2c_mst1_fs_scl_hcnt +#define AR_IO_I2C_MST1_FS_SCL_HCNT 0x8001211c +#pragma Aux_register(0x8001211c, name=>"io_i2c_mst1_fs_scl_hcnt") + +// User extension aux register io_i2c_mst1_fs_scl_lcnt +#define AR_IO_I2C_MST1_FS_SCL_LCNT 0x80012120 +#pragma Aux_register(0x80012120, name=>"io_i2c_mst1_fs_scl_lcnt") + +// User extension aux register io_i2c_mst1_intr_stat +#define AR_IO_I2C_MST1_INTR_STAT 0x8001212c +#pragma Aux_register(0x8001212c, name=>"io_i2c_mst1_intr_stat") + +// User extension aux register io_i2c_mst1_intr_mask +#define AR_IO_I2C_MST1_INTR_MASK 0x80012130 +#pragma Aux_register(0x80012130, name=>"io_i2c_mst1_intr_mask") + +// User extension aux register io_i2c_mst1_raw_intr_stat +#define AR_IO_I2C_MST1_RAW_INTR_STAT 0x80012134 +#pragma Aux_register(0x80012134, name=>"io_i2c_mst1_raw_intr_stat") + +// User extension aux register io_i2c_mst1_rx_tl +#define AR_IO_I2C_MST1_RX_TL 0x80012138 +#pragma Aux_register(0x80012138, name=>"io_i2c_mst1_rx_tl") + +// User extension aux register io_i2c_mst1_tx_tl +#define AR_IO_I2C_MST1_TX_TL 0x8001213c +#pragma Aux_register(0x8001213c, name=>"io_i2c_mst1_tx_tl") + +// User extension aux register io_i2c_mst1_clr_intr +#define AR_IO_I2C_MST1_CLR_INTR 0x80012140 +#pragma Aux_register(0x80012140, name=>"io_i2c_mst1_clr_intr") + +// User extension aux register io_i2c_mst1_clr_rx_under +#define AR_IO_I2C_MST1_CLR_RX_UNDER 0x80012144 +#pragma Aux_register(0x80012144, name=>"io_i2c_mst1_clr_rx_under") + +// User extension aux register io_i2c_mst1_clr_rx_over +#define AR_IO_I2C_MST1_CLR_RX_OVER 0x80012148 +#pragma Aux_register(0x80012148, name=>"io_i2c_mst1_clr_rx_over") + +// User extension aux register io_i2c_mst1_clr_tx_over +#define AR_IO_I2C_MST1_CLR_TX_OVER 0x8001214c +#pragma Aux_register(0x8001214c, name=>"io_i2c_mst1_clr_tx_over") + +// User extension aux register io_i2c_mst1_clr_tx_abrt +#define AR_IO_I2C_MST1_CLR_TX_ABRT 0x80012154 +#pragma Aux_register(0x80012154, name=>"io_i2c_mst1_clr_tx_abrt") + +// User extension aux register io_i2c_mst1_clr_activity +#define AR_IO_I2C_MST1_CLR_ACTIVITY 0x8001215c +#pragma Aux_register(0x8001215c, name=>"io_i2c_mst1_clr_activity") + +// User extension aux register io_i2c_mst1_clr_stop_det +#define AR_IO_I2C_MST1_CLR_STOP_DET 0x80012160 +#pragma Aux_register(0x80012160, name=>"io_i2c_mst1_clr_stop_det") + +// User extension aux register io_i2c_mst1_clr_start_det +#define AR_IO_I2C_MST1_CLR_START_DET 0x80012164 +#pragma Aux_register(0x80012164, name=>"io_i2c_mst1_clr_start_det") + +// User extension aux register io_i2c_mst1_enable +#define AR_IO_I2C_MST1_ENABLE 0x8001216c +#pragma Aux_register(0x8001216c, name=>"io_i2c_mst1_enable") + +// User extension aux register io_i2c_mst1_status +#define AR_IO_I2C_MST1_STATUS 0x80012170 +#pragma Aux_register(0x80012170, name=>"io_i2c_mst1_status") + +// User extension aux register io_i2c_mst1_txflr +#define AR_IO_I2C_MST1_TXFLR 0x80012174 +#pragma Aux_register(0x80012174, name=>"io_i2c_mst1_txflr") + +// User extension aux register io_i2c_mst1_rxflr +#define AR_IO_I2C_MST1_RXFLR 0x80012178 +#pragma Aux_register(0x80012178, name=>"io_i2c_mst1_rxflr") + +// User extension aux register io_i2c_mst1_sda_hold +#define AR_IO_I2C_MST1_SDA_HOLD 0x8001217c +#pragma Aux_register(0x8001217c, name=>"io_i2c_mst1_sda_hold") + +// User extension aux register io_i2c_mst1_tx_abrt_source +#define AR_IO_I2C_MST1_TX_ABRT_SOURCE 0x80012180 +#pragma Aux_register(0x80012180, name=>"io_i2c_mst1_tx_abrt_source") + +// User extension aux register io_i2c_mst1_enable_status +#define AR_IO_I2C_MST1_ENABLE_STATUS 0x8001219c +#pragma Aux_register(0x8001219c, name=>"io_i2c_mst1_enable_status") + +// User extension aux register io_i2c_mst1_fs_spklen +#define AR_IO_I2C_MST1_FS_SPKLEN 0x800121a0 +#pragma Aux_register(0x800121a0, name=>"io_i2c_mst1_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST2_IO_I2C_MST2_PRESENT 1 + +// User extension aux register io_i2c_mst2_clken +#define AR_IO_I2C_MST2_CLKEN 0x800122c0 +#pragma Aux_register(0x800122c0, name=>"io_i2c_mst2_clken") + +// User extension aux register io_i2c_mst2_con +#define AR_IO_I2C_MST2_CON 0x80012200 +#pragma Aux_register(0x80012200, name=>"io_i2c_mst2_con") + +// User extension aux register io_i2c_mst2_tar +#define AR_IO_I2C_MST2_TAR 0x80012204 +#pragma Aux_register(0x80012204, name=>"io_i2c_mst2_tar") + +// User extension aux register io_i2c_mst2_data_cmd +#define AR_IO_I2C_MST2_DATA_CMD 0x80012210 +#pragma Aux_register(0x80012210, name=>"io_i2c_mst2_data_cmd") + +// User extension aux register io_i2c_mst2_ss_scl_hcnt +#define AR_IO_I2C_MST2_SS_SCL_HCNT 0x80012214 +#pragma Aux_register(0x80012214, name=>"io_i2c_mst2_ss_scl_hcnt") + +// User extension aux register io_i2c_mst2_ss_scl_lcnt +#define AR_IO_I2C_MST2_SS_SCL_LCNT 0x80012218 +#pragma Aux_register(0x80012218, name=>"io_i2c_mst2_ss_scl_lcnt") + +// User extension aux register io_i2c_mst2_fs_scl_hcnt +#define AR_IO_I2C_MST2_FS_SCL_HCNT 0x8001221c +#pragma Aux_register(0x8001221c, name=>"io_i2c_mst2_fs_scl_hcnt") + +// User extension aux register io_i2c_mst2_fs_scl_lcnt +#define AR_IO_I2C_MST2_FS_SCL_LCNT 0x80012220 +#pragma Aux_register(0x80012220, name=>"io_i2c_mst2_fs_scl_lcnt") + +// User extension aux register io_i2c_mst2_intr_stat +#define AR_IO_I2C_MST2_INTR_STAT 0x8001222c +#pragma Aux_register(0x8001222c, name=>"io_i2c_mst2_intr_stat") + +// User extension aux register io_i2c_mst2_intr_mask +#define AR_IO_I2C_MST2_INTR_MASK 0x80012230 +#pragma Aux_register(0x80012230, name=>"io_i2c_mst2_intr_mask") + +// User extension aux register io_i2c_mst2_raw_intr_stat +#define AR_IO_I2C_MST2_RAW_INTR_STAT 0x80012234 +#pragma Aux_register(0x80012234, name=>"io_i2c_mst2_raw_intr_stat") + +// User extension aux register io_i2c_mst2_rx_tl +#define AR_IO_I2C_MST2_RX_TL 0x80012238 +#pragma Aux_register(0x80012238, name=>"io_i2c_mst2_rx_tl") + +// User extension aux register io_i2c_mst2_tx_tl +#define AR_IO_I2C_MST2_TX_TL 0x8001223c +#pragma Aux_register(0x8001223c, name=>"io_i2c_mst2_tx_tl") + +// User extension aux register io_i2c_mst2_clr_intr +#define AR_IO_I2C_MST2_CLR_INTR 0x80012240 +#pragma Aux_register(0x80012240, name=>"io_i2c_mst2_clr_intr") + +// User extension aux register io_i2c_mst2_clr_rx_under +#define AR_IO_I2C_MST2_CLR_RX_UNDER 0x80012244 +#pragma Aux_register(0x80012244, name=>"io_i2c_mst2_clr_rx_under") + +// User extension aux register io_i2c_mst2_clr_rx_over +#define AR_IO_I2C_MST2_CLR_RX_OVER 0x80012248 +#pragma Aux_register(0x80012248, name=>"io_i2c_mst2_clr_rx_over") + +// User extension aux register io_i2c_mst2_clr_tx_over +#define AR_IO_I2C_MST2_CLR_TX_OVER 0x8001224c +#pragma Aux_register(0x8001224c, name=>"io_i2c_mst2_clr_tx_over") + +// User extension aux register io_i2c_mst2_clr_tx_abrt +#define AR_IO_I2C_MST2_CLR_TX_ABRT 0x80012254 +#pragma Aux_register(0x80012254, name=>"io_i2c_mst2_clr_tx_abrt") + +// User extension aux register io_i2c_mst2_clr_activity +#define AR_IO_I2C_MST2_CLR_ACTIVITY 0x8001225c +#pragma Aux_register(0x8001225c, name=>"io_i2c_mst2_clr_activity") + +// User extension aux register io_i2c_mst2_clr_stop_det +#define AR_IO_I2C_MST2_CLR_STOP_DET 0x80012260 +#pragma Aux_register(0x80012260, name=>"io_i2c_mst2_clr_stop_det") + +// User extension aux register io_i2c_mst2_clr_start_det +#define AR_IO_I2C_MST2_CLR_START_DET 0x80012264 +#pragma Aux_register(0x80012264, name=>"io_i2c_mst2_clr_start_det") + +// User extension aux register io_i2c_mst2_enable +#define AR_IO_I2C_MST2_ENABLE 0x8001226c +#pragma Aux_register(0x8001226c, name=>"io_i2c_mst2_enable") + +// User extension aux register io_i2c_mst2_status +#define AR_IO_I2C_MST2_STATUS 0x80012270 +#pragma Aux_register(0x80012270, name=>"io_i2c_mst2_status") + +// User extension aux register io_i2c_mst2_txflr +#define AR_IO_I2C_MST2_TXFLR 0x80012274 +#pragma Aux_register(0x80012274, name=>"io_i2c_mst2_txflr") + +// User extension aux register io_i2c_mst2_rxflr +#define AR_IO_I2C_MST2_RXFLR 0x80012278 +#pragma Aux_register(0x80012278, name=>"io_i2c_mst2_rxflr") + +// User extension aux register io_i2c_mst2_sda_hold +#define AR_IO_I2C_MST2_SDA_HOLD 0x8001227c +#pragma Aux_register(0x8001227c, name=>"io_i2c_mst2_sda_hold") + +// User extension aux register io_i2c_mst2_tx_abrt_source +#define AR_IO_I2C_MST2_TX_ABRT_SOURCE 0x80012280 +#pragma Aux_register(0x80012280, name=>"io_i2c_mst2_tx_abrt_source") + +// User extension aux register io_i2c_mst2_enable_status +#define AR_IO_I2C_MST2_ENABLE_STATUS 0x8001229c +#pragma Aux_register(0x8001229c, name=>"io_i2c_mst2_enable_status") + +// User extension aux register io_i2c_mst2_fs_spklen +#define AR_IO_I2C_MST2_FS_SPKLEN 0x800122a0 +#pragma Aux_register(0x800122a0, name=>"io_i2c_mst2_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST0_IO_SPI_MST0_PRESENT 1 + +// User extension aux register io_spi_mst0_ctrlr0 +#define AR_IO_SPI_MST0_CTRLR0 0x80010000 +#pragma Aux_register(0x80010000, name=>"io_spi_mst0_ctrlr0") + +// User extension aux register io_spi_mst0_ctrlr1 +#define AR_IO_SPI_MST0_CTRLR1 0x80010001 +#pragma Aux_register(0x80010001, name=>"io_spi_mst0_ctrlr1") + +// User extension aux register io_spi_mst0_spien +#define AR_IO_SPI_MST0_SPIEN 0x80010002 +#pragma Aux_register(0x80010002, name=>"io_spi_mst0_spien") + +// User extension aux register io_spi_mst0_ser +#define AR_IO_SPI_MST0_SER 0x80010004 +#pragma Aux_register(0x80010004, name=>"io_spi_mst0_ser") + +// User extension aux register io_spi_mst0_baudr +#define AR_IO_SPI_MST0_BAUDR 0x80010005 +#pragma Aux_register(0x80010005, name=>"io_spi_mst0_baudr") + +// User extension aux register io_spi_mst0_txftlr +#define AR_IO_SPI_MST0_TXFTLR 0x80010006 +#pragma Aux_register(0x80010006, name=>"io_spi_mst0_txftlr") + +// User extension aux register io_spi_mst0_rxftlr +#define AR_IO_SPI_MST0_RXFTLR 0x80010007 +#pragma Aux_register(0x80010007, name=>"io_spi_mst0_rxftlr") + +// User extension aux register io_spi_mst0_txflr +#define AR_IO_SPI_MST0_TXFLR 0x80010008 +#pragma Aux_register(0x80010008, name=>"io_spi_mst0_txflr") + +// User extension aux register io_spi_mst0_rxflr +#define AR_IO_SPI_MST0_RXFLR 0x80010009 +#pragma Aux_register(0x80010009, name=>"io_spi_mst0_rxflr") + +// User extension aux register io_spi_mst0_sr +#define AR_IO_SPI_MST0_SR 0x8001000a +#pragma Aux_register(0x8001000a, name=>"io_spi_mst0_sr") + +// User extension aux register io_spi_mst0_imr +#define AR_IO_SPI_MST0_IMR 0x8001000b +#pragma Aux_register(0x8001000b, name=>"io_spi_mst0_imr") + +// User extension aux register io_spi_mst0_isr +#define AR_IO_SPI_MST0_ISR 0x8001000c +#pragma Aux_register(0x8001000c, name=>"io_spi_mst0_isr") + +// User extension aux register io_spi_mst0_risr +#define AR_IO_SPI_MST0_RISR 0x8001000d +#pragma Aux_register(0x8001000d, name=>"io_spi_mst0_risr") + +// User extension aux register io_spi_mst0_txoicr +#define AR_IO_SPI_MST0_TXOICR 0x8001000e +#pragma Aux_register(0x8001000e, name=>"io_spi_mst0_txoicr") + +// User extension aux register io_spi_mst0_rxoicr +#define AR_IO_SPI_MST0_RXOICR 0x8001000f +#pragma Aux_register(0x8001000f, name=>"io_spi_mst0_rxoicr") + +// User extension aux register io_spi_mst0_rxuicr +#define AR_IO_SPI_MST0_RXUICR 0x80010010 +#pragma Aux_register(0x80010010, name=>"io_spi_mst0_rxuicr") + +// User extension aux register io_spi_mst0_icr +#define AR_IO_SPI_MST0_ICR 0x80010012 +#pragma Aux_register(0x80010012, name=>"io_spi_mst0_icr") + +// User extension aux register io_spi_mst0_clken +#define AR_IO_SPI_MST0_CLKEN 0x80010016 +#pragma Aux_register(0x80010016, name=>"io_spi_mst0_clken") + +// User extension aux register io_spi_mst0_dr +#define AR_IO_SPI_MST0_DR 0x80010018 +#pragma Aux_register(0x80010018, name=>"io_spi_mst0_dr") + +// User extension aux register io_spi_mst0_rx_sample_dly +#define AR_IO_SPI_MST0_RX_SAMPLE_DLY 0x8001003c +#pragma Aux_register(0x8001003c, name=>"io_spi_mst0_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST1_IO_SPI_MST1_PRESENT 1 + +// User extension aux register io_spi_mst1_ctrlr0 +#define AR_IO_SPI_MST1_CTRLR0 0x80010100 +#pragma Aux_register(0x80010100, name=>"io_spi_mst1_ctrlr0") + +// User extension aux register io_spi_mst1_ctrlr1 +#define AR_IO_SPI_MST1_CTRLR1 0x80010101 +#pragma Aux_register(0x80010101, name=>"io_spi_mst1_ctrlr1") + +// User extension aux register io_spi_mst1_spien +#define AR_IO_SPI_MST1_SPIEN 0x80010102 +#pragma Aux_register(0x80010102, name=>"io_spi_mst1_spien") + +// User extension aux register io_spi_mst1_ser +#define AR_IO_SPI_MST1_SER 0x80010104 +#pragma Aux_register(0x80010104, name=>"io_spi_mst1_ser") + +// User extension aux register io_spi_mst1_baudr +#define AR_IO_SPI_MST1_BAUDR 0x80010105 +#pragma Aux_register(0x80010105, name=>"io_spi_mst1_baudr") + +// User extension aux register io_spi_mst1_txftlr +#define AR_IO_SPI_MST1_TXFTLR 0x80010106 +#pragma Aux_register(0x80010106, name=>"io_spi_mst1_txftlr") + +// User extension aux register io_spi_mst1_rxftlr +#define AR_IO_SPI_MST1_RXFTLR 0x80010107 +#pragma Aux_register(0x80010107, name=>"io_spi_mst1_rxftlr") + +// User extension aux register io_spi_mst1_txflr +#define AR_IO_SPI_MST1_TXFLR 0x80010108 +#pragma Aux_register(0x80010108, name=>"io_spi_mst1_txflr") + +// User extension aux register io_spi_mst1_rxflr +#define AR_IO_SPI_MST1_RXFLR 0x80010109 +#pragma Aux_register(0x80010109, name=>"io_spi_mst1_rxflr") + +// User extension aux register io_spi_mst1_sr +#define AR_IO_SPI_MST1_SR 0x8001010a +#pragma Aux_register(0x8001010a, name=>"io_spi_mst1_sr") + +// User extension aux register io_spi_mst1_imr +#define AR_IO_SPI_MST1_IMR 0x8001010b +#pragma Aux_register(0x8001010b, name=>"io_spi_mst1_imr") + +// User extension aux register io_spi_mst1_isr +#define AR_IO_SPI_MST1_ISR 0x8001010c +#pragma Aux_register(0x8001010c, name=>"io_spi_mst1_isr") + +// User extension aux register io_spi_mst1_risr +#define AR_IO_SPI_MST1_RISR 0x8001010d +#pragma Aux_register(0x8001010d, name=>"io_spi_mst1_risr") + +// User extension aux register io_spi_mst1_txoicr +#define AR_IO_SPI_MST1_TXOICR 0x8001010e +#pragma Aux_register(0x8001010e, name=>"io_spi_mst1_txoicr") + +// User extension aux register io_spi_mst1_rxoicr +#define AR_IO_SPI_MST1_RXOICR 0x8001010f +#pragma Aux_register(0x8001010f, name=>"io_spi_mst1_rxoicr") + +// User extension aux register io_spi_mst1_rxuicr +#define AR_IO_SPI_MST1_RXUICR 0x80010110 +#pragma Aux_register(0x80010110, name=>"io_spi_mst1_rxuicr") + +// User extension aux register io_spi_mst1_icr +#define AR_IO_SPI_MST1_ICR 0x80010112 +#pragma Aux_register(0x80010112, name=>"io_spi_mst1_icr") + +// User extension aux register io_spi_mst1_clken +#define AR_IO_SPI_MST1_CLKEN 0x80010116 +#pragma Aux_register(0x80010116, name=>"io_spi_mst1_clken") + +// User extension aux register io_spi_mst1_dr +#define AR_IO_SPI_MST1_DR 0x80010118 +#pragma Aux_register(0x80010118, name=>"io_spi_mst1_dr") + +// User extension aux register io_spi_mst1_rx_sample_dly +#define AR_IO_SPI_MST1_RX_SAMPLE_DLY 0x8001013c +#pragma Aux_register(0x8001013c, name=>"io_spi_mst1_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST2_IO_SPI_MST2_PRESENT 1 + +// User extension aux register io_spi_mst2_ctrlr0 +#define AR_IO_SPI_MST2_CTRLR0 0x80010200 +#pragma Aux_register(0x80010200, name=>"io_spi_mst2_ctrlr0") + +// User extension aux register io_spi_mst2_ctrlr1 +#define AR_IO_SPI_MST2_CTRLR1 0x80010201 +#pragma Aux_register(0x80010201, name=>"io_spi_mst2_ctrlr1") + +// User extension aux register io_spi_mst2_spien +#define AR_IO_SPI_MST2_SPIEN 0x80010202 +#pragma Aux_register(0x80010202, name=>"io_spi_mst2_spien") + +// User extension aux register io_spi_mst2_ser +#define AR_IO_SPI_MST2_SER 0x80010204 +#pragma Aux_register(0x80010204, name=>"io_spi_mst2_ser") + +// User extension aux register io_spi_mst2_baudr +#define AR_IO_SPI_MST2_BAUDR 0x80010205 +#pragma Aux_register(0x80010205, name=>"io_spi_mst2_baudr") + +// User extension aux register io_spi_mst2_txftlr +#define AR_IO_SPI_MST2_TXFTLR 0x80010206 +#pragma Aux_register(0x80010206, name=>"io_spi_mst2_txftlr") + +// User extension aux register io_spi_mst2_rxftlr +#define AR_IO_SPI_MST2_RXFTLR 0x80010207 +#pragma Aux_register(0x80010207, name=>"io_spi_mst2_rxftlr") + +// User extension aux register io_spi_mst2_txflr +#define AR_IO_SPI_MST2_TXFLR 0x80010208 +#pragma Aux_register(0x80010208, name=>"io_spi_mst2_txflr") + +// User extension aux register io_spi_mst2_rxflr +#define AR_IO_SPI_MST2_RXFLR 0x80010209 +#pragma Aux_register(0x80010209, name=>"io_spi_mst2_rxflr") + +// User extension aux register io_spi_mst2_sr +#define AR_IO_SPI_MST2_SR 0x8001020a +#pragma Aux_register(0x8001020a, name=>"io_spi_mst2_sr") + +// User extension aux register io_spi_mst2_imr +#define AR_IO_SPI_MST2_IMR 0x8001020b +#pragma Aux_register(0x8001020b, name=>"io_spi_mst2_imr") + +// User extension aux register io_spi_mst2_isr +#define AR_IO_SPI_MST2_ISR 0x8001020c +#pragma Aux_register(0x8001020c, name=>"io_spi_mst2_isr") + +// User extension aux register io_spi_mst2_risr +#define AR_IO_SPI_MST2_RISR 0x8001020d +#pragma Aux_register(0x8001020d, name=>"io_spi_mst2_risr") + +// User extension aux register io_spi_mst2_txoicr +#define AR_IO_SPI_MST2_TXOICR 0x8001020e +#pragma Aux_register(0x8001020e, name=>"io_spi_mst2_txoicr") + +// User extension aux register io_spi_mst2_rxoicr +#define AR_IO_SPI_MST2_RXOICR 0x8001020f +#pragma Aux_register(0x8001020f, name=>"io_spi_mst2_rxoicr") + +// User extension aux register io_spi_mst2_rxuicr +#define AR_IO_SPI_MST2_RXUICR 0x80010210 +#pragma Aux_register(0x80010210, name=>"io_spi_mst2_rxuicr") + +// User extension aux register io_spi_mst2_icr +#define AR_IO_SPI_MST2_ICR 0x80010212 +#pragma Aux_register(0x80010212, name=>"io_spi_mst2_icr") + +// User extension aux register io_spi_mst2_clken +#define AR_IO_SPI_MST2_CLKEN 0x80010216 +#pragma Aux_register(0x80010216, name=>"io_spi_mst2_clken") + +// User extension aux register io_spi_mst2_dr +#define AR_IO_SPI_MST2_DR 0x80010218 +#pragma Aux_register(0x80010218, name=>"io_spi_mst2_dr") + +// User extension aux register io_spi_mst2_rx_sample_dly +#define AR_IO_SPI_MST2_RX_SAMPLE_DLY 0x8001023c +#pragma Aux_register(0x8001023c, name=>"io_spi_mst2_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_SLV0_IO_SPI_SLV0_PRESENT 1 + +// User extension aux register io_spi_slv0_ctrlr0 +#define AR_IO_SPI_SLV0_CTRLR0 0x80011000 +#pragma Aux_register(0x80011000, name=>"io_spi_slv0_ctrlr0") + +// User extension aux register io_spi_slv0_spien +#define AR_IO_SPI_SLV0_SPIEN 0x80011002 +#pragma Aux_register(0x80011002, name=>"io_spi_slv0_spien") + +// User extension aux register io_spi_slv0_txftlr +#define AR_IO_SPI_SLV0_TXFTLR 0x80011006 +#pragma Aux_register(0x80011006, name=>"io_spi_slv0_txftlr") + +// User extension aux register io_spi_slv0_rxftlr +#define AR_IO_SPI_SLV0_RXFTLR 0x80011007 +#pragma Aux_register(0x80011007, name=>"io_spi_slv0_rxftlr") + +// User extension aux register io_spi_slv0_txflr +#define AR_IO_SPI_SLV0_TXFLR 0x80011008 +#pragma Aux_register(0x80011008, name=>"io_spi_slv0_txflr") + +// User extension aux register io_spi_slv0_rxflr +#define AR_IO_SPI_SLV0_RXFLR 0x80011009 +#pragma Aux_register(0x80011009, name=>"io_spi_slv0_rxflr") + +// User extension aux register io_spi_slv0_sr +#define AR_IO_SPI_SLV0_SR 0x8001100a +#pragma Aux_register(0x8001100a, name=>"io_spi_slv0_sr") + +// User extension aux register io_spi_slv0_imr +#define AR_IO_SPI_SLV0_IMR 0x8001100b +#pragma Aux_register(0x8001100b, name=>"io_spi_slv0_imr") + +// User extension aux register io_spi_slv0_isr +#define AR_IO_SPI_SLV0_ISR 0x8001100c +#pragma Aux_register(0x8001100c, name=>"io_spi_slv0_isr") + +// User extension aux register io_spi_slv0_risr +#define AR_IO_SPI_SLV0_RISR 0x8001100d +#pragma Aux_register(0x8001100d, name=>"io_spi_slv0_risr") + +// User extension aux register io_spi_slv0_txoicr +#define AR_IO_SPI_SLV0_TXOICR 0x8001100e +#pragma Aux_register(0x8001100e, name=>"io_spi_slv0_txoicr") + +// User extension aux register io_spi_slv0_rxoicr +#define AR_IO_SPI_SLV0_RXOICR 0x8001100f +#pragma Aux_register(0x8001100f, name=>"io_spi_slv0_rxoicr") + +// User extension aux register io_spi_slv0_rxuicr +#define AR_IO_SPI_SLV0_RXUICR 0x80011010 +#pragma Aux_register(0x80011010, name=>"io_spi_slv0_rxuicr") + +// User extension aux register io_spi_slv0_icr +#define AR_IO_SPI_SLV0_ICR 0x80011012 +#pragma Aux_register(0x80011012, name=>"io_spi_slv0_icr") + +// User extension aux register io_spi_slv0_clken +#define AR_IO_SPI_SLV0_CLKEN 0x80011016 +#pragma Aux_register(0x80011016, name=>"io_spi_slv0_clken") + +// User extension aux register io_spi_slv0_dr +#define AR_IO_SPI_SLV0_DR 0x80011018 +#pragma Aux_register(0x80011018, name=>"io_spi_slv0_dr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART0_IO_UART0_PRESENT 1 + +// User extension aux register io_uart0_clken +#define AR_IO_UART0_CLKEN 0x800140c0 +#pragma Aux_register(0x800140c0, name=>"io_uart0_clken") + +// User extension aux register io_uart0_rbr_thr_dll +#define AR_IO_UART0_RBR_THR_DLL 0x80014000 +#pragma Aux_register(0x80014000, name=>"io_uart0_rbr_thr_dll") + +// User extension aux register io_uart0_ier_dlh +#define AR_IO_UART0_IER_DLH 0x80014004 +#pragma Aux_register(0x80014004, name=>"io_uart0_ier_dlh") + +// User extension aux register io_uart0_iir_fcr +#define AR_IO_UART0_IIR_FCR 0x80014008 +#pragma Aux_register(0x80014008, name=>"io_uart0_iir_fcr") + +// User extension aux register io_uart0_lcr +#define AR_IO_UART0_LCR 0x8001400c +#pragma Aux_register(0x8001400c, name=>"io_uart0_lcr") + +// User extension aux register io_uart0_mcr +#define AR_IO_UART0_MCR 0x80014010 +#pragma Aux_register(0x80014010, name=>"io_uart0_mcr") + +// User extension aux register io_uart0_lsr +#define AR_IO_UART0_LSR 0x80014014 +#pragma Aux_register(0x80014014, name=>"io_uart0_lsr") + +// User extension aux register io_uart0_msr +#define AR_IO_UART0_MSR 0x80014018 +#pragma Aux_register(0x80014018, name=>"io_uart0_msr") + +// User extension aux register io_uart0_usr +#define AR_IO_UART0_USR 0x8001407c +#pragma Aux_register(0x8001407c, name=>"io_uart0_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART1_IO_UART1_PRESENT 1 + +// User extension aux register io_uart1_clken +#define AR_IO_UART1_CLKEN 0x800141c0 +#pragma Aux_register(0x800141c0, name=>"io_uart1_clken") + +// User extension aux register io_uart1_rbr_thr_dll +#define AR_IO_UART1_RBR_THR_DLL 0x80014100 +#pragma Aux_register(0x80014100, name=>"io_uart1_rbr_thr_dll") + +// User extension aux register io_uart1_ier_dlh +#define AR_IO_UART1_IER_DLH 0x80014104 +#pragma Aux_register(0x80014104, name=>"io_uart1_ier_dlh") + +// User extension aux register io_uart1_iir_fcr +#define AR_IO_UART1_IIR_FCR 0x80014108 +#pragma Aux_register(0x80014108, name=>"io_uart1_iir_fcr") + +// User extension aux register io_uart1_lcr +#define AR_IO_UART1_LCR 0x8001410c +#pragma Aux_register(0x8001410c, name=>"io_uart1_lcr") + +// User extension aux register io_uart1_mcr +#define AR_IO_UART1_MCR 0x80014110 +#pragma Aux_register(0x80014110, name=>"io_uart1_mcr") + +// User extension aux register io_uart1_lsr +#define AR_IO_UART1_LSR 0x80014114 +#pragma Aux_register(0x80014114, name=>"io_uart1_lsr") + +// User extension aux register io_uart1_msr +#define AR_IO_UART1_MSR 0x80014118 +#pragma Aux_register(0x80014118, name=>"io_uart1_msr") + +// User extension aux register io_uart1_usr +#define AR_IO_UART1_USR 0x8001417c +#pragma Aux_register(0x8001417c, name=>"io_uart1_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART2_IO_UART2_PRESENT 1 + +// User extension aux register io_uart2_clken +#define AR_IO_UART2_CLKEN 0x800142c0 +#pragma Aux_register(0x800142c0, name=>"io_uart2_clken") + +// User extension aux register io_uart2_rbr_thr_dll +#define AR_IO_UART2_RBR_THR_DLL 0x80014200 +#pragma Aux_register(0x80014200, name=>"io_uart2_rbr_thr_dll") + +// User extension aux register io_uart2_ier_dlh +#define AR_IO_UART2_IER_DLH 0x80014204 +#pragma Aux_register(0x80014204, name=>"io_uart2_ier_dlh") + +// User extension aux register io_uart2_iir_fcr +#define AR_IO_UART2_IIR_FCR 0x80014208 +#pragma Aux_register(0x80014208, name=>"io_uart2_iir_fcr") + +// User extension aux register io_uart2_lcr +#define AR_IO_UART2_LCR 0x8001420c +#pragma Aux_register(0x8001420c, name=>"io_uart2_lcr") + +// User extension aux register io_uart2_mcr +#define AR_IO_UART2_MCR 0x80014210 +#pragma Aux_register(0x80014210, name=>"io_uart2_mcr") + +// User extension aux register io_uart2_lsr +#define AR_IO_UART2_LSR 0x80014214 +#pragma Aux_register(0x80014214, name=>"io_uart2_lsr") + +// User extension aux register io_uart2_msr +#define AR_IO_UART2_MSR 0x80014218 +#pragma Aux_register(0x80014218, name=>"io_uart2_msr") + +// User extension aux register io_uart2_usr +#define AR_IO_UART2_USR 0x8001427c +#pragma Aux_register(0x8001427c, name=>"io_uart2_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART3_IO_UART3_PRESENT 1 + +// User extension aux register io_uart3_clken +#define AR_IO_UART3_CLKEN 0x800143c0 +#pragma Aux_register(0x800143c0, name=>"io_uart3_clken") + +// User extension aux register io_uart3_rbr_thr_dll +#define AR_IO_UART3_RBR_THR_DLL 0x80014300 +#pragma Aux_register(0x80014300, name=>"io_uart3_rbr_thr_dll") + +// User extension aux register io_uart3_ier_dlh +#define AR_IO_UART3_IER_DLH 0x80014304 +#pragma Aux_register(0x80014304, name=>"io_uart3_ier_dlh") + +// User extension aux register io_uart3_iir_fcr +#define AR_IO_UART3_IIR_FCR 0x80014308 +#pragma Aux_register(0x80014308, name=>"io_uart3_iir_fcr") + +// User extension aux register io_uart3_lcr +#define AR_IO_UART3_LCR 0x8001430c +#pragma Aux_register(0x8001430c, name=>"io_uart3_lcr") + +// User extension aux register io_uart3_mcr +#define AR_IO_UART3_MCR 0x80014310 +#pragma Aux_register(0x80014310, name=>"io_uart3_mcr") + +// User extension aux register io_uart3_lsr +#define AR_IO_UART3_LSR 0x80014314 +#pragma Aux_register(0x80014314, name=>"io_uart3_lsr") + +// User extension aux register io_uart3_msr +#define AR_IO_UART3_MSR 0x80014318 +#pragma Aux_register(0x80014318, name=>"io_uart3_msr") + +// User extension aux register io_uart3_usr +#define AR_IO_UART3_USR 0x8001437c +#pragma Aux_register(0x8001437c, name=>"io_uart3_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_CREG_MST0_IO_CREG_MST0_PRESENT 1 + +// User extension aux register io_creg_mst0_ctrl +#define AR_IO_CREG_MST0_CTRL 0x80018000 +#pragma Aux_register(0x80018000, name=>"io_creg_mst0_ctrl") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_CREG_SLV0_IO_CREG_SLV0_PRESENT 1 + +// User extension aux register io_creg_slv0_obsr +#define AR_IO_CREG_SLV0_OBSR 0x80018080 +#pragma Aux_register(0x80018080, name=>"io_creg_slv0_obsr") +#define APEX_COM_ARC_HARDWARE_DFSS_SUBSYS_BCR_SUBSYS_BCR_PRESENT 1 + +// User extension aux register SUBSYS_BUILD +#define AR_SUBSYS_BUILD 0xf0 +#pragma Aux_register(0xf0, name=>"SUBSYS_BUILD") + +// User extension aux register SUBSYS_DSP_0_BUILD +#define AR_SUBSYS_DSP_0_BUILD 0xa00 +#pragma Aux_register(0xa00, name=>"SUBSYS_DSP_0_BUILD") + +// User extension aux register SUBSYS_DSP_0_CONFIG +#define AR_SUBSYS_DSP_0_CONFIG 0xa02 +#pragma Aux_register(0xa02, name=>"SUBSYS_DSP_0_CONFIG") + +// User extension aux register SUBSYS_IO_0_BUILD +#define AR_SUBSYS_IO_0_BUILD 0xa04 +#pragma Aux_register(0xa04, name=>"SUBSYS_IO_0_BUILD") + +// User extension aux register SUBSYS_IO_1_BUILD +#define AR_SUBSYS_IO_1_BUILD 0xa05 +#pragma Aux_register(0xa05, name=>"SUBSYS_IO_1_BUILD") +#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_PRESENT 1 + +// User extension aux register fpu_build +#define AR_FPU_BUILD 0xc8 +#pragma Aux_register(0xc8, name=>"fpu_build") + +// User extension aux register fpu_ctrl +#define AR_FPU_CTRL 0x300 +#pragma Aux_register(0x300, name=>"fpu_ctrl") + +// User extension aux register fpu_status +#define AR_FPU_STATUS 0x301 +#pragma Aux_register(0x301, name=>"fpu_status") + +// User extension instruction fsmadd +extern long fsmadd(long,long); +#pragma intrinsic(fsmadd,opcode=>6,sub_opcode=>5, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsmsub +extern long fsmsub(long,long); +#pragma intrinsic(fsmsub,opcode=>6,sub_opcode=>6, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsmul +extern long fsmul(long,long); +#pragma intrinsic(fsmul,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsadd +extern long fsadd(long,long); +#pragma intrinsic(fsadd,opcode=>6,sub_opcode=>1, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fssub +extern long fssub(long,long); +#pragma intrinsic(fssub,opcode=>6,sub_opcode=>2, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fcvt32 +extern long fcvt32(long,long); +#pragma intrinsic(fcvt32,opcode=>6,sub_opcode=>8, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsdiv +extern long fsdiv(long,long); +#pragma intrinsic(fsdiv,opcode=>6,sub_opcode=>7, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmp +extern long fscmp(long,long); +#pragma intrinsic(fscmp,opcode=>6,sub_opcode=>3, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmp +extern long fscmp_f(long,long); +#pragma intrinsic(fscmp_f,opcode=>6,sub_opcode=>3, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmpf +extern long fscmpf(long,long); +#pragma intrinsic(fscmpf,opcode=>6,sub_opcode=>4, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmpf +extern long fscmpf_f(long,long); +#pragma intrinsic(fscmpf_f,opcode=>6,sub_opcode=>4, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fssqrt +extern long fssqrt(long); +#pragma intrinsic(fssqrt,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") +#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_DP_ASSIST_PRESENT 1 + +// User extension aux register aux_dpfp1l +#define AR_AUX_DPFP1L 0x302 +#pragma Aux_register(0x302, name=>"aux_dpfp1l") + +// User extension aux register aux_dpfp1h +#define AR_AUX_DPFP1H 0x303 +#pragma Aux_register(0x303, name=>"aux_dpfp1h") + +// User extension aux register aux_dpfp2l +#define AR_AUX_DPFP2L 0x304 +#pragma Aux_register(0x304, name=>"aux_dpfp2l") + +// User extension aux register aux_dpfp2h +#define AR_AUX_DPFP2H 0x305 +#pragma Aux_register(0x305, name=>"aux_dpfp2h") + +// User extension instruction dmulh11 +extern long dmulh11(long,long); +#pragma intrinsic(dmulh11,opcode=>6,sub_opcode=>48,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh11 +extern long dmulh11_f(long,long); +#pragma intrinsic(dmulh11_f,opcode=>6,sub_opcode=>48, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh12 +extern long dmulh12(long,long); +#pragma intrinsic(dmulh12,opcode=>6,sub_opcode=>49,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh12 +extern long dmulh12_f(long,long); +#pragma intrinsic(dmulh12_f,opcode=>6,sub_opcode=>49, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh21 +extern long dmulh21(long,long); +#pragma intrinsic(dmulh21,opcode=>6,sub_opcode=>50,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh21 +extern long dmulh21_f(long,long); +#pragma intrinsic(dmulh21_f,opcode=>6,sub_opcode=>50, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh22 +extern long dmulh22(long,long); +#pragma intrinsic(dmulh22,opcode=>6,sub_opcode=>51,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh22 +extern long dmulh22_f(long,long); +#pragma intrinsic(dmulh22_f,opcode=>6,sub_opcode=>51, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh11 +extern long daddh11(long,long); +#pragma intrinsic(daddh11,opcode=>6,sub_opcode=>52,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh11 +extern long daddh11_f(long,long); +#pragma intrinsic(daddh11_f,opcode=>6,sub_opcode=>52, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh12 +extern long daddh12(long,long); +#pragma intrinsic(daddh12,opcode=>6,sub_opcode=>53,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh12 +extern long daddh12_f(long,long); +#pragma intrinsic(daddh12_f,opcode=>6,sub_opcode=>53, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh21 +extern long daddh21(long,long); +#pragma intrinsic(daddh21,opcode=>6,sub_opcode=>54,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh21 +extern long daddh21_f(long,long); +#pragma intrinsic(daddh21_f,opcode=>6,sub_opcode=>54, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh22 +extern long daddh22(long,long); +#pragma intrinsic(daddh22,opcode=>6,sub_opcode=>55,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh22 +extern long daddh22_f(long,long); +#pragma intrinsic(daddh22_f,opcode=>6,sub_opcode=>55, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh11 +extern long dsubh11(long,long); +#pragma intrinsic(dsubh11,opcode=>6,sub_opcode=>56,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh11 +extern long dsubh11_f(long,long); +#pragma intrinsic(dsubh11_f,opcode=>6,sub_opcode=>56, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh12 +extern long dsubh12(long,long); +#pragma intrinsic(dsubh12,opcode=>6,sub_opcode=>57,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh12 +extern long dsubh12_f(long,long); +#pragma intrinsic(dsubh12_f,opcode=>6,sub_opcode=>57, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh21 +extern long dsubh21(long,long); +#pragma intrinsic(dsubh21,opcode=>6,sub_opcode=>58,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh21 +extern long dsubh21_f(long,long); +#pragma intrinsic(dsubh21_f,opcode=>6,sub_opcode=>58, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh22 +extern long dsubh22(long,long); +#pragma intrinsic(dsubh22,opcode=>6,sub_opcode=>59,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh22 +extern long dsubh22_f(long,long); +#pragma intrinsic(dsubh22_f,opcode=>6,sub_opcode=>59, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dexcl1 +extern long dexcl1(long,long); +#pragma intrinsic(dexcl1,opcode=>6,sub_opcode=>60, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dexcl2 +extern long dexcl2(long,long); +#pragma intrinsic(dexcl2,opcode=>6,sub_opcode=>61, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + + +#endif + + +]]> + + + + + diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index eb890ef1999..d6b6d604ac7 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -5,6 +5,16 @@ ifeq ($(TARGET_ARCH), arc) AR_TOOL = arac CXX_TOOL = ccac +ifeq ($(TARGET), iotdk) + TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/iotdk/iotdk.tcf + LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/iotdk/iotdk.lcf +endif + +ifeq ($(TARGET), emsdp) + TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf + LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf +endif + ifneq ($(TCF_FILE), ) TARGET = $(basename $(notdir $(TCF_FILE))) else @@ -25,6 +35,11 @@ endif PLATFORM_FLAGS += -tcf_core_config PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -default_lcf=$(MAKEFILE_DIR)/targets/arc/memory.lcf +ifneq ($(LCF_FILE), ) + PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) + THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) +endif + CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) CCFLAGS += $(PLATFORM_FLAGS) From ced5b5bebb526e3e08804f4ccf49b530b9098c31 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 11 Mar 2020 14:47:28 +0300 Subject: [PATCH 026/557] Updated LCF for EMSDP and fixes for arc build process --- .../micro/tools/make/download_and_extract.sh | 2 +- .../tools/make/targets/arc/emsdp/emsdp.lcf | 51 ++++++++++++------- .../micro/tools/make/targets/arc_makefile.inc | 2 +- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 2248031f6d1..4a75b6b24cd 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -90,7 +90,7 @@ patch_cifar10_dataset() { } build_embarc_mli() { - gmake -j 4 -C ${1}/lib/make TCF_FILE=${2} + make -j 4 -C ${1}/lib/make TCF_FILE=${2} } # Main function handling the download, verify, extract, and patch process. diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf index fc34759d745..d2d1b4220f8 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -5,43 +5,58 @@ # due to CCM memory wrapping into upper addresses beyond its size MEMORY { - IVT : ORIGIN = 0x00000000, LENGTH = 0x60000000 - ICCM0 : ORIGIN = 0x60000000, LENGTH = 0x00020000 + PSRAM : ORIGIN = 0x10000000, LENGTH = 0x01000000 + SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000 + IVT : ORIGIN = 0x60000000, LENGTH = 0x400 + ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400) # CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000 -# SYSTEM1 : ORIGIN = 0x70000000, LENGTH = 0x10000000 DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 # CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000 # CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000 YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000 # CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000 - SYSTEM2 : ORIGIN = 0xb0000000, LENGTH = 0x50000000 } + SECTIONS { - GROUP BLOCK(4): { - .text? : { *('.text$crt*') } - * (TEXT): {} - * (LIT): {} - } > ICCM0 + + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ .sdata?: {} .sbss?: {} * (DATA): {} * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > SYSTEM2 + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:16K): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} + } > DCCM + + GROUP BLOCK(4): { + .rodata_in_data? : {} + } > PSRAM + GROUP BLOCK(4): { .Xdata? : {} - } > XCCM + } > XCCM + GROUP BLOCK(4): { .Ydata? : {} - } > YCCM - GROUP BLOCK(4) : { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) - } > IVT + } > YCCM + + GROUP BLOCK(4): { + .Zdata? : {} + } > DCCM + + } diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index d6b6d604ac7..29ad5f5347a 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -31,7 +31,7 @@ else TCF_FILE_NAME = $(TCF_FILE) endif - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections PLATFORM_FLAGS += -tcf_core_config PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -default_lcf=$(MAKEFILE_DIR)/targets/arc/memory.lcf From 503f98f88c2d8a7a636ef4ed920e059196ac9b09 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 30 Mar 2020 18:08:12 +0300 Subject: [PATCH 027/557] ARC EMSDP board specific debug log --- tensorflow/lite/micro/emsdp/debug_log.cc | 108 ++++++++++++++++++ .../tools/make/targets/arc/emsdp/emsdp.lcf | 36 +++--- 2 files changed, 127 insertions(+), 17 deletions(-) create mode 100644 tensorflow/lite/micro/emsdp/debug_log.cc diff --git a/tensorflow/lite/micro/emsdp/debug_log.cc b/tensorflow/lite/micro/emsdp/debug_log.cc new file mode 100644 index 00000000000..7d932939a0b --- /dev/null +++ b/tensorflow/lite/micro/emsdp/debug_log.cc @@ -0,0 +1,108 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/debug_log.h" + +#include +#include +#include + +// Print to debug console by default. One can define next to extend destinations set: +// EMSDP_LOG_TO_MEMORY +// : fill .debug_log memory region (data section) with passed chars. +// EMSDP_LOG_TO_HOST +// : Use hostlink to print output log. +// EMSDP_LOG_TO_UART +// : use default debug UART (out to FTDI channel 0). The same USB Port is used for JTAG. +#define EMSDP_LOG_TO_UART + + +// For simplicity we assume U-boot has already initialized debug console durion +// application loading (or on reset). Hence we use only status and data registers +// to organize blocking loop for printing symbols. No input and no IRQ handling. +// See embarc_osp repository for full EMSDP uart driver. +// TODO: Consider U-Boot API to do it in a less "hacky" way. +void DbgUartSendStr(const char* s) { +#define EMSDP_DBG_UART_BASE (0xF0004000U) +#define DW_UART_CPR_FIFO_STAT (1<<10) +#define DW_UART_USR_TFNF (0x02) +#define DW_UART_LSR_TXD_EMPTY (0x20) + + typedef volatile struct dw_uart_reg { + uint32_t DATA; /*!< data in/out and DLL */ + uint32_t RES1[4]; + uint32_t LSR; /*!< Line Status Register */ + uint32_t RES2[25]; + uint32_t USR; /*!< UART status register */ + uint32_t RES3[29]; + uint32_t CPR; /*!< Component parameter register */ + } DW_UART_REG; + + DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); + const char* src = s; + while (*src) { + // Check uart status to send char + bool uart_is_ready = false; + if (uart_reg_ptr->CPR & DW_UART_CPR_FIFO_STAT) + uart_is_ready = ((uart_reg_ptr->USR & DW_UART_USR_TFNF) != 0); + else + uart_is_ready = ((uart_reg_ptr->LSR & DW_UART_LSR_TXD_EMPTY) != 0); + + // Send char if uart is ready. + if (uart_is_ready) + uart_reg_ptr->DATA = *src++; + } +} + +// Simple symbols dump to a pre-allocated memory region. +// The memory region can be viewed afterward with debugger. +// It can be viewed/read with debugger afterward. +void LogToMem(const char* s) { + constexpr int kDebugLogMemChars = 2 * 1024; + static int cursor = 0; +#pragma Bss(".debug_log") + volatile static char debug_log_mem[kDebugLogMemChars]; +#pragma Bss() + + const char* src = s; + while (*src) { + debug_log_mem[cursor] = *src++; + cursor = (cursor < kDebugLogMemChars) ? cursor + 1 : 0; + } + debug_log_mem[cursor] = '^'; +} + + +extern "C" void DebugLog(const char* s) { +#ifndef TF_LITE_STRIP_ERROR_STRINGS + +#if defined EMSDP_LOG_TO_UART + DbgUartSendStr(s); +#endif + +#if defined EMSDP_LOG_TO_MEMORY +#warning "EMSDP_LOG_TO_MEMORY is defined. View .debug_log memory region for stdout" + LogToMem(s); +#endif + +#if defined EMSDP_LOG_TO_HOST +#warning "EMSDP_LOG_TO_HOST is defined. Ensure hostlib is linked." + fprintf(stderr, "%s", s); +#endif + +#endif // TF_LITE_STRIP_ERROR_STRINGS +} + + diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf index d2d1b4220f8..d17c807e250 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -5,7 +5,7 @@ # due to CCM memory wrapping into upper addresses beyond its size MEMORY { - PSRAM : ORIGIN = 0x10000000, LENGTH = 0x01000000 + PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400 SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000 IVT : ORIGIN = 0x60000000, LENGTH = 0x400 ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400) @@ -31,19 +31,11 @@ SECTIONS { } > ICCM0 GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:16K): {} + .Zdata? : {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} } > DCCM - - GROUP BLOCK(4): { - .rodata_in_data? : {} - } > PSRAM - + GROUP BLOCK(4): { .Xdata? : {} } > XCCM @@ -53,10 +45,20 @@ SECTIONS { } > YCCM GROUP BLOCK(4): { - .Zdata? : {} - } > DCCM - - - } + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + } > PSRAM + + GROUP BLOCK(4): { + .rodata_in_data? : {} + } > PSRAM + + GROUP BLOCK(4): { + .debug_log? : {} + } > SRAM +} From 2d8e1a45ec34649d216566514d7c062ae985023a Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 1 Apr 2020 17:33:16 +0300 Subject: [PATCH 028/557] ARC EMSDB Board integration: Project generation --- .../micro/tools/make/helper_functions.inc | 32 ++-- .../tools/make/targets/arc/emsdp/uboot.env | Bin 0 -> 4096 bytes .../tools/make/targets/emsdp_makefile.inc | 155 ++++++++++++++++++ .../make/templates/arc/arc_app_makefile.tpl | 134 +++++++++++++++ 4 files changed, 307 insertions(+), 14 deletions(-) create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/emsdp/uboot.env create mode 100644 tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc create mode 100644 tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index a7f9bd788e3..0c398be2118 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -130,31 +130,35 @@ endef define generate_arc_project ifeq ($(TARGET_ARCH), arc) -$(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/Makefile.tpl + +$(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl @mkdir -p $$(dir $$@) @sed -E 's#\%\{SRCS\}\%#$(4)#g' $$< | \ - sed -E '1 i\CC = ccac\nCXX = ccac\nLD = ccac\n' | \ + sed -E 's#\%\{CC\}\%#$(CC_TOOL)#g' | \ + sed -E 's#\%\{CXX\}\%#$(CXX_TOOL)#g' | \ + sed -E 's#\%\{LD\}\%#$(LD_TOOL)#g' | \ sed -E 's#\%\{EXECUTABLE\}\%#$(3).elf#g' | \ sed -E 's#\%\{LINKER_FLAGS\}\%#$(6)#g' | \ sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' | \ - sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' > $$@ + sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' | \ + sed -E 's#\%\{EXTRA_APP_SETTINGS\}\%#$(ARC_EXTRA_APP_SETTINGS)#g' | \ + sed -E 's#\%\{EXTRA_APP_RULES\}\%#$(ARC_EXTRA_APP_RULES)#g' | \ + sed -E 's#\%\{BIN_DEPEND\}\%#$(ARC_BIN_DEPEND)#g' | \ + sed -E 's#\%\{BIN_RULE\}\%#$(ARC_BIN_RULE)#g' | \ + sed -E 's#\%\{EXTRA_RM_TARGETS\}\%#$(ARC_EXTRA_RM_TARGETS)#g' | \ + sed -E 's#\%\{APP_RUN_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \ + sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \ + sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@ -# Special rule to copy TCF in case the local filesystem file name has been defined -ifneq ($(TCF_FILE_NAME), ) -$(PRJDIR)$(3)/$(1)/$(TCF_FILE_NAME): $(TCF_FILE) - @cp $$< $$@ -endif - -# Special rule to copy LCF in case the local filesystem file name has been defined -ifneq ($(LCF_FILE), ) -$(PRJDIR)$(3)/$(1)/$(notdir $(LCF_FILE)): $(LCF_FILE) - @cp $$< $$@ -endif +$(foreach var,$(ARC_TARGET_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var)))) endif endef + + + # Creates a set of rules to build a standalone Arduino project for an # executable, including all of the source and header files required in a # separate folder and a simple makefile. diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/uboot.env b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/uboot.env new file mode 100644 index 0000000000000000000000000000000000000000..c336b6c8733f90b8fbaab75fc705f787ad141607 GIT binary patch literal 4096 zcmeIuOKQU~5C&i$Wfmp06q}LLM+gW?4^X=4y2yGYqOxUVIrKr+y;jfB(=?N$l+ZwX zfYC}b9{*y#kI(n}MvW$^Y^4yFHDb}kn00GppL~~Xw}Qz8oXR0818u7T;pB}9WlUOc z?lkXwuxlEm!UneuF*c|dDYkbSkgD%1`shs7vCAn2%8&w`bx1sK<=0Jcv zXuZL|?k0>@dTGIhaadCR&ztmOaS$oTlE|vYcg4yk`B2_&N~lrHDk0$XF0P=y-quh8 n&gS)WKD~l-6o_X99~NK%7GMDuU;!3j0Ty5Z7GMDu_?N&JGvbmg literal 0 HcmV?d00001 diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc new file mode 100644 index 00000000000..c7286329651 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc @@ -0,0 +1,155 @@ +# Settings for arc processors +ifeq ($(TARGET), emsdp) + + TARGET_ARCH = arc + + CC_TOOL = ccac + AR_TOOL = arac + CXX_TOOL = ccac + LD_TOOL = ccac + + DLR = $$$$ + ARC_EXTRA_APP_SETTINGS = \ + BIN_DIR = .$(DLR)\(PS\)bin\n\ + BIN_FILE = $(DLR)\(BIN_DIR\)$(DLR)\(PS\)app.elf\n + + ARC_EXTRA_APP_RULES = \ + $(DLR)\(BIN_FILE\): $(DLR)\(BIN_DIR\) $(DLR)\(OUT_NAME\)\ + \n\t\@$(DLR)\(CP\) $(DLR)\(OUT_NAME\) $(DLR)\(BIN_FILE\)\ + \n\t\@$(DLR)\(CP\) uboot.env $(DLR)\(BIN_DIR\)$(DLR)\(PS)uboot.env\ + \n \ + \n$(DLR)\(BIN_DIR\):\ + \n\t\@$(DLR)\(MKDIR\) $(DLR)\(BIN_DIR\)\ + + + ARC_EXTRA_RM_TARGETS = $(DLR)\(BIN_DIR\) + + ARC_BIN_DEPEND = $(DLR)\(BIN_DIR\) $(DLR)\(BIN_FILE\) + ARC_BIN_RULE = \t@echo Copy content of $(DLR)\(BIN_DIR\) into the root of SD card and follow instructions + + ARC_APP_RUN_CMD = mdb -run -digilent -nooptions $(DLR)\(DBG_ARGS) + ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS) + ARC_EXTRA_EXECUTE_RULES = + + + + TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf + LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf + + MAKE_PROJECT_FILES += emsdp_em11d_dfss.tcf emsdp.lcf uboot.env + + ARC_TARGET_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp $(PWD)/$(MAKEFILE_DIR)/targets/arc + # ARC_TARGET_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp + +# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. +# This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. + TCF_FILE_NAME = $(notdir $(TCF_FILE)) + + THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) + + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS += -tcf_core_config + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map + +# DMITRYZ: I think we need to move it to target specific LCF file. + PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) + # THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) + + CXXFLAGS += $(PLATFORM_FLAGS) + CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) + CCFLAGS += $(PLATFORM_FLAGS) + CCFLAGS:=$(filter-out -std=c11,$(CCFLAGS)) + LDFLAGS += $(PLATFORM_LDFLAGS) + + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) + + # DMITRYZ: Here we need to check tags on "no_embarc_mli". + USE_EMBARC_MLI ?= true + +ifeq ($(USE_EMBARC_MLI), true) + ALL_TAGS += arc + +ifeq ($(PRE_COMPILED_MLI),true) + $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) + + MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include + MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a + + THIRD_PARTY_CC_HDRS += \ + third_party/embarc_osp/LICENSE +else + MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) + + $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) + + MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include + MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a + MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_LIB_DIR)/LICENSE +endif + + THIRD_PARTY_CC_HDRS += $(MLI_LIB) + GENERATED_PROJECT_LIBS += $(MLI_LIB) + + INCLUDES += \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api + + GENERATED_PROJECT_INCLUDES += \ + -I. \ + -I./third_party/$(MLI_INCLUDE_FOLDER) \ + -I./third_party/$(MLI_INCLUDE_FOLDER)/api + + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf + +endif # USE_EMBARC_MLI + +# We overwrite project generator to exclude everything not relevant to ARC platform +define generate_microlite_projects +$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +endef + +# Copy rule generator to do file copyes with changing paths in generated project +# Arguments are: +# 1 - Path files in generated project. +# 2 - Path files in the source repo +# Used in helper_functions.inc for arc projects to copy files +define path_changing_copy_file +$(1)/%: $(2)/% + @mkdir -p $$(dir $$@) + @cp $$< $$@ +endef + +$(foreach var,$(ARC_TARGET_DIRS),$(eval $(call copy_arc_project_file,$(PRJDIR)$(3)/$(1),$(var)))) + +# These are microcontroller-specific rules for converting the ELF output +# of the linker into a binary image that can be loaded directly. + +# Not applicable for ARC, leaving it empty. +$(BINDIR)%.bin: + +endif diff --git a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl new file mode 100644 index 00000000000..5bbcb7d3f71 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl @@ -0,0 +1,134 @@ +#============================================================= +# OS-specific definitions +#============================================================= +COMMA=, +OPEN_PAREN=( +CLOSE_PAREN=) +BACKSLASH=\$(nullstring) +ifneq ($(ComSpec)$(COMSPEC),) + O_SYS=Windows + RM=del /F /Q + MKDIR=mkdir + CP=copy /Y + TYPE=type + PS=$(BACKSLASH) + Q= + coQ=\$(nullstring) + fix_platform_path = $(subst /,$(PS), $(1)) + DEV_NULL = nul +else + O_SYS=Unix + RM=rm -rf + MKDIR=mkdir -p + CP=cp + TYPE=cat + PS=/ + Q=$(BACKSLASH) + coQ= + fix_platform_path=$(1) + DEV_NULL=/dev/null +endif + +# Note: Windows escaping rules is very combersome +# initially I tried to use Q=^, but this depends on the context and (looks like) on Win version. +# Also expecially ugly thing is that in quoted strings the quotes the same are remain. +# Batch has special parameter expansion syntax to remove quotes, +# but many tools themselves remove quotes (unless escaped with backslash) +# So finally we've found that in our use cases we may not escaping any symbols but prepend backslashes before quotes. + +quote=$(subst %,$(Q)%, \ + $(subst &,$(Q)&, \ + $(subst <,$(Q)<, \ + $(subst >,$(Q)>, \ + $(subst |,$(Q)|, \ + $(subst ',$(Q)', \ + $(subst $(COMMA),$(Q)$(COMMA), \ + $(subst =,$(Q)=, \ + $(subst $(OPEN_PAREN),$(Q)$(OPEN_PAREN), \ + $(subst $(CLOSE_PAREN),$(Q)$(CLOSE_PAREN), \ + $(subst !,$(Q)!, \ + $(subst ",$(BACKSLASH)", \ + $(subst $(Q),$(Q)$(Q), \ + $(1) ))))))))))))) + +#============================================================= +# Toolchain definitions +#============================================================= +CC = %{CC}% +CXX = %{CXX}% +LD = %{LD}% + + +#============================================================= +# Applications settings +#============================================================= +OUT_NAME = %{EXECUTABLE}% + +DBG_ARGS ?= + +RUN_ARGS ?= + +CXXFLAGS += %{CXX_FLAGS}% + +CCFLAGS += %{CC_FLAGS}% + +LDFLAGS += %{LINKER_FLAGS}% + +%{EXTRA_APP_SETTINGS}% + + +#============================================================= +# Files and directories +#============================================================= +SRCS := \ +%{SRCS}% + +OBJS := \ +$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS))) + + +#============================================================= +# Common rules +#============================================================= +.PHONY: all app flash clean run debug + +%.o: %.cc + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + +%.o: %.c + $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ + +$(OUT_NAME): $(OBJS) + $(LD) $(CXXFLAGS) -o $@ -Ccrossref $(OBJS) $(LDFLAGS) + +%{EXTRA_APP_RULES}% + + +#================================================================= +# Global rules +#================================================================= +all: $(OUT_NAME) + +app: $(OUT_NAME) + +flash: %{BIN_DEPEND}% +%{BIN_RULE}% + +clean: + -@$(RM) $(call fix_platform_path,$(OBJS)) + -@$(RM) $(OUT_NAME) %{EXTRA_RM_TARGETS}% + +#================================================================= +# Execution rules +#================================================================= + +APP_RUN := %{APP_RUN_CMD}% +APP_DEBUG := %{APP_DEBUG_CMD}% + +run: $(OUT_NAME) + $(APP_RUN) $(OUT_NAME) $(RUN_ARGS) + +debug: $(OUT_NAME) + $(APP_DEBUG) $(OUT_NAME) $(RUN_ARGS) + +%{EXTRA_EXECUTE_RULES}% From 1977bd0442998f7a1d8724d54e5a892d9df0daba Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 2 Apr 2020 15:52:03 +0300 Subject: [PATCH 029/557] Update project generation for custom ARC target (*.tcf) --- .../micro/tools/make/helper_functions.inc | 2 +- .../micro/tools/make/targets/arc_makefile.inc | 85 ++++++++++++++----- .../tools/make/targets/emsdp_makefile.inc | 15 ++-- 3 files changed, 71 insertions(+), 31 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 0c398be2118..0e21e02bc07 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -151,7 +151,7 @@ $(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_ sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@ -$(foreach var,$(ARC_TARGET_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var)))) +$(foreach var,$(ARC_TARGET_FILES_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var)))) endif endef diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 29ad5f5347a..e6505cd187b 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -1,19 +1,12 @@ -# Settings for arc processors +# Settings for not pre-defined ARC processors. +# User need to specify ARC target with Tool Configuration File (*.tcf). +# Path to this file must be passed through TCF_FILE variable. +# Otherwise, default em7d_voice_audio configuration is used + ifeq ($(TARGET_ARCH), arc) - CC_TOOL = ccac - AR_TOOL = arac - CXX_TOOL = ccac - -ifeq ($(TARGET), iotdk) - TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/iotdk/iotdk.tcf - LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/iotdk/iotdk.lcf -endif - -ifeq ($(TARGET), emsdp) - TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf - LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf -endif +# Known target are specifyed with their own make configurations. +ifeq ($(filter $(TARGET), emsdp iotdk),) ifneq ($(TCF_FILE), ) TARGET = $(basename $(notdir $(TCF_FILE))) @@ -26,30 +19,61 @@ endif # This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. ifneq (,$(findstring .tcf,$(TCF_FILE))) TCF_FILE_NAME = $(notdir $(TCF_FILE)) - THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) + ARC_TARGET_FILES_DIRS := $(dir $(TCF_FILE)) + MAKE_PROJECT_FILES += $(TCF_FILE_NAME) else TCF_FILE_NAME = $(TCF_FILE) endif - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - PLATFORM_FLAGS += -tcf_core_config - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -default_lcf=$(MAKEFILE_DIR)/targets/arc/memory.lcf + CC_TOOL = ccac + AR_TOOL = arac + CXX_TOOL = ccac + LD_TOOL = ccac + + # TODO: Move this to a common arc/arc_common.inc file to share this with other targets + DLR = $$$$ + ARC_EXTRA_APP_SETTINGS = + ARC_EXTRA_APP_RULES = + + ARC_EXTRA_RM_TARGETS = + + ARC_BIN_DEPEND = + ARC_BIN_RULE = \t$(DLR)\(error Flash rule isnt defined for this ARC target\) + + ARC_APP_RUN_CMD = mdb -run -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS) + ARC_APP_DEBUG_CMD = mdb -OK -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS) + ARC_EXTRA_EXECUTE_RULES = + + + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) + PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS += -tcf_core_config + + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) + PLATFORM_LDFLAGS = -Hnocopyr -m -Hldopt=-Coutput=memory.map ifneq ($(LCF_FILE), ) PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) - THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) + MAKE_PROJECT_FILES += $(notdir $(LCF_FILE)) +ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir LCF_FILE)),) + ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) +endif endif CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) CCFLAGS += $(PLATFORM_FLAGS) + CCFLAGS:=$(filter-out -std=c11,$(CCFLAGS)) LDFLAGS += $(PLATFORM_LDFLAGS) + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) + # TODO: Move/organize embarc_mli usage in an implied way (see ext_libs/cmsis.inc for example USE_EMBARC_MLI ?= true ifeq ($(USE_EMBARC_MLI), true) + # TODO: To understand why it's done here. The same is performed in the higher level MakeFile. ALL_TAGS += arc ifeq ($(PRE_COMPILED_MLI),true) @@ -110,10 +134,29 @@ endif endif # USE_EMBARC_MLI +# We overwrite project generator to exclude everything not relevant to ARC platform +define generate_microlite_projects +$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +endef + +# Copy rule generator to do file copyes with changing paths in generated project +# Arguments are: +# 1 - Path files in generated project. +# 2 - Path files in the source repo +# Used in helper_functions.inc for arc projects to copy files +define path_changing_copy_file +$(1)/%: $(2)/% + @mkdir -p $$(dir $$@) + @cp $$< $$@ +endef + # These are microcontroller-specific rules for converting the ELF output # of the linker into a binary image that can be loaded directly. - # Not applicable for ARC, leaving it empty. $(BINDIR)%.bin: -endif + +endif # ifeq ($(filter $(TARGET),$(ARC_PREDEFINED_TARGETS)),) +endif # ifeq ($(TARGET_ARCH), arc) + diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc index c7286329651..aeeb7fc178f 100644 --- a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc @@ -21,7 +21,6 @@ ifeq ($(TARGET), emsdp) \n$(DLR)\(BIN_DIR\):\ \n\t\@$(DLR)\(MKDIR\) $(DLR)\(BIN_DIR\)\ - ARC_EXTRA_RM_TARGETS = $(DLR)\(BIN_DIR\) ARC_BIN_DEPEND = $(DLR)\(BIN_DIR\) $(DLR)\(BIN_FILE\) @@ -31,21 +30,19 @@ ifeq ($(TARGET), emsdp) ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS) ARC_EXTRA_EXECUTE_RULES = - - TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf MAKE_PROJECT_FILES += emsdp_em11d_dfss.tcf emsdp.lcf uboot.env - ARC_TARGET_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp $(PWD)/$(MAKEFILE_DIR)/targets/arc - # ARC_TARGET_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp + ARC_TARGET_FILES_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp +# TODO: LESS TCF/LCF Variables # The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. # This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. TCF_FILE_NAME = $(notdir $(TCF_FILE)) - THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) +# THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections PLATFORM_FLAGS += -tcf_core_config @@ -53,7 +50,7 @@ ifeq ($(TARGET), emsdp) # DMITRYZ: I think we need to move it to target specific LCF file. PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) - # THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) +# THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) @@ -133,7 +130,7 @@ $(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(T $(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) endef -# Copy rule generator to do file copyes with changing paths in generated project +# Copy rule generator to do file copyes changing paths in generated project # Arguments are: # 1 - Path files in generated project. # 2 - Path files in the source repo @@ -144,7 +141,7 @@ $(1)/%: $(2)/% @cp $$< $$@ endef -$(foreach var,$(ARC_TARGET_DIRS),$(eval $(call copy_arc_project_file,$(PRJDIR)$(3)/$(1),$(var)))) + # These are microcontroller-specific rules for converting the ELF output # of the linker into a binary image that can be loaded directly. From 984457fd69a2615db8f2d1e5c5848b3b3c7ef27f Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 3 Apr 2020 11:41:58 +0300 Subject: [PATCH 030/557] Update platform flags and debug command template --- .../micro/tools/make/helper_functions.inc | 2 +- .../micro/tools/make/targets/arc_makefile.inc | 10 +++++---- .../tools/make/targets/emsdp_makefile.inc | 22 +++++++++++++------ .../make/templates/arc/arc_app_makefile.tpl | 6 +++-- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 0e21e02bc07..8d321d42490 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -147,7 +147,7 @@ $(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_ sed -E 's#\%\{BIN_RULE\}\%#$(ARC_BIN_RULE)#g' | \ sed -E 's#\%\{EXTRA_RM_TARGETS\}\%#$(ARC_EXTRA_RM_TARGETS)#g' | \ sed -E 's#\%\{APP_RUN_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \ - sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \ + sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_DEBUG_CMD)#g' | \ sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@ diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index e6505cd187b..1b30e6ac6d0 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -46,12 +46,14 @@ endif ARC_EXTRA_EXECUTE_RULES = - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) - PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - PLATFORM_FLAGS += -tcf_core_config + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config + PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + + # Use compact CRT. It requires pre-defined heap size + PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) - PLATFORM_LDFLAGS = -Hnocopyr -m -Hldopt=-Coutput=memory.map + PLATFORM_LDFLAGS += -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K ifneq ($(LCF_FILE), ) PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) MAKE_PROJECT_FILES += $(notdir $(LCF_FILE)) diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc index aeeb7fc178f..86e9d9e7379 100644 --- a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc @@ -33,24 +33,32 @@ ifeq ($(TARGET), emsdp) TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf - MAKE_PROJECT_FILES += emsdp_em11d_dfss.tcf emsdp.lcf uboot.env + MAKE_PROJECT_FILES += $(notdir $(TCF_FILE)) $(notdir $(LCF_FILE)) uboot.env - ARC_TARGET_FILES_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp + ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE)) +ifneq ($(dir $(TCF_FILE)), $(dir $(LCF_FILE))) + ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) +endif # TODO: LESS TCF/LCF Variables # The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. # This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. TCF_FILE_NAME = $(notdir $(TCF_FILE)) -# THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config + PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + + # Use compact CRT. It requires pre-defined heap size + PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset + + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - PLATFORM_FLAGS += -tcf_core_config - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map + # for default EMSD configuration we can use defaul em9d rt libs + # for better performance runime should be rebuilt for emsdp configuration + PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio # DMITRYZ: I think we need to move it to target specific LCF file. PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) -# THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) diff --git a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl index 5bbcb7d3f71..f79d04b26d1 100644 --- a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl +++ b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl @@ -68,6 +68,8 @@ DBG_ARGS ?= RUN_ARGS ?= +EXT_CFLAGS ?= + CXXFLAGS += %{CXX_FLAGS}% CCFLAGS += %{CC_FLAGS}% @@ -93,10 +95,10 @@ $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS))) .PHONY: all app flash clean run debug %.o: %.cc - $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + $(CXX) $(CXXFLAGS) $(EXT_CFLAGS) $(INCLUDES) -c $< -o $@ %.o: %.c - $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ + $(CC) $(CCFLAGS) $(EXT_CFLAGS) $(INCLUDES) -c $< -o $@ $(OUT_NAME): $(OBJS) $(LD) $(CXXFLAGS) -o $@ -Ccrossref $(OBJS) $(LDFLAGS) From 7c15ad0e98c1ba9234117fb160c082ef11108b46 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 3 Apr 2020 16:10:34 +0300 Subject: [PATCH 031/557] ARC platform common make parts was moved to a separate file --- .../tools/make/targets/arc/arc_common.inc | 185 ++++++++++++++++++ .../micro/tools/make/targets/arc_makefile.inc | 151 +------------- .../tools/make/targets/emsdp_makefile.inc | 148 ++------------ 3 files changed, 207 insertions(+), 277 deletions(-) create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc new file mode 100644 index 00000000000..e20887abb07 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -0,0 +1,185 @@ +# Common Settings for ARC platform and it's projects. +# Might be reused across different targets + +ifeq ($(TARGET_ARCH), arc) + + DLR := $$$$ + + # List of folders to search project files for copy with path changing + # For instance, TCF and LCF files are copyed into the root of generated project + ARC_TARGET_FILES_DIRS ?= + + # For the following variables see arc_app_makefile.tpl for usage + + # Additional text into application settings section of arc makefile project + ARC_EXTRA_APP_SETTINGS ?= + + # Additional text into application general rules of arc makefile project + ARC_EXTRA_APP_RULES ?= + + # additional arguments for RM command of "clean" target rule ("make clean" command) + ARC_EXTRA_RM_TARGETS ?= + + # Dependencies of "flash" target rule ("make flash" command) + ARC_BIN_DEPEND ?= + + # Commands in "flash" target rule ("make flash" command) + ARC_BIN_RULE ?= \t$(DLR)\(error Flash rule isnt defined for this ARC target\) + + # Command to run app on "make run" command of generated project + ARC_APP_RUN_CMD ?= + + # Command to run app on "make debug" command of generated project + ARC_APP_DEBUG_CMD ?= + + # Additional text into application execution rules of arc makefile project + ARC_EXTRA_EXECUTE_RULES ?= + +# We overwrite project generator to exclude everything not relevant to ARC platform. +# ARC targets doesn't can't work with mbed, keil or other architecture specific development tools +# Basic make project is updated to be applicable for general ARC platform +define generate_microlite_projects +$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +endef + +# Copy rule generator to do file copyes with changing paths in generated project +# Arguments are: +# 1 - Path files in generated project. +# 2 - Path files in the source repo +# Used in helper_functions.inc for arc projects to copy files +define path_changing_copy_file +$(1)/%: $(2)/% + @mkdir -p $$(dir $$@) + @cp $$< $$@ +endef + +# These are microcontroller-specific rules for converting the ELF output +# of the linker into a binary image that can be loaded directly. +# Not applicable for ARC, leaving it empty. +$(BINDIR)%.bin: + + +ifeq ($(ARC_TOOLCHAIN), mwdt) + CC_TOOL := ccac + AR_TOOL := arac + CXX_TOOL := ccac + LD_TOOL := ccac + + ARC_APP_RUN_CMD = mdb -run -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS\) + ARC_APP_DEBUG_CMD = mdb -OK -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS\) + + # The variable TCF_FILE stores path to Tool Configuration File (*.tcf). + # This file is used by MWDT toolchain to properly compile/run code + TCF_FILE ?= + + LCF_FILE ?= + +# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), +# this variable is used later to add the option to the linker/compiler flags. +# This condition also handles the case when the user/makefile specifies +# the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. +ifneq (,$(findstring .tcf,$(TCF_FILE))) + TCF_FILE_NAME = $(notdir $(TCF_FILE)) + ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE)) + MAKE_PROJECT_FILES += $(TCF_FILE_NAME) +else + TCF_FILE_NAME = $(TCF_FILE) +endif + + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config + + PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + + # Use compact CRT. It requires pre-defined heap size + PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset + + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) + + PLATFORM_LDFLAGS += -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K + +ifneq ($(LCF_FILE), ) + PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) + MAKE_PROJECT_FILES += $(notdir $(LCF_FILE)) +ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(LCF_FILE))),) + ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) +endif +endif + + CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS)) + CCFLAGS := $(filter-out -std=c11,$(CCFLAGS)) + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) + + CXXFLAGS += $(PLATFORM_FLAGS) + CCFLAGS += $(PLATFORM_FLAGS) + LDFLAGS += $(PLATFORM_LDFLAGS) + + + # TODO: Move/organize embarc_mli usage in an implied way (see ext_libs/cmsis.inc for example + USE_EMBARC_MLI ?= true + +ifeq ($(USE_EMBARC_MLI), true) + # TODO: To understand why it's done here. The same is performed in the higher level MakeFile. + ALL_TAGS += arc + +ifeq ($(PRE_COMPILED_MLI),true) + $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) + + MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include + MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a + + THIRD_PARTY_CC_HDRS += \ + third_party/embarc_osp/LICENSE +else + MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) + + $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) + + MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include + MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a + MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_LIB_DIR)/LICENSE +endif + + THIRD_PARTY_CC_HDRS += $(MLI_LIB) + GENERATED_PROJECT_LIBS += $(MLI_LIB) + + INCLUDES += \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api + + GENERATED_PROJECT_INCLUDES += \ + -I. \ + -I./third_party/$(MLI_INCLUDE_FOLDER) \ + -I./third_party/$(MLI_INCLUDE_FOLDER)/api + + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf + +endif # USE_EMBARC_MLI + +endif # ARC_TOOLCHAIN +endif # TARGET_ARCH + diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 1b30e6ac6d0..87d1b736807 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -8,157 +8,18 @@ ifeq ($(TARGET_ARCH), arc) # Known target are specifyed with their own make configurations. ifeq ($(filter $(TARGET), emsdp iotdk),) +ARC_TOOLCHAIN := mwdt + ifneq ($(TCF_FILE), ) TARGET = $(basename $(notdir $(TCF_FILE))) else + $(warning TCF_FILE variable is not specifyed. Use default em7d_voice_audio configuration) TARGET = em7d_voice_audio TCF_FILE = em7d_voice_audio endif -# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. -# This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. -ifneq (,$(findstring .tcf,$(TCF_FILE))) - TCF_FILE_NAME = $(notdir $(TCF_FILE)) - ARC_TARGET_FILES_DIRS := $(dir $(TCF_FILE)) - MAKE_PROJECT_FILES += $(TCF_FILE_NAME) -else - TCF_FILE_NAME = $(TCF_FILE) -endif +include $(MAKEFILE_DIR)/targets/arc/arc_common.inc - CC_TOOL = ccac - AR_TOOL = arac - CXX_TOOL = ccac - LD_TOOL = ccac - - # TODO: Move this to a common arc/arc_common.inc file to share this with other targets - DLR = $$$$ - ARC_EXTRA_APP_SETTINGS = - - ARC_EXTRA_APP_RULES = - - ARC_EXTRA_RM_TARGETS = - - ARC_BIN_DEPEND = - ARC_BIN_RULE = \t$(DLR)\(error Flash rule isnt defined for this ARC target\) - - ARC_APP_RUN_CMD = mdb -run -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS) - ARC_APP_DEBUG_CMD = mdb -OK -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS) - ARC_EXTRA_EXECUTE_RULES = - - - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config - PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - - # Use compact CRT. It requires pre-defined heap size - PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset - - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) - PLATFORM_LDFLAGS += -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K -ifneq ($(LCF_FILE), ) - PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) - MAKE_PROJECT_FILES += $(notdir $(LCF_FILE)) -ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir LCF_FILE)),) - ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) -endif -endif - - CXXFLAGS += $(PLATFORM_FLAGS) - CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) - CCFLAGS += $(PLATFORM_FLAGS) - CCFLAGS:=$(filter-out -std=c11,$(CCFLAGS)) - LDFLAGS += $(PLATFORM_LDFLAGS) - - - MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) - - # TODO: Move/organize embarc_mli usage in an implied way (see ext_libs/cmsis.inc for example - USE_EMBARC_MLI ?= true - -ifeq ($(USE_EMBARC_MLI), true) - # TODO: To understand why it's done here. The same is performed in the higher level MakeFile. - ALL_TAGS += arc - -ifeq ($(PRE_COMPILED_MLI),true) - $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else - MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) - - $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) - - MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include - MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a - MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_LIB_DIR)/LICENSE -endif - - THIRD_PARTY_CC_HDRS += $(MLI_LIB) - GENERATED_PROJECT_LIBS += $(MLI_LIB) - - INCLUDES += \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api - - GENERATED_PROJECT_INCLUDES += \ - -I. \ - -I./third_party/$(MLI_INCLUDE_FOLDER) \ - -I./third_party/$(MLI_INCLUDE_FOLDER)/api - - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf - -endif # USE_EMBARC_MLI - -# We overwrite project generator to exclude everything not relevant to ARC platform -define generate_microlite_projects -$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) -$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) -endef - -# Copy rule generator to do file copyes with changing paths in generated project -# Arguments are: -# 1 - Path files in generated project. -# 2 - Path files in the source repo -# Used in helper_functions.inc for arc projects to copy files -define path_changing_copy_file -$(1)/%: $(2)/% - @mkdir -p $$(dir $$@) - @cp $$< $$@ -endef - -# These are microcontroller-specific rules for converting the ELF output -# of the linker into a binary image that can be loaded directly. -# Not applicable for ARC, leaving it empty. -$(BINDIR)%.bin: - - -endif # ifeq ($(filter $(TARGET),$(ARC_PREDEFINED_TARGETS)),) -endif # ifeq ($(TARGET_ARCH), arc) +endif # $(TARGET) +endif # $(TARGET_ARCH)... diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc index 86e9d9e7379..9901fd82b07 100644 --- a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc @@ -1,14 +1,16 @@ # Settings for arc processors ifeq ($(TARGET), emsdp) - TARGET_ARCH = arc + TARGET_ARCH := arc + ARC_TOOLCHAIN := mwdt + + TCF_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf + LCF_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf + UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env + UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE)) + +include $(MAKEFILE_DIR)/targets/arc/arc_common.inc - CC_TOOL = ccac - AR_TOOL = arac - CXX_TOOL = ccac - LD_TOOL = ccac - - DLR = $$$$ ARC_EXTRA_APP_SETTINGS = \ BIN_DIR = .$(DLR)\(PS\)bin\n\ BIN_FILE = $(DLR)\(BIN_DIR\)$(DLR)\(PS\)app.elf\n @@ -16,7 +18,7 @@ ifeq ($(TARGET), emsdp) ARC_EXTRA_APP_RULES = \ $(DLR)\(BIN_FILE\): $(DLR)\(BIN_DIR\) $(DLR)\(OUT_NAME\)\ \n\t\@$(DLR)\(CP\) $(DLR)\(OUT_NAME\) $(DLR)\(BIN_FILE\)\ - \n\t\@$(DLR)\(CP\) uboot.env $(DLR)\(BIN_DIR\)$(DLR)\(PS)uboot.env\ + \n\t\@$(DLR)\(CP\) $(UBOOT_FILE_NAME) $(DLR)\(BIN_DIR\)$(DLR)\(PS\)$(UBOOT_FILE_NAME)\ \n \ \n$(DLR)\(BIN_DIR\):\ \n\t\@$(DLR)\(MKDIR\) $(DLR)\(BIN_DIR\)\ @@ -26,135 +28,17 @@ ifeq ($(TARGET), emsdp) ARC_BIN_DEPEND = $(DLR)\(BIN_DIR\) $(DLR)\(BIN_FILE\) ARC_BIN_RULE = \t@echo Copy content of $(DLR)\(BIN_DIR\) into the root of SD card and follow instructions - ARC_APP_RUN_CMD = mdb -run -digilent -nooptions $(DLR)\(DBG_ARGS) - ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS) + ARC_APP_RUN_CMD = mdb -run -digilent -nooptions $(DLR)\(DBG_ARGS\) + ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS\) ARC_EXTRA_EXECUTE_RULES = - TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf - LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf - - MAKE_PROJECT_FILES += $(notdir $(TCF_FILE)) $(notdir $(LCF_FILE)) uboot.env - - ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE)) -ifneq ($(dir $(TCF_FILE)), $(dir $(LCF_FILE))) - ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) + MAKE_PROJECT_FILES += $(UBOOT_FILE_NAME) +ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(UBOOT_FILE))),) + ARC_TARGET_FILES_DIRS += $(dir $(UBOOT_FILE)) endif -# TODO: LESS TCF/LCF Variables -# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. -# This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. - TCF_FILE_NAME = $(notdir $(TCF_FILE)) - - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config - PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - - # Use compact CRT. It requires pre-defined heap size - PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset - - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K - # for default EMSD configuration we can use defaul em9d rt libs - # for better performance runime should be rebuilt for emsdp configuration + # for better performance runtime should be built for emsdp configuration PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio -# DMITRYZ: I think we need to move it to target specific LCF file. - PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) - - CXXFLAGS += $(PLATFORM_FLAGS) - CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) - CCFLAGS += $(PLATFORM_FLAGS) - CCFLAGS:=$(filter-out -std=c11,$(CCFLAGS)) - LDFLAGS += $(PLATFORM_LDFLAGS) - - MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) - - # DMITRYZ: Here we need to check tags on "no_embarc_mli". - USE_EMBARC_MLI ?= true - -ifeq ($(USE_EMBARC_MLI), true) - ALL_TAGS += arc - -ifeq ($(PRE_COMPILED_MLI),true) - $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else - MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) - - $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) - - MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include - MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a - MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_LIB_DIR)/LICENSE -endif - - THIRD_PARTY_CC_HDRS += $(MLI_LIB) - GENERATED_PROJECT_LIBS += $(MLI_LIB) - - INCLUDES += \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api - - GENERATED_PROJECT_INCLUDES += \ - -I. \ - -I./third_party/$(MLI_INCLUDE_FOLDER) \ - -I./third_party/$(MLI_INCLUDE_FOLDER)/api - - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf - -endif # USE_EMBARC_MLI - -# We overwrite project generator to exclude everything not relevant to ARC platform -define generate_microlite_projects -$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) -$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) -endef - -# Copy rule generator to do file copyes changing paths in generated project -# Arguments are: -# 1 - Path files in generated project. -# 2 - Path files in the source repo -# Used in helper_functions.inc for arc projects to copy files -define path_changing_copy_file -$(1)/%: $(2)/% - @mkdir -p $$(dir $$@) - @cp $$< $$@ -endef - - - -# These are microcontroller-specific rules for converting the ELF output -# of the linker into a binary image that can be loaded directly. - -# Not applicable for ARC, leaving it empty. -$(BINDIR)%.bin: - endif From 2226b67dc3bb0a55b30a6599a94454715afba102 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Tue, 7 Apr 2020 12:53:32 +0300 Subject: [PATCH 032/557] changed EMSDP to ARC_EMSDP and other minor fixes regarding guidline --- .../micro/{emsdp => arc_emsdp}/debug_log.cc | 82 ++++++++++--------- .../micro/tools/make/download_and_extract.sh | 3 +- .../tools/make/targets/arc/arc_common.inc | 22 ++++- ...dp_makefile.inc => arc_emsdp_makefile.inc} | 20 ++++- .../micro/tools/make/targets/arc_makefile.inc | 21 ++++- .../make/templates/arc/arc_app_makefile.tpl | 22 ----- 6 files changed, 97 insertions(+), 73 deletions(-) rename tensorflow/lite/micro/{emsdp => arc_emsdp}/debug_log.cc (55%) rename tensorflow/lite/micro/tools/make/targets/{emsdp_makefile.inc => arc_emsdp_makefile.inc} (66%) diff --git a/tensorflow/lite/micro/emsdp/debug_log.cc b/tensorflow/lite/micro/arc_emsdp/debug_log.cc similarity index 55% rename from tensorflow/lite/micro/emsdp/debug_log.cc rename to tensorflow/lite/micro/arc_emsdp/debug_log.cc index 7d932939a0b..57eea6a5579 100644 --- a/tensorflow/lite/micro/emsdp/debug_log.cc +++ b/tensorflow/lite/micro/arc_emsdp/debug_log.cc @@ -1,4 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,16 +23,20 @@ limitations under the License. // EMSDP_LOG_TO_MEMORY // : fill .debug_log memory region (data section) with passed chars. // EMSDP_LOG_TO_HOST -// : Use hostlink to print output log. +// : Use MetaWare HostLink to print output log. Requires Synopsys MetaWare debugger // EMSDP_LOG_TO_UART // : use default debug UART (out to FTDI channel 0). The same USB Port is used for JTAG. #define EMSDP_LOG_TO_UART +// Memory size for symbols dump in EMSDP_LOG_TO_MEMORY destination +#define EMSDP_LOG_TO_MEMORY_SIZE (2 * 1024) -// For simplicity we assume U-boot has already initialized debug console durion -// application loading (or on reset). Hence we use only status and data registers + +// For simplicity we assume U-boot has already initialized debug console during +// application loading (or on reset). Hence, we use only status and data registers // to organize blocking loop for printing symbols. No input and no IRQ handling. // See embarc_osp repository for full EMSDP uart driver. +// (https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp) // TODO: Consider U-Boot API to do it in a less "hacky" way. void DbgUartSendStr(const char* s) { #define EMSDP_DBG_UART_BASE (0xF0004000U) @@ -40,48 +44,48 @@ void DbgUartSendStr(const char* s) { #define DW_UART_USR_TFNF (0x02) #define DW_UART_LSR_TXD_EMPTY (0x20) - typedef volatile struct dw_uart_reg { - uint32_t DATA; /*!< data in/out and DLL */ - uint32_t RES1[4]; - uint32_t LSR; /*!< Line Status Register */ - uint32_t RES2[25]; - uint32_t USR; /*!< UART status register */ - uint32_t RES3[29]; - uint32_t CPR; /*!< Component parameter register */ - } DW_UART_REG; + typedef volatile struct dw_uart_reg { + uint32_t DATA; /*!< data in/out and DLL */ + uint32_t RES1[4]; + uint32_t LSR; /*!< Line Status Register */ + uint32_t RES2[25]; + uint32_t USR; /*!< UART status register */ + uint32_t RES3[29]; + uint32_t CPR; /*!< Component parameter register */ + } DW_UART_REG; - DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); - const char* src = s; - while (*src) { - // Check uart status to send char - bool uart_is_ready = false; - if (uart_reg_ptr->CPR & DW_UART_CPR_FIFO_STAT) - uart_is_ready = ((uart_reg_ptr->USR & DW_UART_USR_TFNF) != 0); - else - uart_is_ready = ((uart_reg_ptr->LSR & DW_UART_LSR_TXD_EMPTY) != 0); + DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); + const char* src = s; + while (*src) { + // Check uart status to send char + bool uart_is_ready = false; + if (uart_reg_ptr->CPR & DW_UART_CPR_FIFO_STAT) + uart_is_ready = ((uart_reg_ptr->USR & DW_UART_USR_TFNF) != 0); + else + uart_is_ready = ((uart_reg_ptr->LSR & DW_UART_LSR_TXD_EMPTY) != 0); - // Send char if uart is ready. - if (uart_is_ready) - uart_reg_ptr->DATA = *src++; - } + // Send char if uart is ready. + if (uart_is_ready) + uart_reg_ptr->DATA = *src++; + } } -// Simple symbols dump to a pre-allocated memory region. +// Simple dump of symbols to a pre-allocated memory region. +// When total log exceeds memory region size, cursor is moved to its begining. // The memory region can be viewed afterward with debugger. // It can be viewed/read with debugger afterward. void LogToMem(const char* s) { - constexpr int kDebugLogMemChars = 2 * 1024; - static int cursor = 0; + static int cursor = 0; #pragma Bss(".debug_log") - volatile static char debug_log_mem[kDebugLogMemChars]; + volatile static char debug_log_mem[EMSDP_LOG_TO_MEMORY_SIZE]; #pragma Bss() - const char* src = s; - while (*src) { - debug_log_mem[cursor] = *src++; - cursor = (cursor < kDebugLogMemChars) ? cursor + 1 : 0; - } - debug_log_mem[cursor] = '^'; + const char* src = s; + while (*src) { + debug_log_mem[cursor] = *src++; + cursor = (cursor < EMSDP_LOG_TO_MEMORY_SIZE) ? cursor + 1 : 0; + } + debug_log_mem[cursor] = '^'; } @@ -89,17 +93,17 @@ extern "C" void DebugLog(const char* s) { #ifndef TF_LITE_STRIP_ERROR_STRINGS #if defined EMSDP_LOG_TO_UART - DbgUartSendStr(s); + DbgUartSendStr(s); #endif #if defined EMSDP_LOG_TO_MEMORY #warning "EMSDP_LOG_TO_MEMORY is defined. View .debug_log memory region for stdout" - LogToMem(s); + LogToMem(s); #endif #if defined EMSDP_LOG_TO_HOST #warning "EMSDP_LOG_TO_HOST is defined. Ensure hostlib is linked." - fprintf(stderr, "%s", s); + fprintf(stderr, "%s", s); #endif #endif // TF_LITE_STRIP_ERROR_STRINGS diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 4a75b6b24cd..5b06e4e819a 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -170,7 +170,8 @@ download_and_extract() { elif [[ ${action} == "patch_cifar10_dataset" ]]; then patch_cifar10_dataset ${dir} elif [[ ${action} == "build_embarc_mli" ]]; then - build_embarc_mli ${dir} ${action_param1} + cp ${action_param1} ${dir}/hw/arc.tcf + build_embarc_mli ${dir} ../../hw/arc.tcf elif [[ ${action} ]]; then echo "Unknown action '${action}'" exit 1 diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index e20887abb07..50bb5c96799 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -1,4 +1,18 @@ -# Common Settings for ARC platform and it's projects. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Common Settings for ARC platform and its projects. # Might be reused across different targets ifeq ($(TARGET_ARCH), arc) @@ -6,7 +20,7 @@ ifeq ($(TARGET_ARCH), arc) DLR := $$$$ # List of folders to search project files for copy with path changing - # For instance, TCF and LCF files are copyed into the root of generated project + # For instance, TCF and LCF files are copied into the root of generated project ARC_TARGET_FILES_DIRS ?= # For the following variables see arc_app_makefile.tpl for usage @@ -36,14 +50,14 @@ ifeq ($(TARGET_ARCH), arc) ARC_EXTRA_EXECUTE_RULES ?= # We overwrite project generator to exclude everything not relevant to ARC platform. -# ARC targets doesn't can't work with mbed, keil or other architecture specific development tools +# ARC targets cannot work with non-ARC development tools. # Basic make project is updated to be applicable for general ARC platform define generate_microlite_projects $(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) $(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) endef -# Copy rule generator to do file copyes with changing paths in generated project +# Copy rule generator to do file copies with changing paths in generated project # Arguments are: # 1 - Path files in generated project. # 2 - Path files in the source repo diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc similarity index 66% rename from tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc rename to tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc index 9901fd82b07..a84dd15e4e8 100644 --- a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc @@ -1,5 +1,19 @@ -# Settings for arc processors -ifeq ($(TARGET), emsdp) +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Settings for EMSDP target (ARC processor) +ifeq ($(TARGET), arc_emsdp) TARGET_ARCH := arc ARC_TOOLCHAIN := mwdt @@ -37,7 +51,7 @@ ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(UBOOT_FILE))),) ARC_TARGET_FILES_DIRS += $(dir $(UBOOT_FILE)) endif - # for default EMSD configuration we can use defaul em9d rt libs + # for default EMSD configuration we can use default em9d rt libs # for better performance runtime should be built for emsdp configuration PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 87d1b736807..db474a54b2d 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -1,19 +1,32 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Settings for not pre-defined ARC processors. # User need to specify ARC target with Tool Configuration File (*.tcf). # Path to this file must be passed through TCF_FILE variable. # Otherwise, default em7d_voice_audio configuration is used - ifeq ($(TARGET_ARCH), arc) -# Known target are specifyed with their own make configurations. -ifeq ($(filter $(TARGET), emsdp iotdk),) +# Known target are specified with their own make configurations. +ifeq ($(filter $(TARGET), arc_emsdp arc_iotdk),) ARC_TOOLCHAIN := mwdt ifneq ($(TCF_FILE), ) TARGET = $(basename $(notdir $(TCF_FILE))) else - $(warning TCF_FILE variable is not specifyed. Use default em7d_voice_audio configuration) + $(warning TCF_FILE variable is not specified. Use default em7d_voice_audio configuration) TARGET = em7d_voice_audio TCF_FILE = em7d_voice_audio endif diff --git a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl index f79d04b26d1..a1a3ab71028 100644 --- a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl +++ b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl @@ -29,28 +29,6 @@ else DEV_NULL=/dev/null endif -# Note: Windows escaping rules is very combersome -# initially I tried to use Q=^, but this depends on the context and (looks like) on Win version. -# Also expecially ugly thing is that in quoted strings the quotes the same are remain. -# Batch has special parameter expansion syntax to remove quotes, -# but many tools themselves remove quotes (unless escaped with backslash) -# So finally we've found that in our use cases we may not escaping any symbols but prepend backslashes before quotes. - -quote=$(subst %,$(Q)%, \ - $(subst &,$(Q)&, \ - $(subst <,$(Q)<, \ - $(subst >,$(Q)>, \ - $(subst |,$(Q)|, \ - $(subst ',$(Q)', \ - $(subst $(COMMA),$(Q)$(COMMA), \ - $(subst =,$(Q)=, \ - $(subst $(OPEN_PAREN),$(Q)$(OPEN_PAREN), \ - $(subst $(CLOSE_PAREN),$(Q)$(CLOSE_PAREN), \ - $(subst !,$(Q)!, \ - $(subst ",$(BACKSLASH)", \ - $(subst $(Q),$(Q)$(Q), \ - $(1) ))))))))))))) - #============================================================= # Toolchain definitions #============================================================= From a7dcdb21f69ca8a5078ad855044e76fefa4f0199 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 8 Apr 2020 15:11:41 +0300 Subject: [PATCH 033/557] Move out of function ARC EMSDP UART related constatnts --- tensorflow/lite/micro/arc_emsdp/debug_log.cc | 33 +++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/arc_emsdp/debug_log.cc b/tensorflow/lite/micro/arc_emsdp/debug_log.cc index 57eea6a5579..b3b25f88ac1 100644 --- a/tensorflow/lite/micro/arc_emsdp/debug_log.cc +++ b/tensorflow/lite/micro/arc_emsdp/debug_log.cc @@ -31,6 +31,24 @@ limitations under the License. // Memory size for symbols dump in EMSDP_LOG_TO_MEMORY destination #define EMSDP_LOG_TO_MEMORY_SIZE (2 * 1024) +// EMSDP Debug UART related defines (registers and bits) +#define EMSDP_DBG_UART_BASE (0xF0004000U) +#define DW_UART_CPR_FIFO_STAT (1 << 10) +#define DW_UART_USR_TFNF (0x02) +#define DW_UART_LSR_TXD_EMPTY (0x20) + +// EMSDP UART registers map (only necessairy fields) +typedef volatile struct dw_uart_reg { + uint32_t DATA; /* data in/out and DLL */ + uint32_t RES1[4]; + uint32_t LSR; /* Line Status Register */ + uint32_t RES2[25]; + uint32_t USR; /* UART status register */ + uint32_t RES3[29]; + uint32_t CPR; /* Component parameter register */ +} DW_UART_REG; + + // For simplicity we assume U-boot has already initialized debug console during // application loading (or on reset). Hence, we use only status and data registers @@ -39,21 +57,6 @@ limitations under the License. // (https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp) // TODO: Consider U-Boot API to do it in a less "hacky" way. void DbgUartSendStr(const char* s) { -#define EMSDP_DBG_UART_BASE (0xF0004000U) -#define DW_UART_CPR_FIFO_STAT (1<<10) -#define DW_UART_USR_TFNF (0x02) -#define DW_UART_LSR_TXD_EMPTY (0x20) - - typedef volatile struct dw_uart_reg { - uint32_t DATA; /*!< data in/out and DLL */ - uint32_t RES1[4]; - uint32_t LSR; /*!< Line Status Register */ - uint32_t RES2[25]; - uint32_t USR; /*!< UART status register */ - uint32_t RES3[29]; - uint32_t CPR; /*!< Component parameter register */ - } DW_UART_REG; - DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); const char* src = s; while (*src) { From 105eac5030a346febc615202a4841330f2779c0b Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 8 Apr 2020 17:40:54 +0300 Subject: [PATCH 034/557] Include new parameters of generate_project for arc --- tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index 50bb5c96799..67be50d4854 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -53,7 +53,7 @@ ifeq ($(TARGET_ARCH), arc) # ARC targets cannot work with non-ARC development tools. # Basic make project is updated to be applicable for general ARC platform define generate_microlite_projects -$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES),$(TARGET_TOOLCHAIN_ROOT),$(TARGET_TOOLCHAIN_PREFIX)) $(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) endef From e85244f2c3833f63653a92081e75f3cb2412ccc3 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 9 Apr 2020 15:12:31 +0300 Subject: [PATCH 035/557] Fix arc target list and build for built-in arc configurations --- tensorflow/lite/micro/tools/make/download_and_extract.sh | 8 ++++++-- tensorflow/lite/micro/tools/make/targets/arc_makefile.inc | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 5b06e4e819a..3ab7c3ba7bd 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -170,8 +170,12 @@ download_and_extract() { elif [[ ${action} == "patch_cifar10_dataset" ]]; then patch_cifar10_dataset ${dir} elif [[ ${action} == "build_embarc_mli" ]]; then - cp ${action_param1} ${dir}/hw/arc.tcf - build_embarc_mli ${dir} ../../hw/arc.tcf + if [[ "${action_param1}" == *.tcf ]]; then + cp ${action_param1} ${dir}/hw/arc.tcf + build_embarc_mli ${dir} ../../hw/arc.tcf + else + build_embarc_mli ${dir} ${action_param1} + fi elif [[ ${action} ]]; then echo "Unknown action '${action}'" exit 1 diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index db474a54b2d..d379eea86f1 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -19,7 +19,7 @@ ifeq ($(TARGET_ARCH), arc) # Known target are specified with their own make configurations. -ifeq ($(filter $(TARGET), arc_emsdp arc_iotdk),) +ifeq ($(filter $(TARGET), arc_emsdp),) ARC_TOOLCHAIN := mwdt From 3006c316b64077a6bad64f42cb5e879351072b29 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 13 Apr 2020 11:22:46 +0300 Subject: [PATCH 036/557] embARC MLI related code as an external library which might be turned-off --- .../micro/kernels/{arc => embarc_mli}/conv.cc | 8 +-- .../{arc => embarc_mli}/depthwise_conv.cc | 8 +-- .../{arc => embarc_mli}/fully_connected.cc | 9 ++- .../{arc => embarc_mli}/mli_slicers.cc | 0 .../kernels/{arc => embarc_mli}/mli_slicers.h | 0 .../{arc => embarc_mli}/mli_tf_utils.h | 0 .../kernels/{arc => embarc_mli}/pooling.cc | 8 +-- .../{arc => embarc_mli}/scratch_buf_mgr.cc | 4 +- .../{arc => embarc_mli}/scratch_buf_mgr.h | 0 .../{arc => embarc_mli}/scratch_buffers.cc | 2 +- .../{arc => embarc_mli}/scratch_buffers.h | 0 .../micro/tools/make/ext_libs/embarc_mli.inc | 67 +++++++++++++++++++ .../tools/make/targets/arc/arc_common.inc | 63 ----------------- 13 files changed, 86 insertions(+), 83 deletions(-) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/conv.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/depthwise_conv.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/fully_connected.cc (97%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/mli_slicers.cc (100%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/mli_slicers.h (100%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/mli_tf_utils.h (100%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/pooling.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/scratch_buf_mgr.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/scratch_buf_mgr.h (100%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/scratch_buffers.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/scratch_buffers.h (100%) create mode 100644 tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/embarc_mli/conv.cc similarity index 98% rename from tensorflow/lite/micro/kernels/arc/conv.cc rename to tensorflow/lite/micro/kernels/embarc_mli/conv.cc index 6cf26c7d6d9..b124b17f66d 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/conv.cc @@ -24,10 +24,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc similarity index 98% rename from tensorflow/lite/micro/kernels/arc/depthwise_conv.cc rename to tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc index 74e48c8c064..0ad2a9fe6c6 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc @@ -25,10 +25,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/arc/fully_connected.cc b/tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc similarity index 97% rename from tensorflow/lite/micro/kernels/arc/fully_connected.cc rename to tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc index cc9b95c570a..8088634f8de 100644 --- a/tensorflow/lite/micro/kernels/arc/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc @@ -23,14 +23,13 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" #include "mli_api.h" - namespace tflite { namespace ops { namespace micro { diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc b/tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc similarity index 100% rename from tensorflow/lite/micro/kernels/arc/mli_slicers.cc rename to tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.h b/tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h similarity index 100% rename from tensorflow/lite/micro/kernels/arc/mli_slicers.h rename to tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h diff --git a/tensorflow/lite/micro/kernels/arc/mli_tf_utils.h b/tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h similarity index 100% rename from tensorflow/lite/micro/kernels/arc/mli_tf_utils.h rename to tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/embarc_mli/pooling.cc similarity index 98% rename from tensorflow/lite/micro/kernels/arc/pooling.cc rename to tensorflow/lite/micro/kernels/embarc_mli/pooling.cc index 7a26a10e23b..a147171a859 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/pooling.cc @@ -20,10 +20,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc similarity index 98% rename from tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc rename to tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc index 5bd2d6aed22..8d00e28714c 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" #include #define MAX(A,B) (((A) > (B))? (A): (B)) #define MIN(A,B) (((A) > (B))? (B): (A)) diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h similarity index 100% rename from tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h rename to tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc similarity index 98% rename from tensorflow/lite/micro/kernels/arc/scratch_buffers.cc rename to tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc index f36059f82d2..689c490569e 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" #include #define MAX(A,B) (((A) > (B))? (A): (B)) #define MIN(A,B) (((A) > (B))? (B): (A)) diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h similarity index 100% rename from tensorflow/lite/micro/kernels/arc/scratch_buffers.h rename to tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h diff --git a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc new file mode 100644 index 00000000000..851a5d43378 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc @@ -0,0 +1,67 @@ +ifeq ($(TARGET_ARCH), arc) + +# embarc_mli Library is used by default for ARC platform whenever it's possible. +# To use TFLM reference implementation it should be intentionally turned off +# by passing 'no_embarc_mli' tag (make -f TAGS=no_embarc_mli ...) +ifeq ($(filter no_embarc_mli,$(ALL_TAGS)),) + + +ALL_TAGS += embarc_mli + +ifeq ($(PRE_COMPILED_MLI),true) + # TODO: Replace with proper embarc_mli pre-builts. + $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) + + MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include + MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a + + THIRD_PARTY_CC_HDRS += \ + third_party/embarc_osp/LICENSE +else + MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) + + $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) + + MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include + MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a + MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_LIB_DIR)/LICENSE +endif + + THIRD_PARTY_CC_HDRS += $(MLI_LIB) + GENERATED_PROJECT_LIBS += $(MLI_LIB) + + INCLUDES += \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api + + GENERATED_PROJECT_INCLUDES += \ + -I. \ + -I./third_party/$(MLI_INCLUDE_FOLDER) \ + -I./third_party/$(MLI_INCLUDE_FOLDER)/api + + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h + +endif # no_embarc_mli +endif # TARGET_ARCH diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index 67be50d4854..4a9a5ccdfc3 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -129,70 +129,7 @@ endif LDFLAGS += $(PLATFORM_LDFLAGS) - # TODO: Move/organize embarc_mli usage in an implied way (see ext_libs/cmsis.inc for example - USE_EMBARC_MLI ?= true -ifeq ($(USE_EMBARC_MLI), true) - # TODO: To understand why it's done here. The same is performed in the higher level MakeFile. - ALL_TAGS += arc - -ifeq ($(PRE_COMPILED_MLI),true) - $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else - MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) - - $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) - - MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include - MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a - MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_LIB_DIR)/LICENSE -endif - - THIRD_PARTY_CC_HDRS += $(MLI_LIB) - GENERATED_PROJECT_LIBS += $(MLI_LIB) - - INCLUDES += \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api - - GENERATED_PROJECT_INCLUDES += \ - -I. \ - -I./third_party/$(MLI_INCLUDE_FOLDER) \ - -I./third_party/$(MLI_INCLUDE_FOLDER)/api - - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf - -endif # USE_EMBARC_MLI endif # ARC_TOOLCHAIN endif # TARGET_ARCH From 03bec25ed962226e59d9d4a8b23a55540ab33ca9 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 13 Apr 2020 14:06:35 +0300 Subject: [PATCH 037/557] Additional tests for embARC MLI specific slicing (initial mock version) --- .../kernels/embarc_mli/conv_slicing_test.cc | 629 ++++++++++ .../embarc_mli/depthwise_conv_slicing_test.cc | 768 ++++++++++++ .../fully_connected_slicing_test.cc | 938 ++++++++++++++ .../embarc_mli/pooling_slicing_test.cc | 1116 +++++++++++++++++ .../micro/tools/make/ext_libs/embarc_mli.inc | 11 +- 5 files changed, 3461 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc create mode 100644 tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc create mode 100644 tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc create mode 100644 tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc new file mode 100644 index 00000000000..a1f155ecc56 --- /dev/null +++ b/tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc @@ -0,0 +1,629 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/micro/micro_utils.h" +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +// Common inputs and outputs. +static const int kInputElements = 16; +static const int kInputShape[] = {4, 2, 2, 4, 1}; +static const float kInputData[] = {1, 1, 1, 1, 2, 2, 2, 2, + 1, 2, 3, 4, 1, 2, 3, 4}; +static const int kFilterElements = 12; +static const int kFilterShape[] = {4, 3, 2, 2, 1}; +static const float kFilterData[] = {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1}; +static const int kBiasElements = 3; +static const int kBiasShape[] = {1, 3}; +static const float kBiasData[] = {1, 2, 3}; +static const int kOutputElements = 12; +static const int kOutputShape[] = {4, 2, 1, 2, 3}; +static const float kGoldenData[] = {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3}; + +static TfLiteConvParams common_conv_params = { + kTfLitePaddingValid, // padding + 2, // stride_width + 2, // stride_height + kTfLiteActNone, // activation + 1, // dilation_width_factor + 1, // dilation_height_factor +}; + +template +TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, + const T* expected_output_data, T* output_data, + int output_length, + TfLiteConvParams* conv_params, + float tolerance = 1e-5) { + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_CONV_2D, 1); + + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + const char* init_data = reinterpret_cast(conv_params); + size_t init_data_size = 0; + void* user_data = nullptr; + + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + + int inputs_array_data[] = {3, 0, 1, 2}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 3}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(conv_params); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TfLiteStatus return_val = registration->invoke(&context, &node); + if (return_val != kTfLiteOk) { + return return_val; + } + + if (registration->free) { + registration->free(&context, user_data); + } + + for (int i = 0; i < output_length; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], + tolerance); + } + return kTfLiteOk; +} + +void TestConvFloat(const int* input_dims_data, const float* input_data, + const int* filter_dims_data, const float* filter_data, + const int* bias_dims_data, const float* bias_data, + const int* output_dims_data, + const float* expected_output_data, float* output_data, + TfLiteConvParams* conv_params) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(filter_data, filter_dims, "filter_tensor"), + CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + ValidateConvGoldens(tensors, tensors_size, expected_output_data, + output_data, output_dims_count, conv_params)); +} + +void TestConvQuantizedPerLayer( + const int* input_dims_data, const float* input_data, + uint8_t* input_quantized, float input_scale, const int* filter_dims_data, + const float* filter_data, uint8_t* filter_quantized, float filter_scale, + const int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, + const int* output_dims_data, const float* expected_output_data, + uint8_t* expected_output_quantized, uint8_t* output_data, + float output_scale, TfLiteConvParams* conv_params) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + tflite::AsymmetricQuantize(expected_output_data, expected_output_quantized, + output_dims_count, output_scale, 128); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, 128, "input_tensor"), + CreateQuantizedTensor(filter_data, filter_quantized, filter_dims, + filter_scale, 128, "filter_tensor"), + CreateQuantizedBiasTensor(bias_data, bias_quantized, bias_dims, + input_scale, filter_scale, "bias_tensor"), + CreateQuantizedTensor(output_data, output_dims, output_scale, 128, + "output_tensor")}; + + // TODO(njeff): Affine Quantization Params should be set on tensor creation. + float filter_scales[] = {1, filter_scale}; + int filter_zero_points[] = {1, 128}; + TfLiteAffineQuantization filter_quant = { + FloatArrayFromFloats(filter_scales), + IntArrayFromInts(filter_zero_points)}; + tensors[1].quantization = {kTfLiteAffineQuantization, &filter_quant}; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + ValidateConvGoldens(tensors, tensors_size, expected_output_quantized, + output_data, output_dims_count, conv_params)); +} + +void TestConvQuantizedPerChannel( + const int* input_dims_data, const float* input_data, + int8_t* input_quantized, float input_scale, int input_zero_point, + const int* filter_dims_data, const float* filter_data, + int8_t* filter_data_quantized, const int* bias_dims_data, + const float* bias_data, int32_t* bias_data_quantized, float* bias_scales, + int* bias_zero_points, const int* output_dims_data, + const float* expected_output_data, int8_t* expected_output_data_quantized, + int8_t* output_data, float output_scale, int output_zero_point, + TfLiteConvParams* conv_params) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + int filter_zero_points[5]; + float filter_scales[5]; + TfLiteAffineQuantization filter_quant; + TfLiteAffineQuantization bias_quant; + TfLiteTensor input_tensor = + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, input_zero_point, "input_tensor"); + TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor( + filter_data, filter_data_quantized, filter_dims, filter_scales, + filter_zero_points, &filter_quant, 0 /* quantized dimension */, + "filter_tensor"); + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( + bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1], + bias_scales, bias_zero_points, &bias_quant, 0 /* quantized dimension */, + "bias_tensor"); + TfLiteTensor output_tensor = + CreateQuantizedTensor(output_data, output_dims, output_scale, + output_zero_point, "output_tensor"); + + // TODO(njeff): Affine Quantization Params should be set on tensor creation. + float input_scales[] = {1, input_scale}; + int input_zero_points[] = {1, input_zero_point}; + TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales), + IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + float output_scales[] = {1, output_scale}; + int output_zero_points[] = {1, output_zero_point}; + TfLiteAffineQuantization output_quant = { + FloatArrayFromFloats(output_scales), + IntArrayFromInts(output_zero_points)}; + output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + tflite::AsymmetricQuantize(expected_output_data, + expected_output_data_quantized, output_dims_count, + output_scale, output_zero_point); + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + ValidateConvGoldens(tensors, tensors_size, expected_output_data_quantized, + output_data, output_dims_count, conv_params, + 1.0 /* tolerance */)); +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(SimpleTestFloat) { + float output_data[tflite::testing::kOutputElements]; + + tflite::testing::TestConvFloat( + tflite::testing::kInputShape, tflite::testing::kInputData, + tflite::testing::kFilterShape, tflite::testing::kFilterData, + tflite::testing::kBiasShape, tflite::testing::kBiasData, + tflite::testing::kOutputShape, tflite::testing::kGoldenData, output_data, + &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(InputAndFilterSameWidthHeight) { + const int output_dims_count = 2; + float output_data[output_dims_count]; + + const int kFilterShape[] = {4, 1, 2, 4, 1}; + const float filter_values[] = {1, 2, 3, 4, -1, -1, 1, 1}; + const int kBiasShape[] = {1, 1}; + const float bias_values[] = {0}; + const int kOutputShape[] = {4, 2, 1, 1, 1}; + const float expected_output[] = {10, 34}; + + tflite::testing::TestConvFloat( + tflite::testing::kInputShape, tflite::testing::kInputData, kFilterShape, + filter_values, kBiasShape, bias_values, kOutputShape, expected_output, + output_data, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantized) { + const int output_dims_count = 12; + uint8_t output_data[output_dims_count]; + + const float input_scale = 0.5f; + const float filter_scale = 0.5f; + const float output_scale = 1.0f; + + uint8_t input_quantized[tflite::testing::kInputElements]; + uint8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + uint8_t golden_quantized[tflite::testing::kOutputElements]; + + tflite::testing::TestConvQuantizedPerLayer( + tflite::testing::kInputShape, tflite::testing::kInputData, + input_quantized, input_scale, tflite::testing::kFilterShape, + tflite::testing::kFilterData, filter_quantized, filter_scale, + tflite::testing::kBiasShape, tflite::testing::kBiasData, bias_quantized, + tflite::testing::kOutputShape, tflite::testing::kGoldenData, + golden_quantized, output_data, output_scale, + &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { + const int output_dims_count = 12; + int8_t output_data[output_dims_count]; + + const float input_scale = 0.5f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[tflite::testing::kInputElements]; + int8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + int8_t golden_quantized[tflite::testing::kOutputElements]; + int zero_points[tflite::testing::kBiasElements + 1]; + float scales[tflite::testing::kBiasElements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInputShape, tflite::testing::kInputData, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilterShape, tflite::testing::kFilterData, + filter_quantized, tflite::testing::kBiasShape, tflite::testing::kBiasData, + bias_quantized, scales, zero_points, tflite::testing::kOutputShape, + tflite::testing::kGoldenData, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelRelu6) { + // conv params: + // padding, stride_, dilation_, activation + TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; + const int output_dims_count = 12; + int8_t output_data[output_dims_count]; + + const float bias_values[] = {1, 2, -3}; + const float golden_data[] = {6, 2, 0, 6, 2, 0, 6, 4, 0, 6, 4, 0}; + + const float input_scale = 0.023529f; + const float output_scale = 0.023529f; + const int input_zero_point = -128; + const int output_zero_point = -128; + + int8_t input_quantized[tflite::testing::kInputElements]; + int8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + int8_t golden_quantized[tflite::testing::kOutputElements]; + int zero_points[tflite::testing::kBiasElements + 1]; + float scales[tflite::testing::kBiasElements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInputShape, tflite::testing::kInputData, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilterShape, tflite::testing::kFilterData, + filter_quantized, tflite::testing::kBiasShape, bias_values, + bias_quantized, scales, zero_points, tflite::testing::kOutputShape, + golden_data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) { + // conv params: + // padding, stride_, activation, dilation_ + TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, + kTfLiteActNone, 1, 1}; + const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] + const int kInputElements = + kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; + float kInputData[/* kInputElements */] = {1, 1, 1, 1, 2, 2, 2, 2, + 1, 2, 3, 4, 1, 2, 3, 4}; + const int kFilterShape[] = {4, 3, 1, 1, 4}; + const int kFilterElements = + kFilterShape[1] * kFilterShape[2] * kFilterShape[3] * kFilterShape[4]; + float kFilterData[/* kFilterElements */] = {1, 2, 3, 4, -1, 1, + -1, 1, -1, -1, 1, 1}; + const int kBiasElements = kFilterShape[1]; + const int kBiasShape[] = {1, kBiasElements}; + float kBiasData[/* kBiasElements */] = {1, 2, 3}; + const int kOutputShape[] = {4, 1, 2, 2, kBiasElements}; + const int kOutputElements = 4 * 3; + int8_t output_data[kOutputElements]; + const float kGoldenData[/* kOutputElements */] = {11, 2, 3, 21, 2, 3, + 31, 4, 7, 31, 4, 7}; + + const float input_scale = 0.5f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[kInputElements]; + int8_t filter_quantized[kFilterElements]; + int32_t bias_quantized[kBiasElements]; + int8_t golden_quantized[kOutputElements]; + int zero_points[kBiasElements + 1]; + float scales[kBiasElements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + kInputShape, kInputData, input_quantized, input_scale, input_zero_point, + kFilterShape, kFilterData, filter_quantized, kBiasShape, kBiasData, + bias_quantized, scales, zero_points, kOutputShape, kGoldenData, + golden_quantized, output_data, output_scale, output_zero_point, + &conv_params); +} + +TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannelRelu6) { + // conv params: + // padding, stride_, dilation_, activation + TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; + const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] + const int kInputElements = + kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; + float kInputData[/* kInputElements */] = {1, 1, 1, 1, 2, 2, 2, 2, + 1, 2, 3, 4, 1, 2, 3, 4}; + const int kFilterShape[] = {4, 3, 1, 1, 4}; + const int kFilterElements = + kFilterShape[1] * kFilterShape[2] * kFilterShape[3] * kFilterShape[4]; + float kFilterData[/* kFilterElements */] = {1, 2, 3, 4, -1, 1, + -1, 1, -1, -1, 1, 1}; + const int kBiasElements = kFilterShape[1]; + const int kBiasShape[] = {1, kBiasElements}; + float kBiasData[/* kBiasElements */] = {1, 2, -3}; + const int kOutputShape[] = {4, 1, 2, 2, kBiasElements}; + const int kOutputElements = 4 * 3; + int8_t output_data[kOutputElements]; + const float kGoldenData[/* kOutputElements */] = {6, 2, 0, 6, 2, 0, + 6, 4, 1, 6, 4, 1}; + + const float input_scale = 0.023529f; + const float output_scale = 0.023529f; + const int input_zero_point = -128; + const int output_zero_point = -128; + + int8_t input_quantized[kInputElements]; + int8_t filter_quantized[kFilterElements]; + int32_t bias_quantized[kBiasElements]; + int8_t golden_quantized[kOutputElements]; + int zero_points[kBiasElements + 1]; + float scales[kBiasElements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + kInputShape, kInputData, input_quantized, input_scale, input_zero_point, + kFilterShape, kFilterData, filter_quantized, kBiasShape, kBiasData, + bias_quantized, scales, zero_points, kOutputShape, kGoldenData, + golden_quantized, output_data, output_scale, output_zero_point, + &conv_params); +} + +TF_LITE_MICRO_TEST(FilterDimsNotMatchingAffineQuantization) { + const int output_dims_count = 12; + int8_t output_data[output_dims_count]; + + const float input_scale = 0.5f; + const float output_scale = 1.0f; + + int8_t input_quantized[tflite::testing::kInputElements]; + int8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + int8_t golden_quantized[tflite::testing::kOutputElements]; + int zero_points[tflite::testing::kBiasElements + 1]; + float scales[tflite::testing::kBiasElements + 1]; + + TfLiteIntArray* input_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kInputShape); + TfLiteIntArray* filter_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kFilterShape); + TfLiteIntArray* bias_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kBiasShape); + TfLiteIntArray* output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShape); + + int filter_zero_points[5]; + float filter_scales[5]; + TfLiteAffineQuantization filter_quant; + TfLiteAffineQuantization bias_quant; + TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( + tflite::testing::kInputData, input_quantized, input_dims, input_scale, 0, + "input_tensor"); + TfLiteTensor filter_tensor = + tflite::testing::CreateSymmetricPerChannelQuantizedTensor( + tflite::testing::kFilterData, filter_quantized, filter_dims, + filter_scales, filter_zero_points, &filter_quant, + 0 /* quantized dimension */, "filter_tensor"); + TfLiteTensor bias_tensor = + tflite::testing::CreatePerChannelQuantizedBiasTensor( + tflite::testing::kBiasData, bias_quantized, bias_dims, input_scale, + &filter_scales[1], scales, zero_points, &bias_quant, 0, + "bias_tensor"); + TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( + output_data, output_dims, output_scale, 0 /* quantized dimension */, + "output_tensor"); + + float input_scales[] = {1, input_scale}; + int input_zero_points[] = {1, 128}; + TfLiteAffineQuantization input_quant = { + tflite::testing::FloatArrayFromFloats(input_scales), + tflite::testing::IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + tflite::AsymmetricQuantize(tflite::testing::kGoldenData, golden_quantized, + output_dims_count, output_scale, 0); + + // Set filter quant to mismatched dimension. + TfLiteAffineQuantization* quant = reinterpret_cast( + filter_tensor.quantization.params); + + // Choose arbitrary incorrect scale and zero point sizes which are neither 1 + // (for broadcast case) nor the quantized dimension size. + quant->scale->size = 2; + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, + tflite::testing::ValidateConvGoldens( + tensors, tensors_size, golden_quantized, output_data, + output_dims_count, &tflite::testing::common_conv_params)); + + // Set scale back to correct dimension, and make zero point array too short. + quant->scale->size = tflite::testing::kFilterShape[0]; + quant->zero_point->size = 2; + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, + tflite::testing::ValidateConvGoldens( + tensors, tensors_size, golden_quantized, output_data, + output_dims_count, &tflite::testing::common_conv_params)); +} + +TF_LITE_MICRO_TEST(BroadcastPerLayerQuantizationToPerChannelShouldMatchGolden) { + const int output_dims_count = 12; + int8_t output_data[output_dims_count]; + + const float input_scale = 1.0f; + const float filter_scale = 1.0f; + const float output_scale = 1.0f; + + int8_t input_quantized[tflite::testing::kInputElements]; + int8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + int8_t golden_quantized[tflite::testing::kOutputElements]; + + TfLiteIntArray* input_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kInputShape); + TfLiteIntArray* filter_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kFilterShape); + TfLiteIntArray* bias_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kBiasShape); + TfLiteIntArray* output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShape); + + // Create per-layer quantized int8 input tensor. + TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( + tflite::testing::kInputData, input_quantized, input_dims, input_scale, 0, + "input_tensor"); + int input_zero_points[2] = {1, 0}; + float input_scales[2] = {1, input_scale}; + TfLiteAffineQuantization input_quant = { + tflite::testing::FloatArrayFromFloats(input_scales), + tflite::testing::IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + // Create per-layer quantized int8 filter tensor. + TfLiteTensor filter_tensor = tflite::testing::CreateQuantizedTensor( + tflite::testing::kFilterData, filter_quantized, filter_dims, filter_scale, + 0, "filter_tensor"); + int filter_zero_points[2] = {1, 0}; + float filter_scales[2] = {1, filter_scale}; + TfLiteAffineQuantization filter_quant = { + tflite::testing::FloatArrayFromFloats(filter_scales), + tflite::testing::IntArrayFromInts(filter_zero_points)}; + filter_tensor.quantization = {kTfLiteAffineQuantization, &filter_quant}; + + // Create per-layer quantized int32 bias tensor. + tflite::SymmetricQuantize(tflite::testing::kBiasData, bias_quantized, + tflite::testing::kBiasElements, + input_scale * output_scale); + TfLiteTensor bias_tensor = tflite::testing::CreateInt32Tensor( + bias_quantized, bias_dims, "bias_tensor"); + + int bias_zero_points[2] = {1, 0}; + float bias_scales[2] = {1, input_scale * filter_scale}; + TfLiteAffineQuantization bias_quant = { + tflite::testing::FloatArrayFromFloats(bias_scales), + tflite::testing::IntArrayFromInts(bias_zero_points)}; + bias_tensor.quantization = {kTfLiteAffineQuantization, &bias_quant}; + + // Create per-layer quantized int8 output tensor. + TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( + output_data, output_dims, output_scale, 0 /* quantized dimension */, + "output_tensor"); + int output_zero_points[2] = {1, 0}; + float output_scales[2] = {1, output_scale}; + TfLiteAffineQuantization output_quant = { + tflite::testing::FloatArrayFromFloats(output_scales), + tflite::testing::IntArrayFromInts(output_zero_points)}; + output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + tflite::AsymmetricQuantize(tflite::testing::kGoldenData, golden_quantized, + output_dims_count, output_scale, 0); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, tflite::testing::ValidateConvGoldens( + tensors, tensors_size, golden_quantized, output_data, + output_dims_count, &tflite::testing::common_conv_params)); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc new file mode 100644 index 00000000000..8b79885a8a8 --- /dev/null +++ b/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc @@ -0,0 +1,768 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +constexpr int kMaxFilterChannels = 64; +constexpr int kMaxBiasChannels = 64; + +// Index of the output tensor in context->tensors, specific to +// DepthwiseConv. +constexpr int kOutputTensorIndex = 3; + +// Creates a DepthwiseConv opeerator, calls it with the provided input tensors +// and some defaults parameters, and compares the output with +// expected_output_data. +// +// The tensors parameter contains both the input tensors as well as a +// preallocated output tensor into which the output is stored. +template +TfLiteStatus ValidateDepthwiseConvGoldens(const T* expected_output_data, + int output_length, + TfLiteFusedActivation activation, + float tolerance, int tensors_size, + TfLiteTensor* tensors) { + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + int input_depth = tensors[0].dims->data[3]; + int output_depth = tensors[1].dims->data[3]; + int depth_mul = output_depth / input_depth; + TfLiteDepthwiseConvParams builtin_data; + builtin_data.padding = kTfLitePaddingValid; + builtin_data.activation = activation; + builtin_data.stride_height = 1; + builtin_data.stride_width = 1; + builtin_data.dilation_height_factor = 1; + builtin_data.dilation_width_factor = 1; + builtin_data.depth_multiplier = depth_mul; + + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {3, 0, 1, 2}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 3}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TfLiteStatus invoke_status = registration->invoke(&context, &node); + if (invoke_status != kTfLiteOk) { + return invoke_status; + } + + if (registration->free) { + registration->free(&context, user_data); + } + + const T* output_data = tflite::GetTensorData(&tensors[kOutputTensorIndex]); + for (int i = 0; i < output_length; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], + tolerance); + } + return kTfLiteOk; +} + +void TestDepthwiseConvFloat(const int* input_dims_data, const float* input_data, + const int* filter_dims_data, + const float* filter_data, const int* bias_dims_data, + const float* bias_data, + const float* expected_output_data, + const int* output_dims_data, + TfLiteFusedActivation activation, + float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(filter_data, filter_dims, "filter_tensor"), + CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + ValidateDepthwiseConvGoldens(expected_output_data, output_dims_count, + activation, 1e-5, tensors_size, tensors); +} + +void TestDepthwiseConvQuantizedPerLayer( + const int* input_dims_data, const float* input_data, + uint8_t* input_quantized, float input_scale, int input_zero_point, + const int* filter_dims_data, const float* filter_data, + uint8_t* filter_quantized, float filter_scale, int filter_zero_point, + const int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, + const float* golden, uint8_t* golden_quantized, const int* output_dims_data, + uint8_t* output_data, float output_scale, int output_zero_point, + TfLiteFusedActivation activation) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + tflite::testing::CreateQuantizedTensor(input_data, input_quantized, + input_dims, input_scale, + input_zero_point, "input_tensor"), + tflite::testing::CreateQuantizedTensor( + filter_data, filter_quantized, filter_dims, filter_scale, + filter_zero_point, "filter_tensor"), + tflite::testing::CreateQuantizedBiasTensor(bias_data, bias_quantized, + bias_dims, input_scale, + filter_scale, "bias_tensor"), + tflite::testing::CreateQuantizedTensor(output_data, output_dims, + output_scale, output_zero_point, + "output_tensor"), + }; + + // TODO(njeff): Affine Quantization Params should be set on tensor creation. + float filter_scales[] = {1, filter_scale}; + int filter_zero_points[] = {1, 128}; + TfLiteAffineQuantization filter_quant = { + FloatArrayFromFloats(filter_scales), + IntArrayFromInts(filter_zero_points)}; + tensors[1].quantization = {kTfLiteAffineQuantization, &filter_quant}; + + float bias_scales[] = {1, filter_scale * input_scale}; + int bias_zero_points[] = {1, 128}; + TfLiteAffineQuantization bias_quant = {FloatArrayFromFloats(bias_scales), + IntArrayFromInts(bias_zero_points)}; + tensors[2].quantization = {kTfLiteAffineQuantization, &bias_quant}; + + AsymmetricQuantize(golden, golden_quantized, output_dims_count, output_scale, + output_zero_point); + ValidateDepthwiseConvGoldens(golden_quantized, output_dims_count, activation, + 1.0, tensors_size, tensors); +} + +void TestDepthwiseConvQuantizedPerChannel( + const int* input_dims_data, const float* input_data, + int8_t* input_quantized, float input_scale, int input_zero_point, + const int* filter_dims_data, const float* filter_data, + int8_t* filter_data_quantized, const int* bias_dims_data, + const float* bias_data, int32_t* bias_data_quantized, + const int* output_dims_data, const float* expected_output_data, + int8_t* expected_output_data_quantized, int8_t* output_data, + float output_scale, int output_zero_point, + TfLiteFusedActivation activation) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + int filter_zero_points[kMaxFilterChannels]; + float filter_scales[kMaxFilterChannels]; + int bias_zero_points[kMaxBiasChannels]; + float bias_scales[kMaxBiasChannels]; + TfLiteAffineQuantization filter_quant; + TfLiteAffineQuantization bias_quant; + TfLiteTensor input_tensor = + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, input_zero_point, "input_tensor"); + TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor( + filter_data, filter_data_quantized, filter_dims, filter_scales, + filter_zero_points, &filter_quant, 3 /* quantized dimension */, + "filter_tensor"); + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( + bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1], + bias_scales, bias_zero_points, &bias_quant, 3 /* quantized dimension */, + "bias_tensor"); + TfLiteTensor output_tensor = + CreateQuantizedTensor(output_data, output_dims, output_scale, + input_zero_point, "output_tensor"); + + // TODO(njeff): Affine Quantization Params should be set on tensor creation. + float input_scales[] = {1, input_scale}; + int input_zero_points[] = {1, input_zero_point}; + TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales), + IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + float output_scales[] = {1, output_scale}; + int output_zero_points[] = {1, output_zero_point}; + TfLiteAffineQuantization output_quant = { + FloatArrayFromFloats(output_scales), + IntArrayFromInts(output_zero_points)}; + output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + AsymmetricQuantize(expected_output_data, expected_output_data_quantized, + output_dims_count, output_scale, output_zero_point); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, ValidateDepthwiseConvGoldens(expected_output_data_quantized, + output_dims_count, activation, + 1.0, tensors_size, tensors)); +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(SimpleTest) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 71, -34, 99, -20, 91, -26, 127, -4, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + float output_data[output_dims_count]; + tflite::testing::TestDepthwiseConvFloat( + input_shape, input_values, filter_shape, filter_values, bias_shape, + bias_values, golden, output_shape, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantized) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 71, -34, 99, -20, 91, -26, 127, -4, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + + const float input_scale = 0.5f; + const int input_zero_point = 128; + const float filter_scale = 0.5f; + const int filter_zero_point = 128; + const float output_scale = 1.0f; + const int output_zero_point = 128; + + uint8_t input_quantized[input_elements]; + uint8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + uint8_t golden_quantized[output_elements]; + uint8_t output_data[output_elements]; + + tflite::testing::TestDepthwiseConvQuantizedPerLayer( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, filter_scale, + filter_zero_point, bias_shape, bias_values, bias_quantized, golden, + golden_quantized, output_shape, output_data, output_scale, + output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(SimpleTestRelu) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + const float golden_relu[] = {71, 0, 99, 0, 91, 0, 127, 0}; + float output_data[output_dims_count]; + + tflite::testing::TestDepthwiseConvFloat( + input_shape, input_values, filter_shape, filter_values, bias_shape, + bias_values, golden_relu, output_shape, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestReluQuantized) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + const float golden_relu[] = {71, 0, 99, 0, 91, 0, 127, 0}; + + const float input_scale = 0.5f; + const int input_zero_point = 128; + const float filter_scale = 0.5f; + const int filter_zero_point = 128; + const float output_scale = 1.0f; + const int output_zero_point = 128; + + uint8_t input_quantized[input_elements]; + uint8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + uint8_t golden_quantized[output_elements]; + uint8_t output_data[output_elements]; + + tflite::testing::TestDepthwiseConvQuantizedPerLayer( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, filter_scale, + filter_zero_point, bias_shape, bias_values, bias_quantized, golden_relu, + golden_quantized, output_shape, output_data, output_scale, + output_zero_point, kTfLiteActRelu); +} + +TF_LITE_MICRO_TEST(SimpleTestOptimizedFilterWidth) { + const int input_elements = 12; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const float bias_values[] = {1, 2, 3, 4}; + const int output_dims_count = 9; + const int input_shape[] = {4, 1, 1, 9, 1}; + const int filter_shape[] = {4, 2, 1, 8, 1}; + const int bias_shape[] = {1, 1}; + const float goldens[] = { + 92, 56, 12, 22, 33, 72, 44, 20, 5, + }; + const int output_shape[] = {4, 1, 1, 9, 1}; + + const float input_scale = 1.0f; + const int input_zero_point = 128; + const float filter_scale = 0.5f; + const int filter_zero_point = 128; + const float output_scale = 1.0f; + const int output_zero_point = 128; + + uint8_t input_quantized[input_elements]; + uint8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + uint8_t golden_quantized[output_dims_count]; + uint8_t output_data[output_dims_count]; + + tflite::testing::TestDepthwiseConvQuantizedPerLayer( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, filter_scale, + filter_zero_point, bias_shape, bias_values, bias_quantized, goldens, + golden_quantized, output_shape, output_data, output_scale, + output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 71, -34, 99, -20, 91, -26, 127, -4, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + int8_t output_data[output_dims_count]; + + const float input_scale = 0.5; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelDepthMultiplier1) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 8; + const int filter_shape[] = {4, 1, 2, 2, 2}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12}; + const int bias_elements = 2; + const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 4; + const float bias_values[] = {1, 2}; + const float golden[] = { + -103, + 127, + -128, + 127, + }; + const int output_shape[] = {4, 1, 2, 1, 2}; + const int output_dims_count = 4; + int8_t output_data[output_dims_count]; + + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(TestQuantizedPerChannelDepthMultiplier1Relu6) { + const int input_elements = 24; + const int input_shape[] = {4, 1, 3, 2, 4}; + const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {0, 1, 8, -2, -1, 2, -10, 0, + -1, 3, -18, 0, 0, 4, 20, -3}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 0, 6, 3, 0, 0, 6, 3, 0, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + int8_t output_data[output_elements]; + float output_float[output_elements]; + + const float input_scale = 0.023529f; + const float output_scale = 0.023529f; + const int input_zero_point = -128; + const int output_zero_point = -128; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvFloat( + input_shape, input_values, filter_shape, filter_values, bias_shape, + bias_values, golden, output_shape, kTfLiteActRelu6, output_float); + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActRelu6); +} + +TF_LITE_MICRO_TEST(TestQuantizedPerChannelCompareWithFloat) { + const int input_dims[] = {4, 1, 2, 3, 2}; + const float input_data[] = {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4}; + const int filter_dims[] = {4, 1, 2, 2, 4}; + const float filter_data[] = {1, 2, 3, 4, 3, 4, 5, 6, 7, 8, 5, 6, 3, 4, 1, 2}; + const int bias_dims[] = {4, 1, 1, 1, 4}; + const float bias_data[] = {3, -2, 4, 6}; + const int output_dims[] = {4, 1, 1, 2, 4}; + const float golden[] = {43, 48, 18, 22, 3, -4, -28, -36}; + + const int input_size = 12; + const int filter_size = 16; + const int output_size = 8; + const int bias_size = 4; + int8_t input_quantized[input_size]; + int8_t filter_quantized[filter_size]; + int32_t bias_quantized[bias_size]; + int8_t golden_quantized[output_size]; + int zero_points[bias_size + 1]; + float scales[bias_size + 1]; + int8_t output_data[output_size]; + float output_float[output_size]; + + const float input_scale = 0.5; + const float output_scale = 1.0; + const int input_zero_point = 0; + const int output_zero_point = 0; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_dims, input_data, input_quantized, input_scale, input_zero_point, + filter_dims, filter_data, filter_quantized, bias_dims, bias_data, + bias_quantized, output_dims, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); + + tflite::testing::TestDepthwiseConvFloat( + input_dims, input_data, filter_dims, filter_data, bias_dims, bias_data, + golden, output_dims, kTfLiteActNone, output_float); +} + +TF_LITE_MICRO_TEST(FilterDimsNotMatchingAffineQuantization) { + const int input_shape[] = {4, 1, 2, 3, 2}; + const float input_data[] = {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4}; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_data[] = {1, 2, 3, 4, 3, 4, 5, 6, 7, 8, 5, 6, 3, 4, 1, 2}; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const float bias_data[] = {3, -2, 4, 6}; + const int output_shape[] = {4, 1, 1, 2, 4}; + const float golden[] = {43, 48, 18, 22, 3, -4, -28, -36}; + + const int input_size = 12; + const int filter_size = 16; + const int output_size = 8; + const int bias_size = 4; + int8_t input_quantized[input_size]; + int8_t filter_quantized[filter_size]; + int32_t bias_quantized[bias_size]; + int8_t golden_quantized[output_size]; + int zero_points[bias_size + 1]; + float scales[bias_size + 1]; + int8_t output_data[output_size]; + float output_float[output_size]; + + const float input_scale = 0.5; + const float output_scale = 1.0; + const int input_zero_point = 0; + const int output_zero_point = 0; + + TfLiteIntArray* input_dims = tflite::testing::IntArrayFromInts(input_shape); + TfLiteIntArray* filter_dims = tflite::testing::IntArrayFromInts(filter_shape); + TfLiteIntArray* bias_dims = tflite::testing::IntArrayFromInts(bias_shape); + TfLiteIntArray* output_dims = tflite::testing::IntArrayFromInts(output_shape); + + int filter_zero_points[5]; + float filter_scales[5]; + TfLiteAffineQuantization filter_quant; + TfLiteAffineQuantization bias_quant; + TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( + input_data, input_quantized, input_dims, input_scale, input_zero_point, + "input_tensor"); + TfLiteTensor filter_tensor = + tflite::testing::CreateSymmetricPerChannelQuantizedTensor( + filter_data, filter_quantized, filter_dims, filter_scales, + filter_zero_points, &filter_quant, 0 /* quantized dimension */, + "filter_tensor"); + TfLiteTensor bias_tensor = + tflite::testing::CreatePerChannelQuantizedBiasTensor( + bias_data, bias_quantized, bias_dims, input_scale, &filter_scales[1], + scales, zero_points, &bias_quant, 0, "bias_tensor"); + TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( + output_data, output_dims, output_scale, output_zero_point, + "output_tensor"); + + float input_scales[] = {1, input_scale}; + int input_zero_points[] = {1, input_zero_point}; + TfLiteAffineQuantization input_quant = { + tflite::testing::FloatArrayFromFloats(input_scales), + tflite::testing::IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + // Set filter quant to mismatched dimension. + TfLiteAffineQuantization* quant = reinterpret_cast( + filter_tensor.quantization.params); + quant->scale->size = 2; + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens( + golden_quantized, output_size, kTfLiteActNone, 1e-5, + tensors_size, tensors)); + + // Set scale back to correct dimension, and make zero point array too short. + quant->scale->size = filter_shape[0]; + quant->zero_point->size = 2; + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens( + golden_quantized, output_size, kTfLiteActNone, 1e-5, + tensors_size, tensors)); +} + +TF_LITE_MICRO_TEST(PerChannelBroadcastQuantizationParams) { + const float input_scale = 1.0f; + const float filter_scale = 1.0f; + const float output_scale = 1.0f; + + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 71, -34, 99, -20, 91, -26, 127, -4, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + int8_t output_data[output_dims_count]; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + + TfLiteIntArray* input_dims = tflite::testing::IntArrayFromInts(input_shape); + TfLiteIntArray* filter_dims = tflite::testing::IntArrayFromInts(filter_shape); + TfLiteIntArray* bias_dims = tflite::testing::IntArrayFromInts(bias_shape); + TfLiteIntArray* output_dims = tflite::testing::IntArrayFromInts(output_shape); + + // Create per-layer quantized int8 input tensor. + TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( + input_values, input_quantized, input_dims, input_scale, 0, + "input_tensor"); + int input_zero_points[2] = {1, 0}; + float input_scales[2] = {1, input_scale}; + TfLiteAffineQuantization input_quant = { + tflite::testing::FloatArrayFromFloats(input_scales), + tflite::testing::IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + // Create per-layer quantized int8 filter tensor. + TfLiteTensor filter_tensor = tflite::testing::CreateQuantizedTensor( + filter_values, filter_quantized, filter_dims, filter_scale, 0, + "filter_tensor"); + int filter_zero_points[2] = {1, 0}; + float filter_scales[2] = {1, filter_scale}; + TfLiteAffineQuantization filter_quant = { + tflite::testing::FloatArrayFromFloats(filter_scales), + tflite::testing::IntArrayFromInts(filter_zero_points)}; + filter_tensor.quantization = {kTfLiteAffineQuantization, &filter_quant}; + + // Create per-layer quantized int32 bias tensor. + tflite::SymmetricQuantize(bias_values, bias_quantized, bias_elements, + input_scale * output_scale); + TfLiteTensor bias_tensor = tflite::testing::CreateInt32Tensor( + bias_quantized, bias_dims, "bias_tensor"); + + int bias_zero_points[2] = {1, 0}; + float bias_scales[2] = {1, input_scale * filter_scale}; + TfLiteAffineQuantization bias_quant = { + tflite::testing::FloatArrayFromFloats(bias_scales), + tflite::testing::IntArrayFromInts(bias_zero_points)}; + bias_tensor.quantization = {kTfLiteAffineQuantization, &bias_quant}; + + // Create per-layer quantized int8 output tensor. + TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( + output_data, output_dims, output_scale, 0, "output_tensor"); + int output_zero_points[2] = {1, 0}; + float output_scales[2] = {1, output_scale}; + TfLiteAffineQuantization output_quant = { + tflite::testing::FloatArrayFromFloats(output_scales), + tflite::testing::IntArrayFromInts(output_zero_points)}; + output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + tflite::AsymmetricQuantize(golden, golden_quantized, output_dims_count, + output_scale, 0); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, tflite::testing::ValidateDepthwiseConvGoldens( + golden_quantized, output_dims_count, kTfLiteActNone, 1e-5, + tensors_size, tensors)); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc b/tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc new file mode 100644 index 00000000000..539c7ecc3a4 --- /dev/null +++ b/tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc @@ -0,0 +1,938 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +void TestFullyConnectedFloat( + const int* input_dims_data, const float* input_data, + const int* weights_dims_data, const float* weights_data, + const int* bias_dims_data, const float* bias_data, + const float* expected_output_data, const int* output_dims_data, + TfLiteFusedActivation activation, float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(weights_data, weights_dims, "weights_tensor"), + CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLiteFullyConnectedParams builtin_data = { + activation, + kTfLiteFullyConnectedWeightsFormatDefault, + }; + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {3, 0, 1, 2}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 3}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], 1e-5f); + } +} + +template +void TestFullyConnectedQuantized( + const int* input_dims_data, const T* input_data, const float input_min, + const float input_max, const int* weights_dims_data, const T* weights_data, + const float weights_min, const float weights_max, const int* bias_dims_data, + const int32_t* bias_data, const float bias_scale, + const T* expected_output_data, const int* output_dims_data, + const float output_min, const float output_max, + TfLiteFusedActivation activation, T* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min, + input_max), + CreateQuantizedTensor(weights_data, weights_dims, "weights_tensor", + weights_min, weights_max), + CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_scale), + CreateQuantizedTensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 4); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLiteFullyConnectedParams builtin_data = { + activation, + kTfLiteFullyConnectedWeightsFormatDefault, + }; + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + + int inputs_array_data[] = {3, 0, 1, 2}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 3}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_EQ(expected_output_data[i], output_data[i]); + } +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(SimpleTest) { + const int input_dims_data[] = {2, 2, 10}; + const float input_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }; + const int weights_dims_data[] = {2, 3, 10}; + const float weights_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }; + const int bias_dims_data[] = {1, 3}; + const float bias_data[] = {1, 2, 3}; + const float expected_output_data[] = { + 24, 25, 26, 58, 59, 60, + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + float output_data[output_dims_count]; + tflite::testing::TestFullyConnectedFloat( + input_dims_data, input_data, weights_dims_data, weights_data, + bias_dims_data, bias_data, expected_output_data, output_dims_data, + kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest2) { + const int input_dims_data[] = {2, 2, 2}; + const float input_data[] = { + 1, 2, // b = 0 + 2, 1, // b = 1 + }; + const int weights_dims_data[] = {2, 1, 2}; + const float weights_data[] = { + 2, 4, // u = 0 + }; + const int bias_dims_data[] = {1, 1}; + const float bias_data[] = {1}; + const float expected_output_data[] = { + 11, + 9, + }; + const int output_dims_data[] = {2, 2, 1}; + + const int output_dims_count = 6; + float output_data[output_dims_count]; + tflite::testing::TestFullyConnectedFloat( + input_dims_data, input_data, weights_dims_data, weights_data, + bias_dims_data, bias_data, expected_output_data, output_dims_data, + kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestRelu) { + const int input_dims_data[] = {2, 2, 10}; + const float input_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }; + const int weights_dims_data[] = {2, 3, 10}; + const float weights_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }; + const int bias_dims_data[] = {1, 3}; + const float bias_data[] = {1, -2, 3}; + const float expected_output_data[] = { + 24, 0, 26, 58, 0, 60, + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + float output_data[output_dims_count]; + tflite::testing::TestFullyConnectedFloat( + input_dims_data, input_data, weights_dims_data, weights_data, + bias_dims_data, bias_data, expected_output_data, output_dims_data, + kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -63.5f; + const float weights_max = 64.0f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {2, 2, 10}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +// TODO(b/138811455): Fix code duplication in micro tests +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -64.0f; + const float weights_max = 63.5f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {2, 2, 10}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8Relu) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -63.5f; + const float weights_max = 64.0f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {2, 2, 10}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(-1, weights_min, weights_max), F2Q(-2, weights_min, weights_max), + F2Q(-3, weights_min, weights_max), F2Q(-4, weights_min, weights_max), + F2Q(-5, weights_min, weights_max), F2Q(-6, weights_min, weights_max), + F2Q(-7, weights_min, weights_max), F2Q(-8, weights_min, weights_max), + F2Q(-9, weights_min, weights_max), F2Q(-10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(0, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(0, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(0, output_min, output_max), F2Q(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8Relu) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -64.0f; + const float weights_max = 63.5f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {2, 2, 10}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(-1, weights_min, weights_max), F2QS(-2, weights_min, weights_max), + F2QS(-3, weights_min, weights_max), F2QS(-4, weights_min, weights_max), + F2QS(-5, weights_min, weights_max), F2QS(-6, weights_min, weights_max), + F2QS(-7, weights_min, weights_max), F2QS(-8, weights_min, weights_max), + F2QS(-9, weights_min, weights_max), F2QS(-10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(0, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(0, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(0, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8OutputMultiplierGreaterThan1) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -127.0f; + const float input_max = 128.0f; + const float weights_min = -127.0f; + const float weights_max = 128.0f; + const float bias_scale = 1.0f; + const float output_min = -63.5f; + const float output_max = 64.0f; + + const int input_dims_data[] = {2, 2, 10}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8OutputMultiplierGreaterThan1) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -127.0f; + const float input_max = 128.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -63.5f; + const float output_max = 64.0f; + + const int input_dims_data[] = {2, 2, 10}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest4DInput) { + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const float input_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }; + const int weights_dims_data[] = {2, 3, 10}; + const float weights_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }; + const int bias_dims_data[] = {1, 3}; + const float bias_data[] = {1, 2, 3}; + const float expected_output_data[] = { + 24, 25, 26, 58, 59, 60, // Expected results. + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + float output_data[output_dims_count]; + tflite::testing::TestFullyConnectedFloat( + input_dims_data, input_data, weights_dims_data, weights_data, + bias_dims_data, bias_data, expected_output_data, output_dims_data, + kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedUInt8) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -63.5f; + const float weights_max = 64.0f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -64.0f; + const float weights_max = 63.5f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST( + SimpleTest4DInputQuantizedUInt8OutputMultiplierGreaterThan1) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -127.0f; + const float input_max = 128.0f; + const float weights_min = -127.0f; + const float weights_max = 128.0f; + const float bias_scale = 1.0f; + const float output_min = -63.5f; + const float output_max = 64.0f; + + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8OutputMultiplierGreaterThan1) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -127.0f; + const float input_max = 128.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -63.5f; + const float output_max = 64.0f; + + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc b/tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc new file mode 100644 index 00000000000..8bfeb718a1b --- /dev/null +++ b/tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc @@ -0,0 +1,1116 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +void TestAveragePoolingFloat(std::initializer_list input_dims_data, + std::initializer_list input_data, + const int filter_height, const int filter_width, + const int stride_height, const int stride_width, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, + TfLitePadding padding, + TfLiteFusedActivation activation, + float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLitePoolParams builtin_data = {padding, stride_width, stride_height, + filter_width, filter_height, activation}; + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1e-5f); + } +} + +template +void TestAveragePoolingQuantized( + std::initializer_list input_dims_data, + std::initializer_list input_data, const float input_min, + const float input_max, const int filter_height, const int filter_width, + const int stride_height, const int stride_width, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, float output_min, + float output_max, TfLitePadding padding, TfLiteFusedActivation activation, + T* output_data) { + static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed."); + + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min, + input_max), + CreateQuantizedTensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLitePoolParams builtin_data = {padding, stride_width, stride_height, + filter_width, filter_height, activation}; + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1e-5f); + } +} + +void TestMaxPoolFloat(std::initializer_list input_dims_data, + std::initializer_list input_data, int filter_width, + int filter_height, int stride_width, int stride_height, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, + TfLitePadding padding, TfLiteFusedActivation activation, + float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLitePoolParams builtin_data = { + padding, stride_width, stride_height, + filter_width, filter_height, activation, + }; + + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1e-5f); + } +} + +template +void TestMaxPoolQuantized(std::initializer_list input_dims_data, + std::initializer_list input_data, float input_min, + float input_max, int filter_width, int filter_height, + int stride_width, int stride_height, + std::initializer_list expected_output_data, + float output_min, float output_max, + std::initializer_list output_dims_data, + TfLitePadding padding, + TfLiteFusedActivation activation, T* output_data) { + static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed."); + + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min, + input_max), + CreateQuantizedTensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLitePoolParams builtin_data = { + padding, stride_width, stride_height, + filter_width, filter_height, activation, + }; + + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]); + } +} + +} // namespace + +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestFloat) { + float output_data[2]; + tflite::testing::TestAveragePoolingFloat({4, 1, 2, 4, 1}, // Input shape + { // Input values + 0., 6., 2., 4., 3., 2., 10., 7.}, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 2.75, + 5.75, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestUint8) { + using tflite::testing::F2Q; + + const float input_min = -15.9375; + const float input_max = 15.9375; + const float output_min = -15.9375; + const float output_max = 15.9375; + uint8_t output_data[2]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0., input_min, input_max), + F2Q(-6., input_min, input_max), + F2Q(2., input_min, input_max), + F2Q(4., input_min, input_max), + F2Q(3., input_min, input_max), + F2Q(2., input_min, input_max), + F2Q(-10., input_min, input_max), + F2Q(7., input_min, input_max), + }, + input_min, input_max, // input quantization range + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + F2Q(0., output_min, output_max), + F2Q(0.75, output_min, output_max), + }, + {4, 1, 1, 2, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2ActNone) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t output_data[2]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 2, 2, // stride height, stride width + { // Output values + F2QS(-0.25, output_min, output_max), F2QS(0.75, output_min, output_max)}, + {4, 1, 1, 2, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride1Stride2Relu) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t output_data[3]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 2, 1, // stride height, stride width + { // Output values + F2QS(0., output_min, output_max), F2QS(0., output_min, output_max), + F2QS(0.75, output_min, output_max)}, + {4, 1, 1, 3, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2Stride1Relu1) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t output_data[2]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 1, 2, // stride height, stride width + { // Output values + F2QS(-0.25, output_min, output_max), F2QS(0.75, output_min, output_max)}, + {4, 1, 1, 2, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActRelu1, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2Relu6) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t output_data[2]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(3., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(8., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 2, 2, // stride height, stride width + { // Output values + F2QS(0.5, output_min, output_max), F2QS(6., output_min, output_max)}, + {4, 1, 1, 2, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActRelu6, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingSameStride1ActNone) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t output_data[8]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(3., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(8., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 1, 1, // stride height, stride width + { // Output values + F2QS(0.5, output_min, output_max), F2QS(3.5, output_min, output_max), + F2QS(7.25, output_min, output_max), F2QS(5.5, output_min, output_max), + F2QS(2.5, output_min, output_max), F2QS(6., output_min, output_max), + F2QS(8.5, output_min, output_max), F2QS(7., output_min, output_max)}, + {4, 1, 2, 4, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloat) { + float output_data[2]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { // Input values + 0, 6, 2, 4, 3, 2, 10, 7}, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 6, + 10, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu) { + float output_data[2]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + -1, -6, 2, 4, // + -3, -2, 10.5, 7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 0.0, + 10.5, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu1) { + float output_data[2]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + -2.75, -6, 0.2, 0.4, // + -3, -2, -0.3, 0.7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + -1.0, + 0.7, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu1, + output_data); + + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + -2.75, -6, -2, -4, // + -3, -2, 10, -7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + -1.0, + 1.0, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu1, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu6) { + float output_data[2]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + -1.5, -6, 12, 4, // + -3, -2, 10, 7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 0.0, + 6.0, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, + output_data); + + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + 0, 4.5, 12, 4, // + 3, 2, 10, 7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 4.5, + 6.0, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestPaddingSameStride1) { + float output_data[8]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + 0, 6, 2, 4, // + 3, 2, 10, 7, // + }, + 2, 2, // filter width, filter height + 1, 1, // stride width, stride height + { + // Output values + 6, 10, 10, 7, // + 3, 10, 10, 7, // + }, + {4, 1, 2, 4, 1}, // Output shape + kTfLitePaddingSame, kTfLiteActNone, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestPaddingValidStride1) { + float output_data[3]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + 0, 6, 2, 4, // + 3, 2, 10, 7, // + }, + 2, 2, // filter width, filter height + 1, 1, // stride width, stride height + { + // Output values + 6, + 10, + 10, + }, + {4, 1, 1, 3, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestUInt8ActNone) { + using tflite::testing::F2Q; + + uint8_t output_data[2]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(6, output_min, output_max), F2Q(10, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu) { + using tflite::testing::F2Q; + + uint8_t output_data[2]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(-1.5, input_min, input_max), + F2Q(-6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(-3, input_min, input_max), + F2Q(-2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(0, output_min, output_max), F2Q(10, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu1) { + using tflite::testing::F2Q; + + uint8_t output_data[2]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(-1.7, input_min, input_max), + F2Q(-6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(-3, input_min, input_max), + F2Q(-2, input_min, input_max), + F2Q(-10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(-1.0, output_min, output_max), F2Q(1.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu1, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu6) { + using tflite::testing::F2Q; + + uint8_t output_data[8]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(-6, input_min, input_max), + F2Q(12, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(-3, input_min, input_max), + F2Q(-2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(0.0, output_min, output_max), F2Q(6.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, output_data); + + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(4.5, input_min, input_max), + F2Q(12, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(4.5, output_min, output_max), F2Q(6.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingSameStride1) { + using tflite::testing::F2Q; + + uint8_t output_data[8]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + { + // Output values + F2Q(6, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(7, output_min, output_max), + F2Q(3, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(7, output_min, output_max), + }, + output_min, output_max, {4, 1, 2, 4, 1}, // Output shape + kTfLitePaddingSame, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingValidStride1) { + using tflite::testing::F2Q; + + uint8_t output_data[3]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + { + // Output values + F2Q(6, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(10, output_min, output_max), + }, + output_min, output_max, {4, 1, 1, 3, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestInt8ActNone) { + using tflite::testing::F2QS; + + int8_t output_data[2]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(3, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(6, output_min, output_max), F2QS(10, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu) { + using tflite::testing::F2QS; + + int8_t output_data[2]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(-1.5, input_min, input_max), + F2QS(-6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(-3, input_min, input_max), + F2QS(-2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(0, output_min, output_max), F2QS(10, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu1) { + using tflite::testing::F2QS; + + int8_t output_data[2]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(-1.7, input_min, input_max), + F2QS(-6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(-3, input_min, input_max), + F2QS(-2, input_min, input_max), + F2QS(-10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(-1.0, output_min, output_max), F2QS(1.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu1, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu6) { + using tflite::testing::F2QS; + + int8_t output_data[8]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(-6, input_min, input_max), + F2QS(12, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(-3, input_min, input_max), + F2QS(-2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(0.0, output_min, output_max), F2QS(6.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, output_data); + + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(4.5, input_min, input_max), + F2QS(12, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(3, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(4.5, output_min, output_max), F2QS(6.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingSameStride1) { + using tflite::testing::F2QS; + + int8_t output_data[8]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(3, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + { + // Output values + F2QS(6, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(7, output_min, output_max), + F2QS(3, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(7, output_min, output_max), + }, + output_min, output_max, {4, 1, 2, 4, 1}, // Output shape + kTfLitePaddingSame, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingValidStride1) { + using tflite::testing::F2QS; + + int8_t output_data[3]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(3, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + { + // Output values + F2QS(6, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(10, output_min, output_max), + }, + output_min, output_max, {4, 1, 1, 3, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc index 851a5d43378..0cba07d9d27 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc @@ -1,6 +1,6 @@ ifeq ($(TARGET_ARCH), arc) -# embarc_mli Library is used by default for ARC platform whenever it's possible. +# embarc_mli Library is used by default for ARC platform whenever it is possible. # To use TFLM reference implementation it should be intentionally turned off # by passing 'no_embarc_mli' tag (make -f TAGS=no_embarc_mli ...) ifeq ($(filter no_embarc_mli,$(ALL_TAGS)),) @@ -63,5 +63,14 @@ endif MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h + + MICROLITE_TEST_SRCS += $(wildcard tensorflow/lite/micro/kernels/embarc_mli/*test.cc) + + EMBARC_MLI_TESTS := conv depthwise_conv pooling fully_connected + EMBARC_MLI_TESTS += $(foreach TEST,$(EMBARC_MLI_TESTS), $(TEST)_slicing) + +generate_embarc_mli_test_projects: $(foreach TEST,$(EMBARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project) + + endif # no_embarc_mli endif # TARGET_ARCH From fc83b7fedb4f8727ac63c9e8b4c3bc7e8e75643c Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 15 Apr 2020 13:26:08 +0300 Subject: [PATCH 038/557] embARC MLI related code is present in arc_mli --- .../kernels/{embarc_mli => arc_mli}/conv.cc | 8 +- .../conv_slicing_test.cc | 0 .../{embarc_mli => arc_mli}/depthwise_conv.cc | 8 +- .../depthwise_conv_slicing_test.cc | 0 .../fully_connected.cc | 8 +- .../fully_connected_slicing_test.cc | 0 .../{embarc_mli => arc_mli}/mli_slicers.cc | 0 .../{embarc_mli => arc_mli}/mli_slicers.h | 0 .../{embarc_mli => arc_mli}/mli_tf_utils.h | 0 .../{embarc_mli => arc_mli}/pooling.cc | 8 +- .../pooling_slicing_test.cc | 0 .../scratch_buf_mgr.cc | 4 +- .../{embarc_mli => arc_mli}/scratch_buf_mgr.h | 0 .../scratch_buffers.cc | 2 +- .../{embarc_mli => arc_mli}/scratch_buffers.h | 0 .../micro/tools/make/ext_libs/arc_mli.inc | 92 +++++++++++++++++++ .../micro/tools/make/ext_libs/embarc_mli.inc | 76 --------------- 17 files changed, 111 insertions(+), 95 deletions(-) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/conv.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/conv_slicing_test.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/depthwise_conv.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/depthwise_conv_slicing_test.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/fully_connected.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/fully_connected_slicing_test.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/mli_slicers.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/mli_slicers.h (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/mli_tf_utils.h (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/pooling.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/pooling_slicing_test.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/scratch_buf_mgr.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/scratch_buf_mgr.h (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/scratch_buffers.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/scratch_buffers.h (100%) create mode 100644 tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc delete mode 100644 tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/conv.cc b/tensorflow/lite/micro/kernels/arc_mli/conv.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/conv.cc rename to tensorflow/lite/micro/kernels/arc_mli/conv.cc index b124b17f66d..d02f081434f 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv.cc @@ -24,10 +24,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc rename to tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc rename to tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 0ad2a9fe6c6..049347cc7a1 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -25,10 +25,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc rename to tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc rename to tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 8088634f8de..61fa0ff397f 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -23,10 +23,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc rename to tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc rename to tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h rename to tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h diff --git a/tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h b/tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h rename to tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h diff --git a/tensorflow/lite/micro/kernels/embarc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/pooling.cc rename to tensorflow/lite/micro/kernels/arc_mli/pooling.cc index a147171a859..ced5c4a21b8 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -20,10 +20,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc rename to tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc rename to tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc index 8d00e28714c..d030d04170c 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" #include #define MAX(A,B) (((A) > (B))? (A): (B)) #define MIN(A,B) (((A) > (B))? (B): (A)) diff --git a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h rename to tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h diff --git a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc rename to tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc index 689c490569e..a770e4ccd66 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" #include #define MAX(A,B) (((A) > (B))? (A): (B)) #define MIN(A,B) (((A) > (B))? (B): (A)) diff --git a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h rename to tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h diff --git a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc new file mode 100644 index 00000000000..3b8fa04d536 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc @@ -0,0 +1,92 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Settings for embARC MLI library for ARC platform. + +ifeq ($(TARGET_ARCH), arc) + +# MLI Library is used by default for ARC platform whenever it is possible. +# To use TFLM reference implementation MLI should be intentionally turned off +# by passing 'no_arc_mli' tag (make -f TAGS=no_arc_mli ...) +ifeq ($(filter no_arc_mli,$(ALL_TAGS)),) + + +ALL_TAGS += arc_mli + +ifeq ($(PRE_COMPILED_MLI),true) + # TODO: Replace with proper arc_mli pre-builts. + $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) + + MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include + MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a + + THIRD_PARTY_CC_HDRS += \ + third_party/embarc_osp/LICENSE +else + MLI_LIB_DIR = arc_mli_$(basename $(TCF_FILE_NAME)) + + $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) + + MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include + MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a + MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_LIB_DIR)/LICENSE +endif + + THIRD_PARTY_CC_HDRS += $(MLI_LIB) + GENERATED_PROJECT_LIBS += $(MLI_LIB) + + INCLUDES += \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api + + GENERATED_PROJECT_INCLUDES += \ + -I. \ + -I./third_party/$(MLI_INCLUDE_FOLDER) \ + -I./third_party/$(MLI_INCLUDE_FOLDER)/api + + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h + + + MICROLITE_TEST_SRCS += $(wildcard tensorflow/lite/micro/kernels/arc_mli/*test.cc) + + ARC_MLI_TESTS := conv depthwise_conv pooling fully_connected + ARC_MLI_TESTS += $(foreach TEST,$(ARC_MLI_TESTS), $(TEST)_slicing) + +generate_arc_mli_test_projects: $(foreach TEST,$(ARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project) + + +endif # no_embarc_mli +endif # TARGET_ARCH diff --git a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc deleted file mode 100644 index 0cba07d9d27..00000000000 --- a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc +++ /dev/null @@ -1,76 +0,0 @@ -ifeq ($(TARGET_ARCH), arc) - -# embarc_mli Library is used by default for ARC platform whenever it is possible. -# To use TFLM reference implementation it should be intentionally turned off -# by passing 'no_embarc_mli' tag (make -f TAGS=no_embarc_mli ...) -ifeq ($(filter no_embarc_mli,$(ALL_TAGS)),) - - -ALL_TAGS += embarc_mli - -ifeq ($(PRE_COMPILED_MLI),true) - # TODO: Replace with proper embarc_mli pre-builts. - $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else - MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) - - $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) - - MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include - MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a - MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_LIB_DIR)/LICENSE -endif - - THIRD_PARTY_CC_HDRS += $(MLI_LIB) - GENERATED_PROJECT_LIBS += $(MLI_LIB) - - INCLUDES += \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api - - GENERATED_PROJECT_INCLUDES += \ - -I. \ - -I./third_party/$(MLI_INCLUDE_FOLDER) \ - -I./third_party/$(MLI_INCLUDE_FOLDER)/api - - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h - - - MICROLITE_TEST_SRCS += $(wildcard tensorflow/lite/micro/kernels/embarc_mli/*test.cc) - - EMBARC_MLI_TESTS := conv depthwise_conv pooling fully_connected - EMBARC_MLI_TESTS += $(foreach TEST,$(EMBARC_MLI_TESTS), $(TEST)_slicing) - -generate_embarc_mli_test_projects: $(foreach TEST,$(EMBARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project) - - -endif # no_embarc_mli -endif # TARGET_ARCH From 1196bed72bcedb8abc72a3da70c7ba58af03395f Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 16 Apr 2020 12:15:40 +0300 Subject: [PATCH 039/557] Merge latest updates from reference kernelse inside wrappers of arc_mli + fix minor bugs in kernel tests --- tensorflow/lite/micro/kernels/arc_mli/conv.cc | 180 +++++--- .../micro/kernels/arc_mli/depthwise_conv.cc | 389 ++++++++++-------- .../micro/kernels/arc_mli/fully_connected.cc | 49 ++- tensorflow/lite/micro/kernels/conv_test.cc | 4 +- tensorflow/lite/micro/kernels/pooling_test.cc | 2 +- 5 files changed, 361 insertions(+), 263 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv.cc b/tensorflow/lite/micro/kernels/arc_mli/conv.cc index d02f081434f..b9be93ceb11 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/conv.h" -#include "mli_api.h" // NOLINT +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -24,12 +24,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" - -#include "mli_api.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" namespace tflite { namespace ops { @@ -42,9 +40,11 @@ constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; constexpr int kMaxChannels = 256; -// This file has 2 implementation of Conv. +// Conv is quantized along dimension 0: +// https://www.tensorflow.org/lite/performance/quantization_spec +constexpr int kConvQuantizedDimension = 0; -const int kTensorNotAllocated = -1; +// This file has 2 implementation of Conv. struct OpData { TfLitePaddingValues padding; @@ -101,13 +101,15 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + int output_channels = filter->dims->data[kConvQuantizedDimension]; TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams( context, input, filter, bias, output, params->activation, &data->output_multiplier, &data->output_shift, &data->output_activation_min, &data->output_activation_max, data->per_channel_output_multiplier, - reinterpret_cast(data->per_channel_output_shift))); + reinterpret_cast(data->per_channel_output_shift), + output_channels)); } return kTfLiteOk; } @@ -144,12 +146,10 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorData(im2col), nullptr); } -TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, - TfLiteConvParams* params, OpData* data, - const TfLiteTensor* input, - const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output, - TfLiteTensor* im2col) { +TfLiteStatus EvalMliQuantizedPerChannel( + TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params, + OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* output) { // Run Conv MLI kernel // MLI optimized version only supports int8 dataype and dilation factor of 1 if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) && @@ -204,24 +204,36 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const int height_dimension = 1; int in_slice_height = 0; int out_slice_height = 0; - const int kernel_height = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); + const int kernel_height = + static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); const int overlap = kernel_height - cfg.stride_height; // for weight slicing (on output channels) - const int weight_out_ch_dimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. - int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); - const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + const int weight_out_ch_dimension = + 0; // NHWC layout for weigths, output channel dimension is the first + // dimension. + int slice_channels = + static_cast(mli_weights.shape[weight_out_ch_dimension]); + const int out_tensor_ch_dimension = + 3; // Batch-Height-Width-Channel layout means last dimension is output + // channels. - // Tensors for data in fast (local) memory and config to copy data from external to local memory + // Tensors for data in fast (local) memory and config to copy data from + // external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; mli_tensor in_local = mli_in; mli_tensor out_local = mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, &out_slice_height)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors( + context, &in_local, &weights_local, &bias_local, &out_local)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( + &in_local, &out_local, kernel_height, cfg.stride_height, + cfg.padding_top, cfg.padding_bottom, &in_slice_height, + &out_slice_height)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights( + &weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); /* is_local indicates that the tensor is already in local memory, so in that case the original tensor can be used, @@ -233,33 +245,40 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels); TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, slice_channels); - TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, + 0, 0, 0, true); - mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; - mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - void *input_buffer_ptr = NULL; + void* input_buffer_ptr = NULL; int input_buffer_size = 0; - while (!w_slice.Done()){ + while (!w_slice.Done()) { mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. - on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, cfg.padding_top, cfg.padding_bottom, overlap); + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional + tensor. because the mli kernel will process one HWC tensor at a time, the + 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height + dimension. for that the sliceHeight has been calculated. The tensor slicer + is configured that it will completely slice the nBatch dimension (0) and + slice the height dimension (1) in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, + cfg.padding_top, cfg.padding_bottom, overlap); - /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of - output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and - height dimension. */ - TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension, out_slice_height); + /* output tensor is alreade sliced in the output channel dimension. + out_ch_slice.Sub() is the tensor for the amount of output channels of this + itteration of the weight slice loop. This tensor needs to be further + sliced over the batch and height dimension. */ + TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension, + out_slice_height); - /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + /* setup the pointers to the local or remote tensor to make the code + * inside the loop easier. */ + mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local; while (!out_slice.Done()) { TF_LITE_ENSURE(context, !in_slice.Done()); @@ -267,7 +286,8 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = in_slice.GetPaddingPost(); // if same input copy as previous iteration, skip the copy of input - if ((in_slice.Sub()->data != input_buffer_ptr) || (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { + if ((in_slice.Sub()->data != input_buffer_ptr) || + (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); input_buffer_ptr = in_slice.Sub()->data; input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); @@ -283,26 +303,37 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, out_ch_slice.Next(); TF_LITE_ENSURE(context, in_slice.Done()); } - - } else { - ConvParams op_params; - op_params.input_offset = -input->params.zero_point; - op_params.output_offset = output->params.zero_point; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.dilation_height_factor = params->dilation_height_factor; - op_params.dilation_width_factor = params->dilation_width_factor; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - - reference_integer_ops::ConvPerChannel( - op_params, data->per_channel_output_multiplier, - data->per_channel_output_shift, GetTensorShape(input), - GetTensorData(input), GetTensorShape(filter), - GetTensorData(filter), GetTensorShape(bias), - GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); } + + return kTfLiteOk; +} + +TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, + TfLiteConvParams* params, OpData* data, + const TfLiteTensor* input, + const TfLiteTensor* filter, + const TfLiteTensor* bias, + TfLiteTensor* output) { + ConvParams op_params; + op_params.input_offset = -input->params.zero_point; + op_params.output_offset = output->params.zero_point; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; + + reference_integer_ops::ConvPerChannel( + op_params, data->per_channel_output_multiplier, + data->per_channel_output_shift, GetTensorShape(input), + GetTensorData(input), GetTensorShape(filter), + GetTensorData(filter), GetTensorShape(bias), + GetTensorData(bias), GetTensorShape(output), + GetTensorData(output)); + return kTfLiteOk; } @@ -352,6 +383,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpData data; // All per-channel quantized tensors need valid zero point and scale arrays. + bool mli_is_applicable = false; if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, filter->quantization.type, kTfLiteAffineQuantization); @@ -362,26 +394,38 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, affine_quantization); TF_LITE_ENSURE(context, affine_quantization->scale); TF_LITE_ENSURE(context, affine_quantization->zero_point); - // Conv is quantized along dimension 0: - // https://www.tensorflow.org/lite/performance/quantization_spec - TF_LITE_ENSURE_EQ(context, filter->dims->data[0], - affine_quantization->scale->size); - TF_LITE_ENSURE_EQ(context, filter->dims->data[0], + + TF_LITE_ENSURE(context, + affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kConvQuantizedDimension]); + TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, affine_quantization->zero_point->size); + mli_is_applicable = + ((filter->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && + (params->dilation_width_factor == 1) && + (params->dilation_height_factor == 1) && + (affine_quantization->scale->size == + filter->dims->data[kConvQuantizedDimension])); } TF_LITE_ENSURE_STATUS(CalculateOpData( context, node, params, input_width, input_height, filter_width, filter_height, output_width, output_height, input->type, &data)); - switch (input->type) { // Already know in/out types are same. case kTfLiteFloat32: EvalFloat(context, node, params, &data, input, filter, bias, nullptr, nullptr, output); break; case kTfLiteInt8: - return EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, - output, nullptr); + if (mli_is_applicable) { + return EvalMliQuantizedPerChannel(context, node, params, &data, input, + filter, bias, output); + + } else { + return EvalQuantizedPerChannel(context, node, params, &data, input, + filter, bias, output); + } break; case kTfLiteUInt8: EvalQuantized(context, node, params, &data, input, filter, bias, nullptr, diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 049347cc7a1..9860235b2fb 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" -#include "mli_api.h" // NOLINT +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -30,8 +30,6 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" -#include "mli_api.h" - namespace tflite { namespace ops { namespace micro { @@ -44,6 +42,10 @@ constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; constexpr int kMaxChannels = 256; +// Depthwise conv is quantized along dimension 3: +// https://www.tensorflow.org/lite/performance/quantization_spec +constexpr int kDepthwiseConvQuantizedDimension = 3; + struct OpData { TfLitePaddingValues padding; // The scaling factor from input to output (aka the 'real multiplier') can @@ -85,6 +87,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension]; // Ensure filter and bias channel count does not exceed space reserved for // quantization metadata. @@ -101,7 +104,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, &data->output_multiplier, &data->output_shift, &data->output_activation_min, &data->output_activation_max, data->per_channel_output_multiplier, - reinterpret_cast(data->per_channel_output_shift))); + reinterpret_cast(data->per_channel_output_shift), num_channels)); } return kTfLiteOk; } @@ -136,187 +139,201 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorData(output)); } -TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, +TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { // Run Depthwise Conv MLI kernel // MLI optimized version only supports int8 dataype and dilation factor of 1 - if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) && - (params->dilation_height_factor == 1)) { - mli_tensor mli_in = {0}; - mli_tensor mli_weights = {0}; - mli_tensor mli_bias = {0}; - mli_tensor mli_out = {0}; - mli_conv2d_cfg cfg = {}; + mli_tensor mli_in = {0}; + mli_tensor mli_weights = {0}; + mli_tensor mli_bias = {0}; + mli_tensor mli_out = {0}; + mli_conv2d_cfg cfg = {}; - // reuse space allocated for OpData parameters - mli_weights.el_params.asym.scale.pi16 = - (int16_t*)data->per_channel_output_multiplier; - mli_bias.el_params.asym.scale.pi16 = - (int16_t*)data->per_channel_output_shift; + // reuse space allocated for OpData parameters + mli_weights.el_params.asym.scale.pi16 = + (int16_t*)data->per_channel_output_multiplier; + mli_bias.el_params.asym.scale.pi16 = + (int16_t*)data->per_channel_output_shift; - int16_t filter_zero_point = 0; - int16_t bias_zero_point = 0; - mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point; - mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point; + int16_t filter_zero_point = 0; + int16_t bias_zero_point = 0; + mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point; + mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point; - ConvertToMliTensor(input, &mli_in); - ConvertToMliTensorPerChannel(filter, &mli_weights); - ConvertToMliTensorPerChannel(bias, &mli_bias); - ConvertToMliTensor(output, &mli_out); - - if (params->activation == kTfLiteActRelu) { - cfg.relu.type = MLI_RELU_GEN; - } else if (params->activation == kTfLiteActRelu6) { - cfg.relu.type = MLI_RELU_6; - } else if (params->activation == kTfLiteActRelu1) { - cfg.relu.type = MLI_RELU_1; - } else { - cfg.relu.type = MLI_RELU_NONE; - } - - cfg.stride_width = params->stride_width; - cfg.stride_height = params->stride_height; - if (params->padding == kTfLitePaddingValid) { - cfg.padding_left = 0; - cfg.padding_right = 0; - cfg.padding_top = 0; - cfg.padding_bottom = 0; - } else { - cfg.padding_left = data->padding.width; - cfg.padding_right = data->padding.width + data->padding.width_offset; - cfg.padding_top = data->padding.height; - cfg.padding_bottom = data->padding.height + data->padding.height_offset; - } - - // for height slicing - const int heightDimension = 1; - int inSliceHeight = 0; - int outSliceHeight = 0; - const int kernelHeight = static_cast(mli_weights.shape[KRNL_DW_H_DIM_HWC]); - const int overlap = kernelHeight - cfg.stride_height; - - // for weight slicing (on output channels) - const int weight_out_ch_dimension = 3; // HWCN layout for weigths, output channel dimension is the first dimension. - const int bias_out_ch_dimension = 0; // bias has only 1 dimension - const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. - const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension]; - const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension]; - int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); - - // Tensors for data in fast (local) memory and config to copy data from external to local memory - mli_tensor weights_local = mli_weights; - mli_tensor bias_local = mli_bias; - mli_tensor in_local = mli_in; - mli_tensor out_local = mli_out; // this assumes that output shape is already filled in the tensor struct. - mli_mov_cfg_t copy_config; - mli_mov_cfg_for_copy(©_config); - - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - /* is_local indicates that the tensor is already in local memory, - so in that case the original tensor can be used, - and there is no need to copy it to the local tensor*/ - const bool in_is_local = in_local.data == mli_in.data; - const bool out_is_local = out_local.data == mli_out.data; - const bool w_is_local = weights_local.data == mli_weights.data; - const bool b_is_local = bias_local.data == mli_bias.data; - - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); - - /* if input channels is not equal to output channels, a channel multiplier is used. - in this case the slice channels needs to be rounded down to a multiple of the input channels */ - if (in_channels != out_channels) { - slice_channels = (slice_channels / in_channels) * in_channels; - } - - TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0, 0, 0, true); - TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels); - TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); - TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); - - mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; - mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - - void *input_buffer_ptr = NULL; - int input_buffer_size = 0; - int padding_top = cfg.padding_top; - int padding_bottom = cfg.padding_bottom; - - while (!w_slice.Done()){ - mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); - mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - - /* input tensor is alreade sliced in the channel dimension. out_ch_slice.Sub() is the tensor for the amount of - channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and - height dimension. - in_ch_slice.Sub() tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. - on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, padding_top, padding_bottom, overlap); - - /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of - output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and - height dimension. */ - TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); - - /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; - - while (!out_slice.Done()) { - TF_LITE_ENSURE(context, !in_slice.Done()); - cfg.padding_top = in_slice.GetPaddingPre(); - cfg.padding_bottom = in_slice.GetPaddingPost(); - - // if same input copy as previous iteration, skip the copy of input - if ((in_slice.Sub()->data != input_buffer_ptr) || (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - input_buffer_ptr = in_slice.Sub()->data; - input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); - } - mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); - - in_slice.Next(); - out_slice.Next(); - } - w_slice.Next(); - b_slice.Next(); - out_ch_slice.Next(); - in_ch_slice.Next(); - TF_LITE_ENSURE(context, in_slice.Done()); - } + ConvertToMliTensor(input, &mli_in); + ConvertToMliTensorPerChannel(filter, &mli_weights); + ConvertToMliTensorPerChannel(bias, &mli_bias); + ConvertToMliTensor(output, &mli_out); + if (params->activation == kTfLiteActRelu) { + cfg.relu.type = MLI_RELU_GEN; + } else if (params->activation == kTfLiteActRelu6) { + cfg.relu.type = MLI_RELU_6; + } else if (params->activation == kTfLiteActRelu1) { + cfg.relu.type = MLI_RELU_1; } else { - DepthwiseParams op_params; - op_params.padding_type = PaddingType::kSame; - op_params.padding_values.width = data->padding.width; - op_params.padding_values.height = data->padding.height; - op_params.stride_width = params->stride_width; - op_params.stride_height = params->stride_height; - op_params.dilation_width_factor = params->dilation_width_factor; - op_params.dilation_height_factor = params->dilation_height_factor; - op_params.depth_multiplier = params->depth_multiplier; - op_params.input_offset = -input->params.zero_point; - op_params.weights_offset = 0; - op_params.output_offset = output->params.zero_point; - // TODO(b/130439627): Use calculated value for clamping. - op_params.quantized_activation_min = std::numeric_limits::min(); - op_params.quantized_activation_max = std::numeric_limits::max(); - - reference_integer_ops::DepthwiseConvPerChannel( - op_params, data->per_channel_output_multiplier, - data->per_channel_output_shift, GetTensorShape(input), - GetTensorData(input), GetTensorShape(filter), - GetTensorData(filter), GetTensorShape(bias), - GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); + cfg.relu.type = MLI_RELU_NONE; } + + cfg.stride_width = params->stride_width; + cfg.stride_height = params->stride_height; + if (params->padding == kTfLitePaddingValid) { + cfg.padding_left = 0; + cfg.padding_right = 0; + cfg.padding_top = 0; + cfg.padding_bottom = 0; + } else { + cfg.padding_left = data->padding.width; + cfg.padding_right = data->padding.width + data->padding.width_offset; + cfg.padding_top = data->padding.height; + cfg.padding_bottom = data->padding.height + data->padding.height_offset; + } + + // for height slicing + const int heightDimension = 1; + int inSliceHeight = 0; + int outSliceHeight = 0; + const int kernelHeight = static_cast(mli_weights.shape[KRNL_DW_H_DIM_HWC]); + const int overlap = kernelHeight - cfg.stride_height; + + // for weight slicing (on output channels) + const int weight_out_ch_dimension = 3; // HWCN layout for weigths, output channel dimension is the first dimension. + const int bias_out_ch_dimension = 0; // bias has only 1 dimension + const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension]; + const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension]; + int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); + + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; // this assumes that output shape is already filled in the tensor struct. + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors( + context, &in_local, &weights_local, &bias_local, &out_local)); + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; + + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( + &in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, + cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights( + &weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); + + /* if input channels is not equal to output channels, a channel multiplier + is used. in this case the slice channels needs to be rounded down to a + multiple of the input channels */ + if (in_channels != out_channels) { + slice_channels = (slice_channels / in_channels) * in_channels; + } + + TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + + mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + + void *input_buffer_ptr = NULL; + int input_buffer_size = 0; + int padding_top = cfg.padding_top; + int padding_bottom = cfg.padding_bottom; + + while (!w_slice.Done()){ + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); + + /* input tensor is alreade sliced in the channel dimension. + out_ch_slice.Sub() is the tensor for the amount of channels of this + itteration of the weight slice loop. This tensor needs to be further + sliced over the batch and height dimension. in_ch_slice.Sub() tensor + contains batches of HWC tensors. so it is a 4 dimensional tensor. because + the mli kernel will process one HWC tensor at a time, the 4 dimensional + tensor needs to be sliced into nBatch 3 dimensional tensors. on top of + that there could be a need to also slice in the Height dimension. for that + the sliceHeight has been calculated. The tensor slicer is configured that + it will completely slice the nBatch dimension (0) and slice the height + dimension (1) in chunks of 'sliceHeight' */ + TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, padding_top, padding_bottom, overlap); + + /* output tensor is alreade sliced in the output channel dimension. + out_ch_slice.Sub() is the tensor for the amount of output channels of this + itteration of the weight slice loop. This tensor needs to be further + sliced over the batch and height dimension. */ + TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); + + /* setup the pointers to the local or remote tensor to make the code + * inside the loop easier. */ + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + TF_LITE_ENSURE(context, !in_slice.Done()); + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + // if same input copy as previous iteration, skip the copy of input + if ((in_slice.Sub()->data != input_buffer_ptr) || + (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + input_buffer_ptr = in_slice.Sub()->data; + input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); + } + mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); + in_ch_slice.Next(); + TF_LITE_ENSURE(context, in_slice.Done()); + } + return kTfLiteOk; +} + +TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, + OpData* data, const TfLiteTensor* input, + const TfLiteTensor* filter, + const TfLiteTensor* bias, + TfLiteTensor* output) { + DepthwiseParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = data->padding.width; + op_params.padding_values.height = data->padding.height; + op_params.stride_width = params->stride_width; + op_params.stride_height = params->stride_height; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.depth_multiplier = params->depth_multiplier; + op_params.input_offset = -input->params.zero_point; + op_params.weights_offset = 0; + op_params.output_offset = output->params.zero_point; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; + + reference_integer_ops::DepthwiseConvPerChannel( + op_params, data->per_channel_output_multiplier, + data->per_channel_output_shift, GetTensorShape(input), + GetTensorData(input), GetTensorShape(filter), + GetTensorData(filter), GetTensorShape(bias), + GetTensorData(bias), GetTensorShape(output), + GetTensorData(output)); return kTfLiteOk; } @@ -373,6 +390,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpData data; // All per-channel quantized tensors need valid zero point and scale arrays. + bool mli_is_applicable = false; if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, filter->quantization.type, kTfLiteAffineQuantization); @@ -383,12 +401,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, affine_quantization); TF_LITE_ENSURE(context, affine_quantization->scale); TF_LITE_ENSURE(context, affine_quantization->zero_point); - // Depthwise conv is quantized along dimension 3: - // https://www.tensorflow.org/lite/performance/quantization_spec - TF_LITE_ENSURE_EQ(context, filter->dims->data[3], - affine_quantization->scale->size); - TF_LITE_ENSURE_EQ(context, filter->dims->data[3], + TF_LITE_ENSURE( + context, affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kDepthwiseConvQuantizedDimension]); + TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, affine_quantization->zero_point->size); + mli_is_applicable = + ((filter->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && + (params->dilation_width_factor == 1) && + (params->dilation_height_factor == 1) && + (affine_quantization->scale->size == + filter->dims->data[kDepthwiseConvQuantizedDimension])); } TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height, @@ -399,8 +423,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { EvalFloat(context, node, params, &data, input, filter, bias, output); break; case kTfLiteInt8: - return EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, - output); + if (mli_is_applicable) { + return EvalMliQuantizedPerChannel(context, node, params, &data, input, + filter, bias, output); + } else { + return EvalQuantizedPerChannel(context, node, params, &data, input, + filter, bias, output); + } break; case kTfLiteUInt8: EvalQuantized(context, node, params, &data, input, filter, bias, output); diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 61fa0ff397f..185217d0c6a 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/fully_connected.h" -#include "mli_api.h" // NOLINT +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -28,8 +28,6 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" -#include "mli_api.h" - namespace tflite { namespace ops { namespace micro { @@ -77,6 +75,37 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, } // namespace +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + OpData* data = nullptr; + TfLiteStatus status = context->AllocatePersistentBuffer( + context, sizeof(OpData), reinterpret_cast(&data)); + if (status != kTfLiteOk || data == nullptr) { + return nullptr; + } + return data; +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + OpData* data = reinterpret_cast(node->user_data); + auto* params = + reinterpret_cast(node->builtin_data); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor); + const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_MSG(context, input->type == filter->type, + "Hybrid models are not supported on TFLite Micro."); + + TfLiteType data_type = input->type; + TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input, + filter, bias, output, data)); + + return kTfLiteOk; +} + TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, TfLiteFullyConnectedParams* params, OpData* data, const TfLiteTensor* input, @@ -263,13 +292,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TfLiteType data_type = input->type; - OpData local_data_object; - OpData* data = &local_data_object; - TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input, - filter, bias, output, data)); + OpData* data = reinterpret_cast(node->user_data); - switch (filter->type) { // Already know in/out types are same. + // Checks in Prepare ensure input, output and filter types are all the same. + switch (input->type) { case kTfLiteFloat32: return EvalFloat(context, node, params, data, input, filter, bias, output); @@ -292,15 +318,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace fully_connected TfLiteRegistration* Register_FULLY_CONNECTED() { - static TfLiteRegistration r = {/*init=*/nullptr, + static TfLiteRegistration r = {/*init=*/fully_connected::Init, /*free=*/nullptr, - /*prepare=*/nullptr, + /*prepare=*/fully_connected::Prepare, /*invoke=*/fully_connected::Eval, /*profiling_string=*/nullptr, /*builtin_code=*/0, /*custom_name=*/nullptr, /*version=*/0}; - return &r; } diff --git a/tensorflow/lite/micro/kernels/conv_test.cc b/tensorflow/lite/micro/kernels/conv_test.cc index 4cc2a80c3ea..8a3eb30630d 100644 --- a/tensorflow/lite/micro/kernels/conv_test.cc +++ b/tensorflow/lite/micro/kernels/conv_test.cc @@ -409,8 +409,8 @@ TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) { TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannelRelu6) { // conv params: - // padding, stride_, dilation_, activation - TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; + // padding, stride_, activation, dilation_ + TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6, 1, 1}; const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] const int kInputElements = kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; diff --git a/tensorflow/lite/micro/kernels/pooling_test.cc b/tensorflow/lite/micro/kernels/pooling_test.cc index 8bfeb718a1b..96dff421d53 100644 --- a/tensorflow/lite/micro/kernels/pooling_test.cc +++ b/tensorflow/lite/micro/kernels/pooling_test.cc @@ -496,7 +496,7 @@ TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingSameStride1ActNone) { F2QS(8.5, output_min, output_max), F2QS(7., output_min, output_max)}, {4, 1, 2, 4, 1}, // Output shape output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActNone, output_data); + kTfLitePaddingSame, kTfLiteActNone, output_data); } TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloat) { From 273948c6aaf8424e8adf33d6f3fcba6c9fa935e2 Mon Sep 17 00:00:00 2001 From: Daria Zhuravleva Date: Tue, 14 Apr 2020 12:10:11 +0300 Subject: [PATCH 040/557] Common wrapper for average and max pooling --- .../lite/micro/kernels/arc_mli/pooling.cc | 267 ++++++++++-------- 1 file changed, 145 insertions(+), 122 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index ced5c4a21b8..7f87d4849ff 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/internal/reference/pooling.h" -#include "mli_api.h" // NOLINT #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" @@ -41,6 +40,8 @@ struct OpData { TfLitePaddingValues padding; }; +typedef enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 } MliPoolingType; + TfLiteStatus CalculateOpData(const TfLiteContext* context, const TfLitePoolParams* params, const TfLiteTensor* input, @@ -81,110 +82,111 @@ void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node, GetTensorShape(output), GetTensorData(output)); } -void AverageEvalUint8(TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { - int32_t activation_min, activation_max; - (void)CalculateActivationRangeQuantized(context, params->activation, output, - &activation_min, &activation_max); +//Prepare MLI tensors and run Average or Max Pooling +TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, + const OpData* data, const TfLiteTensor* input, + TfLiteTensor* output, const MliPoolingType pooling_type) { + mli_tensor mli_in = {0}; + mli_tensor mli_out = {0}; + mli_pool_cfg cfg = {0}; - PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; - reference_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); + ConvertToMliTensor(input, &mli_in); + ConvertToMliTensor(output, &mli_out); + + cfg.kernel_width = params->filter_width; + cfg.kernel_height = params->filter_height; + cfg.stride_width = params->stride_width; + cfg.stride_height = params->stride_height; + + if (params->padding == kTfLitePaddingValid) { + cfg.padding_left = 0; + cfg.padding_right = 0; + cfg.padding_top = 0; + cfg.padding_bottom = 0; + } else { + cfg.padding_left = data->padding.width; + cfg.padding_right = data->padding.width + data->padding.width_offset; + cfg.padding_top = data->padding.height; + cfg.padding_bottom = data->padding.height + data->padding.height_offset; + } + + mli_point_to_subtsr_cfg subtsr_cfg_in = { + {0, 0}, 2, static_cast(mli_in.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_out = { + {0, 0}, 2, static_cast(mli_out.shape[1])}; + mli_tensor sub_mli_in = {0}; + mli_tensor sub_mli_out = {0}; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + + const int height_dimension = 1; + int in_slice_height = 0; + int out_slice_height = 0; + const int overlap = cfg.kernel_height - cfg.stride_height; + + // Tensors for data in fast (local) memory and config to copy data from + // external to local memory + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors( + context, &in_local, &out_local)); + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( + &in_local, &out_local, cfg.kernel_height, cfg.stride_height, + cfg.padding_top, cfg.padding_bottom, &in_slice_height, + &out_slice_height)); + + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional + tensor. because the mli kernel will process one HWC tensor at a time, the 4 + dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. on + top of that there could be a need to also slice in the Height dimension. + for that the sliceHeight has been calculated. The tensor slicer is + configured that it will completely slice the nBatch dimension (0) and slice + the height dimension (1) in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, + cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, height_dimension, out_slice_height); + + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ + mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + if (pooling_type == AveragePooling) + mli_krn_avepool_hwc_sa8(in_ptr, &cfg, out_ptr); + else if (pooling_type == MaxPooling) + mli_krn_maxpool_hwc_sa8(in_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + return kTfLiteOk; } -TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node, + const TfLitePoolParams* params, const OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { + TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); // Run Average Pooling MLI kernel // MLI optimized version only supports int8 dataype and no fused Relu // TODO: subject to add mli_saturate kernel if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) { - mli_tensor mli_in = {0}; - mli_tensor mli_out = {0}; - mli_pool_cfg cfg = {0}; - - ConvertToMliTensor(input, &mli_in); - ConvertToMliTensor(output, &mli_out); - - cfg.kernel_width = params->filter_width; - cfg.kernel_height = params->filter_height; - cfg.stride_width = params->stride_width; - cfg.stride_height = params->stride_height; - - if (params->padding == kTfLitePaddingValid) { - cfg.padding_left = 0; - cfg.padding_right = 0; - cfg.padding_top = 0; - cfg.padding_bottom = 0; - } else { - cfg.padding_left = data->padding.width; - cfg.padding_right = data->padding.width + data->padding.width_offset; - cfg.padding_top = data->padding.height; - cfg.padding_bottom = data->padding.height + data->padding.height_offset; - } - - mli_point_to_subtsr_cfg subtsr_cfg_in = {{0,0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg subtsr_cfg_out = {{0,0}, 2, static_cast(mli_out.shape[1])}; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - - const int height_dimension = 1; - int in_slice_height = 0; - int out_slice_height = 0; - const int overlap = cfg.kernel_height - cfg.stride_height; - - // Tensors for data in fast (local) memory and config to copy data from external to local memory - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; - mli_mov_cfg_t copy_config; - mli_mov_cfg_for_copy(©_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors(context, &in_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, &out_slice_height)); - - /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. - on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, cfg.padding_top, cfg.padding_bottom, overlap); - TensorSlicer out_slice(&mli_out, height_dimension, out_slice_height); - - /* is_local indicates that the tensor is already in local memory, - so in that case the original tensor can be used, - and there is no need to copy it to the local tensor*/ - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; - - while (!out_slice.Done()) { - cfg.padding_top = in_slice.GetPaddingPre(); - cfg.padding_bottom = in_slice.GetPaddingPost(); - - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_avepool_hwc_sa8(in_ptr, &cfg, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); - - in_slice.Next(); - out_slice.Next(); - } - + EvalMli(context, params, data, input, output, AveragePooling); } else { int32_t activation_min, activation_max; (void)CalculateActivationRangeQuantized(context, params->activation, output, &activation_min, &activation_max); + PoolParams op_params; op_params.stride_height = params->stride_height; op_params.stride_width = params->stride_width; @@ -194,11 +196,17 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, op_params.padding_values.width = data->padding.width; op_params.quantized_activation_min = activation_min; op_params.quantized_activation_max = activation_max; - reference_integer_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); + + if (input->type == kTfLiteUInt8) { + reference_ops::AveragePool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } else { + reference_integer_ops::AveragePool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } } - return kTfLiteOk; } void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, @@ -222,29 +230,45 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorData(output)); } -void MaxEvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node, - TfLitePoolParams* params, OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { - int32_t activation_min, activation_max; - (void)CalculateActivationRangeQuantized(context, params->activation, output, - &activation_min, &activation_max); +void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLitePoolParams* params, OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { + TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); + + // Run Max Pooling MLI kernel + // MLI optimized version only supports int8 dataype and no fused Relu + // TODO: subject to add mli_saturate kernel + if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) { + EvalMli(context, params, data, input, output, MaxPooling); + } else { + int32_t activation_min, activation_max; + (void)CalculateActivationRangeQuantized(context, params->activation, output, + &activation_min, &activation_max); - tflite::PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; - reference_ops::MaxPool(op_params, GetTensorShape(input), - GetTensorData(input), GetTensorShape(output), - GetTensorData(output)); + tflite::PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = activation_min; + op_params.quantized_activation_max = activation_max; + + if (input->type == kTfLiteUInt8) { + reference_ops::MaxPool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } else { + reference_integer_ops::MaxPool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } + } } - } // namespace + TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); OpData data; @@ -254,16 +278,14 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data)); - // Inputs and outputs share the same type, guarenteed by the converter. + // Inputs and outputs share the same type, guaranteed by the converter. switch (input->type) { case kTfLiteFloat32: AverageEvalFloat(context, node, params, &data, input, output); break; case kTfLiteUInt8: - AverageEvalUint8(context, node, params, &data, input, output); - break; case kTfLiteInt8: - return AverageEvalInt8(context, node, params, &data, input, output); + AverageEvalQuantized(context, node, params, &data, input, output); break; default: TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported", @@ -287,7 +309,8 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { MaxEvalFloat(context, node, params, &data, input, output); break; case kTfLiteUInt8: - MaxEvalQuantizedUInt8(context, node, params, &data, input, output); + case kTfLiteInt8: + MaxEvalQuantized(context, node, params, &data, input, output); break; default: TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", From 8ed89130aa4c3da812790a73dae465881428863f Mon Sep 17 00:00:00 2001 From: Daria Zhuravleva Date: Wed, 15 Apr 2020 15:10:52 +0300 Subject: [PATCH 041/557] Refactoring --- tensorflow/lite/micro/kernels/arc_mli/pooling.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index 7f87d4849ff..7b68e314277 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/internal/reference/pooling.h" +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" @@ -24,7 +25,6 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" -#include "mli_api.h" namespace tflite { namespace ops { @@ -40,7 +40,7 @@ struct OpData { TfLitePaddingValues padding; }; -typedef enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 } MliPoolingType; +enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 }; TfLiteStatus CalculateOpData(const TfLiteContext* context, const TfLitePoolParams* params, @@ -111,9 +111,15 @@ TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, } mli_point_to_subtsr_cfg subtsr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; + .start_coord = {0, 0}, + .coord_num = 2, + .first_out_dim_size = static_cast(mli_in.shape[1]), + }; mli_point_to_subtsr_cfg subtsr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; + .start_coord = {0, 0}, + .coord_num = 2, + .first_out_dim_size = static_cast(mli_out.shape[1]), + }; mli_tensor sub_mli_in = {0}; mli_tensor sub_mli_out = {0}; mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); From 51522a108d0ee14a665752f3f65e534235925a41 Mon Sep 17 00:00:00 2001 From: Daria Zhuravleva Date: Wed, 15 Apr 2020 21:46:00 +0300 Subject: [PATCH 042/557] Removed sub_tensors --- .../lite/micro/kernels/arc_mli/pooling.cc | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index 7b68e314277..2c3875b58eb 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -109,22 +109,7 @@ TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, cfg.padding_top = data->padding.height; cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - - mli_point_to_subtsr_cfg subtsr_cfg_in = { - .start_coord = {0, 0}, - .coord_num = 2, - .first_out_dim_size = static_cast(mli_in.shape[1]), - }; - mli_point_to_subtsr_cfg subtsr_cfg_out = { - .start_coord = {0, 0}, - .coord_num = 2, - .first_out_dim_size = static_cast(mli_out.shape[1]), - }; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - + const int height_dimension = 1; int in_slice_height = 0; int out_slice_height = 0; @@ -132,14 +117,14 @@ TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, // Tensors for data in fast (local) memory and config to copy data from // external to local memory - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors( context, &in_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + bool in_is_local = in_local.data == mli_in.data; + bool out_is_local = out_local.data == mli_out.data; TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( &in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, From 99d489c7efa85b121b99393a53c3c07ac356c641 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 20 Apr 2020 17:09:56 +0300 Subject: [PATCH 043/557] Option to remove kernels implementation beside ARC MLI --- tensorflow/lite/micro/kernels/arc_mli/conv.cc | 104 ++++--- .../micro/kernels/arc_mli/depthwise_conv.cc | 108 +++++-- .../micro/kernels/arc_mli/fully_connected.cc | 290 ++++++++++-------- .../lite/micro/kernels/arc_mli/pooling.cc | 172 +++++++---- .../micro/tools/make/ext_libs/arc_mli.inc | 8 + 5 files changed, 427 insertions(+), 255 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv.cc b/tensorflow/lite/micro/kernels/arc_mli/conv.cc index b9be93ceb11..4a2676821d9 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2019-2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -44,8 +44,6 @@ constexpr int kMaxChannels = 256; // https://www.tensorflow.org/lite/performance/quantization_spec constexpr int kConvQuantizedDimension = 0; -// This file has 2 implementation of Conv. - struct OpData { TfLitePaddingValues padding; // The scaling factor from input to output (aka the 'real multiplier') can @@ -76,11 +74,31 @@ inline PaddingType RuntimePaddingType(TfLitePadding padding) { } } + +bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, + const TfLiteTensor* filter, const TfLiteTensor* bias, + const TfLiteConvParams* params) { + const auto* affine_quantization = + reinterpret_cast(filter->quantization.params); + // MLI optimized version only supports int8 dataype, dilation factor of 1 and + // per-axis quantization of weights (no broadcasting/per-tensor) + bool ret_val = (filter->type == kTfLiteInt8) && + (input->type == kTfLiteInt8) && + (bias->type == kTfLiteInt32) && + (params->dilation_width_factor == 1) && + (params->dilation_height_factor == 1) && + (affine_quantization->scale->size == + filter->dims->data[kConvQuantizedDimension]) && + affine_quantization->scale->size <= (kMaxChannels * 2); + return ret_val; +} + + TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params, int width, int height, int filter_width, int filter_height, int out_width, int out_height, const TfLiteType data_type, - OpData* data) { + bool mli_is_applicable, OpData* data) { bool has_bias = node->inputs->size == 3; // Check number of inputs/outputs TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2); @@ -95,7 +113,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, // Note that quantized inference requires that all tensors have their // parameters set. This is usually done during quantized training. - if (data_type != kTfLiteFloat32) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) + if (data_type != kTfLiteFloat32 && !mli_is_applicable) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); const TfLiteTensor* bias = @@ -111,14 +130,16 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, reinterpret_cast(data->per_channel_output_shift), output_channels)); } +#endif return kTfLiteOk; } -void EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* im2col, - TfLiteTensor* hwcn_weights, TfLiteTensor* output) { +TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteConvParams* params, OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* im2col, + TfLiteTensor* hwcn_weights, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) const int32_t input_offset = -input->params.zero_point; const int32_t filter_offset = -filter->params.zero_point; const int32_t output_offset = output->params.zero_point; @@ -144,6 +165,12 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output), GetTensorShape(im2col), GetTensorData(im2col), nullptr); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus EvalMliQuantizedPerChannel( @@ -209,14 +236,13 @@ TfLiteStatus EvalMliQuantizedPerChannel( const int overlap = kernel_height - cfg.stride_height; // for weight slicing (on output channels) - const int weight_out_ch_dimension = - 0; // NHWC layout for weigths, output channel dimension is the first - // dimension. + // NHWC layout for weigths, output channel dimension is the first dimension. + const int weight_out_ch_dimension = 0; int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); - const int out_tensor_ch_dimension = - 3; // Batch-Height-Width-Channel layout means last dimension is output - // channels. + // Batch-Height-Width-Channel layout means last dimension is output channels. + const int out_tensor_ch_dimension = 3; + // Tensors for data in fast (local) memory and config to copy data from // external to local memory @@ -304,7 +330,6 @@ TfLiteStatus EvalMliQuantizedPerChannel( TF_LITE_ENSURE(context, in_slice.Done()); } } - return kTfLiteOk; } @@ -314,6 +339,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) ConvParams op_params; op_params.input_offset = -input->params.zero_point; op_params.output_offset = output->params.zero_point; @@ -333,15 +359,20 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Node configuration is not supported by ARC MLI Library."); + return kTfLiteError; +#endif } -void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* im2col, - TfLiteTensor* hwcn_weights, TfLiteTensor* output) { +TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, + TfLiteConvParams* params, OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* im2col, + TfLiteTensor* hwcn_weights, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float output_activation_min, output_activation_max; CalculateActivationRange(params->activation, &output_activation_min, &output_activation_max); @@ -363,6 +394,12 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output), GetTensorShape(im2col), GetTensorData(im2col)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { @@ -383,7 +420,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpData data; // All per-channel quantized tensors need valid zero point and scale arrays. - bool mli_is_applicable = false; if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, filter->quantization.type, kTfLiteAffineQuantization); @@ -401,26 +437,22 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { filter->dims->data[kConvQuantizedDimension]); TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, affine_quantization->zero_point->size); - mli_is_applicable = - ((filter->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && - (params->dilation_width_factor == 1) && - (params->dilation_height_factor == 1) && - (affine_quantization->scale->size == - filter->dims->data[kConvQuantizedDimension])); } + bool mli_is_applicable = IsMliApplicable(context, input, filter, bias, params); + TF_LITE_ENSURE_STATUS( + CalculateOpData(context, node, params, input_width, input_height, + filter_width, filter_height, output_width, output_height, + input->type, mli_is_applicable, &data)); - TF_LITE_ENSURE_STATUS(CalculateOpData( - context, node, params, input_width, input_height, filter_width, - filter_height, output_width, output_height, input->type, &data)); switch (input->type) { // Already know in/out types are same. case kTfLiteFloat32: - EvalFloat(context, node, params, &data, input, filter, bias, nullptr, + return EvalFloat(context, node, params, &data, input, filter, bias, nullptr, nullptr, output); break; case kTfLiteInt8: if (mli_is_applicable) { return EvalMliQuantizedPerChannel(context, node, params, &data, input, - filter, bias, output); + filter, bias, output); } else { return EvalQuantizedPerChannel(context, node, params, &data, input, @@ -428,7 +460,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } break; case kTfLiteUInt8: - EvalQuantized(context, node, params, &data, input, filter, bias, nullptr, + return EvalQuantized(context, node, params, &data, input, filter, bias, nullptr, nullptr, output); break; default: diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 9860235b2fb..081a40b23b5 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2017-2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -64,10 +64,30 @@ struct OpData { int32_t output_activation_max; }; +bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, + const TfLiteTensor* filter, const TfLiteTensor* bias, + const TfLiteDepthwiseConvParams* params) { + const auto* affine_quantization = + reinterpret_cast(filter->quantization.params); + // MLI optimized version only supports int8 dataype, dilation factor of 1 and + // per-axis quantization of weights (no broadcasting/per-tensor) + bool ret_val = (filter->type == kTfLiteInt8) && + (input->type == kTfLiteInt8) && + (bias->type == kTfLiteInt32) && + (params->dilation_width_factor == 1) && + (params->dilation_height_factor == 1) && + (affine_quantization->scale->size == + filter->dims->data[kDepthwiseConvQuantizedDimension]) && + affine_quantization->scale->size <= (kMaxChannels * 2); + return ret_val; +} + + TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params, int width, int height, int filter_width, int filter_height, - const TfLiteType data_type, OpData* data) { + const TfLiteType data_type, bool mli_is_applicable, + OpData* data) { bool has_bias = node->inputs->size == 3; // Check number of inputs/outputs TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2); @@ -81,7 +101,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, // Note that quantized inference requires that all tensors have their // parameters set. This is usually done during quantized training. - if (data_type != kTfLiteFloat32) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) + if (data_type != kTfLiteFloat32 && !mli_is_applicable) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); const TfLiteTensor* bias = @@ -106,15 +127,17 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, data->per_channel_output_multiplier, reinterpret_cast(data->per_channel_output_shift), num_channels)); } +#endif return kTfLiteOk; } } // namespace -void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output) { +TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float output_activation_min, output_activation_max; CalculateActivationRange(params->activation, &output_activation_min, &output_activation_max); @@ -137,6 +160,12 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, @@ -145,7 +174,6 @@ TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { // Run Depthwise Conv MLI kernel - // MLI optimized version only supports int8 dataype and dilation factor of 1 mli_tensor mli_in = {0}; mli_tensor mli_weights = {0}; mli_tensor mli_bias = {0}; @@ -200,18 +228,23 @@ TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node const int overlap = kernelHeight - cfg.stride_height; // for weight slicing (on output channels) - const int weight_out_ch_dimension = 3; // HWCN layout for weigths, output channel dimension is the first dimension. - const int bias_out_ch_dimension = 0; // bias has only 1 dimension - const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + // HWCN layout for weigths, output channel dimension is the first dimension. + const int weight_out_ch_dimension = 3; + // bias has only 1 dimension + const int bias_out_ch_dimension = 0; + // Batch-Height-Width-Channel layout means last dimension is output channels. + const int out_tensor_ch_dimension = 3; const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension]; const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension]; int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); - // Tensors for data in fast (local) memory and config to copy data from external to local memory + // Tensors for data in fast (local) memory + // and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; mli_tensor in_local = mli_in; - mli_tensor out_local = mli_out; // this assumes that output shape is already filled in the tensor struct. + mli_tensor out_local = mli_out; // this assumes that output shape + // is already filled in the tensor struct. mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); @@ -238,10 +271,13 @@ TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node slice_channels = (slice_channels / in_channels) * in_channels; } - TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0, 0, 0, true); TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels); - TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); - TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, + 0, 0, 0, true); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, + 0, 0, 0, true); + TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, + 0, 0, 0, true); mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; @@ -266,7 +302,8 @@ TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node the sliceHeight has been calculated. The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) in chunks of 'sliceHeight' */ - TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, padding_top, padding_bottom, overlap); + TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, + padding_top, padding_bottom, overlap); /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of output channels of this @@ -312,6 +349,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) DepthwiseParams op_params; op_params.padding_type = PaddingType::kSame; op_params.padding_values.width = data->padding.width; @@ -335,12 +373,18 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Node configuration is not supported by ARC MLI Library."); + return kTfLiteError; +#endif } -void EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output) { +TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) const int32_t input_offset = -input->params.zero_point; const int32_t filter_offset = -filter->params.zero_point; const int32_t output_offset = output->params.zero_point; @@ -369,6 +413,12 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { @@ -390,7 +440,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpData data; // All per-channel quantized tensors need valid zero point and scale arrays. - bool mli_is_applicable = false; if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, filter->quantization.type, kTfLiteAffineQuantization); @@ -407,20 +456,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { filter->dims->data[kDepthwiseConvQuantizedDimension]); TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, affine_quantization->zero_point->size); - mli_is_applicable = - ((filter->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && - (params->dilation_width_factor == 1) && - (params->dilation_height_factor == 1) && - (affine_quantization->scale->size == - filter->dims->data[kDepthwiseConvQuantizedDimension])); } + bool mli_is_applicable = IsMliApplicable(context, input, filter, bias, params); TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height, filter_width, filter_height, data_type, - &data)); + mli_is_applicable, &data)); switch (input->type) { // Already know in/out types are same. case kTfLiteFloat32: - EvalFloat(context, node, params, &data, input, filter, bias, output); + return EvalFloat(context, node, params, &data, input, filter, bias, + output); break; case kTfLiteInt8: if (mli_is_applicable) { @@ -432,7 +477,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } break; case kTfLiteUInt8: - EvalQuantized(context, node, params, &data, input, filter, bias, output); + return EvalQuantized(context, node, params, &data, input, filter, bias, + output); break; default: TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 185217d0c6a..70d1fda4c2b 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2017-2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/fully_connected.h" -#include "mli_api.h" +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -23,10 +23,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" namespace tflite { namespace ops { @@ -52,6 +52,18 @@ constexpr int kWeightsTensor = 1; constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; +bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, + const TfLiteTensor* filter, const TfLiteTensor* bias, + const TfLiteFullyConnectedParams* params) { + // MLI optimized version only supports int8 dataype and no fused Relu and + // symmetric per-tensor quantization of weights (not per-axis) + bool ret_val = (filter->type == kTfLiteInt8) && + (input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && + (params->activation == kTfLiteActNone) && + (filter->params.zero_point == 0); + return ret_val; +} + TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteFullyConnectedParams* params, TfLiteType data_type, const TfLiteTensor* input, @@ -59,7 +71,9 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, const TfLiteTensor* bias, TfLiteTensor* output, OpData* data) { TfLiteStatus status = kTfLiteOk; - if (data_type != kTfLiteFloat32) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) + if (data_type != kTfLiteFloat32 && + !IsMliApplicable(context, input, filter, bias, params)) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); @@ -70,6 +84,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, context, params->activation, output, &data->output_activation_min, &data->output_activation_max)); } +#endif return status; } @@ -95,6 +110,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TF_LITE_ENSURE(context, data != nullptr); TF_LITE_ENSURE_EQ(context, input->type, output->type); TF_LITE_ENSURE_MSG(context, input->type == filter->type, "Hybrid models are not supported on TFLite Micro."); @@ -106,122 +122,135 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node, + TfLiteFullyConnectedParams* params, + OpData* data, const TfLiteTensor* input, + const TfLiteTensor* filter, + const TfLiteTensor* bias, + TfLiteTensor* output) { + mli_tensor mli_in = {0}; + mli_tensor mli_weights = {0}; + mli_tensor mli_bias = {0}; + mli_tensor mli_out = {0}; + + ConvertToMliTensor(input, &mli_in); + ConvertToMliTensor(filter, &mli_weights); + ConvertToMliTensor(bias, &mli_bias); + ConvertToMliTensor(output, &mli_out); + + /* The input tensor can have more than 2 dimensions. for the compute this + doesn't make any difference because all the inputs or a batch entry will + be used anyway. because the MLI kernel doesn't recognize the multiple + dimensions, the tensor shape is casted to a {batchnum, inputsize} shape. */ + mli_in.shape[0] = mli_out.shape[0]; + mli_in.shape[1] = mli_weights.shape[1]; + mli_in.shape[2] = 0; + mli_in.shape[3] = 0; + mli_in.rank = 2; + + // Tensors for data in fast (local) memory and config to copy data from + // external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + const int weight_out_dimension = 0; + const int out_tensor_dimension = 1; + const int batch_dimension = 0; + int slice_size = mli_weights.shape[weight_out_dimension]; + + /* allocate the local buffers, and compute the slice size */ + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors( + context, &in_local, &weights_local, &bias_local, &out_local)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights( + &weights_local, &bias_local, weight_out_dimension, &slice_size)); + int max_out_slice_size = + out_local.capacity / mli_hlp_tensor_element_size(&out_local); + if (slice_size > max_out_slice_size) slice_size = max_out_slice_size; + + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; + + TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size); + TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size); + TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0, + true); + + mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + + void* input_buffer_ptr = NULL; + + while (!w_slice.Done()) { + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); + + TensorSlicer in_slice(&mli_in, batch_dimension, 1); + + /* output tensor is alreade sliced in the output size dimension. + out_ch_slice.Sub() is the tensor for the amount of output size of this + itteration of the weight slice loop. This tensor needs to be further + sliced over the batch */ + TensorSlicer out_slice(out_ch_slice.Sub(), batch_dimension, 1); + + /* setup the pointers to the local or remote tensor to make the code + * inside the loop easier. */ + mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + // if same input copy as previous iteration, skip the copy of input + if (in_slice.Sub()->data != input_buffer_ptr) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + input_buffer_ptr = in_slice.Sub()->data; + } + mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); + } + return kTfLiteOk; +} + TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, TfLiteFullyConnectedParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { - // Run Fully Connected MLI kernel - // MLI optimized version only supports int8 dataype and no fused Relu - // TODO: subject to add mli_saturate kernel - // work around for issue #35318, mli fully connect kernel only supports - // zeropoint == 0 for weights. this check can be removed once issue #35318 is - // resolved. - if ((filter->params.zero_point == 0) && - (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone)) { - mli_tensor mli_in = {0}; - mli_tensor mli_weights = {0}; - mli_tensor mli_bias = {0}; - mli_tensor mli_out = {0}; +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) + FullyConnectedParams op_params; + op_params.input_offset = -input->params.zero_point; + op_params.weights_offset = -filter->params.zero_point; + op_params.output_offset = output->params.zero_point; + op_params.output_multiplier = data->output_multiplier; + // TODO(b/138810107): Figure out whether output shift should be inverted + op_params.output_shift = -data->output_shift; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; - ConvertToMliTensor(input, &mli_in); - ConvertToMliTensor(filter, &mli_weights); - ConvertToMliTensor(bias, &mli_bias); - ConvertToMliTensor(output, &mli_out); - - /* The input tensor can have more than 2 dimensions. for the compute this doesn't make any difference - because all the inputs or a batch entry will be used anyway. because the MLI kernel doesn't recognize - the multiple dimensions, the tensor shape is casted to a {batchnum, inputsize} shape. */ - mli_in.shape[0] = mli_out.shape[0]; - mli_in.shape[1] = mli_weights.shape[1]; - mli_in.shape[2] = 0; - mli_in.shape[3] = 0; - mli_in.rank = 2; - - // Tensors for data in fast (local) memory and config to copy data from external to local memory - mli_tensor weights_local = mli_weights; - mli_tensor bias_local = mli_bias; - mli_tensor in_local = mli_in; - mli_tensor out_local = mli_out; - mli_mov_cfg_t copy_config; - mli_mov_cfg_for_copy(©_config); - const int weight_out_dimension = 0; - const int out_tensor_dimension = 1; - const int batch_dimension = 0; - int slice_size = mli_weights.shape[weight_out_dimension]; - - /* allocate the local buffers, and compute the slice size */ - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_dimension, &slice_size)); - int max_out_slice_size = out_local.capacity / mli_hlp_tensor_element_size(&out_local); - if (slice_size > max_out_slice_size) slice_size = max_out_slice_size; - - /* is_local indicates that the tensor is already in local memory, - so in that case the original tensor can be used, - and there is no need to copy it to the local tensor*/ - const bool in_is_local = in_local.data == mli_in.data; - const bool out_is_local = out_local.data == mli_out.data; - const bool w_is_local = weights_local.data == mli_weights.data; - const bool b_is_local = bias_local.data == mli_bias.data; - - TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size); - TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size); - TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0, true); - - mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; - mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - - void *input_buffer_ptr = NULL; - - while (!w_slice.Done()){ - mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); - mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - - TensorSlicer in_slice(&mli_in, batch_dimension, 1); - - /* output tensor is alreade sliced in the output size dimension. out_ch_slice.Sub() is the tensor for the amount of - output size of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch */ - TensorSlicer out_slice(out_ch_slice.Sub(), batch_dimension, 1); - - /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; - - while (!out_slice.Done()) { - - // if same input copy as previous iteration, skip the copy of input - if (in_slice.Sub()->data != input_buffer_ptr) { - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - input_buffer_ptr = in_slice.Sub()->data; - } - mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); - - in_slice.Next(); - out_slice.Next(); - } - w_slice.Next(); - b_slice.Next(); - out_ch_slice.Next(); - } - } else { - FullyConnectedParams op_params; - op_params.input_offset = -input->params.zero_point; - op_params.weights_offset = -filter->params.zero_point; - op_params.output_offset = output->params.zero_point; - op_params.output_multiplier = data->output_multiplier; - // TODO(b/138810107): Figure out whether output shift should be inverted - op_params.output_shift = -data->output_shift; - op_params.quantized_activation_min = data->output_activation_min; - op_params.quantized_activation_max = data->output_activation_max; - - reference_integer_ops::FullyConnected( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output)); - } + reference_integer_ops::FullyConnected( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(filter), GetTensorData(filter), + GetTensorShape(bias), GetTensorData(bias), + GetTensorShape(output), GetTensorData(output)); return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Node configuration is not supported by ARC MLI Library."); + return kTfLiteError; +#endif } TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, @@ -229,6 +258,7 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) const int32_t input_offset = -input->params.zero_point; const int32_t filter_offset = -filter->params.zero_point; const int32_t output_offset = output->params.zero_point; @@ -261,14 +291,20 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, TfLiteTypeGetName(output->type), output->type); return kTfLiteError; } - return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, TfLiteFullyConnectedParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float output_activation_min, output_activation_max; CalculateActivationRange(params->activation, &output_activation_min, &output_activation_max); @@ -281,6 +317,12 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { @@ -293,6 +335,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); OpData* data = reinterpret_cast(node->user_data); + TF_LITE_ENSURE(context, data != nullptr); // Checks in Prepare ensure input, output and filter types are all the same. switch (input->type) { @@ -300,12 +343,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return EvalFloat(context, node, params, data, input, filter, bias, output); case kTfLiteInt8: - return EvalQuantizedInt8(context, node, params, data, input, filter, bias, - output); + if (IsMliApplicable(context, input, filter, bias, params)) { + return EvalMliQuantizedInt8(context, node, params, data, input, filter, + bias, output); + } else { + return EvalQuantizedInt8(context, node, params, data, input, filter, + bias, output); + } - case kTfLiteUInt8: - return EvalQuantized(context, node, params, data, input, filter, bias, - output); + case kTfLiteUInt8: + return EvalQuantized(context, node, params, data, input, filter, bias, + output); default: TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index 2c3875b58eb..79deacc23d9 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2019-2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -42,6 +42,15 @@ struct OpData { enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 }; + +bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, + const TfLitePoolParams* params) { + // MLI optimized version only supports int8 dataype and no fused Relu + // TODO: subject to add mli_saturate kernel + return (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone); +} + + TfLiteStatus CalculateOpData(const TfLiteContext* context, const TfLitePoolParams* params, const TfLiteTensor* input, @@ -61,9 +70,11 @@ TfLiteStatus CalculateOpData(const TfLiteContext* context, return kTfLiteOk; } -void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +TfLiteStatus AverageEvalFloat(TfLiteContext* context, + const TfLiteNode* node, + const TfLitePoolParams* params, const OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float activation_min, activation_max; CalculateActivationRange(params->activation, &activation_min, &activation_max); @@ -80,6 +91,13 @@ void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node, reference_ops::AveragePool( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } //Prepare MLI tensors and run Average or Max Pooling @@ -164,45 +182,49 @@ TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, return kTfLiteOk; } -void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +TfLiteStatus AverageEvalQuantized(TfLiteContext* context, + const TfLiteNode* node, + const TfLitePoolParams* params, + const OpData* data, const TfLiteTensor* input, + TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); - // Run Average Pooling MLI kernel - // MLI optimized version only supports int8 dataype and no fused Relu - // TODO: subject to add mli_saturate kernel - if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) { - EvalMli(context, params, data, input, output, AveragePooling); + int32_t activation_min, activation_max; + (void)CalculateActivationRangeQuantized(context, params->activation, output, + &activation_min, &activation_max); + PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = activation_min; + op_params.quantized_activation_max = activation_max; + + if (input->type == kTfLiteUInt8) { + reference_ops::AveragePool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); } else { - int32_t activation_min, activation_max; - (void)CalculateActivationRangeQuantized(context, params->activation, output, - &activation_min, &activation_max); - - PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; - - if (input->type == kTfLiteUInt8) { - reference_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - } else { - reference_integer_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - } + reference_integer_ops::AveragePool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); } + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG( + context, + "Node configuration or type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } -void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLitePoolParams* params, OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, + TfLitePoolParams* params, OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float activation_min, activation_max; CalculateActivationRange(params->activation, &activation_min, &activation_max); @@ -219,43 +241,50 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, reference_ops::MaxPool(op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } -void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLitePoolParams* params, OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +TfLiteStatus MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLitePoolParams* params, OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); - - // Run Max Pooling MLI kernel - // MLI optimized version only supports int8 dataype and no fused Relu - // TODO: subject to add mli_saturate kernel - if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) { - EvalMli(context, params, data, input, output, MaxPooling); - } else { - int32_t activation_min, activation_max; - (void)CalculateActivationRangeQuantized(context, params->activation, output, - &activation_min, &activation_max); + int32_t activation_min, activation_max; + (void)CalculateActivationRangeQuantized(context, params->activation, output, + &activation_min, &activation_max); - tflite::PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; + tflite::PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = activation_min; + op_params.quantized_activation_max = activation_max; - if (input->type == kTfLiteUInt8) { + if (input->type == kTfLiteUInt8) { reference_ops::MaxPool( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - } else { + } else { reference_integer_ops::MaxPool( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - } } + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Node configuration or type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } } // namespace @@ -272,11 +301,16 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { // Inputs and outputs share the same type, guaranteed by the converter. switch (input->type) { case kTfLiteFloat32: - AverageEvalFloat(context, node, params, &data, input, output); + return AverageEvalFloat(context, node, params, &data, input, output); break; case kTfLiteUInt8: case kTfLiteInt8: - AverageEvalQuantized(context, node, params, &data, input, output); + if (IsMliApplicable(context, input, params)) { + return EvalMli(context, params, &data, input, output, AveragePooling); + } else { + return AverageEvalQuantized(context, node, params, &data, input, + output); + } break; default: TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported", @@ -297,11 +331,15 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { switch (input->type) { case kTfLiteFloat32: - MaxEvalFloat(context, node, params, &data, input, output); + return MaxEvalFloat(context, node, params, &data, input, output); break; case kTfLiteUInt8: case kTfLiteInt8: - MaxEvalQuantized(context, node, params, &data, input, output); + if (IsMliApplicable(context, input, params)) { + return EvalMli(context, params, &data, input, output, MaxPooling); + } else { + return MaxEvalQuantized(context, node, params, &data, input, output); + } break; default: TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", diff --git a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc index 3b8fa04d536..ee3cc8113c1 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc @@ -86,6 +86,14 @@ endif ARC_MLI_TESTS += $(foreach TEST,$(ARC_MLI_TESTS), $(TEST)_slicing) generate_arc_mli_test_projects: $(foreach TEST,$(ARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project) + + ARC_EXTRA_APP_SETTINGS += \ + \nMLI_ONLY ?= false\n\ + \nifeq \($(DLR)\(MLI_ONLY\), true\)\ + \nCCFLAGS += -DTF_LITE_STRIP_REFERENCE_IMPL\ + \nCXXFLAGS += -DTF_LITE_STRIP_REFERENCE_IMPL\ + \nendif\n + endif # no_embarc_mli From 2621bf4ee40a7d14db48b63ead3fca2589552670 Mon Sep 17 00:00:00 2001 From: naumkin Date: Sun, 26 Apr 2020 23:49:42 -0700 Subject: [PATCH 044/557] Data movement tests added --- .../kernels/arc_mli/conv_slicing_test.cc | 784 +++++------- .../arc_mli/depthwise_conv_slicing_test.cc | 836 +++++------- .../arc_mli/fully_connected_slicing_test.cc | 1074 ++++------------ .../kernels/arc_mli/pooling_slicing_test.cc | 1140 ++++------------- 4 files changed, 1167 insertions(+), 2667 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc index a1f155ecc56..27e30856f6c 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc @@ -24,25 +24,114 @@ namespace tflite { namespace testing { namespace { -// Common inputs and outputs. -static const int kInputElements = 16; -static const int kInputShape[] = {4, 2, 2, 4, 1}; -static const float kInputData[] = {1, 1, 1, 1, 2, 2, 2, 2, - 1, 2, 3, 4, 1, 2, 3, 4}; -static const int kFilterElements = 12; -static const int kFilterShape[] = {4, 3, 2, 2, 1}; -static const float kFilterData[] = {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1}; -static const int kBiasElements = 3; -static const int kBiasShape[] = {1, 3}; -static const float kBiasData[] = {1, 2, 3}; -static const int kOutputElements = 12; -static const int kOutputShape[] = {4, 2, 1, 2, 3}; -static const float kGoldenData[] = {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3}; +// Common inputs and outputs 1. +static const int kInput1Elements = 20; +static const int kInput1Shape[] = {4, 1, 5, 2, 2}; +static const float kInput1Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; +static const int kFilter1Elements = 36; +static const int kFilter1Shape[] = {4, 2, 3, 3, 2}; +static const float kFilter1Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; +static const int kBias1Elements = 2; +static const int kBias1Shape[] = {1, 2}; +static const float kBias1Data[] = {2, 2}; +static const int kOutput1Elements = 20; +static const int kOutput1Shape[] = {4, 1, 5, 2, 2}; +static const float kGolden1Data[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + +// Common inputs and outputs 2. +static const int kInput2Elements = 80; +static const int kInput2Shape[] = {4, 1, 20, 2, 2}; +static const float kInput2Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; +static const int kFilter2Elements = 36; +static const int kFilter2Shape[] = {4, 2, 3, 3, 2}; +static const float kFilter2Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; +static const int kBias2Elements = 2; +static const int kBias2Shape[] = {1, 2}; +static const float kBias2Data[] = {2, 2}; +static const int kOutput2Elements = 80; +static const int kOutput2Shape[] = {4, 1, 20, 2, 2}; +static const float kGolden2Data[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + +// Common inputs and outputs 3. +static const int kInput3Elements = 40; +static const int kInput3Shape[] = {4, 1, 2, 2, 10}; +static const float kInput3Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +static const int kFilter3Elements = 90; +static const int kFilter3Shape[] = {4, 1, 3, 3, 10}; // 1 3 3 10 +static const float kFilter3Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +static const int kBias3Elements = 1; +static const int kBias3Shape[] = {1, 1}; +static const float kBias3Data[] = {1}; +static const int kOutput3Elements = 4; +static const int kOutput3Shape[] = {4, 1, 2, 2, 1}; // 2 2 1 +static const float kGolden3Data[] = {41, 41, 41, 41}; + +// Common inputs and outputs 4. +static const int kInput4Elements = 80; +static const int kInput4Shape[] = {4, 1, 4, 2, 10}; +static const float kInput4Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +static const int kFilter4Elements = 90; +static const int kFilter4Shape[] = {4, 1, 3, 3, 10}; +static const float kFilter4Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +static const int kBias4Elements = 1; +static const int kBias4Shape[] = {1, 1}; +static const float kBias4Data[] = {1}; +static const int kOutput4Elements = 8; +static const int kOutput4Shape[] = {4, 1, 4, 2, 1}; +static const float kGolden4Data[] = {41, 41, 61, 61, 61, 61, 41, 41}; static TfLiteConvParams common_conv_params = { - kTfLitePaddingValid, // padding - 2, // stride_width - 2, // stride_height + kTfLitePaddingSame, // padding + 1, // stride_width + 1, // stride_height kTfLiteActNone, // activation 1, // dilation_width_factor 1, // dilation_height_factor @@ -109,77 +198,6 @@ TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, return kTfLiteOk; } -void TestConvFloat(const int* input_dims_data, const float* input_data, - const int* filter_dims_data, const float* filter_data, - const int* bias_dims_data, const float* bias_data, - const int* output_dims_data, - const float* expected_output_data, float* output_data, - TfLiteConvParams* conv_params) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(filter_data, filter_dims, "filter_tensor"), - CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, - ValidateConvGoldens(tensors, tensors_size, expected_output_data, - output_data, output_dims_count, conv_params)); -} - -void TestConvQuantizedPerLayer( - const int* input_dims_data, const float* input_data, - uint8_t* input_quantized, float input_scale, const int* filter_dims_data, - const float* filter_data, uint8_t* filter_quantized, float filter_scale, - const int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, - const int* output_dims_data, const float* expected_output_data, - uint8_t* expected_output_quantized, uint8_t* output_data, - float output_scale, TfLiteConvParams* conv_params) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - tflite::AsymmetricQuantize(expected_output_data, expected_output_quantized, - output_dims_count, output_scale, 128); - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateQuantizedTensor(input_data, input_quantized, input_dims, - input_scale, 128, "input_tensor"), - CreateQuantizedTensor(filter_data, filter_quantized, filter_dims, - filter_scale, 128, "filter_tensor"), - CreateQuantizedBiasTensor(bias_data, bias_quantized, bias_dims, - input_scale, filter_scale, "bias_tensor"), - CreateQuantizedTensor(output_data, output_dims, output_scale, 128, - "output_tensor")}; - - // TODO(njeff): Affine Quantization Params should be set on tensor creation. - float filter_scales[] = {1, filter_scale}; - int filter_zero_points[] = {1, 128}; - TfLiteAffineQuantization filter_quant = { - FloatArrayFromFloats(filter_scales), - IntArrayFromInts(filter_zero_points)}; - tensors[1].quantization = {kTfLiteAffineQuantization, &filter_quant}; - - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, - ValidateConvGoldens(tensors, tensors_size, expected_output_quantized, - output_data, output_dims_count, conv_params)); -} - void TestConvQuantizedPerChannel( const int* input_dims_data, const float* input_data, int8_t* input_quantized, float input_scale, int input_zero_point, @@ -207,6 +225,20 @@ void TestConvQuantizedPerChannel( filter_data, filter_data_quantized, filter_dims, filter_scales, filter_zero_points, &filter_quant, 0 /* quantized dimension */, "filter_tensor"); + + // DN: to replace scales and quantized data to avoid second quantization + int channel_count = filter_dims->data[0]; + float true_filter_scales[5] = {1.0, 1.0, 1.0, 1.0, 1.0}; + true_filter_scales[0] = static_cast(channel_count); + TfLiteAffineQuantization *to_change = (TfLiteAffineQuantization *)filter_tensor.quantization.params; + to_change->scale = FloatArrayFromFloats(true_filter_scales); + + int filter_size = filter_tensor.bytes; + for(int i = 0; i < filter_size; ++i) { + filter_tensor.data.int8[i] = filter_data[i]; + } + + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1], bias_scales, bias_zero_points, &bias_quant, 0 /* quantized dimension */, @@ -255,375 +287,223 @@ void TestConvQuantizedPerChannel( TF_LITE_MICRO_TESTS_BEGIN -TF_LITE_MICRO_TEST(SimpleTestFloat) { - float output_data[tflite::testing::kOutputElements]; - - tflite::testing::TestConvFloat( - tflite::testing::kInputShape, tflite::testing::kInputData, - tflite::testing::kFilterShape, tflite::testing::kFilterData, - tflite::testing::kBiasShape, tflite::testing::kBiasData, - tflite::testing::kOutputShape, tflite::testing::kGoldenData, output_data, - &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(InputAndFilterSameWidthHeight) { - const int output_dims_count = 2; - float output_data[output_dims_count]; - - const int kFilterShape[] = {4, 1, 2, 4, 1}; - const float filter_values[] = {1, 2, 3, 4, -1, -1, 1, 1}; - const int kBiasShape[] = {1, 1}; - const float bias_values[] = {0}; - const int kOutputShape[] = {4, 2, 1, 1, 1}; - const float expected_output[] = {10, 34}; - - tflite::testing::TestConvFloat( - tflite::testing::kInputShape, tflite::testing::kInputData, kFilterShape, - filter_values, kBiasShape, bias_values, kOutputShape, expected_output, - output_data, &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantized) { - const int output_dims_count = 12; - uint8_t output_data[output_dims_count]; - - const float input_scale = 0.5f; - const float filter_scale = 0.5f; - const float output_scale = 1.0f; - - uint8_t input_quantized[tflite::testing::kInputElements]; - uint8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - uint8_t golden_quantized[tflite::testing::kOutputElements]; - - tflite::testing::TestConvQuantizedPerLayer( - tflite::testing::kInputShape, tflite::testing::kInputData, - input_quantized, input_scale, tflite::testing::kFilterShape, - tflite::testing::kFilterData, filter_quantized, filter_scale, - tflite::testing::kBiasShape, tflite::testing::kBiasData, bias_quantized, - tflite::testing::kOutputShape, tflite::testing::kGoldenData, - golden_quantized, output_data, output_scale, - &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { - const int output_dims_count = 12; - int8_t output_data[output_dims_count]; - - const float input_scale = 0.5f; - const float output_scale = 1.0f; - const int input_zero_point = 0; - const int output_zero_point = 0; - - int8_t input_quantized[tflite::testing::kInputElements]; - int8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - int8_t golden_quantized[tflite::testing::kOutputElements]; - int zero_points[tflite::testing::kBiasElements + 1]; - float scales[tflite::testing::kBiasElements + 1]; - - tflite::testing::TestConvQuantizedPerChannel( - tflite::testing::kInputShape, tflite::testing::kInputData, - input_quantized, input_scale, input_zero_point, - tflite::testing::kFilterShape, tflite::testing::kFilterData, - filter_quantized, tflite::testing::kBiasShape, tflite::testing::kBiasData, - bias_quantized, scales, zero_points, tflite::testing::kOutputShape, - tflite::testing::kGoldenData, golden_quantized, output_data, output_scale, - output_zero_point, &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelRelu6) { - // conv params: - // padding, stride_, dilation_, activation - TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; - const int output_dims_count = 12; - int8_t output_data[output_dims_count]; - - const float bias_values[] = {1, 2, -3}; - const float golden_data[] = {6, 2, 0, 6, 2, 0, 6, 4, 0, 6, 4, 0}; - - const float input_scale = 0.023529f; - const float output_scale = 0.023529f; - const int input_zero_point = -128; - const int output_zero_point = -128; - - int8_t input_quantized[tflite::testing::kInputElements]; - int8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - int8_t golden_quantized[tflite::testing::kOutputElements]; - int zero_points[tflite::testing::kBiasElements + 1]; - float scales[tflite::testing::kBiasElements + 1]; - - tflite::testing::TestConvQuantizedPerChannel( - tflite::testing::kInputShape, tflite::testing::kInputData, - input_quantized, input_scale, input_zero_point, - tflite::testing::kFilterShape, tflite::testing::kFilterData, - filter_quantized, tflite::testing::kBiasShape, bias_values, - bias_quantized, scales, zero_points, tflite::testing::kOutputShape, - golden_data, golden_quantized, output_data, output_scale, - output_zero_point, &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) { - // conv params: - // padding, stride_, activation, dilation_ - TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, - kTfLiteActNone, 1, 1}; - const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] - const int kInputElements = - kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; - float kInputData[/* kInputElements */] = {1, 1, 1, 1, 2, 2, 2, 2, - 1, 2, 3, 4, 1, 2, 3, 4}; - const int kFilterShape[] = {4, 3, 1, 1, 4}; - const int kFilterElements = - kFilterShape[1] * kFilterShape[2] * kFilterShape[3] * kFilterShape[4]; - float kFilterData[/* kFilterElements */] = {1, 2, 3, 4, -1, 1, - -1, 1, -1, -1, 1, 1}; - const int kBiasElements = kFilterShape[1]; - const int kBiasShape[] = {1, kBiasElements}; - float kBiasData[/* kBiasElements */] = {1, 2, 3}; - const int kOutputShape[] = {4, 1, 2, 2, kBiasElements}; - const int kOutputElements = 4 * 3; - int8_t output_data[kOutputElements]; - const float kGoldenData[/* kOutputElements */] = {11, 2, 3, 21, 2, 3, - 31, 4, 7, 31, 4, 7}; - - const float input_scale = 0.5f; - const float output_scale = 1.0f; - const int input_zero_point = 0; - const int output_zero_point = 0; - - int8_t input_quantized[kInputElements]; - int8_t filter_quantized[kFilterElements]; - int32_t bias_quantized[kBiasElements]; - int8_t golden_quantized[kOutputElements]; - int zero_points[kBiasElements + 1]; - float scales[kBiasElements + 1]; - - tflite::testing::TestConvQuantizedPerChannel( - kInputShape, kInputData, input_quantized, input_scale, input_zero_point, - kFilterShape, kFilterData, filter_quantized, kBiasShape, kBiasData, - bias_quantized, scales, zero_points, kOutputShape, kGoldenData, - golden_quantized, output_data, output_scale, output_zero_point, - &conv_params); -} - -TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannelRelu6) { - // conv params: - // padding, stride_, dilation_, activation - TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; - const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] - const int kInputElements = - kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; - float kInputData[/* kInputElements */] = {1, 1, 1, 1, 2, 2, 2, 2, - 1, 2, 3, 4, 1, 2, 3, 4}; - const int kFilterShape[] = {4, 3, 1, 1, 4}; - const int kFilterElements = - kFilterShape[1] * kFilterShape[2] * kFilterShape[3] * kFilterShape[4]; - float kFilterData[/* kFilterElements */] = {1, 2, 3, 4, -1, 1, - -1, 1, -1, -1, 1, 1}; - const int kBiasElements = kFilterShape[1]; - const int kBiasShape[] = {1, kBiasElements}; - float kBiasData[/* kBiasElements */] = {1, 2, -3}; - const int kOutputShape[] = {4, 1, 2, 2, kBiasElements}; - const int kOutputElements = 4 * 3; - int8_t output_data[kOutputElements]; - const float kGoldenData[/* kOutputElements */] = {6, 2, 0, 6, 2, 0, - 6, 4, 1, 6, 4, 1}; - - const float input_scale = 0.023529f; - const float output_scale = 0.023529f; - const int input_zero_point = -128; - const int output_zero_point = -128; - - int8_t input_quantized[kInputElements]; - int8_t filter_quantized[kFilterElements]; - int32_t bias_quantized[kBiasElements]; - int8_t golden_quantized[kOutputElements]; - int zero_points[kBiasElements + 1]; - float scales[kBiasElements + 1]; - - tflite::testing::TestConvQuantizedPerChannel( - kInputShape, kInputData, input_quantized, input_scale, input_zero_point, - kFilterShape, kFilterData, filter_quantized, kBiasShape, kBiasData, - bias_quantized, scales, zero_points, kOutputShape, kGoldenData, - golden_quantized, output_data, output_scale, output_zero_point, - &conv_params); -} - -TF_LITE_MICRO_TEST(FilterDimsNotMatchingAffineQuantization) { - const int output_dims_count = 12; - int8_t output_data[output_dims_count]; - - const float input_scale = 0.5f; - const float output_scale = 1.0f; - - int8_t input_quantized[tflite::testing::kInputElements]; - int8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - int8_t golden_quantized[tflite::testing::kOutputElements]; - int zero_points[tflite::testing::kBiasElements + 1]; - float scales[tflite::testing::kBiasElements + 1]; - - TfLiteIntArray* input_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kInputShape); - TfLiteIntArray* filter_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kFilterShape); - TfLiteIntArray* bias_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kBiasShape); - TfLiteIntArray* output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShape); - - int filter_zero_points[5]; - float filter_scales[5]; - TfLiteAffineQuantization filter_quant; - TfLiteAffineQuantization bias_quant; - TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( - tflite::testing::kInputData, input_quantized, input_dims, input_scale, 0, - "input_tensor"); - TfLiteTensor filter_tensor = - tflite::testing::CreateSymmetricPerChannelQuantizedTensor( - tflite::testing::kFilterData, filter_quantized, filter_dims, - filter_scales, filter_zero_points, &filter_quant, - 0 /* quantized dimension */, "filter_tensor"); - TfLiteTensor bias_tensor = - tflite::testing::CreatePerChannelQuantizedBiasTensor( - tflite::testing::kBiasData, bias_quantized, bias_dims, input_scale, - &filter_scales[1], scales, zero_points, &bias_quant, 0, - "bias_tensor"); - TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( - output_data, output_dims, output_scale, 0 /* quantized dimension */, - "output_tensor"); - - float input_scales[] = {1, input_scale}; - int input_zero_points[] = {1, 128}; - TfLiteAffineQuantization input_quant = { - tflite::testing::FloatArrayFromFloats(input_scales), - tflite::testing::IntArrayFromInts(input_zero_points)}; - input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - input_tensor, - filter_tensor, - bias_tensor, - output_tensor, - }; - - tflite::AsymmetricQuantize(tflite::testing::kGoldenData, golden_quantized, - output_dims_count, output_scale, 0); - - // Set filter quant to mismatched dimension. - TfLiteAffineQuantization* quant = reinterpret_cast( - filter_tensor.quantization.params); - - // Choose arbitrary incorrect scale and zero point sizes which are neither 1 - // (for broadcast case) nor the quantized dimension size. - quant->scale->size = 2; - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, - tflite::testing::ValidateConvGoldens( - tensors, tensors_size, golden_quantized, output_data, - output_dims_count, &tflite::testing::common_conv_params)); - - // Set scale back to correct dimension, and make zero point array too short. - quant->scale->size = tflite::testing::kFilterShape[0]; - quant->zero_point->size = 2; - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, - tflite::testing::ValidateConvGoldens( - tensors, tensors_size, golden_quantized, output_data, - output_dims_count, &tflite::testing::common_conv_params)); -} - -TF_LITE_MICRO_TEST(BroadcastPerLayerQuantizationToPerChannelShouldMatchGolden) { - const int output_dims_count = 12; - int8_t output_data[output_dims_count]; - +// Test group 1 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel1) { + const int output_dims_count = 20; const float input_scale = 1.0f; - const float filter_scale = 1.0f; const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; - int8_t input_quantized[tflite::testing::kInputElements]; - int8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - int8_t golden_quantized[tflite::testing::kOutputElements]; + int8_t input_quantized[tflite::testing::kInput1Elements]; + int8_t filter_quantized[tflite::testing::kFilter1Elements]; + int32_t bias_quantized[tflite::testing::kBias1Elements]; + int8_t golden_quantized[tflite::testing::kOutput1Elements]; + int8_t output_data[output_dims_count]; - TfLiteIntArray* input_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kInputShape); - TfLiteIntArray* filter_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kFilterShape); - TfLiteIntArray* bias_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kBiasShape); - TfLiteIntArray* output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShape); + int zero_points[tflite::testing::kBias1Elements + 1]; + float scales[tflite::testing::kBias1Elements + 1]; - // Create per-layer quantized int8 input tensor. - TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( - tflite::testing::kInputData, input_quantized, input_dims, input_scale, 0, - "input_tensor"); - int input_zero_points[2] = {1, 0}; - float input_scales[2] = {1, input_scale}; - TfLiteAffineQuantization input_quant = { - tflite::testing::FloatArrayFromFloats(input_scales), - tflite::testing::IntArrayFromInts(input_zero_points)}; - input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; - - // Create per-layer quantized int8 filter tensor. - TfLiteTensor filter_tensor = tflite::testing::CreateQuantizedTensor( - tflite::testing::kFilterData, filter_quantized, filter_dims, filter_scale, - 0, "filter_tensor"); - int filter_zero_points[2] = {1, 0}; - float filter_scales[2] = {1, filter_scale}; - TfLiteAffineQuantization filter_quant = { - tflite::testing::FloatArrayFromFloats(filter_scales), - tflite::testing::IntArrayFromInts(filter_zero_points)}; - filter_tensor.quantization = {kTfLiteAffineQuantization, &filter_quant}; - - // Create per-layer quantized int32 bias tensor. - tflite::SymmetricQuantize(tflite::testing::kBiasData, bias_quantized, - tflite::testing::kBiasElements, - input_scale * output_scale); - TfLiteTensor bias_tensor = tflite::testing::CreateInt32Tensor( - bias_quantized, bias_dims, "bias_tensor"); - - int bias_zero_points[2] = {1, 0}; - float bias_scales[2] = {1, input_scale * filter_scale}; - TfLiteAffineQuantization bias_quant = { - tflite::testing::FloatArrayFromFloats(bias_scales), - tflite::testing::IntArrayFromInts(bias_zero_points)}; - bias_tensor.quantization = {kTfLiteAffineQuantization, &bias_quant}; - - // Create per-layer quantized int8 output tensor. - TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( - output_data, output_dims, output_scale, 0 /* quantized dimension */, - "output_tensor"); - int output_zero_points[2] = {1, 0}; - float output_scales[2] = {1, output_scale}; - TfLiteAffineQuantization output_quant = { - tflite::testing::FloatArrayFromFloats(output_scales), - tflite::testing::IntArrayFromInts(output_zero_points)}; - output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - input_tensor, - filter_tensor, - bias_tensor, - output_tensor, - }; - - tflite::AsymmetricQuantize(tflite::testing::kGoldenData, golden_quantized, - output_dims_count, output_scale, 0); - - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, tflite::testing::ValidateConvGoldens( - tensors, tensors_size, golden_quantized, output_data, - output_dims_count, &tflite::testing::common_conv_params)); + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput1Shape, tflite::testing::kInput1Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter1Shape, tflite::testing::kFilter1Data, + filter_quantized, tflite::testing::kBias1Shape, tflite::testing::kBias1Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput1Shape, + tflite::testing::kGolden1Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); } +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel1) { + const int output_dims_count = 20; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + +#pragma Bss(".Xdata") + static int8_t input_quantized[tflite::testing::kInput1Elements]; + static int8_t filter_quantized[tflite::testing::kFilter1Elements]; + static int32_t bias_quantized[tflite::testing::kBias1Elements]; + static int8_t output_data[output_dims_count]; +#pragma Bss() + + int8_t golden_quantized[tflite::testing::kOutput1Elements]; + int zero_points[tflite::testing::kBias1Elements + 1]; + float scales[tflite::testing::kBias1Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput1Shape, tflite::testing::kInput1Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter1Shape, tflite::testing::kFilter1Data, + filter_quantized, tflite::testing::kBias1Shape, tflite::testing::kBias1Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput1Shape, + tflite::testing::kGolden1Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +// Test group 2 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel2) { + const int output_dims_count = 80; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[tflite::testing::kInput2Elements]; + int8_t filter_quantized[tflite::testing::kFilter2Elements]; + int32_t bias_quantized[tflite::testing::kBias2Elements]; + int8_t golden_quantized[tflite::testing::kOutput2Elements]; + int8_t output_data[output_dims_count]; + + int zero_points[tflite::testing::kBias2Elements + 1]; + float scales[tflite::testing::kBias2Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput2Shape, tflite::testing::kInput2Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter2Shape, tflite::testing::kFilter2Data, + filter_quantized, tflite::testing::kBias2Shape, tflite::testing::kBias2Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput2Shape, + tflite::testing::kGolden2Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel2) { + const int output_dims_count = 80; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + +#pragma Bss(".Xdata") + static int8_t input_quantized[tflite::testing::kInput2Elements]; + static int8_t filter_quantized[tflite::testing::kFilter2Elements]; + static int32_t bias_quantized[tflite::testing::kBias2Elements]; + static int8_t output_data[output_dims_count]; +#pragma Bss() + + int8_t golden_quantized[tflite::testing::kOutput2Elements]; + int zero_points[tflite::testing::kBias2Elements + 1]; + float scales[tflite::testing::kBias2Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput2Shape, tflite::testing::kInput2Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter2Shape, tflite::testing::kFilter2Data, + filter_quantized, tflite::testing::kBias2Shape, tflite::testing::kBias2Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput2Shape, + tflite::testing::kGolden2Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +// Test group 3 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel3) { + const int output_dims_count = 4; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[tflite::testing::kInput3Elements]; + int8_t filter_quantized[tflite::testing::kFilter3Elements]; + int32_t bias_quantized[tflite::testing::kBias3Elements]; + int8_t golden_quantized[tflite::testing::kOutput3Elements]; + int8_t output_data[output_dims_count]; + + int zero_points[tflite::testing::kBias3Elements + 1]; + float scales[tflite::testing::kBias3Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput3Shape, tflite::testing::kInput3Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter3Shape, tflite::testing::kFilter3Data, + filter_quantized, tflite::testing::kBias3Shape, tflite::testing::kBias3Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput3Shape, + tflite::testing::kGolden3Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel3) { + const int output_dims_count = 4; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + +#pragma Bss(".Xdata") + static int8_t input_quantized[tflite::testing::kInput3Elements]; + static int8_t filter_quantized[tflite::testing::kFilter3Elements]; + static int32_t bias_quantized[tflite::testing::kBias3Elements]; + static int8_t output_data[output_dims_count]; +#pragma Bss() + + int8_t golden_quantized[tflite::testing::kOutput3Elements]; + int zero_points[tflite::testing::kBias3Elements + 1]; + float scales[tflite::testing::kBias3Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput3Shape, tflite::testing::kInput3Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter3Shape, tflite::testing::kFilter3Data, + filter_quantized, tflite::testing::kBias3Shape, tflite::testing::kBias3Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput3Shape, + tflite::testing::kGolden3Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +// Test group 4 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel4) { + const int output_dims_count = 8; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[tflite::testing::kInput4Elements]; + int8_t filter_quantized[tflite::testing::kFilter4Elements]; + int32_t bias_quantized[tflite::testing::kBias4Elements]; + int8_t golden_quantized[tflite::testing::kOutput4Elements]; + int8_t output_data[output_dims_count]; + + int zero_points[tflite::testing::kBias4Elements + 1]; + float scales[tflite::testing::kBias4Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput4Shape, tflite::testing::kInput4Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter4Shape, tflite::testing::kFilter4Data, + filter_quantized, tflite::testing::kBias4Shape, tflite::testing::kBias4Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput4Shape, + tflite::testing::kGolden4Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel4) { + const int output_dims_count = 8; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + +#pragma Bss(".Xdata") + static int8_t input_quantized[tflite::testing::kInput4Elements]; + static int8_t filter_quantized[tflite::testing::kFilter4Elements]; + static int32_t bias_quantized[tflite::testing::kBias4Elements]; + static int8_t output_data[output_dims_count]; +#pragma Bss() + + int8_t golden_quantized[tflite::testing::kOutput4Elements]; + int zero_points[tflite::testing::kBias4Elements + 1]; + float scales[tflite::testing::kBias4Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput4Shape, tflite::testing::kInput4Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter4Shape, tflite::testing::kFilter4Data, + filter_quantized, tflite::testing::kBias4Shape, tflite::testing::kBias4Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput4Shape, + tflite::testing::kGolden4Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc index 8b79885a8a8..fb9dd46c1e4 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc @@ -106,87 +106,6 @@ TfLiteStatus ValidateDepthwiseConvGoldens(const T* expected_output_data, return kTfLiteOk; } -void TestDepthwiseConvFloat(const int* input_dims_data, const float* input_data, - const int* filter_dims_data, - const float* filter_data, const int* bias_dims_data, - const float* bias_data, - const float* expected_output_data, - const int* output_dims_data, - TfLiteFusedActivation activation, - float* output_data) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(filter_data, filter_dims, "filter_tensor"), - CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - ValidateDepthwiseConvGoldens(expected_output_data, output_dims_count, - activation, 1e-5, tensors_size, tensors); -} - -void TestDepthwiseConvQuantizedPerLayer( - const int* input_dims_data, const float* input_data, - uint8_t* input_quantized, float input_scale, int input_zero_point, - const int* filter_dims_data, const float* filter_data, - uint8_t* filter_quantized, float filter_scale, int filter_zero_point, - const int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, - const float* golden, uint8_t* golden_quantized, const int* output_dims_data, - uint8_t* output_data, float output_scale, int output_zero_point, - TfLiteFusedActivation activation) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - tflite::testing::CreateQuantizedTensor(input_data, input_quantized, - input_dims, input_scale, - input_zero_point, "input_tensor"), - tflite::testing::CreateQuantizedTensor( - filter_data, filter_quantized, filter_dims, filter_scale, - filter_zero_point, "filter_tensor"), - tflite::testing::CreateQuantizedBiasTensor(bias_data, bias_quantized, - bias_dims, input_scale, - filter_scale, "bias_tensor"), - tflite::testing::CreateQuantizedTensor(output_data, output_dims, - output_scale, output_zero_point, - "output_tensor"), - }; - - // TODO(njeff): Affine Quantization Params should be set on tensor creation. - float filter_scales[] = {1, filter_scale}; - int filter_zero_points[] = {1, 128}; - TfLiteAffineQuantization filter_quant = { - FloatArrayFromFloats(filter_scales), - IntArrayFromInts(filter_zero_points)}; - tensors[1].quantization = {kTfLiteAffineQuantization, &filter_quant}; - - float bias_scales[] = {1, filter_scale * input_scale}; - int bias_zero_points[] = {1, 128}; - TfLiteAffineQuantization bias_quant = {FloatArrayFromFloats(bias_scales), - IntArrayFromInts(bias_zero_points)}; - tensors[2].quantization = {kTfLiteAffineQuantization, &bias_quant}; - - AsymmetricQuantize(golden, golden_quantized, output_dims_count, output_scale, - output_zero_point); - ValidateDepthwiseConvGoldens(golden_quantized, output_dims_count, activation, - 1.0, tensors_size, tensors); -} - void TestDepthwiseConvQuantizedPerChannel( const int* input_dims_data, const float* input_data, int8_t* input_quantized, float input_scale, int input_zero_point, @@ -263,183 +182,29 @@ void TestDepthwiseConvQuantizedPerChannel( TF_LITE_MICRO_TESTS_BEGIN -TF_LITE_MICRO_TEST(SimpleTest) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 71, -34, 99, -20, 91, -26, 127, -4, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; - const int output_dims_count = 8; - float output_data[output_dims_count]; - tflite::testing::TestDepthwiseConvFloat( - input_shape, input_values, filter_shape, filter_values, bias_shape, - bias_values, golden, output_shape, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantized) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 71, -34, 99, -20, 91, -26, 127, -4, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; - - const float input_scale = 0.5f; - const int input_zero_point = 128; - const float filter_scale = 0.5f; - const int filter_zero_point = 128; - const float output_scale = 1.0f; - const int output_zero_point = 128; - - uint8_t input_quantized[input_elements]; - uint8_t filter_quantized[filter_elements]; - int32_t bias_quantized[bias_elements]; - uint8_t golden_quantized[output_elements]; - uint8_t output_data[output_elements]; - - tflite::testing::TestDepthwiseConvQuantizedPerLayer( - input_shape, input_values, input_quantized, input_scale, input_zero_point, - filter_shape, filter_values, filter_quantized, filter_scale, - filter_zero_point, bias_shape, bias_values, bias_quantized, golden, - golden_quantized, output_shape, output_data, output_scale, - output_zero_point, kTfLiteActNone); -} - -TF_LITE_MICRO_TEST(SimpleTestRelu) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const int output_shape[] = {4, 1, 2, 1, 4}; - const int output_dims_count = 8; - const float golden_relu[] = {71, 0, 99, 0, 91, 0, 127, 0}; - float output_data[output_dims_count]; - - tflite::testing::TestDepthwiseConvFloat( - input_shape, input_values, filter_shape, filter_values, bias_shape, - bias_values, golden_relu, output_shape, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestReluQuantized) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const int output_shape[] = {4, 1, 2, 1, 4}; - const int output_dims_count = 8; - const float golden_relu[] = {71, 0, 99, 0, 91, 0, 127, 0}; - - const float input_scale = 0.5f; - const int input_zero_point = 128; - const float filter_scale = 0.5f; - const int filter_zero_point = 128; - const float output_scale = 1.0f; - const int output_zero_point = 128; - - uint8_t input_quantized[input_elements]; - uint8_t filter_quantized[filter_elements]; - int32_t bias_quantized[bias_elements]; - uint8_t golden_quantized[output_elements]; - uint8_t output_data[output_elements]; - - tflite::testing::TestDepthwiseConvQuantizedPerLayer( - input_shape, input_values, input_quantized, input_scale, input_zero_point, - filter_shape, filter_values, filter_quantized, filter_scale, - filter_zero_point, bias_shape, bias_values, bias_quantized, golden_relu, - golden_quantized, output_shape, output_data, output_scale, - output_zero_point, kTfLiteActRelu); -} - -TF_LITE_MICRO_TEST(SimpleTestOptimizedFilterWidth) { - const int input_elements = 12; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const float bias_values[] = {1, 2, 3, 4}; - const int output_dims_count = 9; - const int input_shape[] = {4, 1, 1, 9, 1}; - const int filter_shape[] = {4, 2, 1, 8, 1}; - const int bias_shape[] = {1, 1}; - const float goldens[] = { - 92, 56, 12, 22, 33, 72, 44, 20, 5, - }; - const int output_shape[] = {4, 1, 1, 9, 1}; - - const float input_scale = 1.0f; - const int input_zero_point = 128; - const float filter_scale = 0.5f; - const int filter_zero_point = 128; - const float output_scale = 1.0f; - const int output_zero_point = 128; - - uint8_t input_quantized[input_elements]; - uint8_t filter_quantized[filter_elements]; - int32_t bias_quantized[bias_elements]; - uint8_t golden_quantized[output_dims_count]; - uint8_t output_data[output_dims_count]; - - tflite::testing::TestDepthwiseConvQuantizedPerLayer( - input_shape, input_values, input_quantized, input_scale, input_zero_point, - filter_shape, filter_values, filter_quantized, filter_scale, - filter_zero_point, bias_shape, bias_values, bias_quantized, goldens, - golden_quantized, output_shape, output_data, output_scale, - output_zero_point, kTfLiteActNone); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 71, -34, 99, -20, 91, -26, 127, -4, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; - const int output_dims_count = 8; +// Test group 1 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel1) { + const int input_elements = 20; + const int input_shape[] = {4, 1, 5, 2, 2}; + const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + const int filter_elements = 36; + const int filter_shape[] = {4, 2, 3, 3, 2}; + const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; + const int bias_elements = 2; + const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 20; + const float bias_values[] = {2, 2}; + const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + const int output_shape[] = {4, 1, 5, 2, 2}; + const int output_dims_count = 20; int8_t output_data[output_dims_count]; - const float input_scale = 0.5; + const float input_scale = 1.0; const float output_scale = 1.0f; const int input_zero_point = 0; const int output_zero_point = 0; @@ -458,28 +223,188 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { output_scale, output_zero_point, kTfLiteActNone); } -TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelDepthMultiplier1) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 8; - const int filter_shape[] = {4, 1, 2, 2, 2}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12}; +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel1) { + const int input_elements = 20; + const int input_shape[] = {4, 1, 5, 2, 2}; + const int filter_elements = 36; + const int filter_shape[] = {4, 2, 3, 3, 2}; const int bias_elements = 2; const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 20; + const int output_shape[] = {4, 1, 5, 2, 2}; + const int output_dims_count = 20; + +#pragma Bss(".Zdata") + const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; + const float bias_values[] = {2, 2}; + int8_t output_data[output_dims_count]; +#pragma Bss() + + const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +// Test group 2 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel2) { + const int input_elements = 80; + const int input_shape[] = {4, 1, 20, 2, 2}; + const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + const int filter_elements = 36; + const int filter_shape[] = {4, 2, 3, 3, 2}; + const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; + const int bias_elements = 2; + const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 80; + const float bias_values[] = {2, 2}; + const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + const int output_shape[] = {4, 1, 20, 2, 2}; + const int output_dims_count = 80; + int8_t output_data[output_dims_count]; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel2) { + const int input_elements = 80; + const int input_shape[] = {4, 1, 20, 2, 2}; + const int filter_elements = 36; + const int filter_shape[] = {4, 2, 3, 3, 2}; + const int bias_elements = 2; + const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 80; + const int output_shape[] = {4, 1, 20, 2, 2}; + const int output_dims_count = 80; + +#pragma Bss(".Zdata") + float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; + float bias_values[] = {2, 2}; + int8_t output_data[output_dims_count]; +#pragma Bss() + + const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +// Test group 3 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel3) { + const int input_elements = 40; + const int input_shape[] = {4, 1, 2, 2, 10}; + const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int filter_elements = 90; + const int filter_shape[] = {4, 1, 3, 3, 10}; + const float filter_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int bias_elements = 1; + const int bias_shape[] = {4, 1, 1, 1, 1}; const int output_elements = 4; - const float bias_values[] = {1, 2}; - const float golden[] = { - -103, - 127, - -128, - 127, - }; - const int output_shape[] = {4, 1, 2, 1, 2}; + const float bias_values[] = {1}; + const float golden[] = {41, 41, 41, 41}; + const int output_shape[] = {4, 1, 2, 2, 1}; const int output_dims_count = 4; int8_t output_data[output_dims_count]; - const float input_scale = 1.0f; + const float input_scale = 1.0; const float output_scale = 1.0f; const int input_zero_point = 0; const int output_zero_point = 0; @@ -498,30 +423,41 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelDepthMultiplier1) { output_scale, output_zero_point, kTfLiteActNone); } -TF_LITE_MICRO_TEST(TestQuantizedPerChannelDepthMultiplier1Relu6) { - const int input_elements = 24; - const int input_shape[] = {4, 1, 3, 2, 4}; - const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {0, 1, 8, -2, -1, 2, -10, 0, - -1, 3, -18, 0, 0, 4, 20, -3}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 0, 6, 3, 0, 0, 6, 3, 0, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; - int8_t output_data[output_elements]; - float output_float[output_elements]; +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel3) { + const int input_elements = 40; + const int input_shape[] = {4, 1, 2, 2, 10}; + const int filter_elements = 90; + const int filter_shape[] = {4, 1, 3, 3, 10}; + const int bias_elements = 1; + const int bias_shape[] = {4, 1, 1, 1, 1}; + const int output_elements = 4; + const int output_shape[] = {4, 1, 2, 2, 1}; + const int output_dims_count = 4; - const float input_scale = 0.023529f; - const float output_scale = 0.023529f; - const int input_zero_point = -128; - const int output_zero_point = -128; +#pragma Bss(".Zdata") + float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + float filter_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + float bias_values[] = {1}; + int8_t output_data[output_dims_count]; +#pragma Bss() + + const float golden[] = {41, 41, 41, 41}; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; int8_t input_quantized[input_elements]; int8_t filter_quantized[filter_elements]; @@ -530,239 +466,115 @@ TF_LITE_MICRO_TEST(TestQuantizedPerChannelDepthMultiplier1Relu6) { int zero_points[bias_elements + 1]; float scales[bias_elements + 1]; - tflite::testing::TestDepthwiseConvFloat( - input_shape, input_values, filter_shape, filter_values, bias_shape, - bias_values, golden, output_shape, kTfLiteActRelu6, output_float); - tflite::testing::TestDepthwiseConvQuantizedPerChannel( input_shape, input_values, input_quantized, input_scale, input_zero_point, filter_shape, filter_values, filter_quantized, bias_shape, bias_values, bias_quantized, output_shape, golden, golden_quantized, output_data, - output_scale, output_zero_point, kTfLiteActRelu6); -} - -TF_LITE_MICRO_TEST(TestQuantizedPerChannelCompareWithFloat) { - const int input_dims[] = {4, 1, 2, 3, 2}; - const float input_data[] = {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4}; - const int filter_dims[] = {4, 1, 2, 2, 4}; - const float filter_data[] = {1, 2, 3, 4, 3, 4, 5, 6, 7, 8, 5, 6, 3, 4, 1, 2}; - const int bias_dims[] = {4, 1, 1, 1, 4}; - const float bias_data[] = {3, -2, 4, 6}; - const int output_dims[] = {4, 1, 1, 2, 4}; - const float golden[] = {43, 48, 18, 22, 3, -4, -28, -36}; - - const int input_size = 12; - const int filter_size = 16; - const int output_size = 8; - const int bias_size = 4; - int8_t input_quantized[input_size]; - int8_t filter_quantized[filter_size]; - int32_t bias_quantized[bias_size]; - int8_t golden_quantized[output_size]; - int zero_points[bias_size + 1]; - float scales[bias_size + 1]; - int8_t output_data[output_size]; - float output_float[output_size]; - - const float input_scale = 0.5; - const float output_scale = 1.0; - const int input_zero_point = 0; - const int output_zero_point = 0; - - tflite::testing::TestDepthwiseConvQuantizedPerChannel( - input_dims, input_data, input_quantized, input_scale, input_zero_point, - filter_dims, filter_data, filter_quantized, bias_dims, bias_data, - bias_quantized, output_dims, golden, golden_quantized, output_data, output_scale, output_zero_point, kTfLiteActNone); - - tflite::testing::TestDepthwiseConvFloat( - input_dims, input_data, filter_dims, filter_data, bias_dims, bias_data, - golden, output_dims, kTfLiteActNone, output_float); } -TF_LITE_MICRO_TEST(FilterDimsNotMatchingAffineQuantization) { - const int input_shape[] = {4, 1, 2, 3, 2}; - const float input_data[] = {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4}; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_data[] = {1, 2, 3, 4, 3, 4, 5, 6, 7, 8, 5, 6, 3, 4, 1, 2}; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const float bias_data[] = {3, -2, 4, 6}; - const int output_shape[] = {4, 1, 1, 2, 4}; - const float golden[] = {43, 48, 18, 22, 3, -4, -28, -36}; - - const int input_size = 12; - const int filter_size = 16; - const int output_size = 8; - const int bias_size = 4; - int8_t input_quantized[input_size]; - int8_t filter_quantized[filter_size]; - int32_t bias_quantized[bias_size]; - int8_t golden_quantized[output_size]; - int zero_points[bias_size + 1]; - float scales[bias_size + 1]; - int8_t output_data[output_size]; - float output_float[output_size]; - - const float input_scale = 0.5; - const float output_scale = 1.0; - const int input_zero_point = 0; - const int output_zero_point = 0; - - TfLiteIntArray* input_dims = tflite::testing::IntArrayFromInts(input_shape); - TfLiteIntArray* filter_dims = tflite::testing::IntArrayFromInts(filter_shape); - TfLiteIntArray* bias_dims = tflite::testing::IntArrayFromInts(bias_shape); - TfLiteIntArray* output_dims = tflite::testing::IntArrayFromInts(output_shape); - - int filter_zero_points[5]; - float filter_scales[5]; - TfLiteAffineQuantization filter_quant; - TfLiteAffineQuantization bias_quant; - TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( - input_data, input_quantized, input_dims, input_scale, input_zero_point, - "input_tensor"); - TfLiteTensor filter_tensor = - tflite::testing::CreateSymmetricPerChannelQuantizedTensor( - filter_data, filter_quantized, filter_dims, filter_scales, - filter_zero_points, &filter_quant, 0 /* quantized dimension */, - "filter_tensor"); - TfLiteTensor bias_tensor = - tflite::testing::CreatePerChannelQuantizedBiasTensor( - bias_data, bias_quantized, bias_dims, input_scale, &filter_scales[1], - scales, zero_points, &bias_quant, 0, "bias_tensor"); - TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( - output_data, output_dims, output_scale, output_zero_point, - "output_tensor"); - - float input_scales[] = {1, input_scale}; - int input_zero_points[] = {1, input_zero_point}; - TfLiteAffineQuantization input_quant = { - tflite::testing::FloatArrayFromFloats(input_scales), - tflite::testing::IntArrayFromInts(input_zero_points)}; - input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - input_tensor, - filter_tensor, - bias_tensor, - output_tensor, - }; - - // Set filter quant to mismatched dimension. - TfLiteAffineQuantization* quant = reinterpret_cast( - filter_tensor.quantization.params); - quant->scale->size = 2; - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens( - golden_quantized, output_size, kTfLiteActNone, 1e-5, - tensors_size, tensors)); - - // Set scale back to correct dimension, and make zero point array too short. - quant->scale->size = filter_shape[0]; - quant->zero_point->size = 2; - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens( - golden_quantized, output_size, kTfLiteActNone, 1e-5, - tensors_size, tensors)); -} - -TF_LITE_MICRO_TEST(PerChannelBroadcastQuantizationParams) { - const float input_scale = 1.0f; - const float filter_scale = 1.0f; - const float output_scale = 1.0f; - - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; +// Test group 4 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel4) { + const int input_elements = 80; + const int input_shape[] = {4, 1, 4, 2, 10}; + const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int filter_elements = 90; + const int filter_shape[] = {4, 1, 3, 3, 10}; + const float filter_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int bias_elements = 1; + const int bias_shape[] = {4, 1, 1, 1, 1}; const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 71, -34, 99, -20, 91, -26, 127, -4, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; + const float bias_values[] = {1}; + const float golden[] = {41, 41, 61, 61, 61, 61, 41, 41}; + const int output_shape[] = {4, 1, 4, 2, 1}; const int output_dims_count = 8; int8_t output_data[output_dims_count]; + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + int8_t input_quantized[input_elements]; int8_t filter_quantized[filter_elements]; int32_t bias_quantized[bias_elements]; int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; - TfLiteIntArray* input_dims = tflite::testing::IntArrayFromInts(input_shape); - TfLiteIntArray* filter_dims = tflite::testing::IntArrayFromInts(filter_shape); - TfLiteIntArray* bias_dims = tflite::testing::IntArrayFromInts(bias_shape); - TfLiteIntArray* output_dims = tflite::testing::IntArrayFromInts(output_shape); - - // Create per-layer quantized int8 input tensor. - TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( - input_values, input_quantized, input_dims, input_scale, 0, - "input_tensor"); - int input_zero_points[2] = {1, 0}; - float input_scales[2] = {1, input_scale}; - TfLiteAffineQuantization input_quant = { - tflite::testing::FloatArrayFromFloats(input_scales), - tflite::testing::IntArrayFromInts(input_zero_points)}; - input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; - - // Create per-layer quantized int8 filter tensor. - TfLiteTensor filter_tensor = tflite::testing::CreateQuantizedTensor( - filter_values, filter_quantized, filter_dims, filter_scale, 0, - "filter_tensor"); - int filter_zero_points[2] = {1, 0}; - float filter_scales[2] = {1, filter_scale}; - TfLiteAffineQuantization filter_quant = { - tflite::testing::FloatArrayFromFloats(filter_scales), - tflite::testing::IntArrayFromInts(filter_zero_points)}; - filter_tensor.quantization = {kTfLiteAffineQuantization, &filter_quant}; - - // Create per-layer quantized int32 bias tensor. - tflite::SymmetricQuantize(bias_values, bias_quantized, bias_elements, - input_scale * output_scale); - TfLiteTensor bias_tensor = tflite::testing::CreateInt32Tensor( - bias_quantized, bias_dims, "bias_tensor"); - - int bias_zero_points[2] = {1, 0}; - float bias_scales[2] = {1, input_scale * filter_scale}; - TfLiteAffineQuantization bias_quant = { - tflite::testing::FloatArrayFromFloats(bias_scales), - tflite::testing::IntArrayFromInts(bias_zero_points)}; - bias_tensor.quantization = {kTfLiteAffineQuantization, &bias_quant}; - - // Create per-layer quantized int8 output tensor. - TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( - output_data, output_dims, output_scale, 0, "output_tensor"); - int output_zero_points[2] = {1, 0}; - float output_scales[2] = {1, output_scale}; - TfLiteAffineQuantization output_quant = { - tflite::testing::FloatArrayFromFloats(output_scales), - tflite::testing::IntArrayFromInts(output_zero_points)}; - output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - input_tensor, - filter_tensor, - bias_tensor, - output_tensor, - }; - - tflite::AsymmetricQuantize(golden, golden_quantized, output_dims_count, - output_scale, 0); - - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, tflite::testing::ValidateDepthwiseConvGoldens( - golden_quantized, output_dims_count, kTfLiteActNone, 1e-5, - tensors_size, tensors)); + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); } +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel4) { + const int input_elements = 80; + const int input_shape[] = {4, 1, 4, 2, 10}; + const int filter_elements = 90; + const int filter_shape[] = {4, 1, 3, 3, 10}; + const int bias_elements = 1; + const int bias_shape[] = {4, 1, 1, 1, 1}; + const int output_elements = 8; + const int output_shape[] = {4, 1, 4, 2, 1}; + const int output_dims_count = 8; + +#pragma Bss(".Zdata") + float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + float filter_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + float bias_values[] = {1}; + int8_t output_data[output_dims_count]; +#pragma Bss() + + const float golden[] = {41, 41, 61, 61, 61, 61, 41, 41}; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc index 539c7ecc3a4..78cb2873c54 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc @@ -25,74 +25,6 @@ namespace tflite { namespace testing { namespace { -void TestFullyConnectedFloat( - const int* input_dims_data, const float* input_data, - const int* weights_dims_data, const float* weights_data, - const int* bias_dims_data, const float* bias_data, - const float* expected_output_data, const int* output_dims_data, - TfLiteFusedActivation activation, float* output_data) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(weights_data, weights_dims, "weights_tensor"), - CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - TfLiteContext context; - PopulateContext(tensors, tensors_size, micro_test::reporter, &context); - ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - - TfLiteFullyConnectedParams builtin_data = { - activation, - kTfLiteFullyConnectedWeightsFormatDefault, - }; - const char* init_data = reinterpret_cast(&builtin_data); - size_t init_data_size = 0; - void* user_data = nullptr; - if (registration->init) { - user_data = registration->init(&context, init_data, init_data_size); - } - int inputs_array_data[] = {3, 0, 1, 2}; - TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); - int outputs_array_data[] = {1, 3}; - TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); - int temporaries_array_data[] = {0}; - TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); - - TfLiteNode node; - node.inputs = inputs_array; - node.outputs = outputs_array; - node.temporaries = temporaries_array; - node.user_data = user_data; - node.builtin_data = reinterpret_cast(&builtin_data); - node.custom_initial_data = nullptr; - node.custom_initial_data_size = 0; - node.delegate = nullptr; - if (registration->prepare) { - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); - } - TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); - if (registration->free) { - registration->free(&context, user_data); - } - for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], 1e-5f); - } -} - template void TestFullyConnectedQuantized( const int* input_dims_data, const T* input_data, const float input_min, @@ -121,6 +53,10 @@ void TestFullyConnectedQuantized( output_min, output_max), }; + tensors[0].params.zero_point = 0; + tensors[1].params.zero_point = 0; + tensors[3].params.zero_point = 0; + TfLiteContext context; PopulateContext(tensors, tensors_size, micro_test::reporter, &context); @@ -176,466 +112,23 @@ void TestFullyConnectedQuantized( TF_LITE_MICRO_TESTS_BEGIN -TF_LITE_MICRO_TEST(SimpleTest) { - const int input_dims_data[] = {2, 2, 10}; - const float input_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 - 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 - }; - const int weights_dims_data[] = {2, 3, 10}; - const float weights_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 - }; - const int bias_dims_data[] = {1, 3}; - const float bias_data[] = {1, 2, 3}; - const float expected_output_data[] = { - 24, 25, 26, 58, 59, 60, - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - float output_data[output_dims_count]; - tflite::testing::TestFullyConnectedFloat( - input_dims_data, input_data, weights_dims_data, weights_data, - bias_dims_data, bias_data, expected_output_data, output_dims_data, - kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTest2) { - const int input_dims_data[] = {2, 2, 2}; - const float input_data[] = { - 1, 2, // b = 0 - 2, 1, // b = 1 - }; - const int weights_dims_data[] = {2, 1, 2}; - const float weights_data[] = { - 2, 4, // u = 0 - }; - const int bias_dims_data[] = {1, 1}; - const float bias_data[] = {1}; - const float expected_output_data[] = { - 11, - 9, - }; - const int output_dims_data[] = {2, 2, 1}; - - const int output_dims_count = 6; - float output_data[output_dims_count]; - tflite::testing::TestFullyConnectedFloat( - input_dims_data, input_data, weights_dims_data, weights_data, - bias_dims_data, bias_data, expected_output_data, output_dims_data, - kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestRelu) { - const int input_dims_data[] = {2, 2, 10}; - const float input_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 - 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 - }; - const int weights_dims_data[] = {2, 3, 10}; - const float weights_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, // u = 1 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 - }; - const int bias_dims_data[] = {1, 3}; - const float bias_data[] = {1, -2, 3}; - const float expected_output_data[] = { - 24, 0, 26, 58, 0, 60, - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - float output_data[output_dims_count]; - tflite::testing::TestFullyConnectedFloat( - input_dims_data, input_data, weights_dims_data, weights_data, - bias_dims_data, bias_data, expected_output_data, output_dims_data, - kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8) { - using tflite::testing::F2Q; - using tflite::testing::F2Q32; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -63.5f; - const float weights_max = 64.0f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {2, 2, 10}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -// TODO(b/138811455): Fix code duplication in micro tests -TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -64.0f; - const float weights_max = 63.5f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {2, 2, 10}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - int8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8Relu) { - using tflite::testing::F2Q; - using tflite::testing::F2Q32; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -63.5f; - const float weights_max = 64.0f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {2, 2, 10}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(-1, weights_min, weights_max), F2Q(-2, weights_min, weights_max), - F2Q(-3, weights_min, weights_max), F2Q(-4, weights_min, weights_max), - F2Q(-5, weights_min, weights_max), F2Q(-6, weights_min, weights_max), - F2Q(-7, weights_min, weights_max), F2Q(-8, weights_min, weights_max), - F2Q(-9, weights_min, weights_max), F2Q(-10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(0, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(0, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(0, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8Relu) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -64.0f; - const float weights_max = 63.5f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {2, 2, 10}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(-1, weights_min, weights_max), F2QS(-2, weights_min, weights_max), - F2QS(-3, weights_min, weights_max), F2QS(-4, weights_min, weights_max), - F2QS(-5, weights_min, weights_max), F2QS(-6, weights_min, weights_max), - F2QS(-7, weights_min, weights_max), F2QS(-8, weights_min, weights_max), - F2QS(-9, weights_min, weights_max), F2QS(-10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(0, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(0, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(0, output_min, output_max), F2QS(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - int8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8OutputMultiplierGreaterThan1) { - using tflite::testing::F2Q; - using tflite::testing::F2Q32; - - const float input_min = -127.0f; - const float input_max = 128.0f; - const float weights_min = -127.0f; - const float weights_max = 128.0f; - const float bias_scale = 1.0f; - const float output_min = -63.5f; - const float output_max = 64.0f; - - const int input_dims_data[] = {2, 2, 10}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8OutputMultiplierGreaterThan1) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -127.0f; - const float input_max = 128.0f; +// Test group 1 +TF_LITE_MICRO_TEST(SystemSimpleTestQuantized1) { + const float input_min = -128.0f; + const float input_max = 127.0f; const float weights_min = -128.0f; const float weights_max = 127.0f; const float bias_scale = 1.0f; - const float output_min = -63.5f; - const float output_max = 64.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; const int input_dims_data[] = {2, 2, 10}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; + const int8_t input_data[] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - }; + const int8_t weights_data[] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), - }; + const int32_t bias_data[] = {1,1,1}; + const int8_t expected_output_data[] = {41,41,41,41,41,41}; const int output_dims_data[] = {2, 2, 3}; const int output_dims_count = 6; @@ -647,292 +140,273 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8OutputMultiplierGreaterThan1) { output_max, kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(SimpleTest4DInput) { - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const float input_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 - 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 - }; - const int weights_dims_data[] = {2, 3, 10}; - const float weights_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 - }; - const int bias_dims_data[] = {1, 3}; - const float bias_data[] = {1, 2, 3}; - const float expected_output_data[] = { - 24, 25, 26, 58, 59, 60, // Expected results. - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - float output_data[output_dims_count]; - tflite::testing::TestFullyConnectedFloat( - input_dims_data, input_data, weights_dims_data, weights_data, - bias_dims_data, bias_data, expected_output_data, output_dims_data, - kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedUInt8) { - using tflite::testing::F2Q; - using tflite::testing::F2Q32; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -63.5f; - const float weights_max = 64.0f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -64.0f; - const float weights_max = 63.5f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - int8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST( - SimpleTest4DInputQuantizedUInt8OutputMultiplierGreaterThan1) { - using tflite::testing::F2Q; - using tflite::testing::F2Q32; - - const float input_min = -127.0f; - const float input_max = 128.0f; - const float weights_min = -127.0f; - const float weights_max = 128.0f; - const float bias_scale = 1.0f; - const float output_min = -63.5f; - const float output_max = 64.0f; - - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8OutputMultiplierGreaterThan1) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -127.0f; - const float input_max = 128.0f; +TF_LITE_MICRO_TEST(LocalSimpleTestQuantized1) { + const float input_min = -128.0f; + const float input_max = 127.0f; const float weights_min = -128.0f; const float weights_max = 127.0f; const float bias_scale = 1.0f; - const float output_min = -63.5f; - const float output_max = 64.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; + const int input_dims_data_local[] = {2, 2, 10}; + const int weights_dims_data_local[] = {2, 3, 10}; + const int bias_dims_data_local[] = {1, 3}; + const int output_dims_data_local[] = {2, 2, 3}; const int output_dims_count = 6; - int8_t output_data[output_dims_count]; + +#pragma Bss(".Zdata") + const int8_t input_data_local[] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; + const int8_t weights_data_local[] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; + const int32_t bias_data_local[] = {1,1,1}; + int8_t output_data_local[output_dims_count]; +#pragma Bss() + + const int8_t expected_output_data[] = {41,41,41,41,41,41}; + tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); + input_dims_data_local, input_data_local, input_min, input_max, weights_dims_data_local, + weights_data_local, weights_min, weights_max, bias_dims_data_local, bias_data_local, + bias_scale, expected_output_data, output_dims_data_local, output_min, + output_max, kTfLiteActNone, output_data_local); +} + +// Test group 2 +TF_LITE_MICRO_TEST(SystemSimpleTestQuantized2) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_2[] = {2, 10, 4}; + const int8_t input_data_2[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int weights_dims_data_2[] = {2, 6, 4}; + const int8_t weights_data_2[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2}; + const int bias_dims_data_2[] = {1, 6}; + const int32_t bias_data_2[] = {1,1,1,1,1,1}; + const int8_t expected_output_data_2[] = {17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17}; + const int output_dims_data_2[] = {2, 10, 6}; + + const int output_dims_count_2 = 60; + int8_t output_data_2[output_dims_count_2]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_2, input_data_2, input_min, input_max, weights_dims_data_2, + weights_data_2, weights_min, weights_max, bias_dims_data_2, bias_data_2, + bias_scale, expected_output_data_2, output_dims_data_2, output_min, + output_max, kTfLiteActNone, output_data_2); +} + +TF_LITE_MICRO_TEST(LocalSimpleTestQuantized2) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_local_2[] = {2, 10, 4}; + const int weights_dims_data_local_2[] = {2, 6, 4}; + const int bias_dims_data_local_2[] = {1, 6}; + const int output_dims_data_local_2[] = {2, 10, 6}; + + const int output_dims_count_local_2 = 60; + +#pragma Bss(".Zdata") + const int8_t input_data_local_2[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int8_t weights_data_local_2[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2}; + const int32_t bias_data_local_2[] = {1,1,1,1,1,1}; + int8_t output_data_local_2[output_dims_count_local_2]; +#pragma Bss() + + const int8_t expected_output_data_local_2[] = {41,41,41,41,41,41}; + + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_local_2, input_data_local_2, input_min, input_max, weights_dims_data_local_2, + weights_data_local_2, weights_min, weights_max, bias_dims_data_local_2, bias_data_local_2, + bias_scale, expected_output_data_local_2, output_dims_data_local_2, output_min, + output_max, kTfLiteActNone, output_data_local_2); +} + +// Test group 3 +TF_LITE_MICRO_TEST(SystemSimpleTestQuantized3) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_3[] = {2, 2, 5}; + const int8_t input_data_3[] = {2,2,2,2,2,2,2,2,2,2}; + const int weights_dims_data_3[] = {2, 10, 5}; + const int8_t weights_data_3[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int bias_dims_data_3[] = {1, 10}; + const int32_t bias_data_3[] = {1,1,1,1,1,1,1,1,1,1}; + const int8_t expected_output_data_3[] = {21,21,21,21,21,21,21,21,21,21, + 21,21,21,21,21,21,21,21,21,21}; + const int output_dims_data_3[] = {2, 2, 10}; + + const int output_dims_count_3 = 20; + int8_t output_data_3[output_dims_count_3]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_3, input_data_3, input_min, input_max, weights_dims_data_3, + weights_data_3, weights_min, weights_max, bias_dims_data_3, bias_data_3, + bias_scale, expected_output_data_3, output_dims_data_3, output_min, + output_max, kTfLiteActNone, output_data_3); +} + +TF_LITE_MICRO_TEST(LocalSimpleTestQuantized3) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_local_3[] = {2, 2, 5}; + const int weights_dims_data_local_3[] = {2, 10, 5}; + const int bias_dims_data_local_3[] = {1, 10}; + const int output_dims_data_local_3[] = {2, 2, 10}; + + const int output_dims_count_local_3 = 20; + +#pragma Bss(".Zdata") + static int8_t input_data_local_3[10]; + static int8_t weights_data_local_3[50]; + static int32_t bias_data_local_3[10]; + static int8_t output_data_local_3[output_dims_count_local_3]; +#pragma Bss() + + for(int i = 0; i < 10; ++i) { + input_data_local_3[i] = 2; + } + + for(int i = 0; i < 50; ++i) { + weights_data_local_3[i] = 2; + } + + for(int i = 0; i < 10; ++i) { + bias_data_local_3[i] = 1; + } + + for(int i = 0; i < 20; ++i) { + output_data_local_3[i] = 0; + } + + const int8_t expected_output_data_local_3[] = {21,21,21,21,21,21,21,21,21,21, + 21,21,21,21,21,21,21,21,21,21}; + + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_local_3, input_data_local_3, input_min, input_max, weights_dims_data_local_3, + weights_data_local_3, weights_min, weights_max, bias_dims_data_local_3, bias_data_local_3, + bias_scale, expected_output_data_local_3, output_dims_data_local_3, output_min, + output_max, kTfLiteActNone, output_data_local_3); +} + +// Test group 4 +TF_LITE_MICRO_TEST(SystemSimpleTestQuantized4) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_4[] = {2, 5, 10}; + const int8_t input_data_4[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int weights_dims_data_4[] = {2, 5, 10}; + const int8_t weights_data_4[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int bias_dims_data_4[] = {1, 5}; + const int32_t bias_data_4[] = {1,1,1,1,1}; + const int8_t expected_output_data_4[] = {41,41,41,41,41,41,41,41,41,41, + 41,41,41,41,41,41,41,41,41,41, + 41,41,41,41,41}; + const int output_dims_data_4[] = {2, 5, 5}; + + const int output_dims_count_4 = 25; + int8_t output_data_4[output_dims_count_4]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_4, input_data_4, input_min, input_max, weights_dims_data_4, + weights_data_4, weights_min, weights_max, bias_dims_data_4, bias_data_4, + bias_scale, expected_output_data_4, output_dims_data_4, output_min, + output_max, kTfLiteActNone, output_data_4); +} + +TF_LITE_MICRO_TEST(LocalSimpleTestQuantized4) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_local_4[] = {2, 5, 10}; + const int weights_dims_data_local_4[] = {2, 5, 10}; + const int bias_dims_data_local_4[] = {1, 5}; + const int output_dims_data_local_4[] = {2, 5, 5}; + + const int output_dims_count_local_4 = 25; + +#pragma Bss(".Zdata") + const int8_t input_data_local_4[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int8_t weights_data_local_4[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int32_t bias_data_local_4[] = {1,1,1,1,1}; + int8_t output_data_local_4[output_dims_count_local_4]; +#pragma Bss() + + const int8_t expected_output_data_local_4[] = {41,41,41,41,41,41,41,41,41,41, + 41,41,41,41,41,41,41,41,41,41, + 41,41,41,41,41}; + + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_local_4, input_data_local_4, input_min, input_max, weights_dims_data_local_4, + weights_data_local_4, weights_min, weights_max, bias_dims_data_local_4, bias_data_local_4, + bias_scale, expected_output_data_local_4, output_dims_data_local_4, output_min, + output_max, kTfLiteActNone, output_data_local_4); } TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc index 8bfeb718a1b..63737a41791 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc @@ -25,89 +25,20 @@ namespace tflite { namespace testing { namespace { -void TestAveragePoolingFloat(std::initializer_list input_dims_data, - std::initializer_list input_data, - const int filter_height, const int filter_width, - const int stride_height, const int stride_width, - std::initializer_list expected_output_data, - std::initializer_list output_dims_data, - TfLitePadding padding, - TfLiteFusedActivation activation, - float* output_data) { - TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 1; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - TfLiteContext context; - PopulateContext(tensors, tensors_size, micro_test::reporter, &context); - - ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - - TfLitePoolParams builtin_data = {padding, stride_width, stride_height, - filter_width, filter_height, activation}; - const char* init_data = reinterpret_cast(&builtin_data); - size_t init_data_size = 0; - void* user_data = nullptr; - if (registration->init) { - user_data = registration->init(&context, init_data, init_data_size); - } - int inputs_array_data[] = {1, 0}; - TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); - int outputs_array_data[] = {1, 1}; - TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); - int temporaries_array_data[] = {0}; - TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); - - TfLiteNode node; - node.inputs = inputs_array; - node.outputs = outputs_array; - node.temporaries = temporaries_array; - node.user_data = user_data; - node.builtin_data = reinterpret_cast(&builtin_data); - node.custom_initial_data = nullptr; - node.custom_initial_data_size = 0; - node.delegate = nullptr; - - if (registration->prepare) { - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); - } - TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); - if (registration->free) { - registration->free(&context, user_data); - } - - for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], - 1e-5f); - } -} - template void TestAveragePoolingQuantized( - std::initializer_list input_dims_data, - std::initializer_list input_data, const float input_min, + const int* input_dims_data, + const T* input_data, const float input_min, const float input_max, const int filter_height, const int filter_width, const int stride_height, const int stride_width, - std::initializer_list expected_output_data, - std::initializer_list output_dims_data, float output_min, + const T* expected_output_data, + const int* output_dims_data, float output_min, float output_max, TfLitePadding padding, TfLiteFusedActivation activation, T* output_data) { static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed."); - TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); const int output_dims_count = ElementCount(*output_dims); constexpr int inputs_size = 1; @@ -163,94 +94,25 @@ void TestAveragePoolingQuantized( } for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], - 1e-5f); - } -} - -void TestMaxPoolFloat(std::initializer_list input_dims_data, - std::initializer_list input_data, int filter_width, - int filter_height, int stride_width, int stride_height, - std::initializer_list expected_output_data, - std::initializer_list output_dims_data, - TfLitePadding padding, TfLiteFusedActivation activation, - float* output_data) { - TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 1; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - TfLiteContext context; - PopulateContext(tensors, tensors_size, micro_test::reporter, &context); - - ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - - TfLitePoolParams builtin_data = { - padding, stride_width, stride_height, - filter_width, filter_height, activation, - }; - - const char* init_data = reinterpret_cast(&builtin_data); - size_t init_data_size = 0; - void* user_data = nullptr; - if (registration->init) { - user_data = registration->init(&context, init_data, init_data_size); - } - - int inputs_array_data[] = {1, 0}; - TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); - int outputs_array_data[] = {1, 1}; - TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); - int temporaries_array_data[] = {0}; - TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); - - TfLiteNode node; - node.inputs = inputs_array; - node.outputs = outputs_array; - node.temporaries = temporaries_array; - node.user_data = user_data; - node.builtin_data = reinterpret_cast(&builtin_data); - node.custom_initial_data = nullptr; - node.custom_initial_data_size = 0; - node.delegate = nullptr; - if (registration->prepare) { - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); - } - TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); - if (registration->free) { - registration->free(&context, user_data); - } - for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], 1e-5f); } } template -void TestMaxPoolQuantized(std::initializer_list input_dims_data, - std::initializer_list input_data, float input_min, +void TestMaxPoolQuantized(const int* input_dims_data, + const T* input_data, float input_min, float input_max, int filter_width, int filter_height, int stride_width, int stride_height, - std::initializer_list expected_output_data, + const T* expected_output_data, float output_min, float output_max, - std::initializer_list output_dims_data, + const int* output_dims_data, TfLitePadding padding, TfLiteFusedActivation activation, T* output_data) { static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed."); - TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); const int output_dims_count = ElementCount(*output_dims); constexpr int inputs_size = 1; @@ -308,7 +170,7 @@ void TestMaxPoolQuantized(std::initializer_list input_dims_data, registration->free(&context, user_data); } for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]); + TF_LITE_MICRO_EXPECT_EQ(expected_output_data[i], output_data[i]); } } @@ -319,797 +181,269 @@ void TestMaxPoolQuantized(std::initializer_list input_dims_data, TF_LITE_MICRO_TESTS_BEGIN -TF_LITE_MICRO_TEST(SimpleAveragePoolTestFloat) { - float output_data[2]; - tflite::testing::TestAveragePoolingFloat({4, 1, 2, 4, 1}, // Input shape - { // Input values - 0., 6., 2., 4., 3., 2., 10., 7.}, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 2.75, - 5.75, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActNone, - output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestUint8) { - using tflite::testing::F2Q; - - const float input_min = -15.9375; - const float input_max = 15.9375; - const float output_min = -15.9375; - const float output_max = 15.9375; - uint8_t output_data[2]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0., input_min, input_max), - F2Q(-6., input_min, input_max), - F2Q(2., input_min, input_max), - F2Q(4., input_min, input_max), - F2Q(3., input_min, input_max), - F2Q(2., input_min, input_max), - F2Q(-10., input_min, input_max), - F2Q(7., input_min, input_max), - }, - input_min, input_max, // input quantization range - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - F2Q(0., output_min, output_max), - F2Q(0.75, output_min, output_max), - }, - {4, 1, 1, 2, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2ActNone) { +TF_LITE_MICRO_TEST(SystemAveragePoolTestInt1) { using tflite::testing::F2QS; - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; - int8_t output_data[2]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), - F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, - input_min, input_max, // input quantization range - 2, 2, // filter height, filter width - 2, 2, // stride height, stride width - { // Output values - F2QS(-0.25, output_min, output_max), F2QS(0.75, output_min, output_max)}, - {4, 1, 1, 2, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride1Stride2Relu) { - using tflite::testing::F2QS; - - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; int8_t output_data[3]; + + const int kInput1Shape[] = {4, 1, 2, 4, 1}; + const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput1Shape[] = {4, 1, 1, 3, 1}; + const int8_t kGolden1Data[] = {1, 1, 1}; + tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), - F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, - input_min, input_max, // input quantization range - 2, 2, // filter height, filter width - 2, 1, // stride height, stride width - { // Output values - F2QS(0., output_min, output_max), F2QS(0., output_min, output_max), - F2QS(0.75, output_min, output_max)}, - {4, 1, 1, 3, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2Stride1Relu1) { - using tflite::testing::F2QS; - - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; - int8_t output_data[2]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), - F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, - input_min, input_max, // input quantization range - 2, 2, // filter height, filter width - 1, 2, // stride height, stride width - { // Output values - F2QS(-0.25, output_min, output_max), F2QS(0.75, output_min, output_max)}, - {4, 1, 1, 2, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActRelu1, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2Relu6) { - using tflite::testing::F2QS; - - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; - int8_t output_data[2]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(3., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(8., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), - F2QS(10., input_min, input_max), F2QS(7., input_min, input_max)}, - input_min, input_max, // input quantization range - 2, 2, // filter height, filter width - 2, 2, // stride height, stride width - { // Output values - F2QS(0.5, output_min, output_max), F2QS(6., output_min, output_max)}, - {4, 1, 1, 2, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActRelu6, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingSameStride1ActNone) { - using tflite::testing::F2QS; - - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; - int8_t output_data[8]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(3., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(8., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), - F2QS(10., input_min, input_max), F2QS(7., input_min, input_max)}, + kInput1Shape, // Input shape + kInput1Data, input_min, input_max, // input quantization range 2, 2, // filter height, filter width 1, 1, // stride height, stride width - { // Output values - F2QS(0.5, output_min, output_max), F2QS(3.5, output_min, output_max), - F2QS(7.25, output_min, output_max), F2QS(5.5, output_min, output_max), - F2QS(2.5, output_min, output_max), F2QS(6., output_min, output_max), - F2QS(8.5, output_min, output_max), F2QS(7., output_min, output_max)}, - {4, 1, 2, 4, 1}, // Output shape + kGolden1Data, + kOutput1Shape, // Output shape output_min, output_max, // output quantization range kTfLitePaddingValid, kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloat) { - float output_data[2]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { // Input values - 0, 6, 2, 4, 3, 2, 10, 7}, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 6, - 10, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActNone, - output_data); -} -TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu) { - float output_data[2]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - -1, -6, 2, 4, // - -3, -2, 10.5, 7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 0.0, - 10.5, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu, - output_data); -} +TF_LITE_MICRO_TEST(LocalAveragePoolTestInt1) { + using tflite::testing::F2QS; -TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu1) { - float output_data[2]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - -2.75, -6, 0.2, 0.4, // - -3, -2, -0.3, 0.7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - -1.0, - 0.7, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu1, - output_data); + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int8_t output_data[3]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - -2.75, -6, -2, -4, // - -3, -2, 10, -7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - -1.0, - 1.0, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu1, - output_data); -} +#pragma Bss(".Zdata") + const int kInput1Shape[] = {4, 1, 2, 4, 1}; + const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput1Shape[] = {4, 1, 1, 3, 1}; + const int8_t kGolden1Data[] = {1, 1, 1}; +#pragma Bss() -TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu6) { - float output_data[2]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - -1.5, -6, 12, 4, // - -3, -2, 10, 7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 0.0, - 6.0, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, - output_data); - - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - 0, 4.5, 12, 4, // - 3, 2, 10, 7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 4.5, - 6.0, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, - output_data); -} - -TF_LITE_MICRO_TEST(SimpleMaxPoolTestPaddingSameStride1) { - float output_data[8]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - 0, 6, 2, 4, // - 3, 2, 10, 7, // - }, - 2, 2, // filter width, filter height - 1, 1, // stride width, stride height - { - // Output values - 6, 10, 10, 7, // - 3, 10, 10, 7, // - }, - {4, 1, 2, 4, 1}, // Output shape - kTfLitePaddingSame, kTfLiteActNone, - output_data); -} - -TF_LITE_MICRO_TEST(SimpleMaxPoolTestPaddingValidStride1) { - float output_data[3]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - 0, 6, 2, 4, // - 3, 2, 10, 7, // - }, - 2, 2, // filter width, filter height - 1, 1, // stride width, stride height - { - // Output values - 6, - 10, - 10, - }, - {4, 1, 1, 3, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActNone, - output_data); -} - -TF_LITE_MICRO_TEST(SimpleMaxPoolTestUInt8ActNone) { - using tflite::testing::F2Q; - - uint8_t output_data[2]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(3, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(6, output_min, output_max), F2Q(10, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + tflite::testing::TestAveragePoolingQuantized( + kInput1Shape, // Input shape + kInput1Data, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 1, 1, // stride height, stride width + kGolden1Data, + kOutput1Shape, // Output shape + output_min, output_max, // output quantization range kTfLitePaddingValid, kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu) { - using tflite::testing::F2Q; +// Test group AVG 2 +TF_LITE_MICRO_TEST(SystemAveragePoolTestInt2) { + using tflite::testing::F2QS; - uint8_t output_data[2]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(-1.5, input_min, input_max), - F2Q(-6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(-3, input_min, input_max), - F2Q(-2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(0, output_min, output_max), F2Q(10, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu, output_data); -} + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int8_t output_data[45]; -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu1) { - using tflite::testing::F2Q; + const int kInput2Shape[] = {4, 1, 6, 10, 1}; + const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput2Shape[] = {4, 1, 5, 9, 1}; + const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1}; - uint8_t output_data[2]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(-1.7, input_min, input_max), - F2Q(-6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(-3, input_min, input_max), - F2Q(-2, input_min, input_max), - F2Q(-10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(-1.0, output_min, output_max), F2Q(1.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu1, output_data); -} -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu6) { - using tflite::testing::F2Q; - - uint8_t output_data[8]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(-6, input_min, input_max), - F2Q(12, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(-3, input_min, input_max), - F2Q(-2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(0.0, output_min, output_max), F2Q(6.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, output_data); - - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(4.5, input_min, input_max), - F2Q(12, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(3, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(4.5, output_min, output_max), F2Q(6.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingSameStride1) { - using tflite::testing::F2Q; - - uint8_t output_data[8]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 1; - int stride_height = 1; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(3, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - { - // Output values - F2Q(6, output_min, output_max), - F2Q(10, output_min, output_max), - F2Q(10, output_min, output_max), - F2Q(7, output_min, output_max), - F2Q(3, output_min, output_max), - F2Q(10, output_min, output_max), - F2Q(10, output_min, output_max), - F2Q(7, output_min, output_max), - }, - output_min, output_max, {4, 1, 2, 4, 1}, // Output shape - kTfLitePaddingSame, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingValidStride1) { - using tflite::testing::F2Q; - - uint8_t output_data[3]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 1; - int stride_height = 1; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(3, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - { - // Output values - F2Q(6, output_min, output_max), - F2Q(10, output_min, output_max), - F2Q(10, output_min, output_max), - }, - output_min, output_max, {4, 1, 1, 3, 1}, // Output shape + tflite::testing::TestAveragePoolingQuantized( + kInput2Shape, // Input shape + kInput2Data, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 1, 1, // stride height, stride width + kGolden2Data, + kOutput2Shape, // Output shape + output_min, output_max, // output quantization range kTfLitePaddingValid, kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(SimpleMaxPoolTestInt8ActNone) { +TF_LITE_MICRO_TEST(LocalAveragePoolTestInt2) { using tflite::testing::F2QS; - int8_t output_data[2]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(3, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(6, output_min, output_max), F2QS(10, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int8_t output_data[45]; + +#pragma Bss(".Zdata") + const int kInput2Shape[] = {4, 1, 6, 10, 1}; + const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput2Shape[] = {4, 1, 5, 9, 1}; + const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1}; +#pragma Bss() + + tflite::testing::TestAveragePoolingQuantized( + kInput2Shape, // Input shape + kInput2Data, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 1, 1, // stride height, stride width + kGolden2Data, + kOutput2Shape, // Output shape + output_min, output_max, // output quantization range kTfLitePaddingValid, kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu) { - using tflite::testing::F2QS; - - int8_t output_data[2]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(-1.5, input_min, input_max), - F2QS(-6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(-3, input_min, input_max), - F2QS(-2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(0, output_min, output_max), F2QS(10, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu1) { - using tflite::testing::F2QS; - - int8_t output_data[2]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(-1.7, input_min, input_max), - F2QS(-6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(-3, input_min, input_max), - F2QS(-2, input_min, input_max), - F2QS(-10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(-1.0, output_min, output_max), F2QS(1.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu1, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu6) { - using tflite::testing::F2QS; - - int8_t output_data[8]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(-6, input_min, input_max), - F2QS(12, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(-3, input_min, input_max), - F2QS(-2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(0.0, output_min, output_max), F2QS(6.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, output_data); - - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(4.5, input_min, input_max), - F2QS(12, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(3, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(4.5, output_min, output_max), F2QS(6.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingSameStride1) { - using tflite::testing::F2QS; - - int8_t output_data[8]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 1; - int stride_height = 1; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(3, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - { - // Output values - F2QS(6, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(7, output_min, output_max), - F2QS(3, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(7, output_min, output_max), - }, - output_min, output_max, {4, 1, 2, 4, 1}, // Output shape - kTfLitePaddingSame, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingValidStride1) { +// Test group MAX 1 +TF_LITE_MICRO_TEST(SystemMaxPoolTestInt1) { using tflite::testing::F2QS; int8_t output_data[3]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; int filter_width = 2; int filter_height = 2; int stride_width = 1; int stride_height = 1; + + const int kInput1Shape[] = {4, 1, 2, 4, 1}; + const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput1Shape[] = {4, 1, 1, 3, 1}; + const int8_t kGolden1Data[] = {1, 1, 1}; + tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(3, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, + kInput1Shape, // Input shape + kInput1Data, input_min, input_max, filter_width, filter_height, stride_width, stride_height, - { - // Output values - F2QS(6, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(10, output_min, output_max), - }, - output_min, output_max, {4, 1, 1, 3, 1}, // Output shape + kGolden1Data, + output_min, output_max, kOutput1Shape, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(LocalMaxPoolTestInt1) { + using tflite::testing::F2QS; + + int8_t output_data[3]; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + +#pragma Bss(".Zdata") + const int kInput1Shape[] = {4, 1, 2, 4, 1}; + const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput1Shape[] = {4, 1, 1, 3, 1}; + const int8_t kGolden1Data[] = {1, 1, 1}; +#pragma Bss() + + tflite::testing::TestMaxPoolQuantized( + kInput1Shape, // Input shape + kInput1Data, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + kGolden1Data, + output_min, output_max, kOutput1Shape, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + + +// Test group MAX 2 +TF_LITE_MICRO_TEST(SystemMaxPoolTestInt2) { + using tflite::testing::F2QS; + + int8_t output_data[45]; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + + const int kInput2Shape[] = {4, 1, 6, 10, 1}; + const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput2Shape[] = {4, 1, 5, 9, 1}; + const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1}; + + tflite::testing::TestMaxPoolQuantized( + kInput2Shape, // Input shape + kInput2Data, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + kGolden2Data, + output_min, output_max, kOutput2Shape, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(LocalMaxPoolTestInt2) { + using tflite::testing::F2QS; + + int8_t output_data[45]; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + + #pragma Bss(".Zdata") + const int kInput2Shape[] = {4, 1, 6, 10, 1}; + const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput2Shape[] = {4, 1, 5, 9, 1}; + const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1}; +#pragma Bss() + + tflite::testing::TestMaxPoolQuantized( + kInput2Shape, // Input shape + kInput2Data, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + kGolden2Data, + output_min, output_max, kOutput2Shape, // Output shape kTfLitePaddingValid, kTfLiteActNone, output_data); } From 9996df4d7c3cbd8fadf342f27df4ae3d225b56b0 Mon Sep 17 00:00:00 2001 From: jacco Date: Wed, 29 Apr 2020 12:37:40 +0200 Subject: [PATCH 045/557] Small fix in mli slicing code for fully connect kernel --- tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 70d1fda4c2b..89eae356f51 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -158,7 +158,7 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node, mli_mov_cfg_for_copy(©_config); const int weight_out_dimension = 0; const int out_tensor_dimension = 1; - const int batch_dimension = 0; + const int input_size_dimension = 1; int slice_size = mli_weights.shape[weight_out_dimension]; /* allocate the local buffers, and compute the slice size */ @@ -192,13 +192,14 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node, mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - TensorSlicer in_slice(&mli_in, batch_dimension, 1); + // Slice the input over the batches (one at a time with the size of a complete input) + TensorSlicer in_slice(&mli_in, input_size_dimension, mli_in.shape[input_size_dimension]); /* output tensor is alreade sliced in the output size dimension. out_ch_slice.Sub() is the tensor for the amount of output size of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch */ - TensorSlicer out_slice(out_ch_slice.Sub(), batch_dimension, 1); + TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension, slice_size); /* setup the pointers to the local or remote tensor to make the code * inside the loop easier. */ From 21e7a9fffa8461f670abe50d2ef6a1724597d352 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 23 Apr 2020 14:09:21 +0300 Subject: [PATCH 046/557] Updated embARC MLI version for downloading + Package with pre-built libraries for various platforms --- .../micro/tools/make/ext_libs/arc_mli.inc | 26 +++++++++++-------- .../tools/make/targets/arc/arc_common.inc | 2 ++ .../tools/make/targets/arc_emsdp_makefile.inc | 3 +++ .../tools/make/third_party_downloads.inc | 8 +++--- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc index ee3cc8113c1..a95b4550417 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc @@ -21,19 +21,9 @@ ifeq ($(TARGET_ARCH), arc) # by passing 'no_arc_mli' tag (make -f TAGS=no_arc_mli ...) ifeq ($(filter no_arc_mli,$(ALL_TAGS)),) - ALL_TAGS += arc_mli -ifeq ($(PRE_COMPILED_MLI),true) - # TODO: Replace with proper arc_mli pre-builts. - $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else +ifeq ($(BUILD_ARC_MLI),true) MLI_LIB_DIR = arc_mli_$(basename $(TCF_FILE_NAME)) $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) @@ -44,6 +34,20 @@ else THIRD_PARTY_CC_HDRS += \ third_party/$(MLI_LIB_DIR)/LICENSE +else +ifneq ($(ARC_MLI_PRE_COMPILED_TARGET),) + MLI_LIB_DIR = arc_mli_package + $(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),)) + + MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include + MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/$(ARC_MLI_PRE_COMPILED_TARGET)/release/libmli.a + MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/$(ARC_MLI_PRE_COMPILED_TARGET)/release/libmli.a + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_LIB_DIR)/LICENSE +else +$(error Target for pre compiled ARC MLI library is not defined) +endif endif THIRD_PARTY_CC_HDRS += $(MLI_LIB) diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index 4a9a5ccdfc3..9462c3852f2 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -89,6 +89,8 @@ ifeq ($(ARC_TOOLCHAIN), mwdt) LCF_FILE ?= + BUILD_ARC_MLI ?= true + # The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), # this variable is used later to add the option to the linker/compiler flags. # This condition also handles the case when the user/makefile specifies diff --git a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc index a84dd15e4e8..b81bcea0eb8 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc @@ -23,6 +23,9 @@ ifeq ($(TARGET), arc_emsdp) UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE)) + BUILD_ARC_MLI := false + ARC_MLI_PRE_COMPILED_TARGET := emsdp_em11d_em9d_dfss + include $(MAKEFILE_DIR)/targets/arc/arc_common.inc ARC_EXTRA_APP_SETTINGS = \ diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index ce24ba29542..db420b7fd1b 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -71,11 +71,11 @@ PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab" PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip" PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" -EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" -EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/7026ad09bb7f967324eb29e069f776bc44a08886.zip" +EMBARC_MLI_MD5 := "7eebd730db79c6834399f87e509115fb" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/4b6c6eed65395dced1564006be8188781af16035.zip" -EMBARC_MLI_MD5 := "47167553c17ff8c7cd59fb1afb90c304" +EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC1/embARC_MLI_package.zip" +EMBARC_MLI_PRE_COMPILED_MD5 := "b85b8b89446757735342795367e37d22" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From f9b6799aadacfc19032994bbb1c4eba67e53c598 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 24 Apr 2020 13:31:42 +0300 Subject: [PATCH 047/557] Fixes in project generation for ARC specific projects --- tensorflow/lite/micro/tools/make/helper_functions.inc | 2 ++ .../lite/micro/tools/make/targets/arc/arc_common.inc | 2 +- .../lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf | 4 ++-- .../lite/micro/tools/make/targets/arc_emsdp_makefile.inc | 7 +++++-- tensorflow/lite/micro/tools/make/targets/arc_makefile.inc | 2 ++ .../lite/micro/tools/make/templates/arc/README_ARC.md.tpl | 2 ++ .../micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl | 2 ++ 7 files changed, 16 insertions(+), 5 deletions(-) create mode 100644 tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl create mode 100644 tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 8d321d42490..1cf9afa8794 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -150,6 +150,8 @@ $(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_ sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_DEBUG_CMD)#g' | \ sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@ +$(PRJDIR)$(3)/$(1)/%: tensorflow/lite/micro/tools/make/templates/arc/%.tpl + @cp $$< $$@ $(foreach var,$(ARC_TARGET_FILES_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var)))) diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index 9462c3852f2..596f219d3d1 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -105,7 +105,7 @@ endif PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config - PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS += -Hnocopyr -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections # Use compact CRT. It requires pre-defined heap size PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf index d17c807e250..c13dea5c6a0 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -28,11 +28,11 @@ SECTIONS { .text? : { *('.text$crt*') } * (TEXT): {} * (LIT): {} - } > ICCM0 + } > SRAM GROUP BLOCK(4): { .Zdata? : {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32K): {} .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} } > DCCM diff --git a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc index b81bcea0eb8..211437bd9f4 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc @@ -54,8 +54,11 @@ ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(UBOOT_FILE))),) ARC_TARGET_FILES_DIRS += $(dir $(UBOOT_FILE)) endif - # for default EMSD configuration we can use default em9d rt libs + MAKE_PROJECT_FILES := $(filter-out README_MAKE.md, $(MAKE_PROJECT_FILES)) README_ARC_EMSDP.md + + # for default EMSDP configuration we can use em9d_va rt libs # for better performance runtime should be built for emsdp configuration - PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio + # No hostlink library for smaller codesize purpose + PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio -Hhostlib= endif diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index d379eea86f1..9f5442b4c6c 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -33,6 +33,8 @@ endif include $(MAKEFILE_DIR)/targets/arc/arc_common.inc +MAKE_PROJECT_FILES := $(filter-out README_MAKE.md, $(MAKE_PROJECT_FILES)) README_ARC.md + endif # $(TARGET) endif # $(TARGET_ARCH)... diff --git a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl new file mode 100644 index 00000000000..b722b9c441d --- /dev/null +++ b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl @@ -0,0 +1,2 @@ +# Mock Project Readme for common ARC target + diff --git a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl new file mode 100644 index 00000000000..b3d9257f4d2 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl @@ -0,0 +1,2 @@ +# Mock Project Readme for ARC EMSDP target + From 0fece983977cbf914a3a413005b8de7648963735 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 24 Apr 2020 17:45:52 +0300 Subject: [PATCH 048/557] ARC EMSDP specific patch of generated projects for examples --- .../micro_speech/arc_emsdp/Makefile.inc | 22 +++++++ .../person_detection/arc_emsdp/Makefile.inc | 19 ++++++ .../person_detection/arc_emsdp/emsdp.lcf | 61 ++++++++++++++++++ .../arc_emsdp/Makefile.inc | 16 +++++ .../arc_emsdp/emsdp.lcf | 63 +++++++++++++++++++ 5 files changed, 181 insertions(+) create mode 100644 tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc create mode 100644 tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc create mode 100644 tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf diff --git a/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc new file mode 100644 index 00000000000..7fe4906cdf9 --- /dev/null +++ b/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc @@ -0,0 +1,22 @@ +ifeq ($(TARGET), arc_emsdp) + + MICRO_SPEECH_HDRS += \ + micro_speech_patch.txt + + MICRO_SPEECH_TEST_HDRS += \ + micro_speech_patch.txt + + MICRO_SPEECH_MOCK_HDRS += \ + micro_speech_patch.txt + +%/micro_speech_patch.txt: %/emsdp.lcf %/Makefile + @cp tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf $< + @echo emsdp.lcf > $@ + @sed -E -i 's#-Hheap=[^ ]*#\-Hheap=16K \-Hstack=16K#g' $(word 2, $^) + @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\ + CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\ + CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\ + $(word 2, $^) + @echo Makefile >> $@ + +endif diff --git a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc new file mode 100644 index 00000000000..cb7ba57ecb1 --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc @@ -0,0 +1,19 @@ +ifeq ($(TARGET), arc_emsdp) + + person_detection_HDRS += \ + person_detection_patch.txt + + person_detection_TEST_HDRS += \ + person_detection_patch.txt + + +%/person_detection_patch.txt: %/emsdp.lcf %/Makefile + @cp tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf $< + @echo emsdp.lcf > $@ + @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\ + CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\ + CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\ + $(word 2, $^) + @echo Makefile >> $@ + +endif diff --git a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf new file mode 100644 index 00000000000..34ed267652c --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf @@ -0,0 +1,61 @@ +# SYSTEM memory regions indicate where external memory might be located. +# The TCF has no specific knowledge of whether SYSTEM regions contain +# external memory or not. +# CCMWRAP memory regions indicate unusable portions of the address space +# due to CCM memory wrapping into upper addresses beyond its size + +MEMORY { + PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400 + SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000 + IVT : ORIGIN = 0x60000000, LENGTH = 0x400 + ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400) +# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 +# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 + XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000 +# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000 + YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000 +# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000 + } + +SECTIONS { + + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT + + GROUP BLOCK(4): { + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP BLOCK(4): { + .rodata_in_data? : {} + } > PSRAM + + GROUP BLOCK(4): { + .debug_log? : {} + } > SRAM + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .Zdata? : {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} + } > DCCM + + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + + GROUP BLOCK(4): { + .Ydata? : {} + } > YCCM +} + + diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc new file mode 100644 index 00000000000..94d73f903ed --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc @@ -0,0 +1,16 @@ +ifeq ($(TARGET), arc_emsdp) + + person_detection_HDRS += \ + person_detection_int8_patch.txt + + person_detection_TEST_HDRS += \ + person_detection_int8_patch.txt + + +%/person_detection_int8_patch.txt: %/emsdp.lcf %/Makefile + @cp tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf $< + @echo emsdp.lcf > $@ + @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= true#' $(word 2, $^) + @echo Makefile > $@ + +endif diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf new file mode 100644 index 00000000000..98b7e1d911f --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf @@ -0,0 +1,63 @@ +# SYSTEM memory regions indicate where external memory might be located. +# The TCF has no specific knowledge of whether SYSTEM regions contain +# external memory or not. +# CCMWRAP memory regions indicate unusable portions of the address space +# due to CCM memory wrapping into upper addresses beyond its size + +MEMORY { + PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400 + SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000 + IVT : ORIGIN = 0x60000000, LENGTH = 0x400 + ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400) +# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 +# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 + XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000 +# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000 + YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000 +# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000 + } + +SECTIONS { + + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT + + GROUP BLOCK(4): { + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP BLOCK(4): { + .rodata_in_data? : {} + } > PSRAM + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .debug_log? : {} + } > SRAM + + GROUP BLOCK(4): { +# TODO: Move tensor arena to DCCM when it will be possible +# .tensor_arena? : {} + .Zdata? : {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} + } > DCCM + + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + + GROUP BLOCK(4): { + .Ydata? : {} + } > YCCM +} + + From afef62b9764bc08289006e3a1ea60cffa9c55888 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 29 Apr 2020 14:42:14 +0300 Subject: [PATCH 049/557] ARC: Move shared lcf + Cleanup and comments --- .../micro_speech/arc_emsdp/Makefile.inc | 8 +- .../person_detection/arc_emsdp/Makefile.inc | 7 +- .../arc_emsdp/Makefile.inc | 5 + .../arc_emsdp/emsdp.lcf | 8 +- .../tools/make/targets/arc/emsdp/emsdp.lcf | 15 +- .../make/targets/arc/emsdp/emsdp_v2.lcf} | 7 +- .../tools/make/targets/arc/iotdk/iotdk.lcf | 47 - .../tools/make/targets/arc/iotdk/iotdk.tcf | 4621 ----------------- .../micro/tools/make/targets/arc/memory.lcf | 50 - 9 files changed, 39 insertions(+), 4729 deletions(-) rename tensorflow/lite/micro/{examples/person_detection/arc_emsdp/emsdp.lcf => tools/make/targets/arc/emsdp/emsdp_v2.lcf} (90%) delete mode 100644 tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf delete mode 100644 tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf delete mode 100644 tensorflow/lite/micro/tools/make/targets/arc/memory.lcf diff --git a/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc index 7fe4906cdf9..850263f0eb9 100644 --- a/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc @@ -1,5 +1,11 @@ ifeq ($(TARGET), arc_emsdp) +# Patch of arc make project to adjust it specifically for micro speech example. +# In particular: +# - Extend Heap and stack size for application needs +# - Use Linker command file with better usage of fast memory +# - In case project was generated with MLI usage, reduce scratch buffers. + MICRO_SPEECH_HDRS += \ micro_speech_patch.txt @@ -10,7 +16,7 @@ ifeq ($(TARGET), arc_emsdp) micro_speech_patch.txt %/micro_speech_patch.txt: %/emsdp.lcf %/Makefile - @cp tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf $< + @cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $< @echo emsdp.lcf > $@ @sed -E -i 's#-Hheap=[^ ]*#\-Hheap=16K \-Hstack=16K#g' $(word 2, $^) @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\ diff --git a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc index cb7ba57ecb1..29a09466e83 100644 --- a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc +++ b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc @@ -1,5 +1,10 @@ ifeq ($(TARGET), arc_emsdp) +# Patch of arc make project to adjust it specifically +# for person detection example. In particular: +# - Use Linker command file with better usage of fast memory +# - In case project was generated with MLI usage, reduce scratch buffers. + person_detection_HDRS += \ person_detection_patch.txt @@ -8,7 +13,7 @@ ifeq ($(TARGET), arc_emsdp) %/person_detection_patch.txt: %/emsdp.lcf %/Makefile - @cp tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf $< + @cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $< @echo emsdp.lcf > $@ @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\ CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\ diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc index 94d73f903ed..c00f9b89953 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc @@ -1,5 +1,10 @@ ifeq ($(TARGET), arc_emsdp) +# Patch of arc make project to adjust it specifically +# for experimental person detection example. In particular: +# - Use Linker command file with better usage of fast memory +# - Stripout TFLM reference code by default. + person_detection_HDRS += \ person_detection_int8_patch.txt diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf index 98b7e1d911f..2d7954217d3 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf @@ -1,6 +1,8 @@ -# SYSTEM memory regions indicate where external memory might be located. -# The TCF has no specific knowledge of whether SYSTEM regions contain -# external memory or not. +# Difference with common EMSDP LCF file (to reduce data access time): +# - move data from external PSRAM to on-chip memory +# - move text from SRAM to ICCM +# - TODO: Move tensor arena to DCCM to reduce data flow between fast and extrnal memory +# # CCMWRAP memory regions indicate unusable portions of the address space # due to CCM memory wrapping into upper addresses beyond its size diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf index c13dea5c6a0..b01b4835071 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -1,6 +1,15 @@ -# SYSTEM memory regions indicate where external memory might be located. -# The TCF has no specific knowledge of whether SYSTEM regions contain -# external memory or not. +# Common EMSDP LCF File for applications +# +# external SRAM memory is used for code, because some TFLM applications includes the whole +# set of supported kernels which doesn't fit to ICCM0. +# It could slow performance a bit. Smaller applications can use ICCM0 instead. +# +# External PSRAM is used for potentially big sections. In particular: +# - rodata_in data which typically includes protobuf with model. +# - other .data which typically includes tensor arena. +# +# stack and heap are kept in DCCM which is the closest memory to the core + # CCMWRAP memory regions indicate unusable portions of the address space # due to CCM memory wrapping into upper addresses beyond its size diff --git a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf similarity index 90% rename from tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf rename to tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf index 34ed267652c..a379fe69e21 100644 --- a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf @@ -1,6 +1,7 @@ -# SYSTEM memory regions indicate where external memory might be located. -# The TCF has no specific knowledge of whether SYSTEM regions contain -# external memory or not. +# Difference with common EMSDP LCF file (to reduce data access time): +# - move data from external PSRAM to DCCM +# - move text from SRAM to ICCM +# # CCMWRAP memory regions indicate unusable portions of the address space # due to CCM memory wrapping into upper addresses beyond its size diff --git a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf deleted file mode 100644 index da39ae911ff..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf +++ /dev/null @@ -1,47 +0,0 @@ -# SYSTEM memory regions indicate where external memory might be located. -# The TCF has no specific knowledge of whether SYSTEM regions contain -# external memory or not. -# CCMWRAP memory regions indicate unusable portions of the address space -# due to CCM memory wrapping into upper addresses beyond its size - -MEMORY { -# SYSTEM0 : ORIGIN = 0x00000000, LENGTH = 0x20000000 - ICCM0 : ORIGIN = 0x20000000, LENGTH = 0x00040000 -# CCMWRAP0: ORIGIN = 0x20040000, LENGTH = 0x0ffc0000 -# SYSTEM1 : ORIGIN = 0x30000000, LENGTH = 0x50000000 - DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 -# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 -# SYSTEM2 : ORIGIN = 0x90000000, LENGTH = 0x30000000 - XCCM : ORIGIN = 0xc0000000, LENGTH = 0x00008000 -# CCMWRAP2: ORIGIN = 0xc0008000, LENGTH = 0x0fff8000 -# SYSTEM3 : ORIGIN = 0xd0000000, LENGTH = 0x10000000 - YCCM : ORIGIN = 0xe0000000, LENGTH = 0x00008000 -# CCMWRAP3: ORIGIN = 0xe0008000, LENGTH = 0x0fff8000 -# SYSTEM4 : ORIGIN = 0xf0000000, LENGTH = 0x10000000 - } -SECTIONS { - GROUP: { - .text? : { *('.text$crt*') } - * (TEXT): {} - * (LIT): {} - } > ICCM0 - - GROUP: { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32768): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > DCCM - GROUP: { - .Xdata? : {} - } > XCCM - GROUP: { - .Ydata? : {} - } > YCCM - GROUP BIND(0x0): { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:684): {} = FILL(0xa5a5a5a5,4) - } - } diff --git a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf deleted file mode 100644 index 004215a2f6a..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf +++ /dev/null @@ -1,4621 +0,0 @@ - - - - - - - - - - - - - - - - - - - - 10*2) -# -# The speed of simulation can be greatly increased by using a faster JTAG clock, but a dependency will warn if it exceeds 1/2 of the cpu clock. -# --jtag_tclk 4 - -# execution_trace_level --- -# This traces committed instructions as they execute, and gathers statistics -# visible in the debugger for counting instructions & cycle delays. -# At the "stats" level ony the statistics are gathered and no trace is printed. -# "file" is equivalent to "full", but the results go to a trace .txt file instead. -# --execution_trace_level stats - -# generate_ipxact --- -# Generate ipxact.xml file describing the CPUisle or archipelago frontier -# --generate_ipxact false - -# ipxact_relative_path_names --- -# Use relative path names for Verilog files in the ipxact. -# Otherwise, absolute path names are used. -# --ipxact_relative_path_names true - -# optional_encryption --- -# When selected, encrypted RTL output is generated. -# --optional_encryption false - -# ignore_encrypt_license --- -# When selected, pretend the encryption license is missing. For testing. -# --ignore_encrypt_license false - -# ignore_clear_license --- -# When selected, pretend the cleartest license is missing. For testing. -# --ignore_clear_license false - - -######## Tool Configuration --- cgen.1_0 ######## - -# Create Tool Configuration --create cgen.1_0 "System.Tool Configuration" - -# mwdt_version --- Selects the MetaWare version to be used with the TCF file. -# Change from the default to an older or newer toolset version if you want the TCF file to be used with an older or newer version of the MetaWare tools. --mwdt_version K-2015.09 - -# code_base_addr --- -# The base address to assign to the executable code segment in the linker command file when there is no ICCM in the build. This value is ignored when there is an ICCM. -# --code_base_addr 0 - -# data_base_addr --- -# The base address to assign to the data segment in the linker command file when the data is not being mapped to a DCCM. This value is ignored when the data segment is mapped to a DCCM, as in that case the base address of the DCCM memory is used. -# -# A value of 0xffffffff means that the data segment will not be mapped to any specific address. -# --data_base_addr 4294967295 - - -######## IO Software --- com.arc.software.dfss.sw_io.1_0 ######## - -# Create IO Software --create com.arc.software.dfss.sw_io.1_0 "System.IO Software" - -# sw_io --- Command line option for Software element 'IO Software' --sw_io true - - -######## DSP Software --- com.arc.software.dfss.sw_dsp.1_0 ######## - -# Create DSP Software --create com.arc.software.dfss.sw_dsp.1_0 "System.DSP Software" - -# sw_dsp --- Command line option for Software element 'DSP Software' --sw_dsp true - - -######## Infrastructure Software --- com.arc.software.dfss.sw_infra.1_0 ######## - -# Create Infrastructure Software --create com.arc.software.dfss.sw_infra.1_0 "System.Infrastructure Software" - -# sw_infra --- Command line option for Software element 'Infrastructure Software' --sw_infra true - - -######## CPUisle --- com.arc.hardware.CPU_isle.1_0 ######## - -# Create CPUisle --create com.arc.hardware.CPU_isle.1_0 System.CPUisle - -# unique_name --- verilog module modifier prefix --unique_name "" - -# ArcNum --- The processor number as read back in the ARCNUM field of the IDENTITY register. --arc_num 1 - -# instances --- -# The number of instantiations of this core. -# --instances 1 - -# CPUFloorplan --- Floorplan giving relative placement of the RAMs for the given configuration of ARCv2HS or ARCv2EM in this CPUisle --cpu_floorplan em9d_xyccm - -# userCPUFloorplanPath --- Pathname of user floorplan for the CPU when using a hierarchical implementation --usercpufloorplan_path "" - -# pinLocationConstraintsFile --- Pathname+filename of the physical pin location constraints file or just "side1" (all pins on l.h.s) or "side2" (pins on top only) or "side3" (pins on r.h.s. only) or "side4" (pins on bottom only) to get a template file generated --pin_location_constraints_file "" - - -######## ARCv2EM --- com.arc.hardware.ARCv2EM.1_0 ######## - -# Create ARCv2EM --create com.arc.hardware.ARCv2EM.1_0 System.CPUisle.ARCv2EM - -# arcv2em --- Description to follow --arcv2em true - -# def_div2ref --- This specifies the clock division factor at reset. It is used for mss clock controller to generate core clock, and the value N means core is running at (1/N) x ref_clk. --def_div2ref 1 - -# addr_size --- This defines the address bus width (in bits). --addr_size 32 - -# pc_size --- This defines the program counter (in bits). --pc_size 32 - -# lpc_size --- This defines the size of the loop counter (in bits). --lpc_size 32 - -# halt_on_reset --- This defines whether the core is halted initially on reset. --halt_on_reset true - -# byte_order --- This defines the endianness of the core. --byte_order little - -# code_density_option --- This reduces the size of program memory by adding instructions that condense commonly used instruction patterns with some marginal increase in processor gate count. The added instructions are ENTER_S, LEAVE_S, JLI_S, BI, BIH. --code_density_option true - -# bitscan_option --- This adds instructions for efficient search of bits within a 32 bit word, including normalize (NORM, NORMH, NORMW) and find first or last set bit (FFS, FLS) instructions. --bitscan_option true - -# shift_option --- The Shift ISA option adds variable and multi-length shift rotation instructions: (0) No shift/rotation instructions (1) ASR16, ASR8, LSR8, LSL8, ROL8, ROR8 (2) ASRM, ASLM, LSRM, RORM (3) ASR16, ASR8, LSR8, LSL8, ROL8, ROR8, ASRM, ASLM, LSRM, RORM --shift_option 3 - -# swap_option --- This adds two instructions used to swap half-words or bytes in a 32b word. Useful for converting between little to big endianess and vice-versa. --swap_option true - -# div_rem_option --- The DIV/REM option adds non-blocking multi-cycle implementation of integer divide/remainder functions. Added instructions are DIV, DIVU (integer divide), REM and REMU (integer divide remainder).radix2 takes 33 cycles. radix4_enhanced takes 3 to 19 cycles per operation. --div_rem_option none - -# mpy_option --- The Multiplier ISA option allows selection between several multiplier configurations to tradeoff performance with silicon area. -# For select multiply options, when the DIV/REM option is also selected, some datapath resources will be shared between the multiply and divide pipeline to minimize total area. -# -# Cycle count (16-bit, lower 32-bit or upper 32-bit) for the different configurations is as follows: -#

-# 
-# option  16/L32/U32  Instructions
-# ------  ----------  ---------------------
-#       
-# none	  -/-/-     None
-# wlh1	  1/1/1     MPYW/U, MPY/U, MPYH/U
-# wlh2	  2/2/2     MPYW/U, MPY/U, MPYH/U
-# wlh3	  2/3/3     MPYW/U, MPY/U, MPYH/U
-# wlh4	  2/4/5     MPYW/U, MPY/U, MPYH/U
-# wlh5	  5/9/9     MPYW/U, MPY/U, MPYH/U
-# 
-# --mpy_option none - -# code_protection --- The ARC EM architecture divides the memory into 16 regions, which can be protected individually. This feature adds a 16-bit input to the processor core, one bit per region. When the protect bit is set, the processor disables any load or store to the corresponding region. An attempt to access a protected region raises an EV_ProtV exception. --code_protection true - -# stack_checking --- Stack checking is a mechanism for checking stack accesses and raising an exception when a stack overflow or underflow is detected. --stack_checking true - -# unaligned_option --- This enables unaligned loads and stores. --unaligned_option true - -# intvbase_preset --- This sets the interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE. --intvbase_preset 0 - -# rgf_impl --- This defines whether the register file is implemented using flip-flops, or with a hard macro. --rgf_impl flip_flops - -# rgf_num_regs --- This defines the size (in 32b register) of the processor register file. --rgf_num_regs 32 - -# rgf_wr_ports --- This defines the number of write ports on the register file. --rgf_wr_ports 2 - -# rgf_num_banks --- Dual register banks are useful if Fast IRQ has been configured, but may be selected even if not. --rgf_num_banks 2 - -# rgf_banked_regs --- This selects the number of registers that are replicated in the second register-file bank. --rgf_banked_regs 32 - -# turbo_boost --- This enables the Turbo Boost synthesis option. By enabling this option, the achievable clock frequency is increased, but at the cost of an additional cycle latency on branch instructions. --turbo_boost false - -# infer_alu_adder --- infer: datapath is described as behavioral code: A + B -# instantiate: datapath is instantiated as a detailed multi-stage code of a carry-lookahead adder. It is generally preferable to use the infer option and add directives for your target synthesizer. --infer_alu_adder infer - -# infer_mpy_wtree --- infer: datapath is described as behavioral code: A * B (applies to only wlh3, wlh4 and wlh5 designs) -# instantiate: datapath is instantiated as a detailed multi-stage code of a Wallace Tree multiplier It is generally preferable to use the infer option and add directives for your target synthesizer. --infer_mpy_wtree instantiate - -# power_domains --- Adds three separate power domains to the core, and propagates power-gate control signals to the top level of the core. Also generates UPF constraints and commands in the low-power scripts --power_domains true - -# dvfs --- Adds logic to the core to allow dynamic controlling of voltage and frequency and propagates the associated control signals to the top level of core --dvfs true - -# voltage_domains --- Creates a voltage domain split between RAM and std cell parts to support Ultra Low Voltage on cells and generates UPF constraints --voltage_domains false - -# mem_bus_option --- The core supports three bus protocols for accessing external memory: AHB & AHB-Lite. AHB-Lite-single means instruction fetch and data access share a single AHB-Lite port. AHB-Lite-dual means separate AHB-Lite port for each initiator. --mem_bus_option AHB-Lite-dual - -# mem_bus_reg_interface --- Specifies whether the memory bus interface is registered. --mem_bus_reg_interface true - -# dmi_burst_option --- This will enable high-throughput burst support on the DMI slave interfaces. By enabling this option, the peak DMI read throughput goes from 1 word per 3 cycles to N words per N+2 cycles, in which N is the AHB burst lengthDMI write throughput goes from 1 word per 3 cycles to 1 word per cycle. --dmi_burst_option false - -# has_dmp_peripheral --- This option enables the redirection of load/store accesses to one segment (1/16) of the addressable space to a dedicated peripheral bus. This offers high system integration and reduces overall system cost. --has_dmp_peripheral false - -# per_bus_option --- The core supports one bus protocol for accessing the peripheral space, when enabled: AHB-Lite. --per_bus_option AHB-Lite - -# per_bus_reg_interface --- Specifies whether the peripheral bus interface is registered. --per_bus_reg_interface false - -# clock_gating --- This enables the insertion of architectural clock gate elements in the design. By enabling this option, the clocks to various parts of the design will be disabled when the logic they drive is not in use to save power. --clock_gating true - -# byte_parity --- If parity protection on the CCMs is configured, this option is used to enable parity protection on a per-byte basis. Otherwise, parity will be per word basis --byte_parity false - -# prot_pipelined --- Check the box if CCM memories are configured for ECC, and you want single-bit errors to be corrected, written back to memory, and re-fetched. When unchecked, single bit errors are corrected when read from memory, but the offending memory location itself is not corrected with a writeback --prot_pipelined false - -# cct_test_ena --- When ECC is configured, this option enables automatic generation of error conditions in relevant testbench memories to exercise error detection and correction features --cct_test_ena false - - -######## AGU --- com.arc.hardware.AGU.1_0 ######## - -# Create AGU --create com.arc.hardware.AGU.1_0 System.CPUisle.ARCv2EM.AGU - -# agu_size --- Predefined configurations of modifiers, address -# pointers and offset registers -#
-# 
-#         address     address                     
-#         pointers    offset regs      modifiers  
-#        ----------- --------------- ------------ 
-# small:     4           2                 4      
-# medium:    8           4                 12     
-# large:     12          8                 24     
-# 
-# --agu_size small - -# agu_accord --- Enable the accordion stage if operating frequency is critical --agu_accord true - -# agu_wb_depth --- Write buffer depth --agu_wb_depth 2 - - -######## DSP --- com.arc.hardware.DSP.1_0 ######## - -# Create DSP --create com.arc.hardware.DSP.1_0 System.CPUisle.ARCv2EM.DSP - -# dsp_complex --- Enable/disable support for single cycle 16b+16b complex instructions and butterfly operations, else 2-cycle complex instructions only without butterfly support --dsp_complex true - -# dsp_itu --- Enable/disable support for ITU bit-accurate 1 bit fractional shift before accumulation, else 1-bit fractional shift result after accumulation only --dsp_itu true - -# dsp_divsqrt --- Enable/disable support for divide and square root operations: DIV(U), REM(U), SQRT --dsp_divsqrt radix2 - -# dsp_accshift --- Select support for accumulator shift operations: no supported, limited shift support only or full shift support and convergent rounding --dsp_accshift full - -# dsp_impl --- The datapath components may be inferred from Verilog for better area or optimized using carry-save components for better timing --dsp_impl optimized - - -######## Interrupt Controller --- com.arc.hardware.Interrupt_Controller.1_0 ######## - -# Create Interrupt Controller --create com.arc.hardware.Interrupt_Controller.1_0 "System.CPUisle.ARCv2EM.Interrupt Controller" - -# number_of_interrupts --- This is the total number of interrupts available to the core. Some interrupts are allocated statically to a specific interrupt line (for example, timer interrupts). For more information on Interrupt and register-file options, see DesignWare ARCv2 ISA Programmers Reference Manual. --number_of_interrupts 95 - -# number_of_levels --- Priority levels in the interrupt controller. --number_of_levels 4 - -# external_interrupts --- This is the total number of interrupt pins available for external system components. This parameter must be less than the total number of interrupts. --external_interrupts 60 - -# firq_option --- This enables the fast-interrupts option, (priority level 0 interrupts), which uses an alternate register bank (if configured) instead of saving the context to memory. --firq_option true - - -######## Timer 0 --- com.arc.hardware.Timer_0.1_0 ######## - -# Create Timer 0 --create com.arc.hardware.Timer_0.1_0 "System.CPUisle.ARCv2EM.Timer 0" - -# timer_0_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 0. --timer_0_int_level 1 - - -######## Timer 1 --- com.arc.hardware.Timer_1.1_0 ######## - -# Create Timer 1 --create com.arc.hardware.Timer_1.1_0 "System.CPUisle.ARCv2EM.Timer 1" - -# timer_1_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 1. --timer_1_int_level 0 - - -######## Watchdog Timer --- com.arc.hardware.Watchdog_Timer.1_0 ######## - -# Create Watchdog Timer --create com.arc.hardware.Watchdog_Timer.1_0 "System.CPUisle.ARCv2EM.Watchdog Timer" - -# watchdog_size --- Specifies the bit width of the internal counter used within the timer. --watchdog_size 16 - -# watchdog_clk --- Specifies whether the timer should be driven from a separate clock. --watchdog_clk true - - -######## Data Memory Initiator --- com.arc.hardware.Data_Memory_Initiator.1_0 ######## - -# Create Data Memory Initiator --create com.arc.hardware.Data_Memory_Initiator.1_0 "System.CPUisle.ARCv2EM.Data Memory Initiator" - -######## Instruction Fetch Queue --- com.arc.hardware.Instruction_Fetch_Queue.1_0 ######## - -# Create Instruction Fetch Queue --create com.arc.hardware.Instruction_Fetch_Queue.1_0 "System.CPUisle.ARCv2EM.Instruction Fetch Queue" - -# ifqueue_size --- This defines the number of entires in the Instruction Fetch Queue. --ifqueue_size 4 - -# ifqueue_burst_size --- This sets the burst size for bus data transfers (in 32-bit words). It cannot exceed the number of entries. --ifqueue_burst_size 2 - - -######## DCCM --- com.arc.hardware.DCCM.1_0 ######## - -# Create DCCM --create com.arc.hardware.DCCM.1_0 System.CPUisle.ARCv2EM.DCCM - -# dccm_size --- This defines the size of the Data Closely Coupled Memory (DCCM) in bytes --dccm_size 131072 - -# dccm_base --- Sets the initial memory region assignment for DCCM --dccm_base 8 - -# dccm_interleave --- Split DCCM into even/odd memory banks. --dccm_interleave false - -# dccm_prot --- Specifies the type of protection built for the DCCM. --dccm_prot None - -# dccm_prot_level --- Specifies the level protection. --dccm_prot_level Data_Only - -# dccm_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the DCCM --dccm_prot_exceptions true - -# dccm_dmi --- This enables external access through a DMI (direct memory interface) port. --dccm_dmi true - - -######## ICCM0 --- com.arc.hardware.ICCM0.1_0 ######## - -# Create ICCM0 --create com.arc.hardware.ICCM0.1_0 System.CPUisle.ARCv2EM.ICCM0 - -# iccm0_size --- This defines the size of ICCM0 in bytes.This ICCM has 0 wait states. --iccm0_size 262144 - -# iccm0_base --- Sets the initial memory region assignment for ICCM0 --iccm0_base 2 - -# iccm0_wide --- Creates ICCM0 as 64b memory to reduce accesses. --iccm0_wide true - -# iccm0_prot --- Specifies the type of protection built for ICCM0. --iccm0_prot None - -# iccm0_prot_level --- Specifies the level of protection. --iccm0_prot_level Data_Only - -# iccm0_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the ICCM0 --iccm0_prot_exceptions true - -# iccm0_dmi --- This enables external access through a DMI (direct memory interface) port. --iccm0_dmi true - - -######## XY --- com.arc.hardware.XY.1_0 ######## - -# Create XY --create com.arc.hardware.XY.1_0 System.CPUisle.ARCv2EM.XY - -# xy_config --- XY memory configuration: -# One memory: DCCM only. -# Two memories: DCCM + Y. -# Three memories: DCCM + X + Y. --xy_config dccm_x_y - -# xy_size --- Size of X and Y memories if included. -# X and Y memories both have the same configured size. --xy_size 32768 - -# xy_interleave --- Split XY memories into odd/even instances to enable single cycle unaligned access. --xy_interleave true - -# xy_x_base --- Base region for X memory. All accesses to this region will initiate a transfer on the X memory. --xy_x_base 12 - -# xy_y_base --- Base region for Y memory. All accesses to this region will initiate a transfer on the Y memory. --xy_y_base 14 - - -######## DMA Controller --- com.arc.hardware.DMA_Controller.1_0 ######## - -# Create DMA Controller --create com.arc.hardware.DMA_Controller.1_0 "System.CPUisle.ARCv2EM.DMA Controller" - -# dmac_channels --- This options specifies the number of DMA channels implemented in the DMA controller --dmac_channels 16 - -# dmac_fifo_depth --- This option specifies the DMA transfer FIFO depth in 32b words. --dmac_fifo_depth 4 - -# dmac_int_config --- None: the DMA controller cannot raise an interrupt -# Single-External: single done and single error interrupt signal for all DMA channels, and the interrupt signals are routed to a port at the top of the EM logical hierarchy -# Multiple-External: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are routed to ports at the top of the EM logical hierarchy -# Single-Internal: single done and single error interrupt signals for all DMA channels, and the interrupt signals are internal to the EM core -# Multiple-Internal: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are internal to the EM core --dmac_int_config Multiple-Internal - -# dmac_registers --- This option defines the number of DMA channels with their registers located in auxiliary space. --dmac_registers 16 - -# dmac_mem_if --- This option specifies whether the DMA controller system memory interface is integrated into the existing EM system memory interfaces or has its own interface. --dmac_mem_if separate - - -######## JTAG Interface --- com.arc.hardware.JTAG_Interface.1_0 ######## - -# Create JTAG Interface --create com.arc.hardware.JTAG_Interface.1_0 "System.CPUisle.ARCv2EM.JTAG Interface" - -######## Debug Interface --- com.arc.hardware.Debug_Interface.1_0 ######## - -# Create Debug Interface --create com.arc.hardware.Debug_Interface.1_0 "System.CPUisle.ARCv2EM.Debug Interface" - -######## Actionpoints --- com.arc.hardware.Actionpoints.1_0 ######## - -# Create Actionpoints --create com.arc.hardware.Actionpoints.1_0 System.CPUisle.ARCv2EM.Actionpoints - -# num_actionpoints --- This is the number of trigger events available. --num_actionpoints 8 - -# aps_feature --- Selects Actionpoint feature set --aps_feature min - - -######## SmaRT --- com.arc.hardware.SmaRT.1_0 ######## - -# Create SmaRT --create com.arc.hardware.SmaRT.1_0 System.CPUisle.ARCv2EM.SmaRT - -# smart_stack_entries --- This specifies the number of entries in the trace buffer. --smart_stack_entries 64 - -# smart_implementation --- Flip-flop = FF-based design. Memory = memory-based design (provides better density for larger trace buffers). --smart_implementation memory - - -######## Memory Protection Unit --- com.arc.hardware.Memory_Protection_Unit.1_0 ######## - -# Create Memory Protection Unit --create com.arc.hardware.Memory_Protection_Unit.1_0 "System.CPUisle.ARCv2EM.Memory Protection Unit" - -# mpu_num_regions --- Number of configured memory regions. --mpu_num_regions 16 - -# mpu_32b --- Set the minimal region size to be 32 byte instead of 2KB. --mpu_32b false - - -######## Floating-point unit --- com.arc.hardware.Floating_point_unit.1_0 ######## - -# Create Floating-point unit --create com.arc.hardware.Floating_point_unit.1_0 "System.CPUisle.ARCv2EM.Floating-point unit" - -# fpu_dp_assist --- This enables double-precision acceleration instructions. --fpu_dp_assist true - -# fpu_fma_option --- This enables the fused multiply-add & multiply-subtract instructions. --fpu_fma_option true - -# fpu_mas_cycles --- Make mul/add/sub multicycle to achieve a higher clock speed. --fpu_mas_cycles 2 - -# fpu_div_option --- This enables divide & square-root acceleration --fpu_div_option true - -# fpu_div_cycles --- "inferred" option infers DSP datapath elements from verilog operators for better area and "optimized" option selects hardware for better timing --fpu_div_cycles 17 - - -######## Performance Monitor --- com.arc.hardware.Performance_Monitor.1_0 ######## - -# Create Performance Monitor --create com.arc.hardware.Performance_Monitor.1_0 "System.CPUisle.ARCv2EM.Performance Monitor" - -# pct_counters --- Number of counters for performance monitoring. --pct_counters 8 - - -######## dsp_trig --- com.arc.hardware.dfss.dsp_trig.1_0 ######## - -# Create dsp_trig --create com.arc.hardware.dfss.dsp_trig.1_0 System.CPUisle.ARCv2EM.dsp_trig - -# dsp_trig --- Command line option for EIA extension component 'dsp_trig'. --dsp_trig true - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_4b0 --- com.arc.hardware.dfss.io_gpio_4b0.1_0 ######## - -# Create io_gpio_4b0 --create com.arc.hardware.dfss.io_gpio_4b0.1_0 System.CPUisle.ARCv2EM.io_gpio_4b0 - -# io_gpio_4b0 --- Command line option for EIA extension component 'io_gpio_4b0'. --io_gpio_4b0 true - -# io_gpio_4b0_debounce --- Selects the inclusion of Debounce logic --io_gpio_4b0_debounce 1 - -# io_gpio_4b0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_4b0_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_4b1 --- com.arc.hardware.dfss.io_gpio_4b1.1_0 ######## - -# Create io_gpio_4b1 --create com.arc.hardware.dfss.io_gpio_4b1.1_0 System.CPUisle.ARCv2EM.io_gpio_4b1 - -# io_gpio_4b1 --- Command line option for EIA extension component 'io_gpio_4b1'. --io_gpio_4b1 true - -# io_gpio_4b1_debounce --- Selects the inclusion of Debounce logic --io_gpio_4b1_debounce 1 - -# io_gpio_4b1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_4b1_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_4b2 --- com.arc.hardware.dfss.io_gpio_4b2.1_0 ######## - -# Create io_gpio_4b2 --create com.arc.hardware.dfss.io_gpio_4b2.1_0 System.CPUisle.ARCv2EM.io_gpio_4b2 - -# io_gpio_4b2 --- Command line option for EIA extension component 'io_gpio_4b2'. --io_gpio_4b2 true - -# io_gpio_4b2_debounce --- Selects the inclusion of Debounce logic --io_gpio_4b2_debounce 1 - -# io_gpio_4b2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_4b2_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_8b0 --- com.arc.hardware.dfss.io_gpio_8b0.1_0 ######## - -# Create io_gpio_8b0 --create com.arc.hardware.dfss.io_gpio_8b0.1_0 System.CPUisle.ARCv2EM.io_gpio_8b0 - -# io_gpio_8b0 --- Command line option for EIA extension component 'io_gpio_8b0'. --io_gpio_8b0 true - -# io_gpio_8b0_debounce --- Selects the inclusion of Debounce logic --io_gpio_8b0_debounce 1 - -# io_gpio_8b0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_8b0_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_8b1 --- com.arc.hardware.dfss.io_gpio_8b1.1_0 ######## - -# Create io_gpio_8b1 --create com.arc.hardware.dfss.io_gpio_8b1.1_0 System.CPUisle.ARCv2EM.io_gpio_8b1 - -# io_gpio_8b1 --- Command line option for EIA extension component 'io_gpio_8b1'. --io_gpio_8b1 true - -# io_gpio_8b1_debounce --- Selects the inclusion of Debounce logic --io_gpio_8b1_debounce 1 - -# io_gpio_8b1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_8b1_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_8b2 --- com.arc.hardware.dfss.io_gpio_8b2.1_0 ######## - -# Create io_gpio_8b2 --create com.arc.hardware.dfss.io_gpio_8b2.1_0 System.CPUisle.ARCv2EM.io_gpio_8b2 - -# io_gpio_8b2 --- Command line option for EIA extension component 'io_gpio_8b2'. --io_gpio_8b2 true - -# io_gpio_8b2_debounce --- Selects the inclusion of Debounce logic --io_gpio_8b2_debounce 1 - -# io_gpio_8b2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_8b2_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_8b3 --- com.arc.hardware.dfss.io_gpio_8b3.1_0 ######## - -# Create io_gpio_8b3 --create com.arc.hardware.dfss.io_gpio_8b3.1_0 System.CPUisle.ARCv2EM.io_gpio_8b3 - -# io_gpio_8b3 --- Command line option for EIA extension component 'io_gpio_8b3'. --io_gpio_8b3 true - -# io_gpio_8b3_debounce --- Selects the inclusion of Debounce logic --io_gpio_8b3_debounce 1 - -# io_gpio_8b3_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_8b3_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_mst0 --- com.arc.hardware.dfss.io_i2c_mst0.1_0 ######## - -# Create io_i2c_mst0 --create com.arc.hardware.dfss.io_i2c_mst0.1_0 System.CPUisle.ARCv2EM.io_i2c_mst0 - -# io_i2c_mst0 --- Command line option for APEX extension component 'io_i2c_mst0'. --io_i2c_mst0 true - -# io_i2c_mst0_fs --- RX/TX FIFO size --io_i2c_mst0_fs 16 - -# io_i2c_mst0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst0_dma_support None - -# io_i2c_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst0_cdc_included 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_mst1 --- com.arc.hardware.dfss.io_i2c_mst1.1_0 ######## - -# Create io_i2c_mst1 --create com.arc.hardware.dfss.io_i2c_mst1.1_0 System.CPUisle.ARCv2EM.io_i2c_mst1 - -# io_i2c_mst1 --- Command line option for APEX extension component 'io_i2c_mst1'. --io_i2c_mst1 true - -# io_i2c_mst1_fs --- RX/TX FIFO size --io_i2c_mst1_fs 16 - -# io_i2c_mst1_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst1_dma_support None - -# io_i2c_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst1_cdc_included 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_mst2 --- com.arc.hardware.dfss.io_i2c_mst2.1_0 ######## - -# Create io_i2c_mst2 --create com.arc.hardware.dfss.io_i2c_mst2.1_0 System.CPUisle.ARCv2EM.io_i2c_mst2 - -# io_i2c_mst2 --- Command line option for APEX extension component 'io_i2c_mst2'. --io_i2c_mst2 true - -# io_i2c_mst2_fs --- RX/TX FIFO size --io_i2c_mst2_fs 16 - -# io_i2c_mst2_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst2_dma_support None - -# io_i2c_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst2_cdc_included 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst0 --- com.arc.hardware.dfss.io_spi_mst0.1_0 ######## - -# Create io_spi_mst0 --create com.arc.hardware.dfss.io_spi_mst0.1_0 System.CPUisle.ARCv2EM.io_spi_mst0 - -# io_spi_mst0 --- Command line option for APEX extension component 'io_spi_mst0'. --io_spi_mst0 true - -# io_spi_mst0_fz --- RX/TX FIFO depth --io_spi_mst0_fs 16 - -# io_spi_mst0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst0_max_xfer_size 16 - -# io_spi_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst0_cdc_included 1 - -# io_spi_mst0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst0_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst1 --- com.arc.hardware.dfss.io_spi_mst1.1_0 ######## - -# Create io_spi_mst1 --create com.arc.hardware.dfss.io_spi_mst1.1_0 System.CPUisle.ARCv2EM.io_spi_mst1 - -# io_spi_mst1 --- Command line option for APEX extension component 'io_spi_mst1'. --io_spi_mst1 true - -# io_spi_mst1_fz --- RX/TX FIFO depth --io_spi_mst1_fs 16 - -# io_spi_mst1_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst1_max_xfer_size 16 - -# io_spi_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst1_cdc_included 1 - -# io_spi_mst1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst1_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst2 --- com.arc.hardware.dfss.io_spi_mst2.1_0 ######## - -# Create io_spi_mst2 --create com.arc.hardware.dfss.io_spi_mst2.1_0 System.CPUisle.ARCv2EM.io_spi_mst2 - -# io_spi_mst2 --- Command line option for APEX extension component 'io_spi_mst2'. --io_spi_mst2 true - -# io_spi_mst2_fz --- RX/TX FIFO depth --io_spi_mst2_fs 16 - -# io_spi_mst2_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst2_max_xfer_size 16 - -# io_spi_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst2_cdc_included 1 - -# io_spi_mst2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst2_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_slv0 --- com.arc.hardware.dfss.io_spi_slv0.1_0 ######## - -# Create io_spi_slv0 --create com.arc.hardware.dfss.io_spi_slv0.1_0 System.CPUisle.ARCv2EM.io_spi_slv0 - -# io_spi_slv0 --- Command line option for APEX extension component 'io_spi_slv0'. --io_spi_slv0 true - -# io_spi_slv0_fz --- RX/TX FIFO depth --io_spi_slv0_fs 16 - -# io_spi_slv0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_slv0_max_xfer_size 16 - -# io_spi_slv0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_slv0_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart0 --- com.arc.hardware.dfss.io_uart0.1_0 ######## - -# Create io_uart0 --create com.arc.hardware.dfss.io_uart0.1_0 System.CPUisle.ARCv2EM.io_uart0 - -# io_uart0 --- Command line option for EIA extension component 'io_uart0'. --io_uart0 true - -# io_uart0_fifo_mode --- Set the UART FIFO mode --io_uart0_fifo_mode 16 - -# io_uart0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart0_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart1 --- com.arc.hardware.dfss.io_uart1.1_0 ######## - -# Create io_uart1 --create com.arc.hardware.dfss.io_uart1.1_0 System.CPUisle.ARCv2EM.io_uart1 - -# io_uart1 --- Command line option for EIA extension component 'io_uart1'. --io_uart1 true - -# io_uart1_fifo_mode --- Set the UART FIFO mode --io_uart1_fifo_mode 16 - -# io_uart1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart1_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart2 --- com.arc.hardware.dfss.io_uart2.1_0 ######## - -# Create io_uart2 --create com.arc.hardware.dfss.io_uart2.1_0 System.CPUisle.ARCv2EM.io_uart2 - -# io_uart2 --- Command line option for EIA extension component 'io_uart2'. --io_uart2 true - -# io_uart2_fifo_mode --- Set the UART FIFO mode --io_uart2_fifo_mode 16 - -# io_uart2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart2_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart3 --- com.arc.hardware.dfss.io_uart3.1_0 ######## - -# Create io_uart3 --create com.arc.hardware.dfss.io_uart3.1_0 System.CPUisle.ARCv2EM.io_uart3 - -# io_uart3 --- Command line option for EIA extension component 'io_uart3'. --io_uart3 true - -# io_uart3_fifo_mode --- Set the UART FIFO mode --io_uart3_fifo_mode 16 - -# io_uart3_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart3_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_creg_mst0 --- com.arc.hardware.dfss.io_creg_mst0.1_0 ######## - -# Create io_creg_mst0 --create com.arc.hardware.dfss.io_creg_mst0.1_0 System.CPUisle.ARCv2EM.io_creg_mst0 - -# io_creg_mst0 --- Command line option for EIA extension component 'io_creg_mst0'. --io_creg_mst0 true - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_creg_slv0 --- com.arc.hardware.dfss.io_creg_slv0.1_0 ######## - -# Create io_creg_slv0 --create com.arc.hardware.dfss.io_creg_slv0.1_0 System.CPUisle.ARCv2EM.io_creg_slv0 - -# io_creg_slv0 --- Command line option for EIA extension component 'io_creg_slv0'. --io_creg_slv0 true - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## subsys_bcr --- com.arc.hardware.dfss.subsys_bcr.1_0 ######## - -# Create subsys_bcr --create com.arc.hardware.dfss.subsys_bcr.1_0 System.CPUisle.ARCv2EM.subsys_bcr - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## subsys_infra --- com.arc.hardware.dfss.subsys_infra.1_0 ######## - -# Create subsys_infra --create com.arc.hardware.dfss.subsys_infra.1_0 System.subsys_infra - -# subsys_infra --- Command line option for EIA glue logic. --subsys_infra true - -# internal_interrupt --- Connect the IO interrupts internally --internal_interrupt true - -# internal_dma_handshake --- Connect the DMA handshake signals internally --internal_dma_handshake true - - -######## ARConnect --- com.arc.hardware.ARConnect.1_0 ######## - -# Create ARConnect --create com.arc.hardware.ARConnect.1_0 System.ARConnect - -# mcip_def_div2ref --- This specifies the clock division factor at reset. It is used for mss clock controller to generate ARConnect clock, and the value N means ARConnect is running at (1/N) x ref_clk. --mcip_def_div2ref 1 - -# mcip_has_intrpt --- This specifies whether the Inter-core Interrupt Unit exists --mcip_has_intrpt false - -# mcip_has_sema --- This specifies whether the Inter-core Semaphore Unit exists --mcip_has_sema false - -# mcip_sema_num --- This specifies the number of semaphores in the Inter-core Semaphores Unit --mcip_sema_num 16 - -# mcip_has_msg_sram --- This specifies whether the Inter-core Message Unit exists --mcip_has_msg_sram false - -# mcip_msg_sram_size --- This specifies the bytes of SRAM in the Inter-core Message Unit --mcip_msg_sram_size 512 - -# mcip_msg_1cycle --- True: The access path to message SRAM is 1 clock cycle; False: The access path to message SRAM 1.5 cycles. Note: The 1.5 cycles path use clock negetive edge for SRAM, but can acheive higher frequency. No performance difference caused by the value of this option --mcip_msg_1cycle false - -# mcip_has_debug --- This specifies whether the Inter-core Debug Unit exists --mcip_has_debug false - -# mcip_has_grtc --- This specifies whether the Global Real-Time Counter Unit exists --mcip_has_grtc false - -# mcip_has_pmu --- This specifies whether the external Power Management Unit exists --mcip_has_pmu true - -# mcip_power_domains --- This specifies whether the ARConnect Power Domain Management Unit exists --mcip_power_domains true - -# mcip_llm_size --- This specifies the KBytes of SRAM in the Low Latency Memory Unit --mcip_llm_size 32 - -# mcip_llm_base --- This specifies the default memory region of Low Latency Memory Unit --mcip_llm_base 2 - -# mcip_llm_ecc --- This specifies the ECC mode of SRAM in Low Latency Memory Unit. none = No checking; parity = Parity only; SECDED = single-error correction and double-error detection (SECDED) --mcip_llm_ecc SECDED - -# mcip_idu_cirq_num --- This specifies the number of common interrupts supported by IDU --mcip_idu_cirq_num 4 - -# mcip_bsu_dbw --- This specifies the data bus width of Bus Slave Unit --mcip_bsu_dbw 64 - -# mcip_bsu_type --- This specifies the bus protocol of Bus Slave Unit --mcip_bsu_type AXI - - -]]> - - - - - - - - - - - - - - - ICCM0 - - GROUP: { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32768): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > DCCM - GROUP: { - .Xdata? : {} - } > XCCM - GROUP: { - .Ydata? : {} - } > YCCM - GROUP BIND(0x0): { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:684): {} = FILL(0xa5a5a5a5,4) - } - } - -]]> - - - - - - 0x07, sub_opcode => 0x1E , latency_cycles => 8) - -// User extension instruction - dsp_sin -extern long dsp_sin(long); -#pragma intrinsic(dsp_sin, opcode => 0x07, sub_opcode => 0x1F , latency_cycles => 8) - -// User extension instruction - dsp_tan -extern long dsp_tan(long); -#pragma intrinsic(dsp_tan, opcode => 0x07, sub_opcode => 0x22 , latency_cycles => 11) - -// User extension instruction - dsp_acos -extern long dsp_acos(long); -#pragma intrinsic(dsp_acos, opcode => 0x07, sub_opcode => 0x23 , latency_cycles => 31) - -// User extension instruction - dsp_asin -extern long dsp_asin(long); -#pragma intrinsic(dsp_asin, opcode => 0x07, sub_opcode => 0x24 , latency_cycles => 31) - -// User extension instruction - dsp_atan -extern long dsp_atan(long); -#pragma intrinsic(dsp_atan, opcode => 0x07, sub_opcode => 0x25 , latency_cycles => 13) - -// User extension instruction - dsp_sqrt -extern long dsp_sqrt(long); -#pragma intrinsic(dsp_sqrt, opcode => 0x07, sub_opcode => 0x20 , latency_cycles => 31) - -// User extension instruction - dsp_sqrt15 -extern long dsp_sqrt15(long); -#pragma intrinsic(dsp_sqrt15, opcode => 0x07, sub_opcode => 0x21 , latency_cycles => 15) - -#define APEX_COM_ARC_HARDWARE_DFSS_DSP_TRIG_PRESENT 1 -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B0_IO_GPIO_4B0_PRESENT 1 - -// User extension aux register io_gpio_4b0_debounce -#define AR_IO_GPIO_4B0_DEBOUNCE 0x80017c48 -#pragma Aux_register(0x80017c48, name=>"io_gpio_4b0_debounce") - -// User extension aux register io_gpio_4b0_clken -#define AR_IO_GPIO_4B0_CLKEN 0x80017c80 -#pragma Aux_register(0x80017c80, name=>"io_gpio_4b0_clken") - -// User extension aux register io_gpio_4b0_swporta_dr -#define AR_IO_GPIO_4B0_SWPORTA_DR 0x80017c00 -#pragma Aux_register(0x80017c00, name=>"io_gpio_4b0_swporta_dr") - -// User extension aux register io_gpio_4b0_swporta_ddr -#define AR_IO_GPIO_4B0_SWPORTA_DDR 0x80017c04 -#pragma Aux_register(0x80017c04, name=>"io_gpio_4b0_swporta_ddr") - -// User extension aux register io_gpio_4b0_inten -#define AR_IO_GPIO_4B0_INTEN 0x80017c30 -#pragma Aux_register(0x80017c30, name=>"io_gpio_4b0_inten") - -// User extension aux register io_gpio_4b0_intmask -#define AR_IO_GPIO_4B0_INTMASK 0x80017c34 -#pragma Aux_register(0x80017c34, name=>"io_gpio_4b0_intmask") - -// User extension aux register io_gpio_4b0_inttype_level -#define AR_IO_GPIO_4B0_INTTYPE_LEVEL 0x80017c38 -#pragma Aux_register(0x80017c38, name=>"io_gpio_4b0_inttype_level") - -// User extension aux register io_gpio_4b0_int_polarity -#define AR_IO_GPIO_4B0_INT_POLARITY 0x80017c3c -#pragma Aux_register(0x80017c3c, name=>"io_gpio_4b0_int_polarity") - -// User extension aux register io_gpio_4b0_intstatus -#define AR_IO_GPIO_4B0_INTSTATUS 0x80017c40 -#pragma Aux_register(0x80017c40, name=>"io_gpio_4b0_intstatus") - -// User extension aux register io_gpio_4b0_raw_intstatus -#define AR_IO_GPIO_4B0_RAW_INTSTATUS 0x80017c44 -#pragma Aux_register(0x80017c44, name=>"io_gpio_4b0_raw_intstatus") - -// User extension aux register io_gpio_4b0_porta_eoi -#define AR_IO_GPIO_4B0_PORTA_EOI 0x80017c4c -#pragma Aux_register(0x80017c4c, name=>"io_gpio_4b0_porta_eoi") - -// User extension aux register io_gpio_4b0_ext_porta -#define AR_IO_GPIO_4B0_EXT_PORTA 0x80017c50 -#pragma Aux_register(0x80017c50, name=>"io_gpio_4b0_ext_porta") - -// User extension aux register io_gpio_4b0_ls_sync -#define AR_IO_GPIO_4B0_LS_SYNC 0x80017c60 -#pragma Aux_register(0x80017c60, name=>"io_gpio_4b0_ls_sync") - -// User extension aux register io_gpio_4b0_int_bothedge -#define AR_IO_GPIO_4B0_INT_BOTHEDGE 0x80017c68 -#pragma Aux_register(0x80017c68, name=>"io_gpio_4b0_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B1_IO_GPIO_4B1_PRESENT 1 - -// User extension aux register io_gpio_4b1_debounce -#define AR_IO_GPIO_4B1_DEBOUNCE 0x80017d48 -#pragma Aux_register(0x80017d48, name=>"io_gpio_4b1_debounce") - -// User extension aux register io_gpio_4b1_clken -#define AR_IO_GPIO_4B1_CLKEN 0x80017d80 -#pragma Aux_register(0x80017d80, name=>"io_gpio_4b1_clken") - -// User extension aux register io_gpio_4b1_swporta_dr -#define AR_IO_GPIO_4B1_SWPORTA_DR 0x80017d00 -#pragma Aux_register(0x80017d00, name=>"io_gpio_4b1_swporta_dr") - -// User extension aux register io_gpio_4b1_swporta_ddr -#define AR_IO_GPIO_4B1_SWPORTA_DDR 0x80017d04 -#pragma Aux_register(0x80017d04, name=>"io_gpio_4b1_swporta_ddr") - -// User extension aux register io_gpio_4b1_inten -#define AR_IO_GPIO_4B1_INTEN 0x80017d30 -#pragma Aux_register(0x80017d30, name=>"io_gpio_4b1_inten") - -// User extension aux register io_gpio_4b1_intmask -#define AR_IO_GPIO_4B1_INTMASK 0x80017d34 -#pragma Aux_register(0x80017d34, name=>"io_gpio_4b1_intmask") - -// User extension aux register io_gpio_4b1_inttype_level -#define AR_IO_GPIO_4B1_INTTYPE_LEVEL 0x80017d38 -#pragma Aux_register(0x80017d38, name=>"io_gpio_4b1_inttype_level") - -// User extension aux register io_gpio_4b1_int_polarity -#define AR_IO_GPIO_4B1_INT_POLARITY 0x80017d3c -#pragma Aux_register(0x80017d3c, name=>"io_gpio_4b1_int_polarity") - -// User extension aux register io_gpio_4b1_intstatus -#define AR_IO_GPIO_4B1_INTSTATUS 0x80017d40 -#pragma Aux_register(0x80017d40, name=>"io_gpio_4b1_intstatus") - -// User extension aux register io_gpio_4b1_raw_intstatus -#define AR_IO_GPIO_4B1_RAW_INTSTATUS 0x80017d44 -#pragma Aux_register(0x80017d44, name=>"io_gpio_4b1_raw_intstatus") - -// User extension aux register io_gpio_4b1_porta_eoi -#define AR_IO_GPIO_4B1_PORTA_EOI 0x80017d4c -#pragma Aux_register(0x80017d4c, name=>"io_gpio_4b1_porta_eoi") - -// User extension aux register io_gpio_4b1_ext_porta -#define AR_IO_GPIO_4B1_EXT_PORTA 0x80017d50 -#pragma Aux_register(0x80017d50, name=>"io_gpio_4b1_ext_porta") - -// User extension aux register io_gpio_4b1_ls_sync -#define AR_IO_GPIO_4B1_LS_SYNC 0x80017d60 -#pragma Aux_register(0x80017d60, name=>"io_gpio_4b1_ls_sync") - -// User extension aux register io_gpio_4b1_int_bothedge -#define AR_IO_GPIO_4B1_INT_BOTHEDGE 0x80017d68 -#pragma Aux_register(0x80017d68, name=>"io_gpio_4b1_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B2_IO_GPIO_4B2_PRESENT 1 - -// User extension aux register io_gpio_4b2_debounce -#define AR_IO_GPIO_4B2_DEBOUNCE 0x80017e48 -#pragma Aux_register(0x80017e48, name=>"io_gpio_4b2_debounce") - -// User extension aux register io_gpio_4b2_clken -#define AR_IO_GPIO_4B2_CLKEN 0x80017e80 -#pragma Aux_register(0x80017e80, name=>"io_gpio_4b2_clken") - -// User extension aux register io_gpio_4b2_swporta_dr -#define AR_IO_GPIO_4B2_SWPORTA_DR 0x80017e00 -#pragma Aux_register(0x80017e00, name=>"io_gpio_4b2_swporta_dr") - -// User extension aux register io_gpio_4b2_swporta_ddr -#define AR_IO_GPIO_4B2_SWPORTA_DDR 0x80017e04 -#pragma Aux_register(0x80017e04, name=>"io_gpio_4b2_swporta_ddr") - -// User extension aux register io_gpio_4b2_inten -#define AR_IO_GPIO_4B2_INTEN 0x80017e30 -#pragma Aux_register(0x80017e30, name=>"io_gpio_4b2_inten") - -// User extension aux register io_gpio_4b2_intmask -#define AR_IO_GPIO_4B2_INTMASK 0x80017e34 -#pragma Aux_register(0x80017e34, name=>"io_gpio_4b2_intmask") - -// User extension aux register io_gpio_4b2_inttype_level -#define AR_IO_GPIO_4B2_INTTYPE_LEVEL 0x80017e38 -#pragma Aux_register(0x80017e38, name=>"io_gpio_4b2_inttype_level") - -// User extension aux register io_gpio_4b2_int_polarity -#define AR_IO_GPIO_4B2_INT_POLARITY 0x80017e3c -#pragma Aux_register(0x80017e3c, name=>"io_gpio_4b2_int_polarity") - -// User extension aux register io_gpio_4b2_intstatus -#define AR_IO_GPIO_4B2_INTSTATUS 0x80017e40 -#pragma Aux_register(0x80017e40, name=>"io_gpio_4b2_intstatus") - -// User extension aux register io_gpio_4b2_raw_intstatus -#define AR_IO_GPIO_4B2_RAW_INTSTATUS 0x80017e44 -#pragma Aux_register(0x80017e44, name=>"io_gpio_4b2_raw_intstatus") - -// User extension aux register io_gpio_4b2_porta_eoi -#define AR_IO_GPIO_4B2_PORTA_EOI 0x80017e4c -#pragma Aux_register(0x80017e4c, name=>"io_gpio_4b2_porta_eoi") - -// User extension aux register io_gpio_4b2_ext_porta -#define AR_IO_GPIO_4B2_EXT_PORTA 0x80017e50 -#pragma Aux_register(0x80017e50, name=>"io_gpio_4b2_ext_porta") - -// User extension aux register io_gpio_4b2_ls_sync -#define AR_IO_GPIO_4B2_LS_SYNC 0x80017e60 -#pragma Aux_register(0x80017e60, name=>"io_gpio_4b2_ls_sync") - -// User extension aux register io_gpio_4b2_int_bothedge -#define AR_IO_GPIO_4B2_INT_BOTHEDGE 0x80017e68 -#pragma Aux_register(0x80017e68, name=>"io_gpio_4b2_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B0_IO_GPIO_8B0_PRESENT 1 - -// User extension aux register io_gpio_8b0_debounce -#define AR_IO_GPIO_8B0_DEBOUNCE 0x80017848 -#pragma Aux_register(0x80017848, name=>"io_gpio_8b0_debounce") - -// User extension aux register io_gpio_8b0_clken -#define AR_IO_GPIO_8B0_CLKEN 0x80017880 -#pragma Aux_register(0x80017880, name=>"io_gpio_8b0_clken") - -// User extension aux register io_gpio_8b0_swporta_dr -#define AR_IO_GPIO_8B0_SWPORTA_DR 0x80017800 -#pragma Aux_register(0x80017800, name=>"io_gpio_8b0_swporta_dr") - -// User extension aux register io_gpio_8b0_swporta_ddr -#define AR_IO_GPIO_8B0_SWPORTA_DDR 0x80017804 -#pragma Aux_register(0x80017804, name=>"io_gpio_8b0_swporta_ddr") - -// User extension aux register io_gpio_8b0_inten -#define AR_IO_GPIO_8B0_INTEN 0x80017830 -#pragma Aux_register(0x80017830, name=>"io_gpio_8b0_inten") - -// User extension aux register io_gpio_8b0_intmask -#define AR_IO_GPIO_8B0_INTMASK 0x80017834 -#pragma Aux_register(0x80017834, name=>"io_gpio_8b0_intmask") - -// User extension aux register io_gpio_8b0_inttype_level -#define AR_IO_GPIO_8B0_INTTYPE_LEVEL 0x80017838 -#pragma Aux_register(0x80017838, name=>"io_gpio_8b0_inttype_level") - -// User extension aux register io_gpio_8b0_int_polarity -#define AR_IO_GPIO_8B0_INT_POLARITY 0x8001783c -#pragma Aux_register(0x8001783c, name=>"io_gpio_8b0_int_polarity") - -// User extension aux register io_gpio_8b0_intstatus -#define AR_IO_GPIO_8B0_INTSTATUS 0x80017840 -#pragma Aux_register(0x80017840, name=>"io_gpio_8b0_intstatus") - -// User extension aux register io_gpio_8b0_raw_intstatus -#define AR_IO_GPIO_8B0_RAW_INTSTATUS 0x80017844 -#pragma Aux_register(0x80017844, name=>"io_gpio_8b0_raw_intstatus") - -// User extension aux register io_gpio_8b0_porta_eoi -#define AR_IO_GPIO_8B0_PORTA_EOI 0x8001784c -#pragma Aux_register(0x8001784c, name=>"io_gpio_8b0_porta_eoi") - -// User extension aux register io_gpio_8b0_ext_porta -#define AR_IO_GPIO_8B0_EXT_PORTA 0x80017850 -#pragma Aux_register(0x80017850, name=>"io_gpio_8b0_ext_porta") - -// User extension aux register io_gpio_8b0_ls_sync -#define AR_IO_GPIO_8B0_LS_SYNC 0x80017860 -#pragma Aux_register(0x80017860, name=>"io_gpio_8b0_ls_sync") - -// User extension aux register io_gpio_8b0_int_bothedge -#define AR_IO_GPIO_8B0_INT_BOTHEDGE 0x80017868 -#pragma Aux_register(0x80017868, name=>"io_gpio_8b0_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B1_IO_GPIO_8B1_PRESENT 1 - -// User extension aux register io_gpio_8b1_debounce -#define AR_IO_GPIO_8B1_DEBOUNCE 0x80017948 -#pragma Aux_register(0x80017948, name=>"io_gpio_8b1_debounce") - -// User extension aux register io_gpio_8b1_clken -#define AR_IO_GPIO_8B1_CLKEN 0x80017980 -#pragma Aux_register(0x80017980, name=>"io_gpio_8b1_clken") - -// User extension aux register io_gpio_8b1_swporta_dr -#define AR_IO_GPIO_8B1_SWPORTA_DR 0x80017900 -#pragma Aux_register(0x80017900, name=>"io_gpio_8b1_swporta_dr") - -// User extension aux register io_gpio_8b1_swporta_ddr -#define AR_IO_GPIO_8B1_SWPORTA_DDR 0x80017904 -#pragma Aux_register(0x80017904, name=>"io_gpio_8b1_swporta_ddr") - -// User extension aux register io_gpio_8b1_inten -#define AR_IO_GPIO_8B1_INTEN 0x80017930 -#pragma Aux_register(0x80017930, name=>"io_gpio_8b1_inten") - -// User extension aux register io_gpio_8b1_intmask -#define AR_IO_GPIO_8B1_INTMASK 0x80017934 -#pragma Aux_register(0x80017934, name=>"io_gpio_8b1_intmask") - -// User extension aux register io_gpio_8b1_inttype_level -#define AR_IO_GPIO_8B1_INTTYPE_LEVEL 0x80017938 -#pragma Aux_register(0x80017938, name=>"io_gpio_8b1_inttype_level") - -// User extension aux register io_gpio_8b1_int_polarity -#define AR_IO_GPIO_8B1_INT_POLARITY 0x8001793c -#pragma Aux_register(0x8001793c, name=>"io_gpio_8b1_int_polarity") - -// User extension aux register io_gpio_8b1_intstatus -#define AR_IO_GPIO_8B1_INTSTATUS 0x80017940 -#pragma Aux_register(0x80017940, name=>"io_gpio_8b1_intstatus") - -// User extension aux register io_gpio_8b1_raw_intstatus -#define AR_IO_GPIO_8B1_RAW_INTSTATUS 0x80017944 -#pragma Aux_register(0x80017944, name=>"io_gpio_8b1_raw_intstatus") - -// User extension aux register io_gpio_8b1_porta_eoi -#define AR_IO_GPIO_8B1_PORTA_EOI 0x8001794c -#pragma Aux_register(0x8001794c, name=>"io_gpio_8b1_porta_eoi") - -// User extension aux register io_gpio_8b1_ext_porta -#define AR_IO_GPIO_8B1_EXT_PORTA 0x80017950 -#pragma Aux_register(0x80017950, name=>"io_gpio_8b1_ext_porta") - -// User extension aux register io_gpio_8b1_ls_sync -#define AR_IO_GPIO_8B1_LS_SYNC 0x80017960 -#pragma Aux_register(0x80017960, name=>"io_gpio_8b1_ls_sync") - -// User extension aux register io_gpio_8b1_int_bothedge -#define AR_IO_GPIO_8B1_INT_BOTHEDGE 0x80017968 -#pragma Aux_register(0x80017968, name=>"io_gpio_8b1_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B2_IO_GPIO_8B2_PRESENT 1 - -// User extension aux register io_gpio_8b2_debounce -#define AR_IO_GPIO_8B2_DEBOUNCE 0x80017a48 -#pragma Aux_register(0x80017a48, name=>"io_gpio_8b2_debounce") - -// User extension aux register io_gpio_8b2_clken -#define AR_IO_GPIO_8B2_CLKEN 0x80017a80 -#pragma Aux_register(0x80017a80, name=>"io_gpio_8b2_clken") - -// User extension aux register io_gpio_8b2_swporta_dr -#define AR_IO_GPIO_8B2_SWPORTA_DR 0x80017a00 -#pragma Aux_register(0x80017a00, name=>"io_gpio_8b2_swporta_dr") - -// User extension aux register io_gpio_8b2_swporta_ddr -#define AR_IO_GPIO_8B2_SWPORTA_DDR 0x80017a04 -#pragma Aux_register(0x80017a04, name=>"io_gpio_8b2_swporta_ddr") - -// User extension aux register io_gpio_8b2_inten -#define AR_IO_GPIO_8B2_INTEN 0x80017a30 -#pragma Aux_register(0x80017a30, name=>"io_gpio_8b2_inten") - -// User extension aux register io_gpio_8b2_intmask -#define AR_IO_GPIO_8B2_INTMASK 0x80017a34 -#pragma Aux_register(0x80017a34, name=>"io_gpio_8b2_intmask") - -// User extension aux register io_gpio_8b2_inttype_level -#define AR_IO_GPIO_8B2_INTTYPE_LEVEL 0x80017a38 -#pragma Aux_register(0x80017a38, name=>"io_gpio_8b2_inttype_level") - -// User extension aux register io_gpio_8b2_int_polarity -#define AR_IO_GPIO_8B2_INT_POLARITY 0x80017a3c -#pragma Aux_register(0x80017a3c, name=>"io_gpio_8b2_int_polarity") - -// User extension aux register io_gpio_8b2_intstatus -#define AR_IO_GPIO_8B2_INTSTATUS 0x80017a40 -#pragma Aux_register(0x80017a40, name=>"io_gpio_8b2_intstatus") - -// User extension aux register io_gpio_8b2_raw_intstatus -#define AR_IO_GPIO_8B2_RAW_INTSTATUS 0x80017a44 -#pragma Aux_register(0x80017a44, name=>"io_gpio_8b2_raw_intstatus") - -// User extension aux register io_gpio_8b2_porta_eoi -#define AR_IO_GPIO_8B2_PORTA_EOI 0x80017a4c -#pragma Aux_register(0x80017a4c, name=>"io_gpio_8b2_porta_eoi") - -// User extension aux register io_gpio_8b2_ext_porta -#define AR_IO_GPIO_8B2_EXT_PORTA 0x80017a50 -#pragma Aux_register(0x80017a50, name=>"io_gpio_8b2_ext_porta") - -// User extension aux register io_gpio_8b2_ls_sync -#define AR_IO_GPIO_8B2_LS_SYNC 0x80017a60 -#pragma Aux_register(0x80017a60, name=>"io_gpio_8b2_ls_sync") - -// User extension aux register io_gpio_8b2_int_bothedge -#define AR_IO_GPIO_8B2_INT_BOTHEDGE 0x80017a68 -#pragma Aux_register(0x80017a68, name=>"io_gpio_8b2_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B3_IO_GPIO_8B3_PRESENT 1 - -// User extension aux register io_gpio_8b3_debounce -#define AR_IO_GPIO_8B3_DEBOUNCE 0x80017b48 -#pragma Aux_register(0x80017b48, name=>"io_gpio_8b3_debounce") - -// User extension aux register io_gpio_8b3_clken -#define AR_IO_GPIO_8B3_CLKEN 0x80017b80 -#pragma Aux_register(0x80017b80, name=>"io_gpio_8b3_clken") - -// User extension aux register io_gpio_8b3_swporta_dr -#define AR_IO_GPIO_8B3_SWPORTA_DR 0x80017b00 -#pragma Aux_register(0x80017b00, name=>"io_gpio_8b3_swporta_dr") - -// User extension aux register io_gpio_8b3_swporta_ddr -#define AR_IO_GPIO_8B3_SWPORTA_DDR 0x80017b04 -#pragma Aux_register(0x80017b04, name=>"io_gpio_8b3_swporta_ddr") - -// User extension aux register io_gpio_8b3_inten -#define AR_IO_GPIO_8B3_INTEN 0x80017b30 -#pragma Aux_register(0x80017b30, name=>"io_gpio_8b3_inten") - -// User extension aux register io_gpio_8b3_intmask -#define AR_IO_GPIO_8B3_INTMASK 0x80017b34 -#pragma Aux_register(0x80017b34, name=>"io_gpio_8b3_intmask") - -// User extension aux register io_gpio_8b3_inttype_level -#define AR_IO_GPIO_8B3_INTTYPE_LEVEL 0x80017b38 -#pragma Aux_register(0x80017b38, name=>"io_gpio_8b3_inttype_level") - -// User extension aux register io_gpio_8b3_int_polarity -#define AR_IO_GPIO_8B3_INT_POLARITY 0x80017b3c -#pragma Aux_register(0x80017b3c, name=>"io_gpio_8b3_int_polarity") - -// User extension aux register io_gpio_8b3_intstatus -#define AR_IO_GPIO_8B3_INTSTATUS 0x80017b40 -#pragma Aux_register(0x80017b40, name=>"io_gpio_8b3_intstatus") - -// User extension aux register io_gpio_8b3_raw_intstatus -#define AR_IO_GPIO_8B3_RAW_INTSTATUS 0x80017b44 -#pragma Aux_register(0x80017b44, name=>"io_gpio_8b3_raw_intstatus") - -// User extension aux register io_gpio_8b3_porta_eoi -#define AR_IO_GPIO_8B3_PORTA_EOI 0x80017b4c -#pragma Aux_register(0x80017b4c, name=>"io_gpio_8b3_porta_eoi") - -// User extension aux register io_gpio_8b3_ext_porta -#define AR_IO_GPIO_8B3_EXT_PORTA 0x80017b50 -#pragma Aux_register(0x80017b50, name=>"io_gpio_8b3_ext_porta") - -// User extension aux register io_gpio_8b3_ls_sync -#define AR_IO_GPIO_8B3_LS_SYNC 0x80017b60 -#pragma Aux_register(0x80017b60, name=>"io_gpio_8b3_ls_sync") - -// User extension aux register io_gpio_8b3_int_bothedge -#define AR_IO_GPIO_8B3_INT_BOTHEDGE 0x80017b68 -#pragma Aux_register(0x80017b68, name=>"io_gpio_8b3_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST0_IO_I2C_MST0_PRESENT 1 - -// User extension aux register io_i2c_mst0_clken -#define AR_IO_I2C_MST0_CLKEN 0x800120c0 -#pragma Aux_register(0x800120c0, name=>"io_i2c_mst0_clken") - -// User extension aux register io_i2c_mst0_con -#define AR_IO_I2C_MST0_CON 0x80012000 -#pragma Aux_register(0x80012000, name=>"io_i2c_mst0_con") - -// User extension aux register io_i2c_mst0_tar -#define AR_IO_I2C_MST0_TAR 0x80012004 -#pragma Aux_register(0x80012004, name=>"io_i2c_mst0_tar") - -// User extension aux register io_i2c_mst0_data_cmd -#define AR_IO_I2C_MST0_DATA_CMD 0x80012010 -#pragma Aux_register(0x80012010, name=>"io_i2c_mst0_data_cmd") - -// User extension aux register io_i2c_mst0_ss_scl_hcnt -#define AR_IO_I2C_MST0_SS_SCL_HCNT 0x80012014 -#pragma Aux_register(0x80012014, name=>"io_i2c_mst0_ss_scl_hcnt") - -// User extension aux register io_i2c_mst0_ss_scl_lcnt -#define AR_IO_I2C_MST0_SS_SCL_LCNT 0x80012018 -#pragma Aux_register(0x80012018, name=>"io_i2c_mst0_ss_scl_lcnt") - -// User extension aux register io_i2c_mst0_fs_scl_hcnt -#define AR_IO_I2C_MST0_FS_SCL_HCNT 0x8001201c -#pragma Aux_register(0x8001201c, name=>"io_i2c_mst0_fs_scl_hcnt") - -// User extension aux register io_i2c_mst0_fs_scl_lcnt -#define AR_IO_I2C_MST0_FS_SCL_LCNT 0x80012020 -#pragma Aux_register(0x80012020, name=>"io_i2c_mst0_fs_scl_lcnt") - -// User extension aux register io_i2c_mst0_intr_stat -#define AR_IO_I2C_MST0_INTR_STAT 0x8001202c -#pragma Aux_register(0x8001202c, name=>"io_i2c_mst0_intr_stat") - -// User extension aux register io_i2c_mst0_intr_mask -#define AR_IO_I2C_MST0_INTR_MASK 0x80012030 -#pragma Aux_register(0x80012030, name=>"io_i2c_mst0_intr_mask") - -// User extension aux register io_i2c_mst0_raw_intr_stat -#define AR_IO_I2C_MST0_RAW_INTR_STAT 0x80012034 -#pragma Aux_register(0x80012034, name=>"io_i2c_mst0_raw_intr_stat") - -// User extension aux register io_i2c_mst0_rx_tl -#define AR_IO_I2C_MST0_RX_TL 0x80012038 -#pragma Aux_register(0x80012038, name=>"io_i2c_mst0_rx_tl") - -// User extension aux register io_i2c_mst0_tx_tl -#define AR_IO_I2C_MST0_TX_TL 0x8001203c -#pragma Aux_register(0x8001203c, name=>"io_i2c_mst0_tx_tl") - -// User extension aux register io_i2c_mst0_clr_intr -#define AR_IO_I2C_MST0_CLR_INTR 0x80012040 -#pragma Aux_register(0x80012040, name=>"io_i2c_mst0_clr_intr") - -// User extension aux register io_i2c_mst0_clr_rx_under -#define AR_IO_I2C_MST0_CLR_RX_UNDER 0x80012044 -#pragma Aux_register(0x80012044, name=>"io_i2c_mst0_clr_rx_under") - -// User extension aux register io_i2c_mst0_clr_rx_over -#define AR_IO_I2C_MST0_CLR_RX_OVER 0x80012048 -#pragma Aux_register(0x80012048, name=>"io_i2c_mst0_clr_rx_over") - -// User extension aux register io_i2c_mst0_clr_tx_over -#define AR_IO_I2C_MST0_CLR_TX_OVER 0x8001204c -#pragma Aux_register(0x8001204c, name=>"io_i2c_mst0_clr_tx_over") - -// User extension aux register io_i2c_mst0_clr_tx_abrt -#define AR_IO_I2C_MST0_CLR_TX_ABRT 0x80012054 -#pragma Aux_register(0x80012054, name=>"io_i2c_mst0_clr_tx_abrt") - -// User extension aux register io_i2c_mst0_clr_activity -#define AR_IO_I2C_MST0_CLR_ACTIVITY 0x8001205c -#pragma Aux_register(0x8001205c, name=>"io_i2c_mst0_clr_activity") - -// User extension aux register io_i2c_mst0_clr_stop_det -#define AR_IO_I2C_MST0_CLR_STOP_DET 0x80012060 -#pragma Aux_register(0x80012060, name=>"io_i2c_mst0_clr_stop_det") - -// User extension aux register io_i2c_mst0_clr_start_det -#define AR_IO_I2C_MST0_CLR_START_DET 0x80012064 -#pragma Aux_register(0x80012064, name=>"io_i2c_mst0_clr_start_det") - -// User extension aux register io_i2c_mst0_enable -#define AR_IO_I2C_MST0_ENABLE 0x8001206c -#pragma Aux_register(0x8001206c, name=>"io_i2c_mst0_enable") - -// User extension aux register io_i2c_mst0_status -#define AR_IO_I2C_MST0_STATUS 0x80012070 -#pragma Aux_register(0x80012070, name=>"io_i2c_mst0_status") - -// User extension aux register io_i2c_mst0_txflr -#define AR_IO_I2C_MST0_TXFLR 0x80012074 -#pragma Aux_register(0x80012074, name=>"io_i2c_mst0_txflr") - -// User extension aux register io_i2c_mst0_rxflr -#define AR_IO_I2C_MST0_RXFLR 0x80012078 -#pragma Aux_register(0x80012078, name=>"io_i2c_mst0_rxflr") - -// User extension aux register io_i2c_mst0_sda_hold -#define AR_IO_I2C_MST0_SDA_HOLD 0x8001207c -#pragma Aux_register(0x8001207c, name=>"io_i2c_mst0_sda_hold") - -// User extension aux register io_i2c_mst0_tx_abrt_source -#define AR_IO_I2C_MST0_TX_ABRT_SOURCE 0x80012080 -#pragma Aux_register(0x80012080, name=>"io_i2c_mst0_tx_abrt_source") - -// User extension aux register io_i2c_mst0_enable_status -#define AR_IO_I2C_MST0_ENABLE_STATUS 0x8001209c -#pragma Aux_register(0x8001209c, name=>"io_i2c_mst0_enable_status") - -// User extension aux register io_i2c_mst0_fs_spklen -#define AR_IO_I2C_MST0_FS_SPKLEN 0x800120a0 -#pragma Aux_register(0x800120a0, name=>"io_i2c_mst0_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST1_IO_I2C_MST1_PRESENT 1 - -// User extension aux register io_i2c_mst1_clken -#define AR_IO_I2C_MST1_CLKEN 0x800121c0 -#pragma Aux_register(0x800121c0, name=>"io_i2c_mst1_clken") - -// User extension aux register io_i2c_mst1_con -#define AR_IO_I2C_MST1_CON 0x80012100 -#pragma Aux_register(0x80012100, name=>"io_i2c_mst1_con") - -// User extension aux register io_i2c_mst1_tar -#define AR_IO_I2C_MST1_TAR 0x80012104 -#pragma Aux_register(0x80012104, name=>"io_i2c_mst1_tar") - -// User extension aux register io_i2c_mst1_data_cmd -#define AR_IO_I2C_MST1_DATA_CMD 0x80012110 -#pragma Aux_register(0x80012110, name=>"io_i2c_mst1_data_cmd") - -// User extension aux register io_i2c_mst1_ss_scl_hcnt -#define AR_IO_I2C_MST1_SS_SCL_HCNT 0x80012114 -#pragma Aux_register(0x80012114, name=>"io_i2c_mst1_ss_scl_hcnt") - -// User extension aux register io_i2c_mst1_ss_scl_lcnt -#define AR_IO_I2C_MST1_SS_SCL_LCNT 0x80012118 -#pragma Aux_register(0x80012118, name=>"io_i2c_mst1_ss_scl_lcnt") - -// User extension aux register io_i2c_mst1_fs_scl_hcnt -#define AR_IO_I2C_MST1_FS_SCL_HCNT 0x8001211c -#pragma Aux_register(0x8001211c, name=>"io_i2c_mst1_fs_scl_hcnt") - -// User extension aux register io_i2c_mst1_fs_scl_lcnt -#define AR_IO_I2C_MST1_FS_SCL_LCNT 0x80012120 -#pragma Aux_register(0x80012120, name=>"io_i2c_mst1_fs_scl_lcnt") - -// User extension aux register io_i2c_mst1_intr_stat -#define AR_IO_I2C_MST1_INTR_STAT 0x8001212c -#pragma Aux_register(0x8001212c, name=>"io_i2c_mst1_intr_stat") - -// User extension aux register io_i2c_mst1_intr_mask -#define AR_IO_I2C_MST1_INTR_MASK 0x80012130 -#pragma Aux_register(0x80012130, name=>"io_i2c_mst1_intr_mask") - -// User extension aux register io_i2c_mst1_raw_intr_stat -#define AR_IO_I2C_MST1_RAW_INTR_STAT 0x80012134 -#pragma Aux_register(0x80012134, name=>"io_i2c_mst1_raw_intr_stat") - -// User extension aux register io_i2c_mst1_rx_tl -#define AR_IO_I2C_MST1_RX_TL 0x80012138 -#pragma Aux_register(0x80012138, name=>"io_i2c_mst1_rx_tl") - -// User extension aux register io_i2c_mst1_tx_tl -#define AR_IO_I2C_MST1_TX_TL 0x8001213c -#pragma Aux_register(0x8001213c, name=>"io_i2c_mst1_tx_tl") - -// User extension aux register io_i2c_mst1_clr_intr -#define AR_IO_I2C_MST1_CLR_INTR 0x80012140 -#pragma Aux_register(0x80012140, name=>"io_i2c_mst1_clr_intr") - -// User extension aux register io_i2c_mst1_clr_rx_under -#define AR_IO_I2C_MST1_CLR_RX_UNDER 0x80012144 -#pragma Aux_register(0x80012144, name=>"io_i2c_mst1_clr_rx_under") - -// User extension aux register io_i2c_mst1_clr_rx_over -#define AR_IO_I2C_MST1_CLR_RX_OVER 0x80012148 -#pragma Aux_register(0x80012148, name=>"io_i2c_mst1_clr_rx_over") - -// User extension aux register io_i2c_mst1_clr_tx_over -#define AR_IO_I2C_MST1_CLR_TX_OVER 0x8001214c -#pragma Aux_register(0x8001214c, name=>"io_i2c_mst1_clr_tx_over") - -// User extension aux register io_i2c_mst1_clr_tx_abrt -#define AR_IO_I2C_MST1_CLR_TX_ABRT 0x80012154 -#pragma Aux_register(0x80012154, name=>"io_i2c_mst1_clr_tx_abrt") - -// User extension aux register io_i2c_mst1_clr_activity -#define AR_IO_I2C_MST1_CLR_ACTIVITY 0x8001215c -#pragma Aux_register(0x8001215c, name=>"io_i2c_mst1_clr_activity") - -// User extension aux register io_i2c_mst1_clr_stop_det -#define AR_IO_I2C_MST1_CLR_STOP_DET 0x80012160 -#pragma Aux_register(0x80012160, name=>"io_i2c_mst1_clr_stop_det") - -// User extension aux register io_i2c_mst1_clr_start_det -#define AR_IO_I2C_MST1_CLR_START_DET 0x80012164 -#pragma Aux_register(0x80012164, name=>"io_i2c_mst1_clr_start_det") - -// User extension aux register io_i2c_mst1_enable -#define AR_IO_I2C_MST1_ENABLE 0x8001216c -#pragma Aux_register(0x8001216c, name=>"io_i2c_mst1_enable") - -// User extension aux register io_i2c_mst1_status -#define AR_IO_I2C_MST1_STATUS 0x80012170 -#pragma Aux_register(0x80012170, name=>"io_i2c_mst1_status") - -// User extension aux register io_i2c_mst1_txflr -#define AR_IO_I2C_MST1_TXFLR 0x80012174 -#pragma Aux_register(0x80012174, name=>"io_i2c_mst1_txflr") - -// User extension aux register io_i2c_mst1_rxflr -#define AR_IO_I2C_MST1_RXFLR 0x80012178 -#pragma Aux_register(0x80012178, name=>"io_i2c_mst1_rxflr") - -// User extension aux register io_i2c_mst1_sda_hold -#define AR_IO_I2C_MST1_SDA_HOLD 0x8001217c -#pragma Aux_register(0x8001217c, name=>"io_i2c_mst1_sda_hold") - -// User extension aux register io_i2c_mst1_tx_abrt_source -#define AR_IO_I2C_MST1_TX_ABRT_SOURCE 0x80012180 -#pragma Aux_register(0x80012180, name=>"io_i2c_mst1_tx_abrt_source") - -// User extension aux register io_i2c_mst1_enable_status -#define AR_IO_I2C_MST1_ENABLE_STATUS 0x8001219c -#pragma Aux_register(0x8001219c, name=>"io_i2c_mst1_enable_status") - -// User extension aux register io_i2c_mst1_fs_spklen -#define AR_IO_I2C_MST1_FS_SPKLEN 0x800121a0 -#pragma Aux_register(0x800121a0, name=>"io_i2c_mst1_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST2_IO_I2C_MST2_PRESENT 1 - -// User extension aux register io_i2c_mst2_clken -#define AR_IO_I2C_MST2_CLKEN 0x800122c0 -#pragma Aux_register(0x800122c0, name=>"io_i2c_mst2_clken") - -// User extension aux register io_i2c_mst2_con -#define AR_IO_I2C_MST2_CON 0x80012200 -#pragma Aux_register(0x80012200, name=>"io_i2c_mst2_con") - -// User extension aux register io_i2c_mst2_tar -#define AR_IO_I2C_MST2_TAR 0x80012204 -#pragma Aux_register(0x80012204, name=>"io_i2c_mst2_tar") - -// User extension aux register io_i2c_mst2_data_cmd -#define AR_IO_I2C_MST2_DATA_CMD 0x80012210 -#pragma Aux_register(0x80012210, name=>"io_i2c_mst2_data_cmd") - -// User extension aux register io_i2c_mst2_ss_scl_hcnt -#define AR_IO_I2C_MST2_SS_SCL_HCNT 0x80012214 -#pragma Aux_register(0x80012214, name=>"io_i2c_mst2_ss_scl_hcnt") - -// User extension aux register io_i2c_mst2_ss_scl_lcnt -#define AR_IO_I2C_MST2_SS_SCL_LCNT 0x80012218 -#pragma Aux_register(0x80012218, name=>"io_i2c_mst2_ss_scl_lcnt") - -// User extension aux register io_i2c_mst2_fs_scl_hcnt -#define AR_IO_I2C_MST2_FS_SCL_HCNT 0x8001221c -#pragma Aux_register(0x8001221c, name=>"io_i2c_mst2_fs_scl_hcnt") - -// User extension aux register io_i2c_mst2_fs_scl_lcnt -#define AR_IO_I2C_MST2_FS_SCL_LCNT 0x80012220 -#pragma Aux_register(0x80012220, name=>"io_i2c_mst2_fs_scl_lcnt") - -// User extension aux register io_i2c_mst2_intr_stat -#define AR_IO_I2C_MST2_INTR_STAT 0x8001222c -#pragma Aux_register(0x8001222c, name=>"io_i2c_mst2_intr_stat") - -// User extension aux register io_i2c_mst2_intr_mask -#define AR_IO_I2C_MST2_INTR_MASK 0x80012230 -#pragma Aux_register(0x80012230, name=>"io_i2c_mst2_intr_mask") - -// User extension aux register io_i2c_mst2_raw_intr_stat -#define AR_IO_I2C_MST2_RAW_INTR_STAT 0x80012234 -#pragma Aux_register(0x80012234, name=>"io_i2c_mst2_raw_intr_stat") - -// User extension aux register io_i2c_mst2_rx_tl -#define AR_IO_I2C_MST2_RX_TL 0x80012238 -#pragma Aux_register(0x80012238, name=>"io_i2c_mst2_rx_tl") - -// User extension aux register io_i2c_mst2_tx_tl -#define AR_IO_I2C_MST2_TX_TL 0x8001223c -#pragma Aux_register(0x8001223c, name=>"io_i2c_mst2_tx_tl") - -// User extension aux register io_i2c_mst2_clr_intr -#define AR_IO_I2C_MST2_CLR_INTR 0x80012240 -#pragma Aux_register(0x80012240, name=>"io_i2c_mst2_clr_intr") - -// User extension aux register io_i2c_mst2_clr_rx_under -#define AR_IO_I2C_MST2_CLR_RX_UNDER 0x80012244 -#pragma Aux_register(0x80012244, name=>"io_i2c_mst2_clr_rx_under") - -// User extension aux register io_i2c_mst2_clr_rx_over -#define AR_IO_I2C_MST2_CLR_RX_OVER 0x80012248 -#pragma Aux_register(0x80012248, name=>"io_i2c_mst2_clr_rx_over") - -// User extension aux register io_i2c_mst2_clr_tx_over -#define AR_IO_I2C_MST2_CLR_TX_OVER 0x8001224c -#pragma Aux_register(0x8001224c, name=>"io_i2c_mst2_clr_tx_over") - -// User extension aux register io_i2c_mst2_clr_tx_abrt -#define AR_IO_I2C_MST2_CLR_TX_ABRT 0x80012254 -#pragma Aux_register(0x80012254, name=>"io_i2c_mst2_clr_tx_abrt") - -// User extension aux register io_i2c_mst2_clr_activity -#define AR_IO_I2C_MST2_CLR_ACTIVITY 0x8001225c -#pragma Aux_register(0x8001225c, name=>"io_i2c_mst2_clr_activity") - -// User extension aux register io_i2c_mst2_clr_stop_det -#define AR_IO_I2C_MST2_CLR_STOP_DET 0x80012260 -#pragma Aux_register(0x80012260, name=>"io_i2c_mst2_clr_stop_det") - -// User extension aux register io_i2c_mst2_clr_start_det -#define AR_IO_I2C_MST2_CLR_START_DET 0x80012264 -#pragma Aux_register(0x80012264, name=>"io_i2c_mst2_clr_start_det") - -// User extension aux register io_i2c_mst2_enable -#define AR_IO_I2C_MST2_ENABLE 0x8001226c -#pragma Aux_register(0x8001226c, name=>"io_i2c_mst2_enable") - -// User extension aux register io_i2c_mst2_status -#define AR_IO_I2C_MST2_STATUS 0x80012270 -#pragma Aux_register(0x80012270, name=>"io_i2c_mst2_status") - -// User extension aux register io_i2c_mst2_txflr -#define AR_IO_I2C_MST2_TXFLR 0x80012274 -#pragma Aux_register(0x80012274, name=>"io_i2c_mst2_txflr") - -// User extension aux register io_i2c_mst2_rxflr -#define AR_IO_I2C_MST2_RXFLR 0x80012278 -#pragma Aux_register(0x80012278, name=>"io_i2c_mst2_rxflr") - -// User extension aux register io_i2c_mst2_sda_hold -#define AR_IO_I2C_MST2_SDA_HOLD 0x8001227c -#pragma Aux_register(0x8001227c, name=>"io_i2c_mst2_sda_hold") - -// User extension aux register io_i2c_mst2_tx_abrt_source -#define AR_IO_I2C_MST2_TX_ABRT_SOURCE 0x80012280 -#pragma Aux_register(0x80012280, name=>"io_i2c_mst2_tx_abrt_source") - -// User extension aux register io_i2c_mst2_enable_status -#define AR_IO_I2C_MST2_ENABLE_STATUS 0x8001229c -#pragma Aux_register(0x8001229c, name=>"io_i2c_mst2_enable_status") - -// User extension aux register io_i2c_mst2_fs_spklen -#define AR_IO_I2C_MST2_FS_SPKLEN 0x800122a0 -#pragma Aux_register(0x800122a0, name=>"io_i2c_mst2_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST0_IO_SPI_MST0_PRESENT 1 - -// User extension aux register io_spi_mst0_ctrlr0 -#define AR_IO_SPI_MST0_CTRLR0 0x80010000 -#pragma Aux_register(0x80010000, name=>"io_spi_mst0_ctrlr0") - -// User extension aux register io_spi_mst0_ctrlr1 -#define AR_IO_SPI_MST0_CTRLR1 0x80010001 -#pragma Aux_register(0x80010001, name=>"io_spi_mst0_ctrlr1") - -// User extension aux register io_spi_mst0_spien -#define AR_IO_SPI_MST0_SPIEN 0x80010002 -#pragma Aux_register(0x80010002, name=>"io_spi_mst0_spien") - -// User extension aux register io_spi_mst0_ser -#define AR_IO_SPI_MST0_SER 0x80010004 -#pragma Aux_register(0x80010004, name=>"io_spi_mst0_ser") - -// User extension aux register io_spi_mst0_baudr -#define AR_IO_SPI_MST0_BAUDR 0x80010005 -#pragma Aux_register(0x80010005, name=>"io_spi_mst0_baudr") - -// User extension aux register io_spi_mst0_txftlr -#define AR_IO_SPI_MST0_TXFTLR 0x80010006 -#pragma Aux_register(0x80010006, name=>"io_spi_mst0_txftlr") - -// User extension aux register io_spi_mst0_rxftlr -#define AR_IO_SPI_MST0_RXFTLR 0x80010007 -#pragma Aux_register(0x80010007, name=>"io_spi_mst0_rxftlr") - -// User extension aux register io_spi_mst0_txflr -#define AR_IO_SPI_MST0_TXFLR 0x80010008 -#pragma Aux_register(0x80010008, name=>"io_spi_mst0_txflr") - -// User extension aux register io_spi_mst0_rxflr -#define AR_IO_SPI_MST0_RXFLR 0x80010009 -#pragma Aux_register(0x80010009, name=>"io_spi_mst0_rxflr") - -// User extension aux register io_spi_mst0_sr -#define AR_IO_SPI_MST0_SR 0x8001000a -#pragma Aux_register(0x8001000a, name=>"io_spi_mst0_sr") - -// User extension aux register io_spi_mst0_imr -#define AR_IO_SPI_MST0_IMR 0x8001000b -#pragma Aux_register(0x8001000b, name=>"io_spi_mst0_imr") - -// User extension aux register io_spi_mst0_isr -#define AR_IO_SPI_MST0_ISR 0x8001000c -#pragma Aux_register(0x8001000c, name=>"io_spi_mst0_isr") - -// User extension aux register io_spi_mst0_risr -#define AR_IO_SPI_MST0_RISR 0x8001000d -#pragma Aux_register(0x8001000d, name=>"io_spi_mst0_risr") - -// User extension aux register io_spi_mst0_txoicr -#define AR_IO_SPI_MST0_TXOICR 0x8001000e -#pragma Aux_register(0x8001000e, name=>"io_spi_mst0_txoicr") - -// User extension aux register io_spi_mst0_rxoicr -#define AR_IO_SPI_MST0_RXOICR 0x8001000f -#pragma Aux_register(0x8001000f, name=>"io_spi_mst0_rxoicr") - -// User extension aux register io_spi_mst0_rxuicr -#define AR_IO_SPI_MST0_RXUICR 0x80010010 -#pragma Aux_register(0x80010010, name=>"io_spi_mst0_rxuicr") - -// User extension aux register io_spi_mst0_icr -#define AR_IO_SPI_MST0_ICR 0x80010012 -#pragma Aux_register(0x80010012, name=>"io_spi_mst0_icr") - -// User extension aux register io_spi_mst0_clken -#define AR_IO_SPI_MST0_CLKEN 0x80010016 -#pragma Aux_register(0x80010016, name=>"io_spi_mst0_clken") - -// User extension aux register io_spi_mst0_dr -#define AR_IO_SPI_MST0_DR 0x80010018 -#pragma Aux_register(0x80010018, name=>"io_spi_mst0_dr") - -// User extension aux register io_spi_mst0_rx_sample_dly -#define AR_IO_SPI_MST0_RX_SAMPLE_DLY 0x8001003c -#pragma Aux_register(0x8001003c, name=>"io_spi_mst0_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST1_IO_SPI_MST1_PRESENT 1 - -// User extension aux register io_spi_mst1_ctrlr0 -#define AR_IO_SPI_MST1_CTRLR0 0x80010100 -#pragma Aux_register(0x80010100, name=>"io_spi_mst1_ctrlr0") - -// User extension aux register io_spi_mst1_ctrlr1 -#define AR_IO_SPI_MST1_CTRLR1 0x80010101 -#pragma Aux_register(0x80010101, name=>"io_spi_mst1_ctrlr1") - -// User extension aux register io_spi_mst1_spien -#define AR_IO_SPI_MST1_SPIEN 0x80010102 -#pragma Aux_register(0x80010102, name=>"io_spi_mst1_spien") - -// User extension aux register io_spi_mst1_ser -#define AR_IO_SPI_MST1_SER 0x80010104 -#pragma Aux_register(0x80010104, name=>"io_spi_mst1_ser") - -// User extension aux register io_spi_mst1_baudr -#define AR_IO_SPI_MST1_BAUDR 0x80010105 -#pragma Aux_register(0x80010105, name=>"io_spi_mst1_baudr") - -// User extension aux register io_spi_mst1_txftlr -#define AR_IO_SPI_MST1_TXFTLR 0x80010106 -#pragma Aux_register(0x80010106, name=>"io_spi_mst1_txftlr") - -// User extension aux register io_spi_mst1_rxftlr -#define AR_IO_SPI_MST1_RXFTLR 0x80010107 -#pragma Aux_register(0x80010107, name=>"io_spi_mst1_rxftlr") - -// User extension aux register io_spi_mst1_txflr -#define AR_IO_SPI_MST1_TXFLR 0x80010108 -#pragma Aux_register(0x80010108, name=>"io_spi_mst1_txflr") - -// User extension aux register io_spi_mst1_rxflr -#define AR_IO_SPI_MST1_RXFLR 0x80010109 -#pragma Aux_register(0x80010109, name=>"io_spi_mst1_rxflr") - -// User extension aux register io_spi_mst1_sr -#define AR_IO_SPI_MST1_SR 0x8001010a -#pragma Aux_register(0x8001010a, name=>"io_spi_mst1_sr") - -// User extension aux register io_spi_mst1_imr -#define AR_IO_SPI_MST1_IMR 0x8001010b -#pragma Aux_register(0x8001010b, name=>"io_spi_mst1_imr") - -// User extension aux register io_spi_mst1_isr -#define AR_IO_SPI_MST1_ISR 0x8001010c -#pragma Aux_register(0x8001010c, name=>"io_spi_mst1_isr") - -// User extension aux register io_spi_mst1_risr -#define AR_IO_SPI_MST1_RISR 0x8001010d -#pragma Aux_register(0x8001010d, name=>"io_spi_mst1_risr") - -// User extension aux register io_spi_mst1_txoicr -#define AR_IO_SPI_MST1_TXOICR 0x8001010e -#pragma Aux_register(0x8001010e, name=>"io_spi_mst1_txoicr") - -// User extension aux register io_spi_mst1_rxoicr -#define AR_IO_SPI_MST1_RXOICR 0x8001010f -#pragma Aux_register(0x8001010f, name=>"io_spi_mst1_rxoicr") - -// User extension aux register io_spi_mst1_rxuicr -#define AR_IO_SPI_MST1_RXUICR 0x80010110 -#pragma Aux_register(0x80010110, name=>"io_spi_mst1_rxuicr") - -// User extension aux register io_spi_mst1_icr -#define AR_IO_SPI_MST1_ICR 0x80010112 -#pragma Aux_register(0x80010112, name=>"io_spi_mst1_icr") - -// User extension aux register io_spi_mst1_clken -#define AR_IO_SPI_MST1_CLKEN 0x80010116 -#pragma Aux_register(0x80010116, name=>"io_spi_mst1_clken") - -// User extension aux register io_spi_mst1_dr -#define AR_IO_SPI_MST1_DR 0x80010118 -#pragma Aux_register(0x80010118, name=>"io_spi_mst1_dr") - -// User extension aux register io_spi_mst1_rx_sample_dly -#define AR_IO_SPI_MST1_RX_SAMPLE_DLY 0x8001013c -#pragma Aux_register(0x8001013c, name=>"io_spi_mst1_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST2_IO_SPI_MST2_PRESENT 1 - -// User extension aux register io_spi_mst2_ctrlr0 -#define AR_IO_SPI_MST2_CTRLR0 0x80010200 -#pragma Aux_register(0x80010200, name=>"io_spi_mst2_ctrlr0") - -// User extension aux register io_spi_mst2_ctrlr1 -#define AR_IO_SPI_MST2_CTRLR1 0x80010201 -#pragma Aux_register(0x80010201, name=>"io_spi_mst2_ctrlr1") - -// User extension aux register io_spi_mst2_spien -#define AR_IO_SPI_MST2_SPIEN 0x80010202 -#pragma Aux_register(0x80010202, name=>"io_spi_mst2_spien") - -// User extension aux register io_spi_mst2_ser -#define AR_IO_SPI_MST2_SER 0x80010204 -#pragma Aux_register(0x80010204, name=>"io_spi_mst2_ser") - -// User extension aux register io_spi_mst2_baudr -#define AR_IO_SPI_MST2_BAUDR 0x80010205 -#pragma Aux_register(0x80010205, name=>"io_spi_mst2_baudr") - -// User extension aux register io_spi_mst2_txftlr -#define AR_IO_SPI_MST2_TXFTLR 0x80010206 -#pragma Aux_register(0x80010206, name=>"io_spi_mst2_txftlr") - -// User extension aux register io_spi_mst2_rxftlr -#define AR_IO_SPI_MST2_RXFTLR 0x80010207 -#pragma Aux_register(0x80010207, name=>"io_spi_mst2_rxftlr") - -// User extension aux register io_spi_mst2_txflr -#define AR_IO_SPI_MST2_TXFLR 0x80010208 -#pragma Aux_register(0x80010208, name=>"io_spi_mst2_txflr") - -// User extension aux register io_spi_mst2_rxflr -#define AR_IO_SPI_MST2_RXFLR 0x80010209 -#pragma Aux_register(0x80010209, name=>"io_spi_mst2_rxflr") - -// User extension aux register io_spi_mst2_sr -#define AR_IO_SPI_MST2_SR 0x8001020a -#pragma Aux_register(0x8001020a, name=>"io_spi_mst2_sr") - -// User extension aux register io_spi_mst2_imr -#define AR_IO_SPI_MST2_IMR 0x8001020b -#pragma Aux_register(0x8001020b, name=>"io_spi_mst2_imr") - -// User extension aux register io_spi_mst2_isr -#define AR_IO_SPI_MST2_ISR 0x8001020c -#pragma Aux_register(0x8001020c, name=>"io_spi_mst2_isr") - -// User extension aux register io_spi_mst2_risr -#define AR_IO_SPI_MST2_RISR 0x8001020d -#pragma Aux_register(0x8001020d, name=>"io_spi_mst2_risr") - -// User extension aux register io_spi_mst2_txoicr -#define AR_IO_SPI_MST2_TXOICR 0x8001020e -#pragma Aux_register(0x8001020e, name=>"io_spi_mst2_txoicr") - -// User extension aux register io_spi_mst2_rxoicr -#define AR_IO_SPI_MST2_RXOICR 0x8001020f -#pragma Aux_register(0x8001020f, name=>"io_spi_mst2_rxoicr") - -// User extension aux register io_spi_mst2_rxuicr -#define AR_IO_SPI_MST2_RXUICR 0x80010210 -#pragma Aux_register(0x80010210, name=>"io_spi_mst2_rxuicr") - -// User extension aux register io_spi_mst2_icr -#define AR_IO_SPI_MST2_ICR 0x80010212 -#pragma Aux_register(0x80010212, name=>"io_spi_mst2_icr") - -// User extension aux register io_spi_mst2_clken -#define AR_IO_SPI_MST2_CLKEN 0x80010216 -#pragma Aux_register(0x80010216, name=>"io_spi_mst2_clken") - -// User extension aux register io_spi_mst2_dr -#define AR_IO_SPI_MST2_DR 0x80010218 -#pragma Aux_register(0x80010218, name=>"io_spi_mst2_dr") - -// User extension aux register io_spi_mst2_rx_sample_dly -#define AR_IO_SPI_MST2_RX_SAMPLE_DLY 0x8001023c -#pragma Aux_register(0x8001023c, name=>"io_spi_mst2_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_SLV0_IO_SPI_SLV0_PRESENT 1 - -// User extension aux register io_spi_slv0_ctrlr0 -#define AR_IO_SPI_SLV0_CTRLR0 0x80011000 -#pragma Aux_register(0x80011000, name=>"io_spi_slv0_ctrlr0") - -// User extension aux register io_spi_slv0_spien -#define AR_IO_SPI_SLV0_SPIEN 0x80011002 -#pragma Aux_register(0x80011002, name=>"io_spi_slv0_spien") - -// User extension aux register io_spi_slv0_txftlr -#define AR_IO_SPI_SLV0_TXFTLR 0x80011006 -#pragma Aux_register(0x80011006, name=>"io_spi_slv0_txftlr") - -// User extension aux register io_spi_slv0_rxftlr -#define AR_IO_SPI_SLV0_RXFTLR 0x80011007 -#pragma Aux_register(0x80011007, name=>"io_spi_slv0_rxftlr") - -// User extension aux register io_spi_slv0_txflr -#define AR_IO_SPI_SLV0_TXFLR 0x80011008 -#pragma Aux_register(0x80011008, name=>"io_spi_slv0_txflr") - -// User extension aux register io_spi_slv0_rxflr -#define AR_IO_SPI_SLV0_RXFLR 0x80011009 -#pragma Aux_register(0x80011009, name=>"io_spi_slv0_rxflr") - -// User extension aux register io_spi_slv0_sr -#define AR_IO_SPI_SLV0_SR 0x8001100a -#pragma Aux_register(0x8001100a, name=>"io_spi_slv0_sr") - -// User extension aux register io_spi_slv0_imr -#define AR_IO_SPI_SLV0_IMR 0x8001100b -#pragma Aux_register(0x8001100b, name=>"io_spi_slv0_imr") - -// User extension aux register io_spi_slv0_isr -#define AR_IO_SPI_SLV0_ISR 0x8001100c -#pragma Aux_register(0x8001100c, name=>"io_spi_slv0_isr") - -// User extension aux register io_spi_slv0_risr -#define AR_IO_SPI_SLV0_RISR 0x8001100d -#pragma Aux_register(0x8001100d, name=>"io_spi_slv0_risr") - -// User extension aux register io_spi_slv0_txoicr -#define AR_IO_SPI_SLV0_TXOICR 0x8001100e -#pragma Aux_register(0x8001100e, name=>"io_spi_slv0_txoicr") - -// User extension aux register io_spi_slv0_rxoicr -#define AR_IO_SPI_SLV0_RXOICR 0x8001100f -#pragma Aux_register(0x8001100f, name=>"io_spi_slv0_rxoicr") - -// User extension aux register io_spi_slv0_rxuicr -#define AR_IO_SPI_SLV0_RXUICR 0x80011010 -#pragma Aux_register(0x80011010, name=>"io_spi_slv0_rxuicr") - -// User extension aux register io_spi_slv0_icr -#define AR_IO_SPI_SLV0_ICR 0x80011012 -#pragma Aux_register(0x80011012, name=>"io_spi_slv0_icr") - -// User extension aux register io_spi_slv0_clken -#define AR_IO_SPI_SLV0_CLKEN 0x80011016 -#pragma Aux_register(0x80011016, name=>"io_spi_slv0_clken") - -// User extension aux register io_spi_slv0_dr -#define AR_IO_SPI_SLV0_DR 0x80011018 -#pragma Aux_register(0x80011018, name=>"io_spi_slv0_dr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART0_IO_UART0_PRESENT 1 - -// User extension aux register io_uart0_clken -#define AR_IO_UART0_CLKEN 0x800140c0 -#pragma Aux_register(0x800140c0, name=>"io_uart0_clken") - -// User extension aux register io_uart0_rbr_thr_dll -#define AR_IO_UART0_RBR_THR_DLL 0x80014000 -#pragma Aux_register(0x80014000, name=>"io_uart0_rbr_thr_dll") - -// User extension aux register io_uart0_ier_dlh -#define AR_IO_UART0_IER_DLH 0x80014004 -#pragma Aux_register(0x80014004, name=>"io_uart0_ier_dlh") - -// User extension aux register io_uart0_iir_fcr -#define AR_IO_UART0_IIR_FCR 0x80014008 -#pragma Aux_register(0x80014008, name=>"io_uart0_iir_fcr") - -// User extension aux register io_uart0_lcr -#define AR_IO_UART0_LCR 0x8001400c -#pragma Aux_register(0x8001400c, name=>"io_uart0_lcr") - -// User extension aux register io_uart0_mcr -#define AR_IO_UART0_MCR 0x80014010 -#pragma Aux_register(0x80014010, name=>"io_uart0_mcr") - -// User extension aux register io_uart0_lsr -#define AR_IO_UART0_LSR 0x80014014 -#pragma Aux_register(0x80014014, name=>"io_uart0_lsr") - -// User extension aux register io_uart0_msr -#define AR_IO_UART0_MSR 0x80014018 -#pragma Aux_register(0x80014018, name=>"io_uart0_msr") - -// User extension aux register io_uart0_usr -#define AR_IO_UART0_USR 0x8001407c -#pragma Aux_register(0x8001407c, name=>"io_uart0_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART1_IO_UART1_PRESENT 1 - -// User extension aux register io_uart1_clken -#define AR_IO_UART1_CLKEN 0x800141c0 -#pragma Aux_register(0x800141c0, name=>"io_uart1_clken") - -// User extension aux register io_uart1_rbr_thr_dll -#define AR_IO_UART1_RBR_THR_DLL 0x80014100 -#pragma Aux_register(0x80014100, name=>"io_uart1_rbr_thr_dll") - -// User extension aux register io_uart1_ier_dlh -#define AR_IO_UART1_IER_DLH 0x80014104 -#pragma Aux_register(0x80014104, name=>"io_uart1_ier_dlh") - -// User extension aux register io_uart1_iir_fcr -#define AR_IO_UART1_IIR_FCR 0x80014108 -#pragma Aux_register(0x80014108, name=>"io_uart1_iir_fcr") - -// User extension aux register io_uart1_lcr -#define AR_IO_UART1_LCR 0x8001410c -#pragma Aux_register(0x8001410c, name=>"io_uart1_lcr") - -// User extension aux register io_uart1_mcr -#define AR_IO_UART1_MCR 0x80014110 -#pragma Aux_register(0x80014110, name=>"io_uart1_mcr") - -// User extension aux register io_uart1_lsr -#define AR_IO_UART1_LSR 0x80014114 -#pragma Aux_register(0x80014114, name=>"io_uart1_lsr") - -// User extension aux register io_uart1_msr -#define AR_IO_UART1_MSR 0x80014118 -#pragma Aux_register(0x80014118, name=>"io_uart1_msr") - -// User extension aux register io_uart1_usr -#define AR_IO_UART1_USR 0x8001417c -#pragma Aux_register(0x8001417c, name=>"io_uart1_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART2_IO_UART2_PRESENT 1 - -// User extension aux register io_uart2_clken -#define AR_IO_UART2_CLKEN 0x800142c0 -#pragma Aux_register(0x800142c0, name=>"io_uart2_clken") - -// User extension aux register io_uart2_rbr_thr_dll -#define AR_IO_UART2_RBR_THR_DLL 0x80014200 -#pragma Aux_register(0x80014200, name=>"io_uart2_rbr_thr_dll") - -// User extension aux register io_uart2_ier_dlh -#define AR_IO_UART2_IER_DLH 0x80014204 -#pragma Aux_register(0x80014204, name=>"io_uart2_ier_dlh") - -// User extension aux register io_uart2_iir_fcr -#define AR_IO_UART2_IIR_FCR 0x80014208 -#pragma Aux_register(0x80014208, name=>"io_uart2_iir_fcr") - -// User extension aux register io_uart2_lcr -#define AR_IO_UART2_LCR 0x8001420c -#pragma Aux_register(0x8001420c, name=>"io_uart2_lcr") - -// User extension aux register io_uart2_mcr -#define AR_IO_UART2_MCR 0x80014210 -#pragma Aux_register(0x80014210, name=>"io_uart2_mcr") - -// User extension aux register io_uart2_lsr -#define AR_IO_UART2_LSR 0x80014214 -#pragma Aux_register(0x80014214, name=>"io_uart2_lsr") - -// User extension aux register io_uart2_msr -#define AR_IO_UART2_MSR 0x80014218 -#pragma Aux_register(0x80014218, name=>"io_uart2_msr") - -// User extension aux register io_uart2_usr -#define AR_IO_UART2_USR 0x8001427c -#pragma Aux_register(0x8001427c, name=>"io_uart2_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART3_IO_UART3_PRESENT 1 - -// User extension aux register io_uart3_clken -#define AR_IO_UART3_CLKEN 0x800143c0 -#pragma Aux_register(0x800143c0, name=>"io_uart3_clken") - -// User extension aux register io_uart3_rbr_thr_dll -#define AR_IO_UART3_RBR_THR_DLL 0x80014300 -#pragma Aux_register(0x80014300, name=>"io_uart3_rbr_thr_dll") - -// User extension aux register io_uart3_ier_dlh -#define AR_IO_UART3_IER_DLH 0x80014304 -#pragma Aux_register(0x80014304, name=>"io_uart3_ier_dlh") - -// User extension aux register io_uart3_iir_fcr -#define AR_IO_UART3_IIR_FCR 0x80014308 -#pragma Aux_register(0x80014308, name=>"io_uart3_iir_fcr") - -// User extension aux register io_uart3_lcr -#define AR_IO_UART3_LCR 0x8001430c -#pragma Aux_register(0x8001430c, name=>"io_uart3_lcr") - -// User extension aux register io_uart3_mcr -#define AR_IO_UART3_MCR 0x80014310 -#pragma Aux_register(0x80014310, name=>"io_uart3_mcr") - -// User extension aux register io_uart3_lsr -#define AR_IO_UART3_LSR 0x80014314 -#pragma Aux_register(0x80014314, name=>"io_uart3_lsr") - -// User extension aux register io_uart3_msr -#define AR_IO_UART3_MSR 0x80014318 -#pragma Aux_register(0x80014318, name=>"io_uart3_msr") - -// User extension aux register io_uart3_usr -#define AR_IO_UART3_USR 0x8001437c -#pragma Aux_register(0x8001437c, name=>"io_uart3_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_CREG_MST0_IO_CREG_MST0_PRESENT 1 - -// User extension aux register io_creg_mst0_ctrl -#define AR_IO_CREG_MST0_CTRL 0x80018000 -#pragma Aux_register(0x80018000, name=>"io_creg_mst0_ctrl") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_CREG_SLV0_IO_CREG_SLV0_PRESENT 1 - -// User extension aux register io_creg_slv0_obsr -#define AR_IO_CREG_SLV0_OBSR 0x80018080 -#pragma Aux_register(0x80018080, name=>"io_creg_slv0_obsr") -#define APEX_COM_ARC_HARDWARE_DFSS_SUBSYS_BCR_SUBSYS_BCR_PRESENT 1 - -// User extension aux register SUBSYS_BUILD -#define AR_SUBSYS_BUILD 0xf0 -#pragma Aux_register(0xf0, name=>"SUBSYS_BUILD") - -// User extension aux register SUBSYS_DSP_0_BUILD -#define AR_SUBSYS_DSP_0_BUILD 0xa00 -#pragma Aux_register(0xa00, name=>"SUBSYS_DSP_0_BUILD") - -// User extension aux register SUBSYS_DSP_0_CONFIG -#define AR_SUBSYS_DSP_0_CONFIG 0xa02 -#pragma Aux_register(0xa02, name=>"SUBSYS_DSP_0_CONFIG") - -// User extension aux register SUBSYS_IO_0_BUILD -#define AR_SUBSYS_IO_0_BUILD 0xa04 -#pragma Aux_register(0xa04, name=>"SUBSYS_IO_0_BUILD") - -// User extension aux register SUBSYS_IO_1_BUILD -#define AR_SUBSYS_IO_1_BUILD 0xa05 -#pragma Aux_register(0xa05, name=>"SUBSYS_IO_1_BUILD") -#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_PRESENT 1 - -// User extension aux register fpu_build -#define AR_FPU_BUILD 0xc8 -#pragma Aux_register(0xc8, name=>"fpu_build") - -// User extension aux register fpu_ctrl -#define AR_FPU_CTRL 0x300 -#pragma Aux_register(0x300, name=>"fpu_ctrl") - -// User extension aux register fpu_status -#define AR_FPU_STATUS 0x301 -#pragma Aux_register(0x301, name=>"fpu_status") - -// User extension instruction fsmadd -extern long fsmadd(long,long); -#pragma intrinsic(fsmadd,opcode=>6,sub_opcode=>5, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsmsub -extern long fsmsub(long,long); -#pragma intrinsic(fsmsub,opcode=>6,sub_opcode=>6, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsmul -extern long fsmul(long,long); -#pragma intrinsic(fsmul,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsadd -extern long fsadd(long,long); -#pragma intrinsic(fsadd,opcode=>6,sub_opcode=>1, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fssub -extern long fssub(long,long); -#pragma intrinsic(fssub,opcode=>6,sub_opcode=>2, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fcvt32 -extern long fcvt32(long,long); -#pragma intrinsic(fcvt32,opcode=>6,sub_opcode=>8, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsdiv -extern long fsdiv(long,long); -#pragma intrinsic(fsdiv,opcode=>6,sub_opcode=>7, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmp -extern long fscmp(long,long); -#pragma intrinsic(fscmp,opcode=>6,sub_opcode=>3, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmp -extern long fscmp_f(long,long); -#pragma intrinsic(fscmp_f,opcode=>6,sub_opcode=>3, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmpf -extern long fscmpf(long,long); -#pragma intrinsic(fscmpf,opcode=>6,sub_opcode=>4, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmpf -extern long fscmpf_f(long,long); -#pragma intrinsic(fscmpf_f,opcode=>6,sub_opcode=>4, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fssqrt -extern long fssqrt(long); -#pragma intrinsic(fssqrt,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") -#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_DP_ASSIST_PRESENT 1 - -// User extension aux register aux_dpfp1l -#define AR_AUX_DPFP1L 0x302 -#pragma Aux_register(0x302, name=>"aux_dpfp1l") - -// User extension aux register aux_dpfp1h -#define AR_AUX_DPFP1H 0x303 -#pragma Aux_register(0x303, name=>"aux_dpfp1h") - -// User extension aux register aux_dpfp2l -#define AR_AUX_DPFP2L 0x304 -#pragma Aux_register(0x304, name=>"aux_dpfp2l") - -// User extension aux register aux_dpfp2h -#define AR_AUX_DPFP2H 0x305 -#pragma Aux_register(0x305, name=>"aux_dpfp2h") - -// User extension instruction dmulh11 -extern long dmulh11(long,long); -#pragma intrinsic(dmulh11,opcode=>6,sub_opcode=>48,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh11 -extern long dmulh11_f(long,long); -#pragma intrinsic(dmulh11_f,opcode=>6,sub_opcode=>48, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh12 -extern long dmulh12(long,long); -#pragma intrinsic(dmulh12,opcode=>6,sub_opcode=>49,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh12 -extern long dmulh12_f(long,long); -#pragma intrinsic(dmulh12_f,opcode=>6,sub_opcode=>49, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh21 -extern long dmulh21(long,long); -#pragma intrinsic(dmulh21,opcode=>6,sub_opcode=>50,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh21 -extern long dmulh21_f(long,long); -#pragma intrinsic(dmulh21_f,opcode=>6,sub_opcode=>50, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh22 -extern long dmulh22(long,long); -#pragma intrinsic(dmulh22,opcode=>6,sub_opcode=>51,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh22 -extern long dmulh22_f(long,long); -#pragma intrinsic(dmulh22_f,opcode=>6,sub_opcode=>51, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh11 -extern long daddh11(long,long); -#pragma intrinsic(daddh11,opcode=>6,sub_opcode=>52,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh11 -extern long daddh11_f(long,long); -#pragma intrinsic(daddh11_f,opcode=>6,sub_opcode=>52, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh12 -extern long daddh12(long,long); -#pragma intrinsic(daddh12,opcode=>6,sub_opcode=>53,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh12 -extern long daddh12_f(long,long); -#pragma intrinsic(daddh12_f,opcode=>6,sub_opcode=>53, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh21 -extern long daddh21(long,long); -#pragma intrinsic(daddh21,opcode=>6,sub_opcode=>54,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh21 -extern long daddh21_f(long,long); -#pragma intrinsic(daddh21_f,opcode=>6,sub_opcode=>54, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh22 -extern long daddh22(long,long); -#pragma intrinsic(daddh22,opcode=>6,sub_opcode=>55,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh22 -extern long daddh22_f(long,long); -#pragma intrinsic(daddh22_f,opcode=>6,sub_opcode=>55, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh11 -extern long dsubh11(long,long); -#pragma intrinsic(dsubh11,opcode=>6,sub_opcode=>56,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh11 -extern long dsubh11_f(long,long); -#pragma intrinsic(dsubh11_f,opcode=>6,sub_opcode=>56, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh12 -extern long dsubh12(long,long); -#pragma intrinsic(dsubh12,opcode=>6,sub_opcode=>57,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh12 -extern long dsubh12_f(long,long); -#pragma intrinsic(dsubh12_f,opcode=>6,sub_opcode=>57, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh21 -extern long dsubh21(long,long); -#pragma intrinsic(dsubh21,opcode=>6,sub_opcode=>58,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh21 -extern long dsubh21_f(long,long); -#pragma intrinsic(dsubh21_f,opcode=>6,sub_opcode=>58, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh22 -extern long dsubh22(long,long); -#pragma intrinsic(dsubh22,opcode=>6,sub_opcode=>59,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh22 -extern long dsubh22_f(long,long); -#pragma intrinsic(dsubh22_f,opcode=>6,sub_opcode=>59, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dexcl1 -extern long dexcl1(long,long); -#pragma intrinsic(dexcl1,opcode=>6,sub_opcode=>60, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dexcl2 -extern long dexcl2(long,long); -#pragma intrinsic(dexcl2,opcode=>6,sub_opcode=>61, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - - -#endif - - -]]> - - - - - diff --git a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf deleted file mode 100644 index 00cf0a3050b..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf +++ /dev/null @@ -1,50 +0,0 @@ - # SYSTEM memory regions indicate where external memory might be located. - # The TCF has no specific knowledge of whether SYSTEM regions contain - # external memory or not. - # CCMWRAP memory regions indicate unusable portions of the address space - # due to CCM memory wrapping into upper addresses beyond its size - - MEMORY { - ICCM0 : ORIGIN = 0x00000000, LENGTH = 0x00010000 - # CCMWRAP0: ORIGIN = 0x00010000, LENGTH = 0x0fff0000 - ICCM1 : ORIGIN = 0x10000000, LENGTH = 0x00080000 - # CCMWRAP1: ORIGIN = 0x10080000, LENGTH = 0x0ff80000 - # SYSTEM0 : ORIGIN = 0x20000000, LENGTH = 0x60000000 - DCCM : ORIGIN = 0x80000000, LENGTH = 0x00080000 - # CCMWRAP2: ORIGIN = 0x80080000, LENGTH = 0x0ff80000 - XCCM : ORIGIN = 0x90000000, LENGTH = 0x00008000 - # CCMWRAP3: ORIGIN = 0x90008000, LENGTH = 0x0fff8000 - YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00008000 - # CCMWRAP4: ORIGIN = 0xa0008000, LENGTH = 0x0fff8000 - # SYSTEM1 : ORIGIN = 0xb0000000, LENGTH = 0x50000000 - } - SECTIONS { - GROUP BLOCK(4): { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:132): {} - .text? : { *('.text$crt*') } - * (TEXT): {} - * (LIT): {} - .rodata_in_data?:{} - } > ICCM1 - - GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - .protobuf?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - .tensor_arena?: {} - } > DCCM - GROUP BLOCK(4): { - .Xdata? : {} - } > XCCM - GROUP BLOCK(4): { - .Ydata? : {} - } > YCCM - } - - - From 5b2f6d322cb4943548935b0fc52b528e18c4ad7d Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 30 Apr 2020 10:56:08 +0300 Subject: [PATCH 050/557] Cases with channel multiplier for DW conv (int8) temporarily fallback to reference code --- tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 081a40b23b5..2aad76bc042 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -69,8 +69,14 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, const TfLiteDepthwiseConvParams* params) { const auto* affine_quantization = reinterpret_cast(filter->quantization.params); + const int in_ch = SizeOfDimension(input, 3); + const int filters_num = SizeOfDimension(filter, 3); + // MLI optimized version only supports int8 dataype, dilation factor of 1 and // per-axis quantization of weights (no broadcasting/per-tensor) + // TODO: ((in_ch == filters_num) || (in_ch == 1)) is a forbidding of + // channel multiplier logic for multichannel input. + // To be removed after it will be supported in MLI bool ret_val = (filter->type == kTfLiteInt8) && (input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && @@ -78,6 +84,7 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, (params->dilation_height_factor == 1) && (affine_quantization->scale->size == filter->dims->data[kDepthwiseConvQuantizedDimension]) && + ((in_ch == filters_num) || (in_ch == 1)) && affine_quantization->scale->size <= (kMaxChannels * 2); return ret_val; } From ea1a6715ef2fc136b06986cdade85f6a084855be Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 1 May 2020 13:46:45 +0300 Subject: [PATCH 051/557] ARC related documentation in readme files --- .../lite/micro/examples/hello_world/README.md | 45 ++++ .../micro/examples/micro_speech/README.md | 51 +++++ .../micro/examples/person_detection/README.md | 52 +++++ .../person_detection_experimental/README.md | 54 +++++ .../lite/micro/kernels/arc_mli/README.md | 57 +++++ .../micro/tools/make/targets/arc/README.md | 214 ++++++++++++++++++ .../make/templates/arc/README_ARC.md.tpl | 45 +++- .../templates/arc/README_ARC_EMSDP.md.tpl | 48 +++- 8 files changed, 564 insertions(+), 2 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/arc_mli/README.md create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/README.md diff --git a/tensorflow/lite/micro/examples/hello_world/README.md b/tensorflow/lite/micro/examples/hello_world/README.md index 3f3fef67f28..a0a2b678157 100644 --- a/tensorflow/lite/micro/examples/hello_world/README.md +++ b/tensorflow/lite/micro/examples/hello_world/README.md @@ -15,6 +15,7 @@ animation. ## Table of contents - [Understand the model](#understand-the-model) +- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) - [Deploy to Arduino](#deploy-to-arduino) - [Deploy to ESP32](#deploy-to-esp32) - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) @@ -30,6 +31,50 @@ Walk through this tutorial to understand what the model does, how it works, and how it was converted for use with TensorFlow Lite for Microcontrollers. +## Deploy to ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) board. General information and instructions on using the board with TensorFlow Lite Micro can be found in the common [ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +### Initial Setup + +Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +The example project for ARC EM SDP platform can be generated with the following command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_hello_world_make_project + +### Build and Run Example + +For more detailed information on building and running examples see the appropriate sections of general descriptions of the [ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). In the directory with generated project you can also find a *README_ARC_EMSDP.md* file with instructions and options on building and running. Here we only briefly mention main steps which are typically enough to get it started. + +1. You need to [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) and open an serial connection. + +2. Go to the generated example project director + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/hello_world/make + +3. Build the example using + + make app + +4. To generate artefacts for self-boot of example from the board use + + make flash + +5. To run application from the board using microSD card: + * Copy the content of the created /bin folder into the root of microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) + * Plug in the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside RST button, push the CFG button. + +6. If you have the MetaWare Debugger installed in your environment: + * To run application from the console using it type `make run`. + * To stop the execution type `Ctrl+C` in the console several times. + +In both cases (step 5 and 6) you will see the application output in the serial terminal. + ## Deploy to Arduino The following instructions will help you build and deploy this sample diff --git a/tensorflow/lite/micro/examples/micro_speech/README.md b/tensorflow/lite/micro/examples/micro_speech/README.md index 7ccaa806366..ba55a7d8493 100644 --- a/tensorflow/lite/micro/examples/micro_speech/README.md +++ b/tensorflow/lite/micro/examples/micro_speech/README.md @@ -16,6 +16,7 @@ kilobytes of Flash. ## Table of contents +- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) - [Deploy to Arduino](#deploy-to-arduino) - [Deploy to ESP32](#deploy-to-esp32) - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) @@ -25,6 +26,56 @@ kilobytes of Flash. - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) - [Train your own model](#train-your-own-model) +## Deploy to ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) board. General information and instructions on using the board with TensorFlow Lite Micro can be found in the common [ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +This example is quantized with symmetric uint8 scheme. As noted in [kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md), embARC MLI supports optimized kernels for int8 quantization only. Therefore, this example will only use TFLM reference kernels. + +The ARC EM SDP board contains the rich set of extension interfaces. You can choose any compatible microphone and modify [audio_provider.cc](/tensorflow/lite/micro/examples/micro_speech/audio_provider.cc) file accordingly to use input from your specific camera. By default, results of running this example are printed to the console. If you would like to instead implement some target-specific actions, you need to modify [command_responder.cc](/tensorflow/lite/micro/examples/micro_speech/command_responder.cc) accordingly. + +The reference implementations of these files are used by default on the EM SDP. + +### Initial setup + +Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +As default example doesn’t provide any output without real audio, it is recommended to get started with example for mock data. The project for ARC EM SDP platform can be generated with the following command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_micro_speech_mock_make_project + +### Build and Run Example + +For more detailed information on building and running examples see the appropriate sections of general descriptions of the [ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). In the directory with generated project you can also find a *README_ARC_EMSDP.md* file with instructions and options on building and running. Here we only briefly mention main steps which are typically enough to get it started. + +1. You need to [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) and open an serial connection. + +2. Go to the generated example project director + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make` + +3. Build the example using + + make app + +4. To generate artefacts for self-boot of example from the board use + + make flash + +5. To run application from the board using microSD card: + * Copy the content of the created /bin folder into the root of microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) + * Plug in the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside RST button, push the CFG button. + +6. If you have the MetaWare Debugger installed in your environment: + * To run application from the console using it type `make run`. + * To stop the execution type `Ctrl+C` in the console several times. + +In both cases (step 5 and 6) you will see the application output in the serial terminal. + ## Deploy to Arduino The following instructions will help you build and deploy this sample diff --git a/tensorflow/lite/micro/examples/person_detection/README.md b/tensorflow/lite/micro/examples/person_detection/README.md index 5ee7bda9914..ae47c4be0ff 100644 --- a/tensorflow/lite/micro/examples/person_detection/README.md +++ b/tensorflow/lite/micro/examples/person_detection/README.md @@ -6,6 +6,7 @@ run on systems with small amounts of memory such as microcontrollers and DSPs. ## Table of contents - [Getting started](#getting-started) +- [Running on ARC EM SDP](#running-on-arc-em-sdp) - [Running on Arduino](#running-on-arduino) - [Running on ESP32](#running-on-esp32) - [Running on SparkFun Edge](#running-on-sparkfun-edge) @@ -13,6 +14,57 @@ run on systems with small amounts of memory such as microcontrollers and DSPs. - [Debugging image capture](#debugging-image-capture) - [Training your own model](#training-your-own-model) +## Running on ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) board. General information and instructions on using the board with TensorFlow Lite Micro can be found in the common [ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +This example is quantized with symmetric uint8 scheme. As noted in [kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md), embARC MLI supports optimized kernels for int8 quantization only. Therefore, this example will only use TFLM reference kernels. + +The ARC EM SDP board contains the reach set of extension interfaces. +You can choose any compatible camera and modify [image_provider.cc](/tensorflow/lite/micro/examples/person_detection/image_provider.cc) file accordingly to use input from your specific camera. By default, results of running this example are printed to the console. If you would like to instead implement some target-specific actions, you need to modify [detection_responder.cc](/tensorflow/lite/micro/examples/person_detection/detection_responder.cc) accordingly. + +The reference implementations of these files are used by default on the EM SDP. + +### Initial setup + +Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +The example project for ARC EM SDP platform can be generated with the following command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_make_project ` + +### Build and Run Example + +For more detailed information on building and running examples see the appropriate sections of general descriptions of the [ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). In the directory with generated project you can also find a *README_ARC_EMSDP.md* file with instructions and options on building and running. Here we only briefly mention main steps which are typically enough to get it started. + +1. You need to [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) and open an serial connection. + +2. Go to the generated example project director + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make` + +3. Build the example using + + make app + +4. To generate artefacts for self-boot of example from the board use + + make flash + +5. To run application from the board using microSD card: + * Copy the content of the created /bin folder into the root of microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) + * Plug in the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside RST button, push the CFG button. + +6. If you have the MetaWare Debugger installed in your environment: + * To run application from the console using it type `make run`. + * To stop the execution type `Ctrl+C` in the console several times. + +In both cases (step 5 and 6) you will see the application output in the serial terminal. + ## Running on Arduino The following instructions will help you build and deploy this sample diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/README.md b/tensorflow/lite/micro/examples/person_detection_experimental/README.md index d8aaa9ba383..af0186fb276 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/README.md +++ b/tensorflow/lite/micro/examples/person_detection_experimental/README.md @@ -7,12 +7,66 @@ This uses the experimental int8 quantized version of the person detection model. ## Table of contents - [Getting started](#getting-started) +- [Running on ARC EM SDP](#running-on-arc-em-sdp) - [Running on Arduino](#running-on-arduino) - [Running on SparkFun Edge](#running-on-sparkfun-edge) - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) - [Debugging image capture](#debugging-image-capture) - [Training your own model](#training-your-own-model) + +## Running on ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) board. General information and instructions on using the board with TensorFlow Lite Micro can be found in the common [ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +This example uses asymmetric int8 quantization and can therefore leverage optimized int8 kernels from the embARC MLI library + +The ARC EM SDP board contains a rich set of extension interfaces. +You can choose any compatible camera and modify [image_provider.cc](/tensorflow/lite/micro/examples/person_detection_experimental/image_provider.cc) file accordingly to use input from your specific camera. By default, results of running this example are printed to the console. If you would like to instead implement some target-specific actions, you need to modify [detection_responder.cc](/tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.cc) accordingly. + +The reference implementations of these files are used by default on the EM SDP. + +### Initial setup + +Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +The example project for ARC EM SDP platform can be generated with the following command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project + +### Build and Run Example + +For more detailed information on building and running examples see the appropriate sections of general descriptions of the [ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). In the directory with generated project you can also find a *README_ARC_EMSDP.md* file with instructions and options on building and running. Here we only briefly mention main steps which are typically enough to get it started. + +1. You need to [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) and open an serial connection. + +2. Go to the generated example project director + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make` + +3. Build the example using + + make app + +4. To generate artefacts for self-boot of example from the board use + + make flash + +5. To run application from the board using microSD card: + * Copy the content of the created /bin folder into the root of microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) + * Plug in the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside RST button, push the CFG button. + +6. If you have the MetaWare Debugger installed in your environment: + * To run application from the console using it type `make run`. + * To stop the execution type `Ctrl+C` in the console several times. + +In both cases (step 5 and 6) you will see the application output in the serial terminal. + + ## Running on Arduino The following instructions will help you build and deploy this sample diff --git a/tensorflow/lite/micro/kernels/arc_mli/README.md b/tensorflow/lite/micro/kernels/arc_mli/README.md new file mode 100644 index 00000000000..2b2e194e757 --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc_mli/README.md @@ -0,0 +1,57 @@ +# EmbARC MLI Library Based Optimizations of TensorFlow Lite Micro Kernels for ARC Platforms. + +This folder contains kernel implementations which use optimized [embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli). It allows acceleration of inference operations which use int8 (asymmetric quantization). + +## Usage + +embARC MLI Library is used by default to speed up execution of some kernels for asymmetrically quantized layers. This means that usual project generation for ARC specific target implies usage of embARC MLI. + +For example: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project + +In case MLI implementation can’t be used, kernels in this folder fallback to TFLM reference implementations. For applications which may not benefit from MLI library, projects can be generated without these implementations by adding `TAGS=no_arc_mli` in the command line, which can reduce overall code size: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_int8_make_project + +For ARC EM SDP board, a pre-compiled MLI library is downloaded and used in the application. For a custom target ARC-based platform, MLI sources are downloaded and compiled during project generation phase. To build library from sources for ARC EM SDP platform, add `BUILD_ARC_MLI=true` option to make command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp BUILD_ARC_MLI=true generate_person_detection_int8_make_project. + +If an application exclusively uses accelerated MLI kernel implementations, one can strip out TFLM reference kernel implementations to reduce code size of application. Build application with `MLI_ONLY=true` option in generated project (after the project was built): + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make + + make app MLI_ONLY=true + +if you try this and application execution fails, then most probably MLI can’t be used for some nodes and you need to revert to using TFLM reference kernels. + + +## Limitations + +Currently, the MLI Library provides optimized implementation only for int8 (asymmetric) versions of the following kernels: +1. Convolution 2D – Per axis quantization only, `dilation_ratio==1` +2. Depthwise Convolution 2D – Per axis quantization only, `dilation_ratio==1` +3. Average Pooling +4. Max Pooling +5. Fully Connected + +Currently only [/tensorflow/lite/micro/examples/person_detection_experimental](/tensorflow/lite/micro/examples/person_detection_experimental) is quantized using this specification. Other examples can be executed on ARC-based targets, but will only use reference kernels. + + +## Scratch Buffers and Slicing + +The following information applies only for ARC EM SDP and other targets with XY memory. embARC MLI uses specific optimizations which assumes node operands are in XY memory and/or DCCM (Data Closely Coupled Memory). As operands might be quite big and may not fit in available XY memory, special slicing logic is applied which allows kernel calculations to be split into multiple parts. For this reason, internal static buffers are allocated in these X, Y and DCCM memory banks and used to execute sub-calculations. + +All this is performed automatically and invisible to the user. Half of the DCCM memory bank and the full XY banks are occupied for MLI specific needs. If the user needs space in XY memory for other tasks, these arrays can be reduced by setting specific sizes. For this, add the following option to build command replacing **** with required values: + + EXT_CFLAGS=”-DSCRATCH_MEM_Z_SIZE= -DSCRATCH_MEM_X_SIZE= -DSCRATCH_MEM_Y_SIZE=” + +For example, to reduce sizes of arrays placed in DCCM and XCCM to 32k and 8k respectively, use next command: + + make app EXT_CFLAGS=”-DSCRATCH_MEM_Z_SIZE=32*1024 -DSCRATCH_MEM_X_SIZE=8*1024” + + +## License + +TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package. diff --git a/tensorflow/lite/micro/tools/make/targets/arc/README.md b/tensorflow/lite/micro/tools/make/targets/arc/README.md new file mode 100644 index 00000000000..8d20a4681ff --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/README.md @@ -0,0 +1,214 @@ +# Building TensorFlow Lite for Microcontrollers for Synopsys DesignWare ARC EM/HS Processors + +This document contains the general information on building and running TensorFlow Lite Micro for targets based on the Synopsys ARC EM/HS Processors. + +## Table of Contents + +- [Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit) +- [ARC EM Software Development Platform (ARC EM SDP)](#ARC-EM-Software-Development-Platform-ARC-EM-SDP) +- [Custom ARC EM or HS Platform](#Custom-ARC-EMHS-Platform) + + +## Install the Synopsys DesignWare ARC MetaWare Development Toolkit + +The Synopsys DesignWare ARC MetaWare Development Toolkit (MWDT) is required to build and run Tensorflow Lite Micro applications for all ARC EM/HS targets. + +To license MWDT, please see further details [here](https://www.synopsys.com/dw/ipdir.php?ds=sw_metaware) + +To request an evaluation version of MWDT, please use the [Synopsys Eval Portal](https://eval.synopsys.com/) and follow the link for the MetaWare Development Toolkit (Important: Do not confuse this with MetaWare EV Development Toolkit or MetaWare Lite options also available on this page) + +Run the downloaded installer and follow the instructions to set up the toolchain on your platform. + +TensorFlow Lite for Microcontrollers builds are divided into two phases: Application Project Generation and Application Project Building/Running. The former phase requires \*nix environment while the latter does not. + +For basic project generation targeting [ARC EM Software Development Platform](#ARC-EM-Software-Development-Platform-ARC-EM-SDP), MetaWare is NOT required for the Project Generation Phase. However, it is required in case the following: +- For project generation for custom (not EM SDP) targets +- To build microlib target library with all required TFLM objects for external use + +Please consider the above when choosing whether to install Windows or Linux or both versions of MWDT + + +## ARC EM Software Development Platform (ARC EM SDP) + +This section describes how to deploy on an [ARC EM SDP board](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) + +### Initial Setup + +To use the EM SDP, you need the following hardware and software: + +#### ARC EM SDP +More information on the platform, including ordering information, can be found [here](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform). + +#### MetaWare Development Toolkit +See [Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit) section for instructions on toolchain installation. + +#### Digilent Adept 2 System Software Package +If you wish to use the MetaWare Debugger to debug your code, you need to also install the Digilent Adept 2 software, which includes the necessary drivers for connecting to the targets. This is available from oficial [Digilent site](https://reference.digilentinc.com/reference/software/adept/start?redirect=1#software_downloads). You should install the “System” component, and Runtime. Utilities and SDK are NOT required. + +Digilent installation is NOT required if you plan to deploy to EM SDP via the SD card instead of using the debugger. + +#### Make Tool +A `'make'` tool is required for both phases of deploying Tensorflow Lite Micro applications on ARC EM SDP: +1. Application project generation +2. Working with generated application (build and run) + +For the first phase you need an environment and make tool compatible with Tensorflow Lite for Micro build system. At the moment of this writing, this requires make >=3.82 and a *nix-like environment which supports shell and native commands for file manipulations. MWDT toolkit is not required for this phase. + +For the second phase, requirements are less strict. The gmake version delivered with MetaWare Development Toolkit is sufficient. There are no shell and *nix command dependencies, so Windows can be used + + +#### Serial Terminal Emulation Application +The Debug UART port of the EM SDP is used to print application output. The USB connection provides both the debug channel and RS232 transport. You can use any terminal emulation program (like [PuTTY](https://www.putty.org/)) to view UART output from the EM SDP. + +#### microSD Card +If you want to self-boot your application (start it independently from a debugger connection), you also need a microSD card with a minimum size of 512 MB and a way to write to the card from your development host + +### Connect the Board + +1. Make sure Boot switches of the board (S3) are configured in the next way: + +| Switch # | Switch position | +| :-------: | :----------------: | +| 1 | Low (0) | +| 2 | Low (0) | +| 3 | High (1) | +| 4 | Low (0) | + + +2. Connect the power supply included in the product package to the ARC EM SDP. +3. Connect the USB cable to connector J10 on the ARC EM SDP (near the RST and CFG buttons) and to an available USB port on your development host. +4. Determine the COM port assigned to the USB Serial Port (on Windows, using Device Manager is an easy way to do this) +5. Execute the serial terminal application you installed in the previous step and open the serial connection with the early defined COM port (speed 115200 baud; 8 bits; 1 stop bit; no parity). +6. Push the CFG button on the board. After a few seconds you should see the boot log in the terminal which begins as follows: + +``` +U-Boot + +CPU: ARC EM11D v5.0 at 40 MHz +Subsys:ARC Data Fusion IP Subsystem +Model: snps,emsdp +Board: ARC EM Software Development Platform v1.0 +… +``` + +### Generate Application Project for ARC EM SDP + +Before building an example or test application, you need to generate a TFLM project for this application from TensorFlow sources and external dependencies. To generate it for ARC EM SDP board you need to set `TARGET=arc_emsdp` on the make command line. For instance, to build the Person Detect test application, use a shell to execute the following command from the root directory of the TensorFlow repo: + + make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET=arc_emsdp + +The application project will be generated into *tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_test_int8/make* + +Info on generating and building example applications for EM SDP (*tensorflow/lite/micro/examples*) can be found in the appropriate readme file placed in the same directory with the examples. In general, it’s the same process which described in this Readme. + +The [embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli) is used by default to speed up execution of some kernels for asymmetrically quantized layers. Kernels which use MLI-based implementations are kept in the *tensorflow/lite/micro/kernels/arc_mli* folder. For applications which may not benefit from MLI library, the project can be generated without these implementations by adding `TAGS=no_arc_mli` in the command line. This can reduce code size when the optimized kernels are not required. + +For more options on embARC MLI usage see [kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md). + +### Build the Application + +You may need to adjust the following commands in order to use the appropriate make tool available in your environment (ie: `make` or `gmake`) + +1. Open command shell and change the working directory to the location which contains the generated project, as described in the previous section + +2. Clean previous build artifacts (optional) + + make clean + +3. Build application + + make app + +### Run the Application on the Board Using MetaWare Debugger + +In case you do not have access to the MetaWare Debugger or have chosen not to install the Digilent drivers, you can skip to the next section. + +To run the application from the console, use the following command: + + make run + +If application runs in an infinite loop, type `Ctrl+C` several times to exit the debugger. + +To run the application in the GUI debugger, use the following command: + + make debug + +In both cases you will see the application output in the serial terminal. + +### Run the Application on the Board from the microSD Card + +1. Use the following command in the same command shell you used for building the application, as described in the previous step + + make flash + +2. Copy the content of the created *./bin* folder into the root of microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) +3. Plug in the microSD card into the J11 connector. +4. Push the RST button. If a red LED is lit beside RST button, push the CFG button. + +You will see the application output in the serial terminal. + + + +## Custom ARC EM/HS Platform + +This section describes how to deploy on a Custom ARC EM/HS platform defined only by a TCF (Tool Configuration File, created at CPU configuration time) and optional LCF (Linker Command File). In this case, the real hardware is unknown, and applications can be run only in the nSIM simulator included with the MetaWare toolkit + +### Initial Setup + +To with custom ARC EM/HS platform, you need the following : +* Synopsys MetaWare Development Toolkit version 2019.12 or higher +* Make tool (make or gmake) + +See [Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit) section for instructions on toolchain installation. +See [MetaWare Development Toolkit](#MetaWare-Development-Toolkit) and [Make Tool](#Make-Tool) sections for instructions on toolchain installation and comments about make versions. + +### Generate Application Project + +Before building the application itself, you need to generate the project for this application from TensorFlow sources and external dependencies. To generate it for a custom TCF you need to set the following variables in the make command line: +* TARGET_ARCH=arc +* TCF_FILE= +* (optional) LCF_FILE= + +If you don’t supply an external LCF, the one embedded in the TCF will be used instead + +For instance, to build **Person Detection** test application, use the following command from the root directory of the TensorFlow repo: + + make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET_ARCH=arc TCF_FILE= LCF_FILE= + +The application project will be generated into *tensorflow/lite/micro/tools/make/gen/_arc/prj/person_detection_test_int8/make* + +The [embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli) is used by default to speed up execution of some kernels for asymmetrically quantized layers. Kernels which use MLI-based implementations are kept in the *tensorflow/lite/micro/kernels/arc_mli* folder. For applications which may not benefit from MLI library, the project can be generated without these implementations by adding `TAGS=no_arc_mli` in the command line. This can reduce code size when the optimized kernels are not required. + +For more options on embARC MLI usage see [kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md). + +### Build the Application + +You may need to adjust the following commands in order to use the appropriate make tool available in your environment (ie: `make` or `gmake`) + +1. Open command shell and change the working directory to the location which contains the generated project, as described in the previous section + +2. Clean previous build artifacts (optional) + + make clean + +3. Build application + + make app + +### Run the Application with MetaWare Debugger on the nSim Simulator. + +To run application from the console, use the following command: + + make run + +If application runs in an infinite loop, type `Ctrl+C` several times to exit the debugger. + +To run the application in the GUI debugger, use the following command: + + make debug + +You will see the application output in the same console where you ran it. + +## License + +TensorFlow's code is covered by the Apache2 License included in the repository, and third-party dependencies are covered by their respective licenses, in the third_party folder of this package. diff --git a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl index b722b9c441d..0ddaf3e0a81 100644 --- a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl +++ b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl @@ -1,2 +1,45 @@ -# Mock Project Readme for common ARC target +# TensorFlow Lite Micro ARC Make Project +This folder has been autogenerated by TensorFlow, and contains sources, headers, and project files needed to build a single TensorFlow Lite Micro application using make tool and a Synopsys DesignWare ARC processor compatible toolchain, specifically the ARC MetaWare Development Toolkit (MWDT). + +This project has been generated for a target defined by TCF file only (Tool Configuration File). The real target board is unspecified, and applications can be run only in the nSIM simulator included with MWDT. + +See +[tensorflow/lite/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro) +for details on how projects like this can be generated from the main source tree. + +## Usage + +See [Custom ARC EM/HS Platform](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#Custom-ARC-EMHS-Platform) section for more detailed information on requirements and usage of this project. + +The Makefile contains all the information on building and running the project. One can modify it to satisfy specific needs. Next actions are available out of the box. You may need to adjust the following commands in order to use the appropriate make tool available in your environment, ie: `make` or `gmake` + +1. Build the application. + + make app + +2. Build the application passing additional flags to compiler. + + make app EXT_CFLAGS=[additional compiler flags] + +3. Build the application and stripout TFLM reference kernel fallback implementations in order to reduce code size. This only has an effect in case the project was generated with MLI support. See more info in [EmbARC MLI Library Based Optimizations](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/kernels/arc_mli/README.md). `false` is the default value. + + make app MLI_ONLY=[true|false] + +4. Delete all artifacts created during build. + + make clean + +5. Run the application with the nSIM simulator in console mode. + + make run + +6. Run the application with the nSIM simulator, but using the MetaWare Debugger GUI for further execution/debugging capabilities. + + make debug + + + +## License + +TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package. diff --git a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl index b3d9257f4d2..9d2801ed6b7 100644 --- a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl +++ b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl @@ -1,2 +1,48 @@ -# Mock Project Readme for ARC EMSDP target +# TensorFlow Lite Micro ARC Make Project for EM SDP Board. +This folder has been autogenerated by TensorFlow, and contains source, header, and project files needed to build a single TensorFlow Lite Micro target using make tool and and a Synopsys DesignWare ARC processor compatible toolchain, specifically the ARC MetaWare Development Toolkit (MWDT). + +This project has been generated for the ARC EM Software Development Platform (EM SDP). The built application can be run only on this platform. + +See +[tensorflow/lite/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro) +for details on how projects like this can be generated from the main source tree. + +## Usage + +See [ARC EM Software Development Platform](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) section for more detailed information on requirements and usage of this project. + +The Makefile contains all the information on building and running the project. One can modify it to satisfy specific needs. Next actions are available out of the box. You may need to adjust the following commands in order to use the appropriate make tool available in your environment, ie: `make` or `gmake`: + +1. Build the application. + + make app + +2. Build the application passing additional flags to compiler. + + make app EXT_CFLAGS=[additional compiler flags] + +3. Build the application and stripout TFLM reference kernel fallback implementations in order to reduce code size. This only has an effect in case the project was generated with MLI support. See more info in [EmbARC MLI Library Based Optimizations](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/kernels/arc_mli/README.md). `false` is the default value. + + make app MLI_ONLY=[true|false] + +4. Delete all artifacts created during build. + + make clean + +5. Run the application with the nSIM simulator in console mode. + + make run + +6. Load the application and open MetaWare Debugger GUI for further execution/debugging. + + make debug + +7. Generate necessary artefacts for self-booting execution from flash. See [reference to Run the application on the board from the micro SD card](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#Run-the-Application-on-the-Board-from-the-microSD-Card). + + make flash + + +## License + +TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package. From 754e0d967f131165badc7d28b41bf6ad3f7c9132 Mon Sep 17 00:00:00 2001 From: Rishit Dagli <39672672+Rishit-dagli@users.noreply.github.com> Date: Sat, 2 May 2020 09:25:13 +0530 Subject: [PATCH 052/557] Added in resources section Added Coursera course Machine Learning with TensorFlow on GCP --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 27032043e07..d1bc88b8dbc 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,7 @@ Build Type | Status * [Getting Started with TensorFlow 2 from Coursera](https://www.coursera.org/learn/getting-started-with-tensor-flow2) * [Intro to TensorFlow for Deep Learning from Udacity](https://www.udacity.com/course/intro-to-tensorflow-for-deep-learning--ud187) * [Introduction to TensorFlow Lite from Udacity](https://www.udacity.com/course/intro-to-tensorflow-lite--ud190) +* [Machine Learning with TensorFLow on GCP](https://www.coursera.org/specializations/machine-learning-tensorflow-gcp) * [TensorFlow Blog](https://blog.tensorflow.org) * [Learn ML with TensorFlow](https://www.tensorflow.org/resources/learn-ml) * [TensorFlow Twitter](https://twitter.com/tensorflow) From 74b9f9dcc9e7bfaf1a72ddab5a6711d748e6fbf8 Mon Sep 17 00:00:00 2001 From: Marcin Sielski Date: Sun, 3 May 2020 13:31:57 +0200 Subject: [PATCH 053/557] Cross and native compilation of TFLite for RPI Why: * Describe correct cross and native compilation process for RPI. This change addresses the need by: * Updates in instruction for cross compilation. * Alignement text style across whole instruction. --- tensorflow/lite/g3doc/guide/build_rpi.md | 104 ++++++++++++----------- 1 file changed, 53 insertions(+), 51 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/build_rpi.md b/tensorflow/lite/g3doc/guide/build_rpi.md index 1e04ee77a0e..a1724258118 100644 --- a/tensorflow/lite/g3doc/guide/build_rpi.md +++ b/tensorflow/lite/g3doc/guide/build_rpi.md @@ -5,87 +5,89 @@ Raspberry Pi. If you just want to start using TensorFlow Lite to execute your models, the fastest option is to install the TensorFlow Lite runtime package as shown in the [Python quickstart](python.md). -Note: This page shows how to compile only the C++ static library for +**Note:** This page shows how to compile only the C++ static library for TensorFlow Lite. Alternative install options include: [install just the Python interpreter API](python.md) (for inferencing only); [install the full TensorFlow package from pip](https://www.tensorflow.org/install/pip); or [build the full TensorFlow package]( https://www.tensorflow.org/install/source_rpi). - ## Cross-compile for Raspberry Pi -This has been tested on Ubuntu 16.04.3 64bit and TensorFlow devel docker image +Instruction has been tested on Ubuntu 16.04.3 64-bit PC (AMD64) and TensorFlow devel +docker image [tensorflow/tensorflow:nightly-devel](https://hub.docker.com/r/tensorflow/tensorflow/tags/). -To cross compile TensorFlow Lite, first install the toolchain and libs: +To cross compile TensorFlow Lite follow the steps: -```bash -sudo apt-get update -sudo apt-get install crossbuild-essential-armhf -# The following is only needed for Pi Zero build. -sudo apt-get install crossbuild-essential-armel -``` +1. Clone official Raspberry Pi cross-compilation toolchain: -If you are using Docker, you may not use `sudo`. + ```bash + git clone --depth 1 https://github.com/raspberrypi/tools.git rpi_tools + ``` -Now git-clone the TensorFlow repository -(`https://github.com/tensorflow/tensorflow`)—if you're using the TensorFlow -Docker image, the repo is already provided in `/tensorflow_src/`—and then run -this script at the root of the TensorFlow repository to download all the +2. Clone TensorFlow repository: + + ```bash + git clone --depth 1 https://github.com/tensorflow/tensorflow.git tensorflow_src + + ``` + + **Note:** If you're using the TensorFlow Docker image, the repo is already provided in `/tensorflow_src/`. + +3. Run following script at the root of the TensorFlow repository to download all the build dependencies: -```bash -./tensorflow/lite/tools/make/download_dependencies.sh -``` + ```bash + cd tensor_src && ./tensorflow/lite/tools/make/download_dependencies.sh + ``` -Note that you only need to do this once. + **Note:** You only need to do this once. -You should then be able to compile: +4. To build ARMv7 binary for Raspberry Pi 2, 3 and 4 execute: -To build ARMv7 binary for Raspberry Pi 2, 3 and 4: + ```bash + PATH=$PATH:../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/ ./tensorflow/lite/tools/make/build_rpi_lib.sh + ``` -```bash -./tensorflow/lite/tools/make/build_rpi_lib.sh -``` + **Note:** This should compile a static library in: + `tensorflow/lite/tools/make/gen/rpi_armv7l/lib/libtensorflow-lite.a`. -This should compile a static library in: -`tensorflow/lite/tools/make/gen/rpi_armv7l/lib/libtensorflow-lite.a`. +5. To build ARMv6 binary for Raspberry Pi Zero execute: -To build ARMv6 binary for Raspberry Pi Zero: + ```bash + PATH=$PATH:../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/ ./tensorflow/lite/tools/make/build_rpi_lib.sh TARGET_ARCH=armv6 + ``` -```bash -./tensorflow/lite/tools/make/build_rpi_lib.sh TARGET_ARCH=armv6 -``` - -This should compile a static library in: -`tensorflow/lite/tools/make/gen/rpi_armv6/lib/libtensorflow-lite.a`. + **Note:** This should compile a static library in: + `tensorflow/lite/tools/make/gen/rpi_armv6/lib/libtensorflow-lite.a`. ## Compile natively on Raspberry Pi -This has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1). +Instruction has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1): -Log in to your Raspberry Pi and install the toolchain: +To natively compile TensorFlow Lite follow the steps: -```bash -sudo apt-get install build-essential -``` +1. Log in to your Raspberry Pi and install the toolchain: -Now git-clone the TensorFlow repository -(`https://github.com/tensorflow/tensorflow`) and run this at the root of -the repository: + ```bash + sudo apt-get install build-essential + ``` -```bash -./tensorflow/lite/tools/make/download_dependencies.sh -``` +2. Run following script at the root of the TensorFlow repository to download all the +build dependencies: -Note that you only need to do this once. + ```bash + cd tensor_src && ./tensorflow/lite/tools/make/download_dependencies.sh + ``` -You should then be able to compile: + **Note:** You only need to do this once. -```bash -./tensorflow/lite/tools/make/build_rpi_lib.sh -``` +3. You should then be able to compile TensorFlow Lite with: -This should compile a static library in: -`tensorflow/lite/tools/make/gen/lib/rpi_armv7/libtensorflow-lite.a`. + ```bash + ./tensorflow/lite/tools/make/build_rpi_lib.sh + ``` + + **Note:** This should compile a static library in: + `tensorflow/lite/tools/make/gen/lib/rpi_armv7/libtensorflow-lite.a`. From b9579f96bd07d3016285128e1e2466540b47bf01 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Fri, 27 Mar 2020 14:20:09 -0700 Subject: [PATCH 054/557] Vectorize transpose --- tensorflow/core/kernels/conv_2d_gpu.h | 91 +++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h index 31abe9dfead..90d85e6f04e 100644 --- a/tensorflow/core/kernels/conv_2d_gpu.h +++ b/tensorflow/core/kernels/conv_2d_gpu.h @@ -210,6 +210,57 @@ __global__ void ShuffleInTensor3Simple(int nthreads, } } +constexpr int kUnroll = 4; + +template +__global__ void ShuffleInTensor3SimpleVector(int nthreads, + const T* __restrict__ input, + Dimension<3> input_dims, + T* __restrict__ output) { + Dimension<3> output_dims; + output_dims[sp0] = input_dims[0]; + output_dims[sp1] = input_dims[1]; + output_dims[sp2] = input_dims[2]; + + const int stride = blockDim.x * gridDim.x * kUnroll; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + T buf[kUnroll]; + + int output_index; + for (output_index = tid * kUnroll; output_index + kUnroll - 1 < nthreads; + output_index += stride) { +#pragma unroll + for (int i = 0; i < kUnroll; i++) { + int output_index_i = output_index + i; + Index<3> output_tensor_index = FlatToTensorIndex(output_index_i, + output_dims); + Index<3> input_tensor_index; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; + + int input_index_i = TensorIndexToFlat(input_tensor_index, input_dims); + buf[i] = maybe_conj::run(ldg(input + input_index_i)); + } + float2 *out = reinterpret_cast(output + output_index); + *out = *reinterpret_cast(buf); + } + + for(; output_index < nthreads; output_index++) { + Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); + + Index<3> input_tensor_index; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; + + int input_index = TensorIndexToFlat(input_tensor_index, input_dims); + + output[output_index] = + maybe_conj::run(ldg(input + input_index)); + } +} + // Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor, // where dimensions are zero-based: output[i][j][k] = input[i][k][j]. // @@ -1008,10 +1059,42 @@ struct SwapDimension0And2InTensor3 { static_cast(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; GpuLaunchConfig config = GetGpuLaunchConfig(total_size, d); - TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, - config.block_count, config.thread_per_block, 0, - d.stream(), config.virtual_thread_count, in, - input_dims, out)); + + auto out_ptr = reinterpret_cast(out); + bool aligned = out_ptr % 16 == 0; + + bool use_vector = false; + bool use_custom_config = false; + if (input_dims[0] <= 128 && input_dims[2] <= 128 || + input_dims[0] * input_dims[1] <= 128 || + input_dims[1] * input_dims[2] <= 8) { + use_vector = true; + use_custom_config = true; + } else if (input_dims[1] * input_dims[2] <= 16384) { + use_vector = true; + } + + if (sizeof(T) == 2 && aligned && use_vector) { + int block_count; + if (use_custom_config) { + block_count = (total_size + config.thread_per_block - 1) / + config.thread_per_block; + } else { + block_count = config.block_count; + } + + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3SimpleVector, + block_count, + config.thread_per_block / kUnroll, + 0, d.stream(), total_size, + in, input_dims, out)); + } else { + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in, input_dims, out)); + } } }; From 9c36f4b4266a13501ebf131ded0fb5639c29ede7 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 6 May 2020 19:28:16 +0300 Subject: [PATCH 055/557] EM SDP tcf file is removed (to be downloaded with MLI package) + minor fixes in Readmes --- .../micro/examples/micro_speech/README.md | 2 +- .../micro/examples/person_detection/README.md | 4 +- .../person_detection_experimental/README.md | 2 +- .../micro/tools/make/ext_libs/arc_mli.inc | 4 +- .../targets/arc/emsdp/emsdp_em11d_dfss.tcf | 4907 ----------------- .../tools/make/targets/arc_emsdp_makefile.inc | 17 +- .../tools/make/third_party_downloads.inc | 2 +- 7 files changed, 20 insertions(+), 4918 deletions(-) delete mode 100644 tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf diff --git a/tensorflow/lite/micro/examples/micro_speech/README.md b/tensorflow/lite/micro/examples/micro_speech/README.md index ba55a7d8493..3ab8ad24338 100644 --- a/tensorflow/lite/micro/examples/micro_speech/README.md +++ b/tensorflow/lite/micro/examples/micro_speech/README.md @@ -55,7 +55,7 @@ For more detailed information on building and running examples see the appropria 2. Go to the generated example project director - cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make` + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/micro_speech_mock/make 3. Build the example using diff --git a/tensorflow/lite/micro/examples/person_detection/README.md b/tensorflow/lite/micro/examples/person_detection/README.md index ae47c4be0ff..d736d6f7cd5 100644 --- a/tensorflow/lite/micro/examples/person_detection/README.md +++ b/tensorflow/lite/micro/examples/person_detection/README.md @@ -34,7 +34,7 @@ Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro The example project for ARC EM SDP platform can be generated with the following command: - make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_make_project ` + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_make_project ### Build and Run Example @@ -44,7 +44,7 @@ For more detailed information on building and running examples see the appropria 2. Go to the generated example project director - cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make` + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make 3. Build the example using diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/README.md b/tensorflow/lite/micro/examples/person_detection_experimental/README.md index af0186fb276..19a39ddd9a5 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/README.md +++ b/tensorflow/lite/micro/examples/person_detection_experimental/README.md @@ -45,7 +45,7 @@ For more detailed information on building and running examples see the appropria 2. Go to the generated example project director - cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make` + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make 3. Build the example using diff --git a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc index a95b4550417..5dbb91dd368 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc @@ -24,7 +24,7 @@ ifeq ($(filter no_arc_mli,$(ALL_TAGS)),) ALL_TAGS += arc_mli ifeq ($(BUILD_ARC_MLI),true) - MLI_LIB_DIR = arc_mli_$(basename $(TCF_FILE_NAME)) + MLI_LIB_DIR ?= arc_mli_$(basename $(TCF_FILE_NAME)) $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) @@ -36,7 +36,7 @@ ifeq ($(BUILD_ARC_MLI),true) third_party/$(MLI_LIB_DIR)/LICENSE else ifneq ($(ARC_MLI_PRE_COMPILED_TARGET),) - MLI_LIB_DIR = arc_mli_package + MLI_LIB_DIR ?= arc_mli_package $(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),)) MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf deleted file mode 100644 index 833fa9ca9b9..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf +++ /dev/null @@ -1,4907 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - -# -# option 16/L32/U32 Instructions -# ------ ---------- --------------------- -# -# none -/-/- None -# wlh1 1/1/1 MPYW/U, MPY/U, MPYH/U -# wlh2 2/2/2 MPYW/U, MPY/U, MPYH/U -# wlh3 2/3/3 MPYW/U, MPY/U, MPYH/U -# wlh4 2/4/5 MPYW/U, MPY/U, MPYH/U -# wlh5 5/9/9 MPYW/U, MPY/U, MPYH/U -# -# --mpy_option none - -# code_protection --- The ARC EM architecture divides the memory into 16 regions, which can be protected individually. This feature adds a 16-bit input to the processor core, one bit per region. When the protect bit is set, the processor disables any load or store to the corresponding region. An attempt to access a protected region raises an EV_ProtV exception. --code_protection false - -# stack_checking --- Stack checking is a mechanism for checking stack accesses and raising an exception when a stack overflow or underflow is detected. --stack_checking true - -# unaligned_option --- This enables unaligned loads and stores. --unaligned_option true - -# intvbase_preset --- This sets the interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE. --intvbase_preset 0x0 - -# intvbase_preset_s --- This sets the secure interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE_S.This is effective only when 2+2 mode is enabled. --intvbase_preset_s 0x0 - -# intvbase_ext --- Set this option to drive the upper 22 bits of the interrupt base vector externally, into signal intvbase_in. --intvbase_ext false - -# nmi_option --- add Non-maskable external exception support --nmi_option false - -# rgf_impl --- This defines whether the register file is implemented using flip-flops, or with a hard macro. --rgf_impl flip_flops - -# rgf_num_regs --- This defines the size (in 32b register) of the processor register file. --rgf_num_regs 32 - -# rgf_wr_ports --- This defines the number of write ports on the register file. --rgf_wr_ports 2 - -# rgf_num_banks --- Dual register banks are useful if Fast IRQ has been configured, but may be selected even if not. --rgf_num_banks 2 - -# rgf_banked_regs --- This selects the number of registers that are replicated in the second register-file bank. --rgf_banked_regs 32 - -# turbo_boost --- This enables the Turbo Boost synthesis option. By enabling this option, the achievable clock frequency is increased, but at the cost of an additional cycle latency on branch instructions. --turbo_boost false - -# infer_alu_adder --- infer: datapath is described as behavioral code: A + B -# instantiate: datapath is instantiated as a detailed multi-stage code of a carry-lookahead adder. It is generally preferable to use the infer option and add directives for your target synthesizer. --infer_alu_adder infer - -# infer_mpy_wtree --- infer: datapath is described as behavioral code: A * B (applies to only wlh3, wlh4 and wlh5 designs) -# instantiate: datapath is instantiated as a detailed multi-stage code of a Wallace Tree multiplier It is generally preferable to use the infer option and add directives for your target synthesizer. --infer_mpy_wtree instantiate - -# scantest_ram_bypass_mux --- This mux is used to make logic trapped between flops and memory (aka shadow logic) to be covered by scantest without requiring advanced sequential ATPG on the memory to be applied. Will add delay to functional access time --scantest_ram_bypass_mux false - -# logic_bist --- This option will OR LBIST_EN with test_mode --logic_bist false - -# power_domains --- Adds three separate power domains to the core, and propagates power-gate control signals to the top level of the core. Also generates UPF constraints and commands in the low-power scripts --power_domains false - -# dvfs --- Adds logic to the core to allow dynamic controlling of voltage and frequency and propagates the associated control signals to the top level of core --dvfs false - -# voltage_domains --- Creates a voltage domain split between RAM and std cell parts to support Ultra Low Voltage on cells and generates UPF constraints --voltage_domains false - -# mem_bus_option --- The core supports two bus protocols for accessing external memory: AHB & AHB-Lite. AHB-Lite-single means instruction fetch and data access share a single AHB-Lite port. AHB-Lite-dual means separate AHB-Lite port for each initiator if present. --mem_bus_option AHB - -# mem_bus_reg_interface --- Specifies whether the memory bus interface is registered. --mem_bus_reg_interface true - -# dmi_burst_option --- This will enable high-throughput burst support on the DMI slave interfaces. By enabling this option, the peak DMI read throughput goes from 1 word per 3 cycles to N words per N+2 cycles, in which N is the AHB burst lengthDMI write throughput goes from 1 word per 3 cycles to 1 word per cycle. --dmi_burst_option true - -# has_dmp_peripheral --- This option enables the redirection of load/store accesses to one segment (1/16) of the addressable space to a dedicated peripheral bus. This offers high system integration and reduces overall system cost. --has_dmp_peripheral true - -# per0_base --- This option specifies the memory region assignment for this peripheral aperture --per0_base 15 - -# per0_limit --- This option specifies the end of this peripheral aperture --per0_limit 0 - -# per_bus_option --- The core supports one bus protocol for accessing the peripheral space, when enabled: AHB-Lite. --per_bus_option AHB-Lite - -# per_bus_reg_interface --- Specifies whether the peripheral bus interface is registered. --per_bus_reg_interface true - -# clock_gating --- This enables the insertion of architectural clock gate elements in the design. By enabling this option, the clocks to various parts of the design will be disabled when the logic they drive is not in use to save power. --clock_gating false - -# back_compat --- This enables the addition of rst_a input in the clkgate module to support backward compatibility with the older EM and Subsystem releases. --back_compat true - -# byte_parity --- If parity protection on the CCMs or Cache is configured, this option enables parity protection on a per-byte basis. Otherwise, parity is per word basis --byte_parity false - -# prot_pipelined --- Check the box if CCM memories are configured for ECC, and you want single-bit errors to be corrected, written back to memory, and re-fetched. When unchecked, single bit errors are corrected when read from memory, but the offending memory location itself is not corrected with a writeback, no influence on Cache protection --prot_pipelined false - -# cct_test_ena --- When ECC is configured, this option enables single bit error injection in CCT RAM models to demonstrate ECC protection on the RAMs. When enabled, the RAM models can only be used in HDL CCT simulation (no xCAM support) and are not intended for use in SoC level integration. --cct_test_ena false - -# err_prot_ehce --- Enabled enhanced ECC architecture for CCM. Instruction fetch with single bit error is not replayed; ecc cac modules are shared to reduce area and timing opt. --err_prot_ehce false - - -######## dsp_trig --- com.arc.hardware.dfss.dsp_trig.1_0 ######## - -# Create dsp_trig --create com.arc.hardware.dfss.dsp_trig.1_0 System.CPUisle.ARCv2EM.dsp_trig - -# dsp_trig --- Command line option for EIA extension component 'dsp_trig'. --dsp_trig true - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio0 --- com.arc.hardware.dfss.io_gpio0.1_0 ######## - -# Create io_gpio0 --create com.arc.hardware.dfss.io_gpio0.1_0 System.CPUisle.ARCv2EM.io_gpio0 - -# io_gpio0 --- Command line option for EIA extension component 'io_gpio0'. --io_gpio0 true - -# io_gpio0_debounce --- Selects the inclusion of Debounce logic --io_gpio0_debounce 1 - -# io_gpio0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio0_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - -# io_gpio0_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. --io_gpio0_direction_rst_value 0 - -# io_gpio0_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. --io_gpio0_output_rst_value 0x0 - - -######## io_i2c_mst0 --- com.arc.hardware.dfss.io_i2c_mst0.1_0 ######## - -# Create io_i2c_mst0 --create com.arc.hardware.dfss.io_i2c_mst0.1_0 System.CPUisle.ARCv2EM.io_i2c_mst0 - -# io_i2c_mst0 --- Command line option for APEX extension component 'io_i2c_mst0'. --io_i2c_mst0 true - -# io_i2c_mst0_fs --- RX/TX FIFO size --io_i2c_mst0_fs 16 - -# io_i2c_mst0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst0_dma_support None - -# io_i2c_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst0_cdc_included 0 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_slv0 --- com.arc.hardware.dfss.io_i2c_slv0.1_0 ######## - -# Create io_i2c_slv0 --create com.arc.hardware.dfss.io_i2c_slv0.1_0 System.CPUisle.ARCv2EM.io_i2c_slv0 - -# io_i2c_slv0 --- Command line option for APEX extension component 'io_i2c_slv0'. --io_i2c_slv0 true - -# io_i2c_slv0_fs --- RX/TX FIFO size --io_i2c_slv0_fs 16 - -# io_i2c_slv0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_slv0_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst0 --- com.arc.hardware.dfss.io_spi_mst0.1_0 ######## - -# Create io_spi_mst0 --create com.arc.hardware.dfss.io_spi_mst0.1_0 System.CPUisle.ARCv2EM.io_spi_mst0 - -# io_spi_mst0 --- Command line option for APEX extension component 'io_spi_mst0'. --io_spi_mst0 true - -# io_spi_mst0_fz --- RX/TX FIFO depth --io_spi_mst0_fs 16 - -# io_spi_mst0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst0_max_xfer_size 16 - -# io_spi_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst0_cdc_included 0 - -# io_spi_mst0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## subsys_bcr --- com.arc.hardware.dfss.subsys_bcr.1_0 ######## - -# Create subsys_bcr --create com.arc.hardware.dfss.subsys_bcr.1_0 System.CPUisle.ARCv2EM.subsys_bcr - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst1 --- com.arc.hardware.dfss.io_spi_mst1.1_0 ######## - -# Create io_spi_mst1 --create com.arc.hardware.dfss.io_spi_mst1.1_0 System.CPUisle.ARCv2EM.io_spi_mst1 - -# io_spi_mst1 --- Command line option for APEX extension component 'io_spi_mst1'. --io_spi_mst1 true - -# io_spi_mst1_fz --- RX/TX FIFO depth --io_spi_mst1_fs 16 - -# io_spi_mst1_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst1_max_xfer_size 16 - -# io_spi_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst1_cdc_included 0 - -# io_spi_mst1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst1_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst2 --- com.arc.hardware.dfss.io_spi_mst2.1_0 ######## - -# Create io_spi_mst2 --create com.arc.hardware.dfss.io_spi_mst2.1_0 System.CPUisle.ARCv2EM.io_spi_mst2 - -# io_spi_mst2 --- Command line option for APEX extension component 'io_spi_mst2'. --io_spi_mst2 true - -# io_spi_mst2_fz --- RX/TX FIFO depth --io_spi_mst2_fs 16 - -# io_spi_mst2_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst2_max_xfer_size 16 - -# io_spi_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst2_cdc_included 0 - -# io_spi_mst2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst2_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_slv0 --- com.arc.hardware.dfss.io_spi_slv0.1_0 ######## - -# Create io_spi_slv0 --create com.arc.hardware.dfss.io_spi_slv0.1_0 System.CPUisle.ARCv2EM.io_spi_slv0 - -# io_spi_slv0 --- Command line option for APEX extension component 'io_spi_slv0'. --io_spi_slv0 true - -# io_spi_slv0_fz --- RX/TX FIFO depth --io_spi_slv0_fs 16 - -# io_spi_slv0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_slv0_max_xfer_size 16 - -# io_spi_slv0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_slv0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio1 --- com.arc.hardware.dfss.io_gpio1.1_0 ######## - -# Create io_gpio1 --create com.arc.hardware.dfss.io_gpio1.1_0 System.CPUisle.ARCv2EM.io_gpio1 - -# io_gpio1 --- Command line option for EIA extension component 'io_gpio1'. --io_gpio1 true - -# io_gpio1_debounce --- Selects the inclusion of Debounce logic --io_gpio1_debounce 1 - -# io_gpio1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio1_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - -# io_gpio1_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. --io_gpio1_direction_rst_value 0 - -# io_gpio1_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. --io_gpio1_output_rst_value 0x0 - - -######## io_gpio2 --- com.arc.hardware.dfss.io_gpio2.1_0 ######## - -# Create io_gpio2 --create com.arc.hardware.dfss.io_gpio2.1_0 System.CPUisle.ARCv2EM.io_gpio2 - -# io_gpio2 --- Command line option for EIA extension component 'io_gpio2'. --io_gpio2 true - -# io_gpio2_debounce --- Selects the inclusion of Debounce logic --io_gpio2_debounce 1 - -# io_gpio2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio2_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - -# io_gpio2_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. --io_gpio2_direction_rst_value 0 - -# io_gpio2_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. --io_gpio2_output_rst_value 0x0 - - -######## io_i2c_mst1 --- com.arc.hardware.dfss.io_i2c_mst1.1_0 ######## - -# Create io_i2c_mst1 --create com.arc.hardware.dfss.io_i2c_mst1.1_0 System.CPUisle.ARCv2EM.io_i2c_mst1 - -# io_i2c_mst1 --- Command line option for APEX extension component 'io_i2c_mst1'. --io_i2c_mst1 true - -# io_i2c_mst1_fs --- RX/TX FIFO size --io_i2c_mst1_fs 16 - -# io_i2c_mst1_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst1_dma_support None - -# io_i2c_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst1_cdc_included 0 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_mst2 --- com.arc.hardware.dfss.io_i2c_mst2.1_0 ######## - -# Create io_i2c_mst2 --create com.arc.hardware.dfss.io_i2c_mst2.1_0 System.CPUisle.ARCv2EM.io_i2c_mst2 - -# io_i2c_mst2 --- Command line option for APEX extension component 'io_i2c_mst2'. --io_i2c_mst2 true - -# io_i2c_mst2_fs --- RX/TX FIFO size --io_i2c_mst2_fs 16 - -# io_i2c_mst2_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst2_dma_support None - -# io_i2c_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst2_cdc_included 0 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart0 --- com.arc.hardware.dfss.io_uart0.1_0 ######## - -# Create io_uart0 --create com.arc.hardware.dfss.io_uart0.1_0 System.CPUisle.ARCv2EM.io_uart0 - -# io_uart0 --- Command line option for EIA extension component 'io_uart0'. --io_uart0 true - -# io_uart0_fifo_mode --- Set the UART FIFO mode --io_uart0_fifo_mode 16 - -# io_uart0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart0_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart1 --- com.arc.hardware.dfss.io_uart1.1_0 ######## - -# Create io_uart1 --create com.arc.hardware.dfss.io_uart1.1_0 System.CPUisle.ARCv2EM.io_uart1 - -# io_uart1 --- Command line option for EIA extension component 'io_uart1'. --io_uart1 true - -# io_uart1_fifo_mode --- Set the UART FIFO mode --io_uart1_fifo_mode 16 - -# io_uart1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart1_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart2 --- com.arc.hardware.dfss.io_uart2.1_0 ######## - -# Create io_uart2 --create com.arc.hardware.dfss.io_uart2.1_0 System.CPUisle.ARCv2EM.io_uart2 - -# io_uart2 --- Command line option for EIA extension component 'io_uart2'. --io_uart2 true - -# io_uart2_fifo_mode --- Set the UART FIFO mode --io_uart2_fifo_mode 16 - -# io_uart2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart2_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart3 --- com.arc.hardware.dfss.io_uart3.1_0 ######## - -# Create io_uart3 --create com.arc.hardware.dfss.io_uart3.1_0 System.CPUisle.ARCv2EM.io_uart3 - -# io_uart3 --- Command line option for EIA extension component 'io_uart3'. --io_uart3 true - -# io_uart3_fifo_mode --- Set the UART FIFO mode --io_uart3_fifo_mode 16 - -# io_uart3_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart3_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2s_rx_mst0 --- com.arc.hardware.dfss.io_i2s_rx_mst0.1_0 ######## - -# Create io_i2s_rx_mst0 --create com.arc.hardware.dfss.io_i2s_rx_mst0.1_0 System.CPUisle.ARCv2EM.io_i2s_rx_mst0 - -# io_i2s_rx_mst0 --- Command line option for APEX extension component 'io_i2s_rx_mst0'. --io_i2s_rx_mst0 true - -# io_i2s_rx_mst0_fs --- RX FIFO size --io_i2s_rx_mst0_fs 8 - -# io_i2s_rx_mst0_fw --- RX FIFO width --io_i2s_rx_mst0_fw 16 - -# io_i2s_rx_mst0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2s_rx_mst0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2s_tx_mst0 --- com.arc.hardware.dfss.io_i2s_tx_mst0.1_0 ######## - -# Create io_i2s_tx_mst0 --create com.arc.hardware.dfss.io_i2s_tx_mst0.1_0 System.CPUisle.ARCv2EM.io_i2s_tx_mst0 - -# io_i2s_tx_mst0 --- Command line option for APEX extension component 'io_i2s_tx_mst0'. --io_i2s_tx_mst0 true - -# io_i2s_tx_mst0_fs --- TX FIFO size --io_i2s_tx_mst0_fs 8 - -# io_i2s_tx_mst0_fw --- TX FIFO width --io_i2s_tx_mst0_fw 16 - -# io_i2s_tx_mst0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2s_tx_mst0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_pdm_rx0 --- com.arc.hardware.dfss.io_pdm_rx0.1_0 ######## - -# Create io_pdm_rx0 --create com.arc.hardware.dfss.io_pdm_rx0.1_0 System.CPUisle.ARCv2EM.io_pdm_rx0 - -# io_pdm_rx0 --- Command line option for APEX extension component 'io_pdm_rx0'. --io_pdm_rx0 true - -# io_pdm_rx0_ch --- Number of Stereo Channels --io_pdm_rx0_ch 1 - -# io_pdm_rx0_fs --- RX FIFO size --io_pdm_rx0_fs 16 - -# io_pdm_rx0_ns --- Maximum number of CIC stages --io_pdm_rx0_ns 4 - -# io_pdm_rx0_ds --- Maximum delay in the COMB filter of the CIC filter --io_pdm_rx0_ds 2 - -# io_pdm_rx0_dma_support --- Specifies whether the DMA handshake interface is included --io_pdm_rx0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## DCCM --- com.arc.hardware.DCCM.1_0 ######## - -# Create DCCM --create com.arc.hardware.DCCM.1_0 System.CPUisle.ARCv2EM.DCCM - -# dccm_size --- This defines the size of the Data Closely Coupled Memory (DCCM) in bytes --dccm_size 131072 - -# dccm_base --- Sets the initial memory region assignment for DCCM --dccm_base 8 - -# dccm_interleave --- Split DCCM into even/odd memory banks. --dccm_interleave false - -# dccm_prot --- Specifies the type of protection built for the DCCM. --dccm_prot None - -# dccm_prot_level --- Specifies the level protection. --dccm_prot_level Data_Only - -# dccm_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the DCCM --dccm_prot_exceptions true - -# dccm_sec_lvl --- Specifies the level of secure DCCM. --dccm_sec_lvl Non_Secure - -# dccm_dmi --- This enables external access through a DMI (direct memory interface) port. --dccm_dmi true - - -######## DMA Controller --- com.arc.hardware.DMA_Controller.1_0 ######## - -# Create DMA Controller --create com.arc.hardware.DMA_Controller.1_0 "System.CPUisle.ARCv2EM.DMA Controller" - -# dmac_channels --- This options specifies the number of DMA channels implemented in the DMA controller --dmac_channels 16 - -# dmac_fifo_depth --- This option specifies the DMA transfer FIFO depth in 32b words. --dmac_fifo_depth 2 - -# dmac_int_config --- None: the DMA controller cannot raise an interrupt -# Single-External: single done and single error interrupt signal for all DMA channels, and the interrupt signals are routed to a port at the top of the EM logical hierarchy -# Multiple-External: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are routed to ports at the top of the EM logical hierarchy -# Single-Internal: single done and single error interrupt signals for all DMA channels, and the interrupt signals are internal to the EM core -# Multiple-Internal: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are internal to the EM core --dmac_int_config Multiple-Internal - -# dmac_separate_error_interrupts --- This specifies whether there is a separate error interrupt per DMA channel, or just one. --dmac_separate_error_interrupts false - -# dmac_registers --- This option defines the number of DMA channels with their registers located in auxiliary space. --dmac_registers 0 - -# dmac_mem_if --- This option specifies whether the DMA controller system memory interface is integrated into the existing EM system memory interfaces or has its own interface. --dmac_mem_if integrated - -# dmac_per_if --- Internal vs DW peripheral interface. Specify (in hex) which channels have the DW interface, where bit 0 corresponds to DMA channel 0, bit 1 for DMA channel 1, etc. -# Example: 4 channel DMA controller where -dmac_per_if is set to 0x9 = DMA Channels 0 and 3 configured with the DW req interface, DMA Channels 1 and 2 configured with the internal req interface. --dmac_per_if 0x7e00 - - -######## DSP --- com.arc.hardware.DSP.1_0 ######## - -# Create DSP --create com.arc.hardware.DSP.1_0 System.CPUisle.ARCv2EM.DSP - -# dsp_complex --- Enable/disable support for single cycle 16b+16b complex instructions and butterfly operations, else 2-cycle complex instructions only without butterfly support --dsp_complex true - -# dsp_itu --- Enable/disable support for ITU bit-accurate 1 bit fractional shift before accumulation, else 1-bit fractional shift result after accumulation only --dsp_itu true - -# dsp_divsqrt --- Enable/disable support for divide and square root operations: DIV(U), REM(U), SQRT --dsp_divsqrt radix2 - -# dsp_accshift --- Select support for accumulator shift operations: no supported, limited shift support only or full shift support and convergent rounding --dsp_accshift full - -# dsp_impl --- The datapath components may be inferred from Verilog for better area or optimized using carry-save components for better timing --dsp_impl optimized - - -######## Data Cache --- com.arc.hardware.Data_Cache.1_0 ######## - -# Create Data Cache --create com.arc.hardware.Data_Cache.1_0 "System.CPUisle.ARCv2EM.Data Cache" - -# dc_size --- This defines the total size of the Data Cache in bytes. --dc_size 16384 - -# dc_ways --- This defines the number of cache ways. --dc_ways 2 - -# dc_bsize --- This defines the cache line length in bytes. --dc_bsize 32 - -# dc_feature_level --- Feature Level, indicates locking and debug feature level 00 = Basic cache, with no locking or debug features 01 = Lock and flush features supported 10 = Lock, flush and advanced debug features supported 11 = Reserved --dc_feature_level 2 - -# dc_uncached_region --- Enable an uncached region defined by aux reg --dc_uncached_region false - -# dc_prot --- Specifies the type of protection built for DCACHE. --dc_prot None - -# dc_prot_level --- Specifies the level of protection. --dc_prot_level Data_Only - -# dc_prot_exceptions --- Builds exception generation hardware for uncorrectable (fatal) errors detected on DCACHE. --dc_prot_exceptions true - - -######## Debug Interface --- com.arc.hardware.Debug_Interface.1_0 ######## - -# Create Debug Interface --create com.arc.hardware.Debug_Interface.1_0 "System.CPUisle.ARCv2EM.Debug Interface" - -# dbg_en_option --- Adds an enable pin to the existing debug interface --dbg_en_option false - -# secure_debug --- This enables secure debug feature --secure_debug false - -# scdbg_aux_unlk --- An internal demo module will be included when enable --scdbg_aux_unlk false - -# dbg_apb_option --- Adds an additional APB debug port alongside the BVCI one --dbg_apb_option false - - -######## ICCM0 --- com.arc.hardware.ICCM0.1_0 ######## - -# Create ICCM0 --create com.arc.hardware.ICCM0.1_0 System.CPUisle.ARCv2EM.ICCM0 - -# iccm0_size --- This defines the size of ICCM0 in bytes.This ICCM has 0 wait states. --iccm0_size 131072 - -# iccm0_base --- Sets the initial memory region assignment for ICCM0 --iccm0_base 6 - -# iccm0_wide --- Creates ICCM0 as 64b memory to reduce accesses. --iccm0_wide false - -# iccm0_prot --- Specifies the type of protection built for ICCM0. --iccm0_prot None - -# iccm0_prot_level --- Specifies the level of protection. --iccm0_prot_level Data_Only - -# iccm0_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the ICCM0 --iccm0_prot_exceptions true - -# iccm0_sec_lvl --- Specifies the level of secure ICCM0. --iccm0_sec_lvl Non_Secure - -# iccm0_dmi --- This enables external access through a DMI (direct memory interface) port. --iccm0_dmi true - - -######## Instruction Cache --- com.arc.hardware.Instruction_Cache.1_0 ######## - -# Create Instruction Cache --create com.arc.hardware.Instruction_Cache.1_0 "System.CPUisle.ARCv2EM.Instruction Cache" - -# ic_size --- This defines the total size of the instruction cache in bytes. --ic_size 16384 - -# ic_ways --- This defines the number of cache ways --ic_ways 2 - -# ic_bsize --- This defines the cache line length in bytes. --ic_bsize 64 - -# ic_disable_on_reset --- The instruction cache may be enabled immediately after reset, depending on this option. If this option is enabled, the last cache operation is set to failed, and the direct cache-RAM access is enabled. Furthermore, the instruction cache is invalidated all cache lines are invalidated and unlocked, and the tag RAM is cleared. --ic_disable_on_reset false - -# ic_feature_level --- This defines the feature level of the cache. --ic_feature_level 1 - -# ic_pwr_opt_level --- This selects power-optimization options in the micro-architecture of the instruction cache. --ic_pwr_opt_level 0 - -# ic_prot --- Specifies the type of protection built for ICACHE. --ic_prot None - -# ic_prot_level --- Specifies the level of protection. --ic_prot_level Data_Only - -# ic_prot_exceptions --- Builds exception generation hardware for uncorrectable (fatal) errors detected on ICACHE. --ic_prot_exceptions true - - -######## Interrupt Controller --- com.arc.hardware.Interrupt_Controller.1_0 ######## - -# Create Interrupt Controller --create com.arc.hardware.Interrupt_Controller.1_0 "System.CPUisle.ARCv2EM.Interrupt Controller" - -# number_of_interrupts --- This is the total number of interrupts available to the core. Some interrupts are allocated statically to a specific interrupt line (for example, timer interrupts). For more information on Interrupt and register-file options, see DesignWare ARCv2 ISA Programmers Reference Manual. --number_of_interrupts 96 - -# number_of_levels --- Priority levels in the interrupt controller. --number_of_levels 4 - -# external_interrupts --- This is the total number of interrupt pins available for external system components. This parameter must be less than the total number of interrupts. --external_interrupts 77 - -# firq_option --- This enables the fast-interrupts option, (priority level 0 interrupts), which uses an alternate register bank (if configured) instead of saving the context to memory. --firq_option true - - -######## JTAG Interface --- com.arc.hardware.JTAG_Interface.1_0 ######## - -# Create JTAG Interface --create com.arc.hardware.JTAG_Interface.1_0 "System.CPUisle.ARCv2EM.JTAG Interface" - -######## Timer 0 --- com.arc.hardware.Timer_0.1_0 ######## - -# Create Timer 0 --create com.arc.hardware.Timer_0.1_0 "System.CPUisle.ARCv2EM.Timer 0" - -# timer_0_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 0. --timer_0_int_level 1 - - -######## Watchdog Timer --- com.arc.hardware.Watchdog_Timer.1_0 ######## - -# Create Watchdog Timer --create com.arc.hardware.Watchdog_Timer.1_0 "System.CPUisle.ARCv2EM.Watchdog Timer" - -# watchdog_size --- Specifies the bit width of timer's internal counter. --watchdog_size 32 - -# watchdog_clk --- Specifies whether the timer should be driven from a separate clock. --watchdog_clk false - - -######## Real-time Counter --- com.arc.hardware.Real_time_Counter.1_0 ######## - -# Create Real-time Counter --create com.arc.hardware.Real_time_Counter.1_0 "System.CPUisle.ARCv2EM.Real-time Counter" - -######## Performance Monitor --- com.arc.hardware.Performance_Monitor.1_0 ######## - -# Create Performance Monitor --create com.arc.hardware.Performance_Monitor.1_0 "System.CPUisle.ARCv2EM.Performance Monitor" - -# pct_counters --- Number of counters for performance monitoring. --pct_counters 8 - - -######## SmaRT --- com.arc.hardware.SmaRT.1_0 ######## - -# Create SmaRT --create com.arc.hardware.SmaRT.1_0 System.CPUisle.ARCv2EM.SmaRT - -# smart_stack_entries --- This specifies the number of entries in the trace buffer. --smart_stack_entries 8 - -# smart_implementation --- Flip-flop = FF-based design. Memory = memory-based design (provides better density for larger trace buffers). --smart_implementation flip-flop - - -######## XY --- com.arc.hardware.XY.1_0 ######## - -# Create XY --create com.arc.hardware.XY.1_0 System.CPUisle.ARCv2EM.XY - -# xy_config --- XY memory configuration: -# One memory: DCCM only. -# Two memories: DCCM + Y. -# Three memories: DCCM + X + Y. --xy_config dccm_x_y - -# xy_size --- Size of X and Y memories if included. -# X and Y memories both have the same configured size. --xy_size 16384 - -# xy_interleave --- Split XY memories into odd/even instances to enable single cycle unaligned access. --xy_interleave false - -# xy_x_base --- Base region for X memory. All accesses to this region will initiate a transfer on the X memory. --xy_x_base 9 - -# xy_y_base --- Base region for Y memory. All accesses to this region will initiate a transfer on the Y memory. --xy_y_base 10 - - -######## AGU --- com.arc.hardware.AGU.1_0 ######## - -# Create AGU --create com.arc.hardware.AGU.1_0 System.CPUisle.ARCv2EM.AGU - -# agu_size --- Predefined configurations of modifiers, address -# pointers and offset registers -#

-# 
-#         address     address                     
-#         pointers    offset regs      modifiers  
-#        ----------- --------------- ------------ 
-# small:     4           2                 4      
-# medium:    8           4                 12     
-# large:     12          8                 24     
-# 
-# --agu_size large - -# agu_accord --- Enable the accordion stage if operating frequency is critical --agu_accord true - -# agu_wb_depth --- Write buffer depth --agu_wb_depth 4 - - -######## Actionpoints --- com.arc.hardware.Actionpoints.1_0 ######## - -# Create Actionpoints --create com.arc.hardware.Actionpoints.1_0 System.CPUisle.ARCv2EM.Actionpoints - -# num_actionpoints --- This is the number of trigger events available. --num_actionpoints 8 - -# aps_feature --- Selects Actionpoint feature set --aps_feature min - - -######## Bit stream --- com.arc.hardware.Bit_stream.1_0 ######## - -# Create Bit stream --create com.arc.hardware.Bit_stream.1_0 "System.CPUisle.ARCv2EM.Bit stream" - -######## Floating-point unit --- com.arc.hardware.Floating_point_unit.1_0 ######## - -# Create Floating-point unit --create com.arc.hardware.Floating_point_unit.1_0 "System.CPUisle.ARCv2EM.Floating-point unit" - -# fpu_dp_assist --- This enables double-precision acceleration instructions. --fpu_dp_assist true - -# fpu_fma_option --- This enables the fused multiply-add & multiply-subtract instructions. --fpu_fma_option true - -# fpu_mas_cycles --- Make mul/add/sub multicycle to achieve a higher clock speed. --fpu_mas_cycles 2 - -# fpu_pipe_impl --- FPU pipelined implementation --fpu_pipe_impl true - -# fpu_div_option --- This enables divide & square-root acceleration --fpu_div_option true - -# fpu_div_cycles --- Controls div/sqrt implementation. --fpu_div_cycles 17 - - -######## Memory Protection Unit --- com.arc.hardware.Memory_Protection_Unit.1_0 ######## - -# Create Memory Protection Unit --create com.arc.hardware.Memory_Protection_Unit.1_0 "System.CPUisle.ARCv2EM.Memory Protection Unit" - -# mpu_num_regions --- Number of configured memory regions. --mpu_num_regions 16 - -# mpu_32b --- Set the minimal region size to be 32 byte instead of 2KB. --mpu_32b false - -# mpu_sid_option --- It will enable SID support in Secure Shield --mpu_sid_option false - - -######## Real-time trace producer --- com.arc.hardware.Real_time_trace_producer.1_0 ######## - -# Create Real-time trace producer --create com.arc.hardware.Real_time_trace_producer.1_0 "System.CPUisle.ARCv2EM.Real-time trace producer" - -# rtt_feature_level --- 'small' means that program trace only is available. `medium' adds data trace. `full' adds core and aux register trace. --rtt_feature_level full - - -######## ARCv2EM CCT --- cct.1_0 ######## - -# Create ARCv2EM CCT --create cct.1_0 "System.ARCv2EM CCT" - -# cct --- -# Option used to add a CCT to the design for command-line builds -# Without this architect can't add this component to a build -# via a cmdline -create command. -# with old scripts. -# --cct true - -# no_hostlink --- -# This prevents the inclusion of the hostlink library when compiling -# C or C++ programs. The resultant executable, if it contains printfs, -# will print to an internal fixed buffer __mwwrite_buf. -# Other hostlink operations that require debugger assistance, such as file -# opens, will fail. -# -# Hostlink references incur memory cycles at unpredictable times and -# so can perturb cycle-timing results. Without hostlink, -# the debugger will not in any way interfere with the target while it is running. -# Therefore this option is useful for simulation in which you want precisely the -# same cycle timing to occur each time you run, or for accurate power consumption results. -# --cct_no_hostlink false - -# has_subsystem_cct_flow --- -# The above option will check for the presence of subsystem component in the build configuration and suitably modifies the Makefile for the sub-system environment. -# --has_subsystem_cct_flow false - - -######## BusFabric --- com.arc.hardware.ARCv2MSS.BusFabric.1_0 ######## - -# Create BusFabric --create com.arc.hardware.ARCv2MSS.BusFabric.1_0 System.BusFabric - -######## ClkCtrl --- com.arc.hardware.ARCv2MSS.ClkCtrl.1_0 ######## - -# Create ClkCtrl --create com.arc.hardware.ARCv2MSS.ClkCtrl.1_0 System.ClkCtrl - -######## DSP Software --- com.arc.software.dfss.sw_dsp.1_0 ######## - -# Create DSP Software --create com.arc.software.dfss.sw_dsp.1_0 "System.DSP Software" - -# sw_dsp --- Command line option for Software element 'DSP Software' --sw_dsp true - - -######## EMSDP_BOARD --- com.arc.hardware.ARCv2MSS.EMSDP_BOARD.1_0 ######## - -# Create EMSDP_BOARD --create com.arc.hardware.ARCv2MSS.EMSDP_BOARD.1_0 System.EMSDP_BOARD - -# emsdp_sys_freq --- Select the core frequency. --emsdp_sys_freq 40 - - -######## IO Software --- com.arc.software.dfss.sw_io.1_0 ######## - -# Create IO Software --create com.arc.software.dfss.sw_io.1_0 "System.IO Software" - -# sw_io --- Command line option for Software element 'IO Software' --sw_io true - - -######## Implementation --- com.arc.hardware.implementation.1_0 ######## - -# Create Implementation --create com.arc.hardware.implementation.1_0 System.Implementation - -# ClockSpeed --- Target clock speed of the system --clock_speed 10 - -# DDR2_clk_Ratio --- DDR2 Clock Vs System Clock Ratio -# 2x -# 3x -# 4x --ddr2_clk_ratio 3x - -# ClockSkew --- The clock skew for the system --clock_skew 0.2 - -# HoldMargin --- Margin for hold time checks --hold_margin 0.05 - -# Floorplan --- Floorplan definition for relative placement of RAMs (at CPU-level) or the placement of the rams and CPU hard cores (at multicore level) --floorplan em4_sensor - -# JTAGFrequency --- Select the frequency of the JTAG clock Tck (in MHz). -# -# The JTAG clock speed has to be less than 1/2 of the cpu clock otherwise the signals on the BVCI interface are not guaranteed to be valid. -# -# NOTE: The RTL simulations will work when the JTAG clock frequency is set to half the CPU clock, however this may not be the case when simulating at gate level due to delays on the IO pads. -# -# The default is set to 10 MHz so that there is no conflict when simulating with an ARCangel3 at 30MHz. (30 > 10*2) -# -# The speed of simulation can be greatly increased by using a faster JTAG clock, but a dependency will warn if it exceeds 1/2 of the cpu clock. -# --jtag_tclk 4 - -# execution_trace_level --- -# This traces committed instructions as they execute, and gathers statistics -# visible in the debugger for counting instructions & cycle delays. -# At the "stats" level ony the statistics are gathered and no trace is printed. -# "file" is equivalent to "full", but the results go to a trace .txt file instead. -# --execution_trace_level stats - -# tb_trace --- -# Enable instruction execution trace. -# This is available to arc_dev licensees (internal developers) only. -# --tb_trace false - -# zero_based_arcnum --- -# In a multicore build, number ARCs from 0. -# If this is not selected, arcs are numbered from 1. -# (This provides the initial value to the arcnum signal.) -# --zero_based_arcnum true - -# generate_ipxact --- -# Generate ipxact.xml file describing the CPUisle or archipelago frontier -# --generate_ipxact false - -# ipxact_relative_path_names --- -# Use relative path names for Verilog files in the ipxact. -# Otherwise, absolute path names are used. -# --ipxact_relative_path_names true - -# optional_encryption --- -# When selected, encrypted RTL output is generated. -# --optional_encryption false - -# ignore_encrypt_license --- -# When selected, pretend the encryption license is missing. For testing. -# --ignore_encrypt_license false - -# ignore_clear_license --- -# When selected, pretend the cleartest license is missing. For testing. -# --ignore_clear_license false - -# OPTION_require_archipelago --- -# When selected, force use of archipelago. This is for testing purposes. -# --require_archipelago false - - -######## Infrastructure Software --- com.arc.software.dfss.sw_infra.1_0 ######## - -# Create Infrastructure Software --create com.arc.software.dfss.sw_infra.1_0 "System.Infrastructure Software" - -# sw_infra --- Command line option for Software element 'Infrastructure Software' --sw_infra true - -# templateName --- Template name --template_name siss_combo_sensor_dsp - - -######## subsys_infra --- com.arc.hardware.dfss.subsys_infra.1_0 ######## - -# Create subsys_infra --create com.arc.hardware.dfss.subsys_infra.1_0 System.subsys_infra - -# subsys_infra --- Command line option for EIA glue logic. --subsys_infra true - -# internal_interrupt --- Connect the IO interrupts internally --internal_interrupt true - -# internal_dma_handshake --- Connect the DMA handshake signals internally --internal_dma_handshake true - -# spi_tb_sw_test_mode --- -# This is a secret option, not seen by customers. -# If you check this, the SPI peripheral's testbenches will be set to SW test mode: -# The serial interface of the first SPI master io_spi_mstN peripheral is connected to all SPI slave peripherals io_spi_slvN. -# This is used for testing the SW drivers. -# --spi_tb_sw_test_mode false - -# i3c_tb_sw_test_mode --- -# This is a secret option, not seen by customers. -# If you check this, the I3C peripheral's testbenches will be set to SW test mode: -# The serial interface of the io_i3cN peripheral is connected to the I2C slave peripherals io_i2c_slv0. -# This is used for testing the SW drivers. -# --i3c_tb_sw_test_mode false - -# subsys_apex_offset --- Subsystem APEX address offset in the AUX address space. The aperture used by the subsystem is fixed to 0x0010_0000. In general, the APEX address offset must be in the range from 0x0010_0000 to 0xFFF0_0000. However, if your design includes the "UAUX Interface" component, then the APEX address offset must be in the range from 0x0010_0000 to 0x7FF0_0000 to avoid address conflicts with any UAUX components. --subsys_apex_offset 0x8000_0000 - -# subsys_uaux_offset --- Subsystem UAUX address offset in the UAUX address space. The UAUX address offset must be an integer multiple of 0x0010_0000 in the range from 0x0000_0000 to 0x7FF0_0000. The aperture reserved for the subsystem is fixed to 0x0010_0000. --subsys_uaux_offset 0x10_0000 - - -######## ARC_RTT --- com.arc.hardware.ARC_RTT.1_0 ######## - -# Create ARC_RTT --create com.arc.hardware.ARC_RTT.1_0 System.ARC_RTT - -# has_nexus_if --- Please select Nexus interface to offload the data from RTT --has_nexus_if true - -# has_on_chip_mem --- Please select the on-chip memory option to store the trace data in shared memory --has_on_chip_mem true - -# nexus_data_wdt --- Please select the Nexus Data Width to offload the data from RTT --nexus_data_wdt 16 - -# internal_memory_size --- Please select internal memory size to capture the trace data --internal_memory_size 16k - -# ram_type --- Please select Types of internal memories to be inferred for the logic --ram_type 1_PORT - -# power_domains --- Adds isolation signal inputs/power switch controls for use in UPF flow when configuring power domains. --rtt_power_domains false - - -######## Tool Configuration --- cgen.1_0 ######## - -# Create Tool Configuration --create cgen.1_0 "System.Tool Configuration" - -# mwdt_version --- Selects the MetaWare version to be used with the TCF file. -# Change from the default to an older or newer toolset version if you want the TCF file to be used with an older or newer version of the MetaWare tools. --mwdt_version O-2018.09 - -# code_base_addr --- -# The base address to assign to the executable code segment in the linker command file when there is no ICCM in the build. This value is ignored when there is an ICCM. -# --code_base_addr 0x0 - -# data_base_addr --- -# The base address to assign to the data segment in the linker command file when the data is not being mapped to a DCCM. This value is ignored when the data segment is mapped to a DCCM, as in that case the base address of the DCCM memory is used. -# -# A value of 0xffffffff means that the data segment will not be mapped to any specific address. -# --data_base_addr 0xffff_ffff - -# underscores_in_numbers --- Use underscores in hex numbers to improve readability. --underscores_in_numbers false - -# tcf_rebrand --- Alternate branding of TCF (not used) --rebrand false - - -]]>
-
- - - - - - - - - - - - - - ICCM0 - - GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > SYSTEM2 - GROUP BLOCK(4): { - .Xdata? : {} - } > XCCM - GROUP BLOCK(4): { - .Ydata? : {} - } > YCCM - GROUP BLOCK(4) : { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) - } > IVT - } - -]]> - - - - - - 0x07, sub_opcode => 0x1E , latency_cycles => 8) - -// User extension instruction - dsp_sin -extern long dsp_sin(long); -#pragma intrinsic(dsp_sin, opcode => 0x07, sub_opcode => 0x1F , latency_cycles => 8) - -// User extension instruction - dsp_tan -extern long dsp_tan(long); -#pragma intrinsic(dsp_tan, opcode => 0x07, sub_opcode => 0x22 , latency_cycles => 11) - -// User extension instruction - dsp_acos -extern long dsp_acos(long); -#pragma intrinsic(dsp_acos, opcode => 0x07, sub_opcode => 0x23 , latency_cycles => 31) - -// User extension instruction - dsp_asin -extern long dsp_asin(long); -#pragma intrinsic(dsp_asin, opcode => 0x07, sub_opcode => 0x24 , latency_cycles => 31) - -// User extension instruction - dsp_atan -extern long dsp_atan(long); -#pragma intrinsic(dsp_atan, opcode => 0x07, sub_opcode => 0x25 , latency_cycles => 13) - -// User extension instruction - dsp_sqrt -extern long dsp_sqrt(long); -#pragma intrinsic(dsp_sqrt, opcode => 0x07, sub_opcode => 0x20 , latency_cycles => 31) - -// User extension instruction - dsp_sqrt15 -extern long dsp_sqrt15(long); -#pragma intrinsic(dsp_sqrt15, opcode => 0x07, sub_opcode => 0x21 , latency_cycles => 15) - -#define APEX_COM_ARC_HARDWARE_DFSS_DSP_TRIG_PRESENT 1 -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO0_PRESENT 1 - -// User extension aux register io_gpio0_debounce -#define AR_IO_GPIO0_DEBOUNCE 0x80017048 -#pragma Aux_register(0x80017048, name=>"io_gpio0_debounce") - -// User extension aux register io_gpio0_clken -#define AR_IO_GPIO0_CLKEN 0x80017080 -#pragma Aux_register(0x80017080, name=>"io_gpio0_clken") - -// User extension aux register io_gpio0_swporta_dr -#define AR_IO_GPIO0_SWPORTA_DR 0x80017000 -#pragma Aux_register(0x80017000, name=>"io_gpio0_swporta_dr") - -// User extension aux register io_gpio0_swporta_ddr -#define AR_IO_GPIO0_SWPORTA_DDR 0x80017004 -#pragma Aux_register(0x80017004, name=>"io_gpio0_swporta_ddr") - -// User extension aux register io_gpio0_inten -#define AR_IO_GPIO0_INTEN 0x80017030 -#pragma Aux_register(0x80017030, name=>"io_gpio0_inten") - -// User extension aux register io_gpio0_intmask -#define AR_IO_GPIO0_INTMASK 0x80017034 -#pragma Aux_register(0x80017034, name=>"io_gpio0_intmask") - -// User extension aux register io_gpio0_inttype_level -#define AR_IO_GPIO0_INTTYPE_LEVEL 0x80017038 -#pragma Aux_register(0x80017038, name=>"io_gpio0_inttype_level") - -// User extension aux register io_gpio0_int_polarity -#define AR_IO_GPIO0_INT_POLARITY 0x8001703c -#pragma Aux_register(0x8001703c, name=>"io_gpio0_int_polarity") - -// User extension aux register io_gpio0_intstatus -#define AR_IO_GPIO0_INTSTATUS 0x80017040 -#pragma Aux_register(0x80017040, name=>"io_gpio0_intstatus") - -// User extension aux register io_gpio0_raw_intstatus -#define AR_IO_GPIO0_RAW_INTSTATUS 0x80017044 -#pragma Aux_register(0x80017044, name=>"io_gpio0_raw_intstatus") - -// User extension aux register io_gpio0_porta_eoi -#define AR_IO_GPIO0_PORTA_EOI 0x8001704c -#pragma Aux_register(0x8001704c, name=>"io_gpio0_porta_eoi") - -// User extension aux register io_gpio0_ext_porta -#define AR_IO_GPIO0_EXT_PORTA 0x80017050 -#pragma Aux_register(0x80017050, name=>"io_gpio0_ext_porta") - -// User extension aux register io_gpio0_ls_sync -#define AR_IO_GPIO0_LS_SYNC 0x80017060 -#pragma Aux_register(0x80017060, name=>"io_gpio0_ls_sync") - -// User extension aux register io_gpio0_int_bothedge -#define AR_IO_GPIO0_INT_BOTHEDGE 0x80017068 -#pragma Aux_register(0x80017068, name=>"io_gpio0_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST0_PRESENT 1 - -// User extension aux register io_i2c_mst0_clken -#define AR_IO_I2C_MST0_CLKEN 0x800120c0 -#pragma Aux_register(0x800120c0, name=>"io_i2c_mst0_clken") - -// User extension aux register io_i2c_mst0_con -#define AR_IO_I2C_MST0_CON 0x80012000 -#pragma Aux_register(0x80012000, name=>"io_i2c_mst0_con") - -// User extension aux register io_i2c_mst0_tar -#define AR_IO_I2C_MST0_TAR 0x80012004 -#pragma Aux_register(0x80012004, name=>"io_i2c_mst0_tar") - -// User extension aux register io_i2c_mst0_data_cmd -#define AR_IO_I2C_MST0_DATA_CMD 0x80012010 -#pragma Aux_register(0x80012010, name=>"io_i2c_mst0_data_cmd") - -// User extension aux register io_i2c_mst0_ss_scl_hcnt -#define AR_IO_I2C_MST0_SS_SCL_HCNT 0x80012014 -#pragma Aux_register(0x80012014, name=>"io_i2c_mst0_ss_scl_hcnt") - -// User extension aux register io_i2c_mst0_ss_scl_lcnt -#define AR_IO_I2C_MST0_SS_SCL_LCNT 0x80012018 -#pragma Aux_register(0x80012018, name=>"io_i2c_mst0_ss_scl_lcnt") - -// User extension aux register io_i2c_mst0_fs_scl_hcnt -#define AR_IO_I2C_MST0_FS_SCL_HCNT 0x8001201c -#pragma Aux_register(0x8001201c, name=>"io_i2c_mst0_fs_scl_hcnt") - -// User extension aux register io_i2c_mst0_fs_scl_lcnt -#define AR_IO_I2C_MST0_FS_SCL_LCNT 0x80012020 -#pragma Aux_register(0x80012020, name=>"io_i2c_mst0_fs_scl_lcnt") - -// User extension aux register io_i2c_mst0_intr_stat -#define AR_IO_I2C_MST0_INTR_STAT 0x8001202c -#pragma Aux_register(0x8001202c, name=>"io_i2c_mst0_intr_stat") - -// User extension aux register io_i2c_mst0_intr_mask -#define AR_IO_I2C_MST0_INTR_MASK 0x80012030 -#pragma Aux_register(0x80012030, name=>"io_i2c_mst0_intr_mask") - -// User extension aux register io_i2c_mst0_raw_intr_stat -#define AR_IO_I2C_MST0_RAW_INTR_STAT 0x80012034 -#pragma Aux_register(0x80012034, name=>"io_i2c_mst0_raw_intr_stat") - -// User extension aux register io_i2c_mst0_rx_tl -#define AR_IO_I2C_MST0_RX_TL 0x80012038 -#pragma Aux_register(0x80012038, name=>"io_i2c_mst0_rx_tl") - -// User extension aux register io_i2c_mst0_tx_tl -#define AR_IO_I2C_MST0_TX_TL 0x8001203c -#pragma Aux_register(0x8001203c, name=>"io_i2c_mst0_tx_tl") - -// User extension aux register io_i2c_mst0_clr_intr -#define AR_IO_I2C_MST0_CLR_INTR 0x80012040 -#pragma Aux_register(0x80012040, name=>"io_i2c_mst0_clr_intr") - -// User extension aux register io_i2c_mst0_clr_rx_under -#define AR_IO_I2C_MST0_CLR_RX_UNDER 0x80012044 -#pragma Aux_register(0x80012044, name=>"io_i2c_mst0_clr_rx_under") - -// User extension aux register io_i2c_mst0_clr_rx_over -#define AR_IO_I2C_MST0_CLR_RX_OVER 0x80012048 -#pragma Aux_register(0x80012048, name=>"io_i2c_mst0_clr_rx_over") - -// User extension aux register io_i2c_mst0_clr_tx_over -#define AR_IO_I2C_MST0_CLR_TX_OVER 0x8001204c -#pragma Aux_register(0x8001204c, name=>"io_i2c_mst0_clr_tx_over") - -// User extension aux register io_i2c_mst0_clr_tx_abrt -#define AR_IO_I2C_MST0_CLR_TX_ABRT 0x80012054 -#pragma Aux_register(0x80012054, name=>"io_i2c_mst0_clr_tx_abrt") - -// User extension aux register io_i2c_mst0_clr_activity -#define AR_IO_I2C_MST0_CLR_ACTIVITY 0x8001205c -#pragma Aux_register(0x8001205c, name=>"io_i2c_mst0_clr_activity") - -// User extension aux register io_i2c_mst0_clr_stop_det -#define AR_IO_I2C_MST0_CLR_STOP_DET 0x80012060 -#pragma Aux_register(0x80012060, name=>"io_i2c_mst0_clr_stop_det") - -// User extension aux register io_i2c_mst0_clr_start_det -#define AR_IO_I2C_MST0_CLR_START_DET 0x80012064 -#pragma Aux_register(0x80012064, name=>"io_i2c_mst0_clr_start_det") - -// User extension aux register io_i2c_mst0_enable -#define AR_IO_I2C_MST0_ENABLE 0x8001206c -#pragma Aux_register(0x8001206c, name=>"io_i2c_mst0_enable") - -// User extension aux register io_i2c_mst0_status -#define AR_IO_I2C_MST0_STATUS 0x80012070 -#pragma Aux_register(0x80012070, name=>"io_i2c_mst0_status") - -// User extension aux register io_i2c_mst0_txflr -#define AR_IO_I2C_MST0_TXFLR 0x80012074 -#pragma Aux_register(0x80012074, name=>"io_i2c_mst0_txflr") - -// User extension aux register io_i2c_mst0_rxflr -#define AR_IO_I2C_MST0_RXFLR 0x80012078 -#pragma Aux_register(0x80012078, name=>"io_i2c_mst0_rxflr") - -// User extension aux register io_i2c_mst0_sda_hold -#define AR_IO_I2C_MST0_SDA_HOLD 0x8001207c -#pragma Aux_register(0x8001207c, name=>"io_i2c_mst0_sda_hold") - -// User extension aux register io_i2c_mst0_tx_abrt_source -#define AR_IO_I2C_MST0_TX_ABRT_SOURCE 0x80012080 -#pragma Aux_register(0x80012080, name=>"io_i2c_mst0_tx_abrt_source") - -// User extension aux register io_i2c_mst0_enable_status -#define AR_IO_I2C_MST0_ENABLE_STATUS 0x8001209c -#pragma Aux_register(0x8001209c, name=>"io_i2c_mst0_enable_status") - -// User extension aux register io_i2c_mst0_fs_spklen -#define AR_IO_I2C_MST0_FS_SPKLEN 0x800120a0 -#pragma Aux_register(0x800120a0, name=>"io_i2c_mst0_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_SLV0_PRESENT 1 - -// User extension aux register io_i2c_slv0_clken -#define AR_IO_I2C_SLV0_CLKEN 0x800130c0 -#pragma Aux_register(0x800130c0, name=>"io_i2c_slv0_clken") - -// User extension aux register io_i2c_slv0_con -#define AR_IO_I2C_SLV0_CON 0x80013000 -#pragma Aux_register(0x80013000, name=>"io_i2c_slv0_con") - -// User extension aux register io_i2c_slv0_sar -#define AR_IO_I2C_SLV0_SAR 0x80013008 -#pragma Aux_register(0x80013008, name=>"io_i2c_slv0_sar") - -// User extension aux register io_i2c_slv0_data_cmd -#define AR_IO_I2C_SLV0_DATA_CMD 0x80013010 -#pragma Aux_register(0x80013010, name=>"io_i2c_slv0_data_cmd") - -// User extension aux register io_i2c_slv0_intr_stat -#define AR_IO_I2C_SLV0_INTR_STAT 0x8001302c -#pragma Aux_register(0x8001302c, name=>"io_i2c_slv0_intr_stat") - -// User extension aux register io_i2c_slv0_intr_mask -#define AR_IO_I2C_SLV0_INTR_MASK 0x80013030 -#pragma Aux_register(0x80013030, name=>"io_i2c_slv0_intr_mask") - -// User extension aux register io_i2c_slv0_raw_intr_stat -#define AR_IO_I2C_SLV0_RAW_INTR_STAT 0x80013034 -#pragma Aux_register(0x80013034, name=>"io_i2c_slv0_raw_intr_stat") - -// User extension aux register io_i2c_slv0_rx_tl -#define AR_IO_I2C_SLV0_RX_TL 0x80013038 -#pragma Aux_register(0x80013038, name=>"io_i2c_slv0_rx_tl") - -// User extension aux register io_i2c_slv0_tx_tl -#define AR_IO_I2C_SLV0_TX_TL 0x8001303c -#pragma Aux_register(0x8001303c, name=>"io_i2c_slv0_tx_tl") - -// User extension aux register io_i2c_slv0_clr_intr -#define AR_IO_I2C_SLV0_CLR_INTR 0x80013040 -#pragma Aux_register(0x80013040, name=>"io_i2c_slv0_clr_intr") - -// User extension aux register io_i2c_slv0_clr_rx_under -#define AR_IO_I2C_SLV0_CLR_RX_UNDER 0x80013044 -#pragma Aux_register(0x80013044, name=>"io_i2c_slv0_clr_rx_under") - -// User extension aux register io_i2c_slv0_clr_rx_over -#define AR_IO_I2C_SLV0_CLR_RX_OVER 0x80013048 -#pragma Aux_register(0x80013048, name=>"io_i2c_slv0_clr_rx_over") - -// User extension aux register io_i2c_slv0_clr_tx_over -#define AR_IO_I2C_SLV0_CLR_TX_OVER 0x8001304c -#pragma Aux_register(0x8001304c, name=>"io_i2c_slv0_clr_tx_over") - -// User extension aux register io_i2c_slv0_clr_rd_req -#define AR_IO_I2C_SLV0_CLR_RD_REQ 0x80013050 -#pragma Aux_register(0x80013050, name=>"io_i2c_slv0_clr_rd_req") - -// User extension aux register io_i2c_slv0_clr_tx_abrt -#define AR_IO_I2C_SLV0_CLR_TX_ABRT 0x80013054 -#pragma Aux_register(0x80013054, name=>"io_i2c_slv0_clr_tx_abrt") - -// User extension aux register io_i2c_slv0_clr_rx_done -#define AR_IO_I2C_SLV0_CLR_RX_DONE 0x80013058 -#pragma Aux_register(0x80013058, name=>"io_i2c_slv0_clr_rx_done") - -// User extension aux register io_i2c_slv0_clr_activity -#define AR_IO_I2C_SLV0_CLR_ACTIVITY 0x8001305c -#pragma Aux_register(0x8001305c, name=>"io_i2c_slv0_clr_activity") - -// User extension aux register io_i2c_slv0_clr_stop_det -#define AR_IO_I2C_SLV0_CLR_STOP_DET 0x80013060 -#pragma Aux_register(0x80013060, name=>"io_i2c_slv0_clr_stop_det") - -// User extension aux register io_i2c_slv0_clr_start_det -#define AR_IO_I2C_SLV0_CLR_START_DET 0x80013064 -#pragma Aux_register(0x80013064, name=>"io_i2c_slv0_clr_start_det") - -// User extension aux register io_i2c_slv0_enable -#define AR_IO_I2C_SLV0_ENABLE 0x8001306c -#pragma Aux_register(0x8001306c, name=>"io_i2c_slv0_enable") - -// User extension aux register io_i2c_slv0_status -#define AR_IO_I2C_SLV0_STATUS 0x80013070 -#pragma Aux_register(0x80013070, name=>"io_i2c_slv0_status") - -// User extension aux register io_i2c_slv0_txflr -#define AR_IO_I2C_SLV0_TXFLR 0x80013074 -#pragma Aux_register(0x80013074, name=>"io_i2c_slv0_txflr") - -// User extension aux register io_i2c_slv0_rxflr -#define AR_IO_I2C_SLV0_RXFLR 0x80013078 -#pragma Aux_register(0x80013078, name=>"io_i2c_slv0_rxflr") - -// User extension aux register io_i2c_slv0_sda_hold -#define AR_IO_I2C_SLV0_SDA_HOLD 0x8001307c -#pragma Aux_register(0x8001307c, name=>"io_i2c_slv0_sda_hold") - -// User extension aux register io_i2c_slv0_tx_abrt_source -#define AR_IO_I2C_SLV0_TX_ABRT_SOURCE 0x80013080 -#pragma Aux_register(0x80013080, name=>"io_i2c_slv0_tx_abrt_source") - -// User extension aux register io_i2c_slv0_sda_setup -#define AR_IO_I2C_SLV0_SDA_SETUP 0x80013094 -#pragma Aux_register(0x80013094, name=>"io_i2c_slv0_sda_setup") - -// User extension aux register io_i2c_slv0_enable_status -#define AR_IO_I2C_SLV0_ENABLE_STATUS 0x8001309c -#pragma Aux_register(0x8001309c, name=>"io_i2c_slv0_enable_status") - -// User extension aux register io_i2c_slv0_fs_spklen -#define AR_IO_I2C_SLV0_FS_SPKLEN 0x800130a0 -#pragma Aux_register(0x800130a0, name=>"io_i2c_slv0_fs_spklen") - -// User extension aux register io_i2c_slv0_clr_restart_det -#define AR_IO_I2C_SLV0_CLR_RESTART_DET 0x800130a8 -#pragma Aux_register(0x800130a8, name=>"io_i2c_slv0_clr_restart_det") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST0_PRESENT 1 - -// User extension aux register io_spi_mst0_ctrlr0 -#define AR_IO_SPI_MST0_CTRLR0 0x80010000 -#pragma Aux_register(0x80010000, name=>"io_spi_mst0_ctrlr0") - -// User extension aux register io_spi_mst0_ctrlr1 -#define AR_IO_SPI_MST0_CTRLR1 0x80010001 -#pragma Aux_register(0x80010001, name=>"io_spi_mst0_ctrlr1") - -// User extension aux register io_spi_mst0_spien -#define AR_IO_SPI_MST0_SPIEN 0x80010002 -#pragma Aux_register(0x80010002, name=>"io_spi_mst0_spien") - -// User extension aux register io_spi_mst0_ser -#define AR_IO_SPI_MST0_SER 0x80010004 -#pragma Aux_register(0x80010004, name=>"io_spi_mst0_ser") - -// User extension aux register io_spi_mst0_baudr -#define AR_IO_SPI_MST0_BAUDR 0x80010005 -#pragma Aux_register(0x80010005, name=>"io_spi_mst0_baudr") - -// User extension aux register io_spi_mst0_txftlr -#define AR_IO_SPI_MST0_TXFTLR 0x80010006 -#pragma Aux_register(0x80010006, name=>"io_spi_mst0_txftlr") - -// User extension aux register io_spi_mst0_rxftlr -#define AR_IO_SPI_MST0_RXFTLR 0x80010007 -#pragma Aux_register(0x80010007, name=>"io_spi_mst0_rxftlr") - -// User extension aux register io_spi_mst0_txflr -#define AR_IO_SPI_MST0_TXFLR 0x80010008 -#pragma Aux_register(0x80010008, name=>"io_spi_mst0_txflr") - -// User extension aux register io_spi_mst0_rxflr -#define AR_IO_SPI_MST0_RXFLR 0x80010009 -#pragma Aux_register(0x80010009, name=>"io_spi_mst0_rxflr") - -// User extension aux register io_spi_mst0_sr -#define AR_IO_SPI_MST0_SR 0x8001000a -#pragma Aux_register(0x8001000a, name=>"io_spi_mst0_sr") - -// User extension aux register io_spi_mst0_imr -#define AR_IO_SPI_MST0_IMR 0x8001000b -#pragma Aux_register(0x8001000b, name=>"io_spi_mst0_imr") - -// User extension aux register io_spi_mst0_isr -#define AR_IO_SPI_MST0_ISR 0x8001000c -#pragma Aux_register(0x8001000c, name=>"io_spi_mst0_isr") - -// User extension aux register io_spi_mst0_risr -#define AR_IO_SPI_MST0_RISR 0x8001000d -#pragma Aux_register(0x8001000d, name=>"io_spi_mst0_risr") - -// User extension aux register io_spi_mst0_txoicr -#define AR_IO_SPI_MST0_TXOICR 0x8001000e -#pragma Aux_register(0x8001000e, name=>"io_spi_mst0_txoicr") - -// User extension aux register io_spi_mst0_rxoicr -#define AR_IO_SPI_MST0_RXOICR 0x8001000f -#pragma Aux_register(0x8001000f, name=>"io_spi_mst0_rxoicr") - -// User extension aux register io_spi_mst0_rxuicr -#define AR_IO_SPI_MST0_RXUICR 0x80010010 -#pragma Aux_register(0x80010010, name=>"io_spi_mst0_rxuicr") - -// User extension aux register io_spi_mst0_icr -#define AR_IO_SPI_MST0_ICR 0x80010012 -#pragma Aux_register(0x80010012, name=>"io_spi_mst0_icr") - -// User extension aux register io_spi_mst0_clken -#define AR_IO_SPI_MST0_CLKEN 0x80010016 -#pragma Aux_register(0x80010016, name=>"io_spi_mst0_clken") - -// User extension aux register io_spi_mst0_dr -#define AR_IO_SPI_MST0_DR 0x80010018 -#pragma Aux_register(0x80010018, name=>"io_spi_mst0_dr") - -// User extension aux register io_spi_mst0_rx_sample_dly -#define AR_IO_SPI_MST0_RX_SAMPLE_DLY 0x8001003c -#pragma Aux_register(0x8001003c, name=>"io_spi_mst0_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_SUBSYS_BCR_PRESENT 1 - -// User extension aux register SUBSYS_BUILD -#define AR_SUBSYS_BUILD 0xf0 -#pragma Aux_register(0xf0, name=>"SUBSYS_BUILD") - -// User extension aux register SUBSYS_DSP_0_BUILD -#define AR_SUBSYS_DSP_0_BUILD 0xa00 -#pragma Aux_register(0xa00, name=>"SUBSYS_DSP_0_BUILD") - -// User extension aux register SUBSYS_DSP_0_CONFIG -#define AR_SUBSYS_DSP_0_CONFIG 0xa02 -#pragma Aux_register(0xa02, name=>"SUBSYS_DSP_0_CONFIG") - -// User extension aux register SUBSYS_IO_0_BUILD -#define AR_SUBSYS_IO_0_BUILD 0xa04 -#pragma Aux_register(0xa04, name=>"SUBSYS_IO_0_BUILD") - -// User extension aux register SUBSYS_IO_1_BUILD -#define AR_SUBSYS_IO_1_BUILD 0xa05 -#pragma Aux_register(0xa05, name=>"SUBSYS_IO_1_BUILD") - -// User extension aux register SUBSYS_IO_2_BUILD -#define AR_SUBSYS_IO_2_BUILD 0xa06 -#pragma Aux_register(0xa06, name=>"SUBSYS_IO_2_BUILD") - -// User extension aux register SUBSYS_UAUX_OFFSET -#define AR_SUBSYS_UAUX_OFFSET 0xa1e -#pragma Aux_register(0xa1e, name=>"SUBSYS_UAUX_OFFSET") - -// User extension aux register SUBSYS_APEX_OFFSET -#define AR_SUBSYS_APEX_OFFSET 0xa1f -#pragma Aux_register(0xa1f, name=>"SUBSYS_APEX_OFFSET") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST1_PRESENT 1 - -// User extension aux register io_spi_mst1_ctrlr0 -#define AR_IO_SPI_MST1_CTRLR0 0x80010100 -#pragma Aux_register(0x80010100, name=>"io_spi_mst1_ctrlr0") - -// User extension aux register io_spi_mst1_ctrlr1 -#define AR_IO_SPI_MST1_CTRLR1 0x80010101 -#pragma Aux_register(0x80010101, name=>"io_spi_mst1_ctrlr1") - -// User extension aux register io_spi_mst1_spien -#define AR_IO_SPI_MST1_SPIEN 0x80010102 -#pragma Aux_register(0x80010102, name=>"io_spi_mst1_spien") - -// User extension aux register io_spi_mst1_ser -#define AR_IO_SPI_MST1_SER 0x80010104 -#pragma Aux_register(0x80010104, name=>"io_spi_mst1_ser") - -// User extension aux register io_spi_mst1_baudr -#define AR_IO_SPI_MST1_BAUDR 0x80010105 -#pragma Aux_register(0x80010105, name=>"io_spi_mst1_baudr") - -// User extension aux register io_spi_mst1_txftlr -#define AR_IO_SPI_MST1_TXFTLR 0x80010106 -#pragma Aux_register(0x80010106, name=>"io_spi_mst1_txftlr") - -// User extension aux register io_spi_mst1_rxftlr -#define AR_IO_SPI_MST1_RXFTLR 0x80010107 -#pragma Aux_register(0x80010107, name=>"io_spi_mst1_rxftlr") - -// User extension aux register io_spi_mst1_txflr -#define AR_IO_SPI_MST1_TXFLR 0x80010108 -#pragma Aux_register(0x80010108, name=>"io_spi_mst1_txflr") - -// User extension aux register io_spi_mst1_rxflr -#define AR_IO_SPI_MST1_RXFLR 0x80010109 -#pragma Aux_register(0x80010109, name=>"io_spi_mst1_rxflr") - -// User extension aux register io_spi_mst1_sr -#define AR_IO_SPI_MST1_SR 0x8001010a -#pragma Aux_register(0x8001010a, name=>"io_spi_mst1_sr") - -// User extension aux register io_spi_mst1_imr -#define AR_IO_SPI_MST1_IMR 0x8001010b -#pragma Aux_register(0x8001010b, name=>"io_spi_mst1_imr") - -// User extension aux register io_spi_mst1_isr -#define AR_IO_SPI_MST1_ISR 0x8001010c -#pragma Aux_register(0x8001010c, name=>"io_spi_mst1_isr") - -// User extension aux register io_spi_mst1_risr -#define AR_IO_SPI_MST1_RISR 0x8001010d -#pragma Aux_register(0x8001010d, name=>"io_spi_mst1_risr") - -// User extension aux register io_spi_mst1_txoicr -#define AR_IO_SPI_MST1_TXOICR 0x8001010e -#pragma Aux_register(0x8001010e, name=>"io_spi_mst1_txoicr") - -// User extension aux register io_spi_mst1_rxoicr -#define AR_IO_SPI_MST1_RXOICR 0x8001010f -#pragma Aux_register(0x8001010f, name=>"io_spi_mst1_rxoicr") - -// User extension aux register io_spi_mst1_rxuicr -#define AR_IO_SPI_MST1_RXUICR 0x80010110 -#pragma Aux_register(0x80010110, name=>"io_spi_mst1_rxuicr") - -// User extension aux register io_spi_mst1_icr -#define AR_IO_SPI_MST1_ICR 0x80010112 -#pragma Aux_register(0x80010112, name=>"io_spi_mst1_icr") - -// User extension aux register io_spi_mst1_clken -#define AR_IO_SPI_MST1_CLKEN 0x80010116 -#pragma Aux_register(0x80010116, name=>"io_spi_mst1_clken") - -// User extension aux register io_spi_mst1_dr -#define AR_IO_SPI_MST1_DR 0x80010118 -#pragma Aux_register(0x80010118, name=>"io_spi_mst1_dr") - -// User extension aux register io_spi_mst1_rx_sample_dly -#define AR_IO_SPI_MST1_RX_SAMPLE_DLY 0x8001013c -#pragma Aux_register(0x8001013c, name=>"io_spi_mst1_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST2_PRESENT 1 - -// User extension aux register io_spi_mst2_ctrlr0 -#define AR_IO_SPI_MST2_CTRLR0 0x80010200 -#pragma Aux_register(0x80010200, name=>"io_spi_mst2_ctrlr0") - -// User extension aux register io_spi_mst2_ctrlr1 -#define AR_IO_SPI_MST2_CTRLR1 0x80010201 -#pragma Aux_register(0x80010201, name=>"io_spi_mst2_ctrlr1") - -// User extension aux register io_spi_mst2_spien -#define AR_IO_SPI_MST2_SPIEN 0x80010202 -#pragma Aux_register(0x80010202, name=>"io_spi_mst2_spien") - -// User extension aux register io_spi_mst2_ser -#define AR_IO_SPI_MST2_SER 0x80010204 -#pragma Aux_register(0x80010204, name=>"io_spi_mst2_ser") - -// User extension aux register io_spi_mst2_baudr -#define AR_IO_SPI_MST2_BAUDR 0x80010205 -#pragma Aux_register(0x80010205, name=>"io_spi_mst2_baudr") - -// User extension aux register io_spi_mst2_txftlr -#define AR_IO_SPI_MST2_TXFTLR 0x80010206 -#pragma Aux_register(0x80010206, name=>"io_spi_mst2_txftlr") - -// User extension aux register io_spi_mst2_rxftlr -#define AR_IO_SPI_MST2_RXFTLR 0x80010207 -#pragma Aux_register(0x80010207, name=>"io_spi_mst2_rxftlr") - -// User extension aux register io_spi_mst2_txflr -#define AR_IO_SPI_MST2_TXFLR 0x80010208 -#pragma Aux_register(0x80010208, name=>"io_spi_mst2_txflr") - -// User extension aux register io_spi_mst2_rxflr -#define AR_IO_SPI_MST2_RXFLR 0x80010209 -#pragma Aux_register(0x80010209, name=>"io_spi_mst2_rxflr") - -// User extension aux register io_spi_mst2_sr -#define AR_IO_SPI_MST2_SR 0x8001020a -#pragma Aux_register(0x8001020a, name=>"io_spi_mst2_sr") - -// User extension aux register io_spi_mst2_imr -#define AR_IO_SPI_MST2_IMR 0x8001020b -#pragma Aux_register(0x8001020b, name=>"io_spi_mst2_imr") - -// User extension aux register io_spi_mst2_isr -#define AR_IO_SPI_MST2_ISR 0x8001020c -#pragma Aux_register(0x8001020c, name=>"io_spi_mst2_isr") - -// User extension aux register io_spi_mst2_risr -#define AR_IO_SPI_MST2_RISR 0x8001020d -#pragma Aux_register(0x8001020d, name=>"io_spi_mst2_risr") - -// User extension aux register io_spi_mst2_txoicr -#define AR_IO_SPI_MST2_TXOICR 0x8001020e -#pragma Aux_register(0x8001020e, name=>"io_spi_mst2_txoicr") - -// User extension aux register io_spi_mst2_rxoicr -#define AR_IO_SPI_MST2_RXOICR 0x8001020f -#pragma Aux_register(0x8001020f, name=>"io_spi_mst2_rxoicr") - -// User extension aux register io_spi_mst2_rxuicr -#define AR_IO_SPI_MST2_RXUICR 0x80010210 -#pragma Aux_register(0x80010210, name=>"io_spi_mst2_rxuicr") - -// User extension aux register io_spi_mst2_icr -#define AR_IO_SPI_MST2_ICR 0x80010212 -#pragma Aux_register(0x80010212, name=>"io_spi_mst2_icr") - -// User extension aux register io_spi_mst2_clken -#define AR_IO_SPI_MST2_CLKEN 0x80010216 -#pragma Aux_register(0x80010216, name=>"io_spi_mst2_clken") - -// User extension aux register io_spi_mst2_dr -#define AR_IO_SPI_MST2_DR 0x80010218 -#pragma Aux_register(0x80010218, name=>"io_spi_mst2_dr") - -// User extension aux register io_spi_mst2_rx_sample_dly -#define AR_IO_SPI_MST2_RX_SAMPLE_DLY 0x8001023c -#pragma Aux_register(0x8001023c, name=>"io_spi_mst2_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_SLV0_PRESENT 1 - -// User extension aux register io_spi_slv0_ctrlr0 -#define AR_IO_SPI_SLV0_CTRLR0 0x80011000 -#pragma Aux_register(0x80011000, name=>"io_spi_slv0_ctrlr0") - -// User extension aux register io_spi_slv0_spien -#define AR_IO_SPI_SLV0_SPIEN 0x80011002 -#pragma Aux_register(0x80011002, name=>"io_spi_slv0_spien") - -// User extension aux register io_spi_slv0_txftlr -#define AR_IO_SPI_SLV0_TXFTLR 0x80011006 -#pragma Aux_register(0x80011006, name=>"io_spi_slv0_txftlr") - -// User extension aux register io_spi_slv0_rxftlr -#define AR_IO_SPI_SLV0_RXFTLR 0x80011007 -#pragma Aux_register(0x80011007, name=>"io_spi_slv0_rxftlr") - -// User extension aux register io_spi_slv0_txflr -#define AR_IO_SPI_SLV0_TXFLR 0x80011008 -#pragma Aux_register(0x80011008, name=>"io_spi_slv0_txflr") - -// User extension aux register io_spi_slv0_rxflr -#define AR_IO_SPI_SLV0_RXFLR 0x80011009 -#pragma Aux_register(0x80011009, name=>"io_spi_slv0_rxflr") - -// User extension aux register io_spi_slv0_sr -#define AR_IO_SPI_SLV0_SR 0x8001100a -#pragma Aux_register(0x8001100a, name=>"io_spi_slv0_sr") - -// User extension aux register io_spi_slv0_imr -#define AR_IO_SPI_SLV0_IMR 0x8001100b -#pragma Aux_register(0x8001100b, name=>"io_spi_slv0_imr") - -// User extension aux register io_spi_slv0_isr -#define AR_IO_SPI_SLV0_ISR 0x8001100c -#pragma Aux_register(0x8001100c, name=>"io_spi_slv0_isr") - -// User extension aux register io_spi_slv0_risr -#define AR_IO_SPI_SLV0_RISR 0x8001100d -#pragma Aux_register(0x8001100d, name=>"io_spi_slv0_risr") - -// User extension aux register io_spi_slv0_txoicr -#define AR_IO_SPI_SLV0_TXOICR 0x8001100e -#pragma Aux_register(0x8001100e, name=>"io_spi_slv0_txoicr") - -// User extension aux register io_spi_slv0_rxoicr -#define AR_IO_SPI_SLV0_RXOICR 0x8001100f -#pragma Aux_register(0x8001100f, name=>"io_spi_slv0_rxoicr") - -// User extension aux register io_spi_slv0_rxuicr -#define AR_IO_SPI_SLV0_RXUICR 0x80011010 -#pragma Aux_register(0x80011010, name=>"io_spi_slv0_rxuicr") - -// User extension aux register io_spi_slv0_icr -#define AR_IO_SPI_SLV0_ICR 0x80011012 -#pragma Aux_register(0x80011012, name=>"io_spi_slv0_icr") - -// User extension aux register io_spi_slv0_clken -#define AR_IO_SPI_SLV0_CLKEN 0x80011016 -#pragma Aux_register(0x80011016, name=>"io_spi_slv0_clken") - -// User extension aux register io_spi_slv0_dr -#define AR_IO_SPI_SLV0_DR 0x80011018 -#pragma Aux_register(0x80011018, name=>"io_spi_slv0_dr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO1_PRESENT 1 - -// User extension aux register io_gpio1_debounce -#define AR_IO_GPIO1_DEBOUNCE 0x80017148 -#pragma Aux_register(0x80017148, name=>"io_gpio1_debounce") - -// User extension aux register io_gpio1_clken -#define AR_IO_GPIO1_CLKEN 0x80017180 -#pragma Aux_register(0x80017180, name=>"io_gpio1_clken") - -// User extension aux register io_gpio1_swporta_dr -#define AR_IO_GPIO1_SWPORTA_DR 0x80017100 -#pragma Aux_register(0x80017100, name=>"io_gpio1_swporta_dr") - -// User extension aux register io_gpio1_swporta_ddr -#define AR_IO_GPIO1_SWPORTA_DDR 0x80017104 -#pragma Aux_register(0x80017104, name=>"io_gpio1_swporta_ddr") - -// User extension aux register io_gpio1_inten -#define AR_IO_GPIO1_INTEN 0x80017130 -#pragma Aux_register(0x80017130, name=>"io_gpio1_inten") - -// User extension aux register io_gpio1_intmask -#define AR_IO_GPIO1_INTMASK 0x80017134 -#pragma Aux_register(0x80017134, name=>"io_gpio1_intmask") - -// User extension aux register io_gpio1_inttype_level -#define AR_IO_GPIO1_INTTYPE_LEVEL 0x80017138 -#pragma Aux_register(0x80017138, name=>"io_gpio1_inttype_level") - -// User extension aux register io_gpio1_int_polarity -#define AR_IO_GPIO1_INT_POLARITY 0x8001713c -#pragma Aux_register(0x8001713c, name=>"io_gpio1_int_polarity") - -// User extension aux register io_gpio1_intstatus -#define AR_IO_GPIO1_INTSTATUS 0x80017140 -#pragma Aux_register(0x80017140, name=>"io_gpio1_intstatus") - -// User extension aux register io_gpio1_raw_intstatus -#define AR_IO_GPIO1_RAW_INTSTATUS 0x80017144 -#pragma Aux_register(0x80017144, name=>"io_gpio1_raw_intstatus") - -// User extension aux register io_gpio1_porta_eoi -#define AR_IO_GPIO1_PORTA_EOI 0x8001714c -#pragma Aux_register(0x8001714c, name=>"io_gpio1_porta_eoi") - -// User extension aux register io_gpio1_ext_porta -#define AR_IO_GPIO1_EXT_PORTA 0x80017150 -#pragma Aux_register(0x80017150, name=>"io_gpio1_ext_porta") - -// User extension aux register io_gpio1_ls_sync -#define AR_IO_GPIO1_LS_SYNC 0x80017160 -#pragma Aux_register(0x80017160, name=>"io_gpio1_ls_sync") - -// User extension aux register io_gpio1_int_bothedge -#define AR_IO_GPIO1_INT_BOTHEDGE 0x80017168 -#pragma Aux_register(0x80017168, name=>"io_gpio1_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO2_PRESENT 1 - -// User extension aux register io_gpio2_debounce -#define AR_IO_GPIO2_DEBOUNCE 0x80017248 -#pragma Aux_register(0x80017248, name=>"io_gpio2_debounce") - -// User extension aux register io_gpio2_clken -#define AR_IO_GPIO2_CLKEN 0x80017280 -#pragma Aux_register(0x80017280, name=>"io_gpio2_clken") - -// User extension aux register io_gpio2_swporta_dr -#define AR_IO_GPIO2_SWPORTA_DR 0x80017200 -#pragma Aux_register(0x80017200, name=>"io_gpio2_swporta_dr") - -// User extension aux register io_gpio2_swporta_ddr -#define AR_IO_GPIO2_SWPORTA_DDR 0x80017204 -#pragma Aux_register(0x80017204, name=>"io_gpio2_swporta_ddr") - -// User extension aux register io_gpio2_inten -#define AR_IO_GPIO2_INTEN 0x80017230 -#pragma Aux_register(0x80017230, name=>"io_gpio2_inten") - -// User extension aux register io_gpio2_intmask -#define AR_IO_GPIO2_INTMASK 0x80017234 -#pragma Aux_register(0x80017234, name=>"io_gpio2_intmask") - -// User extension aux register io_gpio2_inttype_level -#define AR_IO_GPIO2_INTTYPE_LEVEL 0x80017238 -#pragma Aux_register(0x80017238, name=>"io_gpio2_inttype_level") - -// User extension aux register io_gpio2_int_polarity -#define AR_IO_GPIO2_INT_POLARITY 0x8001723c -#pragma Aux_register(0x8001723c, name=>"io_gpio2_int_polarity") - -// User extension aux register io_gpio2_intstatus -#define AR_IO_GPIO2_INTSTATUS 0x80017240 -#pragma Aux_register(0x80017240, name=>"io_gpio2_intstatus") - -// User extension aux register io_gpio2_raw_intstatus -#define AR_IO_GPIO2_RAW_INTSTATUS 0x80017244 -#pragma Aux_register(0x80017244, name=>"io_gpio2_raw_intstatus") - -// User extension aux register io_gpio2_porta_eoi -#define AR_IO_GPIO2_PORTA_EOI 0x8001724c -#pragma Aux_register(0x8001724c, name=>"io_gpio2_porta_eoi") - -// User extension aux register io_gpio2_ext_porta -#define AR_IO_GPIO2_EXT_PORTA 0x80017250 -#pragma Aux_register(0x80017250, name=>"io_gpio2_ext_porta") - -// User extension aux register io_gpio2_ls_sync -#define AR_IO_GPIO2_LS_SYNC 0x80017260 -#pragma Aux_register(0x80017260, name=>"io_gpio2_ls_sync") - -// User extension aux register io_gpio2_int_bothedge -#define AR_IO_GPIO2_INT_BOTHEDGE 0x80017268 -#pragma Aux_register(0x80017268, name=>"io_gpio2_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST1_PRESENT 1 - -// User extension aux register io_i2c_mst1_clken -#define AR_IO_I2C_MST1_CLKEN 0x800121c0 -#pragma Aux_register(0x800121c0, name=>"io_i2c_mst1_clken") - -// User extension aux register io_i2c_mst1_con -#define AR_IO_I2C_MST1_CON 0x80012100 -#pragma Aux_register(0x80012100, name=>"io_i2c_mst1_con") - -// User extension aux register io_i2c_mst1_tar -#define AR_IO_I2C_MST1_TAR 0x80012104 -#pragma Aux_register(0x80012104, name=>"io_i2c_mst1_tar") - -// User extension aux register io_i2c_mst1_data_cmd -#define AR_IO_I2C_MST1_DATA_CMD 0x80012110 -#pragma Aux_register(0x80012110, name=>"io_i2c_mst1_data_cmd") - -// User extension aux register io_i2c_mst1_ss_scl_hcnt -#define AR_IO_I2C_MST1_SS_SCL_HCNT 0x80012114 -#pragma Aux_register(0x80012114, name=>"io_i2c_mst1_ss_scl_hcnt") - -// User extension aux register io_i2c_mst1_ss_scl_lcnt -#define AR_IO_I2C_MST1_SS_SCL_LCNT 0x80012118 -#pragma Aux_register(0x80012118, name=>"io_i2c_mst1_ss_scl_lcnt") - -// User extension aux register io_i2c_mst1_fs_scl_hcnt -#define AR_IO_I2C_MST1_FS_SCL_HCNT 0x8001211c -#pragma Aux_register(0x8001211c, name=>"io_i2c_mst1_fs_scl_hcnt") - -// User extension aux register io_i2c_mst1_fs_scl_lcnt -#define AR_IO_I2C_MST1_FS_SCL_LCNT 0x80012120 -#pragma Aux_register(0x80012120, name=>"io_i2c_mst1_fs_scl_lcnt") - -// User extension aux register io_i2c_mst1_intr_stat -#define AR_IO_I2C_MST1_INTR_STAT 0x8001212c -#pragma Aux_register(0x8001212c, name=>"io_i2c_mst1_intr_stat") - -// User extension aux register io_i2c_mst1_intr_mask -#define AR_IO_I2C_MST1_INTR_MASK 0x80012130 -#pragma Aux_register(0x80012130, name=>"io_i2c_mst1_intr_mask") - -// User extension aux register io_i2c_mst1_raw_intr_stat -#define AR_IO_I2C_MST1_RAW_INTR_STAT 0x80012134 -#pragma Aux_register(0x80012134, name=>"io_i2c_mst1_raw_intr_stat") - -// User extension aux register io_i2c_mst1_rx_tl -#define AR_IO_I2C_MST1_RX_TL 0x80012138 -#pragma Aux_register(0x80012138, name=>"io_i2c_mst1_rx_tl") - -// User extension aux register io_i2c_mst1_tx_tl -#define AR_IO_I2C_MST1_TX_TL 0x8001213c -#pragma Aux_register(0x8001213c, name=>"io_i2c_mst1_tx_tl") - -// User extension aux register io_i2c_mst1_clr_intr -#define AR_IO_I2C_MST1_CLR_INTR 0x80012140 -#pragma Aux_register(0x80012140, name=>"io_i2c_mst1_clr_intr") - -// User extension aux register io_i2c_mst1_clr_rx_under -#define AR_IO_I2C_MST1_CLR_RX_UNDER 0x80012144 -#pragma Aux_register(0x80012144, name=>"io_i2c_mst1_clr_rx_under") - -// User extension aux register io_i2c_mst1_clr_rx_over -#define AR_IO_I2C_MST1_CLR_RX_OVER 0x80012148 -#pragma Aux_register(0x80012148, name=>"io_i2c_mst1_clr_rx_over") - -// User extension aux register io_i2c_mst1_clr_tx_over -#define AR_IO_I2C_MST1_CLR_TX_OVER 0x8001214c -#pragma Aux_register(0x8001214c, name=>"io_i2c_mst1_clr_tx_over") - -// User extension aux register io_i2c_mst1_clr_tx_abrt -#define AR_IO_I2C_MST1_CLR_TX_ABRT 0x80012154 -#pragma Aux_register(0x80012154, name=>"io_i2c_mst1_clr_tx_abrt") - -// User extension aux register io_i2c_mst1_clr_activity -#define AR_IO_I2C_MST1_CLR_ACTIVITY 0x8001215c -#pragma Aux_register(0x8001215c, name=>"io_i2c_mst1_clr_activity") - -// User extension aux register io_i2c_mst1_clr_stop_det -#define AR_IO_I2C_MST1_CLR_STOP_DET 0x80012160 -#pragma Aux_register(0x80012160, name=>"io_i2c_mst1_clr_stop_det") - -// User extension aux register io_i2c_mst1_clr_start_det -#define AR_IO_I2C_MST1_CLR_START_DET 0x80012164 -#pragma Aux_register(0x80012164, name=>"io_i2c_mst1_clr_start_det") - -// User extension aux register io_i2c_mst1_enable -#define AR_IO_I2C_MST1_ENABLE 0x8001216c -#pragma Aux_register(0x8001216c, name=>"io_i2c_mst1_enable") - -// User extension aux register io_i2c_mst1_status -#define AR_IO_I2C_MST1_STATUS 0x80012170 -#pragma Aux_register(0x80012170, name=>"io_i2c_mst1_status") - -// User extension aux register io_i2c_mst1_txflr -#define AR_IO_I2C_MST1_TXFLR 0x80012174 -#pragma Aux_register(0x80012174, name=>"io_i2c_mst1_txflr") - -// User extension aux register io_i2c_mst1_rxflr -#define AR_IO_I2C_MST1_RXFLR 0x80012178 -#pragma Aux_register(0x80012178, name=>"io_i2c_mst1_rxflr") - -// User extension aux register io_i2c_mst1_sda_hold -#define AR_IO_I2C_MST1_SDA_HOLD 0x8001217c -#pragma Aux_register(0x8001217c, name=>"io_i2c_mst1_sda_hold") - -// User extension aux register io_i2c_mst1_tx_abrt_source -#define AR_IO_I2C_MST1_TX_ABRT_SOURCE 0x80012180 -#pragma Aux_register(0x80012180, name=>"io_i2c_mst1_tx_abrt_source") - -// User extension aux register io_i2c_mst1_enable_status -#define AR_IO_I2C_MST1_ENABLE_STATUS 0x8001219c -#pragma Aux_register(0x8001219c, name=>"io_i2c_mst1_enable_status") - -// User extension aux register io_i2c_mst1_fs_spklen -#define AR_IO_I2C_MST1_FS_SPKLEN 0x800121a0 -#pragma Aux_register(0x800121a0, name=>"io_i2c_mst1_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST2_PRESENT 1 - -// User extension aux register io_i2c_mst2_clken -#define AR_IO_I2C_MST2_CLKEN 0x800122c0 -#pragma Aux_register(0x800122c0, name=>"io_i2c_mst2_clken") - -// User extension aux register io_i2c_mst2_con -#define AR_IO_I2C_MST2_CON 0x80012200 -#pragma Aux_register(0x80012200, name=>"io_i2c_mst2_con") - -// User extension aux register io_i2c_mst2_tar -#define AR_IO_I2C_MST2_TAR 0x80012204 -#pragma Aux_register(0x80012204, name=>"io_i2c_mst2_tar") - -// User extension aux register io_i2c_mst2_data_cmd -#define AR_IO_I2C_MST2_DATA_CMD 0x80012210 -#pragma Aux_register(0x80012210, name=>"io_i2c_mst2_data_cmd") - -// User extension aux register io_i2c_mst2_ss_scl_hcnt -#define AR_IO_I2C_MST2_SS_SCL_HCNT 0x80012214 -#pragma Aux_register(0x80012214, name=>"io_i2c_mst2_ss_scl_hcnt") - -// User extension aux register io_i2c_mst2_ss_scl_lcnt -#define AR_IO_I2C_MST2_SS_SCL_LCNT 0x80012218 -#pragma Aux_register(0x80012218, name=>"io_i2c_mst2_ss_scl_lcnt") - -// User extension aux register io_i2c_mst2_fs_scl_hcnt -#define AR_IO_I2C_MST2_FS_SCL_HCNT 0x8001221c -#pragma Aux_register(0x8001221c, name=>"io_i2c_mst2_fs_scl_hcnt") - -// User extension aux register io_i2c_mst2_fs_scl_lcnt -#define AR_IO_I2C_MST2_FS_SCL_LCNT 0x80012220 -#pragma Aux_register(0x80012220, name=>"io_i2c_mst2_fs_scl_lcnt") - -// User extension aux register io_i2c_mst2_intr_stat -#define AR_IO_I2C_MST2_INTR_STAT 0x8001222c -#pragma Aux_register(0x8001222c, name=>"io_i2c_mst2_intr_stat") - -// User extension aux register io_i2c_mst2_intr_mask -#define AR_IO_I2C_MST2_INTR_MASK 0x80012230 -#pragma Aux_register(0x80012230, name=>"io_i2c_mst2_intr_mask") - -// User extension aux register io_i2c_mst2_raw_intr_stat -#define AR_IO_I2C_MST2_RAW_INTR_STAT 0x80012234 -#pragma Aux_register(0x80012234, name=>"io_i2c_mst2_raw_intr_stat") - -// User extension aux register io_i2c_mst2_rx_tl -#define AR_IO_I2C_MST2_RX_TL 0x80012238 -#pragma Aux_register(0x80012238, name=>"io_i2c_mst2_rx_tl") - -// User extension aux register io_i2c_mst2_tx_tl -#define AR_IO_I2C_MST2_TX_TL 0x8001223c -#pragma Aux_register(0x8001223c, name=>"io_i2c_mst2_tx_tl") - -// User extension aux register io_i2c_mst2_clr_intr -#define AR_IO_I2C_MST2_CLR_INTR 0x80012240 -#pragma Aux_register(0x80012240, name=>"io_i2c_mst2_clr_intr") - -// User extension aux register io_i2c_mst2_clr_rx_under -#define AR_IO_I2C_MST2_CLR_RX_UNDER 0x80012244 -#pragma Aux_register(0x80012244, name=>"io_i2c_mst2_clr_rx_under") - -// User extension aux register io_i2c_mst2_clr_rx_over -#define AR_IO_I2C_MST2_CLR_RX_OVER 0x80012248 -#pragma Aux_register(0x80012248, name=>"io_i2c_mst2_clr_rx_over") - -// User extension aux register io_i2c_mst2_clr_tx_over -#define AR_IO_I2C_MST2_CLR_TX_OVER 0x8001224c -#pragma Aux_register(0x8001224c, name=>"io_i2c_mst2_clr_tx_over") - -// User extension aux register io_i2c_mst2_clr_tx_abrt -#define AR_IO_I2C_MST2_CLR_TX_ABRT 0x80012254 -#pragma Aux_register(0x80012254, name=>"io_i2c_mst2_clr_tx_abrt") - -// User extension aux register io_i2c_mst2_clr_activity -#define AR_IO_I2C_MST2_CLR_ACTIVITY 0x8001225c -#pragma Aux_register(0x8001225c, name=>"io_i2c_mst2_clr_activity") - -// User extension aux register io_i2c_mst2_clr_stop_det -#define AR_IO_I2C_MST2_CLR_STOP_DET 0x80012260 -#pragma Aux_register(0x80012260, name=>"io_i2c_mst2_clr_stop_det") - -// User extension aux register io_i2c_mst2_clr_start_det -#define AR_IO_I2C_MST2_CLR_START_DET 0x80012264 -#pragma Aux_register(0x80012264, name=>"io_i2c_mst2_clr_start_det") - -// User extension aux register io_i2c_mst2_enable -#define AR_IO_I2C_MST2_ENABLE 0x8001226c -#pragma Aux_register(0x8001226c, name=>"io_i2c_mst2_enable") - -// User extension aux register io_i2c_mst2_status -#define AR_IO_I2C_MST2_STATUS 0x80012270 -#pragma Aux_register(0x80012270, name=>"io_i2c_mst2_status") - -// User extension aux register io_i2c_mst2_txflr -#define AR_IO_I2C_MST2_TXFLR 0x80012274 -#pragma Aux_register(0x80012274, name=>"io_i2c_mst2_txflr") - -// User extension aux register io_i2c_mst2_rxflr -#define AR_IO_I2C_MST2_RXFLR 0x80012278 -#pragma Aux_register(0x80012278, name=>"io_i2c_mst2_rxflr") - -// User extension aux register io_i2c_mst2_sda_hold -#define AR_IO_I2C_MST2_SDA_HOLD 0x8001227c -#pragma Aux_register(0x8001227c, name=>"io_i2c_mst2_sda_hold") - -// User extension aux register io_i2c_mst2_tx_abrt_source -#define AR_IO_I2C_MST2_TX_ABRT_SOURCE 0x80012280 -#pragma Aux_register(0x80012280, name=>"io_i2c_mst2_tx_abrt_source") - -// User extension aux register io_i2c_mst2_enable_status -#define AR_IO_I2C_MST2_ENABLE_STATUS 0x8001229c -#pragma Aux_register(0x8001229c, name=>"io_i2c_mst2_enable_status") - -// User extension aux register io_i2c_mst2_fs_spklen -#define AR_IO_I2C_MST2_FS_SPKLEN 0x800122a0 -#pragma Aux_register(0x800122a0, name=>"io_i2c_mst2_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART0_PRESENT 1 - -// User extension aux register io_uart0_clken -#define AR_IO_UART0_CLKEN 0x800140c0 -#pragma Aux_register(0x800140c0, name=>"io_uart0_clken") - -// User extension aux register io_uart0_rbr_thr_dll -#define AR_IO_UART0_RBR_THR_DLL 0x80014000 -#pragma Aux_register(0x80014000, name=>"io_uart0_rbr_thr_dll") - -// User extension aux register io_uart0_ier_dlh -#define AR_IO_UART0_IER_DLH 0x80014004 -#pragma Aux_register(0x80014004, name=>"io_uart0_ier_dlh") - -// User extension aux register io_uart0_iir_fcr -#define AR_IO_UART0_IIR_FCR 0x80014008 -#pragma Aux_register(0x80014008, name=>"io_uart0_iir_fcr") - -// User extension aux register io_uart0_lcr -#define AR_IO_UART0_LCR 0x8001400c -#pragma Aux_register(0x8001400c, name=>"io_uart0_lcr") - -// User extension aux register io_uart0_mcr -#define AR_IO_UART0_MCR 0x80014010 -#pragma Aux_register(0x80014010, name=>"io_uart0_mcr") - -// User extension aux register io_uart0_lsr -#define AR_IO_UART0_LSR 0x80014014 -#pragma Aux_register(0x80014014, name=>"io_uart0_lsr") - -// User extension aux register io_uart0_msr -#define AR_IO_UART0_MSR 0x80014018 -#pragma Aux_register(0x80014018, name=>"io_uart0_msr") - -// User extension aux register io_uart0_usr -#define AR_IO_UART0_USR 0x8001407c -#pragma Aux_register(0x8001407c, name=>"io_uart0_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART1_PRESENT 1 - -// User extension aux register io_uart1_clken -#define AR_IO_UART1_CLKEN 0x800141c0 -#pragma Aux_register(0x800141c0, name=>"io_uart1_clken") - -// User extension aux register io_uart1_rbr_thr_dll -#define AR_IO_UART1_RBR_THR_DLL 0x80014100 -#pragma Aux_register(0x80014100, name=>"io_uart1_rbr_thr_dll") - -// User extension aux register io_uart1_ier_dlh -#define AR_IO_UART1_IER_DLH 0x80014104 -#pragma Aux_register(0x80014104, name=>"io_uart1_ier_dlh") - -// User extension aux register io_uart1_iir_fcr -#define AR_IO_UART1_IIR_FCR 0x80014108 -#pragma Aux_register(0x80014108, name=>"io_uart1_iir_fcr") - -// User extension aux register io_uart1_lcr -#define AR_IO_UART1_LCR 0x8001410c -#pragma Aux_register(0x8001410c, name=>"io_uart1_lcr") - -// User extension aux register io_uart1_mcr -#define AR_IO_UART1_MCR 0x80014110 -#pragma Aux_register(0x80014110, name=>"io_uart1_mcr") - -// User extension aux register io_uart1_lsr -#define AR_IO_UART1_LSR 0x80014114 -#pragma Aux_register(0x80014114, name=>"io_uart1_lsr") - -// User extension aux register io_uart1_msr -#define AR_IO_UART1_MSR 0x80014118 -#pragma Aux_register(0x80014118, name=>"io_uart1_msr") - -// User extension aux register io_uart1_usr -#define AR_IO_UART1_USR 0x8001417c -#pragma Aux_register(0x8001417c, name=>"io_uart1_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART2_PRESENT 1 - -// User extension aux register io_uart2_clken -#define AR_IO_UART2_CLKEN 0x800142c0 -#pragma Aux_register(0x800142c0, name=>"io_uart2_clken") - -// User extension aux register io_uart2_rbr_thr_dll -#define AR_IO_UART2_RBR_THR_DLL 0x80014200 -#pragma Aux_register(0x80014200, name=>"io_uart2_rbr_thr_dll") - -// User extension aux register io_uart2_ier_dlh -#define AR_IO_UART2_IER_DLH 0x80014204 -#pragma Aux_register(0x80014204, name=>"io_uart2_ier_dlh") - -// User extension aux register io_uart2_iir_fcr -#define AR_IO_UART2_IIR_FCR 0x80014208 -#pragma Aux_register(0x80014208, name=>"io_uart2_iir_fcr") - -// User extension aux register io_uart2_lcr -#define AR_IO_UART2_LCR 0x8001420c -#pragma Aux_register(0x8001420c, name=>"io_uart2_lcr") - -// User extension aux register io_uart2_mcr -#define AR_IO_UART2_MCR 0x80014210 -#pragma Aux_register(0x80014210, name=>"io_uart2_mcr") - -// User extension aux register io_uart2_lsr -#define AR_IO_UART2_LSR 0x80014214 -#pragma Aux_register(0x80014214, name=>"io_uart2_lsr") - -// User extension aux register io_uart2_msr -#define AR_IO_UART2_MSR 0x80014218 -#pragma Aux_register(0x80014218, name=>"io_uart2_msr") - -// User extension aux register io_uart2_usr -#define AR_IO_UART2_USR 0x8001427c -#pragma Aux_register(0x8001427c, name=>"io_uart2_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART3_PRESENT 1 - -// User extension aux register io_uart3_clken -#define AR_IO_UART3_CLKEN 0x800143c0 -#pragma Aux_register(0x800143c0, name=>"io_uart3_clken") - -// User extension aux register io_uart3_rbr_thr_dll -#define AR_IO_UART3_RBR_THR_DLL 0x80014300 -#pragma Aux_register(0x80014300, name=>"io_uart3_rbr_thr_dll") - -// User extension aux register io_uart3_ier_dlh -#define AR_IO_UART3_IER_DLH 0x80014304 -#pragma Aux_register(0x80014304, name=>"io_uart3_ier_dlh") - -// User extension aux register io_uart3_iir_fcr -#define AR_IO_UART3_IIR_FCR 0x80014308 -#pragma Aux_register(0x80014308, name=>"io_uart3_iir_fcr") - -// User extension aux register io_uart3_lcr -#define AR_IO_UART3_LCR 0x8001430c -#pragma Aux_register(0x8001430c, name=>"io_uart3_lcr") - -// User extension aux register io_uart3_mcr -#define AR_IO_UART3_MCR 0x80014310 -#pragma Aux_register(0x80014310, name=>"io_uart3_mcr") - -// User extension aux register io_uart3_lsr -#define AR_IO_UART3_LSR 0x80014314 -#pragma Aux_register(0x80014314, name=>"io_uart3_lsr") - -// User extension aux register io_uart3_msr -#define AR_IO_UART3_MSR 0x80014318 -#pragma Aux_register(0x80014318, name=>"io_uart3_msr") - -// User extension aux register io_uart3_usr -#define AR_IO_UART3_USR 0x8001437c -#pragma Aux_register(0x8001437c, name=>"io_uart3_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2S_RX_MST0_PRESENT 1 - -// User extension aux register io_i2s_rx_mst0_ier -#define AR_IO_I2S_RX_MST0_IER 0x8001a000 -#pragma Aux_register(0x8001a000, name=>"io_i2s_rx_mst0_ier") - -// User extension aux register io_i2s_rx_mst0_irer -#define AR_IO_I2S_RX_MST0_IRER 0x8001a004 -#pragma Aux_register(0x8001a004, name=>"io_i2s_rx_mst0_irer") - -// User extension aux register io_i2s_rx_mst0_cer -#define AR_IO_I2S_RX_MST0_CER 0x8001a00c -#pragma Aux_register(0x8001a00c, name=>"io_i2s_rx_mst0_cer") - -// User extension aux register io_i2s_rx_mst0_ccr -#define AR_IO_I2S_RX_MST0_CCR 0x8001a010 -#pragma Aux_register(0x8001a010, name=>"io_i2s_rx_mst0_ccr") - -// User extension aux register io_i2s_rx_mst0_rxffr -#define AR_IO_I2S_RX_MST0_RXFFR 0x8001a014 -#pragma Aux_register(0x8001a014, name=>"io_i2s_rx_mst0_rxffr") - -// User extension aux register io_i2s_rx_mst0_lrbr -#define AR_IO_I2S_RX_MST0_LRBR 0x8001a020 -#pragma Aux_register(0x8001a020, name=>"io_i2s_rx_mst0_lrbr") - -// User extension aux register io_i2s_rx_mst0_rrbr -#define AR_IO_I2S_RX_MST0_RRBR 0x8001a024 -#pragma Aux_register(0x8001a024, name=>"io_i2s_rx_mst0_rrbr") - -// User extension aux register io_i2s_rx_mst0_rer -#define AR_IO_I2S_RX_MST0_RER 0x8001a028 -#pragma Aux_register(0x8001a028, name=>"io_i2s_rx_mst0_rer") - -// User extension aux register io_i2s_rx_mst0_rcr -#define AR_IO_I2S_RX_MST0_RCR 0x8001a030 -#pragma Aux_register(0x8001a030, name=>"io_i2s_rx_mst0_rcr") - -// User extension aux register io_i2s_rx_mst0_isr -#define AR_IO_I2S_RX_MST0_ISR 0x8001a038 -#pragma Aux_register(0x8001a038, name=>"io_i2s_rx_mst0_isr") - -// User extension aux register io_i2s_rx_mst0_imr -#define AR_IO_I2S_RX_MST0_IMR 0x8001a03c -#pragma Aux_register(0x8001a03c, name=>"io_i2s_rx_mst0_imr") - -// User extension aux register io_i2s_rx_mst0_ror -#define AR_IO_I2S_RX_MST0_ROR 0x8001a040 -#pragma Aux_register(0x8001a040, name=>"io_i2s_rx_mst0_ror") - -// User extension aux register io_i2s_rx_mst0_rfcr -#define AR_IO_I2S_RX_MST0_RFCR 0x8001a048 -#pragma Aux_register(0x8001a048, name=>"io_i2s_rx_mst0_rfcr") - -// User extension aux register io_i2s_rx_mst0_rff -#define AR_IO_I2S_RX_MST0_RFF 0x8001a050 -#pragma Aux_register(0x8001a050, name=>"io_i2s_rx_mst0_rff") - -// User extension aux register io_i2s_rx_mst0_rxdma -#define AR_IO_I2S_RX_MST0_RXDMA 0x8001a1c0 -#pragma Aux_register(0x8001a1c0, name=>"io_i2s_rx_mst0_rxdma") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2S_TX_MST0_PRESENT 1 - -// User extension aux register io_i2s_tx_mst0_ier -#define AR_IO_I2S_TX_MST0_IER 0x80019000 -#pragma Aux_register(0x80019000, name=>"io_i2s_tx_mst0_ier") - -// User extension aux register io_i2s_tx_mst0_iter -#define AR_IO_I2S_TX_MST0_ITER 0x80019008 -#pragma Aux_register(0x80019008, name=>"io_i2s_tx_mst0_iter") - -// User extension aux register io_i2s_tx_mst0_cer -#define AR_IO_I2S_TX_MST0_CER 0x8001900c -#pragma Aux_register(0x8001900c, name=>"io_i2s_tx_mst0_cer") - -// User extension aux register io_i2s_tx_mst0_ccr -#define AR_IO_I2S_TX_MST0_CCR 0x80019010 -#pragma Aux_register(0x80019010, name=>"io_i2s_tx_mst0_ccr") - -// User extension aux register io_i2s_tx_mst0_txffr -#define AR_IO_I2S_TX_MST0_TXFFR 0x80019018 -#pragma Aux_register(0x80019018, name=>"io_i2s_tx_mst0_txffr") - -// User extension aux register io_i2s_tx_mst0_lthr -#define AR_IO_I2S_TX_MST0_LTHR 0x80019020 -#pragma Aux_register(0x80019020, name=>"io_i2s_tx_mst0_lthr") - -// User extension aux register io_i2s_tx_mst0_rthr -#define AR_IO_I2S_TX_MST0_RTHR 0x80019024 -#pragma Aux_register(0x80019024, name=>"io_i2s_tx_mst0_rthr") - -// User extension aux register io_i2s_tx_mst0_ter -#define AR_IO_I2S_TX_MST0_TER 0x8001902c -#pragma Aux_register(0x8001902c, name=>"io_i2s_tx_mst0_ter") - -// User extension aux register io_i2s_tx_mst0_tcr -#define AR_IO_I2S_TX_MST0_TCR 0x80019034 -#pragma Aux_register(0x80019034, name=>"io_i2s_tx_mst0_tcr") - -// User extension aux register io_i2s_tx_mst0_isr -#define AR_IO_I2S_TX_MST0_ISR 0x80019038 -#pragma Aux_register(0x80019038, name=>"io_i2s_tx_mst0_isr") - -// User extension aux register io_i2s_tx_mst0_imr -#define AR_IO_I2S_TX_MST0_IMR 0x8001903c -#pragma Aux_register(0x8001903c, name=>"io_i2s_tx_mst0_imr") - -// User extension aux register io_i2s_tx_mst0_tor -#define AR_IO_I2S_TX_MST0_TOR 0x80019044 -#pragma Aux_register(0x80019044, name=>"io_i2s_tx_mst0_tor") - -// User extension aux register io_i2s_tx_mst0_tfcr -#define AR_IO_I2S_TX_MST0_TFCR 0x8001904c -#pragma Aux_register(0x8001904c, name=>"io_i2s_tx_mst0_tfcr") - -// User extension aux register io_i2s_tx_mst0_tff -#define AR_IO_I2S_TX_MST0_TFF 0x80019054 -#pragma Aux_register(0x80019054, name=>"io_i2s_tx_mst0_tff") - -// User extension aux register io_i2s_tx_mst0_txdma -#define AR_IO_I2S_TX_MST0_TXDMA 0x800191c8 -#pragma Aux_register(0x800191c8, name=>"io_i2s_tx_mst0_txdma") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_PDM_RX0_PRESENT 1 - -// User extension aux register io_pdm_rx0_pdm_en -#define AR_IO_PDM_RX0_PDM_EN 0x8001b000 -#pragma Aux_register(0x8001b000, name=>"io_pdm_rx0_pdm_en") - -// User extension aux register io_pdm_rx0_pdm_ren -#define AR_IO_PDM_RX0_PDM_REN 0x8001b004 -#pragma Aux_register(0x8001b004, name=>"io_pdm_rx0_pdm_ren") - -// User extension aux register io_pdm_rx0_cer -#define AR_IO_PDM_RX0_CER 0x8001b00c -#pragma Aux_register(0x8001b00c, name=>"io_pdm_rx0_cer") - -// User extension aux register io_pdm_rx0_rxffr -#define AR_IO_PDM_RX0_RXFFR 0x8001b014 -#pragma Aux_register(0x8001b014, name=>"io_pdm_rx0_rxffr") - -// User extension aux register io_pdm_rx0_rer0 -#define AR_IO_PDM_RX0_RER0 0x8001b028 -#pragma Aux_register(0x8001b028, name=>"io_pdm_rx0_rer0") - -// User extension aux register io_pdm_rx0_isr -#define AR_IO_PDM_RX0_ISR 0x8001b038 -#pragma Aux_register(0x8001b038, name=>"io_pdm_rx0_isr") - -// User extension aux register io_pdm_rx0_imr -#define AR_IO_PDM_RX0_IMR 0x8001b03c -#pragma Aux_register(0x8001b03c, name=>"io_pdm_rx0_imr") - -// User extension aux register io_pdm_rx0_ror -#define AR_IO_PDM_RX0_ROR 0x8001b040 -#pragma Aux_register(0x8001b040, name=>"io_pdm_rx0_ror") - -// User extension aux register io_pdm_rx0_rfcr -#define AR_IO_PDM_RX0_RFCR 0x8001b048 -#pragma Aux_register(0x8001b048, name=>"io_pdm_rx0_rfcr") - -// User extension aux register io_pdm_rx0_rxdma -#define AR_IO_PDM_RX0_RXDMA 0x8001b1c0 -#pragma Aux_register(0x8001b1c0, name=>"io_pdm_rx0_rxdma") - -// User extension aux register io_pdm_rx0_pdm_rr -#define AR_IO_PDM_RX0_PDM_RR 0x8001b1d0 -#pragma Aux_register(0x8001b1d0, name=>"io_pdm_rx0_pdm_rr") - -// User extension aux register io_pdm_rx0_cic_n -#define AR_IO_PDM_RX0_CIC_N 0x8001b1d4 -#pragma Aux_register(0x8001b1d4, name=>"io_pdm_rx0_cic_n") - -// User extension aux register io_pdm_rx0_cic_d -#define AR_IO_PDM_RX0_CIC_D 0x8001b1d8 -#pragma Aux_register(0x8001b1d8, name=>"io_pdm_rx0_cic_d") - -// User extension aux register io_pdm_rx0_dcrc -#define AR_IO_PDM_RX0_DCRC 0x8001b1dc -#pragma Aux_register(0x8001b1dc, name=>"io_pdm_rx0_dcrc") - -// User extension aux register io_pdm_rx0_brc_b0 -#define AR_IO_PDM_RX0_BRC_B0 0x8001b1e0 -#pragma Aux_register(0x8001b1e0, name=>"io_pdm_rx0_brc_b0") - -// User extension aux register io_pdm_rx0_brc_clp -#define AR_IO_PDM_RX0_BRC_CLP 0x8001b1f0 -#pragma Aux_register(0x8001b1f0, name=>"io_pdm_rx0_brc_clp") -#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_PRESENT 1 - -// User extension aux register fpu_build -#define AR_FPU_BUILD 0xc8 -#pragma Aux_register(0xc8, name=>"fpu_build") - -// User extension aux register fpu_ctrl -#define AR_FPU_CTRL 0x300 -#pragma Aux_register(0x300, name=>"fpu_ctrl") - -// User extension aux register fpu_status -#define AR_FPU_STATUS 0x301 -#pragma Aux_register(0x301, name=>"fpu_status") - -// User extension instruction fsmadd -extern int fsmadd(int,int); -#pragma intrinsic(fsmadd,opcode=>6,sub_opcode=>5, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsmsub -extern int fsmsub(int,int); -#pragma intrinsic(fsmsub,opcode=>6,sub_opcode=>6, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsmul -extern int fsmul(int,int); -#pragma intrinsic(fsmul,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsadd -extern int fsadd(int,int); -#pragma intrinsic(fsadd,opcode=>6,sub_opcode=>1, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fssub -extern int fssub(int,int); -#pragma intrinsic(fssub,opcode=>6,sub_opcode=>2, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fcvt32 -extern int fcvt32(int,int); -#pragma intrinsic(fcvt32,opcode=>6,sub_opcode=>8, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsdiv -extern int fsdiv(int,int); -#pragma intrinsic(fsdiv,opcode=>6,sub_opcode=>7, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmp -extern int fscmp(int,int); -#pragma intrinsic(fscmp,opcode=>6,sub_opcode=>3, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmp -extern int fscmp_f(int,int); -#pragma intrinsic(fscmp_f,opcode=>6,sub_opcode=>3, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmpf -extern int fscmpf(int,int); -#pragma intrinsic(fscmpf,opcode=>6,sub_opcode=>4, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmpf -extern int fscmpf_f(int,int); -#pragma intrinsic(fscmpf_f,opcode=>6,sub_opcode=>4, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fssqrt -extern int fssqrt(int); -#pragma intrinsic(fssqrt,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") -#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_DP_ASSIST_PRESENT 1 - -// User extension aux register aux_dpfp1l -#define AR_AUX_DPFP1L 0x302 -#pragma Aux_register(0x302, name=>"aux_dpfp1l") - -// User extension aux register aux_dpfp1h -#define AR_AUX_DPFP1H 0x303 -#pragma Aux_register(0x303, name=>"aux_dpfp1h") - -// User extension aux register aux_dpfp2l -#define AR_AUX_DPFP2L 0x304 -#pragma Aux_register(0x304, name=>"aux_dpfp2l") - -// User extension aux register aux_dpfp2h -#define AR_AUX_DPFP2H 0x305 -#pragma Aux_register(0x305, name=>"aux_dpfp2h") - -// User extension instruction dmulh11 -extern int dmulh11(int,int); -#pragma intrinsic(dmulh11,opcode=>6,sub_opcode=>48,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh11 -extern int dmulh11_f(int,int); -#pragma intrinsic(dmulh11_f,opcode=>6,sub_opcode=>48, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh12 -extern int dmulh12(int,int); -#pragma intrinsic(dmulh12,opcode=>6,sub_opcode=>49,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh12 -extern int dmulh12_f(int,int); -#pragma intrinsic(dmulh12_f,opcode=>6,sub_opcode=>49, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh21 -extern int dmulh21(int,int); -#pragma intrinsic(dmulh21,opcode=>6,sub_opcode=>50,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh21 -extern int dmulh21_f(int,int); -#pragma intrinsic(dmulh21_f,opcode=>6,sub_opcode=>50, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh22 -extern int dmulh22(int,int); -#pragma intrinsic(dmulh22,opcode=>6,sub_opcode=>51,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh22 -extern int dmulh22_f(int,int); -#pragma intrinsic(dmulh22_f,opcode=>6,sub_opcode=>51, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh11 -extern int daddh11(int,int); -#pragma intrinsic(daddh11,opcode=>6,sub_opcode=>52,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh11 -extern int daddh11_f(int,int); -#pragma intrinsic(daddh11_f,opcode=>6,sub_opcode=>52, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh12 -extern int daddh12(int,int); -#pragma intrinsic(daddh12,opcode=>6,sub_opcode=>53,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh12 -extern int daddh12_f(int,int); -#pragma intrinsic(daddh12_f,opcode=>6,sub_opcode=>53, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh21 -extern int daddh21(int,int); -#pragma intrinsic(daddh21,opcode=>6,sub_opcode=>54,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh21 -extern int daddh21_f(int,int); -#pragma intrinsic(daddh21_f,opcode=>6,sub_opcode=>54, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh22 -extern int daddh22(int,int); -#pragma intrinsic(daddh22,opcode=>6,sub_opcode=>55,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh22 -extern int daddh22_f(int,int); -#pragma intrinsic(daddh22_f,opcode=>6,sub_opcode=>55, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh11 -extern int dsubh11(int,int); -#pragma intrinsic(dsubh11,opcode=>6,sub_opcode=>56,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh11 -extern int dsubh11_f(int,int); -#pragma intrinsic(dsubh11_f,opcode=>6,sub_opcode=>56, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh12 -extern int dsubh12(int,int); -#pragma intrinsic(dsubh12,opcode=>6,sub_opcode=>57,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh12 -extern int dsubh12_f(int,int); -#pragma intrinsic(dsubh12_f,opcode=>6,sub_opcode=>57, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh21 -extern int dsubh21(int,int); -#pragma intrinsic(dsubh21,opcode=>6,sub_opcode=>58,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh21 -extern int dsubh21_f(int,int); -#pragma intrinsic(dsubh21_f,opcode=>6,sub_opcode=>58, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh22 -extern int dsubh22(int,int); -#pragma intrinsic(dsubh22,opcode=>6,sub_opcode=>59,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh22 -extern int dsubh22_f(int,int); -#pragma intrinsic(dsubh22_f,opcode=>6,sub_opcode=>59, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dexcl1 -extern int dexcl1(int,int); -#pragma intrinsic(dexcl1,opcode=>6,sub_opcode=>60, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dexcl2 -extern int dexcl2(int,int); -#pragma intrinsic(dexcl2,opcode=>6,sub_opcode=>61, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - - -#endif - - -]]> - - - - -
- diff --git a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc index 211437bd9f4..405b9698cca 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc @@ -18,14 +18,23 @@ ifeq ($(TARGET), arc_emsdp) TARGET_ARCH := arc ARC_TOOLCHAIN := mwdt - TCF_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf - LCF_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf - UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env - UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE)) BUILD_ARC_MLI := false ARC_MLI_PRE_COMPILED_TARGET := emsdp_em11d_em9d_dfss +ifneq ($(filter no_arc_mli,$(ALL_TAGS)),) + MLI_LIB_DIR = arc_mli_package + $(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),)) +else ifeq ($(BUILD_ARC_MLI), true) + MLI_LIB_DIR = arc_mli_$(ARC_MLI_PRE_COMPILED_TARGET) +endif + + TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/hw/emsdp_em11d_em9d_dfss.tcf + LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf + UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env + UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE)) + + include $(MAKEFILE_DIR)/targets/arc/arc_common.inc ARC_EXTRA_APP_SETTINGS = \ diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index db420b7fd1b..d90f8548f31 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -75,7 +75,7 @@ EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embar EMBARC_MLI_MD5 := "7eebd730db79c6834399f87e509115fb" EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC1/embARC_MLI_package.zip" -EMBARC_MLI_PRE_COMPILED_MD5 := "b85b8b89446757735342795367e37d22" +EMBARC_MLI_PRE_COMPILED_MD5 := "a66d6afff8daeb40bd3a99c42de048ab" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From 00165602f78b33bc07f9bb8134472bbeceac23cf Mon Sep 17 00:00:00 2001 From: Ajay P Date: Thu, 7 May 2020 00:54:02 +0000 Subject: [PATCH 056/557] Removed API change and added tests. --- .../python/keras/integration_test/BUILD | 10 ++ .../gradient_checkpoint_test.py | 160 ++++++++++++++++++ tensorflow/python/ops/custom_gradient.py | 9 +- 3 files changed, 173 insertions(+), 6 deletions(-) create mode 100644 tensorflow/python/keras/integration_test/gradient_checkpoint_test.py diff --git a/tensorflow/python/keras/integration_test/BUILD b/tensorflow/python/keras/integration_test/BUILD index 01c405a86ae..f92f9d14685 100644 --- a/tensorflow/python/keras/integration_test/BUILD +++ b/tensorflow/python/keras/integration_test/BUILD @@ -70,3 +70,13 @@ tf_py_test( "//tensorflow/python:extra_py_tests_deps", ], ) + +tf_py_test( + name = "gradient_checkpoint_test", + srcs = ["gradient_checkpoint_test.py"], + python_version = "PY3", + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python:extra_py_tests_deps", + ], +) diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py new file mode 100644 index 00000000000..df23c3abff5 --- /dev/null +++ b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py @@ -0,0 +1,160 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.keras import layers, optimizers + + +def _get_big_cnn_model(img_dim, n_channels, num_partitions, + blocks_per_partition): + """Creates a test model whose activations are significantly larger than model size.""" + model = tf.keras.Sequential() + model.add(layers.Input(shape=(img_dim, img_dim, n_channels))) + for _ in range(num_partitions): + for _ in range(blocks_per_partition): + model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Flatten()) + model.add(layers.Dense(32, activation=tf.nn.relu)) + model.add(layers.Dense(10)) + return model + + +def _get_split_cnn_model(img_dim, n_channels, num_partitions, + blocks_per_partition): + """Creates a test model that is split into `num_partitions` smaller models""" + models = [tf.keras.Sequential() for _ in range(num_partitions)] + models[0].add(layers.Input(shape=(img_dim, img_dim, n_channels))) + for i in range(num_partitions): + model = models[i] + if i > 0: + last_shape = models[i - 1].layers[-1].output_shape + model.add(layers.Input(shape=last_shape[1:])) + for _ in range(blocks_per_partition): + model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + models[-1].add(layers.Flatten()) + models[-1].add(layers.Dense(32, activation=tf.nn.relu)) + models[-1].add(layers.Dense(10)) + return models + + +def _compute_loss(logits, labels): + return tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, + labels=labels)) + + +def _limit_gpu_memory(): + """Helper function to limit GPU memory for testing """ + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + try: + tf.config.experimental.set_virtual_device_configuration( + gpus[0], [ + tf.config.experimental.VirtualDeviceConfiguration( + memory_limit=1024) + ]) + except RuntimeError as e: + print(e) + + +def _get_dummy_data(img_dim, n_channels, batch_size): + inputs = tf.ones([batch_size, img_dim, img_dim, n_channels]) + labels = tf.ones([batch_size], dtype=tf.int64) + return inputs, labels + + +def _train_no_recompute(n_steps): + """Trains a single large model without gradient checkpointing.""" + _limit_gpu_memory() + img_dim, n_channels, batch_size = 256, 1, 4 + x, y = _get_dummy_data(img_dim, n_channels, batch_size) + model = _get_big_cnn_model(img_dim, + n_channels, + num_partitions=3, + blocks_per_partition=2) + optimizer = optimizers.SGD() + losses = [] + tr_vars = model.trainable_variables + for _ in range(n_steps): + with tf.GradientTape() as tape: + logits = model(x) + loss = _compute_loss(logits, y) + losses.append(loss) + grads = tape.gradient(loss, tr_vars) # tr_vars + optimizer.apply_gradients(zip(grads, tr_vars)) + del grads + return losses + + +def _train_with_recompute(n_steps): + """Trains a single large model with gradient checkpointing using tf.recompute_grad.""" + _limit_gpu_memory() + img_dim, n_channels, batch_size = 256, 1, 4 + x, y = _get_dummy_data(img_dim, n_channels, batch_size) + # This model is the same model as _get_big_cnn_model but split into 3 parts. + models = _get_split_cnn_model(img_dim, + n_channels, + num_partitions=3, + blocks_per_partition=2) + model1, model2, model3 = models + # Apply gradient checkpointing to the submodels using tf.recompute_grad. + model1_re = tf.recompute_grad(model1) + model2_re = tf.recompute_grad(model2) + model3_re = tf.recompute_grad(model3) + optimizer = optimizers.SGD() + tr_vars = model1.trainable_variables + model2.trainable_variables + model3.trainable_variables + losses = [] + for _ in range(n_steps): + with tf.GradientTape() as tape: + logits1 = model1_re(x) + logits2 = model2_re(logits1) + logits3 = model3_re(logits2) + loss = _compute_loss(logits3, y) + losses.append(loss) + grads = tape.gradient(loss, tr_vars) # tr_vars + optimizer.apply_gradients(zip(grads, tr_vars)) + del grads + return losses + + +class GradientCheckpointTest(tf.test.TestCase): + + def test_raises_oom_exception(self): + with self.assertRaises(Exception) as context: + _train_no_recompute(1) + self.assertTrue( + context.exception.__class__.__name__ == 'ResourceExhaustedError') + + def test_does_not_raise_oom_exception(self): + n_step = 2 + losses = _train_with_recompute(n_step) + self.assertTrue(len(losses) == n_step) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index a20619f5be7..a5013062936 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -406,17 +406,14 @@ def _graph_mode_decorator(f, args, kwargs): def _eager_mode_decorator(f, args, kwargs): """Implement custom gradient decorator for eager mode.""" - - trainable_vars = [] - if 'trainable_variables' in kwargs: - trainable_vars = kwargs.pop('trainable_variables') - result, grad_fn = f(*args, **kwargs) + with tape_lib.VariableWatcher() as variable_watcher: + result, grad_fn = f(*args, **kwargs) all_inputs = list(args) + list(kwargs.values()) # The variables that grad_fn needs to return gradients for are the set of # variables used that are *not* part of the inputs. variables = [ v.deref() # pylint: disable=g-complex-comprehension - for v in set(v.ref() for v in trainable_vars) + for v in set(v.ref() for v in variable_watcher.watched_variables()) if all(v.deref() is not i for i in all_inputs) ] grad_argspec = tf_inspect.getfullargspec(grad_fn) From 175ec5e02e4f7bc1662c6a6b0bde2c50292ba638 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 8 May 2020 19:01:07 +0300 Subject: [PATCH 057/557] arc_mli slicing: Got rid of hand-written MIN/MAX macro --- .../lite/micro/kernels/arc_mli/mli_slicers.cc | 10 +++++----- .../micro/kernels/arc_mli/scratch_buf_mgr.cc | 19 ++++++++++--------- .../micro/kernels/arc_mli/scratch_buffers.cc | 3 +-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc index 91bae5caa38..11065f00646 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc @@ -15,8 +15,8 @@ limitations under the License. #include "mli_slicers.h" -#define MAX(A,B) (((A) > (B))? (A): (B)) -#define MIN(A,B) (((A) > (B))? (B): (A)) +#include + namespace tflite { namespace ops { @@ -75,11 +75,11 @@ void TensorSlicer::ComputeSubTensor(void) { // begin and end spans the complete input region including padding areas. const int begin = (int)sub_cfg_.offset[sliceDim_] - pad_pre_; // end is clipped to the end of the full input region. this is needed for cases where the last slice is smaller than the rest. - const int end = MIN(begin + sub_cfg_.size[sliceDim_] + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); + const int end = std::min(begin + sub_cfg_.size[sliceDim_] + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); // The start coordinate of the subtensor is clipped to zero - cfg_new.offset[sliceDim_] = MAX(begin, 0); + cfg_new.offset[sliceDim_] = std::max(begin, 0); // and the stop coordinate is clipped to the size of the full tensor - const int stop_coord = MIN(end, full_tensor_->shape[sliceDim_]); + const int stop_coord = std::min(end, static_cast(full_tensor_->shape[sliceDim_])); // compute the size of the subtensor cfg_new.size[sliceDim_] = stop_coord - cfg_new.offset[sliceDim_]; diff --git a/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc index d030d04170c..097908e30ab 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc @@ -15,9 +15,10 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" #include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" + +#include #include -#define MAX(A,B) (((A) > (B))? (A): (B)) -#define MIN(A,B) (((A) > (B))? (B): (A)) + namespace tflite { namespace ops { @@ -242,19 +243,19 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( *out_slice_height = out_height; } else { // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. - max_lines_in = MIN(in_height, in->capacity / line_size_in); + max_lines_in = std::min(in_height, static_cast(in->capacity) / line_size_in); if (max_lines_in >= in_height) { max_out_lines_for_input = out_height; } else if (2 * max_lines_in >= in_height) { // in this case only two slices are needed, so both could benefit from padding. take the MIN to get the worst case. - max_out_lines_for_input = (max_lines_in + MIN(padding_top, padding_bot) - kernel_height + 1) / stride_height; + max_out_lines_for_input = (max_lines_in + std::min(padding_top, padding_bot) - kernel_height + 1) / stride_height; } else { max_out_lines_for_input = (max_lines_in - kernel_height + 1) / stride_height; // TODO add padding exceptions and test by makin fit=false; } // Ten compute how many ouput lines fit into the output tensor. - max_lines_out = MIN(out_height, out->capacity / line_size_out); + max_lines_out = std::min(out_height, static_cast(out->capacity) / line_size_out); // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. - *out_slice_height = MIN(max_out_lines_for_input, max_lines_out); + *out_slice_height = std::min(max_out_lines_for_input, max_lines_out); *in_slice_height = *out_slice_height * stride_height; } @@ -282,11 +283,11 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_weights( *slice_channels = channels; } else { // First compute how many channels fit into the weights tensor - max_ch_weigths = MIN(channels, weights->capacity / ch_size_w); + max_ch_weigths = std::min(channels, static_cast(weights->capacity) / ch_size_w); // Ten compute how many channels fit into the bias tensor. - max_ch_bias = MIN(channels, bias->capacity / ch_size_b); + max_ch_bias = std::min(channels, static_cast(bias->capacity) / ch_size_b); // the smallest of the two determines the slice size - *slice_channels = MIN(max_ch_weigths, max_ch_bias); + *slice_channels = std::min(max_ch_weigths, max_ch_bias); } if (*slice_channels > 0) { diff --git a/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc index a770e4ccd66..6b56770f1f7 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc @@ -14,9 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" + #include -#define MAX(A,B) (((A) > (B))? (A): (B)) -#define MIN(A,B) (((A) > (B))? (B): (A)) namespace tflite { namespace ops { From 738a28685bc1a5714ee2ea40d431156f526c3e0b Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Fri, 8 May 2020 13:10:15 -0700 Subject: [PATCH 058/557] Enabling DNNL SGEMM and removing all code related to MKL matmuls. --- .../core/common_runtime/mkl_layout_pass.cc | 12 ++- tensorflow/core/kernels/mkl_matmul_op.cc | 82 ++----------------- tensorflow/core/ops/math_ops.cc | 2 +- 3 files changed, 18 insertions(+), 78 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 2941845a604..55355363106 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -499,7 +499,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CopyAttrsAll, LrnGradRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.matmul, mkl_op_registry::GetMklOpName(csinfo_.matmul), - CopyAttrsAll, AlwaysRewrite, kRewriteForOpNameChange}); + CopyAttrsAll, MatMulRewrite, kRewriteForOpNameChange}); rinfo_.push_back( {csinfo_.leakyrelu, mkl_op_registry::GetMklOpName(csinfo_.leakyrelu), CopyAttrsAll, LeakyReluRewrite, kRewriteForLayoutPropagation}); @@ -1473,6 +1473,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return false; } + static bool MatMulRewrite(const Node* n) { + DataType T; + GetNodeAttr(n->def(), "T", &T); + if ((T == DT_FLOAT) || (T == DT_BFLOAT16)) { + VLOG(2) << "Rewriting MatMul to _MklMatMul"; + return true; + } + return false; + } + static bool DequantizeRewrite(const Node* n) { DCHECK(n); Node* input = nullptr; diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc index 3a7c864d10e..83785af8910 100644 --- a/tensorflow/core/kernels/mkl_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_matmul_op.cc @@ -31,13 +31,7 @@ limitations under the License. #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/kernels/mkl_matmul_ops_common.h" #include "tensorflow/core/util/mkl_util.h" - -// This header file is part of MKL ML, need equivalent file in MKL DNN -#ifndef INTEL_MKL_DNN_ONLY -#include "mkl_cblas.h" -#endif - -#include "mkldnn.h" +#include "mkldnn.hpp" namespace tensorflow { @@ -157,21 +151,11 @@ class MklMatMulOp : public OpKernel { // 1.0 and 0.0 respectively. const float alpha = 1.0f; const float beta = 0.0f; -#if defined(INTEL_MKL_DNN_ONLY) - const char* const ftrans[] = {"N", "T", "C"}; - int index_transa = transa ? 1 : 0; - int index_transb = transb ? 1 : 0; - VLOG(2) << "MKL DNN SGEMM called"; - // MKL DNN only supports the Fortran api and requires column major while - // Tensorflow uses row major so we reverse the order A and B - mkldnn_sgemm(ftrans[index_transb], ftrans[index_transa], &n, &m, &k, &alpha, - b, &ldb, a, &lda, &beta, c, &ldc); -#else - // MKL ML binary uses CBLAS API - cblas_sgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, - transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b, - ldb, beta, c, ldc); -#endif + char char_transa = transa ? 'T' : 'N'; + char char_transb = transb ? 'T' : 'N'; + VLOG(2) << "MKL DNN SGEMM CALLED"; + dnnl_sgemm(char_transa, char_transb, m, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); } #ifdef ENABLE_INTEL_MKL_BFLOAT16 @@ -205,53 +189,6 @@ class MklMatMulOp : public OpKernel { FloatToBFloat16(c_float.flat().data(), c, c_float.NumElements()); } #endif // ENABLE_INTEL_MKL_BFLOAT16 - -// MKL-DNN only supports SGEMM and bfloat16-GEMM. -#ifndef INTEL_MKL_DNN_ONLY - - // Matrix-Matrix Multiplication with FP64 tensors. For detailed info about - // parameters, look at FP32 function description. - void MklBlasGemm(OpKernelContext* ctx, bool transa, bool transb, const int m, - const int n, const int k, const double* a, const int lda, - const double* b, const int ldb, double* c, const int ldc) { - const double alpha = 1.0; - const double beta = 0.0; - cblas_dgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, - transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b, - ldb, beta, c, ldc); - } - - // Matrix-Matrix Multiplication with Complex64 (std::complex) tensors. - // For detailed info about parameters, look at FP32 function description. - void MklBlasGemm(OpKernelContext* ctx, bool transa, bool transb, const int m, - const int n, const int k, const complex64* a, const int lda, - const complex64* b, const int ldb, complex64* c, - int const ldc) { - const MKL_Complex8 alpha = {1.0f, 0.0f}; - const MKL_Complex8 beta = {0.0f, 0.0f}; - cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, - transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb, &beta, - reinterpret_cast(c), ldc); - } - - // Matrix-Matrix Multiplication with Complex128 (std::complex) - // tensors. For detailed info about parameters, look at FP32 function - // description. - void MklBlasGemm(OpKernelContext* ctx, bool transa, bool transb, const int m, - const int n, const int k, const complex128* a, const int lda, - const complex128* b, const int ldb, complex128* c, - const int ldc) { - const MKL_Complex16 alpha = {1.0, 0.0}; - const MKL_Complex16 beta = {0.0, 0.0}; - cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, - transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb, &beta, - reinterpret_cast(c), ldc); - } -#endif // !INTEL_MKL_DNN_ONLY }; #define REGISTER_CPU(T) \ @@ -269,13 +206,6 @@ TF_CALL_float(REGISTER_CPU); #ifdef ENABLE_INTEL_MKL_BFLOAT16 TF_CALL_bfloat16(REGISTER_CPU); #endif // ENABLE_INTEL_MKL_BFLOAT16 - -#ifndef INTEL_MKL_DNN_ONLY -TF_CALL_double(REGISTER_CPU); -TF_CALL_complex64(REGISTER_CPU); -TF_CALL_complex128(REGISTER_CPU); -#endif // !INTEL_MKL_DNN_ONLY #endif // ENABLE_MKL - } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 7ac003379d4..d00731f223a 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -936,7 +936,7 @@ REGISTER_OP("_MklMatMul") .Output("product: T") .Attr("transpose_a: bool = false") .Attr("transpose_b: bool = false") - .Attr("T: {bfloat16, float, double, complex64, complex128}") + .Attr("T: {bfloat16, float}") .SetShapeFn(shape_inference::MatMulShape); #endif // INTEL_MKL From f208ff6827e17fe773cf59192abaaa3f90bd16ad Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Mon, 11 May 2020 17:14:11 +0800 Subject: [PATCH 059/557] [tflite] reformat/cleanup label_image readme.md --- .../lite/examples/label_image/README.md | 124 +++++++++++++----- 1 file changed, 88 insertions(+), 36 deletions(-) diff --git a/tensorflow/lite/examples/label_image/README.md b/tensorflow/lite/examples/label_image/README.md index 09e9e77b86a..9d37c153361 100644 --- a/tensorflow/lite/examples/label_image/README.md +++ b/tensorflow/lite/examples/label_image/README.md @@ -90,48 +90,100 @@ adb push tensorflow/lite/examples/label_image/testdata/grace_hopper.bmp /data/l adb push /tmp/labels.txt /data/local/tmp ``` -Run it, `adb shell "/data/local/tmp/label_image \ -m -/data/local/tmp/mobilenet_v1_1.0_224.tflite \ -i -/data/local/tmp/grace_hopper.bmp \ -l /data/local/tmp/labels.txt"` then you -should see something like the followings: `Loaded model -/data/local/tmp/mobilenet_v1_1.0_224.tflite resolved reporter INFO: Initialized -TensorFlow Lite runtime. invoked average time: 25.03 ms 0.907071: 653 military -uniform 0.0372416: 907 Windsor tie 0.00733753: 466 bulletproof vest 0.00592852: -458 bow tie 0.00414091: 514 cornet` +Run it, +``` +adb shell "/data/local/tmp/label_image \ + -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ + -i /data/local/tmp/grace_hopper.bmp \ + -l /data/local/tmp/labels.txt" +``` +then you should see something like the followings: +``` +Loaded model /data/local/tmp/mobilenet_v1_1.0_224.tflite +resolved reporter +INFO: Initialized +TensorFlow Lite runtime. +invoked +average time: 25.03 ms +0.907071: 653 military uniform +0.0372416: 907 Windsor tie +0.00733753: 466 bulletproof vest +0.00592852: 458 bow tie +0.00414091: 514 cornet +``` -Run the model with NNAPI delegate (`-a 1`), `adb shell -"/data/local/tmp/label_image \ -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ --i /data/local/tmp/grace_hopper.bmp \ -l /data/local/tmp/labels.txt -a 1 -f 1"` -then you should see something like the followings: `Loaded model -/data/local/tmp/mobilenet_v1_1.0_224.tflite resolved reporter INFO: Initialized -TensorFlow Lite runtime. INFO: Created TensorFlow Lite delegate for NNAPI. -Applied NNAPI delegate. invoked average time:10.348 ms 0.905401: 653 military -uniform 0.0379589: 907 Windsor tie 0.00735866: 466 bulletproof vest 0.00605307: -458 bow tie 0.00422573: 514 cornet` +Run the model with NNAPI delegate (`-a 1`), +``` +adb shell "/data/local/tmp/label_image \ + -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ + -i /data/local/tmp/grace_hopper.bmp \ + -l /data/local/tmp/labels.txt -a 1 -f 1" +``` +then you should see something like the followings: +``` +Loaded model /data/local/tmp/mobilenet_v1_1.0_224.tflite +resolved reporter +INFO: Initialized +TensorFlow Lite runtime. +INFO: Created TensorFlow Lite delegate for NNAPI. +Applied NNAPI delegate. +invoked +average time:10.348 ms +0.905401: 653 military uniform +0.0379589: 907 Windsor tie +0.00735866: 466 bulletproof vest +0.00605307: 458 bow tie +0.00422573: 514 cornet +``` To run a model with the Hexagon Delegate, assuming we have followed the [Hexagon Delegate Guide](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/hexagon_delegate.md) -and installed Hexagon libraries in `/data/local/tmp`. Run it `adb shell -"/data/local/tmp/label_image \ -m -/data/local/tmp/mobilenet_v1_1.0_224_quant.tflite \ -i -/data/local/tmp/grace_hopper.bmp \ -l /data/local/tmp/labels.txt -j 1"` then you -should see something like the followings: ``` Loaded model -/data/local/tmp/mobilenet_v1_1.0_224_quant.tflite resolved reporter INFO: -Initialized TensorFlow Lite runtime. INFO: Created TensorFlow Lite delegate for -Hexagon. INFO: Hexagon delegate: 31 nodes delegated out of 31 nodes. +and installed Hexagon libraries in `/data/local/tmp`. Run it +``` +adb shell \ + "/data/local/tmp/label_image \ + -m /data/local/tmp/mobilenet_v1_1.0_224_quant.tflite \ + -i /data/local/tmp/grace_hopper.bmp \ + -l /data/local/tmp/labels.txt -j 1" +``` +then you should see something like the followings: +``` +Loaded model /data/local/tmp/mobilenet_v1_1.0_224_quant.tflite +resolved reporter +INFO: Initialized TensorFlow Lite runtime. +loaded libcdsprpc.so +INFO: Created TensorFlow Lite delegate for Hexagon. +INFO: Hexagon delegate: 31 nodes delegated out of 31 nodes with 1 partitions. -remote_handle_control available and used Applied Hexagon delegate.invoked -average time: 8.307 ms 0.729412: 653 military uniform 0.0980392: 907 Windsor tie -0.0313726: 466 bulletproof vest 0.0313726: 458 bow tie 0.0117647: 700 panpipe +Applied Hexagon delegate.invoked +average time: 4.231 ms +0.639216: 458 bow tie +0.329412: 653 military uniform +0.00784314: 835 suit +0.00784314: 611 jersey +0.00392157: 514 cornet ``` -Run the model with the XNNPACK delegate (`-x 1`), `adb shell -"/data/local/tmp/label_image \ -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ --i /data/local/tmp/grace_hopper.bmp \ -l /data/local/tmp/labels.txt -x 1"` then -you should see something like the followings: `Loaded model -/data/local/tmp/mobilenet_v1_1.0_224.tflite resolved reporter INFO: Initialized -TensorFlow Lite runtime. Applied XNNPACK delegate.invoked average time: 11.0237 -ms 0.90707: 653 military uniform 0.0372418: 907 Windsor tie 0.0073376: 466 -bulletproof vest 0.00592856: 458 bow tie 0.00414093: 514 cornet` +Run the model with the XNNPACK delegate (`-x 1`), +``` +adb shell \ + "/data/local/tmp/label_image \ + -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ + -i /data/local/tmp/grace_hopper.bmp \ + -l /data/local/tmp/labels.txt -x 1" +``` +then you should see something like the followings: +``` +Loaded model /data/local/tmp/mobilenet_v1_1.0_224.tflite +resolved reporter +INFO: Initialized TensorFlow Lite runtime. +Applied XNNPACK delegate.invoked +average time: 17.33 ms +0.90707: 653 military uniform +0.0372418: 907 Windsor tie +0.0073376: 466 bulletproof vest +0.00592857: 458 bow tie +0.00414093: 514 cornet +``` See the `label_image.cc` source code for other command line options. From f8867620dcc60433b9a83a5af5b96276e83127d6 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 11 May 2020 12:36:37 +0300 Subject: [PATCH 060/557] Explanatory comments in slicing tests files (ARC specific) + URL to the latest embarc_MLI library --- tensorflow/lite/micro/kernels/arc_mli/README.md | 2 +- .../lite/micro/kernels/arc_mli/conv_slicing_test.cc | 9 +++++++++ .../micro/kernels/arc_mli/depthwise_conv_slicing_test.cc | 9 +++++++++ .../kernels/arc_mli/fully_connected_slicing_test.cc | 9 +++++++++ .../lite/micro/kernels/arc_mli/pooling_slicing_test.cc | 8 ++++++++ .../lite/micro/tools/make/third_party_downloads.inc | 8 ++++---- 6 files changed, 40 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/README.md b/tensorflow/lite/micro/kernels/arc_mli/README.md index 2b2e194e757..33e46ca871d 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/README.md +++ b/tensorflow/lite/micro/kernels/arc_mli/README.md @@ -16,7 +16,7 @@ In case MLI implementation can’t be used, kernels in this folder fallback to T For ARC EM SDP board, a pre-compiled MLI library is downloaded and used in the application. For a custom target ARC-based platform, MLI sources are downloaded and compiled during project generation phase. To build library from sources for ARC EM SDP platform, add `BUILD_ARC_MLI=true` option to make command: - make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp BUILD_ARC_MLI=true generate_person_detection_int8_make_project. + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp BUILD_ARC_MLI=true generate_person_detection_int8_make_project If an application exclusively uses accelerated MLI kernel implementations, one can strip out TFLM reference kernel implementations to reduce code size of application. Build application with `MLI_ONLY=true` option in generated project (after the project was built): diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc index 27e30856f6c..9eb9d6499dd 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc @@ -13,6 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// This test checks that slicing logic doesn`t affect result of convolution +// kernel +// +// This test doesn`t replace default convolution test +// (tensorflow/lite/micro/kernels/conv_test.cc). It is added to the whole +// testset only in case MLI for ARC platform is used during generation (which is +// handled in arc_mli.inc). So such tests won`t be generated for other +// platforms. + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/micro/kernels/all_ops_resolver.h" diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc index fb9dd46c1e4..e6a87ff82e6 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc @@ -13,6 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// This test checks that slicing logic doesn`t affect result of depthwise +// convolution kernel +// +// This test doesn`t replace default depthwise convolution test +// (tensorflow/lite/micro/kernels/depthwise_conv_test.cc). It is added to the +// whole testset only in case MLI for ARC platform is used during generation +// (which is handled in arc_mli.inc). So such tests won`t be generated for other +// platforms. + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc index 78cb2873c54..0bd264a5f1b 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc @@ -13,6 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// This test checks that slicing logic doesn`t affect result of fully +// connected kernel +// +// This test doesn`t replace default fully connected test +// (tensorflow/lite/micro/kernels/fully_connected_test.cc). It is added to the +// whole testset only in case MLI for ARC platform is used during generation +// (which is handled in arc_mli.inc). So such tests won`t be generated for other +// platforms. + #include #include "tensorflow/lite/c/builtin_op_data.h" diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc index 63737a41791..381420f1f7d 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc @@ -13,6 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// This test checks that slicing logic doesn`t affect result of pooling kernels +// +// This test doesn`t replace default pooling test +// (tensorflow/lite/micro/kernels/pooling.cc). It is added to the +// whole testset only in case MLI for ARC platform is used during generation +// (which is handled in arc_mli.inc). So such tests won`t be generated for other +// platforms. + #include #include "tensorflow/lite/c/builtin_op_data.h" diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index d90f8548f31..91f3f1b5263 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -71,11 +71,11 @@ PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab" PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip" PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/7026ad09bb7f967324eb29e069f776bc44a08886.zip" -EMBARC_MLI_MD5 := "7eebd730db79c6834399f87e509115fb" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/58284867ca52d1f43b25045e8601999d7359d986.zip" +EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef" -EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC1/embARC_MLI_package.zip" -EMBARC_MLI_PRE_COMPILED_MD5 := "a66d6afff8daeb40bd3a99c42de048ab" +EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC2/embARC_MLI_package.zip" +EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From 0975574df38cecd6f5643d0c188342cef96b463e Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Mon, 11 May 2020 10:46:01 -0700 Subject: [PATCH 061/557] Minor changes --- tensorflow/core/kernels/conv_2d_gpu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h index 90d85e6f04e..297016160ad 100644 --- a/tensorflow/core/kernels/conv_2d_gpu.h +++ b/tensorflow/core/kernels/conv_2d_gpu.h @@ -210,7 +210,7 @@ __global__ void ShuffleInTensor3Simple(int nthreads, } } -constexpr int kUnroll = 4; +static constexpr int kUnroll = 4; template __global__ void ShuffleInTensor3SimpleVector(int nthreads, @@ -246,7 +246,7 @@ __global__ void ShuffleInTensor3SimpleVector(int nthreads, *out = *reinterpret_cast(buf); } - for(; output_index < nthreads; output_index++) { + for (; output_index < nthreads; ++output_index) { Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); Index<3> input_tensor_index; From 764e3a790eea85cbf8e275ef504c76335a3236f0 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 11 May 2020 17:44:32 +0000 Subject: [PATCH 062/557] Add uint32/uint64 support for tf.tile This PR tries to address the issue raised in 39405 where there is no uint32/uint64 support for tf.tile. The related kernel impl for uint32 and uint64 has been added in this PR. This PR fixes 39405 Signed-off-by: Yong Tang --- tensorflow/core/kernels/BUILD | 2 ++ .../core/kernels/tile_functor_cpu_uint32.cc | 29 +++++++++++++++++++ .../core/kernels/tile_functor_cpu_uint64.cc | 29 +++++++++++++++++++ tensorflow/core/kernels/tile_ops.cc | 6 ++++ 4 files changed, 66 insertions(+) create mode 100644 tensorflow/core/kernels/tile_functor_cpu_uint32.cc create mode 100644 tensorflow/core/kernels/tile_functor_cpu_uint64.cc diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 5f85fe99018..4a1b9318f29 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1337,6 +1337,8 @@ tf_kernel_library( "tile_functor_cpu_int32.cc", "tile_functor_cpu_int64.cc", "tile_functor_cpu_int8.cc", + "tile_functor_cpu_uint32.cc", + "tile_functor_cpu_uint64.cc", "tile_functor_cpu_tstring.cc", "tile_functor_cpu_uint8.cc", "tile_functor_sycl.cc", diff --git a/tensorflow/core/kernels/tile_functor_cpu_uint32.cc b/tensorflow/core/kernels/tile_functor_cpu_uint32.cc new file mode 100644 index 00000000000..4dd44eeea0f --- /dev/null +++ b/tensorflow/core/kernels/tile_functor_cpu_uint32.cc @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/tile_functor_cpu.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template struct Tile; +template struct Tile; + +} // end namespace functor +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/tile_functor_cpu_uint64.cc b/tensorflow/core/kernels/tile_functor_cpu_uint64.cc new file mode 100644 index 00000000000..ec1eb7b0946 --- /dev/null +++ b/tensorflow/core/kernels/tile_functor_cpu_uint64.cc @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/tile_functor_cpu.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template struct Tile; +template struct Tile; + +} // end namespace functor +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc index cd047ed9d4a..75c34fb1bf7 100644 --- a/tensorflow/core/kernels/tile_ops.cc +++ b/tensorflow/core/kernels/tile_ops.cc @@ -139,6 +139,8 @@ TF_CALL_uint8(DECLARE_TYPE); TF_CALL_int32(DECLARE_TYPE); TF_CALL_int16(DECLARE_TYPE); TF_CALL_int64(DECLARE_TYPE); +TF_CALL_uint32(DECLARE_TYPE); +TF_CALL_uint64(DECLARE_TYPE); TF_CALL_half(DECLARE_TYPE); TF_CALL_complex64(DECLARE_TYPE); TF_CALL_complex128(DECLARE_TYPE); @@ -240,6 +242,8 @@ class TileOp : public OpKernel { TF_CALL_int32(HANDLE_TYPE_NAME); TF_CALL_int16(HANDLE_TYPE_NAME); TF_CALL_int64(HANDLE_TYPE_NAME); + TF_CALL_uint32(HANDLE_TYPE_NAME); + TF_CALL_uint64(HANDLE_TYPE_NAME); TF_CALL_half(HANDLE_TYPE_NAME); TF_CALL_tstring(HANDLE_TYPE_NAME); // when DEVICE=CPUDevice. TF_CALL_complex64(HANDLE_TYPE_NAME); @@ -319,6 +323,8 @@ TF_CALL_int8(HANDLE_TYPE_NAME_CPU); TF_CALL_int32(HANDLE_TYPE_NAME_CPU); TF_CALL_int16(HANDLE_TYPE_NAME_CPU); TF_CALL_int64(HANDLE_TYPE_NAME_CPU); +TF_CALL_uint32(HANDLE_TYPE_NAME_CPU); +TF_CALL_uint64(HANDLE_TYPE_NAME_CPU); TF_CALL_half(HANDLE_TYPE_NAME_CPU); TF_CALL_complex64(HANDLE_TYPE_NAME_CPU); TF_CALL_complex128(HANDLE_TYPE_NAME_CPU); From c65b6f9356d9232f1edd5be4aafe5b8f377a6fd9 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 11 May 2020 16:48:53 +0000 Subject: [PATCH 063/557] Add test case for uint32/uint64 support of tf.tile Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/shape_ops_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py index 7dde89c9818..6c2f2e236f2 100644 --- a/tensorflow/python/kernel_tests/shape_ops_test.py +++ b/tensorflow/python/kernel_tests/shape_ops_test.py @@ -500,6 +500,8 @@ class TileTest(test.TestCase, parameterized.TestCase): "int16": (dtypes.int16, int), "int32": (dtypes.int32, int), "int64": (dtypes.int64, int), + "uint32": (dtypes.uint32, int), + "uint64": (dtypes.uint64, int), bytes: (dtypes.string, bytes) } for dtype_np, (dtype_tf, cast) in types_to_test.items(): From d000961fcd283638ff2fd9fadb0a3c9fcce5db07 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 11 May 2020 19:12:51 +0000 Subject: [PATCH 064/557] Bazel buildifier lint fix Signed-off-by: Yong Tang --- tensorflow/core/kernels/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 4a1b9318f29..daa6093a460 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1337,9 +1337,9 @@ tf_kernel_library( "tile_functor_cpu_int32.cc", "tile_functor_cpu_int64.cc", "tile_functor_cpu_int8.cc", + "tile_functor_cpu_tstring.cc", "tile_functor_cpu_uint32.cc", "tile_functor_cpu_uint64.cc", - "tile_functor_cpu_tstring.cc", "tile_functor_cpu_uint8.cc", "tile_functor_sycl.cc", ], From 0a980f296919766407af45b95c9e8aa290f72569 Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Tue, 5 May 2020 10:54:54 +0000 Subject: [PATCH 065/557] ROCm 3.5 (hip-clang) build fixes --- .../service/gpu/llvm_gpu_backend/gpu_backend_lib.cc | 2 +- .../stream_executor/rocm/rocm_gpu_executor.cc | 4 ++++ .../clang/bin/crosstool_wrapper_driver_rocm.tpl | 4 +++- third_party/gpus/cuda_configure.bzl | 13 +++++++++---- third_party/gpus/rocm_configure.bzl | 2 ++ 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 060a0375271..497dcda4361 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -689,7 +689,7 @@ std::unique_ptr AMDGPUGetTargetMachine( llvm::Triple target_triple, int amdgpu_version, const HloModuleConfig& hlo_module_config) { return GetTargetMachine(target_triple, absl::StrCat("gfx", amdgpu_version), - hlo_module_config, "-code-object-v3"); + hlo_module_config, "+code-object-v3"); } void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) { diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc index e22a243a70b..216602a7597 100644 --- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc +++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc @@ -132,6 +132,10 @@ bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) { VLOG(3) << "Unloading HSACO module " << module; GpuDriver::UnloadModule(context_, module); gpu_binary_to_module_.erase(module_it); + const char* mem_it = nullptr; + for (auto x : in_memory_modules_) + if (x.second == module) mem_it = x.first; + if (mem_it != nullptr) in_memory_modules_.erase(mem_it); } return true; } diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl index f5ac7b39dfd..89275128a9c 100755 --- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl +++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl @@ -179,7 +179,7 @@ def InvokeHipcc(argv, log=False): # Also we need to retain warning about uninitialised shared variable as # warning only, even when -Werror option is specified. if HIPCC_IS_HIPCLANG: - hipccopts += ' --include=hip/hip_runtime.h -Wno-error=cuda-shared-init ' + hipccopts += ' --include=hip/hip_runtime.h ' hipccopts += ' ' + hipcc_compiler_options # Use -fno-gpu-rdc by default for early GPU kernel finalization # This flag would trigger GPU kernels be generated at compile time, instead @@ -258,6 +258,8 @@ def main(): gpu_linker_flags.append('-L' + HIP_RUNTIME_PATH) gpu_linker_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH) gpu_linker_flags.append('-l' + HIP_RUNTIME_LIBRARY) + if HIPCC_IS_HIPCLANG: + gpu_linker_flags.append("-lrt") if VERBOSE: print(' '.join([CPU_COMPILER] + gpu_linker_flags)) return subprocess.call([CPU_COMPILER] + gpu_linker_flags) diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 545aeebe97a..ce924fe4cd2 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -808,23 +808,28 @@ def make_copy_files_rule(repository_ctx, name, srcs, outs): cmd = \"""%s \""", )""" % (name, "\n".join(outs), " && \\\n".join(cmds)) -def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): +def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir, exceptions=None): """Returns a rule to recursively copy a directory.""" src_dir = _norm_path(src_dir) out_dir = _norm_path(out_dir) outs = read_dir(repository_ctx, src_dir) + post_cmd='' + if exceptions!=None: + outs = [x for x in outs if not any([x.startswith(y) for y in exceptions])] outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] - # '@D' already contains the relative path for a single file, see # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" + if exceptions!=None: + for x in exceptions: + post_cmd+=" ; rm -fR " + x.replace(src_dir, out_dir) return """genrule( name = "%s", outs = [ %s ], - cmd = \"""cp -rLf "%s/." "%s/" \""", -)""" % (name, "\n".join(outs), src_dir, out_dir) + cmd = \"""cp -rLf "%s/." "%s/" %s\""", +)""" % (name, "\n".join(outs), src_dir, out_dir, post_cmd) def _flag_enabled(repository_ctx, flag_name): return get_host_environ(repository_ctx, flag_name) == "1" diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 3c345e6724b..3f518fb05f1 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -615,6 +615,8 @@ def _create_local_rocm_repository(repository_ctx): name = "rocm-include", src_dir = rocm_toolkit_path + "/include", out_dir = "rocm/include", + exceptions = [rocm_toolkit_path + "/include/gtest", + rocm_toolkit_path + "/include/gmock"], ), make_copy_dir_rule( repository_ctx, From cb92c9b87392a373f66d2b662ff6e50d4b57551c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 14 Apr 2020 20:50:12 +0000 Subject: [PATCH 066/557] Fix issue in tf.image.extract_glimpse This PR is to re-apply PR 12829. While 12829 was merged before, for some reason it was reverted at one point. The guess is that there are some internal testing that caused the revert. This PR will try to submit again, and fix any internal tests that fails. This fix tries to fix the issue raised in 2134 where `tf.image.extract_glimpse` does not work as expected when `centered=False` and `normalized=False` This fix fixes 2134. Signed-off-by: Yong Tang --- tensorflow/core/kernels/eigen_attention.h | 27 ++++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h index c5158e65d8a..7cf5c53dfca 100644 --- a/tensorflow/core/kernels/eigen_attention.h +++ b/tensorflow/core/kernels/eigen_attention.h @@ -101,21 +101,26 @@ struct GlimpseExtractionOp { for (Index i = 0; i < batch_size; ++i) { float x = offsets_[i].first, y = offsets_[i].second; - // Un-normalize coordinates back to pixel space if normalized. if (normalized_) { + // Un-normalize coordinates back to pixel space if normalized. x *= input_width; y *= input_height; + if (centered_) { + // Un-center if coordinates are centered on the image center. + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + // Remove half of the glimpse window. + x -= width_ / 2.0f; + y -= height_ / 2.0f; + } + } else { + if (centered_) { + x += input_width / 2.0f; + y += input_height / 2.0f; + } } - // Un-center if coordinates are centered on the image center. - if (centered_) { - x /= 2.0f; - y /= 2.0f; - x += input_width / 2.0f; - y += input_height / 2.0f; - } - // Remove half of the glimpse window. - x -= width_ / 2.0f; - y -= height_ / 2.0f; const Index offset_x = (Index)x; const Index offset_y = (Index)y; From 3e2bcc33e527a27edf7011bfd11aa395a68cb9e4 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 14 Apr 2020 20:53:51 +0000 Subject: [PATCH 067/557] Add test cases for tf.image.extract_glimpse Add test cases for tf.image.extract_glimpse with centered=False and normalized=False Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/attention_ops_test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py index 87e709fc69e..8799980668a 100644 --- a/tensorflow/python/kernel_tests/attention_ops_test.py +++ b/tensorflow/python/kernel_tests/attention_ops_test.py @@ -236,6 +236,18 @@ class ExtractGlimpseTest(test.TestCase): [0, 0, 0, 0, 0, 0, 0]]), self.evaluate(result2)[0, :, :, 0]) + def testGlimpseNonNormalizedNonCentered(self): + img = constant_op.constant(np.arange(25).reshape((1, 5, 5, 1)), + dtype=dtypes.float32) + with self.test_session(): + result1 = image_ops.extract_glimpse(img, [3, 3], [[0, 0]], + centered=False, normalized=False) + result2 = image_ops.extract_glimpse(img, [3, 3], [[1, 0]], + centered=False, normalized=False) + self.assertAllEqual(np.asarray([[0, 1, 2], [5, 6, 7], [10, 11, 12]]), + result1.eval()[0, :, :, 0]) + self.assertAllEqual(np.asarray([[5, 6, 7], [10, 11, 12], [15, 16, 17]]), + result2.eval()[0, :, :, 0]) if __name__ == '__main__': test.main() From 35efb74fb72efde43122dd41da3dfc93dbf5be18 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 14 Apr 2020 20:58:33 +0000 Subject: [PATCH 068/557] Fix test failure caused by API changes in tests Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/attention_ops_test.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py index 8799980668a..21db05fac2f 100644 --- a/tensorflow/python/kernel_tests/attention_ops_test.py +++ b/tensorflow/python/kernel_tests/attention_ops_test.py @@ -240,14 +240,14 @@ class ExtractGlimpseTest(test.TestCase): img = constant_op.constant(np.arange(25).reshape((1, 5, 5, 1)), dtype=dtypes.float32) with self.test_session(): - result1 = image_ops.extract_glimpse(img, [3, 3], [[0, 0]], - centered=False, normalized=False) - result2 = image_ops.extract_glimpse(img, [3, 3], [[1, 0]], - centered=False, normalized=False) + result1 = image_ops.extract_glimpse_v2(img, [3, 3], [[0, 0]], + centered=False, normalized=False) + result2 = image_ops.extract_glimpse_v2(img, [3, 3], [[1, 0]], + centered=False, normalized=False) self.assertAllEqual(np.asarray([[0, 1, 2], [5, 6, 7], [10, 11, 12]]), - result1.eval()[0, :, :, 0]) + self.evaluate(result1)[0, :, :, 0]) self.assertAllEqual(np.asarray([[5, 6, 7], [10, 11, 12], [15, 16, 17]]), - result2.eval()[0, :, :, 0]) + self.evaluate(result2)[0, :, :, 0]) if __name__ == '__main__': test.main() From 677f75990460f3b68a66651001e25c5bde4aa374 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 14 Apr 2020 21:47:37 +0000 Subject: [PATCH 069/557] Fix test failure due to changes of the fix for centered=False and normalized=False Signed-off-by: Yong Tang --- .../python/kernel_tests/attention_ops_test.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py index 21db05fac2f..feec82aa051 100644 --- a/tensorflow/python/kernel_tests/attention_ops_test.py +++ b/tensorflow/python/kernel_tests/attention_ops_test.py @@ -211,28 +211,33 @@ class ExtractGlimpseTest(test.TestCase): # [ 0. 0. 0.] # [ 0. 0. 0.] result1 = image_ops.extract_glimpse_v2( - img, [3, 3], [[-2, 2]], + img, [3, 3], [[-2, -2]], centered=False, normalized=False, noise='zero') self.assertAllEqual( - np.asarray([[0, 0, 0], [0, 0, 0], [0, 0, 0]]), + np.asarray([[0, 0, 0], + [0, 0, 0], + [0, 0, 0]]), self.evaluate(result1)[0, :, :, 0]) # Result 2: + # [ 12. 13. 14. 0. 0. 0. 0.] + # [ 17. 18. 19. 0. 0. 0. 0.] + # [ 22. 23. 24. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 0. 0. 0.] # [ 0. 0. 0. 0. 0. 0. 0.] - # [ 0. 0. 1. 2. 3. 4. 0.] - # [ 0. 5. 6. 7. 8. 9. 0.] - # [ 0. 10. 11. 12. 13. 14. 0.] - # [ 0. 15. 16. 17. 18. 19. 0.] - # [ 0. 20. 21. 22. 23. 24. 0.] # [ 0. 0. 0. 0. 0. 0. 0.] result2 = image_ops.extract_glimpse_v2( img, [7, 7], [[0, 0]], normalized=False, noise='zero') self.assertAllEqual( - np.asarray([[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 4, 0], - [0, 5, 6, 7, 8, 9, 0], [0, 10, 11, 12, 13, 14, 0], - [0, 15, 16, 17, 18, 19, 0], [0, 20, 21, 22, 23, 24, 0], + np.asarray([[12, 13, 14, 0, 0, 0, 0], + [17, 18, 19, 0, 0, 0, 0], + [22, 23, 24, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]]), self.evaluate(result2)[0, :, :, 0]) From 3fc74213ba34f5748be1c3ac3f9199b225d10b64 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 15 Apr 2020 15:25:42 +0000 Subject: [PATCH 070/557] Fix incorrect doc test Signed-off-by: Yong Tang --- tensorflow/python/ops/image_ops_impl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index e6a5cdbf4e8..c84c9e701c4 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4148,10 +4148,10 @@ def extract_glimpse_v2( >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], ... centered=False, normalized=False) + array([[[[4.], + [5.]], + [[7.], + [8.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape From 353d22eb433b1494b6bafbfde126bd999499a79e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 15 Apr 2020 15:26:42 +0000 Subject: [PATCH 071/557] Fix incorrect doc example with centered=False Signed-off-by: Yong Tang --- tensorflow/python/ops/image_ops_impl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index c84c9e701c4..bd0722f32f9 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4063,10 +4063,10 @@ def extract_glimpse( >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], ... centered=False, normalized=False) + array([[[[4.], + [5.]], + [[7.], + [8.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape From 756b7ed2d65843d52c8e02ca6350fd51fb638a55 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 27 Apr 2020 17:09:52 +0000 Subject: [PATCH 072/557] Use ExtractGlimpseV2 and ExtractGlimpse to make sure C++ kernel is backward compatible Signed-off-by: Yong Tang --- tensorflow/core/kernels/attention_ops.cc | 8 +++- tensorflow/core/kernels/eigen_attention.h | 50 ++++++++++++++++------- tensorflow/core/ops/image_ops.cc | 35 ++++++++++++++++ tensorflow/python/ops/image_ops_impl.py | 10 ++--- 4 files changed, 82 insertions(+), 21 deletions(-) diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc index f555c0fd679..6e5e07a9fb1 100644 --- a/tensorflow/core/kernels/attention_ops.cc +++ b/tensorflow/core/kernels/attention_ops.cc @@ -32,6 +32,8 @@ namespace tensorflow { class ExtractGlimpseOp : public OpKernel { public: explicit ExtractGlimpseOp(OpKernelConstruction* context) : OpKernel(context) { + const string& op = context->def().op(); + version_ = (op == "ExtractGlimpse") ? 1 : 2; OP_REQUIRES_OK(context, context->GetAttr("normalized", &normalized_)); OP_REQUIRES_OK(context, context->GetAttr("centered", ¢ered_)); bool uniform_noise = false; @@ -117,21 +119,23 @@ class ExtractGlimpseOp : public OpKernel { // calling TensorFlow operates with (y,x) as indices. offset_vec.push_back(Eigen::IndexPair(offset_x, offset_y)); } - output->tensor().swap_layout().device( context->eigen_cpu_device()) = Eigen::ExtractGlimpses(input.tensor().swap_layout(), output_width, output_height, offset_vec, - normalized_, centered_, noise_); + normalized_, centered_, noise_, version_); } private: bool normalized_; bool centered_; Eigen::ExtractGlimpsesNoiseMode noise_; + int32 version_; }; REGISTER_KERNEL_BUILDER(Name("ExtractGlimpse").Device(DEVICE_CPU), ExtractGlimpseOp); +REGISTER_KERNEL_BUILDER(Name("ExtractGlimpseV2").Device(DEVICE_CPU), + ExtractGlimpseOp); } // end namespace tensorflow diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h index 7cf5c53dfca..ca61e223c21 100644 --- a/tensorflow/core/kernels/eigen_attention.h +++ b/tensorflow/core/kernels/eigen_attention.h @@ -56,13 +56,15 @@ struct GlimpseExtractionOp { GlimpseExtractionOp(const Index width, const Index height, const std::vector >& offsets, const bool normalized, const bool centered, - const ExtractGlimpsesNoiseMode noise) + const ExtractGlimpsesNoiseMode noise, + const int version) : width_(width), height_(height), offsets_(offsets), normalized_(normalized), centered_(centered), - noise_(noise) {} + noise_(noise), + version_(version) {} template DSizes dimensions(const Input& input) const { @@ -101,24 +103,42 @@ struct GlimpseExtractionOp { for (Index i = 0; i < batch_size; ++i) { float x = offsets_[i].first, y = offsets_[i].second; - if (normalized_) { + if (version_ == 1) { // Un-normalize coordinates back to pixel space if normalized. - x *= input_width; - y *= input_height; + if (normalized_) { + x *= input_width; + y *= input_height; + } + // Un-center if coordinates are centered on the image center. if (centered_) { - // Un-center if coordinates are centered on the image center. x /= 2.0f; y /= 2.0f; x += input_width / 2.0f; y += input_height / 2.0f; - // Remove half of the glimpse window. - x -= width_ / 2.0f; - y -= height_ / 2.0f; } + // Remove half of the glimpse window. + x -= width_ / 2.0f; + y -= height_ / 2.0f; } else { - if (centered_) { - x += input_width / 2.0f; - y += input_height / 2.0f; + if (normalized_) { + // Un-normalize coordinates back to pixel space if normalized. + x *= input_width; + y *= input_height; + if (centered_) { + // Un-center if coordinates are centered on the image center. + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + // Remove half of the glimpse window. + x -= width_ / 2.0f; + y -= height_ / 2.0f; + } + } else { + if (centered_) { + x += input_width / 2.0f; + y += input_height / 2.0f; + } } } @@ -248,6 +268,7 @@ struct GlimpseExtractionOp { const bool normalized_; const bool centered_; const ExtractGlimpsesNoiseMode noise_; + const int version_; }; } // namespace @@ -260,7 +281,8 @@ ExtractGlimpses( const typename internal::traits::Index height, const std::vector >& offsets, const bool normalized = true, const bool centered = true, - const ExtractGlimpsesNoiseMode noise = ExtractGlimpsesNoiseMode::UNIFORM) { + const ExtractGlimpsesNoiseMode noise = ExtractGlimpsesNoiseMode::UNIFORM, + const int version = 2) { EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, @@ -268,7 +290,7 @@ ExtractGlimpses( typedef typename internal::traits::Index Index; const GlimpseExtractionOp op(width, height, offsets, normalized, - centered, noise); + centered, noise, version); return input.customOp(op); } diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index 418f1e20e37..e11f14b8538 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -756,6 +756,41 @@ REGISTER_OP("ExtractGlimpse") c->Dim(input, 3)); }); +REGISTER_OP("ExtractGlimpseV2") + .Input("input: float") + .Input("size: int32") + .Input("offsets: float") + .Output("glimpse: float") + .Attr("centered: bool = true") + .Attr("normalized: bool = true") + .Attr("uniform_noise: bool = true") + .Attr("noise: string = 'uniform'") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + ShapeHandle offsets; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &offsets)); + + DimensionHandle batch_dim; + TF_RETURN_IF_ERROR( + c->Merge(c->Dim(input, 0), c->Dim(offsets, 0), &batch_dim)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(offsets, 1), 2, &unused)); + + bool uniform_noise = false; + TF_RETURN_IF_ERROR(c->GetAttr("uniform_noise", &uniform_noise)); + string noise; + TF_RETURN_IF_ERROR(c->GetAttr("noise", &noise)); + if (uniform_noise && (!noise.empty() && noise != "uniform")) { + return errors::InvalidArgument( + "The uniform_noise and noise should not be specified at the same " + "time"); + } + + return SetOutputToSizedImage(c, batch_dim, 1 /* size_input_idx */, + c->Dim(input, 3)); + }); + // -------------------------------------------------------------------------- REGISTER_OP("CropAndResize") diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index bd0722f32f9..49f44872ebf 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4063,10 +4063,10 @@ def extract_glimpse( >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], ... centered=False, normalized=False) + array([[[[0.], + [1.]], + [[3.], + [4.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape @@ -4176,7 +4176,7 @@ def extract_glimpse_v2( Returns: A `Tensor` of type `float32`. """ - return gen_image_ops.extract_glimpse( + return gen_image_ops.extract_glimpse_v2( input=input, size=size, offsets=offsets, From 9b84edeb4f866f137073f04f1e10296d19ef9e76 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 27 Apr 2020 17:11:04 +0000 Subject: [PATCH 073/557] Expand test case to cover both old kernel (ExtractGlimpse) and new kernel (ExtractGlimpseV2) Signed-off-by: Yong Tang --- .../python/kernel_tests/attention_ops_test.py | 48 +++++++++++++++++++ tensorflow/python/ops/image_ops_impl.py | 10 ++-- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py index feec82aa051..80e2a816834 100644 --- a/tensorflow/python/kernel_tests/attention_ops_test.py +++ b/tensorflow/python/kernel_tests/attention_ops_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import image_ops from tensorflow.python.platform import test @@ -196,6 +197,53 @@ class ExtractGlimpseTest(test.TestCase): expected_rows=[None, None, None, 1, 2, 3, 4], expected_cols=[56, 57, 58, 59, 60]) + def testGlimpseNoiseZeroV1Compatible(self): + # Note: The old versions of extract_glimpse was incorrect in implementation. + # This test is for compatibility so that graph save in old versions behave + # the same. Notice the API uses gen_image_ops.extract_glimpse() on purpose. + # + # Image: + # [ 0. 1. 2. 3. 4.] + # [ 5. 6. 7. 8. 9.] + # [ 10. 11. 12. 13. 14.] + # [ 15. 16. 17. 18. 19.] + # [ 20. 21. 22. 23. 24.] + img = constant_op.constant( + np.arange(25).reshape((1, 5, 5, 1)), dtype=dtypes.float32) + with self.test_session(): + # Result 1: + # [ 0. 0. 0.] + # [ 0. 0. 0.] + # [ 0. 0. 0.] + result1 = gen_image_ops.extract_glimpse( + img, [3, 3], [[-2, 2]], + centered=False, + normalized=False, + noise='zero', + uniform_noise=False) + self.assertAllEqual( + np.asarray([[0, 0, 0], [0, 0, 0], [0, 0, 0]]), + self.evaluate(result1)[0, :, :, 0]) + + # Result 2: + # [ 0. 0. 0. 0. 0. 0. 0.] + # [ 0. 0. 1. 2. 3. 4. 0.] + # [ 0. 5. 6. 7. 8. 9. 0.] + # [ 0. 10. 11. 12. 13. 14. 0.] + # [ 0. 15. 16. 17. 18. 19. 0.] + # [ 0. 20. 21. 22. 23. 24. 0.] + # [ 0. 0. 0. 0. 0. 0. 0.] + result2 = gen_image_ops.extract_glimpse( + img, [7, 7], [[0, 0]], normalized=False, noise='zero', + uniform_noise=False) + self.assertAllEqual( + np.asarray([[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 4, 0], + [0, 5, 6, 7, 8, 9, 0], [0, 10, 11, 12, 13, 14, 0], + [0, 15, 16, 17, 18, 19, 0], [0, 20, 21, 22, 23, 24, 0], + [0, 0, 0, 0, 0, 0, 0]]), + self.evaluate(result2)[0, :, :, 0]) + + def testGlimpseNoiseZero(self): # Image: # [ 0. 1. 2. 3. 4.] diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 49f44872ebf..e86dee798a8 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4063,10 +4063,10 @@ def extract_glimpse( >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], ... centered=False, normalized=False) + array([[[[4.], + [5.]], + [[7.], + [8.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape @@ -4091,7 +4091,7 @@ def extract_glimpse( Returns: A `Tensor` of type `float32`. """ - return gen_image_ops.extract_glimpse( + return gen_image_ops.extract_glimpse_v2( input=input, size=size, offsets=offsets, From 8c80414bacb3aaf5327b60d8538274e3d8cc7a7c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 27 Apr 2020 17:43:44 +0000 Subject: [PATCH 074/557] Add api_def_ExtractGlimpseV2.pbtxt Signed-off-by: Yong Tang --- .../base_api/api_def_ExtractGlimpseV2.pbtxt | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt new file mode 100644 index 00000000000..160b864a007 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt @@ -0,0 +1,85 @@ +op { + graph_op_name: "ExtractGlimpseV2" + in_arg { + name: "input" + description: < Date: Mon, 27 Apr 2020 23:08:17 +0000 Subject: [PATCH 075/557] Update API golden Signed-off-by: Yong Tang --- .../core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt | 1 + tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt | 4 ++++ tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt index 160b864a007..aeb87346ab2 100644 --- a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt @@ -1,5 +1,6 @@ op { graph_op_name: "ExtractGlimpseV2" + visibility: HIDDEN in_arg { name: "input" description: < Date: Tue, 28 Apr 2020 19:22:49 +0000 Subject: [PATCH 076/557] Reroute tf.compat.v1.extract_glimpse to use gen_image_ops.extract_glimpse (old API) This fix reroute tf.compat.v1.extract_glimpse to use gen_image_ops.extract_glimpse, so that the behavior of TF 1.x remains the same. Signed-off-by: Yong Tang --- tensorflow/python/ops/image_ops_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index e86dee798a8..a86d3af2492 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4091,7 +4091,7 @@ def extract_glimpse( Returns: A `Tensor` of type `float32`. """ - return gen_image_ops.extract_glimpse_v2( + return gen_image_ops.extract_glimpse( input=input, size=size, offsets=offsets, From 960bbc2d1bb95efd65177fdbdd70a63781eecfab Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 29 Apr 2020 20:13:46 +0000 Subject: [PATCH 077/557] Update RELEADE.md to capture the breaking change of `tf.image.extract_glimpse` Signed-off-by: Yong Tang --- RELEASE.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 6c8921cf492..673d854d1b9 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,8 @@ +# Release 2.3.0 + +## Breaking Changes +* `tf.image.extract_glimpse` has been updated to correctly process the case where `centered=False` and `normalized=False`. This is a breaking change as the output is different from (incorrect) previous versions. Note this breaking change only impacts `tf.image.extract_glimpse` and `tf.compat.v2.image.extract_glimpse` API endpoints. The behavior of `tf.compat.v1.image.extract_glimpse` does not change. The behavior of exsiting C++ kernel `ExtractGlimpse` does not change as well, so saved models will not be impacted. + # Release 2.2.0 TensorFlow 2.2 discontinues support for Python 2, [previously announced](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ) as following [Python 2's EOL on January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). From c00af599966359e4e0090cfd5191441354052068 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 6 May 2020 22:19:40 +0000 Subject: [PATCH 078/557] Update doc example of v1 to keep old behavior with usage of tf.compat.v1.image.extract_glimpse Signed-off-by: Yong Tang --- tensorflow/python/ops/image_ops_impl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index a86d3af2492..633725da511 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4060,13 +4060,13 @@ def extract_glimpse( ... [[6.0], ... [7.0], ... [8.0]]]] - >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], - ... centered=False, normalized=False) + >>> tf.compat.v1.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], + ... centered=False, normalized=False) + array([[[[0.], + [1.]], + [[3.], + [4.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape From 6e2654d882563116c2965215818b59c3abc8cc23 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Tue, 12 May 2020 21:35:27 +0300 Subject: [PATCH 079/557] Removed named section pragmas from shared example code --- .../examples/person_detection_experimental/main_functions.cc | 2 -- .../person_detection_experimental/person_detection_test.cc | 2 -- 2 files changed, 4 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc index 552b52c9c51..719f16b2d36 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc @@ -42,9 +42,7 @@ TfLiteTensor* input = nullptr; // An area of memory to use for input, output, and intermediate arrays. constexpr int kTensorArenaSize = 125 * 1024; -#pragma Bss(".tensor_arena") static uint8_t tensor_arena[kTensorArenaSize]; -#pragma Bss() } // namespace // The name of this function is important for Arduino compatibility. diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc index 9c7212648cc..b0979735d4f 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc @@ -28,9 +28,7 @@ limitations under the License. // Create an area of memory to use for input, output, and intermediate arrays. constexpr int tensor_arena_size = 125 * 1024; -#pragma Bss(".tensor_arena") uint8_t tensor_arena[tensor_arena_size]; -#pragma Bss() TF_LITE_MICRO_TESTS_BEGIN From 417b97cd7468830f881a7867192355bd42f8c99d Mon Sep 17 00:00:00 2001 From: Ajay P Date: Tue, 12 May 2020 19:50:13 +0000 Subject: [PATCH 080/557] Modified recompute_grad to handle fwd mode diff --- tensorflow/python/eager/forwardprop_test.py | 24 ++++++--- .../python/keras/integration_test/BUILD | 3 +- .../gradient_checkpoint_test.py | 3 +- tensorflow/python/ops/custom_gradient.py | 49 ++++++++++++------- 4 files changed, 53 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index aad179ffb6b..611e9ce2b2a 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -177,7 +177,8 @@ def _test_gradients(testcase, order, delta=1e-3, rtol=1e-2, - atol=1e-6): + atol=1e-6, + recompute=False): """Tests forward/backward jacobians of `f`'s [0, `order`)-order gradients.""" if order < 1: raise ValueError( @@ -190,14 +191,20 @@ def _test_gradients(testcase, order=order - 1, delta=delta, rtol=rtol, - atol=atol) + atol=atol, + recompute=recompute) sym_jac_back, num_jac = gradient_checker_v2.compute_gradient( f, primals, delta=delta) testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) - sym_jac_fwd = _jacfwd(f, primals) - testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) - # And the symbolic computations should be much closer. - testcase.assertAllClose(sym_jac_back, sym_jac_fwd) + if not recompute: + sym_jac_fwd = _jacfwd(f, primals) + testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) + # And the symbolic computations should be much closer. + testcase.assertAllClose(sym_jac_back, sym_jac_fwd) + else: + with testcase.assertRaisesRegexp(ValueError, + "recompute_grad tried to transpose"): + sym_jac_fwd = _jacfwd(f, primals) class ForwardpropTest(test.TestCase, parameterized.TestCase): @@ -357,7 +364,10 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): def f(x): return math_ops.reduce_prod(math_ops.tanh(x)**2) - _test_gradients(self, f, [constant_op.constant([1.])], order=3) + _test_gradients(self, + f, [constant_op.constant([1.])], + order=3, + recompute=True) def testExceptionInCustomGradientNotSwallowed(self): diff --git a/tensorflow/python/keras/integration_test/BUILD b/tensorflow/python/keras/integration_test/BUILD index f92f9d14685..b7d9957a12e 100644 --- a/tensorflow/python/keras/integration_test/BUILD +++ b/tensorflow/python/keras/integration_test/BUILD @@ -2,6 +2,7 @@ # Contains Keras integration tests that verify with other TF high level APIs. load("//tensorflow:tensorflow.bzl", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "cuda_py_test") package( default_visibility = [ @@ -71,7 +72,7 @@ tf_py_test( ], ) -tf_py_test( +cuda_py_test( name = "gradient_checkpoint_test", srcs = ["gradient_checkpoint_test.py"], python_version = "PY3", diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py index df23c3abff5..92c53b3ab70 100644 --- a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py +++ b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py @@ -127,7 +127,8 @@ def _train_with_recompute(n_steps): model2_re = tf.recompute_grad(model2) model3_re = tf.recompute_grad(model3) optimizer = optimizers.SGD() - tr_vars = model1.trainable_variables + model2.trainable_variables + model3.trainable_variables + tr_vars = (model1.trainable_variables + model2.trainable_variables + + model3.trainable_variables) losses = [] for _ in range(n_steps): with tf.GradientTape() as tape: diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index a5013062936..e32c0820e93 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -482,27 +482,42 @@ def recompute_grad(f): def inner(*args, **kwargs): """Inner function closure for calculating gradients.""" current_var_scope = variable_scope.get_variable_scope() - with tape_lib.stop_recording(): result = f(*args, **kwargs) - + @custom_gradient def grad(*dresult, **grad_kwargs): - """Gradient function calculation for inner function.""" - variables = grad_kwargs.get("variables") - with backprop.GradientTape() as t: - id_args = [gen_array_ops.identity(x) for x in args] - t.watch(id_args) + """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" + + def grad_eval(): + """Gradient function calculation for reverse mode autodiff.""" + variables = grad_kwargs.get("variables") + with backprop.GradientTape() as t: + id_args = [gen_array_ops.identity(x) for x in args] + t.watch(id_args) + if variables is not None: + t.watch(variables) + with ops.control_dependencies(dresult): + with variable_scope.variable_scope(current_var_scope): + result = f(*id_args, **kwargs) + kw_vars = [] if variables is not None: - t.watch(variables) - with ops.control_dependencies(dresult): - with variable_scope.variable_scope(current_var_scope): - result = f(*id_args, **kwargs) - kw_vars = [] - if variables is not None: - kw_vars = list(variables) - grads = t.gradient( - result, list(id_args) + kw_vars, output_gradients=dresult) - return grads[:len(id_args)], grads[len(id_args):] + kw_vars = list(variables) + grads = t.gradient(result, + list(id_args) + kw_vars, + output_gradients=dresult) + if len(grads) == 1 and None in grads: + return 0 + return grads[:len(id_args)], grads[len(id_args):] + + def transpose(*t_args, **t_kwargs): + """Gradient function calculation for forward mode autodiff.""" + # Just throw an error since gradients / activations are not stored on tape for recompute. + raise ValueError( + "recompute_grad tried to transpose {}." + "Consider not using recompute_grad in forward mode autodiff".format( + f.__name__)) + + return grad_eval(), transpose return result, grad From 38e503d845d0c45c42b4b19f76548b140a608a7f Mon Sep 17 00:00:00 2001 From: Ajay P Date: Tue, 12 May 2020 21:47:01 +0000 Subject: [PATCH 081/557] Addressed PR comments --- tensorflow/python/eager/forwardprop_test.py | 27 +++++++-------------- tensorflow/python/ops/custom_gradient.py | 4 +-- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index 611e9ce2b2a..c32de30a2b3 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -177,8 +177,7 @@ def _test_gradients(testcase, order, delta=1e-3, rtol=1e-2, - atol=1e-6, - recompute=False): + atol=1e-6): """Tests forward/backward jacobians of `f`'s [0, `order`)-order gradients.""" if order < 1: raise ValueError( @@ -191,21 +190,14 @@ def _test_gradients(testcase, order=order - 1, delta=delta, rtol=rtol, - atol=atol, - recompute=recompute) + atol=atol) sym_jac_back, num_jac = gradient_checker_v2.compute_gradient( f, primals, delta=delta) testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) - if not recompute: - sym_jac_fwd = _jacfwd(f, primals) - testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) - # And the symbolic computations should be much closer. - testcase.assertAllClose(sym_jac_back, sym_jac_fwd) - else: - with testcase.assertRaisesRegexp(ValueError, - "recompute_grad tried to transpose"): - sym_jac_fwd = _jacfwd(f, primals) - + sym_jac_fwd = _jacfwd(f, primals) + testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) + # And the symbolic computations should be much closer. + testcase.assertAllClose(sym_jac_back, sym_jac_fwd) class ForwardpropTest(test.TestCase, parameterized.TestCase): @@ -364,10 +356,9 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): def f(x): return math_ops.reduce_prod(math_ops.tanh(x)**2) - _test_gradients(self, - f, [constant_op.constant([1.])], - order=3, - recompute=True) + with self.assertRaisesRegexp(NotImplementedError, + "recompute_grad tried to transpose"): + _test_gradients(self, f, [constant_op.constant([1.])], order=3) def testExceptionInCustomGradientNotSwallowed(self): diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index e32c0820e93..d0f06718911 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -512,8 +512,8 @@ def recompute_grad(f): def transpose(*t_args, **t_kwargs): """Gradient function calculation for forward mode autodiff.""" # Just throw an error since gradients / activations are not stored on tape for recompute. - raise ValueError( - "recompute_grad tried to transpose {}." + raise NotImplementedError( + "recompute_grad tried to transpose grad of {}. " "Consider not using recompute_grad in forward mode autodiff".format( f.__name__)) From e4c22494e716b34f148f8154ad23f77b7d68ac9c Mon Sep 17 00:00:00 2001 From: Ajay P Date: Tue, 12 May 2020 22:33:07 +0000 Subject: [PATCH 082/557] Addressed PR comments --- tensorflow/python/eager/forwardprop_test.py | 2 +- tensorflow/python/ops/custom_gradient.py | 40 ++++++++++----------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index c32de30a2b3..0c9ffaa0816 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -349,7 +349,7 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): _test_gradients(self, f, [constant_op.constant([1., 2.])], order=3) - @test_util.assert_no_new_pyobjects_executing_eagerly + # TODO(allenl): investigate why assert_no_new_pyobjects_executing_eagerly fails around this test? def testCustomGradientRecomputeGrad(self): @custom_gradient.recompute_grad diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index d0f06718911..6489aff117f 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -484,30 +484,26 @@ def recompute_grad(f): current_var_scope = variable_scope.get_variable_scope() with tape_lib.stop_recording(): result = f(*args, **kwargs) + @custom_gradient def grad(*dresult, **grad_kwargs): """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" - - def grad_eval(): - """Gradient function calculation for reverse mode autodiff.""" - variables = grad_kwargs.get("variables") - with backprop.GradientTape() as t: - id_args = [gen_array_ops.identity(x) for x in args] - t.watch(id_args) - if variables is not None: - t.watch(variables) - with ops.control_dependencies(dresult): - with variable_scope.variable_scope(current_var_scope): - result = f(*id_args, **kwargs) - kw_vars = [] + # Gradient calculation for reverse mode autodiff. + variables = grad_kwargs.get("variables") + with backprop.GradientTape() as t: + id_args = [gen_array_ops.identity(x) for x in args] + t.watch(id_args) if variables is not None: - kw_vars = list(variables) - grads = t.gradient(result, - list(id_args) + kw_vars, - output_gradients=dresult) - if len(grads) == 1 and None in grads: - return 0 - return grads[:len(id_args)], grads[len(id_args):] + t.watch(variables) + with ops.control_dependencies(dresult): + with variable_scope.variable_scope(current_var_scope): + result = f(*id_args, **kwargs) + kw_vars = [] + if variables is not None: + kw_vars = list(variables) + grads = t.gradient(result, + list(id_args) + kw_vars, + output_gradients=dresult) def transpose(*t_args, **t_kwargs): """Gradient function calculation for forward mode autodiff.""" @@ -517,7 +513,9 @@ def recompute_grad(f): "Consider not using recompute_grad in forward mode autodiff".format( f.__name__)) - return grad_eval(), transpose + if len(grads) == 1 and None in grads: + return 0, transpose + return (grads[:len(id_args)], grads[len(id_args):]), transpose return result, grad From b79631972128ab60c1f646dca68867459f5cb102 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 07:06:27 +0800 Subject: [PATCH 083/557] list command line flags in readme And update some `usage()` descriptions --- .../lite/examples/label_image/README.md | 27 ++++++++++++++++++- .../lite/examples/label_image/label_image.cc | 6 ++--- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/examples/label_image/README.md b/tensorflow/lite/examples/label_image/README.md index 9d37c153361..9ca8fd05e09 100644 --- a/tensorflow/lite/examples/label_image/README.md +++ b/tensorflow/lite/examples/label_image/README.md @@ -138,7 +138,7 @@ average time:10.348 ms To run a model with the Hexagon Delegate, assuming we have followed the [Hexagon Delegate Guide](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/hexagon_delegate.md) -and installed Hexagon libraries in `/data/local/tmp`. Run it +and installed Hexagon libraries in `/data/local/tmp`. Run it wth (`-j 1`) ``` adb shell \ "/data/local/tmp/label_image \ @@ -186,4 +186,29 @@ average time: 17.33 ms 0.00414093: 514 cornet ``` +With `-h` or any other unsupported flags, `label_image` will list +supported options +``` +sargo:/data/local/tmp $ ./label_image -h +./label_image: invalid option -- h +label_image +--accelerated, -a: [0|1], use Android NNAPI or not +--old_accelerated, -d: [0|1], use old Android NNAPI delegate or not +--allow_fp16, -f: [0|1], allow running fp32 models with fp16 or not +--count, -c: loop interpreter->Invoke() for certain times +--gl_backend, -g: [0|1]: use GL GPU Delegate on Android +--hexagon_delegate, -j: [0|1]: use Hexagon Delegate on Android +--input_mean, -b: input mean +--input_std, -s: input standard deviation +--image, -i: image_name.bmp +--labels, -l: labels for the model +--tflite_model, -m: model_name.tflite +--profiling, -p: [0|1], profiling or not +--num_results, -r: number of results to show +--threads, -t: number of threads +--verbose, -v: [0|1] print more information +--warmup_runs, -w: number of warmup runs +--xnnpack_delegate, -x [0:1]: xnnpack delegate +``` + See the `label_image.cc` source code for other command line options. diff --git a/tensorflow/lite/examples/label_image/label_image.cc b/tensorflow/lite/examples/label_image/label_image.cc index ec744d70381..364ac325967 100644 --- a/tensorflow/lite/examples/label_image/label_image.cc +++ b/tensorflow/lite/examples/label_image/label_image.cc @@ -362,8 +362,8 @@ void display_usage() { << "--old_accelerated, -d: [0|1], use old Android NNAPI delegate or not\n" << "--allow_fp16, -f: [0|1], allow running fp32 models with fp16 or not\n" << "--count, -c: loop interpreter->Invoke() for certain times\n" - << "--gl_backend, -g: use GL GPU Delegate on Android\n" - << "--hexagon_delegate: use Hexagon Delegate on Android\n" + << "--gl_backend, -g: [0|1]: use GL GPU Delegate on Android\n" + << "--hexagon_delegate, -j: [0|1]: use Hexagon Delegate on Android\n" << "--input_mean, -b: input mean\n" << "--input_std, -s: input standard deviation\n" << "--image, -i: image_name.bmp\n" @@ -374,7 +374,7 @@ void display_usage() { << "--threads, -t: number of threads\n" << "--verbose, -v: [0|1] print more information\n" << "--warmup_runs, -w: number of warmup runs\n" - << "--xnnpack_delegate, -x: xnnpack delegate\n" + << "--xnnpack_delegate, -x [0:1]: xnnpack delegate\n" << "\n"; } From 8e073e237ed258dac220d3cc1a177a08e43f2c0d Mon Sep 17 00:00:00 2001 From: "Felix E. Klee" Date: Wed, 13 May 2020 17:47:52 +0800 Subject: [PATCH 084/557] Fix typo preventing compilation `idf.py build` returned: ../main/esp/app_camera_esp.h:46:27: error: 'FRAMESIZE_96x96' undeclared (first use in this function); did you mean 'FRAMESIZE_96X96'? --- .../lite/micro/examples/person_detection/esp/app_camera_esp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection/esp/app_camera_esp.h b/tensorflow/lite/micro/examples/person_detection/esp/app_camera_esp.h index 403fb4defb1..e8cbe2177a9 100644 --- a/tensorflow/lite/micro/examples/person_detection/esp/app_camera_esp.h +++ b/tensorflow/lite/micro/examples/person_detection/esp/app_camera_esp.h @@ -30,7 +30,7 @@ limitations under the License. #define CAMERA_PIXEL_FORMAT PIXFORMAT_GRAYSCALE /* - * FRAMESIZE_96x96, // 96x96 + * FRAMESIZE_96X96, // 96x96 * FRAMESIZE_QQVGA, // 160x120 * FRAMESIZE_QQVGA2, // 128x160 * FRAMESIZE_QCIF, // 176x144 @@ -43,7 +43,7 @@ limitations under the License. * FRAMESIZE_SXGA, // 1280x1024 * FRAMESIZE_UXGA, // 1600x1200 */ -#define CAMERA_FRAME_SIZE FRAMESIZE_96x96 +#define CAMERA_FRAME_SIZE FRAMESIZE_96X96 #if CONFIG_CAMERA_MODEL_WROVER_KIT #define PWDN_GPIO_NUM -1 From 3d557534a3d5792f03c3607b14b0b0bfb51bdc1f Mon Sep 17 00:00:00 2001 From: Ajay P Date: Thu, 14 May 2020 00:01:41 +0000 Subject: [PATCH 085/557] Reorganized tests for recompute grad --- tensorflow/python/eager/forwardprop_test.py | 5 ++- .../gradient_checkpoint_test.py | 10 +++-- tensorflow/python/ops/custom_gradient.py | 6 +-- tensorflow/python/ops/gradients_test.py | 41 ++++++++++++++++++- 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index 0c9ffaa0816..d1a30b352d3 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -350,7 +350,7 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): _test_gradients(self, f, [constant_op.constant([1., 2.])], order=3) # TODO(allenl): investigate why assert_no_new_pyobjects_executing_eagerly fails around this test? - def testCustomGradientRecomputeGrad(self): + def testExceptionCustomGradientRecomputeGradForward(self): @custom_gradient.recompute_grad def f(x): @@ -358,7 +358,8 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): with self.assertRaisesRegexp(NotImplementedError, "recompute_grad tried to transpose"): - _test_gradients(self, f, [constant_op.constant([1.])], order=3) + primals = [constant_op.constant([1.])] + sym_jac_fwd = _jacfwd(f, primals) def testExceptionInCustomGradientNotSwallowed(self): diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py index 92c53b3ab70..18e88179e9b 100644 --- a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py +++ b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py @@ -19,7 +19,6 @@ from __future__ import print_function import tensorflow as tf from tensorflow.keras import layers, optimizers - def _get_big_cnn_model(img_dim, n_channels, num_partitions, blocks_per_partition): """Creates a test model whose activations are significantly larger than model size.""" @@ -67,7 +66,6 @@ def _compute_loss(logits, labels): tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)) - def _limit_gpu_memory(): """Helper function to limit GPU memory for testing """ gpus = tf.config.experimental.list_physical_devices('GPU') @@ -80,6 +78,8 @@ def _limit_gpu_memory(): ]) except RuntimeError as e: print(e) + return True + return False def _get_dummy_data(img_dim, n_channels, batch_size): @@ -90,7 +90,6 @@ def _get_dummy_data(img_dim, n_channels, batch_size): def _train_no_recompute(n_steps): """Trains a single large model without gradient checkpointing.""" - _limit_gpu_memory() img_dim, n_channels, batch_size = 256, 1, 4 x, y = _get_dummy_data(img_dim, n_channels, batch_size) model = _get_big_cnn_model(img_dim, @@ -113,7 +112,6 @@ def _train_no_recompute(n_steps): def _train_with_recompute(n_steps): """Trains a single large model with gradient checkpointing using tf.recompute_grad.""" - _limit_gpu_memory() img_dim, n_channels, batch_size = 256, 1, 4 x, y = _get_dummy_data(img_dim, n_channels, batch_size) # This model is the same model as _get_big_cnn_model but split into 3 parts. @@ -146,12 +144,16 @@ def _train_with_recompute(n_steps): class GradientCheckpointTest(tf.test.TestCase): def test_raises_oom_exception(self): + if not _limit_gpu_memory(): + self.skipTest("No virtual GPUs found") with self.assertRaises(Exception) as context: _train_no_recompute(1) self.assertTrue( context.exception.__class__.__name__ == 'ResourceExhaustedError') def test_does_not_raise_oom_exception(self): + if not _limit_gpu_memory(): + self.skipTest("No virtual GPUs found") n_step = 2 losses = _train_with_recompute(n_step) self.assertTrue(len(losses) == n_step) diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index 6489aff117f..d57be41c3de 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -33,6 +33,7 @@ from tensorflow.python.util import nest from tensorflow.python.util import tf_decorator from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import tf_export +from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients VAR_OP_TYPES = [ @@ -503,7 +504,8 @@ def recompute_grad(f): kw_vars = list(variables) grads = t.gradient(result, list(id_args) + kw_vars, - output_gradients=dresult) + output_gradients=dresult, + unconnected_gradients=UnconnectedGradients.ZERO) def transpose(*t_args, **t_kwargs): """Gradient function calculation for forward mode autodiff.""" @@ -513,8 +515,6 @@ def recompute_grad(f): "Consider not using recompute_grad in forward mode autodiff".format( f.__name__)) - if len(grads) == 1 and None in grads: - return 0, transpose return (grads[:len(id_args)], grads[len(id_args):]), transpose return result, grad diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index 817d8a1adbe..9b536136cb5 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -59,7 +59,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.ops.nn_ops import bias_add from tensorflow.python.platform import googletest - +from tensorflow.python.ops import gradient_checker_v2 class GradientsTest(test_util.TensorFlowTestCase, parameterized.TestCase): @@ -1340,6 +1340,45 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): return grads_re, grads + def _grad(self, f, argnums=0): + """Return a function which computes the gradient of `f`.""" + + def _f(*params): + with backprop.GradientTape() as tape: + tape.watch(params) + outputs = f(*params) + return tape.gradient( + outputs, + params[argnums], + unconnected_gradients=unconnected_gradients.UnconnectedGradients.ZERO) + + return _f + + def _test_gradients(self, f, inputs, order, delta=1e-3, rtol=1e-2, atol=1e-6): + """Tests backward jacobians of `f`'s [0, `order`)-order gradients.""" + if order < 1: + raise ValueError( + "`order` should be a positive integer, got '{}'.".format(order)) + if order > 1: + self._test_gradients(f=self._grad(f), + inputs=inputs, + order=order - 1, + delta=delta, + rtol=rtol, + atol=atol) + sym_jac_back, num_jac = gradient_checker_v2.compute_gradient(f, + inputs, + delta=delta) + testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) + + @test_util.run_in_graph_and_eager_modes + def testCustomGradientRecomputeGradHigherOrder(self): + + @custom_gradient.recompute_grad + def f(x): + return math_ops.reduce_prod(math_ops.tanh(x)**2) + self._test_gradients(f, [constant_op.constant([1.])], order=3) + @test_util.run_in_graph_and_eager_modes def testFnRecompute(self): """Checks that recompute_grad works grads of function args.""" From 59a473982d771a50d9c97298a69c06e6a90395b1 Mon Sep 17 00:00:00 2001 From: Teng Lu Date: Thu, 14 May 2020 11:40:43 +0800 Subject: [PATCH 086/557] Support BF16 Softmax and add UT. --- tensorflow/core/kernels/mkl_tmp_bf16_ops.cc | 4 +++- tensorflow/core/ops/nn_grad.cc | 2 +- tensorflow/python/ops/math_ops_test.py | 2 +- tensorflow/python/ops/nn_grad_test.py | 18 ++++++++++++++++++ tensorflow/python/ops/nn_test.py | 15 +++++++++++++++ 5 files changed, 38 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc b/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc index 7f45979a57e..e8d53a1fadf 100644 --- a/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc +++ b/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc @@ -56,7 +56,9 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER( \ Name("_FusedMatMul").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); \ REGISTER_KERNEL_BUILDER( \ - Name("BatchMatMulV2").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); + Name("BatchMatMulV2").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("Softmax").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); TF_CALL_bfloat16(REGISTER_CPU); #undef REGISTER_CPU diff --git a/tensorflow/core/ops/nn_grad.cc b/tensorflow/core/ops/nn_grad.cc index c39f3adfa97..ae75e6b95b2 100644 --- a/tensorflow/core/ops/nn_grad.cc +++ b/tensorflow/core/ops/nn_grad.cc @@ -31,7 +31,7 @@ Status SoftmaxGrad(const AttrSlice& attrs, FunctionDef* g) { // Ret val defs {"grad_x: T"}, // Attr defs -#if defined(INTEL_MKL) && defined(ENABLE_INTEL_MKL_BFLOAT16) +#if defined(INTEL_MKL) {{"T: {float, double, bfloat16}"}}, #else {{"T: {float, double}"}}, diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index ab554388cdc..1362a23e104 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -45,7 +45,7 @@ class ReduceTest(test_util.TensorFlowTestCase): self.assertEqual(y_tf, 21) def testReduceExtendType(self): - in_f32 = np.random.rand(1024, 1024).astype(np.float) + in_f32 = np.random.randn(1000, 1000).astype(np.float32) in_bf16 = math_ops.cast(in_f32, dtypes.bfloat16) out_f32 = self.evaluate(math_ops.reduce_sum(in_f32)) diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py index 9da56cb7200..1334b733854 100644 --- a/tensorflow/python/ops/nn_grad_test.py +++ b/tensorflow/python/ops/nn_grad_test.py @@ -33,6 +33,24 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test +class SoftmaxOpTest(test.TestCase): + + @test_util.run_deprecated_v1 + def testSoftmaxGradGradExtendType(self): + if test_util.IsMklEnabled(): + inputs = constant_op.constant( + [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.bfloat16) + r = nn_ops.softmax(inputs) + r_g = gradients_impl.gradients(r, inputs)[0] + with self.cached_session(): + error = gradient_checker.compute_gradient_error( + inputs, + inputs.get_shape(), + r_g, + r_g.get_shape()) + self.assertLess(error, 1e-4) + + class Relu6OpTest(test.TestCase): @test_util.run_deprecated_v1 diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 860bdc60387..ec60e13411d 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -130,6 +130,21 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase): self.assertAllClose(x_neg_axis_tf, y_pos_axis_tf, eps) self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps) + @test_util.run_deprecated_v1 + def testSoftmaxExtendType(self): + if test_util.IsMklEnabled(): + x_shape = [5, 10] + x_np = np.random.randn(*x_shape).astype(np.float32) + + x_f32_tf = constant_op.constant(x_np) + x_bf16_tf = math_ops.cast(x_f32_tf, dtypes.bfloat16) + y_f32_tf = self.evaluate(nn_ops.softmax(x_f32_tf)) + y_bf16_tf = self.evaluate(nn_ops.softmax(x_bf16_tf)) + expected = math_ops.cast(y_f32_tf, dtypes.bfloat16) + # BF16 type has less precision + eps = 1e-2 + self.assertAllClose(y_bf16_tf, expected, eps) + @parameterized.parameters(((5, 10),), ((2, 3, 4),)) @test_util.run_deprecated_v1 def testGradient(self, x_shape): From 5d92849778771a475fe339d2954db12c3d4ecc2b Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Thu, 14 May 2020 08:28:07 -0700 Subject: [PATCH 087/557] fix conv_ops_test and remapper_test --- .../core/grappler/optimizers/remapper_test.cc | 3 +++ tensorflow/core/kernels/conv_ops_test.cc | 23 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc index 35e09b28205..52f420c57cc 100644 --- a/tensorflow/core/grappler/optimizers/remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/remapper_test.cc @@ -607,6 +607,7 @@ TEST_F(RemapperTest, FuseMatMulWithBiasAndActivation) { } } +#ifndef INTEL_MKL TEST_F(RemapperTest, FuseConv2DWithBatchNorm) { using ops::Placeholder; @@ -685,6 +686,7 @@ TEST_F(RemapperTest, FuseConv2DWithBatchNorm) { test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } + TEST_F(RemapperTest, FuseConv2DWithBatchNormAndActivation) { using ops::Placeholder; @@ -850,6 +852,7 @@ TEST_F(RemapperTest, FuseConv2DWithSqueezeAndBias) { ASSERT_EQ(tensors.size(), 1); test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } +#endif } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc index 21dffa3cc5e..9e9ca27a570 100644 --- a/tensorflow/core/kernels/conv_ops_test.cc +++ b/tensorflow/core/kernels/conv_ops_test.cc @@ -1028,12 +1028,14 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolution) { this->VerifyConv2DWithBias(filter_size, filter_count); } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolution) { const int filter_size = 3; const int filter_count = 12; this->VerifyConv2DWithBias(filter_size, filter_count, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } +#endif TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolutionAndActivation) { const int filter_size = 1; @@ -1062,6 +1064,7 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolutionAndActivation) { } } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolutionAndActivation) { const int filter_size = 3; @@ -1072,6 +1075,7 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } } +#endif // -------------------------------------------------------------------------- // // Conv2D + FusedBatchNorm + {Activation} // @@ -1095,6 +1099,7 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolution) { this->VerifyConv2DWithBatchNorm(filter_size, filter_count); } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) { const int filter_size = 3; const int filter_count = 12; @@ -1102,6 +1107,7 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) { filter_size, filter_count, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } +#endif TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolutionAndActivation) { const int filter_size = 1; @@ -1131,6 +1137,7 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndActivation) { } } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolutionAndActivation) { const int filter_size = 3; @@ -1141,34 +1148,50 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } } +#endif REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest, // OneByOneConvolution, // ImageSizeConvolution, // SpatialConvolution, // +#ifndef INTEL_MKL ExplicitPaddingConvolution, // +#endif OneByOneConvolutionAndActivation, // ImageSizeConvolutionAndActivation, // +#ifndef INTEL_MKL SpatialConvolutionAndActivation, // ExplicitPaddingConvolutionAndActivation); +#else + SpatialConvolutionAndActivation); +#endif REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest, // OneByOneConvolution, // ImageSizeConvolution, // SpatialConvolution, // +#ifndef INTEL_MKL ExplicitPaddingConvolution, // +#endif OneByOneConvolutionAndActivation, // ImageSizeConvolutionAndActivation, // +#ifndef INTEL_MKL SpatialConvolutionAndActivation, // ExplicitPaddingConvolutionAndActivation); +#else + SpatialConvolutionAndActivation); +#endif using FusedBiasAddDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest, FusedBiasAddDataTypes); + +#ifndef INTEL_MKL using FusedBatchNormDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBatchNormOpTest, FusedBatchNormDataTypes); +#endif #endif // TENSORFLOW_USE_ROCM } // namespace tensorflow From 10c7f276e41f6b1790d8e767f77b9f5583419ad5 Mon Sep 17 00:00:00 2001 From: bhack Date: Thu, 14 May 2020 17:37:50 +0200 Subject: [PATCH 088/557] Test autograph indirect tf.map_fn decorator --- tensorflow/python/kernel_tests/map_fn_test.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 1e10d689886..a5c860b407d 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -186,6 +186,24 @@ class MapFnTest(test.TestCase): self.assertAllEqual(-nums, received[1]) self.assertAllEqual(nums, received[2]) + @test_util.run_in_graph_and_eager_modes + def testMap_autograph_indirect(): + def test_function(x): + cond = tf.constant(-1) + if cond == 0: + result = x + else: + result = x + return result + + @tf.function + def map_call(x): + tf.map_fn(test_function, x) + + x = constant_op.constant([1]) + y = map_call(x) + self.assertAllEqual([1], self.evaluate(y)) + @test_util.run_in_graph_and_eager_modes def testMapShape(self): x = constant_op.constant([[1, 2, 3], [4, 5, 6]]) From 38e941dada7b7d790b4b060ec04ee78d5c9252ef Mon Sep 17 00:00:00 2001 From: bhack Date: Thu, 14 May 2020 17:40:11 +0200 Subject: [PATCH 089/557] Fix missing return --- tensorflow/python/kernel_tests/map_fn_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index a5c860b407d..7bf793c1e20 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -198,7 +198,7 @@ class MapFnTest(test.TestCase): @tf.function def map_call(x): - tf.map_fn(test_function, x) + return tf.map_fn(test_function, x) x = constant_op.constant([1]) y = map_call(x) From ffef54602d33f3b23ce21a0d421efde05efe7cef Mon Sep 17 00:00:00 2001 From: bhack Date: Thu, 14 May 2020 18:21:13 +0200 Subject: [PATCH 090/557] Fix missing self Add initial autograph wrapping in map_fn --- tensorflow/python/kernel_tests/map_fn_test.py | 2 +- tensorflow/python/ops/map_fn.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 7bf793c1e20..1859c6c5873 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -187,7 +187,7 @@ class MapFnTest(test.TestCase): self.assertAllEqual(nums, received[2]) @test_util.run_in_graph_and_eager_modes - def testMap_autograph_indirect(): + def testMap_autograph_indirect(self): def test_function(x): cond = tf.constant(-1) if cond == 0: diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 2c9c678336e..dfe32998282 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -39,6 +39,12 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export +autograph_ctx = lazy_loader.LazyLoader( + "autograph_ctx", globals(), + "tensorflow.python.autograph.core.ag_ctx") +autograph = lazy_loader.LazyLoader( + "autograph", globals(), + "tensorflow.python.autograph.impl.api") @tf_export(v1=["map_fn"]) @deprecation.deprecated_args(None, "Use fn_output_signature instead", "dtype") @@ -477,7 +483,8 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) - result_value = fn(elems_value) + ag_ctx = autograph_ctx.control_status_ctx() + result_value = autograph.tf_convert(elems_value, ag_ctx) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) result_value_batchable = _result_value_flat_to_batchable( From 9a6a6476b563a65416b4bb438d021a2c7e52f139 Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 00:40:15 +0000 Subject: [PATCH 091/557] Add test and remove decorator --- tensorflow/python/kernel_tests/map_fn_test.py | 8 +++----- tensorflow/python/ops/map_fn.py | 11 +---------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 1859c6c5873..0bc3307e484 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -189,20 +189,18 @@ class MapFnTest(test.TestCase): @test_util.run_in_graph_and_eager_modes def testMap_autograph_indirect(self): def test_function(x): - cond = tf.constant(-1) + cond = constant_op.constant(-1) if cond == 0: result = x else: result = x return result - - @tf.function def map_call(x): - return tf.map_fn(test_function, x) + return map_fn.map_fn(test_function, x) x = constant_op.constant([1]) y = map_call(x) - self.assertAllEqual([1], self.evaluate(y)) + self.assertAllEqual([1], self.evaluate(y)) @test_util.run_in_graph_and_eager_modes def testMapShape(self): diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index dfe32998282..4a21a6e148b 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -39,14 +39,6 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export -autograph_ctx = lazy_loader.LazyLoader( - "autograph_ctx", globals(), - "tensorflow.python.autograph.core.ag_ctx") -autograph = lazy_loader.LazyLoader( - "autograph", globals(), - "tensorflow.python.autograph.impl.api") - -@tf_export(v1=["map_fn"]) @deprecation.deprecated_args(None, "Use fn_output_signature instead", "dtype") def map_fn(fn, elems, @@ -483,8 +475,7 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) - ag_ctx = autograph_ctx.control_status_ctx() - result_value = autograph.tf_convert(elems_value, ag_ctx) + result_value = fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) result_value_batchable = _result_value_flat_to_batchable( From 86342e236b40996ea5b6ccd17f1e753b00668d1c Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 02:45:52 +0200 Subject: [PATCH 092/557] restore a remove export --- tensorflow/python/ops/map_fn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 4a21a6e148b..2c9c678336e 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -39,6 +39,8 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export + +@tf_export(v1=["map_fn"]) @deprecation.deprecated_args(None, "Use fn_output_signature instead", "dtype") def map_fn(fn, elems, From 6ccf21ef6d284fc1fc262789523cbece1b22ddad Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 15 May 2020 12:46:49 +0300 Subject: [PATCH 093/557] =?UTF-8?q?Cleanup=20of=20TODO=E2=80=99s=20in=20AR?= =?UTF-8?q?C=20specific=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tensorflow/lite/micro/arc_emsdp/debug_log.cc | 1 - .../person_detection_experimental/arc_emsdp/emsdp.lcf | 3 --- tensorflow/lite/micro/kernels/arc_mli/conv.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc | 4 +--- .../micro/kernels/arc_mli/depthwise_conv_slicing_test.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/pooling.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc | 5 ++--- 9 files changed, 3 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/arc_emsdp/debug_log.cc b/tensorflow/lite/micro/arc_emsdp/debug_log.cc index b3b25f88ac1..fa9909f7372 100644 --- a/tensorflow/lite/micro/arc_emsdp/debug_log.cc +++ b/tensorflow/lite/micro/arc_emsdp/debug_log.cc @@ -55,7 +55,6 @@ typedef volatile struct dw_uart_reg { // to organize blocking loop for printing symbols. No input and no IRQ handling. // See embarc_osp repository for full EMSDP uart driver. // (https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp) -// TODO: Consider U-Boot API to do it in a less "hacky" way. void DbgUartSendStr(const char* s) { DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); const char* src = s; diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf index 2d7954217d3..95732d2a8b9 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf @@ -1,7 +1,6 @@ # Difference with common EMSDP LCF file (to reduce data access time): # - move data from external PSRAM to on-chip memory # - move text from SRAM to ICCM -# - TODO: Move tensor arena to DCCM to reduce data flow between fast and extrnal memory # # CCMWRAP memory regions indicate unusable portions of the address space # due to CCM memory wrapping into upper addresses beyond its size @@ -46,8 +45,6 @@ SECTIONS { } > SRAM GROUP BLOCK(4): { -# TODO: Move tensor arena to DCCM when it will be possible -# .tensor_arena? : {} .Zdata? : {} .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv.cc b/tensorflow/lite/micro/kernels/arc_mli/conv.cc index 4a2676821d9..b80d220a1cc 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv.cc @@ -52,7 +52,6 @@ struct OpData { int output_shift; // Per channel output multiplier and shift. - // TODO(b/141139247): Allocate these dynamically when possible. int32_t per_channel_output_multiplier[kMaxChannels]; int32_t per_channel_output_shift[kMaxChannels]; diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc index 9eb9d6499dd..7703bec3602 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc @@ -256,7 +256,6 @@ void TestConvQuantizedPerChannel( CreateQuantizedTensor(output_data, output_dims, output_scale, output_zero_point, "output_tensor"); - // TODO(njeff): Affine Quantization Params should be set on tensor creation. float input_scales[] = {1, input_scale}; int input_zero_points[] = {1, input_zero_point}; TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales), diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 2aad76bc042..e46f4766fce 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -54,7 +54,6 @@ struct OpData { int output_shift; // Per channel output multiplier and shift. - // TODO(b/141139247): Allocate these dynamically when possible. int32_t per_channel_output_multiplier[kMaxChannels]; int32_t per_channel_output_shift[kMaxChannels]; @@ -74,9 +73,8 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, // MLI optimized version only supports int8 dataype, dilation factor of 1 and // per-axis quantization of weights (no broadcasting/per-tensor) - // TODO: ((in_ch == filters_num) || (in_ch == 1)) is a forbidding of + // (in_ch == filters_num) || (in_ch == 1)) is a forbidding of // channel multiplier logic for multichannel input. - // To be removed after it will be supported in MLI bool ret_val = (filter->type == kTfLiteInt8) && (input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc index e6a87ff82e6..03a9fcbb30b 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc @@ -152,7 +152,6 @@ void TestDepthwiseConvQuantizedPerChannel( CreateQuantizedTensor(output_data, output_dims, output_scale, input_zero_point, "output_tensor"); - // TODO(njeff): Affine Quantization Params should be set on tensor creation. float input_scales[] = {1, input_scale}; int input_zero_points[] = {1, input_zero_point}; TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales), diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 89eae356f51..c2e35dbc8dc 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -236,7 +236,6 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, op_params.weights_offset = -filter->params.zero_point; op_params.output_offset = output->params.zero_point; op_params.output_multiplier = data->output_multiplier; - // TODO(b/138810107): Figure out whether output shift should be inverted op_params.output_shift = -data->output_shift; op_params.quantized_activation_min = data->output_activation_min; op_params.quantized_activation_max = data->output_activation_max; diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index 79deacc23d9..0d79fc5dbcf 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -46,7 +46,6 @@ enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 }; bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, const TfLitePoolParams* params) { // MLI optimized version only supports int8 dataype and no fused Relu - // TODO: subject to add mli_saturate kernel return (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone); } diff --git a/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc index 097908e30ab..1518513649f 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc @@ -54,7 +54,6 @@ static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2, int } else { // In case only one buffer is available, // use only the max buffer, and split it. - // TODO compute optimal split ratio based on request ratio. *grant_size_1 = maxavailable / 2; *grant_size_2 = maxavailable / 2; } @@ -228,7 +227,7 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( const int padding_bot, int *in_slice_height, int *out_slice_height) { - const int height_dimension = 1; // todo: compute from rank + const int height_dimension = 1; const int in_height = in->shape[height_dimension]; const int out_height = out->shape[height_dimension]; const int line_size_in = mli_hlp_count_elem_num(in, height_dimension + 1) * mli_hlp_tensor_element_size(in); @@ -250,7 +249,7 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( // in this case only two slices are needed, so both could benefit from padding. take the MIN to get the worst case. max_out_lines_for_input = (max_lines_in + std::min(padding_top, padding_bot) - kernel_height + 1) / stride_height; } else { - max_out_lines_for_input = (max_lines_in - kernel_height + 1) / stride_height; // TODO add padding exceptions and test by makin fit=false; + max_out_lines_for_input = (max_lines_in - kernel_height + 1) / stride_height; } // Ten compute how many ouput lines fit into the output tensor. max_lines_out = std::min(out_height, static_cast(out->capacity) / line_size_out); From 872e950b51edbf3430d547e2fe4ed15ba8b18f77 Mon Sep 17 00:00:00 2001 From: seo-inyoung <62606132+seo-inyoung@users.noreply.github.com> Date: Fri, 15 May 2020 20:05:11 +0900 Subject: [PATCH 094/557] Update SECURITY.md simple error correction --- SECURITY.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 6fc2c3aa9cc..f3a6c148b2e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -64,7 +64,7 @@ your model, and we recommend you run the TensorFlow process in a sandbox. It is possible to write models that are secure in a sense that they can safely process untrusted inputs assuming there are no bugs. There are two main reasons -to not rely on this: first, it is easy to write models which must not be exposed +to not rely on this: First, it is easy to write models which must not be exposed to untrusted inputs, and second, there are bugs in any software system of sufficient complexity. Letting users control inputs could allow them to trigger bugs either in TensorFlow or in dependent libraries. @@ -149,7 +149,7 @@ attack (or worse). Because TensorFlow behaves correctly, this is not a vulnerability in TensorFlow (although it would be a vulnerability of this hypothetical system). -As a general rule, it is incorrect behavior for Tensorflow to access memory it +As a general rule, it is incorrect behavior for TensorFlow to access memory it does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to such behaviors constitute a vulnerability. From 103bb013d4d4ba19da0445abd9b9c627af9df817 Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 14:23:20 +0200 Subject: [PATCH 095/557] Verifiy differences with test annotation --- tensorflow/python/kernel_tests/map_fn_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 0bc3307e484..81dd817687a 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import numpy as np +from tensorflow.python.autograph.impl import api from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -186,7 +187,8 @@ class MapFnTest(test.TestCase): self.assertAllEqual(-nums, received[1]) self.assertAllEqual(nums, received[2]) - @test_util.run_in_graph_and_eager_modes + #@test_util.run_in_graph_and_eager_modes + @test_util.run_deprecated_v1 def testMap_autograph_indirect(self): def test_function(x): cond = constant_op.constant(-1) @@ -195,6 +197,8 @@ class MapFnTest(test.TestCase): else: result = x return result + + @api.convert(recursive=False) def map_call(x): return map_fn.map_fn(test_function, x) From bbc2f3a190ff05a0bb8c30246dc71490587f434a Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 15:37:38 +0200 Subject: [PATCH 096/557] Let test to fail --- tensorflow/python/kernel_tests/map_fn_test.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 81dd817687a..8ead634aa11 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.python.autograph.impl import api +from tensorflow.python.eager import def_function from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -187,8 +187,7 @@ class MapFnTest(test.TestCase): self.assertAllEqual(-nums, received[1]) self.assertAllEqual(nums, received[2]) - #@test_util.run_in_graph_and_eager_modes - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes def testMap_autograph_indirect(self): def test_function(x): cond = constant_op.constant(-1) @@ -198,7 +197,7 @@ class MapFnTest(test.TestCase): result = x return result - @api.convert(recursive=False) + @def_function.function def map_call(x): return map_fn.map_fn(test_function, x) From 560762e40d9bb085ea33f52b36b96a3851e1b3d2 Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 16:49:53 +0200 Subject: [PATCH 097/557] Test autograph transform of fn --- tensorflow/python/ops/map_fn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 2c9c678336e..e39d35c36b0 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -22,6 +22,8 @@ from __future__ import print_function import re +from tensorflow.python.autograph.core import ag_ctx as autograph_ctx +from tensorflow.python.autograph.impl import api as autograph from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops @@ -477,6 +479,8 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) + autographed_fn = autograph.tf_convert(fn, autograph_ctx.control_status_ctx()) + result_value = autographed_fn(elems_value) result_value = fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) From d6dd56f74f228227dc9781bd389147df61d3784e Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 17:26:04 +0200 Subject: [PATCH 098/557] Remove original fn call --- tensorflow/python/ops/map_fn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index e39d35c36b0..b98b4ad10bc 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -481,7 +481,6 @@ def map_fn(fn, elems_value = elems_unflatten(elems_value_flat) autographed_fn = autograph.tf_convert(fn, autograph_ctx.control_status_ctx()) result_value = autographed_fn(elems_value) - result_value = fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) result_value_batchable = _result_value_flat_to_batchable( From 64d839bb754b104e151bb49bb4ec46dbe690745d Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 18:21:51 +0200 Subject: [PATCH 099/557] Fix lint and improve readibility --- tensorflow/python/ops/map_fn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index b98b4ad10bc..40f8edfcdd1 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -479,7 +479,8 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) - autographed_fn = autograph.tf_convert(fn, autograph_ctx.control_status_ctx()) + ag_ctx = autograph_ctx.control_status_ctx() + autographed_fn = autograph.tf_convert(fn, ag_ctx) result_value = autographed_fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) From 8dd28457699100145cad17aa4d44da81fddefda9 Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Fri, 15 May 2020 19:34:30 +0000 Subject: [PATCH 100/557] Reviewer requests --- tensorflow/stream_executor/rocm/rocm_gpu_executor.cc | 3 ++- third_party/gpus/cuda_configure.bzl | 10 +++++++--- third_party/gpus/rocm_configure.bzl | 3 +-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc index 216602a7597..fd3b5f19913 100644 --- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc +++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc @@ -133,8 +133,9 @@ bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) { GpuDriver::UnloadModule(context_, module); gpu_binary_to_module_.erase(module_it); const char* mem_it = nullptr; - for (auto x : in_memory_modules_) + for (auto x : in_memory_modules_) { if (x.second == module) mem_it = x.first; + } if (mem_it != nullptr) in_memory_modules_.erase(mem_it); } return true; diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index ce924fe4cd2..7e779a993e2 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -809,20 +809,24 @@ def make_copy_files_rule(repository_ctx, name, srcs, outs): )""" % (name, "\n".join(outs), " && \\\n".join(cmds)) def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir, exceptions=None): - """Returns a rule to recursively copy a directory.""" + """Returns a rule to recursively copy a directory. + If exceptions is not None, it must be a list of files or directories in + 'src_dir'; these will be excluded from copying. + """ src_dir = _norm_path(src_dir) out_dir = _norm_path(out_dir) outs = read_dir(repository_ctx, src_dir) post_cmd='' if exceptions!=None: - outs = [x for x in outs if not any([x.startswith(y) for y in exceptions])] + outs = [x for x in outs if not any([x.startswith(src_dir+"/"+y) + for y in exceptions])] outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] # '@D' already contains the relative path for a single file, see # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" if exceptions!=None: for x in exceptions: - post_cmd+=" ; rm -fR " + x.replace(src_dir, out_dir) + post_cmd+=" ; rm -fR " + out_dir + "/" + x return """genrule( name = "%s", outs = [ diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 3f518fb05f1..4cfec2459e4 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -615,8 +615,7 @@ def _create_local_rocm_repository(repository_ctx): name = "rocm-include", src_dir = rocm_toolkit_path + "/include", out_dir = "rocm/include", - exceptions = [rocm_toolkit_path + "/include/gtest", - rocm_toolkit_path + "/include/gmock"], + exceptions = ["gtest", "gmock"], ), make_copy_dir_rule( repository_ctx, From 82519ad18676039327d29b80ed7dd098b61ce415 Mon Sep 17 00:00:00 2001 From: Ajay P Date: Fri, 15 May 2020 23:35:47 +0000 Subject: [PATCH 101/557] Fixed tests --- tensorflow/python/ops/custom_gradient.py | 6 +-- tensorflow/python/ops/gradients_test.py | 47 ++++++++++++------------ 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index d57be41c3de..4a375e11554 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -28,12 +28,12 @@ from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import op_selector from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest from tensorflow.python.util import tf_decorator from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import tf_export -from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients VAR_OP_TYPES = [ @@ -487,7 +487,7 @@ def recompute_grad(f): result = f(*args, **kwargs) @custom_gradient - def grad(*dresult, **grad_kwargs): + def inner_recompute_grad(*dresult, **grad_kwargs): """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" # Gradient calculation for reverse mode autodiff. variables = grad_kwargs.get("variables") @@ -517,7 +517,7 @@ def recompute_grad(f): return (grads[:len(id_args)], grads[len(id_args):]), transpose - return result, grad + return result, inner_recompute_grad return inner diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index 9b536136cb5..e1da54e6427 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -1369,9 +1369,8 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): sym_jac_back, num_jac = gradient_checker_v2.compute_gradient(f, inputs, delta=delta) - testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) + self.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) - @test_util.run_in_graph_and_eager_modes def testCustomGradientRecomputeGradHigherOrder(self): @custom_gradient.recompute_grad @@ -1395,8 +1394,8 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): shape=10, trainable=True, ) - - test_input = constant(np.zeros((10, 10), dtype=np.float32)) + self.evaluate(test_var.assign(np.ones([10]))) + test_input = constant(np.ones((10, 10), dtype=np.float32)) grads_re, grads = self._TestFnVariablesGradient(test_input, TestFn, test_input) @@ -1432,24 +1431,24 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): def testFnRecomputeWithScopeGradientTape(self): """Checks that recompute_grad works with var scope and GradientTape.""" - def TestFn(input_t): - with variable_scope.variable_scope("inner_scope"): - test_var = variable_scope.get_variable( - name="test_var", - shape=10, - trainable=True, - ) - return input_t * test_var + def TestFn(input_t, test_var): + return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) with variable_scope.variable_scope( "output_scope", reuse=variable_scope.AUTO_REUSE, use_resource=True): + with variable_scope.variable_scope("inner_scope"): + test_var = variable_scope.get_variable( + name="test_var", shape=10, trainable=True, + ) + self.evaluate(test_var.assign(np.ones([10]))) + test_fn_re = custom_gradient.recompute_grad(TestFn) with backprop.GradientTape(persistent=True) as tape: - out_re = test_fn_re(test_input_t) - out = TestFn(test_input_t) + out_re = test_fn_re(test_input_t, test_var) + out = TestFn(test_input_t, test_var) grads_re = tape.gradient(out_re, variables.trainable_variables()) grads = tape.gradient(out, variables.trainable_variables()) @@ -1464,22 +1463,22 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): def testFnRecomputeWithScopeGradients(self): """Checks that recompute_grad works with var scope and gradients(..).""" - def TestFn(input_t): - with variable_scope.variable_scope("inner_scope"): - test_var = variable_scope.get_variable( - name="test_var", - shape=10, - trainable=True, - ) - return input_t * test_var + def TestFn(input_t, test_var): + return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) with variable_scope.variable_scope( "output_scope", reuse=variable_scope.AUTO_REUSE, use_resource=True): + with variable_scope.variable_scope("inner_scope"): + test_var = variable_scope.get_variable( + name="test_var", shape=10, trainable=True, + ) + self.evaluate(test_var.assign(np.ones([10]))) + test_fn_re = custom_gradient.recompute_grad(TestFn) - out_re = test_fn_re(test_input_t) - out = TestFn(test_input_t) + out_re = test_fn_re(test_input_t, test_var) + out = TestFn(test_input_t, test_var) grads_re = gradients.gradients(out_re, variables.trainable_variables()) grads = gradients.gradients(out, variables.trainable_variables()) From 939b69e701c4ce749267e5b3d5d8b5557e3f1300 Mon Sep 17 00:00:00 2001 From: Ajay P Date: Sat, 16 May 2020 01:13:19 +0000 Subject: [PATCH 102/557] Added grad_wrapper to accomodate graph mode --- tensorflow/python/ops/custom_gradient.py | 60 +++++++++++++----------- tensorflow/python/ops/gradients_test.py | 42 +++++++++-------- 2 files changed, 54 insertions(+), 48 deletions(-) diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index 4a375e11554..aa80756b859 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -486,38 +486,42 @@ def recompute_grad(f): with tape_lib.stop_recording(): result = f(*args, **kwargs) - @custom_gradient - def inner_recompute_grad(*dresult, **grad_kwargs): - """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" - # Gradient calculation for reverse mode autodiff. - variables = grad_kwargs.get("variables") - with backprop.GradientTape() as t: - id_args = [gen_array_ops.identity(x) for x in args] - t.watch(id_args) + def grad_wrapper(*wrapper_args, **grad_kwargs): + """Wrapper function to accomodate lack of kwargs in graph mode decorator.""" + @custom_gradient + def inner_recompute_grad(*dresult): + """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" + # Gradient calculation for reverse mode autodiff. + variables = grad_kwargs.get("variables") + with backprop.GradientTape() as t: + id_args = [gen_array_ops.identity(x) for x in args] + t.watch(id_args) + if variables is not None: + t.watch(variables) + with ops.control_dependencies(dresult): + with variable_scope.variable_scope(current_var_scope): + result = f(*id_args, **kwargs) + kw_vars = [] if variables is not None: - t.watch(variables) - with ops.control_dependencies(dresult): - with variable_scope.variable_scope(current_var_scope): - result = f(*id_args, **kwargs) - kw_vars = [] - if variables is not None: - kw_vars = list(variables) - grads = t.gradient(result, - list(id_args) + kw_vars, - output_gradients=dresult, - unconnected_gradients=UnconnectedGradients.ZERO) + kw_vars = list(variables) + grads = t.gradient(result, + list(id_args) + kw_vars, + output_gradients=dresult, + unconnected_gradients=UnconnectedGradients.ZERO) - def transpose(*t_args, **t_kwargs): - """Gradient function calculation for forward mode autodiff.""" - # Just throw an error since gradients / activations are not stored on tape for recompute. - raise NotImplementedError( - "recompute_grad tried to transpose grad of {}. " - "Consider not using recompute_grad in forward mode autodiff".format( - f.__name__)) + def transpose(*t_args, **t_kwargs): + """Gradient function calculation for forward mode autodiff.""" + # Just throw an error since gradients / activations are not stored on tape for recompute. + raise NotImplementedError( + "recompute_grad tried to transpose grad of {}. " + "Consider not using recompute_grad in forward mode" + "autodiff".format(f.__name__)) - return (grads[:len(id_args)], grads[len(id_args):]), transpose + return (grads[:len(id_args)], grads[len(id_args):]), transpose - return result, inner_recompute_grad + return inner_recompute_grad(*wrapper_args) + + return result, grad_wrapper return inner diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index e1da54e6427..57fb2f4ddb3 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -1431,24 +1431,25 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): def testFnRecomputeWithScopeGradientTape(self): """Checks that recompute_grad works with var scope and GradientTape.""" - def TestFn(input_t, test_var): - return input_t * test_var + def TestFn(input_t): + with variable_scope.variable_scope("inner_scope"): + test_var = variable_scope.get_variable( + name="test_var", + shape=10, + trainable=True, + ) + self.evaluate(test_var.assign(np.ones([10]))) + return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) with variable_scope.variable_scope( "output_scope", reuse=variable_scope.AUTO_REUSE, use_resource=True): - with variable_scope.variable_scope("inner_scope"): - test_var = variable_scope.get_variable( - name="test_var", shape=10, trainable=True, - ) - self.evaluate(test_var.assign(np.ones([10]))) - test_fn_re = custom_gradient.recompute_grad(TestFn) with backprop.GradientTape(persistent=True) as tape: - out_re = test_fn_re(test_input_t, test_var) - out = TestFn(test_input_t, test_var) + out_re = test_fn_re(test_input_t) + out = TestFn(test_input_t) grads_re = tape.gradient(out_re, variables.trainable_variables()) grads = tape.gradient(out, variables.trainable_variables()) @@ -1463,22 +1464,23 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): def testFnRecomputeWithScopeGradients(self): """Checks that recompute_grad works with var scope and gradients(..).""" - def TestFn(input_t, test_var): - return input_t * test_var + def TestFn(input_t): + with variable_scope.variable_scope("inner_scope"): + test_var = variable_scope.get_variable( + name="test_var", + shape=10, + trainable=True, + ) + self.evaluate(test_var.assign(np.ones([10]))) + return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) with variable_scope.variable_scope( "output_scope", reuse=variable_scope.AUTO_REUSE, use_resource=True): - with variable_scope.variable_scope("inner_scope"): - test_var = variable_scope.get_variable( - name="test_var", shape=10, trainable=True, - ) - self.evaluate(test_var.assign(np.ones([10]))) - test_fn_re = custom_gradient.recompute_grad(TestFn) - out_re = test_fn_re(test_input_t, test_var) - out = TestFn(test_input_t, test_var) + out_re = test_fn_re(test_input_t) + out = TestFn(test_input_t) grads_re = gradients.gradients(out_re, variables.trainable_variables()) grads = gradients.gradients(out, variables.trainable_variables()) From ea4ef0e6faf651c9f76ef90848dc62d8aa660ac1 Mon Sep 17 00:00:00 2001 From: David Rim Date: Mon, 18 May 2020 00:03:24 -0700 Subject: [PATCH 103/557] Bumps llvm version PiperOrigin-RevId: 312025889 Change-Id: I9c2a75e34bbfb2b9f6afaf0398c9cfde6870ac3b --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 404d253e8bd..452152efacf 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "9d4b4f344d8ea917e082cf58d66b71c0171e1650" - LLVM_SHA256 = "36e4470b5656cea3e0afb218edbdd96376fcb51dc2c5ed887b21237068baee41" + LLVM_COMMIT = "7af0c8559b6d9426dd5e977370516d2baa4c206f" + LLVM_SHA256 = "4c5efbc48755f9983a8522eddd6e448f0b93e3e75a56a507c1ecb44d367db6d5" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 344f8982507cd03ba79b7e21fef6f115451ee497 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 18 May 2020 00:28:56 -0700 Subject: [PATCH 104/557] Slightly optimize quantized add. PiperOrigin-RevId: 312028385 Change-Id: Ie1fbb3071e4e258c24db78440e1275168694fda9 --- .../lite/kernels/internal/optimized/integer_ops/add.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index ff8e4687d58..95b78b3a6b3 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -47,6 +47,9 @@ inline void AddElementwise(int size, const ArithmeticParams& params, const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset); + for (; i <= size - 16; i += 16) { const int8x16_t input1_val_original = vld1q_s8(input1_data + i); const int8x16_t input2_val_original = vld1q_s8(input2_data + i); @@ -61,13 +64,13 @@ inline void AddElementwise(int size, const ArithmeticParams& params, const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); const int16x8_t input1_val_high = - vaddq_s16(input1_val_s16_high, vdupq_n_s16(params.input1_offset)); + vaddq_s16(input1_val_s16_high, input1_offset_dup); const int16x8_t input2_val_high = - vaddq_s16(input2_val_s16_high, vdupq_n_s16(params.input2_offset)); + vaddq_s16(input2_val_s16_high, input2_offset_dup); const int16x8_t input1_val_low = - vaddq_s16(input1_val_s16_low, vdupq_n_s16(params.input1_offset)); + vaddq_s16(input1_val_s16_low, input1_offset_dup); const int16x8_t input2_val_low = - vaddq_s16(input2_val_s16_low, vdupq_n_s16(params.input2_offset)); + vaddq_s16(input2_val_s16_low, input2_offset_dup); const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); From 76853076b382474ff35f4561fde231b06a5ccdfa Mon Sep 17 00:00:00 2001 From: David Rim Date: Mon, 18 May 2020 01:32:19 -0700 Subject: [PATCH 105/557] Add optimized MatrixBatchVectorMultiplyAccumulate for asymmetric inputs for sse PiperOrigin-RevId: 312035618 Change-Id: I5ae85ae9b0b646d2fe1e665c25aae6b99622dd2b --- .../internal/optimized/neon_tensor_utils.cc | 35 +++-- .../internal/optimized/neon_tensor_utils.h | 10 -- .../optimized/neon_tensor_utils_impl.h | 6 - .../internal/optimized/sse_tensor_utils.cc | 129 ++++++++++-------- .../internal/optimized/sse_tensor_utils.h | 22 +-- .../optimized/sse_tensor_utils_impl.h | 10 +- .../reference/portable_tensor_utils.cc | 29 ---- .../reference/portable_tensor_utils.h | 10 -- .../reference/portable_tensor_utils_impl.h | 6 - .../kernels/internal/tensor_utils_test.cc | 8 +- 10 files changed, 110 insertions(+), 155 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index 4c90cd86a56..c96f298370a 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -1466,16 +1466,20 @@ void NeonMatrixBatchVectorMultiplyAccumulate( int i = 0; int32_t* scratch_ptr = scratch; for (; i <= total_size - 8; i += 8, result += 8) { - float batch_scaling_factor0 = scaling_factors[i / m_rows]; - float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows]; - if (per_channel_scale) { - batch_scaling_factor0 *= per_channel_scale[i % m_rows]; - batch_scaling_factor1 *= per_channel_scale[(i + 4) % m_rows]; - } + const float batch_scaling_factor0 = scaling_factors[i / m_rows]; + const float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows]; const int batch_input_offset0 = -input_offset[i / m_rows]; const int batch_input_offset1 = -input_offset[(i + 4) / m_rows]; - const float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0); - const float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1); + float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0); + float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1); + if (per_channel_scale) { + const float32x4_t per_channel_scale0 = + vld1q_f32(&per_channel_scale[i % m_rows]); + const float32x4_t per_channel_scale1 = + vld1q_f32(&per_channel_scale[(i + 4) % m_rows]); + scaling_factor0 = vmulq_f32(scaling_factor0, per_channel_scale0); + scaling_factor1 = vmulq_f32(scaling_factor1, per_channel_scale1); + } const int32x4_t input_offset0 = vdupq_n_s32(batch_input_offset0); const int32x4_t input_offset1 = vdupq_n_s32(batch_input_offset1); const int32x4_t row_sum0 = vld1q_s32(row_sums + (i % m_rows)); @@ -1498,7 +1502,10 @@ void NeonMatrixBatchVectorMultiplyAccumulate( scratch_ptr += i; for (; i < total_size; i++) { - const float batch_scaling_factor = scaling_factors[i / m_rows]; + float batch_scaling_factor = scaling_factors[i / m_rows]; + if (per_channel_scale) { + batch_scaling_factor *= per_channel_scale[i % m_rows]; + } const int32_t zero_point = input_offset[i / m_rows]; int32_t dotprod = *(scratch_ptr++); dotprod -= row_sums[i % m_rows] * zero_point; @@ -1514,16 +1521,6 @@ void NeonMatrixBatchVectorMultiplyAccumulate( per_channel_scale, input_offset, row_sums); } -void NeonMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - NeonMatrixBatchVectorMultiplyAccumulateImpl( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - per_channel_scale, input_offset, nullptr); -} - inline int64x2x2_t MulAdd(int32x4_t acc, int32x4_t lhs, int32x4_t rhs) { int64x2x2_t result; const int64x2_t lhs_low = vmovl_s32(vget_low_s32(lhs)); diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h index b978bf5f3bb..86951fcd559 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -55,16 +55,6 @@ void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, vectors, scaling_factors, n_batch, scratch, result, context); } -void MatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, - vectors, scaling_factors, n_batch, result, per_channel_scale, - input_offset); -} - void MatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* scaling_factors, diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h index 1b043390c22..1554d07a61c 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h @@ -62,12 +62,6 @@ void NeonMatrixBatchVectorMultiplyAccumulate( const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, CpuBackendContext* context); -void NeonMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset); - void NeonApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, const int32_t* bias, int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, int32_t variance_limit, diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc index 7fb69e7b4f4..80cc14c6d26 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc @@ -24,6 +24,7 @@ limitations under the License. #include +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/compatibility.h" namespace tflite { @@ -89,18 +90,24 @@ float GetFloatVectorElement(__m128 v) { } // namespace -void SseMatrixBatchVectorMultiplyAccumulate( +void SseMatrixBatchVectorMultiplyAccumulateImpl( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result) { + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, const int32_t* row_sums) { for (std::intptr_t batch = 0; batch < n_batch; ++batch) { const float batch_scaling_factor = scaling_factors[batch]; + const int32_t batch_offset = input_offset ? input_offset[batch] : 0; // Compute dot-product for every column. for (std::intptr_t row = 0; row < m_rows; ++row) { // Get the address of the first element of the row. const int8_t* __restrict__ row_ptr = matrix + row * m_cols; - + const float row_scale = + per_channel_scale ? per_channel_scale[row] * batch_scaling_factor + : batch_scaling_factor; + const int32_t row_offset = + row_sums && batch_offset ? batch_offset * row_sums[row] : 0; // Initialize the dot product sum for the row to 0. __m128i dotprod_32x4 = _mm_setzero_si128(); std::intptr_t col = 0; @@ -152,8 +159,10 @@ void SseMatrixBatchVectorMultiplyAccumulate( for (; col < m_cols; ++col) { sum += row_ptr[col] * vectors[col]; } // for col - - *result += sum * batch_scaling_factor; + if (row_offset) { + sum -= row_offset; + } + *result += sum * row_scale; ++result; } // for row @@ -165,56 +174,30 @@ void SseMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, const float* __restrict__ per_channel_scale, - const int32_t* __restrict__ input_offset) { - if (input_offset == nullptr) { - SseMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, - scaling_factors, n_batch, result); - return; - } - static constexpr std::intptr_t kBlockSize = 16; - for (std::intptr_t batch = 0; batch < n_batch; ++batch) { - const float batch_scaling_factor = scaling_factors[batch]; - for (std::intptr_t row = 0; row < m_rows; ++row) { - const int8_t* __restrict__ row_ptr = matrix + row * m_cols; - float scale = batch_scaling_factor; - if (per_channel_scale != nullptr) { - scale *= per_channel_scale[row]; - } - __m128i dotprod_32x4 = _mm_setzero_si128(); - __m128i row_sum_16x8 = _mm_setzero_si128(); - std::intptr_t col = 0; - for (; col < (m_cols & ~(kBlockSize - 1)); col += kBlockSize) { - const __m128i vec_8x16 = - _mm_loadu_si128(reinterpret_cast(vectors + col)); - const __m128i row_8x16 = - _mm_loadu_si128(reinterpret_cast(row_ptr + col)); - // dotprod += vec · row - dotprod_32x4 = - _mm_add_epi32(dotprod_32x4, DotProdInt8x4x4(vec_8x16, row_8x16)); + float* __restrict__ result) { + SseMatrixBatchVectorMultiplyAccumulateImpl( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); +} - // Pairwise add 16x 8-bit values; equivalently, multipy-add with 1. - // Result is 8x 16-bit values. - const __m128i row_16x8 = _mm_maddubs_epi16(_mm_set1_epi8(1), row_8x16); - row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8); - } // for col - // Pairwise add 8x 16-bit values; equivalently, multipy-add with 1. - // Result is 4x 32-bit values. - const __m128i row_sum_32x4 = - _mm_madd_epi16(row_sum_16x8, _mm_set1_epi16(1)); - int32_t sum = ReduceInt32x4(dotprod_32x4); - int32_t row_sum = ReduceInt32x4(row_sum_32x4); - // Postamble loop. - for (; col < m_cols; ++col) { - sum += row_ptr[col] * vectors[col]; - row_sum += row_ptr[col]; - } // for col - sum -= row_sum * input_offset[batch]; - *result += sum * scale; - ++result; - } // for row - vectors += m_cols; - } // for batch +void SseMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, + const float* __restrict__ scaling_factors, int n_batch, + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* context) { + if ((input_offset != nullptr) && (!compute_row_sums || *compute_row_sums)) { + memset(row_sums, 0, sizeof(int32_t) * m_rows); + SseReductionSumVector(matrix, row_sums, m_rows, m_cols); + if (compute_row_sums) { + *compute_row_sums = false; + } + } + SseMatrixBatchVectorMultiplyAccumulateImpl( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + per_channel_scale, input_offset, row_sums); } namespace { @@ -347,6 +330,44 @@ void SseSparseMatrixBatchVectorMultiplyAccumulate( } // for batch } +void SseReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + const int output_size, const int reduction_size) { + static constexpr std::intptr_t kBlockSize = 16; + for (std::intptr_t row = 0; row < output_size; ++row) { + const int8_t* __restrict__ row_ptr = input_vector + row * reduction_size; + __m128i row_sum_16x8 = _mm_setzero_si128(); + std::intptr_t col = 0; + for (; col < (reduction_size & ~(kBlockSize - 1)); col += kBlockSize) { + const __m128i row_8x16 = + _mm_loadu_si128(reinterpret_cast(row_ptr + col)); + const __m128i row_16x8 = _mm_maddubs_epi16(_mm_set1_epi8(1), row_8x16); + row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8); + } // for col +#ifdef __SSE4_1__ + // Postamble for 8x 8-bit inputs. + if (col < (reduction_size & ~7)) { + // _mm_loadu_si64 not supported in gcc versions < 9, breaks kokoro build. + const __m128i row_16x8 = _mm_cvtepi8_epi16( + _mm_loadl_epi64(reinterpret_cast(row_ptr + col))); + // dotprod += vec · row + row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8); + col += 8; + } +#endif + const __m128i row_sum_32x4 = + _mm_madd_epi16(row_sum_16x8, _mm_set1_epi16(1)); + int32_t row_sum = ReduceInt32x4(row_sum_32x4); +#if defined(__SSE4_1__) && defined(__clang__) + // SSE 4.1: Don't try to unroll and vectorize this, already done above. +#pragma clang loop unroll(disable) vectorize(disable) +#endif + for (; col < reduction_size; col++) { + row_sum += *(row_ptr + col); + } + *(output_vector + row) += row_sum; + } +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h index 986e70a7823..224d811e862 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h @@ -59,10 +59,9 @@ void MatrixBatchVectorMultiplyAccumulate( int n_batch, float* __restrict__ result, const float* per_channel_scale, const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, CpuBackendContext* context) { - PortableMatrixBatchVectorMultiplyAccumulate( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - per_channel_scale, input_offset, scratch, row_sums, compute_row_sums, - context); + SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, + vectors, scaling_factors, n_batch, result, per_channel_scale, + input_offset, scratch, row_sums, compute_row_sums, context); } void MatrixBatchVectorMultiplyAccumulate( @@ -75,17 +74,6 @@ void MatrixBatchVectorMultiplyAccumulate( vectors, scaling_factors, n_batch, result); } -void MatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, - const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, const float* __restrict__ per_channel_scale, - const int32_t* __restrict__ input_offset) { - SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, - vectors, scaling_factors, n_batch, result, per_channel_scale, - input_offset); -} - void SparseMatrixBatchVectorMultiplyAccumulate1x4( const float* __restrict__ matrix, const int32_t* __restrict__ segments, const int32_t* __restrict__ indices, int m_rows, int m_cols, @@ -315,8 +303,8 @@ void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, int output_size, int reduction_size) { - NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size, - reduction_size); + SSE_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size, + reduction_size); } void MeanStddevNormalization(const float* input_vector, float* output_vector, diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h index 1996b1f30a9..c5ede624762 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h @@ -17,6 +17,8 @@ limitations under the License. #include +#include "tensorflow/lite/kernels/cpu_backend_context.h" + #if defined(_MSC_VER) #define __restrict__ __restrict #endif @@ -38,8 +40,9 @@ void SseMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, const float* __restrict__ per_channel_scale, - const int32_t* __restrict__ input_offset); + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* context); // Matrix multiplication for quantized values using symmetric quantization. // Sparse version. @@ -49,6 +52,9 @@ void SseSparseMatrixBatchVectorMultiplyAccumulate( const float* __restrict__ scaling_factors, int n_batch, float* __restrict__ result); +void SseReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + const int output_size, const int reduction_size); + #endif // __SSSE3__ } // namespace tensor_utils diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 0e66dfee191..4f6db290d4f 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -161,35 +161,6 @@ void PortableMatrixBatchVectorMultiplyAccumulate( } // for batch } -void PortableMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) { - const float batch_scaling_factor = scaling_factors[batch]; - const float batch_offset = input_offset[batch]; - const int8_t* row_ptr = matrix; - for (int row = 0; row < m_rows; ++row) { - int32_t dotprod = 0; - float scale = batch_scaling_factor; - if (per_channel_scale) { - scale *= per_channel_scale[row]; - } -#if defined(__GNUC__) - // Prefetch the row to cache. - __builtin_prefetch(row_ptr, 0 /* prefetch for read */, - 3 /* temporal locality */); -#endif - for (int col = 0; col < m_cols; ++col, ++row_ptr) { - dotprod += (*row_ptr) * (vectors[col] - batch_offset); - } // for col - *result += dotprod * scale; - ++result; - } // for row - } // for batch -} - void PortableMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* scaling_factors, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index f2e6c9b4f7d..0fd7a407595 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -98,16 +98,6 @@ void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, scaling_factors, n_batch, result); } -void MatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, - scaling_factors, n_batch, result, - per_channel_scale, input_offset); -} - void SparseMatrixBatchVectorMultiplyAccumulate1x4( const float* __restrict__ matrix, const int32_t* __restrict__ segments, const int32_t* __restrict__ indices, int m_rows, int m_cols, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index 6c15a6cd919..34767ccd942 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -83,12 +83,6 @@ void PortableMatrixBatchVectorMultiplyAccumulate( int n_batch, int32_t* scratch, float* __restrict__ result, CpuBackendContext* context); -void PortableMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset); - void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4( const float* __restrict__ matrix, const int32_t* __restrict__ segments, const int32_t* __restrict__ indices, int m_rows, int m_cols, diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc index 3ad59acdb68..878cf0d2618 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc @@ -1136,11 +1136,15 @@ std::vector TestPerChannelDotprodMatrixBatchVectorMultiply( bool is_per_channel = true) { MatrixVectorData data = SetupMatrixVectorData(rows, cols, batch, negative, is_per_channel); - + std::vector scratch(rows * batch); + std::vector row_sums(rows); + bool compute_row_sums = true; + CpuBackendContext context; MatrixBatchVectorMultiplyAccumulate( data.matrix.data(), rows, cols, data.vectors.data(), data.scale_factors.data(), batch, &data.results[0], - data.per_channel_scales.data(), data.input_offsets.data()); + data.per_channel_scales.data(), data.input_offsets.data(), scratch.data(), + row_sums.data(), &compute_row_sums, &context); return data.results; } From de8a517f4068589fb5cd82c8a8a8dc3d5e101c0e Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Mon, 18 May 2020 01:58:56 -0700 Subject: [PATCH 106/557] fix escape in Core ML header processing PiperOrigin-RevId: 312038605 Change-Id: I422e343729a7f27808c3f9b908460faeeaa58ce5 --- tensorflow/lite/experimental/ios/BUILD.apple | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index a29e8bd6ed5..7e2a3623af1 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -51,7 +51,7 @@ genrule( srcs = ["//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h"], outs = ["coreml_delegate.h"], cmd = """ - sed "s|#include \".*common.h\"|#include \"TensorFlowLiteC/common.h\"|"\ + sed 's|#include ".*common.h"|#include "TensorFlowLiteC/common.h"|'\ "$(location //tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h)"\ > "$@" """, From 647ef2db28957b9cb1d0df66ee9a2a37ca21ca15 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 02:02:53 -0700 Subject: [PATCH 107/557] Update GraphDef version to 405. PiperOrigin-RevId: 312039077 Change-Id: I03ac966118084eb80d817cdfe98b175c75bf86aa --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 63501a14f56..7abbcd5474c 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 404 // Updated: 2020/5/17 +#define TF_GRAPH_DEF_VERSION 405 // Updated: 2020/5/18 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 72c50430aa5347e6c9bc1a1927a4e13db0dc766a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 02:02:54 -0700 Subject: [PATCH 108/557] compat: Update forward compatibility horizon to 2020-05-18 PiperOrigin-RevId: 312039082 Change-Id: I03c04d8d9a395087e866a67ca58a263150b3f754 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 2a99a0774ad..88a26661f82 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 17) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 18) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From b2f3e8f5639a9370c9f8987a733ab3496eb87a97 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 18 May 2020 06:16:05 -0700 Subject: [PATCH 109/557] numerics_test.py: Move tfdbg2-specific test methods to debug_v2_ops_test.py PiperOrigin-RevId: 312065934 Change-Id: Idf576fd41ae96ed19f815bcce8848eabef036834 --- .../python/debug/lib/debug_v2_ops_test.py | 34 ++++++++++++++ .../python/kernel_tests/numerics_test.py | 46 ------------------- 2 files changed, 34 insertions(+), 46 deletions(-) diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index c76cbeeac6c..07721920f63 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_debug_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest @@ -680,6 +681,39 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): self.assertAllEqual(tensor_1, tensor_2) self.assertEqual(tensor_id_1, tensor_id_2) + def testCheckNumericsV2OpNegativeAndPositiveInf(self): + """Test that CheckNumericsV2 op distinguishes negative and positive infs.""" + with self.session(graph=ops.Graph()): + t1 = constant_op.constant([-1.0, 1.0]) + t2 = constant_op.constant([0.0, 0.0]) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r"pass through test.*had -Inf and \+Inf values"): + self.evaluate( + array_ops.check_numerics_v2(t1 / t2, message="pass through test")) + + def testCheckNumericsV2OpNegativeAndPositiveInfAndNaN(self): + """CheckNumericsV2 op distinguishes - & + infs when nan is present.""" + with self.session(graph=ops.Graph()): + t1 = constant_op.constant([-1.0, 1.0, 0.0]) + t2 = constant_op.constant([0.0, 0.0, 0.0]) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r"pass through test.*had -Inf, \+Inf, and NaN values"): + self.evaluate( + array_ops.check_numerics_v2(t1 / t2, message="pass through test")) + + def testCheckNumericsV2PositiveInfAndNaN(self): + """Test that CheckNumericsV2 op shows sign of inf when nan is present.""" + with self.session(graph=ops.Graph()): + t1 = constant_op.constant([0.0, 1.0]) + t2 = constant_op.constant([0.0, 0.0]) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r"pass through test.*had \+Inf and NaN values"): + self.evaluate( + array_ops.check_numerics_v2(t1 / t2, message="pass through test")) + if __name__ == "__main__": ops.enable_eager_execution() diff --git a/tensorflow/python/kernel_tests/numerics_test.py b/tensorflow/python/kernel_tests/numerics_test.py index 4d31cd45289..950658bc886 100644 --- a/tensorflow/python/kernel_tests/numerics_test.py +++ b/tensorflow/python/kernel_tests/numerics_test.py @@ -24,7 +24,6 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes -from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -132,51 +131,6 @@ class NumericsTest(test.TestCase): r"or `tf.while_loop\(\)`\."): numerics.add_check_numerics_ops() - def testCheckNumericsV2OpNegativeAndPositiveInf(self): - """Test that CheckNumericsV2 op distinguishes negative and positive infs.""" - with self.session(graph=ops.Graph()): - t1 = constant_op.constant([-1.0, 1.0]) - t2 = constant_op.constant([0.0, 0.0]) - checked = array_ops.check_numerics_v2( - t1 / t2, message="pass through test") - caught = None - try: - self.evaluate(checked) - except errors.InvalidArgumentError as error: - caught = error - self.assertIn("had -Inf and +Inf values", caught.message) - self.assertIn("pass through test", caught.message) - - def testCheckNumericsV2OpNegativeAndPositiveInfAndNaN(self): - """CheckNumericsV2 op distinguishes - & + infs when nan is present.""" - with self.session(graph=ops.Graph()): - t1 = constant_op.constant([-1.0, 1.0, 0.0]) - t2 = constant_op.constant([0.0, 0.0, 0.0]) - checked = array_ops.check_numerics_v2( - t1 / t2, message="pass through test") - caught = None - try: - self.evaluate(checked) - except errors.InvalidArgumentError as error: - caught = error - self.assertIn("had -Inf, +Inf, and NaN values", caught.message) - self.assertIn("pass through test", caught.message) - - def testCheckNumericsV2PositiveInfAndNaN(self): - """Test that CheckNumericsV2 op shows sign of inf when nan is present.""" - with self.session(graph=ops.Graph()): - t1 = constant_op.constant([0.0, 1.0]) - t2 = constant_op.constant([0.0, 0.0]) - checked = array_ops.check_numerics_v2( - t1 / t2, message="pass through test") - caught = None - try: - self.evaluate(checked) - except errors.InvalidArgumentError as error: - caught = error - self.assertIn("had +Inf and NaN values", caught.message) - self.assertIn("pass through test", caught.message) - if __name__ == "__main__": # TODO(b/130689556): XLA CPU does not honor inf/nan which causes problems From fb416f16e2b01252326816bb311c3e6165d13bcf Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 18 May 2020 06:28:20 -0700 Subject: [PATCH 110/557] [tfdbg] Fix source_utils_test in Python 3.8+ This is related to https://bugs.python.org/issue12458 In python 3.8, traceback reports the first instead of last line in a multi-line continuation block. Certain parts of source_utils_test.py assume that traceback always returns the last line, which is true all the way up to 3.7. In order to fix this, we use the `ast` module to extract the lineno of the first line in a multi-line continuation block. PiperOrigin-RevId: 312067389 Change-Id: I8a3ac129b3d75230a3eedd64c3605779dcab5336 --- tensorflow/python/debug/BUILD | 1 - .../python/debug/lib/source_utils_test.py | 38 ++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 956e90999c7..1ef0504ecb8 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -840,7 +840,6 @@ py_test( python_version = "PY3", srcs_version = "PY2AND3", tags = [ - "no_oss_py38", #TODO(b/151449908) "no_windows", ], deps = [ diff --git a/tensorflow/python/debug/lib/source_utils_test.py b/tensorflow/python/debug/lib/source_utils_test.py index faf2365fc9c..89964a21ba7 100644 --- a/tensorflow/python/debug/lib/source_utils_test.py +++ b/tensorflow/python/debug/lib/source_utils_test.py @@ -18,7 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import ast import os +import sys import tempfile import zipfile @@ -43,7 +45,41 @@ from tensorflow.python.util import tf_inspect def line_number_above(): - return tf_inspect.stack()[1][2] - 1 + """Get lineno of the AST node immediately above this function's call site. + + It is assumed that there is no empty line(s) between the call site and the + preceding AST node. + + Returns: + The lineno of the preceding AST node, at the same level of the AST. + If the preceding AST spans multiple lines: + - In Python 3.8+, the lineno of the first line is returned. + - In older Python versions, the lineno of the last line is returned. + """ + # https://bugs.python.org/issue12458: In Python 3.8, traceback started + # to return the lineno of the first line of a multi-line continuation block, + # instead of that of the last line. Therefore, in Python 3.8+, we use `ast` to + # get the lineno of the first line. + call_site_lineno = tf_inspect.stack()[1][2] + if sys.version_info < (3, 8): + return call_site_lineno - 1 + else: + with open(__file__, "rb") as f: + source_text = f.read().decode("utf-8") + source_tree = ast.parse(source_text) + prev_node = _find_preceding_ast_node(source_tree, call_site_lineno) + return prev_node.lineno + + +def _find_preceding_ast_node(node, lineno): + """Find the ast node immediately before and not including lineno.""" + for i, child_node in enumerate(node.body): + if child_node.lineno == lineno: + return node.body[i - 1] + if hasattr(child_node, "body"): + found_node = _find_preceding_ast_node(child_node, lineno) + if found_node: + return found_node class GuessIsTensorFlowLibraryTest(test_util.TensorFlowTestCase): From ff2019a216aed7bbb1e30432b47abcfe5567f0b4 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 18 May 2020 07:06:15 -0700 Subject: [PATCH 111/557] Optimize multiply by quantize multiplier. PiperOrigin-RevId: 312072311 Change-Id: I7d01be9aa8f1a238c6887d4770a1090899337383 --- .../internal/optimized/optimized_ops.h | 82 ++++++------------- 1 file changed, 27 insertions(+), 55 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index b18f0f4bb5a..64598d70ee3 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -201,63 +201,35 @@ MatrixMap MapAsMatrixWithGivenNumberOfRows(Scalar* data, // MultiplyByQuantizedMultipler. #ifdef USE_NEON inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows( - int32x4x4_t input_val, int32 quantized_multiplier, int shift) { - using gemmlowp::RoundingDivideByPOT; - using gemmlowp::SaturatingRoundingDoublingHighMul; - const int left_shift = shift > 0 ? shift : 0; - const int right_shift = shift > 0 ? 0 : -shift; + int32x4x4_t input_val, int32 quantized_multiplier, int32 shift) { + const int left_shift = std::max(shift, 0); + const int right_shift = std::min(shift, 0); int32x4x4_t result; - // The vector type support for SaturatingRoundingDoublingHighMulth in gemmlowp - // is limited to NEON. -#ifdef GEMMLOWP_NEON - const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift); - result.val[0] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[0], left_shifted_one_dup), - quantized_multiplier), - right_shift); - result.val[1] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[1], left_shifted_one_dup), - quantized_multiplier), - right_shift); - result.val[2] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[2], left_shifted_one_dup), - quantized_multiplier), - right_shift); - result.val[3] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[3], left_shifted_one_dup), - quantized_multiplier), - right_shift); -#else - for (int i = 0; i < 4; ++i) { - int32_t vals[4]; - vals[0] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 0) * (1 << left_shift), - quantized_multiplier), - right_shift); - vals[1] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 1) * (1 << left_shift), - quantized_multiplier), - right_shift); - vals[2] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 2) * (1 << left_shift), - quantized_multiplier), - right_shift); - vals[3] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 3) * (1 << left_shift), - quantized_multiplier), - right_shift); - result.val[i] = vld1q_s32(reinterpret_cast(&vals)); - } -#endif + int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier); + int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + result.val[0] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[1] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[2] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[3] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), + multiplier_dup), + right_shift_dup); + return result; } #endif From b5ed51fb220fa85b96268b392fe7f60804c004c3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 07:37:15 -0700 Subject: [PATCH 112/557] Resolve trivial aliases for portable TensorFlow targets. PiperOrigin-RevId: 312076343 Change-Id: I49adacfaea505bed1edb4ca51776057474d2a4ca --- tensorflow/tensorflow.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 9e89094f4e7..d72bdf58186 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -874,7 +874,7 @@ def tf_gen_op_wrappers_cc( clean_dep("//tensorflow/core:ops"), clean_dep("//tensorflow/core:protos_all_cc"), ]) + if_android([ - clean_dep("//tensorflow/core:android_tensorflow_lib"), + clean_dep("//tensorflow/core:portable_tensorflow_lib"), ]), copts = tf_copts(), alwayslink = 1, @@ -891,7 +891,7 @@ def tf_gen_op_wrappers_cc( clean_dep("//tensorflow/core:ops"), clean_dep("//tensorflow/core:protos_all_cc"), ]) + if_android([ - clean_dep("//tensorflow/core:android_tensorflow_lib"), + clean_dep("//tensorflow/core:portable_tensorflow_lib"), ]), copts = tf_copts(), alwayslink = 1, From ea113ef6cdbd34203f8f951af8621dbc1e4572e6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 07:41:37 -0700 Subject: [PATCH 113/557] Integrate LLVM at https://github.com/llvm/llvm-project/commit/a2a4e5aae894 PiperOrigin-RevId: 312076934 Change-Id: I12015eb4ec1278668834ca8a687d290a00eba112 --- tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index c2b11819448..6375bf7341f 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -292,7 +292,7 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type, llvm::AllocaInst* alloca = b->CreateAlloca(type, element_count, AsStringRef(name)); if (alignment != 0) { - alloca->setAlignment(llvm::MaybeAlign(alignment)); + alloca->setAlignment(llvm::Align(alignment)); } return alloca; } From f40a063d84df3f4e0ed2a2fc78d8b79f203a03b4 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 18 May 2020 07:46:08 -0700 Subject: [PATCH 114/557] [TF:TRT] Enhance InstantiateBuildAndRun to support the case where the input type and output type are not the same. This is to prepare for a change to enhance the TF-TRT bridge to support the Cast operations that can be represented via IIdentityLayer. PiperOrigin-RevId: 312077452 Change-Id: Iab6bfb54d6a346eef158785f61a1311559cee855 --- .../tf2tensorrt/convert/convert_nodes_test.cc | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 884ed7a5771..82c02c17e93 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1712,7 +1712,7 @@ INSTANTIATE_TEST_CASE_P( // Builds and runs the converted network. Checks output tensor shape. Tests // output values using a matcher. -template +template void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, const TestParamBase& p, const std::vector& input_vec, @@ -1731,12 +1731,14 @@ void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, // runtime errors. return; } - typedef typename EnumToDataType::Type T; + typedef typename EnumToDataType::Type Tin; TensorShape shape; TF_EXPECT_OK(TensorShapeUtils::MakeShape(p.input_dims, &shape)); const DataVec input_data{ - {"input", test->AsTensor(CastTestVector(input_vec), shape)}}; - DataVec output_data{{name, test->ConstructTensor(6)}}; + {"input", + test->AsTensor(CastTestVector(input_vec), shape)}}; + typedef typename EnumToDataType::Type Tout; + DataVec output_data{{name, test->ConstructTensor(6)}}; test->BuildAndRun(input_data, &output_data); // Check the shape of the actual output tensor TF_EXPECT_OK(TensorShapeUtils::MakeShape(p.expected_output_dims, &shape)); @@ -1744,7 +1746,7 @@ void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, << "Expected shape: " << shape.DebugString() << ", actual shape" << output_data[0].tensor.shape().DebugString(); // Cast the output to float and compare to expected output - auto out_span = GetSpanForData(output_data[0]); + auto out_span = GetSpanForData(output_data[0]); std::vector casted_output(out_span.begin(), out_span.end()); EXPECT_THAT(casted_output, matcher); } @@ -1754,16 +1756,35 @@ void InstantiateBuildAndRun(DataType tf_dtype, const string& name, const std::vector& input_vec, const Matcher>& matcher) { if (tf_dtype == DT_FLOAT) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); } else if (tf_dtype == DT_HALF) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); } else if (tf_dtype == DT_INT32) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); } else { FAIL() << "Test not supported for " << tf_dtype; } } +void InstantiateBuildAndRun(DataType input_tf_dtype, DataType output_tf_dtype, + const string& name, OpConverterTest* test, + const TestParamBase& p, + const std::vector& input_vec, + const Matcher>& matcher) { + if (input_tf_dtype == output_tf_dtype) { + InstantiateBuildAndRun(input_tf_dtype, name, test, p, input_vec, matcher); + } else if (input_tf_dtype == DT_HALF && output_tf_dtype) { + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); + } else { + FAIL() << "Test not supported for input " << input_tf_dtype << " output " + << output_tf_dtype; + } +} + template void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { out->Clear(); From c4e877d94a0f3ea9506c6c641ecea816d6af6113 Mon Sep 17 00:00:00 2001 From: Marcin Sielski Date: Mon, 18 May 2020 16:50:03 +0200 Subject: [PATCH 115/557] Address issues identified during review Why: * Improve build instruction on RPI. This change addresses the need by: * --depth 1 removal for git clone, * change name of the directory from tensor_src to tensorflow_src, * improve PATH setup in case other cross-tools are installed, * change the compilator version used to build the tensorflow package. --- tensorflow/lite/g3doc/guide/build_rpi.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/build_rpi.md b/tensorflow/lite/g3doc/guide/build_rpi.md index a1724258118..4a39f4e7677 100644 --- a/tensorflow/lite/g3doc/guide/build_rpi.md +++ b/tensorflow/lite/g3doc/guide/build_rpi.md @@ -23,7 +23,7 @@ To cross compile TensorFlow Lite follow the steps: 1. Clone official Raspberry Pi cross-compilation toolchain: ```bash - git clone --depth 1 https://github.com/raspberrypi/tools.git rpi_tools + git clone https://github.com/raspberrypi/tools.git rpi_tools ``` 2. Clone TensorFlow repository: @@ -39,7 +39,7 @@ To cross compile TensorFlow Lite follow the steps: build dependencies: ```bash - cd tensor_src && ./tensorflow/lite/tools/make/download_dependencies.sh + cd tensorflow_src && ./tensorflow/lite/tools/make/download_dependencies.sh ``` **Note:** You only need to do this once. @@ -47,7 +47,7 @@ build dependencies: 4. To build ARMv7 binary for Raspberry Pi 2, 3 and 4 execute: ```bash - PATH=$PATH:../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/ ./tensorflow/lite/tools/make/build_rpi_lib.sh + PATH=../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin:$PATH ./tensorflow/lite/tools/make/build_rpi_lib.sh ``` **Note:** This should compile a static library in: @@ -56,7 +56,7 @@ build dependencies: 5. To build ARMv6 binary for Raspberry Pi Zero execute: ```bash - PATH=$PATH:../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/ ./tensorflow/lite/tools/make/build_rpi_lib.sh TARGET_ARCH=armv6 + PATH=../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin:$PATH ./tensorflow/lite/tools/make/build_rpi_lib.sh TARGET_ARCH=armv6 ``` **Note:** This should compile a static library in: @@ -64,7 +64,7 @@ build dependencies: ## Compile natively on Raspberry Pi -Instruction has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1): +Instruction has been tested on Raspberry Pi Zero, Raspbian GNU/Linux 10 (buster), gcc version 8.3.0 (Raspbian 8.3.0-6+rpi1): To natively compile TensorFlow Lite follow the steps: @@ -78,7 +78,7 @@ To natively compile TensorFlow Lite follow the steps: build dependencies: ```bash - cd tensor_src && ./tensorflow/lite/tools/make/download_dependencies.sh + cd tensorflow_src && ./tensorflow/lite/tools/make/download_dependencies.sh ``` **Note:** You only need to do this once. From 50fcac47a2652459a7f9b71255cfa1cf0077447b Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 18 May 2020 07:49:05 -0700 Subject: [PATCH 116/557] Optimize quantized mul. PiperOrigin-RevId: 312077803 Change-Id: Ib6bbf261834a828590748e2c39ad146bad7d80ae --- .../internal/optimized/integer_ops/mul.h | 139 ++++++++++++------ 1 file changed, 97 insertions(+), 42 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h index 18aeef4c8b5..0d385ec1656 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h @@ -38,49 +38,81 @@ inline void MulElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_GT(params.output_offset, -256); TFLITE_DCHECK_LT(params.output_offset, 256); #ifdef USE_NEON - const auto input1_offset_vector = vdupq_n_s16(params.input1_offset); - const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); - const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset); + const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset); const auto output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); + vdupq_n_s8(params.quantized_activation_min); const auto output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); + vdupq_n_s8(params.quantized_activation_max); const int left_shift = std::max(0, params.output_shift); const int right_shift = std::max(0, -params.output_shift); const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); - for (; i <= size - 8; i += 8) { - // We load / store 8 at a time, multiplying as two sets of 4 int32s. - const auto input1_val_original = vld1_s8(input1_data + i); - const auto input2_val_original = vld1_s8(input2_data + i); - const auto input1_val_s16 = vmovl_s8(input1_val_original); - const auto input2_val_s16 = vmovl_s8(input2_val_original); - const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector); - const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector); + for (; i <= size - 16; i += 16) { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); - const auto input1_val_low = vget_low_s16(input1_val); - const auto input1_val_high = vget_high_s16(input1_val); - const auto input2_val_low = vget_low_s16(input2_val); - const auto input2_val_high = vget_high_s16(input2_val); + const int16x8_t input1_val_s16_high = + vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = + vmovl_s8(vget_low_s8(input1_val_original)); - auto p1 = vmull_s16(input2_val_low, input1_val_low); - auto p2 = vmull_s16(input2_val_high, input1_val_high); + const int16x8_t input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = + vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = + vaddq_s16(input1_val_s16_high, input1_offset_vector); + const int16x8_t input2_val_high = + vaddq_s16(input2_val_s16_high, input2_offset_vector); + const int16x8_t input1_val_low = + vaddq_s16(input1_val_s16_low, input1_offset_vector); + const int16x8_t input2_val_low = + vaddq_s16(input2_val_s16_low, input2_offset_vector); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + + auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high); + auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low); + auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high); + auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low); p1 = vshlq_s32(p1, left_shift_vec); p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; p1 = RoundingDivideByPOT(p1, right_shift); p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); - const auto p = - vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(p))); - vst1_s8(output_data + i, clamped); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON @@ -117,40 +149,63 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); const auto output_offset_vector = vdupq_n_s16(params.output_offset); const auto output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); + vdupq_n_s8(params.quantized_activation_min); const auto output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); + vdupq_n_s8(params.quantized_activation_max); const int left_shift = std::max(0, params.output_shift); const int right_shift = std::max(0, -params.output_shift); const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); - for (; i <= size - 8; i += 8) { - // We load / store 8 at a time, multiplying as two sets of 4 int32s. - const auto input2_val_original = vld1_s8(input2_data + i); - const auto input2_val_s16 = vmovl_s8(input2_val_original); - const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector); + for (; i <= size - 16; i += 16) { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. + const auto input2_val_original = vld1q_s8(input2_data + i); + const auto input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); - const auto input2_val_low = vget_low_s16(input2_val); - const auto input2_val_high = vget_high_s16(input2_val); + const auto input2_val_high = + vaddq_s16(input2_val_s16_high, input2_offset_vector); + const auto input2_val_low = + vaddq_s16(input2_val_s16_low, input2_offset_vector); - auto p1 = vmull_n_s16(input2_val_low, input1_val); - auto p2 = vmull_n_s16(input2_val_high, input1_val); + const auto input2_val_low_low = vget_low_s16(input2_val_low); + const auto input2_val_low_high = vget_high_s16(input2_val_low); + const auto input2_val_high_low = vget_low_s16(input2_val_high); + const auto input2_val_high_high = vget_high_s16(input2_val_high); + + auto p1 = vmull_n_s16(input2_val_high_high, input1_val); + auto p2 = vmull_n_s16(input2_val_high_low, input1_val); + auto p3 = vmull_n_s16(input2_val_low_high, input1_val); + auto p4 = vmull_n_s16(input2_val_low_low, input1_val); p1 = vshlq_s32(p1, left_shift_vec); p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; p1 = RoundingDivideByPOT(p1, right_shift); p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); - const auto p = - vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(p))); - vst1_s8(output_data + i, clamped); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON From 454195592520a68033aaf123c083e1ff7d9bb719 Mon Sep 17 00:00:00 2001 From: Marcin Sielski Date: Mon, 18 May 2020 16:58:01 +0200 Subject: [PATCH 117/557] Add clone step to native build instrunction. Why: * Improve the documentation. This change addresses the need by: * Add clone repository step, * Change rpi_armv7 to rpi_armv6 . --- tensorflow/lite/g3doc/guide/build_rpi.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/build_rpi.md b/tensorflow/lite/g3doc/guide/build_rpi.md index 4a39f4e7677..c75b39cd7e5 100644 --- a/tensorflow/lite/g3doc/guide/build_rpi.md +++ b/tensorflow/lite/g3doc/guide/build_rpi.md @@ -29,7 +29,7 @@ To cross compile TensorFlow Lite follow the steps: 2. Clone TensorFlow repository: ```bash - git clone --depth 1 https://github.com/tensorflow/tensorflow.git tensorflow_src + git clone https://github.com/tensorflow/tensorflow.git tensorflow_src ``` @@ -74,7 +74,14 @@ To natively compile TensorFlow Lite follow the steps: sudo apt-get install build-essential ``` -2. Run following script at the root of the TensorFlow repository to download all the +2. Clone TensorFlow repository: + + ```bash + git clone https://github.com/tensorflow/tensorflow.git tensorflow_src + + ``` + +3. Run following script at the root of the TensorFlow repository to download all the build dependencies: ```bash @@ -83,11 +90,11 @@ build dependencies: **Note:** You only need to do this once. -3. You should then be able to compile TensorFlow Lite with: +4. You should then be able to compile TensorFlow Lite with: ```bash ./tensorflow/lite/tools/make/build_rpi_lib.sh ``` **Note:** This should compile a static library in: - `tensorflow/lite/tools/make/gen/lib/rpi_armv7/libtensorflow-lite.a`. + `tensorflow/lite/tools/make/gen/lib/rpi_armv6/libtensorflow-lite.a`. From 55aee9e55084b309d5a01dae6685d4622482d6df Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 18 May 2020 08:55:02 -0700 Subject: [PATCH 118/557] [TF:TRT] Add utilities for converting between TF types and TRT types. PiperOrigin-RevId: 312087947 Change-Id: Ie4c47ab5c6aae97af5a83bba06e3de0637752ecf --- .../tf2tensorrt/convert/convert_nodes_test.cc | 32 ++++++----------- .../compiler/tf2tensorrt/convert/utils.cc | 35 +++++++++++++++++++ .../compiler/tf2tensorrt/convert/utils.h | 3 ++ 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 82c02c17e93..964370af6be 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -137,30 +137,18 @@ std::ostream& operator<<(std::ostream& os, const std::vector& v) { return os; } -nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) { - switch (tf_dtype) { - case DT_FLOAT: - return nvinfer1::DataType::kFLOAT; - case DT_HALF: - return nvinfer1::DataType::kHALF; - case DT_INT32: - return nvinfer1::DataType::kINT32; - default: - QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype); - } +nvinfer1::DataType TfDataTypeToTrt(DataType tf_type) { + nvinfer1::DataType trt_type; + Status status = TfTypeToTrtType(tf_type, &trt_type); + EXPECT_EQ(status, Status::OK()); + return trt_type; } -DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) { - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - return DT_FLOAT; - case nvinfer1::DataType::kHALF: - return DT_HALF; - case nvinfer1::DataType::kINT32: - return DT_INT32; - default: - QCHECK(false) << "Unexpected data type " << static_cast(trt_dtype); - } +DataType TrtDataTypeToTf(nvinfer1::DataType trt_type) { + DataType tf_type; + Status status = TrtTypeToTfType(trt_type, &tf_type); + EXPECT_EQ(status, Status::OK()); + return tf_type; } NodeDef MakeNodeDef(const string& name, const string& op, diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index fb3ae6943d3..a4b64ec0dc5 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace tensorrt { @@ -185,6 +186,40 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, return Status::OK(); } +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type) { + switch (tf_type) { + case DT_FLOAT: + *trt_type = nvinfer1::DataType::kFLOAT; + break; + case DT_HALF: + *trt_type = nvinfer1::DataType::kHALF; + break; + case DT_INT32: + *trt_type = nvinfer1::DataType::kINT32; + break; + default: + return errors::Internal("Unsupported tensorflow type"); + } + return Status::OK(); +} + +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type) { + switch (trt_type) { + case nvinfer1::DataType::kFLOAT: + *tf_type = DT_FLOAT; + break; + case nvinfer1::DataType::kHALF: + *tf_type = DT_HALF; + break; + case nvinfer1::DataType::kINT32: + *tf_type = DT_INT32; + break; + default: + return errors::Internal("Invalid TRT type"); + } + return Status::OK(); +} + int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { int n_bindings = engine->getNbBindings(); int n_input = 0; diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 5d4cf1bb851..59eeb420134 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -106,6 +106,9 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, bool use_implicit_batch, int batch_size, TensorShape& shape); +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); + // Returns a string that includes compile time TensorRT library version // information {Maj, Min, Patch}. string GetLinkedTensorRTVersion(); From 46f7108d78c6a3c0854fe66ce1cd92e5ebb3d6e2 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Mon, 18 May 2020 09:08:29 -0700 Subject: [PATCH 119/557] Internal change PiperOrigin-RevId: 312090528 Change-Id: I474709513b01db8c24c50fd670029451c51cb622 --- tensorflow/python/keras/layers/embeddings.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py index 3f57fd6cb63..e30e93f02dc 100644 --- a/tensorflow/python/keras/layers/embeddings.py +++ b/tensorflow/python/keras/layers/embeddings.py @@ -129,8 +129,10 @@ class Embedding(Layer): # since it knows all kernels using the variable only exist on CPU. # When eager execution is enabled, the placement decision has to be made # right now. Checking for the presence of GPUs to avoid complicating the - # TPU codepaths which can handle sparse optimizers. - if context.executing_eagerly() and context.context().num_gpus(): + # TPU codepaths which can handle sparse optimizers. But if we are within + # a tf.function, we go back the graph mode logic and rely on the placer. + if (context.executing_eagerly() and context.context().num_gpus() and + not ops.inside_function()): with ops.device('cpu:0'): self.embeddings = self.add_weight( shape=(self.input_dim, self.output_dim), From 32165792a3ae4705f50d82329db0733aa01bb6ed Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 18 May 2020 09:23:09 -0700 Subject: [PATCH 120/557] [TF:TRT] Implement cast from fp16 to fp32 with IIdentityLayer. This is the first CL to implement the request in b/150285802. Add Cast op test to convert_nodes_test. PiperOrigin-RevId: 312093049 Change-Id: I77215cf6da104f51acc93de1b03e9a179db54f0a --- .../tf2tensorrt/convert/convert_nodes.cc | 106 +++++++++++++++--- .../tf2tensorrt/convert/convert_nodes.h | 2 + .../tf2tensorrt/convert/convert_nodes_test.cc | 21 +++- 3 files changed, 109 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index a43b16e9e6a..e791ff9ff60 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" @@ -795,6 +796,19 @@ nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { } } +Status TRT_TensorOrWeights::GetTfType(DataType* tf_type) const { + if (is_tensor()) { + nvinfer1::DataType trt_type = tensor()->getType(); + return TrtTypeToTfType(trt_type, tf_type); + } + + if (is_weights()) { + *tf_type = weights().GetTensor().dtype(); + return Status::OK(); + } + return errors::Internal("The object is probably not initialized"); +} + string TRT_TensorOrWeights::DebugString() const { string output = "TRT_TensorOrWeights(type="; if (is_tensor()) { @@ -1900,27 +1914,48 @@ Status CheckInputsWeights( return Status::OK(); } -Status AllowDataTypes(const OpConverterParams& params, - const std::set& allowed_dtypes, - const char* dtype_attr_name = "T") { - const auto& node_def = params.node_def; +Status GetNodeDefTfType(const NodeDef& node_def, DataType* tf_type, + const char* type_attr_name) { TFAttrs attrs(node_def); - if (!attrs.count(dtype_attr_name)) { - return errors::InvalidArgument("Attribute with name ", dtype_attr_name, + if (!attrs.count(type_attr_name)) { + return errors::InvalidArgument("Attribute with name ", type_attr_name, " not found."); } - const auto op_dtype = attrs.get(dtype_attr_name); - if (!allowed_dtypes.count(op_dtype)) { - // Build string list of allowed types. - std::ostringstream ss; - for (auto it = allowed_dtypes.begin(); it != allowed_dtypes.end(); ++it) { - if (it != allowed_dtypes.begin()) ss << ", "; - ss << DataTypeString(*it); - } - return errors::Unimplemented("Data type ", DataTypeString(op_dtype), + *tf_type = attrs.get(type_attr_name); + return Status::OK(); +} + +Status GetInputTfType(const OpConverterParams& params, DataType* tf_type, + int pos) { + const std::vector& inputs = params.inputs; + if (inputs.size() <= pos) { + return errors::Internal("Invalid input position"); + } + + return inputs[pos].GetTfType(tf_type); +} + +constexpr const char kOutputTypeAttrName[] = "T"; + +Status GetOutputTfType(const OpConverterParams& params, DataType* tf_type) { + return GetNodeDefTfType(params.node_def, tf_type, kOutputTypeAttrName); +} + +Status AllowDataTypes(const OpConverterParams& params, + const std::set& allowed_types, + const char* type_attr_name = kOutputTypeAttrName) { + const auto& node_def = params.node_def; + DataType tf_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(node_def, &tf_type, type_attr_name)); + if (!allowed_types.count(tf_type)) { + string allowed_types_string = absl::StrJoin( + allowed_types, ", ", [](string* out, const DataType& type) { + absl::StrAppendFormat(out, "%s", DataTypeString(type)); + }); + return errors::Unimplemented("Data type ", DataTypeString(tf_type), " is not supported for ", node_def.op(), - ", must be one of [", ss.str(), "], at ", - node_def.name()); + ", must be one of [", allowed_types_string, + "], at ", node_def.name()); } return Status::OK(); } @@ -4598,6 +4633,42 @@ Status ConvertUnpack(OpConverterParams* params) { return ConvertSplitHelper(params, inputs.at(0), tf_axis, num, true); } +// Supports cast fp16=>fp32 through IIdentityLayer. +Status ConvertCast(OpConverterParams* params) { + const NodeDef& node_def = params->node_def; + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); + auto unsupport_cast_error = [&]() { + return errors::Unimplemented("Cast op: ", node_def.op(), + " not supported at: ", node_def.name()); + }; + + DataType input_type; + TF_RETURN_IF_ERROR(GetInputTfType(*params, &input_type, 0)); + if (input_type != DataType::DT_HALF) { + return unsupport_cast_error(); + } + + DataType output_type; + TF_RETURN_IF_ERROR(GetOutputTfType(*params, &output_type)); + if (output_type != DataType::DT_FLOAT) { + return unsupport_cast_error(); + } + + if (params->validation_only) return Status::OK(); + + nvinfer1::ITensor* input = params->inputs.at(0).tensor(); + nvinfer1::IIdentityLayer* layer = + params->converter->network()->addIdentity(*input); + layer->setPrecision(nvinfer1::DataType::kFLOAT); + + if (layer->getOutput(0)->getType() != nvinfer1::DataType::kFLOAT) { + return errors::Internal("IIdentityLayer doesn't work as expected"); + } + + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); +} + Status ConvertConcat(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -5675,6 +5746,7 @@ static void RegisterValidatableOpConverters( (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS; #endif (*registration)["AddN"] = ConvertAddN; + (*registration)["Cast"] = ConvertCast; (*registration)["ConcatV2"] = ConvertConcat; (*registration)["Const"] = ConvertConst; (*registration)["Conv2D"] = ConvertConv2D; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 2092aecd657..2fe8eec9675 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -294,6 +294,8 @@ class TRT_TensorOrWeights { nvinfer1::Dims GetTrtDims() const; + Status GetTfType(DataType* tf_type) const; + int batch_size() const { return batch_size_; } string DebugString() const; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 964370af6be..1efc31f9e24 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -5147,6 +5147,14 @@ NodeDef CreateUnaryOp() { return T(s.WithOpName("my_unary"), input).operation.node()->def(); } +NodeDef CreateCastOp() { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_HALF); + return ops::Cast(s.WithOpName("my_unary"), input, DT_FLOAT) + .operation.node() + ->def(); +} + TEST_P(ParameterizedOpConverterTest, ConvertUnary) { const auto& spec = GetParam(); const TrtTestMode trt_mode = std::get<0>(spec); @@ -5174,6 +5182,7 @@ TEST_P(ParameterizedOpConverterTest, ConvertUnary) { ADD_OP("Asinh", ops::Asinh, std::asinh); ADD_OP("Atan", ops::Atan, std::atan); ADD_OP("Atanh", ops::Atanh, std::atanh); + op_map["Cast"] = std::make_pair(CreateCastOp, [](float x) { return x; }); ADD_OP("Ceil", ops::Ceil, std::ceil); ADD_OP("Cos", ops::Cos, std::cos); ADD_OP("Cosh", ops::Cosh, std::cosh); @@ -5212,7 +5221,13 @@ TEST_P(ParameterizedOpConverterTest, ConvertUnary) { } NodeDef node_def = op_map[op_name].first(); - AddTestTensor("input", p.input_dims, TfDataTypeToTrt(tf_dtype), trt_mode); + // TODO(bixia): we assume this test is only instantiated for DT_FLOAT for + // now. Need to find a better way to express input and output types. + DataType input_tf_dtype = op_name == "Cast" ? DT_HALF : tf_dtype; + DataType output_tf_dtype = tf_dtype; + + AddTestTensor("input", p.input_dims, TfDataTypeToTrt(input_tf_dtype), + trt_mode); RunValidationAndConversion(node_def, Status::OK(), "my_unary", p.expected_output_dims); @@ -5220,8 +5235,8 @@ TEST_P(ParameterizedOpConverterTest, ConvertUnary) { std::vector output; std::transform(input_values.begin(), input_values.end(), std::back_inserter(output), op_map[op_name].second); - InstantiateBuildAndRun(tf_dtype, "my_unary", this, p, input_values, - ArrayFloatNear(output, 0.0001, true)); + InstantiateBuildAndRun(input_tf_dtype, output_tf_dtype, "my_unary", this, p, + input_values, ArrayFloatNear(output, 0.0001, true)); } } From 9c49cda7d988680985aa194703edd72df60a57bc Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Mon, 18 May 2020 09:27:00 -0700 Subject: [PATCH 121/557] Update release notes for the 1.15.3, 2.0.2 and 2.1.1 patch releases. PiperOrigin-RevId: 312093793 Change-Id: I476369d7d3f8e8d54dd10f412f25049265fc688f --- RELEASE.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 6c8921cf492..f251f6ceffa 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,28 @@ +# Release 2.1.1 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) +* Fixes a versioning bug which causes Keras layers from TF 1.x to be used instead of those from TF 2.x + +# Release 2.0.2 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + +# Release 1.15.3 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + # Release 2.2.0 TensorFlow 2.2 discontinues support for Python 2, [previously announced](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ) as following [Python 2's EOL on January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). From cfdb9434054da65025c25d5dbcda029c16faf868 Mon Sep 17 00:00:00 2001 From: Ilya Tokar Date: Mon, 18 May 2020 09:35:23 -0700 Subject: [PATCH 122/557] Tweak round_to_bfloat16 to make it vectorizable. This simplifies control flow by handling positive and negative denormals separately. Should be ~40% faster. PiperOrigin-RevId: 312095390 Change-Id: I5b6388e48b8c217edb0fc4fe14c3add64fb52c65 --- tensorflow/core/lib/bfloat16/bfloat16.h | 327 ++++++++++++------------ 1 file changed, 163 insertions(+), 164 deletions(-) diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h index 4c38738593f..54d78480066 100644 --- a/tensorflow/core/lib/bfloat16/bfloat16.h +++ b/tensorflow/core/lib/bfloat16/bfloat16.h @@ -194,171 +194,170 @@ struct bfloat16 { input = f.u; bfloat16 output; + // Fast rounding algorithm that rounds a half value to nearest even. This + // reduces expected error when we convert a large number of floats. Here + // is how it works: + // + // Definitions: + // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits + // with the following tags: + // + // Sign | Exp (8 bits) | Frac (23 bits) + // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT + // + // S: Sign bit. + // E: Exponent bits. + // F: First 6 bits of fraction. + // L: Least significant bit of resulting bfloat16 if we truncate away the + // rest of the float32. This is also the 7th bit of fraction + // R: Rounding bit, 8th bit of fraction. + // T: Sticky bits, rest of fraction, 15 bits. + // + // To round half to nearest even, there are 3 cases where we want to round + // down (simply truncate the result of the bits away, which consists of + // rounding bit and sticky bits) and two cases where we want to round up + // (truncate then add one to the result). + // + // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of + // 1s) as the rounding bias, adds the rounding bias to the input, then + // truncates the last 16 bits away. + // + // To understand how it works, we can analyze this algorithm case by case: + // + // 1. L = 0, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input may create any carry, depending on + // whether there is any value set to 1 in T bits. + // - R may be set to 1 if there is a carry. + // - L remains 0. + // - Note that this case also handles Inf and -Inf, where all fraction + // bits, including L, R and Ts are all 0. The output remains Inf after + // this algorithm. + // + // 2. L = 1, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits but + // adds 1 to rounding bit. + // - L remains 1. + // + // 3. L = 0, R = 1, all of T are 0: + // Expect: round down, this is exactly at half, the result is already + // even (L=0). + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input sets all sticky bits to 1, but + // doesn't create a carry. + // - R remains 1. + // - L remains 0. + // + // 4. L = 1, R = 1: + // Expect: round up, this is exactly at half, the result needs to be + // round to the next even number. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits, but + // creates a carry from rounding bit. + // - The carry sets L to 0, creates another carry bit and propagate + // forward to F bits. + // - If all the F bits are 1, a carry then propagates to the exponent + // bits, which then creates the minimum value with the next exponent + // value. Note that we won't have the case where exponents are all 1, + // since that's either a NaN (handled in the other if condition) or inf + // (handled in case 1). + // + // 5. L = 0, R = 1, any of T is 1: + // Expect: round up, this is greater than half. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input creates a carry from sticky bits, + // sets rounding bit to 0, then create another carry. + // - The second carry sets L to 1. + // + // Examples: + // + // Exact half value that is already even: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000 + // + // This falls into case 3. We truncate the rest of 16 bits and no + // carry is created into F and L: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // Exact half value, round to next even number: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000 + // + // This falls into case 4. We create a carry from R and T, + // which then propagates into L and F: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // + // Max denormal value round to min normal value: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 + // + // Max normal value round to Inf: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 + // + // + // Least significant bit of resulting bfloat. + uint32_t lsb = (input >> 16) & 1; + uint32_t rounding_bias = 0x7fff + lsb; + input += rounding_bias; + output.value = static_cast(input >> 16); + if ((f.u & 0xff800000u) == 0) { + // Flush positive denormal to 0 + output.value = 0x0; + } + if ((f.u & 0xff800000u) == 0x80000000u) { + // Flush negative denormal to -0 + output.value = 0x8000; + } if (float_isnan(v)) { - // If the value is a NaN, squash it to a qNaN with msb of fraction set, - // this makes sure after truncation we don't end up with an inf. - // - // qNaN magic: All exponent bits set + most significant bit of fraction - // set. - output.value = 0x7fc0; - } else if (std::fabs(v) < std::numeric_limits::min()) { - // Flush denormal to +/- 0.0 - output.value = std::signbit(v) ? 0x8000 : 0; - } else { - // Fast rounding algorithm that rounds a half value to nearest even. This - // reduces expected error when we convert a large number of floats. Here - // is how it works: - // - // Definitions: - // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits - // with the following tags: - // - // Sign | Exp (8 bits) | Frac (23 bits) - // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT - // - // S: Sign bit. - // E: Exponent bits. - // F: First 6 bits of fraction. - // L: Least significant bit of resulting bfloat16 if we truncate away the - // rest of the float32. This is also the 7th bit of fraction - // R: Rounding bit, 8th bit of fraction. - // T: Sticky bits, rest of fraction, 15 bits. - // - // To round half to nearest even, there are 3 cases where we want to round - // down (simply truncate the result of the bits away, which consists of - // rounding bit and sticky bits) and two cases where we want to round up - // (truncate then add one to the result). - // - // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of - // 1s) as the rounding bias, adds the rounding bias to the input, then - // truncates the last 16 bits away. - // - // To understand how it works, we can analyze this algorithm case by case: - // - // 1. L = 0, R = 0: - // Expect: round down, this is less than half value. - // - // Algorithm: - // - Rounding bias: 0x7fff + 0 = 0x7fff - // - Adding rounding bias to input may create any carry, depending on - // whether there is any value set to 1 in T bits. - // - R may be set to 1 if there is a carry. - // - L remains 0. - // - Note that this case also handles Inf and -Inf, where all fraction - // bits, including L, R and Ts are all 0. The output remains Inf after - // this algorithm. - // - // 2. L = 1, R = 0: - // Expect: round down, this is less than half value. - // - // Algorithm: - // - Rounding bias: 0x7fff + 1 = 0x8000 - // - Adding rounding bias to input doesn't change sticky bits but - // adds 1 to rounding bit. - // - L remains 1. - // - // 3. L = 0, R = 1, all of T are 0: - // Expect: round down, this is exactly at half, the result is already - // even (L=0). - // - // Algorithm: - // - Rounding bias: 0x7fff + 0 = 0x7fff - // - Adding rounding bias to input sets all sticky bits to 1, but - // doesn't create a carry. - // - R remains 1. - // - L remains 0. - // - // 4. L = 1, R = 1: - // Expect: round up, this is exactly at half, the result needs to be - // round to the next even number. - // - // Algorithm: - // - Rounding bias: 0x7fff + 1 = 0x8000 - // - Adding rounding bias to input doesn't change sticky bits, but - // creates a carry from rounding bit. - // - The carry sets L to 0, creates another carry bit and propagate - // forward to F bits. - // - If all the F bits are 1, a carry then propagates to the exponent - // bits, which then creates the minimum value with the next exponent - // value. Note that we won't have the case where exponents are all 1, - // since that's either a NaN (handled in the other if condition) or inf - // (handled in case 1). - // - // 5. L = 0, R = 1, any of T is 1: - // Expect: round up, this is greater than half. - // - // Algorithm: - // - Rounding bias: 0x7fff + 0 = 0x7fff - // - Adding rounding bias to input creates a carry from sticky bits, - // sets rounding bit to 0, then create another carry. - // - The second carry sets L to 1. - // - // Examples: - // - // Exact half value that is already even: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000 - // - // This falls into case 3. We truncate the rest of 16 bits and no - // carry is created into F and L: - // - // Output: - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 - // - // Exact half value, round to next even number: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000 - // - // This falls into case 4. We create a carry from R and T, - // which then propagates into L and F: - // - // Output: - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 - // - // - // Max denormal value round to min normal value: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111 - // - // This falls into case 4. We create a carry from R and T, - // propagate into L and F, which then propagates into exponent - // bits: - // - // Output: - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 - // - // Max normal value round to Inf: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111 - // - // This falls into case 4. We create a carry from R and T, - // propagate into L and F, which then propagates into exponent - // bits: - // - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 - // - // - // Least significant bit of resulting bfloat. - uint32_t lsb = (input >> 16) & 1; - uint32_t rounding_bias = 0x7fff + lsb; - input += rounding_bias; - output.value = static_cast(input >> 16); + output.value = NAN_VALUE; } return output; } From dbc0fffedb506c12837a5eda0d87b01b659136ba Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Mon, 18 May 2020 09:35:47 -0700 Subject: [PATCH 123/557] Report remote target name for worker service RPCs. PiperOrigin-RevId: 312095453 Change-Id: I73fc7948f994426b8d62bdefd5573cfe3b5b793d --- .../rpc/grpc_remote_worker.cc | 16 ++++++++++------ .../distributed_runtime/rpc/grpc_remote_worker.h | 3 ++- .../distributed_runtime/rpc/grpc_worker_cache.cc | 6 +++--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc index 85431acdf0c..6e706179863 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc @@ -45,7 +45,7 @@ class GrpcRemoteWorker : public WorkerInterface { explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel, ::grpc::CompletionQueue* completion_queue, thread::ThreadPool* callback_threadpool, - WorkerCacheLogger* logger) + WorkerCacheLogger* logger, const string& target) : channel_(std::move(channel)), stub_(channel_), cq_(completion_queue), @@ -66,7 +66,8 @@ class GrpcRemoteWorker : public WorkerInterface { instancesource_(Method(GrpcWorkerMethod::kCompleteInstance)), getstepsequence_(Method(GrpcWorkerMethod::kGetStepSequence)), markrecvfinished_(Method(GrpcWorkerMethod::kMarkRecvFinished)), - logger_(logger) {} + logger_(logger), + target_(target) {} ~GrpcRemoteWorker() override {} @@ -273,7 +274,7 @@ class GrpcRemoteWorker : public WorkerInterface { bool fail_fast = true) { new RPCState( &stub_, cq_, method, *request, response, std::move(done), call_opts, - callback_threadpool_, /*max_retries=*/0, fail_fast); + callback_threadpool_, /*max_retries=*/0, fail_fast, &target_); } void IssueRequest(const protobuf::Message* request, TensorResponse* response, @@ -281,7 +282,8 @@ class GrpcRemoteWorker : public WorkerInterface { CallOptions* call_opts = nullptr) { new RPCState(&stub_, cq_, method, *request, response, std::move(done), call_opts, - callback_threadpool_); + callback_threadpool_, /*max_retries=*/0, + /*fail_fast=*/true, &target_); } void IssueMarkRecvFinishedRequest(int64 request_id) { @@ -321,6 +323,7 @@ class GrpcRemoteWorker : public WorkerInterface { // Support for logging. WorkerCacheLogger* logger_; + const string target_; TF_DISALLOW_COPY_AND_ASSIGN(GrpcRemoteWorker); }; @@ -328,9 +331,10 @@ class GrpcRemoteWorker : public WorkerInterface { WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel, ::grpc::CompletionQueue* completion_queue, thread::ThreadPool* callback_threadpool, - WorkerCacheLogger* logger) { + WorkerCacheLogger* logger, + const string& target) { return new GrpcRemoteWorker(std::move(channel), completion_queue, - callback_threadpool, logger); + callback_threadpool, logger, target); } } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h index c0a49ecfc38..97e590e0ad1 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h @@ -29,7 +29,8 @@ class WorkerInterface; WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel, ::grpc::CompletionQueue* completion_queue, thread::ThreadPool* callback_threadpool, - WorkerCacheLogger* logger); + WorkerCacheLogger* logger, + const string& target); } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc index f6b6e15a2ba..1d75728ddd2 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc @@ -69,9 +69,9 @@ class GrpcWorkerCache : public WorkerCachePartial { return nullptr; } size_t index = AssignWorkerToThread(target); - return NewGrpcRemoteWorker(channel, - worker_env_->GetCompletionQueue(index), - worker_env_->GetThreadPool(), &logger_); + return NewGrpcRemoteWorker( + channel, worker_env_->GetCompletionQueue(index), + worker_env_->GetThreadPool(), &logger_, target); } } From 1b2a65c15fed4a27bc94ebbce930feea455d927f Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 18 May 2020 09:46:53 -0700 Subject: [PATCH 124/557] Add legalization from hlo.dot to lhlo.dot PiperOrigin-RevId: 312097353 Change-Id: Ia8b0fef86c77426f54090354779c62163bf97426 --- .../mlir/xla/tests/hlo-legalize-to-lhlo.mlir | 12 ++++++++++++ .../mlir/xla/transforms/hlo_legalize_to_lhlo.cc | 1 + .../mlir/xla/transforms/map_hlo_to_lhlo_op.h | 1 + 3 files changed, 14 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index 53296b257ae..68f6d172afc 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -395,3 +395,15 @@ func @tanh_dyn(%arg0: tensor) { // CHECK: "xla_lhlo.tanh"(%arg0, %[[RESULT]]) : (memref, memref) -> () return } + +// ----- + +// CHECK-LABEL: func @dot +func @dot(%arg0: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> { +// CHECK-SAME: (%[[ARG0:.*]]: [[TYPE:.*]], +// CHECK-SAME: %[[RESULT:.*]]: [[TYPE]]) +// CHECK: "xla_lhlo.dot"(%[[ARG0]], %[[ARG0]], %{{.*}}) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () + %dot = "xla_hlo.dot"(%arg0, %arg0) + : (tensor<1024x1024xf32>, tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + return %dot : tensor<1024x1024xf32> + } diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 10f35768bbd..11b2ae65d8e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -362,6 +362,7 @@ void populateHLOToLHLOConversionPattern( HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h index fed21e9bafc..21b954a3eb4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h @@ -49,6 +49,7 @@ MAP_HLO_TO_LHLO(ConvertOp); MAP_HLO_TO_LHLO(CopyOp); MAP_HLO_TO_LHLO(CosOp); MAP_HLO_TO_LHLO(DivOp); +MAP_HLO_TO_LHLO(DotOp); MAP_HLO_TO_LHLO(ExpOp); MAP_HLO_TO_LHLO(ImagOp); MAP_HLO_TO_LHLO(IotaOp); From 0bf90cb2a8b241a728943d343f1cdd922e408c73 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 10:12:52 -0700 Subject: [PATCH 125/557] Enable (non-gradient) tests of tf.linalg.cholesky in eager mode. PiperOrigin-RevId: 312102967 Change-Id: Icefc46a8268413dfaec42109d4f57dd07f602a54 --- .../python/kernel_tests/cholesky_op_test.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py index 7d5f7715eb1..01c497a37ed 100644 --- a/tensorflow/python/kernel_tests/cholesky_op_test.py +++ b/tensorflow/python/kernel_tests/cholesky_op_test.py @@ -32,7 +32,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.ops.linalg import linalg from tensorflow.python.platform import benchmark @@ -91,7 +91,7 @@ def TriAngInvCompositeGrad(l, grad): class CholeskyOpTest(test.TestCase): - def _verifyCholeskyBase(self, sess, x, chol, verification): + def _verifyCholeskyBase(self, x, chol, verification): chol_np, verification_np = self.evaluate([chol, verification]) self.assertAllClose(x, verification_np) self.assertShapeEqual(x, chol) @@ -106,11 +106,11 @@ class CholeskyOpTest(test.TestCase): def _verifyCholesky(self, x): # Verify that LL^T == x. - with self.cached_session(use_gpu=True) as sess: - chol = linalg_ops.cholesky(x) - verification = math_ops.matmul(chol, chol, adjoint_b=True) - self._verifyCholeskyBase(sess, x, chol, verification) + chol = linalg_ops.cholesky(x) + verification = math_ops.matmul(chol, chol, adjoint_b=True) + self._verifyCholeskyBase(x, chol, verification) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testBasic(self): data = np.array([[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]) for dtype in (np.float32, np.float64): @@ -123,6 +123,7 @@ class CholeskyOpTest(test.TestCase): complex_data += data self._verifyCholesky(complex_data) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testBatch(self): simple_array = np.array([[[1., 0.], [0., 5.]]]) # shape (1, 2, 2) self._verifyCholesky(simple_array) @@ -144,21 +145,21 @@ class CholeskyOpTest(test.TestCase): matrices[i] = np.dot(matrices[i].T.conj(), matrices[i]) self._verifyCholesky(matrices) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNonSquareMatrix(self): - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky(np.array([[1., 2., 3.], [3., 4., 5.]])) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky( np.array([[[1., 2., 3.], [3., 4., 5.]], [[1., 2., 3.], [3., 4., 5.]] ])) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): tensor3 = constant_op.constant([1., 2.]) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky(tensor3) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky(tensor3) # The below invalid Cholesky call returns an error with TF Classic and just @@ -175,21 +176,23 @@ class CholeskyOpTest(test.TestCase): self._verifyCholesky( np.array([[1., -1., 0.], [-1., 1., -1.], [0., -1., 1.]])) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testEmpty(self): self._verifyCholesky(np.empty([0, 2, 2])) self._verifyCholesky(np.empty([2, 0, 0])) @test_util.run_deprecated_v1 def testConcurrentExecutesWithoutError(self): - with self.session(use_gpu=True) as sess: - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) - matrix1 = math_ops.matmul(matrix1, matrix1, adjoint_a=True) - matrix2 = math_ops.matmul(matrix2, matrix2, adjoint_a=True) - c1 = linalg_ops.cholesky(matrix1) - c2 = linalg_ops.cholesky(matrix2) - c1_val, c2_val = self.evaluate([c1, c2]) - self.assertAllClose(c1_val, c2_val) + seed = [42, 24] + matrix_shape = [5, 5] + matrix1 = stateless_random_ops.stateless_random_normal(matrix_shape, seed) + matrix2 = stateless_random_ops.stateless_random_normal(matrix_shape, seed) + matrix1 = math_ops.matmul(matrix1, matrix1, adjoint_a=True) + matrix2 = math_ops.matmul(matrix2, matrix2, adjoint_a=True) + c1 = linalg_ops.cholesky(matrix1) + c2 = linalg_ops.cholesky(matrix2) + c1_val, c2_val = self.evaluate([c1, c2]) + self.assertAllClose(c1_val, c2_val) class CholeskyGradTest(test.TestCase): From 83b85568fb5a5aade46a41909ee9a1b6f3643b57 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Mon, 18 May 2020 10:23:36 -0700 Subject: [PATCH 126/557] Support int8 in tflite_convert PiperOrigin-RevId: 312105323 Change-Id: I161b9b324e37f42f2026592f7c5bec8ac568c3d6 --- tensorflow/lite/python/tflite_convert.py | 6 ++- tensorflow/lite/python/tflite_convert_test.py | 39 +++++++++++++++---- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py index d0dd7313df3..c7504a3a638 100644 --- a/tensorflow/lite/python/tflite_convert.py +++ b/tensorflow/lite/python/tflite_convert.py @@ -65,6 +65,8 @@ def _parse_inference_type(value, flag): return lite_constants.FLOAT if value == "QUANTIZED_UINT8": return lite_constants.QUANTIZED_UINT8 + if value == "INT8": + return lite_constants.INT8 raise ValueError("Unsupported value for --{0}. Only FLOAT and " "QUANTIZED_UINT8 are supported.".format(flag)) @@ -352,12 +354,12 @@ def _get_tf1_flags(parser): parser.add_argument( "--inference_type", type=str.upper, - choices=["FLOAT", "QUANTIZED_UINT8"], + choices=["FLOAT", "QUANTIZED_UINT8", "INT8"], help="Target data type of real-number arrays in the output file.") parser.add_argument( "--inference_input_type", type=str.upper, - choices=["FLOAT", "QUANTIZED_UINT8"], + choices=["FLOAT", "QUANTIZED_UINT8", "INT8"], help=("Target data type of real-number input arrays. Allows for a " "different type for input arrays in the case of quantization.")) diff --git a/tensorflow/lite/python/tflite_convert_test.py b/tensorflow/lite/python/tflite_convert_test.py index 1e80907edbd..d6a35ba9248 100644 --- a/tensorflow/lite/python/tflite_convert_test.py +++ b/tensorflow/lite/python/tflite_convert_test.py @@ -98,8 +98,8 @@ class TfLiteConvertV1Test(TestModels): sess.close() flags_str = ('--graph_def_file={0} --input_arrays={1} ' - '--output_arrays={2}'.format(graph_def_file, - 'Placeholder', 'add')) + '--output_arrays={2}'.format(graph_def_file, 'Placeholder', + 'add')) self._run(flags_str, should_succeed=True) os.remove(graph_def_file) @@ -137,8 +137,31 @@ class TfLiteConvertV1Test(TestModels): sess.close() flags_str = ('--graph_def_file={0} --input_arrays={1} ' - '--output_arrays={2}'.format(graph_def_file, - 'random', 'add')) + '--output_arrays={2}'.format(graph_def_file, 'random', 'add')) + self._run(flags_str, should_succeed=True) + os.remove(graph_def_file) + + def testQATFrozenGraphDefInt8(self): + with ops.Graph().as_default(): + in_tensor_1 = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputA') + in_tensor_2 = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputB') + _ = array_ops.fake_quant_with_min_max_args( + in_tensor_1 + in_tensor_2, min=0., max=1., name='output', + num_bits=16) # INT8 inference type works for 16 bits fake quant. + sess = session.Session() + + # Write graph to file. + graph_def_file = self._getFilepath('model.pb') + write_graph(sess.graph_def, '', graph_def_file, False) + sess.close() + + flags_str = ('--inference_type=INT8 --std_dev_values=128,128 ' + '--mean_values=128,128 ' + '--graph_def_file={0} --input_arrays={1},{2} ' + '--output_arrays={3}'.format(graph_def_file, 'inputA', + 'inputB', 'output')) self._run(flags_str, should_succeed=True) os.remove(graph_def_file) @@ -166,8 +189,8 @@ class TfLiteConvertV1Test(TestModels): def testKerasFileMLIR(self): keras_file = self._getKerasModelFile() - flags_str = ('--keras_model_file={} --experimental_new_converter' - .format(keras_file)) + flags_str = ( + '--keras_model_file={} --experimental_new_converter'.format(keras_file)) self._run(flags_str, should_succeed=True) os.remove(keras_file) @@ -299,8 +322,8 @@ class TfLiteConvertV2Test(TestModels): def testKerasFileMLIR(self): keras_file = self._getKerasModelFile() - flags_str = ('--keras_model_file={} --experimental_new_converter' - .format(keras_file)) + flags_str = ( + '--keras_model_file={} --experimental_new_converter'.format(keras_file)) self._run(flags_str, should_succeed=True) os.remove(keras_file) From dec7430b13213974928ae395322feabc788b1664 Mon Sep 17 00:00:00 2001 From: Kibeom Kim Date: Mon, 18 May 2020 10:38:01 -0700 Subject: [PATCH 127/557] Ensure that tf_py_test tfrt test is not enabled for open source build by introducing tfrt_enabled_internal flag. PiperOrigin-RevId: 312108475 Change-Id: Ia73668bf1e8f097441ed23dd75fb1ac2c0327e1f --- tensorflow/python/data/service/BUILD | 2 ++ tensorflow/python/eager/BUILD | 2 +- tensorflow/python/keras/layers/preprocessing/BUILD | 2 ++ tensorflow/python/kernel_tests/BUILD | 5 ++++- tensorflow/python/kernel_tests/proto/BUILD | 2 +- tensorflow/python/saved_model/BUILD | 2 ++ tensorflow/tensorflow.bzl | 11 ++++++++++- 7 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/data/service/BUILD b/tensorflow/python/data/service/BUILD index 19bcaa3b952..18678230205 100644 --- a/tensorflow/python/data/service/BUILD +++ b/tensorflow/python/data/service/BUILD @@ -1,4 +1,6 @@ load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_py_test") package( diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index c08cb8cc1c3..394b929bf1b 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -1,7 +1,7 @@ -load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_test") # buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") load( diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 052a57b52f3..b580382f9d8 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -2,6 +2,8 @@ # Contains the Keras preprocess layers (internal TensorFlow version). load("//tensorflow:tensorflow.bzl", "tf_py_test") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") load("//tensorflow/core/platform/default:distribute.bzl", "distribute_py_test") diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 13f59b74baf..cd03da9b179 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1,8 +1,11 @@ # Tests of TensorFlow kernels written using the Python API. -load("//tensorflow:tensorflow.bzl", "sycl_py_test", "tf_custom_op_library", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "sycl_py_test", "tf_custom_op_library") load("//tensorflow:tensorflow.bzl", "cuda_py_test") +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_py_test") + package( default_visibility = ["//tensorflow:internal"], licenses = ["notice"], # Apache 2.0 diff --git a/tensorflow/python/kernel_tests/proto/BUILD b/tensorflow/python/kernel_tests/proto/BUILD index d9643f3d125..0e935dfe8c4 100644 --- a/tensorflow/python/kernel_tests/proto/BUILD +++ b/tensorflow/python/kernel_tests/proto/BUILD @@ -1,7 +1,7 @@ # Tests of tf.io.*proto. -load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object") +load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow/core/platform:build_config_root.bzl", "if_static") load("//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library") diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD index 2e5db7edd27..5c30d320fb7 100644 --- a/tensorflow/python/saved_model/BUILD +++ b/tensorflow/python/saved_model/BUILD @@ -2,6 +2,8 @@ # TensorFlow SavedModel. load("//tensorflow:tensorflow.bzl", "cuda_py_test") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_py_test") package( diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index d72bdf58186..70b03146f34 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2218,6 +2218,15 @@ def tf_py_test( xla_enabled = False, grpc_enabled = False, tfrt_enabled = False, + # `tfrt_enabled` is set for some test targets, and if we enable + # TFRT tests just by that, this will enable TFRT builds for open source. + # TFRT open source is not fully integrated yet so we need a temporary + # workaround to enable TFRT only for internal builds. `tfrt_enabled_internal` + # will be set by `tensorflow.google.bzl`'s `tf_py_test` target, which is + # only applied for internal builds. + # TODO(b/156911178): Revert this temporary workaround once TFRT open source + # is fully integrated with TF. + tfrt_enabled_internal = False, **kwargs): """Create one or more python tests with extra tensorflow dependencies.""" xla_test_true_list = [] @@ -2261,7 +2270,7 @@ def tf_py_test( deps = depset(deps + xla_test_true_list), **kwargs ) - if tfrt_enabled: + if tfrt_enabled_internal: py_test( name = name + "_tfrt", size = size, From 95620005efbc52a446a232d5e74ee9fec793f918 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Mon, 18 May 2020 10:41:07 -0700 Subject: [PATCH 128/557] Document new methods to enable XNNPACK engine in TFLite PiperOrigin-RevId: 312109175 Change-Id: Iefcbb2ef5d7c83160ef2fc09d668c8e4ac440949 --- tensorflow/lite/delegates/xnnpack/README.md | 45 ++++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/README.md b/tensorflow/lite/delegates/xnnpack/README.md index e0ef6f0899c..c4e3f540faf 100644 --- a/tensorflow/lite/delegates/xnnpack/README.md +++ b/tensorflow/lite/delegates/xnnpack/README.md @@ -1,15 +1,48 @@ # XNNPACK backend for TensorFlow Lite XNNPACK is a highly optimized library of floating-point neural network -inference operators for ARM, WebAssembly, and x86 platforms. This document -describes how to use the XNNPACK library as a backend for TensorFlow Lite. +inference operators for ARM, x86, and WebAssembly architectures in Android, iOS, +Windows, Linux, macOS, and Emscripten environments. This document describes how +to use the XNNPACK library as an inference engine for TensorFlow Lite. -## Enabling XNNPACK backend in TensorFlow Lite models +## Using XNNPACK engine with TensorFlow Lite interpreter XNNPACK integrates with TensorFlow Lite interpreter through the delegation -mechanism. To leverage XNNPACK library for acceleration, the users need to -create an XNNPACK delegate with the `TfLiteXNNPackDelegateCreate` function, -and call `Interpreter::ModifyGraphWithDelegate` to delegate supported parts of +mechanism. There are three methods to enable XNNPACK engine in TensorFlow Lite. + +### Enable XNNPACK via Bazel build flags (recommended) + +When building TensorFlow Lite with Bazel, add +`--define tflite_with_xnnpack=true`, and the TensorFlow Lite interpreter will +use XNNPACK engine by default. + +The exact command depends on the target platform, e.g. for Android AAR you'd use + +``` +bazel build -c opt --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \ + --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ + --define tflite_with_xnnpack=true \ + //tensorflow/lite/java:tensorflow-lite +``` + +### Enable XNNPACK via additional dependency + +Another way to enable XNNPACK is to build and link the +`//tensorflow/lite:tflite_with_xnnpack` target into your application alongside +the TensorFlow Lite framework. + +This method works on platforms which support POSIX-style weak symbols (Android, +iOS, Linux, Mac, but **NOT** Windows). + +### Enable XNNPACK via low-level delegate API (not recommended) + +While it is possible to use low-level delegate API to enable XNNPACK, this +method is **NOT RECOMMENDED** unless you need to use TensorFlow Lite both with +and without XNNPACK (e.g. for benchmarking). + +With low-level delegate API users create an XNNPACK delegate with the +`TfLiteXNNPackDelegateCreate` function, and then call +`Interpreter::ModifyGraphWithDelegate` to delegate supported parts of the model to the XNNPACK delegate. The users must destroy the delegate with `TfLiteXNNPackDelegateDelete` **after** releasing the TensorFlow Lite interpreter. The snippet below illustrates the typical usage: From 723b2b59946c3a0bfa83b0b5df408e4699c88016 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 10:44:42 -0700 Subject: [PATCH 129/557] enable device tracer test. PiperOrigin-RevId: 312109916 Change-Id: Ibf8f17dc7cfd95aeb991796880161567fcb9ebe4 --- tensorflow/core/profiler/internal/gpu/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index e6ee8514227..c6fe4d77031 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -55,7 +55,6 @@ tf_cc_test_gpu( linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + [ "nomac", - "notap", # b/154510273 "gpu_cupti", ], deps = [ From 9cf08f43e07c6bb47bd9d41b3c6b0f33811f77c6 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 18 May 2020 11:17:10 -0700 Subject: [PATCH 130/557] [XLA:Python] Delete deprecated methods from XLA:Python API. PiperOrigin-RevId: 312117146 Change-Id: I232b67b9c4955b7fa6ab7e3ced9446d5ca2ea0e8 --- tensorflow/compiler/xla/python/xla.cc | 114 ------------------- tensorflow/compiler/xla/python/xla_client.py | 10 +- 2 files changed, 5 insertions(+), 119 deletions(-) diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index f10ec978399..0c4695cabf3 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -930,34 +930,6 @@ PYBIND11_MODULE(xla_extension, m) { "client", [](const ClientAndPtr& device) { return device.client; }) .def("__str__", &Device::DebugString) - // TODO(phawkins): remove capitalized names after updating callers. - .def("TransferToInfeed", - [](const Device& device, const LiteralSlice& literal) { - GlobalPyRefManager()->CollectGarbage(); - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, - device.GetLocalDeviceState()); - return local_device->client()->TransferToInfeedLocal( - literal, local_device->device_ordinal()); - }) - .def( - "TransferFromOutfeed", - [](const Device& device, const Shape& shape) -> StatusOr { - GlobalPyRefManager()->CollectGarbage(); - std::shared_ptr literal_shared; - { - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, - device.GetLocalDeviceState()); - TF_ASSIGN_OR_RETURN( - Literal literal, - local_device->client()->TransferFromOutfeedLocal( - shape, local_device->device_ordinal())); - - literal_shared = std::make_shared(std::move(literal)); - } - return LiteralToPython(std::move(literal_shared)); - }) .def("transfer_to_infeed", [](const Device& device, const LiteralSlice& literal) { GlobalPyRefManager()->CollectGarbage(); @@ -1244,28 +1216,6 @@ PYBIND11_MODULE(xla_extension, m) { .def("size_of_generated_code_in_bytes", &PjRtExecutable::SizeOfGeneratedCodeInBytes) .def("delete", &PjRtExecutable::Delete) - // TODO(phawkins): delete capitalized methods after updating callers. - .def("Delete", &PjRtExecutable::Delete) - .def( - "Execute", - [](const PjRtExecutable& executable, - absl::Span args) - -> StatusOr>> { - py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - TF_ASSIGN_OR_RETURN( - std::vector> output_buffers, - executable.Execute(args, options)); - std::vector> outputs; - outputs.reserve(output_buffers.size()); - for (auto& buffer : output_buffers) { - outputs.push_back(WrapWithClient( - executable.client()->shared_from_this(), std::move(buffer))); - } - return outputs; - }, - py::arg("arguments")) .def( "execute", [](const PjRtExecutable& executable, @@ -1286,33 +1236,6 @@ PYBIND11_MODULE(xla_extension, m) { return outputs; }, py::arg("arguments")) - // TODO(phawkins): delete capitalized methods after updating callers. - .def( - "ExecuteOnLocalDevices", - [](const PjRtExecutable& executable, - absl::Span> args) - -> StatusOr< - std::vector>>> { - py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - TF_ASSIGN_OR_RETURN( - std::vector>> - output_buffers, - executable.ExecuteOnLocalDevices(args, options)); - std::vector>> outputs; - outputs.resize(output_buffers.size()); - for (int computation = 0; computation < output_buffers.size(); - ++computation) { - for (auto& buffer : output_buffers[computation]) { - outputs[computation].push_back( - WrapWithClient(executable.client()->shared_from_this(), - std::move(buffer))); - } - } - return outputs; - }, - py::arg("arguments")) .def( "execute_on_local_devices", [](const PjRtExecutable& executable, @@ -1414,12 +1337,6 @@ PYBIND11_MODULE(xla_extension, m) { proto.ParseFromString(serialized_hlo_module_proto); return absl::make_unique(proto); })) - // TODO(phawkins): delete capitalized names after updating callers. - .def("GetProgramShape", &XlaComputation::GetProgramShape) - .def("GetSerializedProto", &GetComputationSerializedProto) - .def("GetHloText", &GetComputationHloText) - .def("GetHloDotGraph", &GetComputationHloDotGraph) - .def("Hash", &HashComputation) .def("get_hlo_module", &GetHloModule) .def("program_shape", &XlaComputation::GetProgramShape) .def("as_serialized_hlo_module_proto", &GetComputationSerializedProto) @@ -1512,28 +1429,7 @@ PYBIND11_MODULE(xla_extension, m) { }, "Builds a computation from the contents of the builder.", py::arg("root") = absl::nullopt) - .def("ClearOpMetadata", &XlaBuilder::ClearOpMetadata) .def("GetShape", &XlaBuilder::GetShape) - .def( - "GetProgramShape", - [](const XlaBuilder& builder, - absl::optional root) -> StatusOr { - return root ? builder.GetProgramShape(*root) - : builder.GetProgramShape(); - }, - py::arg("root") = absl::nullopt) - .def("IsConstant", &XlaBuilder::IsConstant) - .def("SetOpMetadata", &XlaBuilder::SetOpMetadata) - .def("SetSharding", &XlaBuilder::SetSharding) - .def("ClearSharding", &XlaBuilder::ClearSharding) - .def("SetUpAlias", - [](XlaBuilder& builder, const std::vector& output_index, - int64 param_number, const std::vector& param_index) { - builder.SetUpAlias( - ShapeIndex(output_index.begin(), output_index.end()), - param_number, - ShapeIndex(param_index.begin(), param_index.end())); - }) .def( "build", [](XlaBuilder& builder, absl::optional root) { @@ -1564,17 +1460,7 @@ PYBIND11_MODULE(xla_extension, m) { ShapeIndex(param_index.begin(), param_index.end())); }); - // TODO(phawkins): delete capitalized names after updating callers - m.def("BufferToDLPackManagedTensor", BufferToDLPackManagedTensor); m.def("buffer_to_dlpack_managed_tensor", BufferToDLPackManagedTensor); - m.def("DLPackManagedTensorToBuffer", - [](const py::capsule& tensor, std::shared_ptr client) - -> StatusOr> { - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer, - DLPackManagedTensorToBuffer(tensor, client.get())); - return WrapWithClient(std::move(client), std::move(buffer)); - }); m.def("dlpack_managed_tensor_to_buffer", [](const py::capsule& tensor, std::shared_ptr client) -> StatusOr> { diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index d9cd906939d..76c3bc33a91 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -300,13 +300,13 @@ CompileOptions = _xla.CompileOptions # An Executable is a C++ class that duck types with the following API: # class Executable(object): # def local_devices(self) -> [Device]: -# def Execute(self, arguments : [Buffer]) -> Buffer: +# def execute(self, arguments : [Buffer]) -> Buffer: # """Execute on one replica with Buffer arguments and return value.""" # -# def SizeOfGeneratedCodeInBytes(self) -> int: +# def size_of_generated_code_in_bytes(self) -> int: # """Return generated binary size, or -1 if not known.""" # -# def ExecuteOnLocalDevices(self, arguments: [[Buffer]]) -> [Buffer]: +# def execute_on_local_devices(self, arguments: [[Buffer]]) -> [Buffer]: # """Execute on many replicas with Buffer arguments and return value. # # Args: @@ -329,7 +329,7 @@ def execute_with_python_values(executable, arguments, backend): return backend.buffer_from_pyval(arg, device=executable.local_devices()[0]) arguments = [put(arg) for arg in arguments] - outputs = executable.Execute(arguments) + outputs = executable.execute(arguments) return [x.to_py() for x in outputs] @@ -359,7 +359,7 @@ def execute_with_python_values_replicated(executable, arguments, backend): flat_arg_buffers = flat_arg_buffers[len(replica_args):] return [[x.to_py() for x in xs] - for xs in executable.ExecuteOnLocalDevices(arg_buffers)] + for xs in executable.execute_on_local_devices(arg_buffers)] class PaddingType(enum.Enum): From ef45324fc62fc9a911e5771a40f9790900500de9 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 18 May 2020 11:26:48 -0700 Subject: [PATCH 131/557] Hexagon Delegate - Allow optional tensors as valid tensors in inputs. - Update fully connected builder to handle optional bias tensor. PiperOrigin-RevId: 312119090 Change-Id: If905792a78f61abde0f269ed252aa2501ae60815 --- .../hexagon/builders/matmul_builder.cc | 68 +++++++++-------- .../hexagon/builders/tests/matmul_test.cc | 73 +++++++++++++++++-- .../experimental/delegates/hexagon/utils.cc | 21 ++++-- 3 files changed, 116 insertions(+), 46 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc index c53e62d27a7..c0c815ffdcc 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc @@ -129,35 +129,41 @@ TfLiteStatus MatMulOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, // Bias tensor. int bias_tensor_id = inputs->data[2]; - const auto& bias_tensor = context->tensors[bias_tensor_id]; - auto* const_bias_node = - graph_builder_->AddConstNodeWithData(bias_tensor_id, bias_tensor); - graph_builder_->AddTensorWithID(bias_tensor_id, const_bias_node->GetID(), 0); - ComputeMinAndMaxQuantValues(bias_tensor, &bias_min_, &bias_max_); - auto* bias_min_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&bias_min_), - sizeof(bias_min_)); - auto* bias_max_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&bias_max_), - sizeof(bias_max_)); + TensorID matmul_and_bias_out = matmul_out, + matmul_and_bias_out_min = matmul_out_min, + matmul_and_bias_out_max = matmul_out_max; + if (bias_tensor_id != -1) { + const auto& bias_tensor = context->tensors[bias_tensor_id]; + auto* const_bias_node = + graph_builder_->AddConstNodeWithData(bias_tensor_id, bias_tensor); + graph_builder_->AddTensorWithID(bias_tensor_id, const_bias_node->GetID(), + 0); + ComputeMinAndMaxQuantValues(bias_tensor, &bias_min_, &bias_max_); + auto* bias_min_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&bias_min_), + sizeof(bias_min_)); + auto* bias_max_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&bias_max_), + sizeof(bias_max_)); - // MatMul + Bias. - auto* bias_add_op = graph_builder_->AddNode(GetTFLiteNodeID()); - bias_add_op->SetOpType(OP_QuantizedBiasAdd_32p32to32); - bias_add_op->AddInput(matmul_out); - bias_add_op->AddInput(graph_builder_->GetHexagonTensorId(bias_tensor_id)); - bias_add_op->AddInput(matmul_out_min); - bias_add_op->AddInput(matmul_out_max); - bias_add_op->AddInput(TensorID(bias_min_const->GetID(), 0)); - bias_add_op->AddInput(TensorID(bias_max_const->GetID(), 0)); - const auto& bias_add_out = - bias_add_op->AddOutput(sizeof(int32_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - const auto& bias_add_out_min = - bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - const auto& bias_add_out_max = - bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + // MatMul + Bias. + auto* bias_add_op = graph_builder_->AddNode(GetTFLiteNodeID()); + bias_add_op->SetOpType(OP_QuantizedBiasAdd_32p32to32); + bias_add_op->AddInput(matmul_out); + bias_add_op->AddInput(graph_builder_->GetHexagonTensorId(bias_tensor_id)); + bias_add_op->AddInput(matmul_out_min); + bias_add_op->AddInput(matmul_out_max); + bias_add_op->AddInput(TensorID(bias_min_const->GetID(), 0)); + bias_add_op->AddInput(TensorID(bias_max_const->GetID(), 0)); + matmul_and_bias_out = + bias_add_op->AddOutput(sizeof(int32_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + matmul_and_bias_out_min = + bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + matmul_and_bias_out_max = + bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + } // Quantize 32-bit result into 8-bit format using output tensor min/max. ComputeMinAndMaxQuantValues(context->tensors[outputs->data[0]], &output_min_, @@ -170,9 +176,9 @@ TfLiteStatus MatMulOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, sizeof(output_max_)); auto* quantize_biasadd_op = graph_builder_->AddNode(GetTFLiteNodeID()); quantize_biasadd_op->SetOpType(OP_Requantize_32to8); - quantize_biasadd_op->AddInput(bias_add_out); - quantize_biasadd_op->AddInput(bias_add_out_min); - quantize_biasadd_op->AddInput(bias_add_out_max); + quantize_biasadd_op->AddInput(matmul_and_bias_out); + quantize_biasadd_op->AddInput(matmul_and_bias_out_min); + quantize_biasadd_op->AddInput(matmul_and_bias_out_max); quantize_biasadd_op->AddInput(TensorID(output_min_const->GetID(), 0)); quantize_biasadd_op->AddInput(TensorID(output_max_const->GetID(), 0)); node_output_ = diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc index a16e22888dd..3a5f320a6a7 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc @@ -22,7 +22,7 @@ using testing::ElementsAreArray; class FullyConnectedOpModel : public SingleOpModelWithHexagon { public: FullyConnectedOpModel(int units, int batches, const TensorData& input, - const TensorData& output) + const TensorData& output, bool optional_bias = false) : batches_(batches), units_(units) { int total_input_size = 1; for (size_t i = 0; i < input.shape.size(); ++i) { @@ -34,9 +34,13 @@ class FullyConnectedOpModel : public SingleOpModelWithHexagon { weights_ = AddInput({input.type, {units_, input_size_}, input.min, input.max}); - auto bias_scale = GetScale(input_) * GetScale(weights_); - TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale}; - bias_ = AddInput(bias); + if (optional_bias) { + bias_ = AddNullInput(); + } else { + auto bias_scale = GetScale(input_) * GetScale(weights_); + TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale}; + bias_ = AddInput(bias); + } output_ = AddOutput(output); @@ -46,15 +50,16 @@ class FullyConnectedOpModel : public SingleOpModelWithHexagon { FullyConnectedOptionsWeightsFormat_DEFAULT, /*keep_num_dims=*/false) .Union()); - - BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)}); + BuildInterpreter({GetShape(input_), GetShape(weights_)}); // Weights & bias tensors need to be constant. // We don't use AddConstInput to allow setting filter values later. auto* weights_tensor = interpreter_->tensor(weights_); weights_tensor->allocation_type = kTfLiteMmapRo; - auto* bias_tensor = interpreter_->tensor(bias_); - bias_tensor->allocation_type = kTfLiteMmapRo; + if (!optional_bias) { + auto* bias_tensor = interpreter_->tensor(bias_); + bias_tensor->allocation_type = kTfLiteMmapRo; + } } void SetBias(const std::vector& data) { @@ -146,4 +151,56 @@ TEST(QuantizedFullyConnectedOpTest, TestQuantizedUint8) { ElementsAre(151, 152, 153, 185, 186, 187)); } +TEST(QuantizedFullyConnectedOpTest, TestQuantizedUint8_NoBias) { + FullyConnectedOpModel m( + /*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64}, + /*output=*/{TensorType_UINT8, {}, -127, 128}, /*optional_bias*/ true); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output))); +} + +TEST(QuantizedFullyConnectedOpTest, TestQuantizedInt8_NoBias) { + FullyConnectedOpModel m(/*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_INT8, {2, 10}, -63.5, 64}, + /*output=*/{TensorType_INT8, {}, -127, 128}, + /*optional_bias*/ true); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output))); +} + } // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index 8aff13549b8..ae7f6994657 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -116,6 +116,9 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, int tensor_id; for (int i = 0; i < node->inputs->size; ++i) { tensor_id = node->inputs->data[i]; + // Skip optional tensors. Builders should handle optional tensors + // not available. + if (tensor_id == -1) continue; const auto& tensor = context->tensors[tensor_id]; if (tensor.dims->size > 4) return false; } @@ -191,19 +194,22 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, if (!InputsWithCorrectTypes(node, context, {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteUInt8, kTfLiteInt8}, - {kTfLiteInt32}})) + {kTfLiteInt32, kTfLiteNoType}})) return false; const auto& weights_tensor = context->tensors[node->inputs->data[1]]; - const auto& bias_tensor = context->tensors[node->inputs->data[2]]; - const bool weights_and_bias_const = - weights_tensor.allocation_type == kTfLiteMmapRo && - bias_tensor.allocation_type == kTfLiteMmapRo; + bool bias_const_or_no_bias = true; + if (node->inputs->data[2] != -1) { + const auto& bias_tensor = context->tensors[node->inputs->data[2]]; + bias_const_or_no_bias = bias_tensor.allocation_type == kTfLiteMmapRo; + } + const bool weights_const = + weights_tensor.allocation_type == kTfLiteMmapRo; const TfLiteFullyConnectedParams* matmul_params = reinterpret_cast( node->builtin_data); - return (weights_and_bias_const && + return (weights_const && bias_const_or_no_bias && IsActivationReluOrNone(matmul_params->activation) && matmul_params->keep_num_dims == false && matmul_params->weights_format == @@ -335,7 +341,8 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, return false; const auto& input_tensor = context->tensors[node->inputs->data[1]]; const bool is_four_dim_or_less = input_tensor.dims->size < 5; - // We need splitting axis to be constant, so Hexagon knows output shapes. + // We need splitting axis to be constant, so Hexagon knows output + // shapes. return is_four_dim_or_less && IsConstantTensor(GetInput(context, node, 0)); } From 6f19d507f4955f571582349213c69991868379bb Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 18 May 2020 11:50:56 -0700 Subject: [PATCH 132/557] [XLA] Fix rendering of the RngBitGenerator description table PiperOrigin-RevId: 312123981 Change-Id: I9d1ecdf88dfb9f5689dcfc26f6243a192ab55dd6 --- .../compiler/xla/g3doc/operation_semantics.md | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 495701eaac2..002d07184a7 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -2299,20 +2299,26 @@ The output is guaranteed to be a deterministic function of the initial state but it is *not* guaranteed to be deterministic between backends and different compiler versions. -`RngBitGenerator(algorithm, key, shape)` | Arguments | Type | Semantics | -|---------------- | ----------------- | ------------------------------------- | -| `algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. | | -`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. | | `shape` | -`Shape` | Output shape for generated data. | +`RngBitGenerator(algorithm, key, shape)` -Available values for `algorithm`: * `rng_default`: Backend specific algorithm -with backend specific shape requirements. * `rng_three_fry`: ThreeFry -counter-based PRNG algorithm. The `initial_state` shape is `u64[2]` with -arbitrary values. -[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) -* `rng_philox`: Philox algorithm to generate random numbers in parallel. The -`initial_state` shape is `u64[3]` with arbitrary values. -[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) +Arguments | Type | Semantics +--------------- | ----------------- | ------------------------------------- +`algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. +`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. +`shape` | `Shape` | Output shape for generated data. + +Available values for `algorithm`: + +- `rng_default`: Backend specific algorithm with backend specific shape + requirements. + +- `rng_three_fry`: ThreeFry counter-based PRNG algorithm. The `initial_state` + shape is `u64[2]` with arbitrary values. + [Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + +- `rng_philox`: Philox algorithm to generate random numbers in parallel. The + `initial_state` shape is `u64[3]` with arbitrary values. + [Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) ## Scatter From 672e419c9f7e331fff4449799e8cd7c476ac4b7c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 12:35:23 -0700 Subject: [PATCH 133/557] Enable tests for tf.linalg.lu in eager mode. PiperOrigin-RevId: 312132817 Change-Id: I0dd5b96cc2b3462817e0637794a623c24bd0f989 --- tensorflow/python/kernel_tests/lu_op_test.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py index 7935b66f4af..de9d8c32cb5 100644 --- a/tensorflow/python/kernel_tests/lu_op_test.py +++ b/tensorflow/python/kernel_tests/lu_op_test.py @@ -30,7 +30,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import map_fn from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -214,15 +214,20 @@ class LuOpTest(test.TestCase): data = np.random.rand(n, n) + 1j * np.random.rand(n, n) self._verifyLu(data) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testEmpty(self): self._verifyLu(np.empty([0, 2, 2])) self._verifyLu(np.empty([2, 0, 0])) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) + matrix_shape = [5, 5] + seed = [42, 24] + matrix1 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + matrix2 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + self.assertAllEqual(matrix1, matrix2) lu1, p1 = linalg_ops.lu(matrix1) lu2, p2 = linalg_ops.lu(matrix2) lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2]) From 7254343a10ba00d48f828981cec3e3587e667ca9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 12:37:47 -0700 Subject: [PATCH 134/557] Enable tests for tf.linalg.matrix_square_root in eager mode. PiperOrigin-RevId: 312133318 Change-Id: I541a94a21594384fba30a9198ad5a7300537c498 --- .../matrix_square_root_op_test.py | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py index c36d83e2530..6cf330ed981 100644 --- a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py @@ -21,10 +21,11 @@ from __future__ import print_function import numpy as np from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import test_util from tensorflow.python.ops import gen_linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.platform import test @@ -89,31 +90,35 @@ class SquareRootOpTest(test.TestCase): self._verifySquareRootReal(np.empty([0, 2, 2])) self._verifySquareRootReal(np.empty([2, 0, 0])) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The input to the square root should be at least a 2-dimensional tensor. tensor = constant_op.constant([1., 2.]) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): gen_linalg_ops.matrix_square_root(tensor) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNotSquare(self): - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): tensor = constant_op.constant([[1., 0., -1.], [-1., 1., 0.]]) self.evaluate(gen_linalg_ops.matrix_square_root(tensor)) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): - with test_util.use_gpu(): - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) - square1 = math_ops.matmul(matrix1, matrix1) - square2 = math_ops.matmul(matrix2, matrix2) - sqrt1 = gen_linalg_ops.matrix_square_root(square1) - sqrt2 = gen_linalg_ops.matrix_square_root(square2) - all_ops = [sqrt1, sqrt2] - sqrt = self.evaluate(all_ops) - self.assertAllClose(sqrt[0], sqrt[1]) + matrix_shape = [5, 5] + seed = [42, 24] + matrix1 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + matrix2 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + self.assertAllEqual(matrix1, matrix2) + square1 = math_ops.matmul(matrix1, matrix1) + square2 = math_ops.matmul(matrix2, matrix2) + sqrt1 = gen_linalg_ops.matrix_square_root(square1) + sqrt2 = gen_linalg_ops.matrix_square_root(square2) + all_ops = [sqrt1, sqrt2] + sqrt = self.evaluate(all_ops) + self.assertAllClose(sqrt[0], sqrt[1]) if __name__ == "__main__": From b5436f9d5fe7bdfc8e42f0b27328a8457d48ccf6 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 18 May 2020 12:43:30 -0700 Subject: [PATCH 135/557] Rename TraceMe::SetMetadata to TraceMe::AppendMetadata and add lambda overload. PiperOrigin-RevId: 312134462 Change-Id: Ia1a0f7de954fba6c0b05a6beae10cc08dc803cfc --- tensorflow/core/profiler/lib/BUILD | 2 + tensorflow/core/profiler/lib/traceme.h | 56 +++++++++----- tensorflow/core/profiler/lib/traceme_encode.h | 73 +++++++++++++++---- tensorflow/python/profiler/internal/BUILD | 1 + .../profiler/internal/traceme_wrapper.cc | 10 ++- 5 files changed, 107 insertions(+), 35 deletions(-) diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index 0aa1a5d6b67..5bb9236efb3 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -94,6 +94,7 @@ cc_library( hdrs = ["traceme.h"], visibility = ["//visibility:public"], deps = [ + ":traceme_encode", "@com_google_absl//absl/strings", "//tensorflow/core:lib", "//tensorflow/core/platform", @@ -159,6 +160,7 @@ filegroup( "profiler_session.h", "scoped_annotation.h", "traceme.h", + "traceme_encode.h", ], visibility = ["//visibility:public"], ) diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h index 2c3e3ebe6cc..ec5f6765afb 100644 --- a/tensorflow/core/profiler/lib/traceme.h +++ b/tensorflow/core/profiler/lib/traceme.h @@ -28,6 +28,7 @@ limitations under the License. #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/profiler/internal/traceme_recorder.h" #endif +#include "tensorflow/core/profiler/lib/traceme_encode.h" // IWYU pragma: export namespace tensorflow { namespace profiler { @@ -123,13 +124,20 @@ class TraceMe { explicit TraceMe(const char* raw, int level = 1) : TraceMe(absl::string_view(raw), level) {} - // This overload only generates the activity name if tracing is enabled. - // Useful for avoiding things like string concatenation when tracing is - // disabled. The |name_generator| may be a lambda or functor that returns a - // type that the string() constructor can take. + // This overload only generates the name (and possibly metadata) if tracing is + // enabled. Useful for avoiding expensive operations (e.g., string + // concatenation) when tracing is disabled. + // name_generator may be a lambda or functor that returns a type that the + // string() constructor can take, e.g., the result of TraceMeEncode. // name_generator is templated, rather than a std::function to avoid // allocations std::function might make even if never called. - // Usage: profiler::TraceMe([&]{ return StrCat(prefix, ":", postfix); }); + // Example Usage: + // TraceMe op_trace_me([&]() { + // return StrCat(op_name, ":", op_type); + // } + // TraceMe trace_me_with_metadata([&value1]() { + // return TraceMeEncode("my_trace", {{"key1", value1}, {"key2", 42}}); + // }); template explicit TraceMe(NameGeneratorT name_generator, int level = 1) { DCHECK_GE(level, 1); @@ -167,21 +175,35 @@ class TraceMe { #endif } - // Sets new_metadata in the metadata part of no_init_.name. - void SetMetadata(absl::string_view new_metadata) { + // Appends new_metadata to the TraceMe name passed to the constructor. + // metadata_generator may be a lambda or functor that returns a type that the + // string() constructor can take, e.g., the result of TraceMeEncode. + // metadata_generator is only evaluated when tracing is enabled. + // metadata_generator is templated, rather than a std::function to avoid + // allocations std::function might make even if never called. + // Example Usage: + // trace_me.AppendMetadata([&value1]() { + // return TraceMeEncode({{"key1", value1}, {"key2", 42}}); + // }); + template + void AppendMetadata(MetadataGeneratorT metadata_generator) { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { - std::string& name = no_init_.name; - DCHECK(!name.empty()); - DCHECK(!new_metadata.empty()); - if (name.back() == '#') { // name already has metadata - name.back() = ','; - if (TF_PREDICT_TRUE(new_metadata.front() == '#')) { - new_metadata.remove_prefix(1); - } - } - name.append(new_metadata.data(), new_metadata.size()); + traceme_internal::AppendMetadata(&no_init_.name, metadata_generator()); + } + } +#endif + } + + // Appends new_metadata to the payload. + // This overload should only be used by other TraceMe APIs. + // Prefer the overload above instead. + void AppendMetadata(absl::string_view new_metadata) { +#if !defined(IS_MOBILE_PLATFORM) + if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { + if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { + traceme_internal::AppendMetadata(&no_init_.name, new_metadata); } } #endif diff --git a/tensorflow/core/profiler/lib/traceme_encode.h b/tensorflow/core/profiler/lib/traceme_encode.h index 772f56a2153..2e23c6d878b 100644 --- a/tensorflow/core/profiler/lib/traceme_encode.h +++ b/tensorflow/core/profiler/lib/traceme_encode.h @@ -28,7 +28,7 @@ limitations under the License. namespace tensorflow { namespace profiler { -namespace internal { +namespace traceme_internal { // Copies the contents of str to the address pointed by out. // Returns the address after the copy. @@ -36,24 +36,18 @@ namespace internal { TF_ATTRIBUTE_ALWAYS_INLINE inline char* Append(char* out, absl::string_view str) { const size_t str_size = str.size(); - if (str_size > 0) { + if (TF_PREDICT_TRUE(str_size > 0)) { memcpy(out, str.data(), str_size); out += str_size; } return out; } -} // namespace internal - -// Encodes an event name and arguments into a string stored by TraceMe. -// Use within a lambda to avoid expensive operations when tracing is inactive. -// Example Usage: -// TraceMe trace_me([&name, value1]() { -// return TraceMeEncode(name, {{"key1", value1}, {"key2", 42}}); -// }); -inline std::string TraceMeEncode( +// Appends args encoded as TraceMe metadata to name. +TF_ATTRIBUTE_ALWAYS_INLINE inline std::string AppendArgs( std::string name, - std::initializer_list> args) { + const std::initializer_list>& + args) { if (TF_PREDICT_TRUE(args.size() > 0)) { const auto old_size = name.size(); auto new_size = old_size + args.size() * 2 + 1; @@ -65,9 +59,9 @@ inline std::string TraceMeEncode( char* out = begin + old_size; *out++ = '#'; for (const auto& arg : args) { - out = internal::Append(out, arg.first); + out = Append(out, arg.first); *out++ = '='; - out = internal::Append(out, arg.second.Piece()); + out = Append(out, arg.second.Piece()); *out++ = ','; } *(out - 1) = '#'; @@ -76,6 +70,57 @@ inline std::string TraceMeEncode( return name; } +// Appends new_metadata to the metadata part of name. +TF_ATTRIBUTE_ALWAYS_INLINE inline void AppendMetadata( + std::string* name, absl::string_view new_metadata) { + if (!TF_PREDICT_FALSE(new_metadata.empty())) { + if (!name->empty() && name->back() == '#') { // name already has metadata + name->back() = ','; + if (TF_PREDICT_TRUE(new_metadata.front() == '#')) { + new_metadata.remove_prefix(1); + } + } + name->append(new_metadata.data(), new_metadata.size()); + } +} + +} // namespace traceme_internal + +// Encodes an event name and arguments into TraceMe metadata. +// Use within a lambda to avoid expensive operations when tracing is disabled. +// Example Usage: +// TraceMe trace_me([value1]() { +// return TraceMeEncode("my_trace", {{"key1", value1}, {"key2", 42}}); +// }); +inline std::string TraceMeEncode( + std::string name, + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::move(name), args); +} +inline std::string TraceMeEncode( + absl::string_view name, + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::string(name), args); +} +inline std::string TraceMeEncode( + const char* name, + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::string(name), args); +} + +// Encodes arguments into TraceMe metadata. +// Use within a lambda to avoid expensive operations when tracing is disabled. +// Example Usage: +// TraceMe trace_me("my_trace"); +// ... +// trace_me.AppendMetadata([value1]() { +// return TraceMeEncode({{"key1", value1}, {"key2", 42}}); +// }); +inline std::string TraceMeEncode( + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::string(), args); +} + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD index d9f93c2fb21..9b0f216508e 100644 --- a/tensorflow/python/profiler/internal/BUILD +++ b/tensorflow/python/profiler/internal/BUILD @@ -89,6 +89,7 @@ tf_python_pybind_extension( deps = [ "//tensorflow/core:lib", "//tensorflow/core/profiler/lib:traceme_headers", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@pybind11", ], diff --git a/tensorflow/python/profiler/internal/traceme_wrapper.cc b/tensorflow/python/profiler/internal/traceme_wrapper.cc index a1b5370836b..6b0098e316d 100644 --- a/tensorflow/python/profiler/internal/traceme_wrapper.cc +++ b/tensorflow/python/profiler/internal/traceme_wrapper.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include +#include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "pybind11/pybind11.h" #include "tensorflow/core/platform/types.h" @@ -27,13 +29,13 @@ namespace { // Helper to implement TraceMe as a context manager in Python. class TraceMeWrapper { public: - explicit TraceMeWrapper(const tensorflow::string& name) : name_(name) {} + explicit TraceMeWrapper(const std::string& name) : name_(name) {} void Enter() { traceme_.emplace(std::move(name_)); } - void SetMetadata(const tensorflow::string& new_metadata) { + void SetMetadata(const std::string& new_metadata) { if (TF_PREDICT_TRUE(traceme_)) { - traceme_->SetMetadata(new_metadata); + traceme_->AppendMetadata(absl::string_view(new_metadata)); } } @@ -50,7 +52,7 @@ class TraceMeWrapper { PYBIND11_MODULE(_pywrap_traceme, m) { py::class_ traceme_class(m, "TraceMe"); - traceme_class.def(py::init()) + traceme_class.def(py::init()) .def("Enter", &TraceMeWrapper::Enter) .def("Exit", &TraceMeWrapper::Exit) .def("SetMetadata", &TraceMeWrapper::SetMetadata) From 8e661af54d9787b2a3a2371cc6efcfa1d8db6a34 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 18 May 2020 13:03:24 -0700 Subject: [PATCH 136/557] [XLA] Simplify tautological compares (and (< x A) (< x B)) to (< x A) when `a <= B` holds. This is required for figuring out the trip count of loops whose condition contains the conjunction. Such conjunctions arise from TF when a for loop with `tf.range` is lowered, or when using `tf.while_loop` with `maximum_iterations` set. PiperOrigin-RevId: 312138518 Change-Id: I12c5c7d0aeedbf0d375f3cff1d23b39aea89f64a --- .../xla/service/algebraic_simplifier.cc | 65 +++++++++++++++++++ .../xla/service/algebraic_simplifier_test.cc | 19 ++++++ 2 files changed, 84 insertions(+) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 55af8726dc8..ecbf2075abe 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -508,6 +508,13 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { // Tries to convert slice(reshape(X)) into reshape(slice(X)) StatusOr TryToReorderSliceAndReshape(HloInstruction* slice); + // Tries to simplify `(and (< a N) (< a K))` in cases where `N <= K` into + // `(< a N)`. This is crucial for being able to figure out the loop trip + // count. + // + // Assumes that the input is conjunction. + StatusOr TrySimplifyTautologicalCompare(HloInstruction* conjunction); + // Useful when we want to use the same visitor over multiple computations. void ResetState(HloComputation* computation); @@ -856,6 +863,57 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { return Status::OK(); } +StatusOr AlgebraicSimplifierVisitor::TrySimplifyTautologicalCompare( + HloInstruction* conjunction) { + HloInstruction *lhs, *rhs; + if (!Match(conjunction, m::And(m::Op(&lhs), m::Op(&rhs)))) { + return false; + } + struct LessThanCompareInfo { // (LT var constant) + HloInstruction* var; + int64 constant; + }; + + auto get_compare_info_helper = + [&](HloInstruction* lhs, + HloInstruction* rhs) -> absl::optional { + if (!Match(rhs, m::Constant().WithShape( + m::Shape().IsEffectiveScalar().WithElementType( + PrimitiveType::S32)))) { + return absl::nullopt; + } + return {LessThanCompareInfo{lhs, *rhs->literal().GetFirstInteger()}}; + }; + + auto get_compare_info = + [&](HloInstruction* cmp) -> absl::optional { + HloInstruction *lhs, *rhs; + if (!Match(cmp, m::Compare(m::Op(&lhs), m::Op(&rhs)) + .WithComparisonDirection(ComparisonDirection::kLt))) { + return absl::nullopt; + } + if (auto match1 = get_compare_info_helper(lhs, rhs)) { + return match1; + } else if (auto match2 = get_compare_info_helper(rhs, lhs)) { + return match2; + } + return absl::nullopt; + }; + + absl::optional lhs_info = get_compare_info(lhs); + absl::optional rhs_info = get_compare_info(rhs); + if (lhs_info && rhs_info && lhs_info->var == rhs_info->var) { + int64 new_bound = std::min(lhs_info->constant, rhs_info->constant); + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + conjunction, + HloInstruction::CreateCompare(lhs->shape(), lhs_info->var, + MakeScalarLike(lhs_info->var, new_bound), + ComparisonDirection::kLt))); + return true; + } + return false; +} + Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { HloInstruction *lhs, *rhs; CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs)))); @@ -890,6 +948,13 @@ Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { return Status::OK(); } + // Simplify tautological conjunctions. + TF_ASSIGN_OR_RETURN(bool found_tautological_compare, + TrySimplifyTautologicalCompare(logical_and)); + if (found_tautological_compare) { + return Status::OK(); + } + return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 6c8e80aa963..08a004e39fe 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -5761,6 +5761,25 @@ TEST_F(AlgebraicSimplifierTest, CompareSame) { GmockMatch(m::Broadcast(m::ConstantScalar(true)))); } +TEST_F(AlgebraicSimplifierTest, CompareSimplified) { + const char* kModuleStr = R"( + HloModule m + test { + param = s32[] parameter(0) + c1 = s32[] constant(10) + c2 = s32[] constant(100) + cmp1 = pred[] compare(param, c1), direction=LT + cmp2 = pred[] compare(param, c2), direction=LT + ROOT out = pred[] and(cmp1, cmp2) + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Compare(m::Op(), m::Op().IsConstantScalar(10)) + .WithComparisonDirection(ComparisonDirection::kLt))); +} + TEST_F(AlgebraicSimplifierTest, CanDisableDotToMultiplyRewrite) { // Some backends may have better performance by treating an outer product as a // Dot, rather than a broadcast Multiply From 869920697b243622073317ddc533bdff41684c41 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Mon, 18 May 2020 13:27:55 -0700 Subject: [PATCH 137/557] [tf.lite] Use in-process conversion when the new converter is used Out-of-process conversion was a workaround for the legacy converter, which would generally crash the process when conversion failed. However, out-of-process conversion also adds a good deal of complexity, so avoid it when using the new conversion backend. PiperOrigin-RevId: 312142994 Change-Id: I7ddc83df99ccf24be6e15f46d6a116dce8321933 --- tensorflow/lite/python/convert.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 6b7a32f1bcc..a5fbb88132e 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -169,9 +169,10 @@ def toco_convert_protos(model_flags_str, RuntimeError: When conversion fails, an exception is raised with the error message embedded. """ - # TODO(aselle): When toco does not use fatal errors for failure, we can - # switch this on. - if not _toco_from_proto_bin: + # Historically, TOCO conversion failures would trigger a crash, so we would + # attempt to run the converter out-of-process. The MLIR conversion pipeline + # surfaces errors instead, and can be safely run in-process. + if enable_mlir_converter or not _toco_from_proto_bin: try: model_str = wrap_toco.wrapped_toco_convert(model_flags_str, toco_flags_str, input_data_str, From da67fcddef242a0c358f4acc5f263880c1863836 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Mon, 18 May 2020 13:36:18 -0700 Subject: [PATCH 138/557] Edit Hexagon documentation to reflect new supported models PiperOrigin-RevId: 312144610 Change-Id: I9c8b0d9ad6ea4b745b4bb985ca143cca660a5b14 --- .../g3doc/performance/hexagon_delegate.md | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/hexagon_delegate.md b/tensorflow/lite/g3doc/performance/hexagon_delegate.md index 60fe9465bf4..0e947d1d5e1 100644 --- a/tensorflow/lite/g3doc/performance/hexagon_delegate.md +++ b/tensorflow/lite/g3doc/performance/hexagon_delegate.md @@ -22,15 +22,15 @@ are supported, including: **Supported models:** -The Hexagon delegate currently supports quantized models generated using -[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize), -e.g., -[these quantized models](https://www.tensorflow.org/lite/guide/hosted_models#quantized_models) -hosted on the TensorFlow Lite repo. It does not (yet) support models with -[8-bit symmetric quantization spec](https://www.tensorflow.org/lite/performance/quantization_spec). -Sample models include -[MobileNet V1](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz), -[SSD Mobilenet](https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip). +The Hexagon delegate supports all models that conform to our +[8-bit symmetric quantization spec](https://www.tensorflow.org/lite/performance/quantization_spec), +including those generated using +[post-training integer quantization](https://www.tensorflow.org/lite/performance/post_training_integer_quant). +UInt8 models trained with the legacy +[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize) +path are also supported, for e.g., +[these quantized versions](https://www.tensorflow.org/lite/guide/hosted_models#quantized_models) +on our Hosted Models page. ## Hexagon Delegate Java API @@ -254,10 +254,6 @@ ro.board.platform`). ## FAQ -* Will the delegate support models created using - [post-training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization)? - * This is tentatively planned for a future release, though there is no - concrete timeline. * Which ops are supported by the delegate? * See the current list of [supported ops and constraints](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/delegates/hexagon/README.md) * How can I tell that the model is using the DSP when I enable the delegate? From d4f71ff132a1262f4a6b05f58807e8ba3d46b83d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 13:38:25 -0700 Subject: [PATCH 139/557] Enable tests for tf.linalg.tensordot in eager mode. PiperOrigin-RevId: 312144965 Change-Id: I2d75f7d9bd7f05aef6d1dee620dffcea66071b97 --- .../python/kernel_tests/tensordot_op_test.py | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py index 71e448f7855..7f8c5e9781b 100644 --- a/tensorflow/python/kernel_tests/tensordot_op_test.py +++ b/tensorflow/python/kernel_tests/tensordot_op_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl @@ -41,16 +41,19 @@ def _add_test(test, test_name, fn): class TensordotTest(test_lib.TestCase): - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_invalid_shape(self): a = [[1, 2], [3, 4]] b = [[1, 2], [3, 4], [5, 6]] a_axes = [1] b_axes = [0] # Invalid static shapes. - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): math_ops.tensordot(a, b, (a_axes, b_axes)) + # Invalid dynamic shapes. + if context.executing_eagerly(): + return with self.cached_session() as sess: with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "Matrix size-incompatible"): @@ -65,7 +68,7 @@ class TensordotTest(test_lib.TestCase): axes_ph: (a_axes, b_axes) }) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_invalid_axes(self): a = [[1, 2], [3, 4]] b = [[1, 2], [3, 4]] @@ -77,6 +80,8 @@ class TensordotTest(test_lib.TestCase): with self.assertRaises(IndexError): math_ops.tensordot(a, b, [[0], [7]]) + if context.executing_eagerly(): + return # Invalid dynamic axes. a_ph = array_ops.placeholder(dtypes.float32) b_ph = array_ops.placeholder(dtypes.float32) @@ -93,22 +98,22 @@ class TensordotTest(test_lib.TestCase): axes_ph: axes_value }) - # Test case for 11950 + # Test case for https://github.com/tensorflow/tensorflow/issues/11950 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_valid_axis(self): for axes_value in [1, 2], [[1], [2]], [[], []], 0: - with self.cached_session(): - np_a = np.ones((3, 3)) - np_b = np.array([2, 3, 1])[None, None] - np_ans = np.tensordot(np_a, np_b, axes_value) + np_a = np.ones((3, 3)) + np_b = np.array([2, 3, 1])[None, None] + np_ans = np.tensordot(np_a, np_b, axes_value) - tf_a = array_ops.ones((3, 3), dtype=dtypes.float32) - tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None] - tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value) + tf_a = array_ops.ones((3, 3), dtype=dtypes.float32) + tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None] + tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value) - self.assertAllEqual(tf_ans.shape, np_ans.shape) - self.assertAllEqual(tf_ans, np_ans) + self.assertAllEqual(tf_ans.shape, np_ans.shape) + self.assertAllEqual(self.evaluate(tf_ans), np_ans) - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("Shape inference test") def test_partial_shape_inference(self): for axes in ([1], [0]), 1: a = array_ops.placeholder(dtypes.float32) @@ -159,7 +164,10 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_): size=np.prod(b_shape)).reshape(b_shape).astype(dtype_) return a, b, a_dims, b_dims + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_tensordot(self): + if dynamic_shape_ and context.executing_eagerly(): + self.skipTest("Placeholders not support in eager mode") num_trials = min(30, num_dims_ * num_dims_) if dtype_ == np.float16: tol = 0.05 @@ -187,7 +195,10 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_): self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol) self.assertAllEqual(tf_ans.shape, np_ans.shape) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_tensordot_scalar_axes(self): + if dynamic_shape_ and context.executing_eagerly(): + self.skipTest("Placeholders not support in eager mode") if num_dims_ < 1: self.skipTest("Not a test") if dtype_ == np.float16: @@ -229,7 +240,7 @@ if __name__ == "__main__": for rank_b in 1, 2, 4, 5: for num_dims in range(0, min(rank_a, rank_b) + 1): # TF2 does not support placeholders under eager so we skip it - for dynamic_shape in set([False, not tf2.enabled()]): + for dynamic_shape in set([False, True]): for testcase in _get_tensordot_tests(dtype, rank_a, rank_b, num_dims, dynamic_shape): name = "%s_%s_%s_%s_%s_%s" % (testcase.__name__, dtype.__name__, From ecf503380978e04e5e47f231fcc33a49d6c9d841 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Mon, 18 May 2020 13:38:32 -0700 Subject: [PATCH 140/557] Return a meaningful error for dynamic shape inputs with outside compilation head extraction in TPUs. PiperOrigin-RevId: 312144982 Change-Id: I187b58ac8759b391fdcb9649bffd979025350f55 --- .../python/distribute/tpu_strategy_test.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index de4c975d5ef..6c93e29c028 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -28,6 +28,7 @@ from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.eager import remote from tensorflow.python.eager import test +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -140,6 +141,9 @@ class TPUStrategyTest(test.TestCase): # for non-local TPU. if FLAGS.tpu: self.skipTest("Recovery fails for non-local TPU, see b/148150981") + + # Disable automatic outside compilation. + config.set_soft_device_placement(False) strategy = get_tpu_strategy() @def_function.function @@ -164,6 +168,28 @@ class TPUStrategyTest(test.TestCase): good_run() + def test_dynamic_shape_with_outside_compilation_failure(self): + # Enable automatic outside compilation. + config.set_soft_device_placement(True) + strategy = get_tpu_strategy() + dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( + 2, drop_remainder=False) + dataset = strategy.experimental_distribute_dataset(dataset) + iterator = iter(dataset) + + @def_function.function + def train_fn(iterator): + + def step_fn(inputs): + _, inputs = inputs + return math_ops.reduce_sum(inputs) + + return strategy.experimental_local_results( + strategy.run(step_fn, args=(next(iterator),))) + + with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): + logging.info(train_fn(iterator)) + def test_computation_on_subset_cores(self): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) From 3d4c5d1b578397070d8cecbfe88d8fa06c183189 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 18 May 2020 14:06:53 -0700 Subject: [PATCH 141/557] NFC: Update canonicalize tests to use regex. PiperOrigin-RevId: 312150354 Change-Id: Ifed616606d5c8c708a3800256c4234b9bbb3ce3c --- .../mlir/lite/tests/canonicalize.mlir | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir index 1f067aae685..5c69130c939 100644 --- a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir @@ -11,9 +11,9 @@ func @reshape_removeAdjacent(tensor<4x4x4xf32>) -> tensor<64xf32> { return %1 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacent -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: return +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: return %[[RESHAPE]] } // Checks that tfl.reshape should be removed if its output has more than one @@ -29,11 +29,11 @@ func @reshape_removeAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> tensor<64xf32> return %3 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacentWithMultipleUse -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %2 = addf %0, %1 -// CHECK: return %2 +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESULT:.*]] = addf %[[RESHAPE_1]], %[[RESHAPE_2]] +// CHECK: return %[[RESULT]] } // Checks that tfl.reshape should be kept if its output has more than one @@ -47,11 +47,11 @@ func @reshape_keepAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> (tensor<16x4xf32 return %0, %1 : tensor<16x4xf32>, tensor<64xf32> // CHECK-LABEL: func @reshape_keepAdjacentWithMultipleUse -// CHECK: %cst = constant dense<[16, 4]> : tensor<2xi32> -// CHECK: %cst_0 = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst_0) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: return %0, %1 +// CHECK: %[[CST:.*]] = constant dense<[16, 4]> : tensor<2xi32> +// CHECK: %[[CST_0:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32> +// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST_0]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: return %[[RESHAPE_1]], %[[RESHAPE_2]] } // Checks that tfl.reshape should be removed if its output type is the same From 6dcb7268bb28221134cd1151a730e89023d59623 Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Mon, 18 May 2020 14:33:45 -0700 Subject: [PATCH 142/557] Rename `_get_closest` to more accurately reflect what it does. PiperOrigin-RevId: 312155516 Change-Id: I27d8dd110ace0150ea735f718ed94948a9a75a74 --- tensorflow/python/distribute/values.py | 22 +++++++++++----------- tensorflow/python/training/optimizer.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 444915aa123..84904f93104 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -139,7 +139,7 @@ class DistributedValues(object): "This method should be overridden by sub-classes which support cross-" "replica accesses.") - def _get_closest(self): + def _get_on_device_or_primary(self): """Returns value in same replica or device if possible, else the _primary.""" replica_id = _get_current_replica_id_as_int() if replica_id is None: @@ -379,7 +379,7 @@ class Mirrored(DistributedDelegate): """Holds a map from replica to values which are kept in sync.""" def _get_cross_replica(self): - return self._get_closest() + return self._get_on_device_or_primary() def _as_graph_element(self): obj = self._get() @@ -480,11 +480,11 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return init_op def initialized_value(self): - return self._get_closest().initialized_value() + return self._get_on_device_or_primary().initialized_value() @property def initial_value(self): - return self._get_closest().initial_value + return self._get_on_device_or_primary().initial_value @property def constraint(self): @@ -537,7 +537,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return self._values[replica_id].handle def eval(self, session=None): - return self._get_closest().eval(session) + return self._get_on_device_or_primary().eval(session) @property def _save_slice_info(self): @@ -552,7 +552,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, @property def device(self): - return self._get_closest().device + return self._get_on_device_or_primary().device @property def trainable(self): @@ -587,7 +587,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return array_ops.identity(self._get()) def value(self): - return self._get_closest().value() + return self._get_on_device_or_primary().value() def numpy(self): if context.executing_eagerly(): @@ -961,7 +961,7 @@ class MirroredVariable(DistributedVariable, Mirrored): return array_ops.identity(Mirrored._get_cross_replica(self)) def _as_graph_element(self): - return self._get_closest()._as_graph_element() # pylint: disable=protected-access + return self._get_on_device_or_primary()._as_graph_element() # pylint: disable=protected-access def _gather_saveables_for_checkpoint(self): """Overrides Trackable method. @@ -1067,7 +1067,7 @@ class SyncOnReadVariable(DistributedVariable): """Holds a map from replica to variables whose values are reduced on save.""" def _update_replica(self, update_fn, value, **kwargs): - return update_fn(self._get_closest(), value, **kwargs) + return update_fn(self._get_on_device_or_primary(), value, **kwargs) # TODO(b/154017756): Make assign behaivor in cross replica context consistent # with MirroredVariable. @@ -1146,8 +1146,8 @@ class SyncOnReadVariable(DistributedVariable): if ds_context.in_cross_replica_context(): return self._get_cross_replica() else: - # _get_closest() returns a Variable. - return self._get_closest().value() + # _get_on_device_or_primary() returns a Variable. + return self._get_on_device_or_primary().value() def _get_cross_replica(self): if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 9732ea04f26..1fe8a8c729b 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -768,7 +768,7 @@ class Optimizer( # pylint: enable=protected-access mirrored_slot = named_slots.get(key, None) if mirrored_slot is None: return None - return mirrored_slot._get_closest() # pylint: disable=protected-access + return mirrored_slot._get_on_device_or_primary() # pylint: disable=protected-access return named_slots.get(_var_key(var), None) From 756e66db61ec5b0a642be7381f65cc87d4e64802 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 15:03:26 -0700 Subject: [PATCH 143/557] Modify signature of layout_config(). PiperOrigin-RevId: 312161403 Change-Id: I9304d4839f6bcea6804dd959b131ffac7c0be6d6 --- tensorflow/compiler/xla/service/hlo_module_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index 833d0fe59d0..964f83322a4 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -204,7 +204,7 @@ class HloModuleConfig { std::vector>* mutable_dot_config() { return &dot_config_; } - absl::Span>> layout_config() const { + const std::vector>>& layout_config() const { return layout_config_; } From 1a07ecf8526bca5748bf447b16586b60889cdc36 Mon Sep 17 00:00:00 2001 From: Xiao Yu Date: Mon, 18 May 2020 15:08:28 -0700 Subject: [PATCH 144/557] In TF-TFRT integration, C API will get dtype from underlying fallback tensor directly if the tfrt dtype is Unsupported. This is used to support dtypes that are not natively implemented in TFRT (e.g. DT_RESOURCE). Enable a few resnet50 tests. PiperOrigin-RevId: 312162457 Change-Id: Iece6d621120e8b20d0a0fe7b271a76dc29caa924 --- .../python/eager/benchmarks/resnet50/resnet50_test.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py b/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py index 9d049a6d59d..34ceb56d129 100644 --- a/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py +++ b/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py @@ -104,7 +104,6 @@ class ResNet50Test(tf.test.TestCase): context.async_wait() self.assertEqual((2, 1000), output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply(self): self._apply(defun=False) @@ -121,7 +120,6 @@ class ResNet50Test(tf.test.TestCase): def test_apply_with_defun_async(self): self._apply(defun=True, execution_mode=context.ASYNC) - @test_util.disable_tfrt('b/155260334') def test_apply_no_top(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50(data_format, include_top=False) @@ -132,7 +130,6 @@ class ResNet50Test(tf.test.TestCase): if data_format == 'channels_first' else (2, 1, 1, 2048)) self.assertEqual(output_shape, output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_with_pooling(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50(data_format, include_top=False, pooling='avg') @@ -141,7 +138,6 @@ class ResNet50Test(tf.test.TestCase): output = model(images, training=False) self.assertEqual((2, 2048), output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_no_average_pooling(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50( @@ -153,7 +149,6 @@ class ResNet50Test(tf.test.TestCase): (2, 7, 7, 2048)) self.assertEqual(output_shape, output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_block3_strides(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50( @@ -165,7 +160,6 @@ class ResNet50Test(tf.test.TestCase): (2, 1, 1, 2048)) self.assertEqual(output_shape, output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_retrieve_intermediates(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50( @@ -220,7 +214,6 @@ class ResNet50Test(tf.test.TestCase): self.assertEqual(len(events), 2) self.assertEqual(events[1].summary.value[0].tag, 'loss') - @test_util.disable_tfrt('b/155260334') def test_train(self): self._test_train() @@ -228,7 +221,6 @@ class ResNet50Test(tf.test.TestCase): def test_train_async(self): self._test_train(execution_mode=context.ASYNC) - @test_util.disable_tfrt('b/155260334') def test_no_garbage(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50(data_format) From 3c54ef5ab94813713ae538b76a78e1fac4ac424d Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Mon, 18 May 2020 15:17:54 -0700 Subject: [PATCH 145/557] Support running a tf.function with packed variable inputs both locally and remotely. - Support packing multiple EagerTensors of the same dtype and shape. - Create CompositeDevices on the same task as the local host CPU, in order to correctly trigger packed TensorHandle copy from a client to a remote worker. PiperOrigin-RevId: 312164194 Change-Id: Ia15718309c8c68eb645bfe0bf967ddd6d2551b3a --- .../core/common_runtime/composite_device.cc | 12 ++-- .../core/common_runtime/composite_device.h | 5 +- .../common_runtime/composite_device_test.cc | 11 ++-- .../core/common_runtime/eager/context.cc | 7 ++- .../core/common_runtime/eager/context_test.cc | 12 ++-- .../common_runtime/eager/execute_node_test.cc | 3 +- .../eager/tensor_handle_test.cc | 3 + .../process_function_library_runtime_test.cc | 3 +- tensorflow/python/eager/backprop.py | 13 ++++ tensorflow/python/eager/context.py | 16 +++++ tensorflow/python/eager/function_test.py | 37 ++++++++++++ tensorflow/python/eager/pywrap_tensor.cc | 15 ++++- tensorflow/python/eager/pywrap_tfe.h | 3 +- tensorflow/python/eager/remote_test.py | 31 ++++++++++ tensorflow/python/framework/ops.py | 59 +++++++++++++++++++ tensorflow/python/framework/ops_test.py | 47 +++++++++++++++ tensorflow/python/tfe_wrapper.cc | 20 +++++++ 17 files changed, 274 insertions(+), 23 deletions(-) diff --git a/tensorflow/core/common_runtime/composite_device.cc b/tensorflow/core/common_runtime/composite_device.cc index 3103fa37941..7fd41e00a04 100644 --- a/tensorflow/core/common_runtime/composite_device.cc +++ b/tensorflow/core/common_runtime/composite_device.cc @@ -24,7 +24,7 @@ const char* const kCompositeDeviceType = "COMPOSITE"; std::unique_ptr CompositeDevice::MakeDevice( const std::vector& underlying_devices, const int unique_device_id, - Status* status) { + const DeviceNameUtils::ParsedName& host_name, Status* status) { if (underlying_devices.empty()) { status->Update( errors::InvalidArgument("underlying_devices should not be empty.")); @@ -62,13 +62,15 @@ std::unique_ptr CompositeDevice::MakeDevice( return nullptr; } } + + DeviceNameUtils::ParsedName parsed_composite_name = host_name; DeviceAttributes device_attributes; - parsed_name.type = kCompositeDeviceType; - device_attributes.set_device_type(parsed_name.type); - parsed_name.id = unique_device_id; + parsed_composite_name.type = kCompositeDeviceType; + parsed_composite_name.id = unique_device_id; const string composite_name = - DeviceNameUtils::ParsedNameToString(parsed_name); + DeviceNameUtils::ParsedNameToString(parsed_composite_name); device_attributes.set_name(composite_name); + device_attributes.set_device_type(kCompositeDeviceType); return absl::WrapUnique( new CompositeDevice(device_attributes, underlying_devices)); diff --git a/tensorflow/core/common_runtime/composite_device.h b/tensorflow/core/common_runtime/composite_device.h index 127e5b8303a..850eae55e8d 100644 --- a/tensorflow/core/common_runtime/composite_device.h +++ b/tensorflow/core/common_runtime/composite_device.h @@ -42,10 +42,11 @@ class CompositeDevice : public Device { return &underlying_devices_; } - // Helper for creating a CompositeDevice. + // Helper for creating a CompositeDevice on the same task as the given host + // CPU. static std::unique_ptr MakeDevice( const std::vector& underlying_devices, const int unique_device_id, - Status* status); + const DeviceNameUtils::ParsedName& host_name, Status* status); private: CompositeDevice(const DeviceAttributes& device_attributes, diff --git a/tensorflow/core/common_runtime/composite_device_test.cc b/tensorflow/core/common_runtime/composite_device_test.cc index ac2f9108ecb..73a6ae44912 100644 --- a/tensorflow/core/common_runtime/composite_device_test.cc +++ b/tensorflow/core/common_runtime/composite_device_test.cc @@ -20,12 +20,15 @@ limitations under the License. namespace tensorflow { TEST(CompositeDeviceTest, Basic) { + const string host_name = "/job:localhost/replica:0/task:0/device:CPU:0"; + DeviceNameUtils::ParsedName parsed_host_name; + EXPECT_TRUE(DeviceNameUtils::ParseFullName(host_name, &parsed_host_name)); std::vector underlying_devices; { Status status; std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/0, - &status); + parsed_host_name, &status); EXPECT_EQ(composite_device, nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, status.code()); EXPECT_TRUE(absl::StrContains(status.error_message(), @@ -41,7 +44,7 @@ TEST(CompositeDeviceTest, Basic) { "/job:localhost/replica:0/task:0/device:CPU:1"); std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/0, - &status); + parsed_host_name, &status); TF_ASSERT_OK(status); EXPECT_EQ(composite_device->device_type(), kCompositeDeviceType); EXPECT_EQ(underlying_devices, *composite_device->underlying_devices()); @@ -53,7 +56,7 @@ TEST(CompositeDeviceTest, Basic) { "/job:localhost/replica:0/task:0/device:CPU:0"); std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/1, - &status); + parsed_host_name, &status); EXPECT_EQ(composite_device, nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, status.code()); EXPECT_TRUE( @@ -68,7 +71,7 @@ TEST(CompositeDeviceTest, Basic) { "/job:localhost/replica:0/task:0/device:GPU:0"); std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/1, - &status); + parsed_host_name, &status); EXPECT_EQ(composite_device, nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, status.code()); EXPECT_TRUE(absl::StrContains(status.error_message(), diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index b8dfe92aac6..207c6a02d5b 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -935,8 +935,11 @@ Status EagerContext::FindOrCreateCompositeDevice( } Status s; - auto device = CompositeDevice::MakeDevice(underlying_devices, - composite_devices_.size(), &s); + // Create a CompositeDevice on the same task as the host CPU, in order to + // trigger packed TensorHandle copy from a client to a remote worker. + auto device = + CompositeDevice::MakeDevice(underlying_devices, composite_devices_.size(), + HostCPU()->parsed_name(), &s); TF_RETURN_IF_ERROR(s); *composite_device = device.get(); pflr_->AddCompositeDevice(*composite_device); diff --git a/tensorflow/core/common_runtime/eager/context_test.cc b/tensorflow/core/common_runtime/eager/context_test.cc index f83e3f0b45d..c6ed61c80c4 100644 --- a/tensorflow/core/common_runtime/eager/context_test.cc +++ b/tensorflow/core/common_runtime/eager/context_test.cc @@ -31,7 +31,7 @@ static Device* CreateDevice(const string& type, int n) { Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; } }; DeviceAttributes attr; - attr.set_name("/job:a/replica:0/task:0/device:" + type + ":" + + attr.set_name("/job:localhost/replica:0/task:0/device:" + type + ":" + std::to_string(n)); attr.set_device_type(type); return new FakeDevice(attr); @@ -179,10 +179,10 @@ TEST_F(EagerContextTest, CompositeDevice) { TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, &composite_device_0)); EXPECT_EQ(composite_device_0->name(), - "/job:worker/replica:0/task:0/device:COMPOSITE:0"); + "/job:localhost/replica:0/task:0/device:COMPOSITE:0"); CompositeDevice* device = nullptr; TF_EXPECT_OK(context()->FindCompositeDeviceFromName( - "/job:worker/replica:0/task:0/device:COMPOSITE:0", &device)); + "/job:localhost/replica:0/task:0/device:COMPOSITE:0", &device)); EXPECT_EQ(device, composite_device_0); CompositeDevice* composite_device_1 = nullptr; TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, @@ -193,13 +193,13 @@ TEST_F(EagerContextTest, CompositeDevice) { TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, &composite_device_2)); EXPECT_EQ(composite_device_2->name(), - "/job:worker/replica:0/task:0/device:COMPOSITE:1"); + "/job:localhost/replica:0/task:0/device:COMPOSITE:1"); TF_EXPECT_OK(context()->FindCompositeDeviceFromName( - "/job:worker/replica:0/task:0/device:COMPOSITE:1", &device)); + "/job:localhost/replica:0/task:0/device:COMPOSITE:1", &device)); EXPECT_EQ(device, composite_device_2); EXPECT_TRUE(errors::IsNotFound(context()->FindCompositeDeviceFromName( - "/job:worker/replica:0/task:0/device:COMPOSITE:2", &device))); + "/job:localhost/replica:0/task:0/device:COMPOSITE:2", &device))); } } // namespace diff --git a/tensorflow/core/common_runtime/eager/execute_node_test.cc b/tensorflow/core/common_runtime/eager/execute_node_test.cc index 99f030322df..83fbcf5017e 100644 --- a/tensorflow/core/common_runtime/eager/execute_node_test.cc +++ b/tensorflow/core/common_runtime/eager/execute_node_test.cc @@ -61,7 +61,8 @@ TEST(ExecuteNodeTest, ExecuteNodeArgs) { Status s; std::unique_ptr composite_device = CompositeDevice::MakeDevice({device0->name(), device1->name()}, - /*unique_device_id=*/0, &s); + /*unique_device_id=*/0, + device_mgr.HostCPU()->parsed_name(), &s); TF_ASSERT_OK(s); auto ctx = new EagerContext( diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc index 779158375de..13b634bbec4 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc @@ -100,6 +100,7 @@ class PackedTensorHandleTest : public ::testing::Test { for (const char* name : device_names_) { devices.emplace_back(CreateDevice("GPU", name)); } + devices.emplace_back(CreateDevice("CPU", host_name_)); device_mgr_ = new StaticDeviceMgr(std::move(devices)); context_ = new EagerContext( @@ -132,6 +133,8 @@ class PackedTensorHandleTest : public ::testing::Test { "/job:worker/replica:0/task:1/device:GPU:0", "/job:worker/replica:0/task:1/device:GPU:1"}; + const char* host_name_ = "/job:worker/replica:0/task:0/device:CPU:0"; + StaticDeviceMgr* device_mgr_; EagerContext* context_; }; diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc index 247b94dc58c..5bdb4601d37 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc @@ -820,7 +820,8 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_CompositeDevice) { Status s; std::unique_ptr composite_device = CompositeDevice::MakeDevice({device0_->name(), device1_->name()}, - /*unique_device_id=*/0, &s); + /*unique_device_id=*/0, + device_mgr_->HostCPU()->parsed_name(), &s); TF_ASSERT_OK(s); AddCompositeDevice(composite_device.get()); diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index fb7c4055136..7a3dce7db4e 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -241,6 +241,11 @@ def implicit_val_and_grad(f): "function was being computed.") sources = [v.handle for v in variables] + for s in sources: + if getattr(s, "is_packed", False): + raise ValueError( + "GradientTape.gradient is not supported on packed EagerTensors yet." + ) grad = imperative_grad.imperative_grad(this_tape, nest.flatten(end_node), sources) return end_node, list(zip(grad, variables)) @@ -548,6 +553,10 @@ def make_vjp(f, params=None, persistent=True): ] args = _ensure_unique_tensor_objects(parameter_positions, args) for i in parameter_positions: + if getattr(args[i], "is_packed", False): + raise ValueError( + "GradientTape.gradient is not supported on packed EagerTensors" + "yet.") sources.append(args[i]) tape.watch(this_tape, args[i]) result = f(*args) @@ -1032,6 +1041,10 @@ class GradientTape(object): logging.WARN, "The dtype of the source tensor must be " "floating (e.g. tf.float32) when calling GradientTape.gradient, " "got %r", t.dtype) + if getattr(t, "is_packed", False): + raise ValueError( + "GradientTape.gradient is not supported on packed EagerTensors yet." + ) if output_gradients is not None: output_gradients = [None if x is None else ops.convert_to_tensor(x) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 86b3d5cf95f..604a960afd5 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -1123,6 +1123,22 @@ class Context(object): pywrap_tfe.TFE_Py_RegisterCustomDevice(self._handle, device_capsule, device_name, device_info_capsule) + def pack_eager_tensors(self, tensors): + """Pack multiple `EagerTensor`s of the same dtype and shape. + + Args: + tensors: a list of EagerTensors to pack. + + Returns: + A packed EagerTensor. + """ + self.ensure_initialized() + if self._lazy_remote_inputs_copy is not None and ( + not self._lazy_remote_inputs_copy): + raise ValueError("Packing eager tensors is not supported when " + "lazy_remote_inputs_copy is disabled.") + return pywrap_tfe.TFE_Py_PackEagerTensors(self._handle, tensors) + def remove_function(self, name): """Remove a function from the context. diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 4e68f1460d9..078ca8b8878 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -186,6 +186,43 @@ class FunctionTest(test.TestCase, parameterized.TestCase): with self.assertRaisesRegexp(AttributeError, 'no attribute'): add(c) + def testPackedVariable(self): + with ops.device('/cpu:0'): + v0_0 = resource_variable_ops.ResourceVariable(1.0) + with ops.device('/cpu:1'): + v0_1 = resource_variable_ops.ResourceVariable(2.0) + v1_0 = resource_variable_ops.ResourceVariable(3.0) + with ops.device('/cpu:2'): + v1_1 = resource_variable_ops.ResourceVariable(4.0) + + packed_var_0 = ops.pack_eager_tensors([v0_0.handle, v0_1.handle]) + packed_var_1 = ops.pack_eager_tensors([v1_0.handle, v1_1.handle]) + + # TODO(b/145922293): use ResourceVariable.assign_add and + # ResourceVariable.read_value directly once we support packing multiple + # ResourceVariable into one ResourceVariable. + @def_function.function + def read_var(): + resource_variable_ops.assign_add_variable_op( + packed_var_0, constant_op.constant(5.0)) + resource_variable_ops.assign_add_variable_op( + packed_var_1, constant_op.constant(6.0)) + with ops.device('/cpu:0'): + read0 = resource_variable_ops.read_variable_op( + packed_var_0, dtype=dtypes.float32) + with ops.device('/cpu:1'): + read1 = resource_variable_ops.read_variable_op( + packed_var_0, dtype=dtypes.float32) + read2 = resource_variable_ops.read_variable_op( + packed_var_1, dtype=dtypes.float32) + with ops.device('/cpu:2'): + read3 = resource_variable_ops.read_variable_op( + packed_var_1, dtype=dtypes.float32) + + return read0, read1, read2, read3 + + self.assertAllEqual(read_var(), (1 + 5, 2 + 5, 3 + 6, 4 + 6)) + def testImplementsAttributeBasic(self): v = def_function.function( experimental_implements='func')(lambda x, y: x + y) diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index a72f74b38b8..b209ddb6162 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -345,6 +345,8 @@ typedef struct EagerTensor { char unused[kMaxEagerTensorParentSize]; TFE_TensorHandle* handle; int64_t id; + // Indicates whether it's a packed tensor or not. + bool is_packed; // This mirrors tensorflow.core.framework.ops.Tensor._handle_data Which will // be None for tensors of type other than DT_RESOURCE. For DT_RESOURCE // tensors, this will contain a serialized HandleData proto with shape @@ -418,6 +420,7 @@ bool MaybeInvokeCreatedOnEagerTensorProfiler(EagerTensor* created_tensor) { int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { self->id = get_uid(); self->handle = nullptr; + self->is_packed = false; Py_INCREF(Py_None); self->handle_data = Py_None; Py_INCREF(Py_None); @@ -647,6 +650,11 @@ static PyObject* EagerTensor_backing_device(EagerTensor* self) { #endif } +// Getter `is_packed`. +static PyObject* EagerTensor_is_packed(EagerTensor* self) { + return PyBool_FromLong(self->is_packed); +} + static PyGetSetDef EagerTensor_getsetters[] = { {const_cast("_id"), (getter)EagerTensor_getid, nullptr, const_cast("Tensor ID."), nullptr}, @@ -655,6 +663,9 @@ static PyGetSetDef EagerTensor_getsetters[] = { {const_cast("backing_device"), (getter)EagerTensor_backing_device, nullptr, const_cast("Device on which tensor's memory is resident."), nullptr}, + {const_cast("is_packed"), (getter)EagerTensor_is_packed, nullptr, + const_cast("Whether the EagerTensor is a packed tensor or not."), + nullptr}, {const_cast("_handle_data"), (getter)EagerTensor_handle_data, (setter)EagerTensor_sethandle_data, const_cast("Shape/DType data if the EagerTensor is a DT_RESOURCE"), @@ -813,7 +824,8 @@ TFE_TensorHandle* EagerTensor_Handle(const PyObject* o) { return reinterpret_cast(o)->handle; } -PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) { +PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle, + const bool is_packed) { if (handle == nullptr) { return nullptr; } @@ -821,6 +833,7 @@ PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) { EagerTensorType->tp_new(EagerTensorType, EmptyTuple(), EmptyDict())); if (t != nullptr) { t->id = get_uid(); + t->is_packed = is_packed; Py_INCREF(Py_None); t->handle_data = Py_None; Py_INCREF(Py_None); diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h index 92a0a200e3d..a5c9c181539 100755 --- a/tensorflow/python/eager/pywrap_tfe.h +++ b/tensorflow/python/eager/pywrap_tfe.h @@ -129,7 +129,8 @@ void TFE_DeleteContextCapsule(PyObject* context); bool EagerTensor_CheckExact(const PyObject* o); // Helper function to construct a new EagerTensor from a TFE_TensorHandle. -PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle); +PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle, + const bool is_packed = false); // Extracts the handle inside EagerTensor object `o`. Returns nullptr on error. TFE_TensorHandle* EagerTensor_Handle(const PyObject* o); diff --git a/tensorflow/python/eager/remote_test.py b/tensorflow/python/eager/remote_test.py index 32fe6372f77..710e7bf5f9d 100644 --- a/tensorflow/python/eager/remote_test.py +++ b/tensorflow/python/eager/remote_test.py @@ -40,6 +40,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables from tensorflow.python.training import server_lib from tensorflow.python.training.server_lib import ClusterSpec @@ -324,6 +325,36 @@ class MultiWorkersTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(remote_function(constant_op.constant([1.0])), [3.0]) + def testMultiDeviceFunctionWithPackedVariable(self): + with ops.device('/job:worker/replica:0/task:0/device:CPU:0'): + var0 = resource_variable_ops.ResourceVariable(1.0) + with ops.device('/job:worker/replica:0/task:1/device:CPU:0'): + var1 = resource_variable_ops.ResourceVariable(2.0) + + packed_var = ops.pack_eager_tensors([var0.handle, var1.handle]) + self.assertEqual(packed_var.device, + '/job:localhost/replica:0/task:0/device:COMPOSITE:0') + self.assertEqual(packed_var.backing_device, + '/job:localhost/replica:0/task:0/device:COMPOSITE:0') + + @def_function.function + def add_variables(): + with ops.device('/job:worker/replica:0/task:0/device:CPU:0'): + read0 = resource_variable_ops.read_variable_op( + packed_var, dtype=dtypes.float32) + with ops.device('/job:worker/replica:0/task:1/device:CPU:0'): + read1 = resource_variable_ops.read_variable_op( + packed_var, dtype=dtypes.float32) + + return read0 + read1 + + # Run the function on a remote device + with ops.device('/job:worker/replica:0/task:0'): + self.assertAllEqual(add_variables().numpy(), 3.0) + + # Run the function on a local worker + self.assertAllEqual(add_variables().numpy(), 3.0) + @test_util.eager_lazy_remote_copy_on_and_off def testMultiDeviceFunctionOnRemoteDeviceWithWait(self): with ops.device('/job:worker/replica:0/task:1'): diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 43652d51eae..5b6dac5be34 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -1394,6 +1394,65 @@ def _error_prefix(name): return "" if name is None else "%s: " % name +def pack_eager_tensors(tensors, ctx=None): + """Pack multiple `EagerTensor`s of the same dtype and shape. + + Args: + tensors: a list of EagerTensors to pack. + ctx: context.context(). + + Returns: + A packed EagerTensor. + """ + if not isinstance(tensors, list): + raise TypeError("tensors must be a list or a tuple: %s" % tensors) + + if not tensors: + raise ValueError("Empty tensors is unexpected for packing.") + + dtype = tensors[0].dtype + shape = tensors[0].shape + handle_data = tensors[0]._handle_data # pylint: disable=protected-access + is_resource = dtype == dtypes.resource + for i in range(len(tensors)): + t = tensors[i] + if not isinstance(t, EagerTensor): + raise TypeError("tensors must be a list of EagerTensors: %s" % t) + + if t.dtype != dtype: + raise ValueError( + "All tensors being packed should have the same dtype %s, " + "but the %d-th tensor is of dtype %s" % (dtype, i, t.dtype)) + if t.shape != shape: + raise ValueError( + "All tensors being packed should have the same shape %s, " + "but the %d-th tensor is of shape %s" % (shape, i, t.shape)) + # pylint: disable=protected-access + if is_resource and t._handle_data != handle_data: + raise ValueError( + "All tensors being packed should have the same handle data %s, " + "but the %d-th tensor is of handle data %s" % + (handle_data, i, t._handle_data)) + # pylint: enable=protected-access + + if ctx is None: + ctx = context.context() + + # Propogate handle data for resource variables + packed_tensor = ctx.pack_eager_tensors(tensors) + if handle_data is not None: + packed_tensor._handle_data = handle_data # pylint: disable=protected-access + + def grad_fun(_): + raise ValueError( + "Gradients through pack_eager_tensors are not supported yet.") + + tape.record_operation("pack_eager_tensors", [packed_tensor], tensors, + grad_fun) + + return packed_tensor + + def convert_to_tensor(value, dtype=None, name=None, diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 322df8ffac8..11193155999 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -34,6 +34,7 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import function as eager_function from tensorflow.python.eager import wrap_function +from tensorflow.python.framework import config from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import constant_op from tensorflow.python.framework import device as pydev @@ -3408,5 +3409,51 @@ class CustomConvertToCompositeTensorTest(test_util.TensorFlowTestCase): self.assertAllEqual(x_, tensor_util.constant_value(y_)) +@test_util.disable_tfrt("Packing EagerTensors is not supported yet.") +class PackEagerTensorTest(test_util.TensorFlowTestCase): + + def setUp(self): + super(PackEagerTensorTest, self).setUp() + context._reset_context() + cpus = config.list_physical_devices("CPU") + # Set 2 virtual CPUs + config.set_logical_device_configuration(cpus[0], [ + context.LogicalDeviceConfiguration(), + context.LogicalDeviceConfiguration(), + ]) + + def testPack(self): + with context.eager_mode(): + with ops.device("CPU:0"): + var0 = resource_variable_ops.ResourceVariable(1.0) + c0 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + with ops.device("CPU:1"): + var1 = resource_variable_ops.ResourceVariable(2.0) + var2 = resource_variable_ops.ResourceVariable([3.0]) + c1 = constant_op.constant([9.0]) + + packed_var0 = ops.pack_eager_tensors([var0.handle, var1.handle]) + self.assertTrue(packed_var0.is_packed) + self.assertEqual(packed_var0.dtype, var0.handle.dtype) + self.assertEqual(packed_var0.shape, var0.handle.shape) + self.assertEqual(packed_var0._handle_data, var0.handle._handle_data) + self.assertIn("COMPOSITE:0", packed_var0.device) + self.assertIn("COMPOSITE:0", packed_var0.backing_device) + with self.assertRaises(errors.InvalidArgumentError): + packed_var0.numpy() + + # Different dtypes + with self.assertRaises(ValueError): + ops.pack_eager_tensors([var0.handle, c1]) + + # Different shapes + with self.assertRaises(ValueError): + ops.pack_eager_tensors([c0, c1]) + + # Different handle data + with self.assertRaises(ValueError): + ops.pack_eager_tensors([var0.handle, var2.handle]) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/tfe_wrapper.cc b/tensorflow/python/tfe_wrapper.cc index 836cafbd494..efcd912f430 100644 --- a/tensorflow/python/tfe_wrapper.cc +++ b/tensorflow/python/tfe_wrapper.cc @@ -210,6 +210,22 @@ TFE_OutputTensorHandles InputTFE_OutputTensorHandles( return output_tensor_handles; } +// Packs multiple `EagerTensor`s of the same dtype and shape into one +// `EagerTensor`. +py::object TFE_Py_PackEagerTensors_wrapper(const py::handle& context, + const py::handle& tensors) { + TFE_Context* ctx = tensorflow::InputTFE_Context(context); + TFE_InputTensorHandles handles = InputTFE_InputTensorHandles(tensors); + tensorflow::Safe_TF_StatusPtr status = tensorflow::make_safe(TF_NewStatus()); + int size = handles.size(); + TFE_TensorHandle* packed_handle = + TFE_CreatePackedTensorHandle(ctx, handles.data(), &size, status.get()); + tensorflow::MaybeRaiseRegisteredFromTFStatus(status.get()); + PyObject* packed_tensor = + EagerTensorFromHandle(packed_handle, /*is_packed=*/true); + return tensorflow::PyoOrThrow(packed_tensor); +} + // This function was created from fusing the typemap logic in platform/base.i. py::object TFE_Py_ExecuteCancelable_wrapper( const py::handle& context, const char* device_name, const char* op_name, @@ -558,6 +574,10 @@ PYBIND11_MODULE(_pywrap_tfe, m) { m.def("TFE_Py_InitEagerTensor", [](const py::handle& o) { return tensorflow::PyoOrThrow(TFE_Py_InitEagerTensor(o.ptr())); }); + m.def("TFE_Py_PackEagerTensors", + [](const py::handle& context, const py::handle& handles) { + return tensorflow::TFE_Py_PackEagerTensors_wrapper(context, handles); + }); m.def("TFE_Py_SetEagerTensorProfiler", &TFE_Py_SetEagerTensorProfiler); m.def("TFE_Py_RegisterJVPFunction", [](const py::handle& o) { return tensorflow::PyoOrThrow(TFE_Py_RegisterJVPFunction(o.ptr())); From 4001e3dad3c6340b0c2001d89b3954f189e9aeb5 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Mon, 18 May 2020 15:22:44 -0700 Subject: [PATCH 146/557] Updates GPU delegate documentation with experimental quant support PiperOrigin-RevId: 312165090 Change-Id: I8fb624f71101fce6a379ed24f6002f8f4b60245d --- tensorflow/lite/g3doc/performance/gpu.md | 2 +- .../lite/g3doc/performance/gpu_advanced.md | 189 ++++++++---------- .../g3doc/performance/model_optimization.md | 6 +- 3 files changed, 84 insertions(+), 113 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md index 8762afb4c83..b5abf46f845 100644 --- a/tensorflow/lite/g3doc/performance/gpu.md +++ b/tensorflow/lite/g3doc/performance/gpu.md @@ -31,7 +31,7 @@ models. For a step-by-step tutorial, watch the [GPU Delegate for Android](https://youtu.be/Xkhgre8r5G0) video. -Note: This requires OpenGL ES 3.1 or higher. +Note: This requires OpenCL or OpenGL ES (3.1 or higher). #### Step 1. Clone the TensorFlow source code and open it in Android Studio diff --git a/tensorflow/lite/g3doc/performance/gpu_advanced.md b/tensorflow/lite/g3doc/performance/gpu_advanced.md index 9f47c2e55e8..dce3eb8db6b 100644 --- a/tensorflow/lite/g3doc/performance/gpu_advanced.md +++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md @@ -1,9 +1,9 @@ # TensorFlow Lite on GPU [TensorFlow Lite](https://www.tensorflow.org/mobile/tflite/) supports several -hardware accelerators. This document describes how to use the GPU backend using -the TensorFlow Lite delegate APIs on Android (requires OpenGL ES 3.1 or higher) -and iOS (requires iOS 8 or later). +hardware accelerators. This document describes how to use the GPU backend using +the TensorFlow Lite delegate APIs on Android (requires OpenCL or OpenGL ES 3.1 +and higher) and iOS (requires iOS 8 or later). ## Benefits of GPU Acceleration @@ -35,25 +35,33 @@ power and generating less heat than the same task run on a CPU. TensorFlow Lite on GPU supports the following ops in 16-bit and 32-bit float precision: -* `ADD v1` -* `AVERAGE_POOL_2D v1` -* `CONCATENATION v1` -* `CONV_2D v1` -* `DEPTHWISE_CONV_2D v1-2` -* `FULLY_CONNECTED v1` -* `LOGISTIC v1` -* `MAX_POOL_2D v1` -* `MUL v1` -* `PAD v1` -* `PRELU v1` -* `RELU v1` -* `RELU6 v1` -* `RESHAPE v1` -* `RESIZE_BILINEAR v1` -* `SOFTMAX v1` -* `STRIDED_SLICE v1` -* `SUB v1` -* `TRANSPOSE_CONV v1` +* `ADD` +* `AVERAGE_POOL_2D` +* `CONCATENATION` +* `CONV_2D` +* `DEPTHWISE_CONV_2D v1-2` +* `EXP` +* `FULLY_CONNECTED` +* `LOGISTIC` +* `LSTM v2 (Basic LSTM only)` +* `MAX_POOL_2D` +* `MAXIMUM` +* `MINIMUM` +* `MUL` +* `PAD` +* `PRELU` +* `RELU` +* `RELU6` +* `RESHAPE` +* `RESIZE_BILINEAR v1-3` +* `SOFTMAX` +* `STRIDED_SLICE` +* `SUB` +* `TRANSPOSE_CONV` + +By default, all ops are only supported at version 1. Enabling the +[experimental quantization support](gpu_advanced.md#running-quantized-models-experimental-android-only) +allows the appropriate versions; for example, ADD v2. ## Basic Usage @@ -82,8 +90,8 @@ delegate.close(); ### Android (C/C++) For C/C++ usage of TensorFlow Lite GPU on Android, the GPU delegate can be -created with `TfLiteGpuDelegateCreate()` and destroyed with -`TfLiteGpuDelegateDelete()`. +created with `TfLiteGpuDelegateV2Create()` and destroyed with +`TfLiteGpuDelegateV2Delete()`. ```c++ // Set up interpreter. @@ -94,15 +102,7 @@ std::unique_ptr interpreter; InterpreterBuilder(*model, op_resolver)(&interpreter); // NEW: Prepare GPU delegate. -const TfLiteGpuDelegateOptions options = { - .metadata = NULL, - .compile_options = { - .precision_loss_allowed = 1, // FP16 - .preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST, - .dynamic_batch_enabled = 0, // Not fully functional yet - }, -}; -auto* delegate = TfLiteGpuDelegateCreate(&options); +auto* delegate = TfLiteGpuDelegateV2Create(/*default options=*/nullptr); if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; // Run inference. @@ -111,9 +111,13 @@ if (interpreter->Invoke() != kTfLiteOk) return false; ReadFromOutputTensor(interpreter->typed_output_tensor(0)); // NEW: Clean up. -TfLiteGpuDelegateDelete(delegate); +TfLiteGpuDelegateV2Delete(delegate); ``` +Take a look at `TfLiteGpuDelegateOptionsV2` to create a delegate instance with +custom options. You can initialize the default options with +`TfLiteGpuDelegateOptionsV2Default()` and then modify them as necessary. + TFLite GPU for Android C/C++ uses the [Bazel](https://bazel.io) build system. The delegate can be built, for example, using the following command: @@ -165,6 +169,43 @@ called. ## Advanced Usage +### Running quantized models (Experimental, Android only) + +The GPU delegate already supports +[float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant) +models. There is experimental support on Android to run 8-bit quantized as well. +This includes all flavors of quantization, including: + +* Models trained with + [Quantization-aware training](https://www.tensorflow.org/lite/convert/quantization) +* [Post-training dynamic-range quantization](https://www.tensorflow.org/lite/performance/post_training_quant) +* [Post-training full-integer quantization](https://www.tensorflow.org/lite/performance/post_training_integer_quant) + +To optimize performance, use models that have floating-point input & output +tensors. + +This feature can be enabled using delegate options as follows: + +**C++ API** + +```c++ +// NEW: Prepare custom options with feature enabled. +TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default(); +options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT; + +auto* delegate = TfLiteGpuDelegateV2Create(options); +if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; +``` + +**Java API** + +```java +// NEW: Prepare GPU delegate with feature turned on. +GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true)); + +Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate); +``` + ### Delegate Options for iOS `NewGpuDelegate()` accepts a `struct` of options. @@ -210,7 +251,7 @@ While it is convenient to use `nullptr`, we recommend that you explicitly set the options, to avoid any unexpected behavior if default values are changed in the future. -### Input/Output Buffers +### Input/Output Buffers (iOS only) To do computation on the GPU, data must be made available to the GPU. This often requires performing a memory copy. It is desirable not to cross the CPU/GPU @@ -229,80 +270,10 @@ To achieve best performance, TensorFlow Lite makes it possible for users to directly read from and write to the TensorFlow hardware buffer and bypass avoidable memory copies. -#### Android - -Assuming the image input is in the GPU memory, it must first be converted to an -OpenGL Shader Storage Buffer Object (SSBO). You can associate a TfLiteTensor to -a user-prepared SSBO with `Interpreter.bindGlBufferToTensor()`. Note that -`Interpreter.bindGlBufferToTensor()` must be called before -`Interpreter.modifyGraphWithDelegate()`. - -```java -// Ensure a valid EGL rendering context. -EGLContext eglContext = eglGetCurrentContext(); -if (eglContext.equals(EGL_NO_CONTEXT)) return false; - -// Create an SSBO. -int[] id = new int[1]; -glGenBuffers(id.length, id, 0); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]); -glBufferData(GL_SHADER_STORAGE_BUFFER, inputSize, null, GL_STREAM_COPY); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind -int inputSsboId = id[0]; - -// Create interpreter. -Interpreter interpreter = new Interpreter(tfliteModel); -Tensor inputTensor = interpreter.getInputTensor(0); -GpuDelegate gpuDelegate = new GpuDelegate(); -// The buffer must be bound before the delegate is installed. -gpuDelegate.bindGlBufferToTensor(inputTensor, inputSsboId); -interpreter.modifyGraphWithDelegate(gpuDelegate); - -// Run inference; the null input argument indicates use of the bound buffer for input. -fillSsboWithCameraImageTexture(inputSsboId); -float[] outputArray = new float[outputSize]; -interpreter.runInference(null, outputArray); -``` - -A similar approach can be applied to the output tensor. In that case, -`Interpreter.Options.setAllowBufferHandleOutput(true)` should be passed on, to -disable the default copying of the network's output from GPU memory to CPU -memory. - -```java -// Ensure a valid EGL rendering context. -EGLContext eglContext = eglGetCurrentContext(); -if (eglContext.equals(EGL_NO_CONTEXT)) return false; - -// Create a SSBO. -int[] id = new int[1]; -glGenBuffers(id.length, id, 0); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]); -glBufferData(GL_SHADER_STORAGE_BUFFER, outputSize, null, GL_STREAM_COPY); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind -int outputSsboId = id[0]; - -// Create interpreter. -Interpreter.Options options = (new Interpreter.Options()).setAllowBufferHandleOutput(true); -Interpreter interpreter = new Interpreter(tfliteModel, options); -Tensor outputTensor = interpreter.getOutputTensor(0); -GpuDelegate gpuDelegate = new GpuDelegate(); -// The buffer must be bound before the delegate is installed. -gpuDelegate.bindGlBufferToTensor(outputTensor, outputSsboId); -interpreter.modifyGraphWithDelegate(gpuDelegate); - -// Run inference; the null output argument indicates use of the bound buffer for output. -ByteBuffer input = getCameraImageByteBuffer(); -interpreter.runInference(input, null); -renderOutputSsbo(outputSsboId); -``` - -#### iOS - Assuming the image input is in GPU memory, it must first be converted to a `MTLBuffer` object for Metal. You can associate a TfLiteTensor to a -user-prepared `MTLBuffer` with `BindMetalBufferToTensor()`. Note that -`BindMetalBufferToTensor()` must be called before +user-prepared `MTLBuffer` with `TFLGpuDelegateBindMetalBufferToTensor()`. Note +that `TFLGpuDelegateBindMetalBufferToTensor()` must be called before `Interpreter::ModifyGraphWithDelegate()`. Additionally, the inference output is, by default, copied from GPU memory to CPU memory. This behavior can be turned off by calling `Interpreter::SetAllowBufferHandleOutput(true)` during @@ -312,8 +283,8 @@ initialization. // Prepare GPU delegate. auto* delegate = NewGpuDelegate(nullptr); interpreter->SetAllowBufferHandleOutput(true); // disable default gpu->cpu copy -if (!BindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false; -if (!BindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false; +if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false; +if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false; if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; // Run inference. diff --git a/tensorflow/lite/g3doc/performance/model_optimization.md b/tensorflow/lite/g3doc/performance/model_optimization.md index feb6cfecea6..c66b06f9b59 100644 --- a/tensorflow/lite/g3doc/performance/model_optimization.md +++ b/tensorflow/lite/g3doc/performance/model_optimization.md @@ -89,9 +89,9 @@ The following types of quantization are available in TensorFlow Lite: Technique | Data requirements | Size reduction | Accuracy | Supported hardware ------------------------------------------------------------------------------------------------------- | -------------------------------- | -------------- | --------------------------- | ------------------ [Post-training float16 quantization](post_training_float16_quant.ipynb) | No data | Up to 50% | Insignificant accuracy loss | CPU, GPU -[Post-training dynamic range quantization](post_training_quant.ipynb) | No data | Up to 75% | Accuracy loss | CPU -[Post-training integer quantization](post_training_integer_quant.ipynb) | Unlabelled representative sample | Up to 75% | Smaller accuracy loss | CPU, EdgeTPU, Hexagon DSP -[Quantization-aware training](http://www.tensorflow.org/model_optimization/guide/quantization/training) | Labelled training data | Up to 75% | Smallest accuracy loss | CPU, EdgeTPU, Hexagon DSP +[Post-training dynamic range quantization](post_training_quant.ipynb) | No data | Up to 75% | Accuracy loss | CPU, GPU (Android) +[Post-training integer quantization](post_training_integer_quant.ipynb) | Unlabelled representative sample | Up to 75% | Smaller accuracy loss | CPU, GPU (Android), EdgeTPU, Hexagon DSP +[Quantization-aware training](http://www.tensorflow.org/model_optimization/guide/quantization/training) | Labelled training data | Up to 75% | Smallest accuracy loss | CPU, GPU (Android), EdgeTPU, Hexagon DSP Below are the latency and accuracy results for post-training quantization and quantization-aware training on a few models. All latency numbers are measured on From f5c5747f134b3dfd42b1d546f1842aa2e1e70670 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Mon, 18 May 2020 15:29:57 -0700 Subject: [PATCH 147/557] Re-enable signal kernel tests on py38 PiperOrigin-RevId: 312166420 Change-Id: Ie18cf2e29d8a05d57675ce3e75b06509205a4e61 --- tensorflow/python/kernel_tests/signal/BUILD | 1 - .../python/kernel_tests/signal/test_util.py | 4 +--- .../kernel_tests/signal/window_ops_test.py | 17 ++++++++--------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD index adb12a5e850..bd893184570 100644 --- a/tensorflow/python/kernel_tests/signal/BUILD +++ b/tensorflow/python/kernel_tests/signal/BUILD @@ -149,7 +149,6 @@ cuda_py_tests( python_version = "PY3", shard_count = 4, tags = [ - "no_oss_py38", #TODO(b/151631881) "no_windows_gpu", ], deps = [ diff --git a/tensorflow/python/kernel_tests/signal/test_util.py b/tensorflow/python/kernel_tests/signal/test_util.py index 1e95fe4b28f..e8d477a843b 100644 --- a/tensorflow/python/kernel_tests/signal/test_util.py +++ b/tensorflow/python/kernel_tests/signal/test_util.py @@ -50,7 +50,7 @@ def grappler_optimize(graph, fetches=None, config_proto=None): return tf_optimizer.OptimizeGraph(config_proto, metagraph) -def tflite_convert(fn, input_templates, use_mlir=False): +def tflite_convert(fn, input_templates): """Converts the provided fn to tf.lite model. Args: @@ -59,7 +59,6 @@ def tflite_convert(fn, input_templates, use_mlir=False): input_templates: A list of Tensors, ndarrays or TensorSpecs describing the inputs that fn expects. The actual values of the Tensors or ndarrays are unused. - use_mlir: Experimental. Whether to use the tf.lite MLIR converter. Returns: The serialized tf.lite model. @@ -67,7 +66,6 @@ def tflite_convert(fn, input_templates, use_mlir=False): fn = def_function.function(fn) concrete_func = fn.get_concrete_function(*input_templates) converter = lite.TFLiteConverterV2([concrete_func]) - converter.experimental_new_converter = use_mlir return converter.convert() diff --git a/tensorflow/python/kernel_tests/signal/window_ops_test.py b/tensorflow/python/kernel_tests/signal/window_ops_test.py index 9f5fe6f64c7..9432e70c7f2 100644 --- a/tensorflow/python/kernel_tests/signal/window_ops_test.py +++ b/tensorflow/python/kernel_tests/signal/window_ops_test.py @@ -156,15 +156,14 @@ class WindowOpsTest(test.TestCase, parameterized.TestCase): self.assertLen(rewritten_graph.node, 1) @parameterized.parameters( - # Due to control flow, only MLIR is supported. # Only float32 is supported. - (window_ops.hann_window, 10, False, dtypes.float32, True), - (window_ops.hann_window, 10, True, dtypes.float32, True), - (window_ops.hamming_window, 10, False, dtypes.float32, True), - (window_ops.hamming_window, 10, True, dtypes.float32, True), - (window_ops.vorbis_window, 12, None, dtypes.float32, True)) - def test_tflite_convert(self, window_fn, window_length, periodic, dtype, - use_mlir): + (window_ops.hann_window, 10, False, dtypes.float32), + (window_ops.hann_window, 10, True, dtypes.float32), + (window_ops.hamming_window, 10, False, dtypes.float32), + (window_ops.hamming_window, 10, True, dtypes.float32), + (window_ops.vorbis_window, 12, None, dtypes.float32)) + def test_tflite_convert(self, window_fn, window_length, periodic, dtype): + def fn(window_length): try: return window_fn(window_length, periodic=periodic, dtype=dtype) @@ -172,7 +171,7 @@ class WindowOpsTest(test.TestCase, parameterized.TestCase): return window_fn(window_length, dtype=dtype) tflite_model = test_util.tflite_convert( - fn, [tensor_spec.TensorSpec(shape=[], dtype=dtypes.int32)], use_mlir) + fn, [tensor_spec.TensorSpec(shape=[], dtype=dtypes.int32)]) window_length = np.array(window_length).astype(np.int32) actual_output, = test_util.evaluate_tflite_model( tflite_model, [window_length]) From 94108993a3adc322b67d35244c8488ead4034dee Mon Sep 17 00:00:00 2001 From: Michael Gester Date: Mon, 18 May 2020 15:35:17 -0700 Subject: [PATCH 148/557] Allow static result shape for unranked operand in shape verifier Previously, a static result shape for an unranked operand produced an error in shape verifier. This was too restrictive because shape inference is often incomplete at this point. PiperOrigin-RevId: 312167322 Change-Id: Ia198f07699174a4ea3c77099c9408def95e058be --- tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc | 9 ++++++--- tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 78623ca3c61..69b8f15320f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -2603,9 +2603,12 @@ LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, << variadic_idx_str << " to match rank of operand" << variadic_idx_str; } else if (result_ranked_type.hasStaticShape()) { - // The operand is an unranked tensor, verify that the result is dynamic. - return op->emitOpError("requires dynamic shape result") - << variadic_idx_str << " for unranked operand" << variadic_idx_str; + // The operand is an unranked tensor, print a warning if the result + // is static. + // Note: We do not handle this situation as an error, this would be too + // restrictive due to incompleteness of shape inference at this point. + op->emitWarning("has static shape result") + << variadic_idx_str << " for unranked operand" << variadic_idx_str; } Type element_type = result_ranked_type.getElementType(); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index ffa287e0e53..3560fec7b7d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1326,7 +1326,7 @@ func @testShapeMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> tensor<2xi32> return %0 : tensor<2xi32> } @@ -1370,7 +1370,7 @@ func @testShapeNMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeNWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result #1 for unranked operand #1}} + // expected-warning @+1 {{has static shape result #1 for unranked operand #1}} %0:2 = "tf.ShapeN"(%arg0, %arg0) : (tensor<*xf32>, tensor<*xf32>) -> (tensor, tensor<2xi32>) return %0#1 : tensor<2xi32> } @@ -1428,7 +1428,7 @@ func @testVariableShapeMismatchDim(%arg0: tensor<*x!tf.resource>>) -> tensor<2xi32> { - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.VariableShape"(%arg0) {output = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>>) -> tensor<2xi32> return %0 : tensor<2xi32> } From 1acf6989bf72de324f61be20491a7c017a7da5c6 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Mon, 18 May 2020 15:51:05 -0700 Subject: [PATCH 149/557] Fix argument check tests to work in eager mode PiperOrigin-RevId: 312170271 Change-Id: Ie7ffb52cf63559255b5463d651eb72b924a3c3bf --- .../core/kernels/reverse_sequence_op.cc | 44 +++++----- .../kernel_tests/reverse_sequence_op_test.py | 83 +++++++++---------- tensorflow/python/ops/array_ops.py | 8 +- 3 files changed, 67 insertions(+), 68 deletions(-) diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc index 0e112133915..b5b62bc76ca 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.cc +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -43,9 +43,9 @@ typedef Eigen::GpuDevice GPUDevice; template void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); - const Tensor& seq_lens = context->input(1); + const Tensor& seq_lengths = context->input(1); - auto seq_lens_t = seq_lens.vec(); + auto seq_lens_t = seq_lengths.vec(); std::vector seq_lens_vec(seq_lens_t.size()); @@ -56,15 +56,16 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { OP_REQUIRES(context, batch_dim != seq_dim, errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); OP_REQUIRES(context, seq_dim < input.dims(), - errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + errors::InvalidArgument("seq_dim must be < input rank", " ( ", seq_dim, " vs. ", input.dims(), ")")); OP_REQUIRES(context, batch_dim < input.dims(), - errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + errors::InvalidArgument("batch_dim must be < input rank", " ( ", batch_dim, " vs. ", input.dims(), ")")); - OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), - errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, - "), ", "(", seq_lens.NumElements(), - " vs. ", input.dim_size(batch_dim), ")")); + OP_REQUIRES( + context, seq_lengths.NumElements() == input.dim_size(batch_dim), + errors::InvalidArgument("Length of seq_lengths != input.dims(", batch_dim, + "), ", "(", seq_lengths.NumElements(), " vs. ", + input.dim_size(batch_dim), ")")); for (size_t d = 0; d < seq_lens_vec.size(); ++d) { OP_REQUIRES(context, seq_lens_vec[d] >= 0, @@ -77,21 +78,22 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); - const Tensor& seq_lens = context->input(1); + const Tensor& seq_lengths = context->input(1); OP_REQUIRES(context, batch_dim != seq_dim, errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); OP_REQUIRES(context, seq_dim < input.dims(), - errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + errors::InvalidArgument("seq_dim must be < input rank", " ( ", seq_dim, " vs. ", input.dims(), ")")); OP_REQUIRES(context, batch_dim < input.dims(), - errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + errors::InvalidArgument("batch_dim must be < input rank", " ( ", batch_dim, " vs. ", input.dims(), ")")); - OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), - errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, - "), ", "(", seq_lens.NumElements(), - " vs. ", input.dim_size(batch_dim), ")")); + OP_REQUIRES( + context, seq_lengths.NumElements() == input.dim_size(batch_dim), + errors::InvalidArgument("Length of seq_lengths != input.dims(", batch_dim, + "), ", "(", seq_lengths.NumElements(), " vs. ", + input.dim_size(batch_dim), ")")); } template <> @@ -117,14 +119,14 @@ class ReverseSequenceOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); - const Tensor& seq_lens = context->input(1); + const Tensor& seq_lengths = context->input(1); // Preliminary validation of sizes. - OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens.shape()), - errors::InvalidArgument("seq_lens input must be 1-dim, not ", - seq_lens.dims())); + OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lengths.shape()), + errors::InvalidArgument("seq_lengths must be 1-dim, not ", + seq_lengths.dims())); - auto seq_lens_t = seq_lens.vec(); + auto seq_lens_t = seq_lengths.vec(); CheckErrors(context, batch_dim_, seq_dim_); if (!context->status().ok()) return; @@ -186,7 +188,7 @@ namespace functor { void ReverseSequence::Compute( \ const GPUDevice& d, typename TTypes::ConstTensor input, \ int32 batch_dim, int32 seq_dim, \ - typename TTypes::ConstVec seq_lens, \ + typename TTypes::ConstVec seq_lengths, \ typename TTypes::Tensor output); \ extern template struct ReverseSequence; diff --git a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py index 05307c9834a..267decff38b 100644 --- a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py +++ b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py @@ -19,10 +19,11 @@ from __future__ import division from __future__ import print_function import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker @@ -135,56 +136,52 @@ class ReverseSequenceTest(test.TestCase): print("ReverseSequence gradient error = %g" % err) self.assertLess(err, 1e-8) - @test_util.run_deprecated_v1 def testShapeFunctionEdgeCases(self): - t = array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=None), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(32,)), - batch_axis=0, - seq_axis=1) - self.assertIs(t.get_shape().ndims, None) + # Enter graph mode since we want to test partial shapes + with context.graph_mode(): + t = array_ops.reverse_sequence( + array_ops.placeholder(dtypes.float32, shape=None), + seq_lengths=array_ops.placeholder(dtypes.int64, shape=(32,)), + batch_axis=0, + seq_axis=1) + self.assertIs(t.get_shape().ndims, None) + def testInvalidArguments(self): # Batch size mismatched between input and seq_lengths. - with self.assertRaises(ValueError): - array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=(32, 2, 3)), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(33,)), - seq_axis=3) + # seq_length too long + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + (r"Dimensions must be equal|" + r"Length of seq_lengths != input.dims\(0\)")): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2, 2], seq_axis=1) + + # seq_length too short + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + (r"Dimensions must be equal|" + r"Length of seq_lengths != input.dims\(0\)")): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2], seq_axis=1) + + # Invalid seq_length shape + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + ("Shape must be rank 1 but is rank 2|" + "seq_lengths must be 1-dim")): + array_ops.reverse_sequence([[1, 2], [3, 4]], [[2, 2]], seq_axis=1) # seq_axis out of bounds. - with self.assertRaisesRegexp(ValueError, "seq_dim must be < input rank"): - array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=(32, 2, 3)), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(32,)), - seq_axis=3) + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + "seq_dim must be < input rank"): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2], seq_axis=2) # batch_axis out of bounds. - with self.assertRaisesRegexp(ValueError, "batch_dim must be < input rank"): - array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=(32, 2, 3)), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(32,)), - seq_axis=0, - batch_axis=3) + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + "batch_dim must be < input rank"): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2], + seq_axis=1, + batch_axis=3) - with self.cached_session(): - inputs = array_ops.placeholder(dtypes.float32, shape=(32, 2, 3)) - seq_lengths = array_ops.placeholder(dtypes.int64, shape=(32,)) - output = array_ops.reverse_sequence( - inputs, seq_lengths=seq_lengths, - seq_axis=0) # batch_axis default is 0 - with self.assertRaisesOpError("batch_dim == seq_dim"): - output.eval(feed_dict={ - inputs: np.random.rand(32, 2, 3), - seq_lengths: xrange(32) - }) + with self.assertRaisesRegexp((errors.OpError, errors.InvalidArgumentError), + "batch_dim == seq_dim == 0"): + output = array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2], seq_axis=0) + self.evaluate(output) if __name__ == "__main__": diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index a2640925a38..ce0755fc782 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -4473,8 +4473,8 @@ def reverse_sequence(input, dimension `seq_axis`. The elements of `seq_lengths` must obey `seq_lengths[i] <= - input.dims[seq_dim]`, and `seq_lengths` must be a vector of length - `input.dims[batch_dim]`. + input.dims[seq_axis]`, and `seq_lengths` must be a vector of length + `input.dims[batch_axis]`. The output slice `i` along dimension `batch_axis` is then given by input slice `i`, with the first `seq_lengths[i]` slices along @@ -4496,8 +4496,8 @@ def reverse_sequence(input, Args: input: A `Tensor`. The input to reverse. seq_lengths: A `Tensor`. Must be one of the following types: `int32`, - `int64`. 1-D with length `input.dims(batch_dim)` and `max(seq_lengths) <= - input.dims(seq_dim)` + `int64`. 1-D with length `input.dims(batch_axis)` and `max(seq_lengths) <= + input.dims(seq_axis)` seq_axis: An `int`. The dimension which is partially reversed. batch_axis: An optional `int`. Defaults to `0`. The dimension along which reversal is performed. From ad6e816328507f80c30d25d73b0c03219d339dd6 Mon Sep 17 00:00:00 2001 From: Hanhan Wang Date: Mon, 18 May 2020 16:06:46 -0700 Subject: [PATCH 150/557] Add lowering from xla_hlo/lhlo reverse op to Linalg. This is only supported for static shape. PiperOrigin-RevId: 312173157 Change-Id: Iab149f02153597ef5a967628397fcac9a4db1329 --- .../xla/tests/hlo-legalize-to-linalg.mlir | 13 ++++++++ .../xla/tests/lhlo-legalize-to-linalg.mlir | 13 ++++++++ .../xla/transforms/xla_legalize_to_linalg.cc | 30 +++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir index a856ee5e83c..a27bf2cff79 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir @@ -542,3 +542,16 @@ func @convert_f32_to_i32(%input: tensor<2x2xf32>) -> tensor<2x2xi32> { // CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32): // CHECK-NEXT: %[[RESULT:.*]] = fptosi %[[OPERAND_IN]] : f32 to i32 // CHECK-NEXT: linalg.yield %[[RESULT]] : i32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%input: tensor<2x3xf32>) -> tensor<2x3xf32> { + %result = "xla_hlo.reverse"(%input) { + dimensions = dense<1> : tensor<1xi64> + } : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %result : tensor<2x3xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir index bb8010b520c..626e905695c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir @@ -636,3 +636,16 @@ func @reshape_2D_4D(%arg0: memref<12x42xi32>, %arg1 : memref<12x1x42x1xi32>) { return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%arg0: memref<2x3xf32>, %arg1: memref<2x3xf32>) { + "xla_lhlo.reverse"(%arg0, %arg1) { + dimensions = dense<1> : tensor<1xi64> + } : (memref<2x3xf32>, memref<2x3xf32>) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index 799a20aa693..2b496677d62 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -573,6 +573,34 @@ class ConstConverter : public OpConversionPattern { } }; +// TODO(b/156787842): Support the lowering for dynamic shapes. +template +class ReverseConverter + : public DataMovementOpConverter, OpTy, + isLHLO> { + public: + using DataMovementOpConverter, OpTy, + isLHLO>::DataMovementOpConverter; + static ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) { + auto resultType = + getXLAOpResultType(op).template cast(); + auto nloops = resultType.getRank(); + SmallVector inputExprs; + inputExprs.reserve(nloops); + for (int i = 0; i < nloops; ++i) + inputExprs.push_back(b->getAffineDimExpr(i)); + for (auto dim : op.dimensions()) { + int i = dim.getZExtValue(); + if (resultType.isDynamicDim(i)) return {}; + int n = resultType.getShape()[i]; + inputExprs[i] = b->getAffineConstantExpr(n - 1) - inputExprs[i]; + } + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs, b->getContext()), + b->getMultiDimIdentityMap(nloops)}); + } +}; + class SliceConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -642,6 +670,7 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, + ReverseConverter, ScalarPointwiseToStandardConverter, SliceConverter >(context); @@ -742,6 +771,7 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, ReshapeOpConverter, + ReverseConverter, TransposeConverter>(context); } From ad6798a2f62ae2cb7f433af7b721bf14b9850dde Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Mon, 18 May 2020 17:01:57 -0700 Subject: [PATCH 151/557] [XLA] Fix alternate memory allocation of conditional operands. Consider the following flattened HLO schedule of a conditional: 1: a = fusion() true_computation: 2: parameter = parameter(0) 3: ... 4: ... false_computation: 5: parameter = parameter(0) 6: ... 7: ... 8: conditional = conditional(pred, a, a) 9: b = fusion(a) When we had a tensor that was a conditional operand (e.g. "a" in the example), we reserved the alternate memory for the entire 1-8 range. This meant that when we tried to allocate inside the called computations of the conditional, the offset we picked wasn't available since it would fall within the 1-8 range. This CL now reserves the conditional until the parameter of the earliest called computations (1-2 range). To allow efficient use of alternate memory by avoiding a very large conditional from claiming the offset for the entire called computation, the conditional operand might die within the called computation, allowing other HLOs inside the called computations to reclaim that alternate memory offset. This creates a subtlety for subsequent uses of conditional operands (e.g. "a" is used by a fusion at 9). These subsequent uses will force evictions (and then do another prefetch). After optimization, the graph might look like the following: a (Alternate Mem) = fusion() cs0 = copy-start(a) # Must evict a because the allocation may die within # called computation. cd0 (Default Mem) = copy-done(cs0) true_computation: parameter (Alternate Mem) = parameter(0) ... # parameter's alternate memory allocation may die here and another tensor # might use the same offset. false_computation: parameter (Alternate Mem) = parameter(0) ... # parameter's alternate memory allocation may die here and another tensor # might use the same offset. conditional = conditional(pred, a, a) cs1 = copy-start(cd0) # May prefetch the value back to alternate memory. cd1 (Alternate Mem) = copy-done(cs1) b = fusion(cd1) PiperOrigin-RevId: 312182824 Change-Id: I3ff5d019025ef96ced1aed4f6d170df677273348 --- .../xla/service/memory_space_assignment.cc | 296 ++++++++++++---- .../xla/service/memory_space_assignment.h | 18 +- .../service/memory_space_assignment_test.cc | 321 +++++++++++++++++- 3 files changed, 563 insertions(+), 72 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 431e6af2dc0..81a8a102402 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -502,7 +502,8 @@ bool AlternateMemoryBestFitHeap::IsIntervalAllowedInAlternateMemory( } bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( - const HloUse& use) const { + const AllocationValue& value, const HloUse& use) const { + const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); if (use.instruction->opcode() == HloOpcode::kWhile) { HloComputation* while_body = use.instruction->while_body(); @@ -512,7 +513,6 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( HloValue* parameter_value = &alias_analysis_.dataflow_analysis().GetUniqueValueAt( while_body->parameter_instruction(0), use.operand_index); - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); int64 parameter_time = instruction_schedule.at(while_body->parameter_instruction(0)); int64 root_time = instruction_schedule.at(while_body->root_instruction()); @@ -567,7 +567,54 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( "there is a required default memory assignment."; return false; } + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // For any use of this conditional (the same value might be passed into + // multiple called computations), determine if the parameter->first use + // dependency is short. + int64 conditional_time = instruction_schedule.at(use.instruction); + for (const HloUse& other_use : value.uses()) { + if (other_use.instruction != use.instruction) { + continue; + } + HloComputation* called_computation = + use.instruction->called_computations().at(other_use.operand_number - + 1); + const HloInstruction* parameter_instruction = + called_computation->parameter_instruction(0); + HloValue* parameter_value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt( + parameter_instruction, other_use.operand_index); + int64 parameter_time = instruction_schedule.at(parameter_instruction); + int64 min_use_time = conditional_time; + for (const HloUse& parameter_use : parameter_value->uses()) { + if (parameter_use.instruction->parent() == called_computation && + parameter_use.instruction->opcode() != + HloOpcode::kGetTupleElement && + parameter_use.instruction->opcode() != HloOpcode::kTuple && + parameter_use.instruction->opcode() != HloOpcode::kBitcast) { + min_use_time = std::min( + min_use_time, instruction_schedule.at(parameter_use.instruction)); + } + } + if (options_.prefetch_interval_picker->CanAllocateInAlternateMemoryNoCopy( + parameter_value->shape(), parameter_time, min_use_time)) { + VLOG(4) << "Conditional allocation allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + return true; + } else { + VLOG(4) << "Conditional allocation not allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + } + } + return false; } + return true; } @@ -769,20 +816,12 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { if (position.instruction->opcode() == HloOpcode::kConditional) { VLOG(3) << "Adding required assignment for condition output: " << value->ToShortString(); - required_assignments_[value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at(position.instruction), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(position.instruction, position.index, + MemorySpace::kDefault); for (const HloComputation* called_computation : position.instruction->called_computations()) { - HloValue* root_value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt( - called_computation->root_instruction(), position.index); - required_assignments_[root_value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at( - called_computation->root_instruction()), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(called_computation->root_instruction(), + position.index, MemorySpace::kDefault); } } } @@ -808,9 +847,13 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } // Iterate over the uses. - for (HloUse use : allocation_value.uses()) { + for (int use_idx = 0; use_idx < allocation_value.uses().size(); + ++use_idx) { + const HloUse& use = allocation_value.uses().at(use_idx); int64 use_time = instruction_schedule.at(use.instruction); int64 latest_prefetch_time = use_time; + bool allow_no_copy_alternate_mem_allocation = true; + absl::optional earliest_prefetch_time = absl::nullopt; // Sequential calls include kWhile, kCall, and kConditional opcodes. bool is_sequential_call = @@ -857,14 +900,41 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // when we look at uses within the while loop body. use_time = instruction_schedule.at(while_body->parameter_instruction(0)); + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // Replace the use time with the earliest parameter of called + // computations. + for (const HloComputation* called_computation : + use.instruction->called_computations()) { + use_time = std::min( + use_time, instruction_schedule.at( + called_computation->parameter_instruction(0))); + } } } // Add a required assignment in default memory if the use not allowed in // alternate memory. - if (!IsUseAllowedInAlternateMemory(use)) { - required_assignments_[allocation_value.value()].push_back( - {MemorySpace::kDefault, use_time, /*chunk=*/absl::nullopt}); + if (!IsUseAllowedInAlternateMemory(allocation_value, use)) { + AddRequiredAssignment(allocation_value.value(), use.instruction, + MemorySpace::kDefault, use_time); + } else if (use_idx > 0) { + // We allow buffers in alternate memory that are passed into + // conditionals to give up their alternate memory allocation inside + // the called computation. This means that if a conditional operator + // has an alternate memory allocation, subsequent uses cannot use the + // same alternate memory allocation in order not to clobber data. So + // we force default memory allocation for these subsequent uses. + const HloUse& previous_use = allocation_value.uses().at(use_idx - 1); + if (previous_use.instruction->opcode() == HloOpcode::kConditional && + previous_use.instruction != use.instruction) { + allow_no_copy_alternate_mem_allocation = false; + earliest_prefetch_time = + instruction_schedule.at(previous_use.instruction); + VLOG(3) << "Previous use (" << previous_use.ToString() + << ") of use (" << use.ToString() + << ") is a conditional, so this use will need to evict. " + << "Earliest prefetch time = " << *earliest_prefetch_time; + } } // Bitcasts don't define buffers and don't directly consume buffers. @@ -872,10 +942,16 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // bitcasts will be handled specially. if (use.instruction->opcode() != HloOpcode::kBitcast) { AllocationRequest request; - request.start_time = definition_time; + // Rarely, (e.g., when conditional true and false parameters are the + // same), definition time can be the time of the conditional and use + // time is the parameter use, which is less. + request.start_time = std::min(definition_time, use_time); request.end_time = use_time; request.latest_prefetch_time = latest_prefetch_time; request.size = interval.size; + request.allow_no_copy_alternate_mem_allocation = + allow_no_copy_alternate_mem_allocation; + request.earliest_prefetch_time = earliest_prefetch_time; request.preferred_offset = preferred_offset; request.use = use; request.allocation_value = &allocation_value; @@ -1061,35 +1137,42 @@ void AlternateMemoryBestFitHeap::AddAliasedRequiredAssignment( if (aliased_allocation->memory_space() == MemorySpace::kAlternate) { chunk = aliased_allocation->chunk(); } - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); - HloValue* value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); - int64 instruction_time = instruction_schedule.at(instruction); + AddRequiredAssignment(instruction, index, aliased_allocation->memory_space(), + chunk); +} + +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloValue* value, const HloInstruction* instruction, + MemorySpaceAssignment::MemorySpace memory_space, int64 time, + absl::optional chunk) { // Check for existing required assignment at this time and make sure it is the // same as this if there is one. - auto existing_required_assignment = - RequiredMemoryAssignmentAt(value, instruction_time); + auto existing_required_assignment = RequiredMemoryAssignmentAt(value, time); if (existing_required_assignment) { - CHECK(aliased_allocation->memory_space() == - existing_required_assignment->memory_space); + CHECK(memory_space == existing_required_assignment->memory_space) + << "inst = " << instruction->ToString() << " at " << time; CHECK((!chunk && !existing_required_assignment->chunk) || chunk->offset == existing_required_assignment->chunk->offset); - VLOG(3) << "Not adding aliased required assignment because there is one " - "already: " - << value->ToShortString() << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? "def" - : "alt"); - return; + VLOG(3) << "Not adding required assignment because there is one already: " + << value->ToShortString() << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + } else { + VLOG(3) << "Adding required assignment: " << value->ToShortString() + << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + required_assignments_[value].push_back({memory_space, time, chunk}); } +} - required_assignments_[value].push_back( - {aliased_allocation->memory_space(), instruction_time, chunk}); - VLOG(3) << "Adding aliased required assignment: " << value->ToShortString() - << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? "def" - : "alt"); +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloInstruction* instruction, ShapeIndex index, + MemorySpace memory_space, absl::optional chunk) { + const HloValue* value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); + int64 instruction_time = + hlo_live_range_.instruction_schedule().at(instruction); + AddRequiredAssignment(value, instruction, memory_space, instruction_time, + chunk); } void AlternateMemoryBestFitHeap::AddInputAndOutputRequiredAssignments() { @@ -1289,6 +1372,7 @@ bool AlternateMemoryBestFitHeap::FindAllocation( // First try keeping the allocation entirely in the alternate memory. if (required_memory_space_at_start != MemorySpace::kDefault && required_memory_space_at_end != MemorySpace::kDefault && + request.allow_no_copy_alternate_mem_allocation && AllocateInAlternateMemoryNoCopy(request)) { return true; } @@ -1618,9 +1702,14 @@ bool AlternateMemoryBestFitHeap::Prefetch( // ^ ^ // Copy Copy // Start Done - options_.prefetch_interval_picker->Begin( - request.use, prev_allocation_in_default_mem.earliest_available_time(), - request.latest_prefetch_time); + int64 earliest_prefetch_time = + prev_allocation_in_default_mem.earliest_available_time(); + if (request.earliest_prefetch_time) { + earliest_prefetch_time = + std::max(earliest_prefetch_time, *request.earliest_prefetch_time); + } + options_.prefetch_interval_picker->Begin(request.use, earliest_prefetch_time, + request.latest_prefetch_time); VLOG(3) << "Trying prefetch picker = " << options_.prefetch_interval_picker->ToDebugString(); @@ -2435,6 +2524,34 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { std::tuple> events; + auto add_allocation_and_verify = [&](int64 start_time, int64 end_time, + const Chunk& chunk, + const HloValue* value) { + events[std::make_tuple(start_time, /*is_free=*/false, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); + events[std::make_tuple(end_time, /*is_free=*/true, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); + + // Get the chunks overlapping in time and search if they overlap in space + // as well. + // TODO(berkin): For now checking against end_time - 1 (exclusive), but we + // really should check against end_time (inclusive) for cases where the + // operand can't share buffer with user (see + // HloDataflowAnalysis::CanShareOperandBufferWithUser). + for (const Chunk& overlapping_chunk : + interval_tree.ChunksOverlappingInTime(start_time, end_time - 1)) { + if (chunk.OverlapsWith(overlapping_chunk)) { + return InternalError( + ("Value %s (%d, %d) off: %d size: %d overlaps with another chunk" + " off: %d size: %d"), + value->ToShortString(), start_time, end_time, chunk.offset, + chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + } + } + interval_tree.Add(start_time, end_time - 1, chunk); + return Status::OK(); + }; + // Go through all instructions in the module to ensure CopyStart/CopyDone // instructions copy between alternate memory and default memory. for (const HloComputation* computation : @@ -2470,34 +2587,73 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { for (const HloValue* value : buffer.values()) { const HloLiveRange::TimeBound& time_bound = hlo_live_range->buffer_live_ranges().at(value); - events[std::make_tuple(time_bound.start, /*is_free=*/false, - value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); - events[std::make_tuple(time_bound.end, /*is_free=*/true, value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); - - VLOG(3) << " buffer: " << buffer.ToString() - << " value: " << value->ToShortString() << ": (" - << time_bound.start << ", " << time_bound.end - << ") off: " << chunk.offset << ", size: " << chunk.size; - // Get the chunks overlapping in time and search if they overlap in space - // as well. - // TODO(berkin): For now checking against end_time - 1 (exclusive), but we - // really should check against end_time (inclusive) for cases where the - // operand can't share buffer with user (see - // HloDataflowAnalysis::CanShareOperandBufferWithUser). - for (const Chunk& overlapping_chunk : - interval_tree.ChunksOverlappingInTime(time_bound.start, - time_bound.end - 1)) { - if (chunk.OverlapsWith(overlapping_chunk)) { - return InternalError( - ("Buffer %s (%d, %d) off: %d size: %d overlaps with another chunk" - " off: %d size: %d"), - buffer.ToString(), time_bound.start, time_bound.end, chunk.offset, - chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + const HloInstruction* last_use_instruction = nullptr; + int64 last_use_time = time_bound.start; + for (const HloUse& use : value->uses()) { + int64 use_time = + hlo_live_range->instruction_schedule().at(use.instruction); + if (use_time > last_use_time) { + last_use_time = use_time; + last_use_instruction = use.instruction; } } - interval_tree.Add(time_bound.start, time_bound.end - 1, chunk); + + if (last_use_instruction && + last_use_instruction->opcode() == HloOpcode::kConditional) { + // Special case when verifying conditional: we internally split the use + // of alternate memory in conditionals, so fish them out from the + // conditionals. + VLOG(3) << " Splitting conditional buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + int64 earliest_computation_start_time = time_bound.end; + for (const HloComputation* called_computation : + last_use_instruction->called_computations()) { + earliest_computation_start_time = + std::min(earliest_computation_start_time, + hlo_live_range->computation_span_times() + .at(called_computation) + .start); + int64 parameter_time = -1; + int64 last_use_time = -1; + for (const HloPosition& position : value->positions()) { + if (position.instruction->opcode() == HloOpcode::kParameter && + position.instruction->parent() == called_computation) { + parameter_time = hlo_live_range->instruction_schedule().at( + position.instruction); + break; + } + } + for (const HloUse& use : value->uses()) { + if (use.instruction->parent() == called_computation) { + last_use_time = std::max( + last_use_time, + hlo_live_range->instruction_schedule().at(use.instruction)); + } + } + if (last_use_time != -1) { + CHECK_NE(parameter_time, -1); + VLOG(3) << " computation: " << called_computation->name() << ": (" + << parameter_time << ", " << last_use_time << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + parameter_time, last_use_time, chunk, value)); + } + } + VLOG(3) << " from beginning until first computation: (" + << time_bound.start << ", " + << (earliest_computation_start_time - 1) << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, earliest_computation_start_time - 1, chunk, + value)); + } else { + VLOG(3) << " buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, time_bound.end, chunk, value)); + } } } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index 727b8da6c08..340446d21dd 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -816,11 +816,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // use_times is a sorted sequence of the times of all uses. // latest_prefetch_time is the latest time we can schedule the CopyDone for a // prefetch. + // If allow_no_copy_alternate_mem_allocation is false, an eviction is forced. + // If earliest_prefetch_time is set, prefetches cannot start before this + // value. struct AllocationRequest { int64 start_time; int64 end_time; int64 latest_prefetch_time; int64 size; + bool allow_no_copy_alternate_mem_allocation; + absl::optional earliest_prefetch_time; absl::optional preferred_offset; HloUse use; MemorySpaceAssignment::AllocationValue* allocation_value; @@ -841,7 +846,8 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { bool IsIntervalAllowedInAlternateMemory(const BufferInterval& interval) const; // Returns true if the use is allowed in the alternate memory. - bool IsUseAllowedInAlternateMemory(const HloUse& use) const; + bool IsUseAllowedInAlternateMemory(const AllocationValue& value, + const HloUse& use) const; // Given an HloValue, creates AllocationValue objects and corresponding // AllocationSequences and appends them into allocation_sequence_list_. @@ -895,6 +901,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { const HloInstruction* instruction, ShapeIndex index, const MemorySpaceAssignment::Allocation* aliased_allocation); + // This sets a required assignment. CHECK fails if there is a conflicting + // required assignment at the same time. + void AddRequiredAssignment(const HloValue* value, + const HloInstruction* instruction, + MemorySpace memory_space, int64 time, + absl::optional chunk = absl::nullopt); + void AddRequiredAssignment(const HloInstruction* instruction, + ShapeIndex index, MemorySpace memory_space, + absl::optional chunk = absl::nullopt); + // Adds input and outputs as required assignments. void AddInputAndOutputRequiredAssignments(); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index 984f2e7b4ea..a9be3850d89 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -1663,6 +1663,324 @@ TEST_P(MemorySpaceAssignmentTest, ControlPredecessorsBug) { AssignMemorySpace(module.get()); } +TEST_P(MemorySpaceAssignmentTest, ConditionalShouldBeAllocatedInAlternateMem) { + // Checks if simple conditionals get alternate memory allocations. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy and gtes got alternate memory allocations. + auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("neg1"); + auto neg1_operand = neg1->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto neg2 = module->GetComputationWithName("false_computation") + ->GetInstructionWithName("neg2"); + auto neg2_operand = neg2->operand(0); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalAvoidsUnnecessaryPrefetch) { + // Checks if we avoid unnecessary allocation in alternate memory if the input + // won't be used in the computation for a long time. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + neg0 = f32[3]{0} negate(gte0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + neg9 = f32[3]{0} negate(neg8) + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + ROOT add = f32[3]{0} add(neg9, gte1) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + ROOT conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy1 doesn't get unnecessarily allocated in alternate mem + // (due to long negate chain in true_computation) but is prefetched before + // add. + auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kDefaultMemorySpace); + auto add = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add"); + auto add_operand = add->operand(1); + EXPECT_EQ(add_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUse) { + // Make sure there is an evict when there is a conditional use followed by + // another use. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + add0 = f32[3]{0} add(gte0, gte1) + neg0 = f32[3]{0} negate(add0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + ROOT neg9 = f32[3]{0} negate(neg8) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + ROOT add1 = f32[3]{0} add(copy1, conditional) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure the copy1->add edge is in alternate memory. Before conditional, + // this should be evicted to default memory and neg uses the input from + // default memory. + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kAlternateMemorySpace); + auto add0 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add0"); + auto add0_operand = add0->operand(1); + EXPECT_EQ(add0_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto add1 = + module->GetComputationWithName("entry")->GetInstructionWithName("add1"); + auto add1_operand = add1->operand(0); + EXPECT_EQ(add1_operand->shape().layout().memory_space(), + kDefaultMemorySpace); + EXPECT_EQ(add1_operand->opcode(), HloOpcode::kCopyDone); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUseInWhile) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + while_cond { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + ROOT gte = pred[] get-tuple-element(p0), index=2 + } + + while_body { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + gte2 = pred[] get-tuple-element(p0), index=2 + cond_tuple = (f32[3]{0}) tuple(gte0) + conditional = f32[3]{0} conditional(gte2, cond_tuple, cond_tuple), true_computation=true_computation, false_computation=false_computation + add = f32[3]{0} add(conditional, gte1) + neg0 = f32[3]{0} negate(add) + neg1 = f32[3]{0} negate(neg0) + ROOT tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(gte0, neg1, gte2) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(copy0, copy1, p1) + while = (f32[3]{0}, f32[3]{0}, pred[]) while(tuple), condition=while_cond, body=while_body + ROOT gte = f32[3]{0} get-tuple-element(while), index=1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure copy1/while{0}/cond_tuple{0} gets alternate memory allocation. + // This will force an eviction and a prefetch for while body root. + auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto conditional = module->GetComputationWithName("while_body") + ->GetInstructionWithName("conditional"); + auto conditional_operand = conditional->operand(1); + EXPECT_EQ(ShapeUtil::GetSubshape(conditional_operand->shape(), {0}) + .layout() + .memory_space(), + kAlternateMemorySpace); + auto while_root = + module->GetComputationWithName("while_body")->root_instruction(); + auto while_root_operand = while_root->operand(0); + EXPECT_THAT( + while_root_operand, + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::AsyncCopy(kDefaultMemorySpace, kAlternateMemorySpace, + op::GetTupleElement(op::Parameter(0))))); + } +} + +TEST_P(MemorySpaceAssignmentTest, NestedConditional) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + true_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + slice = f32[1]{0} slice(gte), slice={[0:1]} + bitcast = f32[] bitcast(slice) + constant = f32[] constant(0.0) + compare = pred[] compare(bitcast, constant), direction=GT + ROOT conditional = f32[3]{0} conditional(compare, p0, p0), true_computation=true_computation2, false_computation=false_computation2 + } + + false_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg3 = f32[3]{0} negate(gte) + } + + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation1, false_computation=false_computation1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure alternate memory allocation gets propagated into both levels of + // conditional. + auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1_operand = module->GetComputationWithName("true_computation2") + ->GetInstructionWithName("neg1") + ->operand(0); + auto neg2_operand = module->GetComputationWithName("false_computation2") + ->GetInstructionWithName("neg2") + ->operand(0); + auto neg3_operand = module->GetComputationWithName("false_computation1") + ->GetInstructionWithName("neg3") + ->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg3_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + TEST_P(MemorySpaceAssignmentTest, RequestIdentifierShouldNotBeAllocatedInAlternateMem) { // Ensure that request identifier returned by Send/Recv HLOs are not allocated @@ -2149,7 +2467,8 @@ TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule3) { AssignMemorySpace(module.get(), -1, 5); } -TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule4) { +// TODO(berkin): This might be an incorrect input graph, investigate. +TEST_P(MemorySpaceAssignmentTest, DISABLED_NonEntryComputationSchedule4) { auto module = CreateNewVerifiedModule(); Shape shape = ShapeUtil::MakeShape(xla::F32, {2, 3}); Shape shape2 = ShapeUtil::MakeShape(xla::F32, {3, 3}); From acaaab2504a94711a4c1084328c79c10b7c9a594 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 17:09:11 -0700 Subject: [PATCH 152/557] Rename TransformTensorV2 op to TransformTensorBilinearV2 op. PiperOrigin-RevId: 312184091 Change-Id: I5450142e1022f72705bc5fbdf6c99c94cdbb346b --- tensorflow/lite/delegates/gpu/common/model_builder.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 46856a70a7c..964c8289f83 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -2350,7 +2350,7 @@ class TransformTensorOperationParser : public TFLiteOperationParser { private: }; -class TransformTensorV2OperationParser : public TFLiteOperationParser { +class TransformTensorBilinearV2OperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, @@ -2368,7 +2368,7 @@ class TransformTensorV2OperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddInput(node, 1)); // bbox RETURN_IF_ERROR(reader->AddOutputs(node)); - std::string op_name = "transform_tensor_v2"; + std::string op_name = "transform_tensor_bilinear_v2"; node->operation.type = op_name; BHWC output_shape; RETURN_IF_ERROR( @@ -2731,8 +2731,8 @@ std::unique_ptr NewOperationParser( if (custom_name == "TransformTensor") { return std::make_unique(); } - if (custom_name == "TransformTensorV2") { - return std::make_unique(); + if (custom_name == "TransformTensorBilinearV2") { + return std::make_unique(); } if (custom_name == "TransformLandmarks") { return std::make_unique(); From 637c14abf840d83e0f6177694030455d6af35937 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 18 May 2020 17:25:05 -0700 Subject: [PATCH 153/557] Add SparseCrossV2 which supports strong_hash with salt, and fingerprint doens't take `hash_key`. hash function will be run before FingerprintCat. PiperOrigin-RevId: 312186543 Change-Id: I67a51645250b9d0714b757c85dabf1137e64b167 --- .../base_api/api_def_SparseCrossHashed.pbtxt | 104 +++ .../base_api/api_def_SparseCrossV2.pbtxt | 91 ++ .../api_def_SparseCrossHashed.pbtxt | 4 + .../python_api/api_def_SparseCrossV2.pbtxt | 4 + tensorflow/core/kernels/sparse_cross_op.cc | 805 ++++++++++++------ tensorflow/core/ops/sparse_ops.cc | 40 + .../kernel_tests/sparse_cross_op_test.py | 592 +++++++++++++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 8 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 8 + 9 files changed, 1417 insertions(+), 239 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_SparseCrossV2.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_SparseCrossHashed.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_SparseCrossV2.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt new file mode 100644 index 00000000000..2c4340cb9b7 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt @@ -0,0 +1,104 @@ +op { + graph_op_name: "SparseCrossHashed" + in_arg { + name: "indices" + description: < 0 else hashed_value. +END + } + in_arg { + name: "strong_hash" + description: <