From fbf407383c93774d10bd7c45cd66788a070b0e07 Mon Sep 17 00:00:00 2001 From: Nick Kreeger Date: Fri, 19 Jun 2020 10:40:30 -0700 Subject: [PATCH] Reduce the size of TfLiteTensor for the TF Micro runtime. This change uses the existing micro-specific build flag (TF_LITE_STATIC_MEMORY) to reduce the size of TfLiteTensor. In this build setting, only the minimum number of fields required for preparing and initializing a model in TFLM are used. This build define is opt-in only for internal builds and continues to be enabled by default in Makefile builds. All TFLM internal targets can be built with this flag by adding '--copt=-DTF_LITE_STATIC_MEMORY'. This change reduces the sizeof(TfLiteTensor) to 64 bytes (64-bit systems) down from 112 bytes (64-bit systems). TfLiteTensor struct reduced by 1.75x (~43% reduction) Tail allocation reduced by: 2,592 bytes (~12.5% reduction) Total allocation reduced by: 2,592 bytes (~12% reduction) Optimized results from memory_arena_threshold_test: Keyword Model: -------------- [RecordingMicroAllocator] Arena allocation total 18448 bytes [RecordingMicroAllocator] Arena allocation head 672 bytes [RecordingMicroAllocator] Arena allocation tail 17776 bytes [RecordingMicroAllocator] 'TfLiteTensor struct' used 3456 bytes with alignment overhead (requested 3456 bytes for 54 tensors) [RecordingMicroAllocator] 'TfLiteTensor quantization data' used 1728 bytes with alignment overhead (requested 1728 bytes for 108 allocations) [RecordingMicroAllocator] 'TfLiteTensor variable buffer data' used 10240 bytes with alignment overhead (requested 10240 bytes for 7 allocations) [RecordingMicroAllocator] 'NodeAndRegistration struct' used 1200 bytes with alignment overhead (requested 1200 bytes for 15 NodeAndRegistration structs) [RecordingMicroAllocator] 'Operator runtime data' used 148 bytes with alignment overhead (requested 148 bytes for 13 OpData structs) Test Conv Model: ---------------- [RecordingMicroAllocator] Arena allocation total 10960 bytes
[RecordingMicroAllocator] Arena allocation head 7744 bytes [RecordingMicroAllocator] Arena allocation tail 3216 bytes [RecordingMicroAllocator] 'TfLiteTensor struct' used 960 bytes with alignment overhead (requested 960 bytes for 15 tensors) [RecordingMicroAllocator] 'TfLiteTensor quantization data' used 768 bytes with alignment overhead (requested 752 bytes for 24 allocations) [RecordingMicroAllocator] 'TfLiteTensor variable buffer data' used 0 bytes with alignment overhead (requested 0 bytes for 0 allocations) [RecordingMicroAllocator] 'NodeAndRegistration struct' used 560 bytes with alignment overhead (requested 560 bytes for 7 NodeAndRegistration structs) [RecordingMicroAllocator] 'Operator runtime data' used 136 bytes with alignment overhead (requested 136 bytes for 5 OpData structs) PiperOrigin-RevId: 317335359 Change-Id: Ic3d4d2c3e62249f072ece8f621f9ef94eaa28589 --- tensorflow/lite/c/common.h | 46 +++++++++++++++++++ .../lite/micro/memory_arena_threshold_test.cc | 19 ++++++-- .../lite/micro/micro_interpreter_test.cc | 2 +- tensorflow/lite/micro/tools/make/Makefile | 2 + .../benchmark/experimental/c/c_api_types.h | 46 +++++++++++++++++++ 5 files changed, 111 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index 15823784d12..9093e5d50ad 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -375,6 +375,7 @@ typedef struct TfLiteSparsity { // An tensor in the interpreter system which is a wrapper around a buffer of // data including a dimensionality (or NULL if not currently defined). +#ifndef TF_LITE_STATIC_MEMORY typedef struct TfLiteTensor { // The data type specification for data stored in `data`. This affects // what member of `data` union should be used. @@ -439,6 +440,51 @@ typedef struct TfLiteTensor { // `dims_signature` contains [1, -1, -1, 3]). const TfLiteIntArray* dims_signature; } TfLiteTensor; +#else +// Specific reduced TfLiteTensor struct for TF Micro runtime. 
This struct +// contains only the minimum fields required to initialize and prepare a micro +// inference graph. The fields in this struct have been ordered from +// largest-to-smallest for optimal struct sizeof. +// +// NOTE: This flag is opt-in only at compile time. +typedef struct TfLiteTensor { + // TODO(b/155784997): Consider consolidating these quantization fields: + // Quantization information. Replaces params field above. + TfLiteQuantization quantization; + + // Quantization information. + TfLiteQuantizationParams params; + + // A union of data pointers. The appropriate type should be used for a typed + // tensor based on `type`. + TfLitePtrUnion data; + + // A pointer to a structure representing the dimensionality interpretation + // that the buffer should have. NOTE: the product of elements of `dims` + // and the element datatype size should be equal to `bytes` below. + TfLiteIntArray* dims; + + // The number of bytes required to store the data of this Tensor. I.e. + // (bytes of each element) * dims[0] * ... * dims[n-1]. For example, if + // type is kTfLiteFloat32 and dims = {3, 2} then + // bytes = sizeof(float) * 3 * 2 = 4 * 3 * 2 = 24. + size_t bytes; + + // The data type specification for data stored in `data`. This affects + // what member of `data` union should be used. + TfLiteType type; + + // How memory is mapped + // kTfLiteMmapRo: Memory mapped read only. + // i.e. weights + // kTfLiteArenaRw: Arena allocated read write memory + // (i.e. temporaries, outputs). + TfLiteAllocationType allocation_type; + + // True if the tensor is a variable. + bool is_variable; +} TfLiteTensor; +#endif // TF_LITE_STATIC_MEMORY #ifndef TF_LITE_STATIC_MEMORY // Free data memory of tensor `t`. 
diff --git a/tensorflow/lite/micro/memory_arena_threshold_test.cc b/tensorflow/lite/micro/memory_arena_threshold_test.cc index 4f49b57112a..b45de85a21b 100644 --- a/tensorflow/lite/micro/memory_arena_threshold_test.cc +++ b/tensorflow/lite/micro/memory_arena_threshold_test.cc @@ -41,9 +41,17 @@ constexpr int kKeywordModelNodeAndRegistrationCount = 15; // NOTE: These values are measured on x86-64: // TODO(b/158651472): Consider auditing these values on non-64 bit systems. +// +// Run this test with '--copt=-DTF_LITE_STATIC_MEMORY' to get +// optimized memory runtime values: +#ifdef TF_LITE_STATIC_MEMORY +constexpr int kKeywordModelTotalSize = 18448; +constexpr int kKeywordModelTailSize = 17776; +#else constexpr int kKeywordModelTotalSize = 21040; -constexpr int kKeywordModelHeadSize = 672; constexpr int kKeywordModelTailSize = 20368; +#endif +constexpr int kKeywordModelHeadSize = 672; constexpr int kKeywordModelTfLiteTensorVariableBufferDataSize = 10240; constexpr int kKeywordModelTfLiteTensorQuantizationDataSize = 1728; constexpr int kKeywordModelOpRuntimeDataSize = 148; @@ -56,9 +64,14 @@ constexpr int kTestConvModelNodeAndRegistrationCount = 7; // NOTE: These values are measured on x86-64: // TODO(b/158651472): Consider auditing these values on non-64 bit systems.
+#ifdef TF_LITE_STATIC_MEMORY +constexpr int kTestConvModelTotalSize = 10960; +constexpr int kTestConvModelTailSize = 3216; +#else constexpr int kTestConvModelTotalSize = 11680; -constexpr int kTestConvModelHeadSize = 7744; constexpr int kTestConvModelTailSize = 3936; +#endif +constexpr int kTestConvModelHeadSize = 7744; constexpr int kTestConvModelTfLiteTensorQuantizationDataSize = 768; constexpr int kTestConvModelOpRuntimeDataSize = 136; @@ -81,7 +94,7 @@ void EnsureAllocatedSizeThreshold(const char* allocation_type, size_t actual, TF_LITE_MICRO_EXPECT_NEAR(actual, expected, kAllocationThreshold); if (actual != expected) { TF_LITE_REPORT_ERROR(micro_test::reporter, - "%s threshold failed: %ld != %ld", allocation_type, + "%s threshold failed: %d != %d", allocation_type, actual, expected); } } else { diff --git a/tensorflow/lite/micro/micro_interpreter_test.cc b/tensorflow/lite/micro/micro_interpreter_test.cc index f54c212b573..c577d8cb513 100644 --- a/tensorflow/lite/micro/micro_interpreter_test.cc +++ b/tensorflow/lite/micro/micro_interpreter_test.cc @@ -284,7 +284,7 @@ TF_LITE_MICRO_TEST(TestIncompleteInitializationAllocationsWithSmallArena) { tflite::testing::MockOpResolver mock_resolver; // 1kb is too small for the ComplexMockModel: - constexpr size_t allocator_buffer_size = 1048; + constexpr size_t allocator_buffer_size = 500; uint8_t allocator_buffer[allocator_buffer_size]; tflite::RecordingMicroAllocator* allocator = diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 8b6cba06a0b..a75c59b05c9 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -75,6 +75,8 @@ TEST_SCRIPT := tensorflow/lite/micro/testing/test_linux_binary.sh MICROLITE_LIBS := -lm # TODO(b/150240249): Add in -fno-rtti once that works for the Xtensa toolchain. +# TODO(b/159155203): Consider renaming TF_LITE_STATIC_MEMORY to align more with +# the fact this flag is for an optimized micro runtime.
CXXFLAGS := -std=c++11 -DTF_LITE_STATIC_MEMORY CCFLAGS := -std=c11 -DTF_LITE_STATIC_MEMORY ARFLAGS := -r diff --git a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h index 15823784d12..9093e5d50ad 100644 --- a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h +++ b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h @@ -375,6 +375,7 @@ typedef struct TfLiteSparsity { // An tensor in the interpreter system which is a wrapper around a buffer of // data including a dimensionality (or NULL if not currently defined). +#ifndef TF_LITE_STATIC_MEMORY typedef struct TfLiteTensor { // The data type specification for data stored in `data`. This affects // what member of `data` union should be used. @@ -439,6 +440,51 @@ typedef struct TfLiteTensor { // `dims_signature` contains [1, -1, -1, 3]). const TfLiteIntArray* dims_signature; } TfLiteTensor; +#else +// Specific reduced TfLiteTensor struct for TF Micro runtime. This struct +// contains only the minimum fields required to initialize and prepare a micro +// inference graph. The fields in this struct have been ordered from +// largest-to-smallest for optimal struct sizeof. +// +// NOTE: This flag is opt-in only at compile time. +typedef struct TfLiteTensor { + // TODO(b/155784997): Consider consolidating these quantization fields: + // Quantization information. Replaces params field above. + TfLiteQuantization quantization; + + // Quantization information. + TfLiteQuantizationParams params; + + // A union of data pointers. The appropriate type should be used for a typed + // tensor based on `type`. + TfLitePtrUnion data; + + // A pointer to a structure representing the dimensionality interpretation + // that the buffer should have. NOTE: the product of elements of `dims` + // and the element datatype size should be equal to `bytes` below. + TfLiteIntArray* dims; + + // The number of bytes required to store the data of this Tensor. 
I.e. + // (bytes of each element) * dims[0] * ... * dims[n-1]. For example, if + // type is kTfLiteFloat32 and dims = {3, 2} then + // bytes = sizeof(float) * 3 * 2 = 4 * 3 * 2 = 24. + size_t bytes; + + // The data type specification for data stored in `data`. This affects + // what member of `data` union should be used. + TfLiteType type; + + // How memory is mapped + // kTfLiteMmapRo: Memory mapped read only. + // i.e. weights + // kTfLiteArenaRw: Arena allocated read write memory + // (i.e. temporaries, outputs). + TfLiteAllocationType allocation_type; + + // True if the tensor is a variable. + bool is_variable; +} TfLiteTensor; +#endif // TF_LITE_STATIC_MEMORY #ifndef TF_LITE_STATIC_MEMORY // Free data memory of tensor `t`.