Reduce excessive RAM usage in TFLM by using the existing flatbuffer quantization data for scales.
Currently, TFLM manually allocates a tail chunk to store "quantization" tensor data on TfLiteTensor objects. The sizes of these allocations vary with the type of model: conv1d/2d models tend to be quantization-heavy, since quantization data is stored "per channel". This change simply points the scale data at the existing values in the flatbuffer. The flatbuffer schema stores float values as flatbuffers::Vector<float>, and the TfLiteAffineQuantization struct can point its scale pointer at those values. Unfortunately, the zero-point values are stored as flatbuffers::Vector<int64_t> and cannot be reused. That allocation will be addressed in a future change.

Keyword Model, ~2% reduction in tail allocation:
-----------------------------------------------
[RecordingMicroAllocator] Arena allocation total 21040 bytes
[RecordingMicroAllocator] Arena allocation head 672 bytes
[RecordingMicroAllocator] Arena allocation tail 20368 bytes
[RecordingMicroAllocator] 'TfLiteTensor struct' used 6048 bytes with alignment overhead (requested 6048 bytes for 54 tensors)
[RecordingMicroAllocator] 'TfLiteTensor quantization data' used 1728 bytes with alignment overhead (requested 1728 bytes for 108 allocations)
[RecordingMicroAllocator] 'TfLiteTensor variable buffer data' used 10240 bytes with alignment overhead (requested 10240 bytes for 7 allocations)
[RecordingMicroAllocator] 'NodeAndRegistration struct' used 1200 bytes with alignment overhead (requested 1200 bytes for 15 NodeAndRegistration structs)
[RecordingMicroAllocator] 'Operator runtime data' used 148 bytes with alignment overhead (requested 148 bytes for 13 OpData structs)

Test Conv Model, ~10% reduction in tail allocation:
-----------------------------------------------
[RecordingMicroAllocator] Arena allocation total 11680 bytes
[RecordingMicroAllocator] Arena allocation head 7744 bytes
[RecordingMicroAllocator] Arena allocation tail 3936 bytes
[RecordingMicroAllocator] 'TfLiteTensor struct' used 1680 bytes with alignment overhead (requested 1680 bytes for 15 tensors)
[RecordingMicroAllocator] 'TfLiteTensor quantization data' used 768 bytes with alignment overhead (requested 752 bytes for 24 allocations)
[RecordingMicroAllocator] 'TfLiteTensor variable buffer data' used 0 bytes with alignment overhead (requested 0 bytes for 0 allocations)
[RecordingMicroAllocator] 'NodeAndRegistration struct' used 560 bytes with alignment overhead (requested 560 bytes for 7 NodeAndRegistration structs)
[RecordingMicroAllocator] 'Operator runtime data' used 136 bytes with alignment overhead (requested 136 bytes for 5 OpData structs)

PiperOrigin-RevId: 316556393
Change-Id: Iadadab51019d2787d11af9713b3639f087afa7bc
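The reuse works because of a layout coincidence: TfLiteFloatArray begins with a 32-bit size followed by its float payload, while a serialized flatbuffers::Vector<float> is a 32-bit element count followed by its elements, so the scale pointer can simply alias the serialized bytes. A minimal, self-contained sketch of the idea follows; TfLiteFloatArrayLike is a local stand-in for the real struct, and the cast assumes a little-endian host, which is why the diff below adds big-endian TODOs:

#include <cassert>
#include <cstdint>
#include <cstring>

// Local paraphrase of the TfLiteFloatArray layout: a 32-bit count followed
// immediately by the float payload.
struct TfLiteFloatArrayLike {
  int32_t size;
  float data[1];  // Stand-in for the flexible-array payload in the real type.
};

int main() {
  // Hand-build what a serialized flatbuffers::Vector<float> looks like:
  // a little-endian uint32 element count, then the elements themselves.
  alignas(alignof(TfLiteFloatArrayLike))
      uint8_t serialized[4 + 3 * sizeof(float)];
  const int32_t count = 3;
  const float scales[3] = {0.5f, 0.25f, 0.125f};
  std::memcpy(serialized, &count, sizeof(count));
  std::memcpy(serialized + sizeof(count), scales, sizeof(scales));

  // Because the layouts line up, the quantization scale can alias the
  // serialized bytes instead of being copied into the arena tail -- the
  // same cast the change below performs on src_quantization->scale().
  const auto* scale =
      reinterpret_cast<const TfLiteFloatArrayLike*>(serialized);
  assert(scale->size == 3);
  assert(scale->data[0] == 0.5f);
  return 0;
}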
commit 072c2f5d0d
parent 9136f5775e
@@ -41,11 +41,11 @@ constexpr int kKeywordModelNodeAndRegistrationCount = 15;
 // NOTE: These values are measured on x86-64:
 // TODO(b/158651472): Consider auditing these values on non-64 bit systems.
-constexpr int kKeywordModelTotalSize = 21472;
+constexpr int kKeywordModelTotalSize = 21040;
 constexpr int kKeywordModelHeadSize = 672;
-constexpr int kKeywordModelTailSize = 20800;
+constexpr int kKeywordModelTailSize = 20368;
 constexpr int kKeywordModelTfLiteTensorVariableBufferDataSize = 10240;
-constexpr int kKeywordModelTfLiteTensorQuantizationDataSize = 2160;
+constexpr int kKeywordModelTfLiteTensorQuantizationDataSize = 1728;
 constexpr int kKeywordModelOpRuntimeDataSize = 148;

 constexpr int kTestConvModelArenaSize = 12 * 1024;
@@ -56,10 +56,10 @@ constexpr int kTestConvModelNodeAndRegistrationCount = 7;
 // NOTE: These values are measured on x86-64:
 // TODO(b/158651472): Consider auditing these values on non-64 bit systems.
-constexpr int kTestConvModelTotalSize = 12128;
+constexpr int kTestConvModelTotalSize = 11680;
 constexpr int kTestConvModelHeadSize = 7744;
-constexpr int kTestConvModelTailSize = 4384;
-constexpr int kTestConvModelTfLiteTensorQuantizationDataSize = 1216;
+constexpr int kTestConvModelTailSize = 3936;
+constexpr int kTestConvModelTfLiteTensorQuantizationDataSize = 768;
 constexpr int kTestConvModelOpRuntimeDataSize = 136;

 struct ModelAllocationThresholds {
@@ -73,11 +73,17 @@ struct ModelAllocationThresholds {
   size_t op_runtime_data_size = 0;
 };

-void EnsureAllocatedSizeThreshold(size_t actual, size_t expected) {
+void EnsureAllocatedSizeThreshold(const char* allocation_type, size_t actual,
+                                  size_t expected) {
   // TODO(b/158651472): Better auditing of non-64 bit systems:
   if (kIs64BitSystem) {
     // 64-bit systems should check floor and ceiling to catch memory savings:
     TF_LITE_MICRO_EXPECT_NEAR(actual, expected, kAllocationThreshold);
+    if (actual != expected) {
+      TF_LITE_REPORT_ERROR(micro_test::reporter,
+                           "%s threshold failed: %ld != %ld", allocation_type,
+                           actual, expected);
+    }
   } else {
     // Non-64 bit systems should just expect allocation does not exceed the
     // ceiling:
@@ -91,33 +97,37 @@ void ValidateModelAllocationThresholds(
   allocator.PrintAllocations();

   EnsureAllocatedSizeThreshold(
-      allocator.GetSimpleMemoryAllocator()->GetUsedBytes(),
+      "Total", allocator.GetSimpleMemoryAllocator()->GetUsedBytes(),
       thresholds.total_alloc_size);
   EnsureAllocatedSizeThreshold(
-      allocator.GetSimpleMemoryAllocator()->GetHeadUsedBytes(),
+      "Head", allocator.GetSimpleMemoryAllocator()->GetHeadUsedBytes(),
       thresholds.head_alloc_size);
   EnsureAllocatedSizeThreshold(
-      allocator.GetSimpleMemoryAllocator()->GetTailUsedBytes(),
+      "Tail", allocator.GetSimpleMemoryAllocator()->GetTailUsedBytes(),
       thresholds.tail_alloc_size);
   EnsureAllocatedSizeThreshold(
+      "TfLiteTensor",
       allocator
           .GetRecordedAllocation(
               tflite::RecordedAllocationType::kTfLiteTensorArray)
           .used_bytes,
       sizeof(TfLiteTensor) * thresholds.tensor_count);
   EnsureAllocatedSizeThreshold(
+      "VariableBufferData",
       allocator
           .GetRecordedAllocation(
               tflite::RecordedAllocationType::kTfLiteTensorVariableBufferData)
           .used_bytes,
       thresholds.tensor_variable_buffer_data_size);
   EnsureAllocatedSizeThreshold(
+      "QuantizationData",
       allocator
           .GetRecordedAllocation(tflite::RecordedAllocationType::
                                      kTfLiteTensorArrayQuantizationData)
           .used_bytes,
       thresholds.tensor_quantization_data_size);
   EnsureAllocatedSizeThreshold(
+      "NodeAndRegistration",
       allocator
           .GetRecordedAllocation(
               tflite::RecordedAllocationType::kNodeAndRegistrationArray)
@@ -125,6 +135,7 @@ void ValidateModelAllocationThresholds(
           sizeof(tflite::NodeAndRegistration) *
               thresholds.node_and_registration_count);
   EnsureAllocatedSizeThreshold(
+      "OpData",
       allocator.GetRecordedAllocation(tflite::RecordedAllocationType::kOpData)
           .used_bytes,
       thresholds.op_runtime_data_size);
@@ -466,6 +466,8 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
   TF_LITE_ENSURE_STATUS(BytesRequiredForTensor(
       flatbuffer_tensor, &result->bytes, &type_size, error_reporter));

+  // TODO(b/159043126): Cleanup endian casting by doing all endian casting in
+  // one spot:
   if (flatbuffer_tensor.shape() == nullptr) {
     // flatbuffer_tensor.shape() can return a nullptr in the case of a scalar
     // tensor.
@@ -513,6 +515,10 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
                            "Unable to allocate TfLiteAffineQuantization.\n");
       return kTfLiteError;
     }
+
+    // TODO(b/153688719): Reduce tail allocation by using a global zero-point
+    // buffer. This value can not be reused from the flatbuffer since the
+    // zero_point is stored as a int64_t.
     quantization->zero_point =
         reinterpret_cast<TfLiteIntArray*>(allocator->AllocateFromTail(
             TfLiteIntArrayGetSizeInBytes(channels), alignof(TfLiteIntArray)));
@@ -522,22 +528,14 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer(
       return kTfLiteError;
     }

-    quantization->scale = reinterpret_cast<TfLiteFloatArray*>(
-        allocator->AllocateFromTail(TfLiteFloatArrayGetSizeInBytes(channels),
-                                    alignof(TfLiteFloatArray)));
-    if (quantization->scale == nullptr) {
-      TF_LITE_REPORT_ERROR(error_reporter,
-                           "Unable to allocate quantization->scale.\n");
-      return kTfLiteError;
-    }
+    // TODO(b/159043126): Check for big endian before casting flatbuffer values.
+    quantization->scale = const_cast<TfLiteFloatArray*>(
+        reinterpret_cast<const TfLiteFloatArray*>(src_quantization->scale()));

     quantization->zero_point->size = channels;
-    quantization->scale->size = channels;
     int* zero_point_data = quantization->zero_point->data;
-    float* scale_data = quantization->scale->data;
     for (int i = 0; i < channels; i++) {
       zero_point_data[i] = src_quantization->zero_point()->Get(i);
-      scale_data[i] = src_quantization->scale()->Get(i);
     }
     // TODO(rocky): Need to add a micro_allocator test case that fails when
     // this is not copied:
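The zero points cannot get the same treatment because of an element-width mismatch: the flatbuffer serializes them as flatbuffers::Vector<int64_t>, while TfLiteIntArray holds plain int, so each value has to be narrowed and copied, which is exactly what the loop retained above does. A hypothetical standalone illustration of that narrowing (the names here are illustrative, not from the TFLM sources):

#include <cstdint>

int main() {
  // What the flatbuffer holds: 8-byte zero points.
  const int64_t serialized_zero_points[3] = {0, 1, -1};
  // What TfLiteIntArray stores: plain ints, so aliasing the 8-byte
  // serialized elements is impossible and a per-element copy is required.
  int copied[3];
  for (int i = 0; i < 3; ++i) {
    copied[i] = static_cast<int>(serialized_zero_points[i]);
  }
  return copied[0];
}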
@@ -815,8 +813,10 @@ TfLiteStatus MicroAllocator::PrepareNodeAndRegistrationDataFromFlatbuffer(
    }

    // Disregard const qualifier to workaround with existing API.
+    // TODO(b/159043126): Check for big endian before casting flatbuffer values.
    TfLiteIntArray* inputs_array = const_cast<TfLiteIntArray*>(
        reinterpret_cast<const TfLiteIntArray*>(op->inputs()));
+    // TODO(b/159043126): Check for big endian before casting flatbuffer values.
    TfLiteIntArray* outputs_array = const_cast<TfLiteIntArray*>(
        reinterpret_cast<const TfLiteIntArray*>(op->outputs()));

@@ -93,7 +93,6 @@ TF_LITE_MICRO_TEST(TestRecordsTensorArrayQuantizationData) {
       quantized_tensor_count++;
       size_t num_channels = quantization_params->scale()->size();
       quantized_channel_bytes += TfLiteIntArrayGetSizeInBytes(num_channels);
-      quantized_channel_bytes += TfLiteFloatArrayGetSizeInBytes(num_channels);
     }
   }

@@ -106,10 +105,9 @@ TF_LITE_MICRO_TEST(TestRecordsTensorArrayQuantizationData) {
       micro_allocator->GetRecordedAllocation(
           tflite::RecordedAllocationType::kTfLiteTensorArrayQuantizationData);

-  // Each quantized tensors has 3 mallocs (quant struct, scale dimensions, zero
-  // point dimensions):
+  // Each quantized tensors has 2 mallocs (quant struct, zero point dimensions):
   TF_LITE_MICRO_EXPECT_EQ(recorded_allocation.count,
-                          quantized_tensor_count * 3);
+                          quantized_tensor_count * 2);
   TF_LITE_MICRO_EXPECT_EQ(recorded_allocation.requested_bytes,
                           expected_requested_bytes);
   TF_LITE_MICRO_EXPECT_GE(recorded_allocation.used_bytes,
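For reference, the "[RecordingMicroAllocator]" lines quoted in the commit message are emitted by the recording allocator's PrintAllocations() call, which the threshold test above drives. A rough sketch of how such a log can be produced, assuming the RecordingMicroInterpreter wrapper from the TFLM tree of this period; the model buffer, op resolver choice, and arena size are placeholders, not from this commit:

#include "tensorflow/lite/micro/all_ops_resolver.h"
#include "tensorflow/lite/micro/micro_error_reporter.h"
#include "tensorflow/lite/micro/recording_micro_interpreter.h"
#include "tensorflow/lite/schema/schema_generated.h"

// Placeholder: a quantized model flatbuffer compiled into the binary.
extern const unsigned char g_model_data[];

constexpr size_t kArenaSize = 21 * 1024;  // Placeholder arena budget.
uint8_t g_arena[kArenaSize];

void PrintArenaUsage() {
  tflite::MicroErrorReporter error_reporter;
  const tflite::Model* model = tflite::GetModel(g_model_data);
  tflite::AllOpsResolver op_resolver;

  // The recording interpreter wraps a RecordingMicroAllocator so that every
  // head/tail arena allocation is tallied by type.
  tflite::RecordingMicroInterpreter interpreter(model, op_resolver, g_arena,
                                                kArenaSize, &error_reporter);
  interpreter.AllocateTensors();

  // Prints the "[RecordingMicroAllocator] ..." summary quoted above.
  interpreter.GetMicroAllocator().PrintAllocations();
}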