Merge pull request #38121 from jenselofsson:offline_memory_planner

PiperOrigin-RevId: 316533499 Change-Id: Id967e853081829f4c974cf7527a628724ed0edc2
2020-06-15 13:37:09 -07:00 · 2020-06-15 13:37:09 -07:00 · 4381963d2d
parent 52736a6adc 708ecda43e
commit 4381963d2d
10 changed files with 649 additions and 92 deletions
--- a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc
+++ b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc
@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/micro/micro_error_reporter.h"
 #include "tensorflow/lite/micro/micro_interpreter.h"
 #include "tensorflow/lite/micro/micro_mutable_op_resolver.h"
 #include "tensorflow/lite/micro/micro_optional_debug_tools.h"
 #include "tensorflow/lite/micro/testing/micro_test.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/version.h"
@ -46,6 +47,7 @@ TF_LITE_MICRO_TEST(TestInvoke) {
                         "to supported version %d.\n",
                         model->version(), TFLITE_SCHEMA_VERSION);
  }
  PrintModelData(model, error_reporter);
  // Pull in only the operation implementations we need.
  // This relies on a complete list of all the ops needed by this graph.
--- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc
+++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc
@ -48,10 +48,10 @@ GreedyMemoryPlanner::GreedyMemoryPlanner(unsigned char* scratch_buffer,
  requirements_ = reinterpret_cast<BufferRequirements*>(next_free);
  next_free += sizeof(BufferRequirements) * max_buffer_count_;
-  buffer_sizes_sorted_by_size_ = reinterpret_cast<int*>(next_free);
+  buffer_sizes_sorted_ = reinterpret_cast<int*>(next_free);
  next_free += sizeof(int) * max_buffer_count_;
-  buffer_ids_sorted_by_size_ = reinterpret_cast<int*>(next_free);
+  buffer_ids_sorted_ = reinterpret_cast<int*>(next_free);
  next_free += sizeof(int) * max_buffer_count_;
  buffers_sorted_by_offset_ = reinterpret_cast<ListEntry*>(next_free);
@ -76,11 +76,24 @@ TfLiteStatus GreedyMemoryPlanner::AddBuffer(
  current->size = size;
  current->first_time_used = first_time_used;
  current->last_time_used = last_time_used;
  current->offline_offset = kOnlinePlannedBuffer;
  ++buffer_count_;
  need_to_calculate_offsets_ = true;
  return kTfLiteOk;
 }
 TfLiteStatus GreedyMemoryPlanner::AddBuffer(
    tflite::ErrorReporter* error_reporter, int size, int first_time_used,
    int last_time_used, int offline_offset) {
  // Registers a buffer whose arena offset was decided offline. The size and
  // lifetime are recorded by the four-argument overload; this overload then
  // replaces the kOnlinePlannedBuffer marker it wrote with offline_offset.
  //
  // The slot index is captured up front because the delegated call advances
  // buffer_count_ when it succeeds.
  const int slot_index = buffer_count_;
  const TfLiteStatus base_status =
      AddBuffer(error_reporter, size, first_time_used, last_time_used);
  if (base_status != kTfLiteOk) {
    return kTfLiteError;
  }
  requirements_[slot_index].offline_offset = offline_offset;
  return kTfLiteOk;
 }
 bool GreedyMemoryPlanner::DoesEntryOverlapInTime(
    const GreedyMemoryPlanner::ListEntry* entry, const int first_time_used,
    const int last_time_used) const {
@ -102,7 +115,7 @@ GreedyMemoryPlanner::NextSimultaneouslyActiveBuffer(
  ListEntry* result = nullptr;
  ListEntry* candidate_next_entry;
  if (start == nullptr) {
-    candidate_next_entry = &buffers_sorted_by_offset_[0];
+    candidate_next_entry = &buffers_sorted_by_offset_[first_entry_index_];
  } else {
    if (start->next_entry_index == -1) {
      return nullptr;
@ -134,29 +147,51 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() {
  // This helps find a more compact layout. Intuitively, you can think
  // about putting the large buffers in place first, and then the
  // smaller buffers can fit in the gaps, rather than fragmenting the
-  // gaps with small buffers at the beginning.
+  // gaps with small buffers at the beginning. Add offline planned offsets
  // first in the list, since they have a predetermined offset.
  int idx_from_tail = buffer_count_;
  int idx_from_head = 0;
  for (int i = 0; i < buffer_count_; ++i) {
-    buffer_sizes_sorted_by_size_[i] = requirements_[i].size;
+    if (requirements_[i].offline_offset == kOnlinePlannedBuffer) {
-    buffer_ids_sorted_by_size_[i] = i;
+      idx_from_tail--;
-    buffer_offsets_[i] = -1;
+      buffer_sizes_sorted_[idx_from_tail] = requirements_[i].size;
      buffer_ids_sorted_[idx_from_tail] = i;
      buffer_offsets_[i] = -1;
    } else {
      buffer_sizes_sorted_[idx_from_head] = requirements_[i].size;
      buffer_ids_sorted_[idx_from_head] = i;
      buffer_offsets_[i] = requirements_[i].offline_offset;
      idx_from_head++;
    }
  }
  // This sorting algorithm is naive, and may end up taking a very long time
  // with hundreds of buffers.
  ReverseSortInPlace(buffer_sizes_sorted_by_size_, buffer_ids_sorted_by_size_,
                     buffer_count_);
-  // Put the largest buffer at offset zero to start the process.
+  // This sorting algorithm is naive, and may end up taking a very long time
-  ListEntry* first_entry = &buffers_sorted_by_offset_[0];
+  // with hundreds of buffers. Do not sort the offline planned offsets.
-  first_entry->offset = 0;
+  ReverseSortInPlace(&buffer_sizes_sorted_[idx_from_head],
-  first_entry->requirements_index = buffer_ids_sorted_by_size_[0];
+                     &buffer_ids_sorted_[idx_from_head],
-  first_entry->next_entry_index = -1;
+                     buffer_count_ - idx_from_head);
  // Initialize the first entry to the first buffer in
  // buffer_ids_sorted_.
  //   - If there are no offline planned offsets, the largest buffer will be
  //     first, and the buffers will be handled in size order.
  //   - If offline offsets are present, these will be handled first in order
  //     for the greedy algorithm to utilize gaps in the offline plan.
  first_entry_index_ = 0;
  next_free_entry_ = 1;
-  buffer_offsets_[buffer_ids_sorted_by_size_[0]] = 0;
+  ListEntry* first_entry = &buffers_sorted_by_offset_[first_entry_index_];
  first_entry->next_entry_index = -1;  // to mark the entry as end of list
  int buffer_id = buffer_ids_sorted_[0];
  first_entry->requirements_index = buffer_id;
  if (requirements_[buffer_id].offline_offset == kOnlinePlannedBuffer) {
    buffer_offsets_[buffer_id] = 0;
  }
  first_entry->offset = buffer_offsets_[buffer_id];
  // Work through the rest of the buffers to find a good gap to place each one.
  for (int i = 1; i < buffer_count_; ++i) {
    // The id is the order the buffer was originally added by the client.
-    const int buffer_id = buffer_ids_sorted_by_size_[i];
+    buffer_id = buffer_ids_sorted_[i];
    // Look at what size and time range the buffer needs to be active.
    BufferRequirements* wanted_requirements = &requirements_[buffer_id];
    const int wanted_size = wanted_requirements->size;
@ -168,37 +203,43 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() {
    // so that it's easy to find the next buffer in memory, and so the gap.
    // The candidate_entry variable holds the buffer that we're considering
    // placing the current buffer after.
-    ListEntry* prior_entry = nullptr;
+
    int candidate_offset = 0;
    // Loop through the offset-ordered list of buffers, looking for gaps.
-    while (true) {
+    if (wanted_requirements->offline_offset == kOnlinePlannedBuffer) {
-      // Find out what the next active buffer is.
+      ListEntry* prior_entry = nullptr;
-      ListEntry* next_entry = NextSimultaneouslyActiveBuffer(
+      while (true) {
-          prior_entry, wanted_first_time_used, wanted_last_time_used);
+        // Find out what the next active buffer is.
        ListEntry* next_entry = NextSimultaneouslyActiveBuffer(
            prior_entry, wanted_first_time_used, wanted_last_time_used);
-      if (prior_entry) {
+        if (prior_entry) {
-        BufferRequirements* candidate_requirements =
+          BufferRequirements* candidate_requirements =
-            &requirements_[prior_entry->requirements_index];
+              &requirements_[prior_entry->requirements_index];
-        const int prior_entry_offset =
+          const int prior_entry_offset =
-            prior_entry->offset + candidate_requirements->size;
+              prior_entry->offset + candidate_requirements->size;
-        if (prior_entry_offset > candidate_offset) {
+          if (prior_entry_offset > candidate_offset) {
-          candidate_offset = prior_entry_offset;
+            candidate_offset = prior_entry_offset;
          }
        }
        if (next_entry == nullptr) {
          // We're at the end of the list, so we can always append the buffer
          // here.
          break;
        }
        // Find out how much space there is between us and the next buffer.
        const int gap = next_entry->offset - candidate_offset;
        if (gap >= wanted_size) {
          // This entry has a big enough gap between it and the next, so
          // use it!
          break;
        }
        // The gap wasn't big enough, so move on to another candidate.
        prior_entry = next_entry;
      }
-      if (next_entry == nullptr) {
+    } else {
-        // We're at the end of the list, so we can always append the buffer
+      // Offline planned offsets are to be considered constant
-        // here.
+      candidate_offset = wanted_requirements->offline_offset;
        break;
      }
      // Find out how much space there is between us and the next buffer.
      const int gap = next_entry->offset - candidate_offset;
      if (gap >= wanted_size) {
        // This entry has a big enough gap between it and the next, so
        // use it!
        break;
      }
      // The gap wasn't big enough, so move on to another candidate.
      prior_entry = next_entry;
    }
    // At this point, we've either found a gap (possibly at the end of the
    // list) and want to place the buffer there, or there are no other active
@ -212,26 +253,36 @@ void GreedyMemoryPlanner::CalculateOffsetsIfNeeded() {
    new_entry->requirements_index = buffer_id;
    const int new_entry_index = next_free_entry_;
    ++next_free_entry_;
-    ListEntry* current_entry = first_entry;
+
-    // Make sure that we insert the buffer at the correct place in the ordered
+    if (first_entry->offset > candidate_offset) {
-    // list.
+      // The new entry offset is smaller than the first entry offset =>
-    while (true) {
+      // replace the first entry
-      const int next_entry_index = current_entry->next_entry_index;
+      first_entry = new_entry;
-      if (next_entry_index == -1) {
+      first_entry->next_entry_index = first_entry_index_;
-        // We're at the end of the list, so just add the new entry here.
+      first_entry_index_ = new_entry_index;
-        current_entry->next_entry_index = new_entry_index;
+    } else {
-        new_entry->next_entry_index = -1;
+      ListEntry* current_entry = first_entry;
-        break;
+      // Make sure that we insert the buffer at the correct place in the
      // buffer-offset-ordered list
      while (true) {
        const int next_entry_index = current_entry->next_entry_index;
        if (next_entry_index == -1) {
          // We're at the end of the list, so just add the new entry here.
          current_entry->next_entry_index = new_entry_index;
          new_entry->next_entry_index = -1;
          break;
        }
        // not at the end of the list -> take a look at next entry
        ListEntry* next_entry = &buffers_sorted_by_offset_[next_entry_index];
        if (next_entry->offset > candidate_offset) {
          // We're at the right spot to do an insertion and retain the sorting
          // order, so place the new entry here.
          new_entry->next_entry_index = current_entry->next_entry_index;
          current_entry->next_entry_index = new_entry_index;
          break;
        }
        current_entry = next_entry;
      }
      ListEntry* next_entry = &buffers_sorted_by_offset_[next_entry_index];
      if (next_entry->offset > candidate_offset) {
        // We're at the right spot to do an insertion and retain the sorting
        // order, so place the new entry here.
        new_entry->next_entry_index = current_entry->next_entry_index;
        current_entry->next_entry_index = new_entry_index;
        break;
      }
      current_entry = next_entry;
    }
  }
 }
@ -241,7 +292,7 @@ size_t GreedyMemoryPlanner::GetMaximumMemorySize() {
  if (buffer_count_ == 0) {
    return 0;
  }
-  ListEntry* entry = &buffers_sorted_by_offset_[0];
+  ListEntry* entry = &buffers_sorted_by_offset_[first_entry_index_];
  size_t max_size = 0;
  while (entry) {
    BufferRequirements* requirements =
--- a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h
+++ b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.h
@ -21,6 +21,8 @@ limitations under the License.
 namespace tflite {
 // Sentinel stored in a buffer's offline_offset field to mark that no offset
 // was planned offline for it, so the planner has to choose its offset itself.
 constexpr int kOnlinePlannedBuffer = -1;
 // A memory planner that uses a greedy algorithm to arrange buffers in memory
 // to minimize the overall arena size needed.
 //
@ -59,6 +61,12 @@ class GreedyMemoryPlanner : public MemoryPlanner {
  TfLiteStatus AddBuffer(ErrorReporter* error_reporter, int size,
                         int first_time_used, int last_time_used) override;
  // Record details of an offline planned buffer offset we want to place.
  // size, first_time_used and last_time_used are recorded exactly as by the
  // four-argument AddBuffer overload, which this overload calls first.
  // offline_offset is the buffer offset from the start of the arena; the
  // planner treats it as constant instead of searching for a gap.
  // Returns kTfLiteError if the four-argument overload fails, kTfLiteOk
  // otherwise.
  TfLiteStatus AddBuffer(ErrorReporter* error_reporter, int size,
                         int first_time_used, int last_time_used,
                         int offline_offset);
  // Returns the high-water mark of used memory. This is the minimum size of a
  // memory arena you'd need to allocate to hold these buffers.
  size_t GetMaximumMemorySize() override;
@ -90,8 +98,8 @@ class GreedyMemoryPlanner : public MemoryPlanner {
  static size_t per_buffer_size() {
    const int per_buffer_size =
        sizeof(BufferRequirements) +  // requirements_
-        sizeof(int) +                 // buffer_sizes_sorted_by_size_
+        sizeof(int) +                 // buffer_sizes_sorted_
-        sizeof(int) +                 // buffer_ids_sorted_by_size_
+        sizeof(int) +                 // buffer_ids_sorted_
        sizeof(ListEntry) +           // buffers_sorted_by_offset_
        sizeof(int);                  // buffer_offsets_;
    return per_buffer_size;
@ -121,16 +129,25 @@ class GreedyMemoryPlanner : public MemoryPlanner {
  // Records the client-provided information about each buffer.
  struct BufferRequirements {
    // Size of the buffer as passed to AddBuffer.
    int size;
    // Offset from the start of the arena when the buffer was planned offline,
    // or kOnlinePlannedBuffer when the planner has to pick the offset.
    int offline_offset;
    // First and last time the buffer is used, as passed to AddBuffer; together
    // they give the time range during which the buffer needs to be active.
    int first_time_used;
    int last_time_used;
  };
  // Working arrays used during the layout algorithm.
  BufferRequirements* requirements_;
-  int* buffer_sizes_sorted_by_size_;
+  // buffer_sizes_sorted_ and buffer_ids_sorted_ are sorted according to:
-  int* buffer_ids_sorted_by_size_;
+  //   {
  //     offline planned buffers,
  //     online planned buffers sorted by size
  //   }
  int* buffer_sizes_sorted_;
  int* buffer_ids_sorted_;
  ListEntry* buffers_sorted_by_offset_;
-  int next_free_entry_;
+  int next_free_entry_;    // Index of the next free entry of
                           // buffers_sorted_by_offset_
  int first_entry_index_;  // Index of the first entry (smallest offset) of
                           // buffers_sorted_by_offset_
  // Stores the outcome of the plan, the location of each buffer in the arena.
  int* buffer_offsets_;
--- a/tensorflow/lite/micro/micro_allocator.cc
+++ b/tensorflow/lite/micro/micro_allocator.cc
@ -39,16 +39,19 @@ namespace {
 // Used to hold information used during allocation calculations. One entry
 // describes a single tensor or scratch buffer.
 struct AllocationInfo {
  // Requested size in bytes; it is aligned up to kBufferAlignment before being
  // handed to the memory planner.
  size_t bytes;
  // NOTE(review): presumably the location that receives the planned address
  // once the plan is committed -- confirm against the commit code.
  void** output_ptr;
  // Passed to the planner as the first and last time the buffer is used.
  int first_created;
  int last_used;
  // Arena offset taken from the offline plan, or kOnlinePlannedBuffer when the
  // offset is to be chosen at runtime.
  int32_t offline_offset;
  // Only entries with this flag set are added to the memory plan.
  bool needs_allocating;
 };
 // We align tensor buffers to 16-byte boundaries, since this is a common
 // requirement for SIMD extensions.
 constexpr int kBufferAlignment = 16;
 constexpr char kOfflineMemAllocMetadata[] = "OfflineMemoryAllocation";
 // Instance of a zero-length int to pass as tensor dims for a flatbuffer
 // Tensor with no shape. Note that the second member of a TfLiteArray is a
 // flexible array member, which is not strictly valid C++. However it is
@ -77,6 +80,71 @@ class MicroBuiltinDataAllocator : public BuiltinDataAllocator {
  TF_LITE_REMOVE_VIRTUAL_DELETE
 };
 #if !defined(__clang__)
 // Helper function to check flatbuffer metadata correctness. This function is
 // not called by default. Hence it's not linked in to the final binary code.
 //
 // For every metadata entry whose name starts with kOfflineMemAllocMetadata,
 // the header (version, subgraph index, number of offsets) and each offset are
 // reported through error_reporter.
 //
 // Returns kTfLiteError when the referenced buffer does not exist or is too
 // small for the data it claims to hold, when the version is not 1, when the
 // subgraph index is not 0, or when the number of offsets differs from the
 // number of tensors in subgraph 0. Returns kTfLiteOk otherwise, including
 // when the model carries no such metadata.
 TfLiteStatus CheckOfflinePlannedOffsets(const Model* model,
                                        ErrorReporter* error_reporter) {
  // Suppress compile warning for unused function
  (void)CheckOfflinePlannedOffsets;

  // Number of 32-bit words in front of the offsets: version, subgraph index
  // and offset count.
  constexpr size_t kHeaderWords = 3;

  if (model->metadata() == nullptr) {
    return kTfLiteOk;
  }
  for (size_t i = 0; i < model->metadata()->size(); ++i) {
    auto metadata = model->metadata()->Get(i);
    if (metadata->name() == nullptr ||
        strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata,
                strlen(kOfflineMemAllocMetadata)) != 0) {
      continue;
    }
    auto* subgraphs = model->subgraphs();
    const SubGraph* subgraph = (*subgraphs)[0];
    const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors =
        subgraph->tensors();
    const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers =
        model->buffers();
    const int nbr_tflite_tensors = static_cast<int>(tensors->size());

    // The buffer index comes from the model file, so make sure it is valid
    // before following it.
    if (buffers == nullptr || metadata->buffer() >= buffers->size()) {
      TF_LITE_REPORT_ERROR(
          error_reporter,
          "Offline planner metadata refers to a buffer that does not exist\n");
      return kTfLiteError;
    }
    auto* buffer = (*buffers)[metadata->buffer()];
    auto* array = buffer->data();

    // The header must be completely present before any of it is read.
    const size_t nbr_words =
        (array == nullptr) ? 0 : array->size() / sizeof(uint32_t);
    if (nbr_words < kHeaderWords) {
      TF_LITE_REPORT_ERROR(
          error_reporter,
          "Offline planner metadata buffer is too small for the header\n");
      return kTfLiteError;
    }
    const uint32_t* metadata_buffer =
        reinterpret_cast<const uint32_t*>(array->data());
    const int version = static_cast<int>(metadata_buffer[0]);
    const int subgraph_idx = static_cast<int>(metadata_buffer[1]);
    const int nbr_offline_offsets = static_cast<int>(metadata_buffer[2]);
    const int32_t* offline_planner_offsets =
        reinterpret_cast<const int32_t*>(&metadata_buffer[kHeaderWords]);

    TF_LITE_REPORT_ERROR(error_reporter, "==== Model metadata info: =====");
    TF_LITE_REPORT_ERROR(error_reporter,
                         "Offline planner metadata found, version %d, "
                         "subgraph %d, nbr offline offsets %d",
                         version, subgraph_idx, nbr_offline_offsets);

    // Never walk past the end of the buffer, whatever count the header
    // claims.
    if (nbr_offline_offsets < 0 ||
        static_cast<size_t>(nbr_offline_offsets) > nbr_words - kHeaderWords) {
      TF_LITE_REPORT_ERROR(
          error_reporter,
          "Offline planner metadata buffer is too small for %d offsets\n",
          nbr_offline_offsets);
      return kTfLiteError;
    }
    for (int j = 0; j < nbr_offline_offsets; ++j) {
      TF_LITE_REPORT_ERROR(
          error_reporter,
          "Offline planner tensor index %d, offline offset: %d", j,
          static_cast<int>(offline_planner_offsets[j]));
    }

    if (version != 1) {
      TF_LITE_REPORT_ERROR(error_reporter, "Version not supported! (%d)\n",
                           version);
      return kTfLiteError;
    }
    if (subgraph_idx != 0) {
      TF_LITE_REPORT_ERROR(error_reporter,
                           "Only 1 subgraph supported! Subgraph idx (%d)\n",
                           subgraph_idx);
      return kTfLiteError;
    }
    if (nbr_tflite_tensors != nbr_offline_offsets) {
      TF_LITE_REPORT_ERROR(error_reporter,
                           "Nbr of offline buffer offsets (%d) in metadata "
                           "not equal nbr tensors (%d)\n",
                           nbr_offline_offsets, nbr_tflite_tensors);
      return kTfLiteError;
    }
  }
  return kTfLiteOk;
 }
 #endif
 // A helper class to construct AllocationInfo array. This array contains the
 // lifetime of tensors / scratch_buffer and will be used to calculate the memory
 // plan. Methods need to be called in order from `Init`, `Add*`, to `Finish`.
@ -94,9 +162,17 @@ class AllocationInfoBuilder {
    return Allocate();
  }
  // Check if model contains offline planned buffer offsets.
  //  - If there's no metadata available, offline_planner_offsets is not set
  //  - If there's metadata available, offline_planner_offsets will point to the
  //    first offset in the metadata buffer list.
  TfLiteStatus GetOfflinePlannedOffsets(const Model* model,
                                        int32_t** offline_planner_offsets);
  // Add allocation information for the tensors.
-  TfLiteStatus AddTensors(const SubGraph* subgraph,
+  TfLiteStatus AddTensors(const SubGraph* subgraph, int32_t* offline_offsets,
                          TfLiteTensor* runtime_tensors);
  // Add allocation information for the scratch buffers.
  TfLiteStatus AddScratchBuffers(internal::ScratchBufferHandle* buffer_handles);
@ -130,6 +206,7 @@ TfLiteStatus AllocationInfoBuilder::Allocate() {
 }
 TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph,
                                               int32_t* offline_offsets,
                                               TfLiteTensor* runtime_tensors) {
  // Set up allocation info for all tensors.
  for (size_t i = 0; i < tensor_count_; ++i) {
@ -141,6 +218,11 @@ TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph,
    current->last_used = -1;
    current->needs_allocating = (runtime_tensors[i].data.data == nullptr) &&
                                (!subgraph->tensors()->Get(i)->is_variable());
    if (offline_offsets) {
      current->offline_offset = offline_offsets[i];
    } else {
      current->offline_offset = kOnlinePlannedBuffer;
    }
  }
  for (size_t i = 0; i < subgraph->inputs()->size(); ++i) {
@ -198,6 +280,52 @@ TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph,
  return kTfLiteOk;
 }
 // The tensor offsets will be encoded in the metadata:[Metadata] field of the
 // Model. The following encoding applies:
 //
 // | Metadata component |                 Value                                |
 // |    name:string     | “OfflineMemoryAllocation”                            |
 // |    buffer:uint     | Index of buffer containing memory allocation data    |
 //
 // The buffer contents for the memory allocation is a list of 32-bit integers.
 // The number of tensors, n, must be equal to the number of tensors defined in
 // the model. The following encoding applies:
 //
 // |  Offset |                            Value                                |
 // |    0    | Offline allocation format version – set to 1                    |
 // |    1    | Subgraph index to which this allocation applies                 |
 // |    2    | Number offsets following: n                                     |
 // |    3    | Arena byte offset of tensor #0 or -1 to allocate at runtime     |
 // |    4    | Arena byte offset of tensor #1 or -1 to allocate at runtime     |
 // | 3+(n-1) | Arena byte offset of tensor #(n-1) or -1 to allocate at runtime |
 TfLiteStatus AllocationInfoBuilder::GetOfflinePlannedOffsets(
    const Model* model, int32_t** offline_planner_offsets) {
  // Scans the model metadata for entries whose name starts with
  // kOfflineMemAllocMetadata. For each such entry the referenced buffer is
  // validated and *offline_planner_offsets is pointed at its first offset
  // (the word following the three header words).
  //
  // *offline_planner_offsets is left untouched when no such entry exists or
  // when validation fails. Returns kTfLiteError if the referenced buffer does
  // not exist, is too small, or holds a number of offsets different from
  // tensor_count_; returns kTfLiteOk otherwise.

  // Number of 32-bit words in front of the offsets: version, subgraph index
  // and offset count.
  constexpr size_t kHeaderWords = 3;

  if (model->metadata() == nullptr) {
    return kTfLiteOk;
  }
  for (size_t i = 0; i < model->metadata()->size(); ++i) {
    auto metadata = model->metadata()->Get(i);
    if (metadata->name() == nullptr ||
        strncmp(metadata->name()->c_str(), kOfflineMemAllocMetadata,
                strlen(kOfflineMemAllocMetadata)) != 0) {
      continue;
    }
    const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers =
        model->buffers();
    // The buffer index comes from the model file, so make sure it is valid
    // before following it.
    if (buffers == nullptr || metadata->buffer() >= buffers->size()) {
      TF_LITE_REPORT_ERROR(
          reporter_,
          "Offline planner metadata refers to a buffer that does not exist\n");
      return kTfLiteError;
    }
    auto* buffer = (*buffers)[metadata->buffer()];
    auto* array = buffer->data();

    // The header must be completely present before the count is read.
    const size_t nbr_words =
        (array == nullptr) ? 0 : array->size() / sizeof(uint32_t);
    if (nbr_words < kHeaderWords) {
      TF_LITE_REPORT_ERROR(
          reporter_,
          "Offline planner metadata buffer is too small for the header\n");
      return kTfLiteError;
    }
    const uint32_t* metadata_buffer =
        reinterpret_cast<const uint32_t*>(array->data());
    const size_t nbr_tensors = static_cast<size_t>(metadata_buffer[2]);
    if (tensor_count_ != nbr_tensors) {
      // The format string uses %d, so pass ints rather than size_t values.
      TF_LITE_REPORT_ERROR(reporter_,
                           "Nbr of offline buffer offsets (%d) in metadata "
                           "not equal nbr tensors (%d)\n",
                           static_cast<int>(nbr_tensors),
                           static_cast<int>(tensor_count_));
      return kTfLiteError;
    }
    // Every offset the caller is going to index must lie inside the buffer.
    if (nbr_tensors > nbr_words - kHeaderWords) {
      TF_LITE_REPORT_ERROR(
          reporter_,
          "Offline planner metadata buffer is too small for %d offsets\n",
          static_cast<int>(nbr_tensors));
      return kTfLiteError;
    }
    // The output parameter is a non-const pointer by interface, although the
    // flatbuffer contents themselves are read-only.
    *offline_planner_offsets = const_cast<int32_t*>(
        reinterpret_cast<const int32_t*>(&metadata_buffer[kHeaderWords]));
  }
  return kTfLiteOk;
 }
 TfLiteStatus AllocationInfoBuilder::AddScratchBuffers(
    internal::ScratchBufferHandle* buffer_handles) {
  // Set up allocation info for buffers.
@ -210,11 +338,13 @@ TfLiteStatus AllocationInfoBuilder::AddScratchBuffers(
    current->first_created = handle->node_idx;
    current->last_used = handle->node_idx;
    current->needs_allocating = true;
    current->offline_offset = kOnlinePlannedBuffer;
  }
  return kTfLiteOk;
 }
-TfLiteStatus CreatePlan(ErrorReporter* error_reporter, MemoryPlanner* planner,
+TfLiteStatus CreatePlan(ErrorReporter* error_reporter,
                        GreedyMemoryPlanner* planner,
                        const AllocationInfo* allocation_info,
                        size_t allocation_info_size) {
  // Add the tensors to our allocation plan.
@ -223,9 +353,15 @@ TfLiteStatus CreatePlan(ErrorReporter* error_reporter, MemoryPlanner* planner,
    if (current->needs_allocating) {
      size_t aligned_bytes_required =
          AlignSizeUp(current->bytes, kBufferAlignment);
-      TF_LITE_ENSURE_STATUS(
+      if (current->offline_offset == kOnlinePlannedBuffer) {
-          planner->AddBuffer(error_reporter, aligned_bytes_required,
+        TF_LITE_ENSURE_STATUS(
-                             current->first_created, current->last_used));
+            planner->AddBuffer(error_reporter, aligned_bytes_required,
                               current->first_created, current->last_used));
      } else {
        TF_LITE_ENSURE_STATUS(planner->AddBuffer(
            error_reporter, aligned_bytes_required, current->first_created,
            current->last_used, current->offline_offset));
      }
    }
  }
  return kTfLiteOk;
@ -466,7 +602,6 @@ TfLiteStatus MicroAllocator::StartModelAllocation(
  const SubGraph* subgraph = GetSubGraphFromModel(model);
  TFLITE_DCHECK(subgraph != nullptr);
  model_is_allocating_ = true;
  TF_LITE_ENSURE_STATUS(
@ -491,7 +626,7 @@ TfLiteStatus MicroAllocator::FinishModelAllocation(const Model* model,
  const SubGraph* subgraph = GetSubGraphFromModel(model);
  TFLITE_DCHECK(subgraph != nullptr);
-  TF_LITE_ENSURE_STATUS(CommitStaticMemoryPlan(context, subgraph));
+  TF_LITE_ENSURE_STATUS(CommitStaticMemoryPlan(model, context, subgraph));
  TF_LITE_ENSURE_STATUS(AllocateVariables(context, subgraph));
  model_is_allocating_ = false;
@ -739,7 +874,8 @@ const SubGraph* MicroAllocator::GetSubGraphFromModel(const Model* model) {
  return (*subgraphs)[0];
 }
-TfLiteStatus MicroAllocator::CommitStaticMemoryPlan(TfLiteContext* context,
+TfLiteStatus MicroAllocator::CommitStaticMemoryPlan(const Model* model,
                                                    TfLiteContext* context,
                                                    const SubGraph* subgraph) {
  // Create static memory plan
  // 1. Calculate AllocationInfo to know the lifetime of each tensor/buffer.
@ -756,7 +892,13 @@ TfLiteStatus MicroAllocator::CommitStaticMemoryPlan(TfLiteContext* context,
    AllocationInfoBuilder builder(error_reporter_, &tmp_allocator);
    TF_LITE_ENSURE_STATUS(
        builder.Init(subgraph->tensors()->size(), scratch_buffer_count_));
-    TF_LITE_ENSURE_STATUS(builder.AddTensors(subgraph, context->tensors));
+
    int32_t* offline_planner_offsets = nullptr;
    TF_LITE_ENSURE_STATUS(
        builder.GetOfflinePlannedOffsets(model, &offline_planner_offsets));
    TF_LITE_ENSURE_STATUS(builder.AddTensors(subgraph, offline_planner_offsets,
                                             context->tensors));
    TF_LITE_ENSURE_STATUS(builder.AddScratchBuffers(scratch_buffer_handles_));
    const AllocationInfo* allocation_info = builder.Finish();
--- a/tensorflow/lite/micro/micro_allocator.h
+++ b/tensorflow/lite/micro/micro_allocator.h
@ -189,7 +189,8 @@ class MicroAllocator {
  // Commits a memory plan for all non-persistent buffer allocations in the
  // 'head' section of the memory arena.
-  virtual TfLiteStatus CommitStaticMemoryPlan(TfLiteContext* context,
+  virtual TfLiteStatus CommitStaticMemoryPlan(const Model* model,
                                              TfLiteContext* context,
                                              const SubGraph* subgraph);
  // A simple memory allocator that always allocate from the arena tail or head.
--- a/tensorflow/lite/micro/micro_allocator_test.cc
+++ b/tensorflow/lite/micro/micro_allocator_test.cc
@ -253,10 +253,10 @@ TF_LITE_MICRO_TEST(TestAllocationForModelsWithBranches) {
  // bytes = 2 * 2 * 3 * sizeof(float32) = 48, same for other tensors.
  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[0].bytes);
  // t1 can't reuse any memory, as n0 requires both t0 and t1.
-  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[1].data.uint8 - start);
+  TF_LITE_MICRO_EXPECT_EQ(96, context.tensors[1].data.uint8 - start);
  // t2 can't reuse any memory, as n1 requires both t0 and t2. Also n2 requires
  // both t1 and t2.
-  TF_LITE_MICRO_EXPECT_EQ(96, context.tensors[2].data.uint8 - start);
+  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[2].data.uint8 - start);
  // t3 reuses the same memory from t0 as t0 is not an input to any node.
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[3].data.uint8 - start);
@ -308,4 +308,218 @@ TF_LITE_MICRO_TEST(TestAllocationForComplexModelAllocation) {
                                                       /*count=*/3);
 }
 // Supplies offline-plan metadata in which every tensor offset is -1, i.e. all
 // tensors are left to the online planner. The graph mirrors the one used by
 // TestAllocationForModelsWithBranches, so the same offsets are expected.
 TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) {
  constexpr int nbr_tensors = 4;
  constexpr int kNodeCount = 3;
  constexpr size_t kArenaBytes = 4096;
  const int32_t kPlanVersion = 1;
  const int32_t kPlanSubgraph = 0;

  tflite::testing::MockOpResolver mock_resolver;
  tflite::NodeAndRegistration* node_and_registration;

  // Layout: {version, subgraph, tensor count} followed by one offset per
  // tensor.
  const int32_t offline_plan[tflite::testing::kOfflinePlannerHeaderSize +
                             nbr_tensors] = {
      kPlanVersion, kPlanSubgraph, nbr_tensors,  // header
      -1, -1, -1, -1,                            // memory offsets
  };

  // Two branches fan out from tensor 0 and are joined again by the last node.
  tflite::testing::NodeConnection branch_nodes[kNodeCount] = {
      {{0}, {1}},     // input -> output
      {{0}, {2}},     // input -> output
      {{1, 2}, {3}},  // input1, input2 -> output
  };

  const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning(
      nbr_tensors, offline_plan, branch_nodes, kNodeCount);

  TfLiteContext context;
  uint8_t arena_storage[kArenaBytes];
  tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create(
      arena_storage, kArenaBytes, micro_test::reporter);

  TF_LITE_MICRO_EXPECT_EQ(
      kTfLiteOk, allocator->StartModelAllocation(model, &context, mock_resolver,
                                                 &node_and_registration));
  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk,
                          allocator->FinishModelAllocation(model, &context));

  // All offsets below are measured relative to the address of tensor 0.
  uint8_t* start = context.tensors[0].data.uint8;
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[0].bytes);
  TF_LITE_MICRO_EXPECT_EQ(96, context.tensors[1].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[2].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[3].data.uint8 - start);
 }
 // Every tensor of a linear four-tensor chain is given an explicit offset in
 // the offline-plan metadata; the test checks that each tensor ends up at
 // exactly that offset relative to tensor 0.
 TF_LITE_MICRO_TEST(OfflinePlannerBasic) {
  constexpr int nbr_tensors = 4;
  constexpr int kNodeCount = 3;
  constexpr size_t kArenaBytes = 4096;

  // Tensor indices used to wire up the graph.
  constexpr int kT0 = 0;
  constexpr int kT1 = 1;
  constexpr int kT2 = 2;
  constexpr int kT3 = 3;

  tflite::testing::MockOpResolver mock_resolver;
  tflite::NodeAndRegistration* node_and_registration;

  const int32_t offline_plan[tflite::testing::kOfflinePlannerHeaderSize +
                             nbr_tensors] = {
      1, 0, nbr_tensors,  // header
      0,                  // t0
      48,                 // t1
      0,                  // t2
      48,                 // t3
  };

  // t0 -> t1 -> t2 -> t3, one node per arrow.
  tflite::testing::NodeConnection chain_nodes[kNodeCount] = {
      {{kT0}, {kT1}},  // input -> output
      {{kT1}, {kT2}},  // input -> output
      {{kT2}, {kT3}},  // input -> output
  };

  const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning(
      nbr_tensors, offline_plan, chain_nodes, kNodeCount);

  TfLiteContext context;
  uint8_t arena_storage[kArenaBytes];
  tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create(
      arena_storage, kArenaBytes, micro_test::reporter);

  TF_LITE_MICRO_EXPECT_EQ(
      kTfLiteOk, allocator->StartModelAllocation(model, &context, mock_resolver,
                                                 &node_and_registration));
  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk,
                          allocator->FinishModelAllocation(model, &context));

  // Offsets are compared against the address assigned to tensor 0.
  uint8_t* start = context.tensors[0].data.uint8;
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[1].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[2].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[3].data.uint8 - start);
 }
 // The offline plan places t0 and t1 at the same offset (0) even though both
 // feed the first node, puts t2 at 48 and leaves t3 unplanned (-1). The test
 // checks that the overlapping placement is honoured and that t3 lands at
 // offset 0.
 TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) {
  constexpr int nbr_tensors = 4;
  constexpr int kNodeCount = 2;
  constexpr size_t kArenaBytes = 4096;

  // Tensor indices used to wire up the graph.
  constexpr int kT0 = 0;
  constexpr int kT1 = 1;
  constexpr int kT2 = 2;
  constexpr int kT3 = 3;

  tflite::testing::MockOpResolver mock_resolver;
  tflite::NodeAndRegistration* node_and_registration;

  const int32_t offline_plan[tflite::testing::kOfflinePlannerHeaderSize +
                             nbr_tensors] = {
      1, 0, nbr_tensors,  // header: version, subgraph, nbr tensors
      // memory offsets:
      0,   // t0
      0,   // t1
      48,  // t2
      -1,  // t3
  };

  tflite::testing::NodeConnection graph_nodes[kNodeCount] = {
      {{kT0, kT1}, {kT2}},  // (input, scratch) -> output
      {{kT2}, {kT3}},       // input -> output
  };

  const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning(
      nbr_tensors, offline_plan, graph_nodes, kNodeCount);

  TfLiteContext context;
  uint8_t arena_storage[kArenaBytes];
  tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create(
      arena_storage, kArenaBytes, micro_test::reporter);

  TF_LITE_MICRO_EXPECT_EQ(
      kTfLiteOk, allocator->StartModelAllocation(model, &context, mock_resolver,
                                                 &node_and_registration));
  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk,
                          allocator->FinishModelAllocation(model, &context));

  // Offsets are compared against the address assigned to tensor 0.
  uint8_t* start = context.tensors[0].data.uint8;
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[1].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[2].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[3].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[0].bytes);
 }
 // Mixes offline-planned tensors (t0, t1, t3 carry explicit offsets) with
 // tensors left to the online planner (t2 and t4 are -1) and checks the
 // resulting offset of every tensor relative to tensor 0.
 TF_LITE_MICRO_TEST(OfflinePlannerOfflineOnline) {
  constexpr int nbr_tensors = 5;
  constexpr int kNodeCount = 2;
  constexpr size_t kArenaBytes = 4096;

  // Tensor indices used to wire up the graph.
  constexpr int kT0 = 0;
  constexpr int kT1 = 1;
  constexpr int kT2 = 2;
  constexpr int kT3 = 3;
  constexpr int kT4 = 4;

  tflite::testing::MockOpResolver mock_resolver;
  tflite::NodeAndRegistration* node_and_registration;

  const int32_t offline_plan[tflite::testing::kOfflinePlannerHeaderSize +
                             nbr_tensors] = {
      1, 0, nbr_tensors,  // header: version, subgraph, nbr tensors
      // memory offsets:
      0,   // t0
      48,  // t1
      -1,  // t2
      0,   // t3
      -1,  // t4
  };

  tflite::testing::NodeConnection graph_nodes[kNodeCount] = {
      {{kT0, kT1}, {kT2}},  // (input, scratch) -> output
      {{kT2}, {kT3, kT4}},  // input -> (output1, output2)
  };

  const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning(
      nbr_tensors, offline_plan, graph_nodes, kNodeCount);

  TfLiteContext context;
  uint8_t arena_storage[kArenaBytes];
  tflite::MicroAllocator* allocator = tflite::MicroAllocator::Create(
      arena_storage, kArenaBytes, micro_test::reporter);

  TF_LITE_MICRO_EXPECT_EQ(
      kTfLiteOk, allocator->StartModelAllocation(model, &context, mock_resolver,
                                                 &node_and_registration));
  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk,
                          allocator->FinishModelAllocation(model, &context));

  // Offsets are compared against the address assigned to tensor 0.
  uint8_t* start = context.tensors[0].data.uint8;
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[0].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[1].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(96, context.tensors[2].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(48, context.tensors[4].data.uint8 - start);
  TF_LITE_MICRO_EXPECT_EQ(0, context.tensors[3].data.uint8 - start);
 }
 TF_LITE_MICRO_TESTS_END
--- a/tensorflow/lite/micro/micro_optional_debug_tools.cc
+++ b/tensorflow/lite/micro/micro_optional_debug_tools.cc
@ -27,6 +27,7 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/micro/memory_helpers.h"
 #include "tensorflow/lite/micro/micro_allocator.h"
 #include "tensorflow/lite/micro/micro_interpreter.h"
 #include "tensorflow/lite/schema/schema_generated.h"
@ -111,6 +112,33 @@ const char* AllocTypeName(TfLiteAllocationType type) {
 }
 }  // namespace
 // Debug helper that reports, through `error_reporter`, one line for every
 // tensor in the first subgraph of `model`: the tensor index, a 0/1 flag that
 // is set when the tensor has no serialized buffer data and is not a variable
 // tensor, and the size computed by BytesRequiredForTensor.
 // This function is not called by default. Hence it's not linked in to the
 // final binary code.
 //
 // `model` must be non-null; its subgraph, tensor and buffer vectors are
 // dereferenced without validation.
 void PrintModelData(const Model* model, ErrorReporter* error_reporter) {
  auto* subgraphs = model->subgraphs();
  const SubGraph* subgraph = (*subgraphs)[0];
  const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors =
      subgraph->tensors();
  const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers =
      model->buffers();
  TF_LITE_REPORT_ERROR(error_reporter, "==== Model info: =====");
  for (size_t i = 0; i < tensors->size(); ++i) {
    const tflite::Tensor& flatbuffer_tensor = *tensors->Get(i);
    // Zero-initialized so that a failed size query prints 0 instead of an
    // indeterminate value.
    size_t type_size = 0;
    size_t tensor_size = 0;
    auto* buffer = (*buffers)[flatbuffer_tensor.buffer()];
    auto* array = buffer->data();
    const bool has_serialized_data = (array != nullptr) && (array->size() > 0);
    BytesRequiredForTensor(flatbuffer_tensor, &tensor_size, &type_size,
                           error_reporter);
    const bool is_arena_tensor =
        !has_serialized_data && !flatbuffer_tensor.is_variable();
    // Every argument is converted to int explicitly: the format string uses
    // %d, and passing a size_t through varargs for %d is undefined wherever
    // size_t is wider than int.
    TF_LITE_REPORT_ERROR(
        error_reporter, "Tensor index: %d arena tensor %d size %d ",
        static_cast<int>(i), static_cast<int>(is_arena_tensor),
        static_cast<int>(tensor_size));
  }
 }
 // Prints a dump of what tensors and what nodes are in the interpreter.
 void PrintInterpreterState(MicroInterpreter* interpreter) {
  printf("Interpreter has %zu tensors and %zu nodes\n",
--- a/tensorflow/lite/micro/micro_optional_debug_tools.h
+++ b/tensorflow/lite/micro/micro_optional_debug_tools.h
@ -20,6 +20,9 @@ limitations under the License.
 #include "tensorflow/lite/micro/micro_interpreter.h"
 namespace tflite {
 // Helper function to print model flatbuffer data: for every tensor in the
 // model's first subgraph it reports the tensor index, whether the tensor is
 // treated as an arena tensor, and its size, using `error_reporter`.
 // This function is not called by default. Hence it's not linked in to the
 // final binary code.
 void PrintModelData(const Model* model, ErrorReporter* error_reporter);
 // Prints a dump of what tensors and what nodes are in the interpreter.
 void PrintInterpreterState(MicroInterpreter* interpreter);
 }  // namespace tflite
--- a/tensorflow/lite/micro/test_helpers.cc
+++ b/tensorflow/lite/micro/test_helpers.cc
@ -55,7 +55,7 @@ class StackAllocator : public flatbuffers::Allocator {
    return *inst;
  }
-  static constexpr size_t kStackAllocatorSize = 4096;
+  static constexpr size_t kStackAllocatorSize = 8192;
 private:
  uint8_t data_backing_[kStackAllocatorSize];
@ -101,6 +101,9 @@ class ModelBuilder {
  Node AddNode(Operator op, std::initializer_list<Tensor> inputs,
               std::initializer_list<Tensor> outputs);
  void AddMetadata(const char* description_string,
                   const int32_t* metadata_buffer_data, size_t num_elements);
  // Constructs the flatbuffer model using `builder_` and return a pointer to
  // it. The returned model has the same lifetime as `builder_`.
  const Model* BuildModel(std::initializer_list<Tensor> inputs,
@ -123,6 +126,16 @@ class ModelBuilder {
  static constexpr int kMaxTensors = 50;
  flatbuffers::Offset<tflite::Tensor> tensors_[kMaxTensors];
  static constexpr int kMaxMetadataBuffers = 10;
  static constexpr int kMaxMetadatas = 10;
  flatbuffers::Offset<Metadata> metadata_[kMaxMetadatas];
  flatbuffers::Offset<Buffer> metadata_buffers_[kMaxMetadataBuffers];
  int nbr_of_metadata_buffers_ = 0;
  int next_tensor_id_ = 0;
 };
@ -149,13 +162,33 @@ ModelBuilder::Node ModelBuilder::AddNode(
  return next_operator_id_ - 1;
 }
 // Appends one metadata entry named `description_string` together with the
 // buffer that backs it. The buffer receives a byte-for-byte copy of the
 // `num_elements` int32 values starting at `metadata_buffer_data`.
 //
 // The entry refers to buffer index `1 + nbr_of_metadata_buffers_` because
 // BuildModel() places an empty buffer at index 0 ahead of the metadata
 // buffers.
 //
 // NOTE(review): there is no bounds check here; callers must not add more
 // entries than the `metadata_` / `metadata_buffers_` arrays can hold.
 void ModelBuilder::AddMetadata(const char* description_string,
                                const int32_t* metadata_buffer_data,
                                size_t num_elements) {
  metadata_[nbr_of_metadata_buffers_] =
      CreateMetadata(*builder_, builder_->CreateString(description_string),
                     1 + nbr_of_metadata_buffers_);

  // View the int32 payload as raw bytes without discarding constness.
  const uint8_t* payload_bytes =
      reinterpret_cast<const uint8_t*>(metadata_buffer_data);
  const size_t payload_size = sizeof(*metadata_buffer_data) * num_elements;
  metadata_buffers_[nbr_of_metadata_buffers_] = tflite::CreateBuffer(
      *builder_, builder_->CreateVector(payload_bytes, payload_size));

  ++nbr_of_metadata_buffers_;
 }
 const Model* ModelBuilder::BuildModel(
    std::initializer_list<ModelBuilder::Tensor> inputs,
    std::initializer_list<ModelBuilder::Tensor> outputs) {
  // Model schema requires an empty buffer at idx 0.
-  constexpr size_t kBufferSize = 1;
+  size_t buffer_size = 1 + ModelBuilder::nbr_of_metadata_buffers_;
-  const flatbuffers::Offset<Buffer> buffers[kBufferSize] = {
+  flatbuffers::Offset<Buffer> buffers[kMaxMetadataBuffers];
-      tflite::CreateBuffer(*builder_)};
+  buffers[0] = tflite::CreateBuffer(*builder_);
  // Place the metadata buffers first in the buffer since the indices for them
  // have already been set in AddMetadata()
  for (int i = 1; i < ModelBuilder::nbr_of_metadata_buffers_ + 1; ++i) {
    buffers[i] = metadata_buffers_[i - 1];
  }
  // TFLM only supports single subgraph.
  constexpr size_t subgraphs_size = 1;
@ -166,12 +199,26 @@ const Model* ModelBuilder::BuildModel(
          builder_->CreateVector(outputs.begin(), outputs.size()),
          builder_->CreateVector(operators_, next_operator_id_),
          builder_->CreateString("test_subgraph"))};
-  const flatbuffers::Offset<Model> model_offset = tflite::CreateModel(
+
-      *builder_, 0,
+  flatbuffers::Offset<Model> model_offset;
-      builder_->CreateVector(operator_codes_, next_operator_code_id_),
+  if (ModelBuilder::nbr_of_metadata_buffers_ > 0) {
-      builder_->CreateVector(subgraphs, subgraphs_size),
+    model_offset = tflite::CreateModel(
-      builder_->CreateString("teset_model"),
+        *builder_, 0,
-      builder_->CreateVector(buffers, kBufferSize));
+        builder_->CreateVector(operator_codes_, next_operator_code_id_),
        builder_->CreateVector(subgraphs, subgraphs_size),
        builder_->CreateString("teset_model"),
        builder_->CreateVector(buffers, buffer_size), 0,
        builder_->CreateVector(metadata_,
                               ModelBuilder::nbr_of_metadata_buffers_));
  } else {
    model_offset = tflite::CreateModel(
        *builder_, 0,
        builder_->CreateVector(operator_codes_, next_operator_code_id_),
        builder_->CreateVector(subgraphs, subgraphs_size),
        builder_->CreateString("teset_model"),
        builder_->CreateVector(buffers, buffer_size));
  }
  tflite::FinishModelBuffer(*builder_, model_offset);
  void* model_pointer = builder_->GetBufferPointer();
  const Model* model = flatbuffers::GetRoot<Model>(model_pointer);
@ -250,6 +297,35 @@ const Model* BuildSimpleModelWithBranch() {
  return model_builder.BuildModel({t0}, {t3});
 }
 // Builds a flatbuffer test model with `number_of_tensors` float32 tensors of
 // shape {2, 2, 3}, one "mock_custom" node per entry of `node_conn`, and an
 // "OfflineMemoryAllocation" metadata entry filled from `metadata_buffer`
 // (`number_of_tensors` + kOfflinePlannerHeaderSize int32 values).
 // The inputs of the first node become the model inputs and the outputs of the
 // last node become the model outputs, so `num_conns` must be at least 1.
 const Model* BuildModelWithOfflinePlanning(int number_of_tensors,
                                            const int32_t* metadata_buffer,
                                            NodeConnection* node_conn,
                                            int num_conns) {
  using flatbuffers::Offset;
  flatbuffers::FlatBufferBuilder* flatbuffer_builder = BuilderInstance();
  ModelBuilder graph(flatbuffer_builder);

  const int mock_op = graph.RegisterOp(BuiltinOperator_CUSTOM, "mock_custom",
                                       /* version= */ 0);

  // All tensors share the same type and shape.
  for (int remaining = number_of_tensors; remaining > 0; --remaining) {
    graph.AddTensor(TensorType_FLOAT32, {2, 2, 3});
  }

  for (int conn = 0; conn < num_conns; ++conn) {
    const NodeConnection& wiring = node_conn[conn];
    graph.AddNode(mock_op, wiring.input, wiring.output);
  }

  // The metadata holds a fixed-size header followed by one value per tensor.
  const int metadata_length =
      number_of_tensors + tflite::testing::kOfflinePlannerHeaderSize;
  graph.AddMetadata("OfflineMemoryAllocation", metadata_buffer,
                    metadata_length);

  const NodeConnection& first_node = node_conn[0];
  const NodeConnection& last_node = node_conn[num_conns - 1];
  return graph.BuildModel(first_node.input, last_node.output);
 }
 const Model* BuildSimpleMockModel() {
  using flatbuffers::Offset;
  flatbuffers::FlatBufferBuilder* builder = BuilderInstance();
@ -638,6 +714,15 @@ const Model* GetSimpleModelWithBranch() {
  return model;
 }
 // Returns a model carrying offline-planning metadata by forwarding all
 // arguments unchanged to BuildModelWithOfflinePlanning(). The result is not
 // cached here: every call invokes the build function again.
 const Model* GetModelWithOfflinePlanning(int num_tensors,
                                          const int32_t* metadata_buffer,
                                          NodeConnection* node_conn,
                                          int num_conns) {
  return BuildModelWithOfflinePlanning(num_tensors, metadata_buffer, node_conn,
                                       num_conns);
 }
 const Model* GetSimpleStatefulModel() {
  static Model* model = nullptr;
  if (!model) {
--- a/tensorflow/lite/micro/test_helpers.h
+++ b/tensorflow/lite/micro/test_helpers.h
@ -30,6 +30,14 @@ limitations under the License.
 namespace tflite {
 namespace testing {
 constexpr int kOfflinePlannerHeaderSize = 3;
 // Describes how one node of a test graph is wired: the tensor indices it
 // reads (`input`) and the tensor indices it writes (`output`).
 struct NodeConnection_ {
  std::initializer_list<int32_t> input;
  std::initializer_list<int32_t> output;
 };
 using NodeConnection = NodeConnection_;
 // A simple operator that returns the median of the input with the number of
 // times the kernel was invoked. The implementation below is deliberately
 // complicated, just to demonstrate how kernel memory planning works.
@ -82,6 +90,12 @@ const Model* GetComplexMockModel();
 // Returns a simple flatbuffer model with two branches.
 const Model* GetSimpleModelWithBranch();
 // Returns a simple flatbuffer model with offline planned tensors.
 //   num_tensors:     number of tensors in the model.
 //   metadata_buffer: offline-plan metadata; kOfflinePlannerHeaderSize header
 //                    values followed by one entry per tensor.
 //   node_conn:       array of `num_conns` node descriptions, each listing the
 //                    input and output tensor indices of one node.
 //   num_conns:       number of entries in `node_conn`.
 const Model* GetModelWithOfflinePlanning(int num_tensors,
                                         const int32_t* metadata_buffer,
                                         NodeConnection* node_conn,
                                         int num_conns);
 // Returns a flatbuffer model with `simple_stateful_op`
 const Model* GetSimpleStatefulModel();