From 4f8a6dd61c58a9075f7feb487e0976f429668b55 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 29 Aug 2019 11:09:20 -0700 Subject: [PATCH] Implemented a "Greedy by size" planner that produces a more optimal memory allocation. - Mobilenet V1 shows a 35% memory reduction - Mobilenet V2 shows a 2% reduction The majority of the other tested models demonstrated a ~10-15% improvement. PiperOrigin-RevId: 266182266 --- tensorflow/lite/BUILD | 1 - tensorflow/lite/allocation.h | 2 +- tensorflow/lite/arena_planner.cc | 211 ++++++++-------- tensorflow/lite/arena_planner.h | 52 ++-- tensorflow/lite/arena_planner_test.cc | 254 ++++++++++++-------- tensorflow/lite/core/subgraph.cc | 25 +- tensorflow/lite/core/subgraph.h | 8 - tensorflow/lite/interpreter_test.cc | 15 +- tensorflow/lite/memory_planner.h | 5 - tensorflow/lite/simple_memory_arena.cc | 78 +++--- tensorflow/lite/simple_memory_arena.h | 33 +-- tensorflow/lite/simple_memory_arena_test.cc | 58 ++--- 12 files changed, 367 insertions(+), 375 deletions(-) diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index e353edd121e..c62d7ec9219 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -174,7 +174,6 @@ cc_library( ], copts = TFLITE_DEFAULT_COPTS, deps = [ - ":simple_memory_arena", ":string", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite/core/api", diff --git a/tensorflow/lite/allocation.h b/tensorflow/lite/allocation.h index baf9ac3d421..d76207bbdac 100644 --- a/tensorflow/lite/allocation.h +++ b/tensorflow/lite/allocation.h @@ -19,11 +19,11 @@ limitations under the License. #include #include +#include #include #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/simple_memory_arena.h" #include "tensorflow/lite/string.h" namespace tflite { diff --git a/tensorflow/lite/arena_planner.cc b/tensorflow/lite/arena_planner.cc index 3258f612c18..e1a042b2188 100644 --- a/tensorflow/lite/arena_planner.cc +++ b/tensorflow/lite/arena_planner.cc @@ -13,18 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/lite/arena_planner.h" + +#include +#include +#include #include namespace tflite { +namespace { -struct AllocationInfo { - // The node index requesting this allocation. - int node; - // The tensor index to be allocated or deallocated. - int tensor; - // Whether to allocate or deallocate - enum Type { ALLOC, DEALLOC } type; -}; +constexpr size_t kNotAssigned = std::numeric_limits<size_t>::max(); + +} // namespace ArenaPlanner::ArenaPlanner(TfLiteContext* context, std::unique_ptr<GraphInfo> graph_info, @@ -55,52 +55,42 @@ TfLiteStatus ArenaPlanner::ResetAllocations() { TF_LITE_ENSURE_STATUS(persistent_arena_.Clear()); allocs_.clear(); allocs_.resize(graph_info_->num_tensors()); - // Note that we only clear the alloc_queue_ when re-planning allocations, as - // it should only change when the graph topology itself changes. + order_.clear(); + was_added_.clear(); return kTfLiteOk; } TfLiteStatus ArenaPlanner::PlanAllocations() { // Invalidate any existing data. TF_LITE_ENSURE_STATUS(ResetAllocations()); - // The alloc_queue_ is specific to the graph topology, and will be - // completely reconstructed from graph data here.
- alloc_queue_.clear(); + // Mark every tensor as not yet having an allocation or deallocation node. + alloc_node_.assign(graph_info_->num_tensors(), kNotAssigned); + dealloc_node_.assign(graph_info_->num_tensors(), kNotAssigned); // Keeps track of references to each tensor. std::vector<int> refcounts(graph_info_->num_tensors(), 0); - // `allocated` and `deallocated` are technically list of boolean values. - // We're saving the compiled binary size by using `vector`. - std::vector allocated(graph_info_->num_tensors(), false); - std::vector deallocated(graph_info_->num_tensors(), false); - auto allocate = [this, &allocated, &deallocated](int node, - int tensor) -> TfLiteStatus { - if (allocated[tensor]) { + auto allocate = [this](int node, int tensor) -> TfLiteStatus { + if (alloc_node_[tensor] != kNotAssigned) { + // Tensor has already been allocated. return kTfLiteOk; } - TF_LITE_ENSURE(context_, !deallocated[tensor]); - alloc_queue_.push_back({node, tensor, AllocationInfo::ALLOC}); - allocated[tensor] = true; + TF_LITE_ENSURE(context_, dealloc_node_[tensor] == kNotAssigned); + alloc_node_[tensor] = node; return kTfLiteOk; }; - auto deallocate = [this, &allocated, &deallocated]( - int node, int tensor) -> TfLiteStatus { - if (!allocated[tensor]) { - // Do not enqueue a DEALLOC if the tensor is never allocated. + auto deallocate = [this](int node, int tensor) -> TfLiteStatus { + if (alloc_node_[tensor] == kNotAssigned) { + // There is no need to deallocate a tensor that was never allocated. // This happens with constant tensors. return kTfLiteOk; } - TF_LITE_ENSURE(context_, !deallocated[tensor]); - alloc_queue_.push_back({node, tensor, AllocationInfo::DEALLOC}); + TF_LITE_ENSURE(context_, dealloc_node_[tensor] == kNotAssigned); + dealloc_node_[tensor] = node; return kTfLiteOk; }; - // There will be an entry in alloc_queue_ for the allocation of each tensor - // and another for their deallocation. - alloc_queue_.reserve(2 * graph_info_->num_tensors()); - // We must make sure the output tensors are never overwritten. We do that by // artificially adding one to their ref-counts so they are never selected // for deallocation. @@ -188,12 +178,27 @@ TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) { // Grow the size of `allocs_` if necessary. This allows allocating temporary // tensors in an op's `prepare` function. TF_LITE_ENSURE(context_, graph_info_->num_tensors() >= allocs_.size()); + alloc_node_.resize(graph_info_->num_tensors(), kNotAssigned); + dealloc_node_.resize(graph_info_->num_tensors(), kNotAssigned); allocs_.resize(graph_info_->num_tensors()); + was_added_.assign(graph_info_->num_tensors(), false); + order_.clear(); + // Set the allocation and deallocation node for each temporary tensor. + for (size_t i = first_node; i <= last_node && i < graph_info_->num_nodes(); + ++i) { + const TfLiteNode& node = graph_info_->node(i); + TfLiteIntArray* node_temporaries = node.temporaries; + for (int j = 0; j < node_temporaries->size; ++j) { + int tensor_index = node_temporaries->data[j]; + alloc_node_[tensor_index] = i; + dealloc_node_[tensor_index] = i; + } + } TF_LITE_ENSURE_STATUS(CalculateAllocations(first_node, last_node)); TF_LITE_ENSURE_STATUS(Commit()); - for (int i = 0; i < static_cast<int>(graph_info_->num_tensors()); ++i) { + for (size_t i = 0; i < graph_info_->num_tensors(); ++i) { // TODO(ahentz): we could do this only for the tensors that were modified // in CalculateAllocations(), instead of redoing it for tensors that // already had proper pointers.
However we must be very careful, because @@ -204,48 +209,71 @@ TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) { return kTfLiteOk; } -TfLiteStatus ArenaPlanner::Commit() { - TF_LITE_ENSURE_STATUS(arena_.Commit(context_)); - TF_LITE_ENSURE_STATUS(persistent_arena_.Commit(context_)); +TfLiteStatus ArenaPlanner::CalculateAllocations(int first_node, int last_node) { + for (size_t i = 0; i < graph_info_->num_tensors(); ++i) { + if (alloc_node_[i] >= first_node && alloc_node_[i] <= last_node) { + AddTensorIfNeeded(i); + } + } + + std::sort(order_.begin(), order_.end(), CompareBySize(this)); + + // Allocate the tensors in the order computed above; the arena keeps the + // already allocated tensors ordered by offset. + for (const auto& tensor_index : order_) { + TfLiteTensor& tensor = *graph_info_->tensor(tensor_index); + if (tensor.allocation_type == kTfLiteArenaRw) { + TF_LITE_ENSURE_STATUS(arena_.Allocate( + context_, tensor_alignment_, tensor.bytes, alloc_node_[tensor_index], + dealloc_node_[tensor_index], &allocs_[tensor_index])); + } + if (tensor.allocation_type == kTfLiteArenaRwPersistent) { + TF_LITE_ENSURE_STATUS(persistent_arena_.Allocate( + context_, tensor_alignment_, tensor.bytes, alloc_node_[tensor_index], + std::numeric_limits<size_t>::max(), &allocs_[tensor_index])); + } + } return kTfLiteOk; } -TfLiteStatus ArenaPlanner::CalculateAllocations(int first_node, int last_node) { - int active_node = first_node; - // When dynamic tensors are present this method is called multiple times. - // The items in the alloc_queue_ referring to nodes before first_node were - // processed previously and should be skipped. Entries after last_node are - // not yet ready to be handled. - for (const auto& alloc_info : alloc_queue_) { - if (alloc_info.node < first_node) continue; - if (alloc_info.node > last_node) break; - if (alloc_info.node == active_node) { - // This is the first allocation/deallocation for a given node. It is - // time to deallocate the previous temporaries and allocate new ones. - if (active_node != first_node) { - TF_LITE_ENSURE_STATUS( - CalculateDeallocationOfInternalTensors(active_node - 1)); - } - TF_LITE_ENSURE_STATUS(CalculateAllocationOfInternalTensors(active_node)); - ++active_node; - } - // Handle the current item. - if (alloc_info.type == AllocationInfo::ALLOC) { - TF_LITE_ENSURE_STATUS(CalculateTensorAllocation(alloc_info.tensor)); - } else { - TF_LITE_ENSURE_STATUS(CalculateTensorDeallocation(alloc_info.tensor)); +void ArenaPlanner::AddTensorIfNeeded(int tensor_index) { + if (!was_added_[tensor_index]) { + was_added_[tensor_index] = true; + order_.push_back(tensor_index); + } +} + +bool ArenaPlanner::CompareBySize::operator()(const int idx1, + const int idx2) const { + // Tensors that stay alive for the whole inference are allocated at the + // beginning of the memory slice. Their relative order does not actually + // matter, so they are simply sorted by index. + if (planner->alloc_node_[idx1] == 0 && + planner->dealloc_node_[idx1] == kNotAssigned) { + if (planner->alloc_node_[idx2] == 0 && + planner->dealloc_node_[idx2] == kNotAssigned) { + return idx1 < idx2; } + return true; + } + if (planner->alloc_node_[idx2] == 0 && + planner->dealloc_node_[idx2] == kNotAssigned) { + return false; } - // For the case if the graph is empty the node index can be negative since we - // substract from the active node, so the node_index can be zero for those - // cases - if (active_node > 0) { - // Don't forget to deallocate temporaries of last node.
- TF_LITE_ENSURE_STATUS( - CalculateDeallocationOfInternalTensors(active_node - 1)); + // All other tensors are sorted in non-increasing order of their size. + auto size1 = planner->graph_info_->tensor(idx1)->bytes; + auto size2 = planner->graph_info_->tensor(idx2)->bytes; + if (size1 != size2) { + return size1 > size2; } + // Tensors with equal size are sorted in order of their allocation time. + return planner->alloc_node_[idx1] < planner->alloc_node_[idx2]; +} + +TfLiteStatus ArenaPlanner::Commit() { + TF_LITE_ENSURE_STATUS(arena_.Commit(context_)); + TF_LITE_ENSURE_STATUS(persistent_arena_.Commit(context_)); return kTfLiteOk; } @@ -266,51 +294,4 @@ TfLiteStatus ArenaPlanner::ResolveTensorAllocation(int tensor_index) { return kTfLiteOk; } -TfLiteStatus ArenaPlanner::CalculateTensorAllocation(int tensor_index) { - TfLiteTensor& tensor = *graph_info_->tensor(tensor_index); - if (tensor.allocation_type == kTfLiteArenaRw) { - TF_LITE_ENSURE_STATUS(arena_.Allocate( - context_, tensor_alignment_, tensor.bytes, &allocs_[tensor_index])); - } - if (tensor.allocation_type == kTfLiteArenaRwPersistent) { - TF_LITE_ENSURE_STATUS(persistent_arena_.Allocate( - context_, tensor_alignment_, tensor.bytes, &allocs_[tensor_index])); - } - return kTfLiteOk; -} - -TfLiteStatus ArenaPlanner::CalculateTensorDeallocation(int tensor_index) { - TfLiteTensor& tensor = *graph_info_->tensor(tensor_index); - if (tensor.allocation_type == kTfLiteArenaRw) { - TF_LITE_ENSURE_STATUS(arena_.Deallocate(context_, allocs_[tensor_index])); - } - return kTfLiteOk; -} - -TfLiteStatus ArenaPlanner::CalculateAllocationOfInternalTensors( - int node_index) { - if (node_index < static_cast(graph_info_->num_nodes())) { - const TfLiteNode& node = graph_info_->node(static_cast(node_index)); - TfLiteIntArray* node_temporaries = node.temporaries; - for (int i = 0; i < node_temporaries->size; ++i) { - int tensor_index = node_temporaries->data[i]; - TF_LITE_ENSURE_STATUS(CalculateTensorAllocation(tensor_index)); - } - } - return kTfLiteOk; -} - -TfLiteStatus ArenaPlanner::CalculateDeallocationOfInternalTensors( - int node_index) { - if (node_index < static_cast(graph_info_->num_nodes())) { - const TfLiteNode& node = graph_info_->node(static_cast(node_index)); - TfLiteIntArray* node_temporaries = node.temporaries; - for (int i = 0; i < node_temporaries->size; ++i) { - int tensor_index = node_temporaries->data[i]; - TF_LITE_ENSURE_STATUS(CalculateTensorDeallocation(tensor_index)); - } - } - return kTfLiteOk; -} - } // namespace tflite diff --git a/tensorflow/lite/arena_planner.h b/tensorflow/lite/arena_planner.h index 569e5d98db2..31f9fc6d8e3 100644 --- a/tensorflow/lite/arena_planner.h +++ b/tensorflow/lite/arena_planner.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_ARENA_PLANNER_H_ #define TENSORFLOW_LITE_ARENA_PLANNER_H_ +#include #include #include @@ -44,17 +45,12 @@ struct AllocationInfo; // execution. Since dynamic tensors don't have sizes until after the // corresponding operation is executed, this class supports incremental // planning. -// -// TODO(b/127354079): Remove the constrain below when the issue is fixed. -// WARNING: MemoryPlanner's behavior must be deterministic. If the first N -// nodes are unchanged, it must produce exactly the same allocation plan for -// the first N nodes. class ArenaPlanner : public MemoryPlanner { public: // Ownership of 'context' is not taken and it must remain util the - // ArenaPlanner is destroyed. 
If 'preserve_inputs' is true the inputs to the - // graph will not share memory with any other tensor, effectively preserving - // them until the end of inference. + // ArenaPlanner is destroyed. If 'preserve_inputs' is true the inputs + // to the graph will not share memory with any other tensor, effectively + // preserving them until the end of inference. ArenaPlanner(TfLiteContext* context, std::unique_ptr<GraphInfo> graph_info, bool preserve_inputs, bool preserve_intermediates, int tensor_alignment = kDefaultTensorAlignment); @@ -82,29 +78,37 @@ class ArenaPlanner : public MemoryPlanner { // position inside the corresponding arena buffer. TfLiteStatus ResolveTensorAllocation(int tensor_index); - // Register an allocation for the given tensor. - TfLiteStatus CalculateTensorAllocation(int tensor_index); + void AddTensorIfNeeded(int tensor_index); - // Register a deallocation for the given tensor. - TfLiteStatus CalculateTensorDeallocation(int tensor_index); - - // Register an allocation for all internal (temporary) tensors of - // 'node_index'. - TfLiteStatus CalculateAllocationOfInternalTensors(int node_index); - - // Register a deallocation for all internal (temporary) tensors of - // 'node_index'. - TfLiteStatus CalculateDeallocationOfInternalTensors(int node_index); + // Comparator used to sort tensors for the allocation algorithm: + // - Tensors that stay alive for the whole inference go first; + // - Other tensors (e.g. intermediate and temporary ones) are sorted in + // non-increasing order of their size. If the sizes of two tensors are + // equal, the one that needs to be allocated earlier goes first. + struct CompareBySize { + explicit CompareBySize(const ArenaPlanner* planner) : planner(planner) {} + bool operator()(const int idx1, const int idx2) const; + const ArenaPlanner* planner; + }; TfLiteContext* context_; std::unique_ptr<GraphInfo> graph_info_; // Stores allocation data for all tensors. - std::vector<ArenaAlloc> allocs_; + std::vector<ArenaAllocWithUsageInterval> allocs_; - // A chronological list of instructions to allocate and deallocate tensors, - // reflecting the way they are used in the graph. - std::vector<AllocationInfo> alloc_queue_; + // First node that uses the tensor. The tensor needs to be allocated before + // this node's operation is executed. + std::vector<size_t> alloc_node_; + + // Last node that uses the tensor. The tensor can be deallocated after this + // node's operation has executed. + std::vector<size_t> dealloc_node_; + + // Indices of tensors in the order in which their allocation offsets will be + // calculated. + std::vector<size_t> order_; + std::vector<char> was_added_; // avoiding std::vector<bool> as bitset // Raw memory buffer that is allocated for all temporary and graph outputs // that are declared kTfLiteArenaRw. diff --git a/tensorflow/lite/arena_planner_test.cc b/tensorflow/lite/arena_planner_test.cc index 0e80d429c0d..83a08c0a487 100644 --- a/tensorflow/lite/arena_planner_test.cc +++ b/tensorflow/lite/arena_planner_test.cc @@ -18,8 +18,8 @@ limitations under the License.
#include #include -#include "tensorflow/lite/testing/util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/lite/testing/util.h" namespace tflite { namespace { @@ -198,7 +198,7 @@ class ArenaPlannerTest : public ::testing::Test { offset += kTensorAlignment - offset % kTensorAlignment; } return offset; - }; + } TfLiteContext context_; TestGraph* graph_; @@ -211,18 +211,6 @@ TEST_F(ArenaPlannerTest, EmptyGraph) { Execute(0, 10); } -TEST_F(ArenaPlannerTest, DeallocationOfInputTensor) { - // This is a negative TC, which will try to make sure that no allocation for - // input tensors is done, when making call with negative node_index, since - // previous check was doing comparison of node_index which was int and - // unsigned int, implicit conversion was passing this case, as the negative - // number was converted to unsigned it making it invalid.The new check - // takes care of this problem and removes the warning as well. - TestGraph graph({-1}, {}, {1}); - SetGraph(&graph); - Execute(0, 10); -} - TEST_F(ArenaPlannerTest, GraphWithNoOps) { TestGraph graph({0, 10}, {}, {5, 11}); SetGraph(&graph); @@ -239,8 +227,8 @@ TEST_F(ArenaPlannerTest, GraphWithOneOp) { TestGraph graph({1}, {{{1}, {2}, {}}}, {2}); SetGraph(&graph); Execute(0, 10); - EXPECT_EQ(GetOffset(1), 0); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(2), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); } TEST_F(ArenaPlannerTest, ZeroSizedTensors) { @@ -264,12 +252,12 @@ TEST_F(ArenaPlannerTest, SimpleGraph) { Execute(0, 10); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +4 +5 -2 -0 +3 -4 -5 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(1), 0); } TEST_F(ArenaPlannerTest, SimpleGraphInputsPreserved) { @@ -287,13 +275,10 @@ TEST_F(ArenaPlannerTest, SimpleGraphInputsPreserved) { // Alloc(+) and dealloc(-) order: +0 +1 +2 +4 +5 -2 +3 -4 -5 EXPECT_EQ(GetOffset(0), 0); EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - // Because we are keeping the inputs alive until the end (due to - // preserve_inputs=true), the output tensor will not be able to use that - // space. It will end up using the same are as tensor #2. 
- EXPECT_EQ(GetOffset(3), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); } TEST_F(ArenaPlannerTest, SimpleGraphWithTemporary) { @@ -309,12 +294,12 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithTemporary) { Execute(0, 10); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(1), 0); } TEST_F(ArenaPlannerTest, SimpleGraphWithOptionals) { @@ -330,12 +315,12 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithOptionals) { Execute(0, 10); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +4 +5 -2 -0 +3 -4 -5 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(1), 0); } TEST_F(ArenaPlannerTest, SimpleGraphWithLargeTensor) { @@ -355,12 +340,12 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithLargeTensor) { Execute(0, 10); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(1), 0); EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); } TEST_F(ArenaPlannerTest, SimpleGraphWithPersistentTensor) { @@ -386,12 +371,12 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithPersistentTensor) { EXPECT_NE((*graph.tensors())[0].data.raw, (*graph.tensors())[1].data.raw); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), 0); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(5), 0); EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(1), 0); } TEST_F(ArenaPlannerTest, SimpleGraphWithDynamicTensor) { @@ -413,11 +398,11 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithDynamicTensor) { EXPECT_EQ((*graph.tensors())[1].data.raw, nullptr); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(5), 0); EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); } TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) { @@ -446,10 +431,10 @@ TEST_F(ArenaPlannerTest, 
LargerGraphAndStepwiseAllocation) { // Op4: +10 -4 -5 -8 Execute(0, 0); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); EXPECT_TRUE(is_unallocated(6)); EXPECT_TRUE(is_unallocated(4)); EXPECT_TRUE(is_unallocated(5)); @@ -459,62 +444,61 @@ TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) { EXPECT_TRUE(is_unallocated(10)); Execute(1, 1); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); EXPECT_TRUE(is_unallocated(7)); EXPECT_TRUE(is_unallocated(9)); EXPECT_TRUE(is_unallocated(8)); EXPECT_TRUE(is_unallocated(10)); Execute(2, 2); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - // Here's an interesting allocation. Even though #6 requires only 21 bytes, - // its deallocation freed up 24 bytes due to the alignment requirements in - // the arena. That means we can fit #7 in the same space! - EXPECT_EQ(GetOffset(7), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + // #7 (24 bytes) is allocated at the place, where #0 and #6 (4+24=28 bytes) + // were before their deallocation. + EXPECT_EQ(GetOffset(7), GetOffsetAfter(1)); EXPECT_TRUE(is_unallocated(9)); EXPECT_TRUE(is_unallocated(8)); EXPECT_TRUE(is_unallocated(10)); Execute(3, 3); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - EXPECT_EQ(GetOffset(7), GetOffsetAfter(3)); - // The deallocation of #0, #1 and #2 freed up 24 bytes but that's not enough - // for #9, so it goes at the end. - EXPECT_EQ(GetOffset(9), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(1)); + // The deallocation of #1 and #2 frees up 20 bytes but that's not enough + // neither for #9, nor for #8, so they both go at the end. 
+ EXPECT_EQ(GetOffset(9), GetOffsetAfter(4)); EXPECT_EQ(GetOffset(8), GetOffsetAfter(9)); EXPECT_TRUE(is_unallocated(10)); Execute(4, 4); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - EXPECT_EQ(GetOffset(7), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(9), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(9), GetOffsetAfter(4)); EXPECT_EQ(GetOffset(8), GetOffsetAfter(9)); - // There's just enough space at the beginning for #10 due to the - // deallocation of #0, #1, #2 and #3 (total 36 bytes, #10 needs + // There is enough space at the beginning for #10 due to the + // deallocation of #7, #1, #2 and #3 (total 56 bytes, #10 needs // only 33.) EXPECT_EQ(GetOffset(10), 0); } @@ -547,6 +531,86 @@ TEST_F(ArenaPlannerTest, ModifiedGraph) { EXPECT_EQ(GetOffset(3), GetOffsetAfter(1)); } +TEST_F(ArenaPlannerTest, ComplexGraph) { + TestGraph graph({0}, + { + /* in, out, tmp */ + {{0}, {1}, {}}, + {{1}, {2}, {}}, + {{1}, {3}, {}}, + {{1}, {4}, {}}, + {{2, 3, 4}, {5}, {}}, + {{5}, {6}, {}}, + {{5}, {7}, {}}, + {{6, 7}, {8}, {}}, + }, + {8}); + (*graph.tensors())[0].bytes = 32; + (*graph.tensors())[1].bytes = 28; + (*graph.tensors())[2].bytes = 36; + (*graph.tensors())[3].bytes = 16; + (*graph.tensors())[4].bytes = 8; + (*graph.tensors())[5].bytes = 64; + (*graph.tensors())[6].bytes = 10; + (*graph.tensors())[7].bytes = 40; + SetGraph(&graph); + Execute(0, 10); + + // Alloc(+) and dealloc(-) order: +0 +1 -0 +2 +3 +4 -1 +5 -2 -3 -4 +6 +7 -5 +8 + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(7)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(8), 0); +} + +TEST_F(ArenaPlannerTest, GraphWithIntermediates) { + TestGraph graph({0, 1}, + { + /* in, out, tmp */ + {{0}, {2}, {3}}, + {{1, 2}, {4, 5}, {}}, + {{5}, {6, 7}, {8, 9, 10}}, + {{4, 6}, {11}, {12}}, + {{11}, {13}, {}}, + {{7, 13}, {14}, {15}}, + }, + {11, 14}); + SetGraph(&graph, /*preserve_inputs=*/true); + Execute(0, 10); + + // Alloc(+) and dealloc(-) order by operation: + // Op0: +0 +1 +2 +3 -3 + // Op1: +4 +5 -2 -4 + // Op2: +6 +7 +8 +9 +10 -8 -9 -10 -5 + // Op3: +11 +12 -12 -4 -6 + // Op4: +13 + // Op5: +14 +15 -7 -13 -15 + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(15), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(14), GetOffsetAfter(15)); + EXPECT_EQ(GetOffset(13), GetOffsetAfter(14)); + EXPECT_EQ(GetOffset(12), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(11), GetOffsetAfter(13)); + EXPECT_EQ(GetOffset(10), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(9), GetOffsetAfter(10)); + EXPECT_EQ(GetOffset(8), GetOffsetAfter(9)); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(11)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(8)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); 
+ EXPECT_EQ(GetOffset(4), GetOffsetAfter(7)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(1)); + + // 2 is allocated in the smallest suitable gap, which is not equal to the + // first available one. + EXPECT_EQ(GetOffset(2), GetOffsetAfter(5)); +} + } // namespace } // namespace tflite diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index d2c6b874702..6097e1fd7d1 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -162,7 +162,6 @@ Subgraph::Subgraph(ErrorReporter* error_reporter, : external_contexts_(external_contexts), error_reporter_(error_reporter), next_execution_plan_index_to_prepare_(0), - next_execution_plan_index_to_plan_allocation_(0), subgraphs_(subgraphs), resource_variables_(resource_variables) { context_.impl_ = static_cast(this); @@ -496,7 +495,6 @@ TfLiteStatus Subgraph::AllocateTensors() { } next_execution_plan_index_to_prepare_ = 0; - next_execution_plan_index_to_plan_allocation_ = 0; if (memory_planner_) { TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations()); } @@ -695,13 +693,10 @@ TfLiteStatus Subgraph::PrepareOpsAndTensors() { TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt( next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared)); - next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1; TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations( - next_execution_plan_index_to_plan_allocation_, - last_exec_plan_index_prepared)); - next_execution_plan_index_to_plan_allocation_ = - last_exec_plan_index_prepared + 1; + next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared)); + next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1; return kTfLiteOk; } @@ -776,22 +771,6 @@ TfLiteStatus Subgraph::Invoke() { if (tensor_resized_since_op_invoke_ && HasDynamicTensor(context_, node.outputs)) { next_execution_plan_index_to_prepare_ = execution_plan_index + 1; - - // This happens when an intermediate dynamic tensor is resized. - // We don't have to prepare all the ops, but we need to recompute - // the allocation plan. - // - // This is a workaround for b/127354079. It relies on the property that - // ArenaPlanner's behavior is deterministic. A better solution is being - // able to "Rewind" to a specific index in ArenaPlanner. - // TODO(b/127354079): Improve ArenaPlanner and remove this mechanism. - if (next_execution_plan_index_to_plan_allocation_ > - next_execution_plan_index_to_prepare_) { - next_execution_plan_index_to_plan_allocation_ = 0; - if (memory_planner_) { - TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations()); - } - } } } diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index b9736d89f9a..c80a0f355c9 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -527,14 +527,6 @@ class Subgraph { // NOTE: this relies on the order of nodes that is in topological order. int next_execution_plan_index_to_prepare_; - // This is similar to `next_execution_plan_index_to_prepare_`, but it tracks - // which nodes' allocation is planned with the arena planner. - // - // This is a workaround for b/127354079. It shouldn't be necessary if - // ArenaPlanner can "rewind" to a specific point. - // TODO(b/127354079): Improve ArenaPlanner and remove this mechanism. - int next_execution_plan_index_to_plan_allocation_; - // WARNING: This is an experimental interface that is subject to change. // This is a list of node indices (to index into nodes_and_registration). 
// This represents a valid topological sort (dependency ordered) execution diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc index f6d8bae4eff..fa303549363 100644 --- a/tensorflow/lite/interpreter_test.cc +++ b/tensorflow/lite/interpreter_test.cc @@ -364,15 +364,14 @@ TEST(BasicInterpreter, CheckArenaAllocation) { ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk); ASSERT_LT(interpreter.tensor(0)->data.raw, interpreter.tensor(1)->data.raw); - ASSERT_LT(interpreter.tensor(1)->data.raw, interpreter.tensor(2)->data.raw); - ASSERT_LT(interpreter.tensor(2)->data.raw, interpreter.tensor(3)->data.raw); - ASSERT_LT(interpreter.tensor(3)->data.raw, interpreter.tensor(4)->data.raw); - ASSERT_LT(interpreter.tensor(4)->data.raw, interpreter.tensor(5)->data.raw); - ASSERT_LT(interpreter.tensor(5)->data.raw, interpreter.tensor(7)->data.raw); - ASSERT_EQ(interpreter.tensor(6)->data.raw, interpreter.tensor(2)->data.raw); - // #7 is the one with the largest pointer. + ASSERT_LT(interpreter.tensor(1)->data.raw, interpreter.tensor(3)->data.raw); + ASSERT_EQ(interpreter.tensor(3)->data.raw, interpreter.tensor(9)->data.raw); + ASSERT_LT(interpreter.tensor(3)->data.raw, interpreter.tensor(5)->data.raw); + ASSERT_LT(interpreter.tensor(5)->data.raw, interpreter.tensor(2)->data.raw); + ASSERT_EQ(interpreter.tensor(2)->data.raw, interpreter.tensor(7)->data.raw); + ASSERT_LT(interpreter.tensor(2)->data.raw, interpreter.tensor(4)->data.raw); + // #4 is the one with the largest pointer. ASSERT_EQ(interpreter.tensor(8)->data.raw, nullptr); - ASSERT_EQ(interpreter.tensor(9)->data.raw, interpreter.tensor(5)->data.raw); } TEST(BasicInterpreter, BufferAccess) { diff --git a/tensorflow/lite/memory_planner.h b/tensorflow/lite/memory_planner.h index 2c2b357d52f..fa2a44a1c89 100644 --- a/tensorflow/lite/memory_planner.h +++ b/tensorflow/lite/memory_planner.h @@ -21,11 +21,6 @@ namespace tflite { // A MemoryPlanner is responsible for planning and executing a number of // memory-related operations that are necessary in TF Lite. -// -// TODO(b/127354079): Remove the constrain below when the issue is fixed. -// WARNING: MemoryPlanner's behavior must be deterministic. If the first N -// nodes are unchanged, it must produce exactly the same allocation plan for -// the first N nodes. class MemoryPlanner { public: virtual ~MemoryPlanner() {} diff --git a/tensorflow/lite/simple_memory_arena.cc b/tensorflow/lite/simple_memory_arena.cc index 88bdf50c9b6..167a845cd23 100644 --- a/tensorflow/lite/simple_memory_arena.cc +++ b/tensorflow/lite/simple_memory_arena.cc @@ -31,73 +31,55 @@ T AlignTo(size_t alignment, T offset) { } // namespace namespace tflite { - -TfLiteStatus SimpleMemoryArena::Allocate(TfLiteContext* context, - size_t alignment, size_t size, - ArenaAlloc* new_alloc) { +TfLiteStatus SimpleMemoryArena::Allocate( + TfLiteContext* context, size_t alignment, size_t size, size_t first_node, + size_t last_node, ArenaAllocWithUsageInterval* new_alloc) { TF_LITE_ENSURE(context, alignment <= arena_alignment_); + new_alloc->first_node = first_node; + new_alloc->last_node = last_node; + new_alloc->size = size; if (size == 0) { new_alloc->offset = 0; - new_alloc->size = 0; return kTfLiteOk; } - size_t current_top = 0; - - if (!allocs_.empty()) { - auto last = allocs_.rbegin(); - current_top = last->offset + last->size; - } - // If we don't find a better gap just allocate at the end of the buffer. 
- size_t best_offset = AlignTo(alignment, current_top); - size_t best_offset_fit = std::numeric_limits::max(); - auto best_insertion_it = allocs_.end(); + const size_t kNotAssigned = std::numeric_limits::max(); + size_t best_offset = kNotAssigned; + size_t best_offset_fit = kNotAssigned; // Go through the sorted allocs and look at the gaps between them. size_t current_offset = 0; - for (auto it = allocs_.begin(); it != allocs_.end(); ++it) { + for (const auto& alloc : ordered_allocs_) { + if (alloc.last_node < first_node || alloc.first_node > last_node) { + // Usage interval of alloc doesn't intersect with current tensor's usage + // interval, so we skip it. + continue; + } size_t aligned_current_offset = AlignTo(alignment, current_offset); // If we found a gap larger than required size, and smaller than previous // best fit, take it. - if (aligned_current_offset + size <= it->offset && - it->offset - current_offset < best_offset_fit) { + if (aligned_current_offset + size <= alloc.offset && + alloc.offset - aligned_current_offset < best_offset_fit) { best_offset = aligned_current_offset; - best_offset_fit = it->offset - current_offset; - best_insertion_it = it; + best_offset_fit = alloc.offset - current_offset; } - current_offset = it->offset + it->size; + current_offset = std::max(current_offset, alloc.offset + alloc.size); + } + if (best_offset == kNotAssigned) { + best_offset = AlignTo(alignment, current_offset); } // Update the required buffer size. high_water_mark_ = std::max(high_water_mark_, best_offset + size); - new_alloc->offset = best_offset; - new_alloc->size = size; - allocs_.insert(best_insertion_it, *new_alloc); - return kTfLiteOk; -} - -TfLiteStatus SimpleMemoryArena::Deallocate(TfLiteContext* context, - const ArenaAlloc& alloc) { - if (alloc.size == 0) { - return kTfLiteOk; + auto insertion_it = ordered_allocs_.begin(); + while (insertion_it != ordered_allocs_.end() && *insertion_it < *new_alloc) { + ++insertion_it; } - - int erased_allocs_count = 0; - auto it = allocs_.begin(); - while (it != allocs_.end()) { - if (it->offset == alloc.offset) { - TF_LITE_ENSURE_EQ(context, it->size, alloc.size); - erased_allocs_count++; - it = allocs_.erase(it); - } else { - ++it; - } - } - TF_LITE_ENSURE_EQ(context, erased_allocs_count, 1); + ordered_allocs_.insert(insertion_it, *new_alloc); return kTfLiteOk; } @@ -128,9 +110,9 @@ TfLiteStatus SimpleMemoryArena::Commit(TfLiteContext* context) { return underlying_buffer_ != nullptr ? kTfLiteOk : kTfLiteError; } -TfLiteStatus SimpleMemoryArena::ResolveAlloc(TfLiteContext* context, - const ArenaAlloc& alloc, - char** output_ptr) { +TfLiteStatus SimpleMemoryArena::ResolveAlloc( + TfLiteContext* context, const ArenaAllocWithUsageInterval& alloc, + char** output_ptr) { TF_LITE_ENSURE(context, committed_); TF_LITE_ENSURE(context, output_ptr != nullptr); if (alloc.size == 0) { @@ -144,7 +126,7 @@ TfLiteStatus SimpleMemoryArena::ResolveAlloc(TfLiteContext* context, TfLiteStatus SimpleMemoryArena::Clear() { committed_ = false; high_water_mark_ = 0; - allocs_.clear(); + ordered_allocs_.clear(); return kTfLiteOk; } diff --git a/tensorflow/lite/simple_memory_arena.h b/tensorflow/lite/simple_memory_arena.h index 42203c0c0a3..6fec52d15b0 100644 --- a/tensorflow/lite/simple_memory_arena.h +++ b/tensorflow/lite/simple_memory_arena.h @@ -22,16 +22,19 @@ limitations under the License. namespace tflite { // This little structure holds the offset and the size for a dynamic memory -// allocation in the memory arena. 
When the arena is committed and the -// underlying buffer is set, the alloc can be resolved into an actual memory -// pointer. -struct ArenaAlloc { - ArenaAlloc() : offset(0), size(0) {} - +// allocation in the memory arena, as well as the first_node and last_node that +// use the corresponding tensor. This means that a contiguous block of memory of +// this size needs to be allocated before the operation in first_node is executed +// and can be deallocated after the operation in last_node has executed. When the +// arena is committed and the underlying buffer is set, the alloc can be +// resolved into an actual memory pointer. +struct ArenaAllocWithUsageInterval { size_t offset; size_t size; + size_t first_node; + size_t last_node; - inline bool operator<(const ArenaAlloc& other) const { + inline bool operator<(const ArenaAllocWithUsageInterval& other) const { return offset < other.offset; } }; @@ -48,12 +51,14 @@ class SimpleMemoryArena { arena_alignment_(arena_alignment), high_water_mark_(0), underlying_buffer_size_(0), - allocs_() {} + ordered_allocs_() {} + // Schedule a memory allocation for a tensor of the given size, assuming that + // it needs to be allocated before the execution of first_node and can be + // deallocated after the execution of last_node. TfLiteStatus Allocate(TfLiteContext* context, size_t alignment, size_t size, - ArenaAlloc* new_alloc); - - TfLiteStatus Deallocate(TfLiteContext* context, const ArenaAlloc& alloc); + size_t first_node, size_t last_node, + ArenaAllocWithUsageInterval* new_alloc); inline size_t RequiredBufferSize() { // Add in a small amount of padding to reduce the chance of resize events @@ -64,7 +69,8 @@ class SimpleMemoryArena { TfLiteStatus Commit(TfLiteContext* context); - TfLiteStatus ResolveAlloc(TfLiteContext* context, const ArenaAlloc& alloc, + TfLiteStatus ResolveAlloc(TfLiteContext* context, + const ArenaAllocWithUsageInterval& alloc, char** output_ptr); TfLiteStatus Clear(); @@ -80,8 +86,7 @@ class SimpleMemoryArena { std::unique_ptr<char[]> underlying_buffer_; size_t underlying_buffer_size_; char* underlying_buffer_aligned_ptr_; - // TODO(maciekc): add list iterator to the ArenaAlloc to lookup quickly.
- std::list allocs_; + std::list ordered_allocs_; }; } // namespace tflite diff --git a/tensorflow/lite/simple_memory_arena_test.cc b/tensorflow/lite/simple_memory_arena_test.cc index caf13db2c1a..5300ebe5ea2 100644 --- a/tensorflow/lite/simple_memory_arena_test.cc +++ b/tensorflow/lite/simple_memory_arena_test.cc @@ -24,39 +24,33 @@ namespace { TEST(SimpleMemoryArenaTest, BasicArenaOperations) { TfLiteContext context; SimpleMemoryArena arena(64); - ArenaAlloc allocs[6]; + ArenaAllocWithUsageInterval allocs[6]; - arena.Allocate(&context, 32, 2047, &allocs[0]); - arena.Allocate(&context, 32, 2047, &allocs[1]); - arena.Allocate(&context, 32, 2047, &allocs[2]); - arena.Deallocate(&context, allocs[0]); - arena.Allocate(&context, 32, 1023, &allocs[3]); - arena.Allocate(&context, 32, 2047, &allocs[4]); - arena.Deallocate(&context, allocs[1]); - arena.Allocate(&context, 32, 1023, &allocs[5]); + arena.Allocate(&context, 32, 2047, 1, 3, &allocs[0]); + arena.Allocate(&context, 32, 2047, 2, 5, &allocs[1]); + arena.Allocate(&context, 32, 2047, 3, 6, &allocs[2]); + arena.Allocate(&context, 32, 2047, 5, 6, &allocs[3]); + arena.Allocate(&context, 32, 1023, 4, 6, &allocs[4]); + arena.Allocate(&context, 32, 1023, 6, 6, &allocs[5]); EXPECT_EQ(allocs[0].offset, 0); EXPECT_EQ(allocs[1].offset, 2048); EXPECT_EQ(allocs[2].offset, 4096); EXPECT_EQ(allocs[3].offset, 0); EXPECT_EQ(allocs[4].offset, 6144); - EXPECT_EQ(allocs[5].offset, 1024); + EXPECT_EQ(allocs[5].offset, 2048); } TEST(SimpleMemoryArenaTest, BasicZeroAlloc) { TfLiteContext context; SimpleMemoryArena arena(64); - ArenaAlloc alloc; + ArenaAllocWithUsageInterval alloc; // Zero-sized allocs should have a 0 offset and size. - ASSERT_EQ(arena.Allocate(&context, 32, 0, &alloc), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 0, 1, 2, &alloc), kTfLiteOk); EXPECT_EQ(alloc.offset, 0); EXPECT_EQ(alloc.size, 0); - // Deallocation of zero-sized allocs should always succeed (even redundantly). - ASSERT_EQ(arena.Deallocate(&context, alloc), kTfLiteOk); - ASSERT_EQ(arena.Deallocate(&context, alloc), kTfLiteOk); - // The zero-sized alloc should resolve to null. char* resolved_ptr = nullptr; ASSERT_EQ(arena.Commit(&context), kTfLiteOk); @@ -67,15 +61,13 @@ TEST(SimpleMemoryArenaTest, BasicZeroAlloc) { TEST(SimpleMemoryArenaTest, InterleavedZeroAlloc) { TfLiteContext context; SimpleMemoryArena arena(64); - ArenaAlloc allocs[4]; + ArenaAllocWithUsageInterval allocs[4]; // Interleave some zero and non-zero-sized allocations and deallocations. - ASSERT_EQ(arena.Allocate(&context, 32, 2047, &allocs[0]), kTfLiteOk); - ASSERT_EQ(arena.Allocate(&context, 32, 0, &allocs[1]), kTfLiteOk); - ASSERT_EQ(arena.Allocate(&context, 32, 1023, &allocs[2]), kTfLiteOk); - ASSERT_EQ(arena.Deallocate(&context, allocs[1]), kTfLiteOk); - ASSERT_EQ(arena.Deallocate(&context, allocs[2]), kTfLiteOk); - ASSERT_EQ(arena.Allocate(&context, 32, 2047, &allocs[3]), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 2047, 0, 4, &allocs[0]), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 0, 1, 2, &allocs[1]), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 1023, 1, 2, &allocs[2]), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 2047, 3, 4, &allocs[3]), kTfLiteOk); // Deallocation of a zero-sized alloc should not impact the allocator offsets. 
EXPECT_EQ(allocs[0].offset, 0); @@ -87,11 +79,11 @@ TEST(SimpleMemoryArenaTest, InterleavedZeroAlloc) { TEST(SimpleMemoryArenaTest, TestAfterClear) { TfLiteContext context; SimpleMemoryArena arena(64); - ArenaAlloc allocs[9]; + ArenaAllocWithUsageInterval allocs[9]; - arena.Allocate(&context, 32, 2047, &allocs[0]); - arena.Allocate(&context, 32, 2047, &allocs[1]); - arena.Allocate(&context, 32, 2047, &allocs[2]); + arena.Allocate(&context, 32, 2047, 0, 2, &allocs[0]); + arena.Allocate(&context, 32, 2047, 1, 2, &allocs[1]); + arena.Allocate(&context, 32, 2047, 1, 2, &allocs[2]); arena.Commit(&context); EXPECT_EQ(allocs[0].offset, 0); @@ -101,9 +93,9 @@ TEST(SimpleMemoryArenaTest, TestAfterClear) { arena.Clear(); // Test with smaller allocs. - arena.Allocate(&context, 32, 1023, &allocs[3]); - arena.Allocate(&context, 32, 1023, &allocs[4]); - arena.Allocate(&context, 32, 1023, &allocs[5]); + arena.Allocate(&context, 32, 1023, 0, 2, &allocs[3]); + arena.Allocate(&context, 32, 1023, 1, 2, &allocs[4]); + arena.Allocate(&context, 32, 1023, 1, 2, &allocs[5]); arena.Commit(&context); EXPECT_EQ(allocs[3].offset, 0); @@ -113,9 +105,9 @@ TEST(SimpleMemoryArenaTest, TestAfterClear) { arena.Clear(); // Test larger allocs which should require a reallocation. - arena.Allocate(&context, 32, 4095, &allocs[6]); - arena.Allocate(&context, 32, 4095, &allocs[7]); - arena.Allocate(&context, 32, 4095, &allocs[8]); + arena.Allocate(&context, 32, 4095, 0, 2, &allocs[6]); + arena.Allocate(&context, 32, 4095, 1, 2, &allocs[7]); + arena.Allocate(&context, 32, 4095, 1, 2, &allocs[8]); arena.Commit(&context); EXPECT_EQ(allocs[6].offset, 0);
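
Note (illustration, not part of the patch): the core of the change above is the "greedy by size" placement that ArenaPlanner::CalculateAllocations() and SimpleMemoryArena::Allocate() implement together. Tensors are sorted in non-increasing order of size; each tensor is then placed at the lowest offset where it fits, considering only already placed tensors whose usage intervals (first_node..last_node) overlap its own, and preferring the tightest such gap (best fit). The self-contained sketch below restates that idea; it is hypothetical code, not the TensorFlow Lite implementation: TensorUsage and PlanOffsets are invented names, and alignment handling as well as the "whole-lifetime tensors first" tie-break of CompareBySize are omitted for brevity.

// Minimal sketch of greedy-by-size offset planning (illustrative only).
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

struct TensorUsage {
  size_t size;        // bytes required for the tensor
  size_t first_node;  // first node that uses the tensor
  size_t last_node;   // last node that uses the tensor
  size_t offset;      // output: assigned offset inside the arena
};

// Assigns an offset to every tensor and returns the arena's high-water mark.
inline size_t PlanOffsets(std::vector<TensorUsage>& tensors) {
  constexpr size_t kNotAssigned = std::numeric_limits<size_t>::max();

  // Visit tensors in non-increasing order of size.
  std::vector<size_t> order(tensors.size());
  for (size_t i = 0; i < order.size(); ++i) order[i] = i;
  std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
    return tensors[a].size > tensors[b].size;
  });

  std::vector<size_t> placed;  // already placed tensors, kept sorted by offset
  size_t high_water_mark = 0;
  for (size_t idx : order) {
    TensorUsage& t = tensors[idx];
    size_t best_offset = kNotAssigned;
    size_t best_gap = kNotAssigned;
    size_t current_offset = 0;
    for (size_t p : placed) {
      const TensorUsage& other = tensors[p];
      // Tensors whose usage intervals do not overlap may share memory, so
      // they impose no constraint on where `t` can go.
      if (other.last_node < t.first_node || other.first_node > t.last_node) {
        continue;
      }
      // Best fit: take the smallest gap between conflicting tensors that is
      // still large enough for `t`.
      if (current_offset + t.size <= other.offset &&
          other.offset - current_offset < best_gap) {
        best_offset = current_offset;
        best_gap = other.offset - current_offset;
      }
      current_offset = std::max(current_offset, other.offset + other.size);
    }
    // No suitable gap found: place the tensor after the last conflicting one.
    if (best_offset == kNotAssigned) best_offset = current_offset;
    t.offset = best_offset;
    high_water_mark = std::max(high_water_mark, best_offset + t.size);

    // Keep `placed` ordered by offset so the gap scan above stays correct.
    placed.insert(std::upper_bound(placed.begin(), placed.end(), idx,
                                   [&](size_t a, size_t b) {
                                     return tensors[a].offset <
                                            tensors[b].offset;
                                   }),
                  idx);
  }
  return high_water_mark;
}

Sorting by size before placing is what makes the plan "greedy by size": the largest tensors claim space first, and smaller tensors are packed into the gaps left between tensors whose lifetimes conflict, instead of being appended in chronological alloc/dealloc order as the previous planner did.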